diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index fcee7997ab48e1ac9c51782aae6347c73e10a3bf..373083d53b04a006fb7087d4387497184fbcc1ea 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -68,6 +68,23 @@ Python 3 POCL MPI:
     reports:
       junit: test/pytest.xml
 
+Python 3 POCL Examples:
+  script:
+  - export PY_EXE=python3
+  - export PYOPENCL_TEST=portable
+  - export EXTRA_INSTALL="pybind11 numpy mako mpi4py pyvisfile pymetis"
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-py-project-and-run-examples.sh
+  - ". ./build-py-project-and-run-examples.sh"
+  tags:
+  - python3
+  - pocl
+  - large-node
+  except:
+  - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
 Documentation:
   script:
   - EXTRA_INSTALL="pybind11 numpy"
diff --git a/examples/advection/var-velocity.py b/examples/advection/var-velocity.py
index 00e4f0c8496dced20a21e808e68986ecec1d5be2..bc720731a820fd063fda36657218315174f3c5f3 100644
--- a/examples/advection/var-velocity.py
+++ b/examples/advection/var-velocity.py
@@ -40,6 +40,9 @@ from grudge import sym, bind, DGDiscretizationWithBoundaries
 from pytools.obj_array import join_fields
 
 
+FINAL_TIME = 5
+
+
 def main(write_output=True, order=4):
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
@@ -105,10 +108,9 @@ def main(write_output=True, order=4):
     def rhs(t, u):
         return bound_op(queue, t=t, u=u)
 
-    final_time = 50
     dt = dt_factor * h/order**2
-    nsteps = (final_time // dt) + 1
-    dt = final_time/nsteps + 1e-15
+    nsteps = (FINAL_TIME // dt) + 1
+    dt = FINAL_TIME/nsteps + 1e-15
 
     from grudge.shortcuts import set_up_rk4
     dt_stepper = set_up_rk4("u", dt, u, rhs)
@@ -118,14 +120,14 @@ def main(write_output=True, order=4):
 
     step = 0
 
-    for event in dt_stepper.run(t_end=final_time):
+    for event in dt_stepper.run(t_end=FINAL_TIME):
         if isinstance(event, dt_stepper.StateComputed):
 
             step += 1
             if step % 30 == 0:
                 print(step)
 
-                vis.write_vtk_file("fld-%04d.vtu" % step,
+                vis.write_vtk_file("fld-var-velocity-%04d.vtu" % step,
                         [("u", event.state_component)])
 
 
diff --git a/examples/advection/weak.py b/examples/advection/weak.py
index cdadb49e79e85c08a611f856c54970bf3c417b12..bd9d61ca3e2bbfd416a271ffbbabca22f366e3fd 100644
--- a/examples/advection/weak.py
+++ b/examples/advection/weak.py
@@ -103,7 +103,7 @@ def main(write_output=True, order=4):
             step += 1
 
             #print(step, event.t, norm(queue, u=event.state_component[0]))
-            vis.write_vtk_file("fld-%04d.vtu" % step,
+            vis.write_vtk_file("fld-weak-%04d.vtu" % step,
                     [  ("u", event.state_component) ])
 
 
diff --git a/examples/conftest.py b/examples/conftest.py
deleted file mode 100644
index 273b1c4a6bd099f96831d406678c910fc9371722..0000000000000000000000000000000000000000
--- a/examples/conftest.py
+++ /dev/null
@@ -1,7 +0,0 @@
-# This file tells py.test to scan for test_xxx() functions in
-# any file below here, not just those named test_xxxx.py.
-
-
-def pytest_collect_file(path, parent):
-    if "grudge/examples" in str(path.dirpath()) and path.ext == ".py":
-        return parent.Module(path, parent=parent)
diff --git a/examples/dagrt-fusion.py b/examples/dagrt-fusion.py
new file mode 100755
index 0000000000000000000000000000000000000000..a69f936cb3da95c0c6d251997a2c7e7d7e01aafa
--- /dev/null
+++ b/examples/dagrt-fusion.py
@@ -0,0 +1,1212 @@
+#!/usr/bin/env python3
+"""Study of operator fusion (inlining) for time integration operators in Grudge.
+"""
+
+from __future__ import division, print_function
+
+__copyright__ = """
+Copyright (C) 2015 Andreas Kloeckner
+Copyright (C) 2019 Matt Wala
+"""
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import logging
+import numpy as np
+import six
+import sys
+import pyopencl as cl
+import pyopencl.array  # noqa
+import pytest
+
+import dagrt.language as lang
+import pymbolic.primitives as p
+import grudge.symbolic.mappers as gmap
+import grudge.symbolic.operators as op
+from grudge.execution import ExecutionMapper
+from grudge.function_registry import base_function_registry
+from pymbolic.mapper import Mapper
+from pymbolic.mapper.evaluator import EvaluationMapper \
+        as PymbolicEvaluationMapper
+from pytools import memoize
+
+from grudge import sym, bind, DGDiscretizationWithBoundaries
+from leap.rk import LSRK4Method
+
+from pyopencl.tools import (  # noqa
+        pytest_generate_tests_for_pyopencl as pytest_generate_tests)
+
+
+logging.basicConfig(level=logging.INFO)
+
+logger = logging.getLogger(__name__)
+
+
+PRINT_RESULTS_TO_STDOUT = True
+
+
+# {{{ topological sort
+
+def topological_sort(stmts, root_deps):
+    id_to_stmt = {stmt.id: stmt for stmt in stmts}
+
+    ordered_stmts = []
+    satisfied = set()
+
+    def satisfy_dep(name):
+        if name in satisfied:
+            return
+
+        stmt = id_to_stmt[name]
+        for dep in sorted(stmt.depends_on):
+            satisfy_dep(dep)
+        ordered_stmts.append(stmt)
+        satisfied.add(name)
+
+    for d in root_deps:
+        satisfy_dep(d)
+
+    return ordered_stmts
+
+# }}}
+
+
+# {{{ leap to grudge translation
+
+# Use evaluation, not identity mappers to propagate symbolic vectors to
+# outermost level.
+
+class DagrtToGrudgeRewriter(PymbolicEvaluationMapper):
+    def __init__(self, context):
+        self.context = context
+
+    def map_variable(self, expr):
+        return self.context[expr.name]
+
+    def map_call(self, expr):
+        raise ValueError("function call not expected")
+
+
+class GrudgeArgSubstitutor(gmap.SymbolicEvaluator):
+    def __init__(self, args):
+        super().__init__(context={})
+        self.args = args
+
+    def map_grudge_variable(self, expr):
+        if expr.name in self.args:
+            return self.args[expr.name]
+        return super().map_variable(expr)
+
+
+def transcribe_phase(dag, field_var_name, field_components, phase_name,
+                     sym_operator):
+    """Generate a Grudge operator for a Dagrt time integrator phase.
+
+    Arguments:
+
+        dag: The Dagrt code object for the time integrator
+
+        field_var_name: The name of the simulation variable
+
+        field_components: The number of components (fields) in the variable
+
+        phase_name: The name of the phase to transcribe
+
+        sym_operator: The Grudge symbolic expression to substitue for the
+            right-hand side evaluation in the Dagrt code
+    """
+    sym_operator = gmap.OperatorBinder()(sym_operator)
+    phase = dag.phases[phase_name]
+
+    ctx = {
+            "<t>": sym.var("input_t", sym.DD_SCALAR),
+            "<dt>": sym.var("input_dt", sym.DD_SCALAR),
+            f"<state>{field_var_name}": sym.make_sym_array(
+                f"input_{field_var_name}", field_components),
+            f"<p>residual": sym.make_sym_array(
+                "input_residual", field_components),
+    }
+
+    rhs_name = f"<func>{field_var_name}"
+    output_vars = [v for v in ctx]
+    yielded_states = []
+
+    from dagrt.codegen.transform import isolate_function_calls_in_phase
+    ordered_stmts = topological_sort(
+            isolate_function_calls_in_phase(
+                phase,
+                dag.get_stmt_id_generator(),
+                dag.get_var_name_generator()).statements,
+            phase.depends_on)
+
+    for stmt in ordered_stmts:
+        if stmt.condition is not True:
+            raise NotImplementedError(
+                "non-True condition (in statement '%s') not supported"
+                % stmt.id)
+
+        if isinstance(stmt, lang.Nop):
+            pass
+
+        elif isinstance(stmt, lang.Assign):
+            if not isinstance(stmt.lhs, p.Variable):
+                raise NotImplementedError("lhs of statement %s is not a variable: %s"
+                        % (stmt.id, stmt.lhs))
+            ctx[stmt.lhs.name] = sym.cse(
+                    DagrtToGrudgeRewriter(ctx)(stmt.rhs),
+                    (
+                        stmt.lhs.name
+                        .replace("<", "")
+                        .replace(">", "")))
+
+        elif isinstance(stmt, lang.AssignFunctionCall):
+            if stmt.function_id != rhs_name:
+                raise NotImplementedError(
+                        "statement '%s' calls unsupported function '%s'"
+                        % (stmt.id, stmt.function_id))
+
+            if stmt.parameters:
+                raise NotImplementedError(
+                    "statement '%s' calls function '%s' with positional arguments"
+                    % (stmt.id, stmt.function_id))
+
+            kwargs = {name: sym.cse(DagrtToGrudgeRewriter(ctx)(arg))
+                      for name, arg in stmt.kw_parameters.items()}
+
+            if len(stmt.assignees) != 1:
+                raise NotImplementedError(
+                    "statement '%s' calls function '%s' "
+                    "with more than one LHS"
+                    % (stmt.id, stmt.function_id))
+
+            assignee, = stmt.assignees
+            ctx[assignee] = GrudgeArgSubstitutor(kwargs)(sym_operator)
+
+        elif isinstance(stmt, lang.YieldState):
+            d2g = DagrtToGrudgeRewriter(ctx)
+            yielded_states.append(
+                    (
+                        stmt.time_id,
+                        d2g(stmt.time),
+                        stmt.component_id,
+                        d2g(stmt.expression)))
+
+        else:
+            raise NotImplementedError("statement %s is of unsupported type ''%s'"
+                        % (stmt.id, type(stmt).__name__))
+
+    return output_vars, [ctx[ov] for ov in output_vars], yielded_states
+
+# }}}
+
+
+# {{{ time integrator implementations
+
+class RK4TimeStepperBase(object):
+
+    def __init__(self, queue, component_getter):
+        self.queue = queue
+        self.component_getter = component_getter
+
+    def get_initial_context(self, fields, t_start, dt):
+        from pytools.obj_array import join_fields
+
+        # Flatten fields.
+        flattened_fields = []
+        for field in fields:
+            if isinstance(field, list):
+                flattened_fields.extend(field)
+            else:
+                flattened_fields.append(field)
+        flattened_fields = join_fields(*flattened_fields)
+        del fields
+
+        return {
+                "input_t": t_start,
+                "input_dt": dt,
+                self.state_name: flattened_fields,
+                "input_residual": flattened_fields,
+        }
+
+    def set_up_stepper(self, discr, field_var_name, sym_rhs, num_fields,
+                       function_registry=base_function_registry,
+                       exec_mapper_factory=ExecutionMapper):
+        dt_method = LSRK4Method(component_id=field_var_name)
+        dt_code = dt_method.generate()
+        self.field_var_name = field_var_name
+        self.state_name = f"input_{field_var_name}"
+
+        # Transcribe the phase.
+        output_vars, results, yielded_states = transcribe_phase(
+                dt_code, field_var_name, num_fields,
+                "primary", sym_rhs)
+
+        # Build the bound operator for the time integrator.
+        output_t = results[0]
+        output_dt = results[1]
+        output_states = results[2]
+        output_residuals = results[3]
+
+        assert len(output_states) == num_fields
+        assert len(output_states) == len(output_residuals)
+
+        from pytools.obj_array import join_fields
+        flattened_results = join_fields(output_t, output_dt, *output_states)
+
+        self.bound_op = bind(
+                discr, flattened_results,
+                function_registry=function_registry,
+                exec_mapper_factory=exec_mapper_factory)
+
+    def run(self, fields, t_start, dt, t_end, return_profile_data=False):
+        context = self.get_initial_context(fields, t_start, dt)
+
+        t = t_start
+
+        while t <= t_end:
+            if return_profile_data:
+                profile_data = dict()
+            else:
+                profile_data = None
+
+            results = self.bound_op(
+                    self.queue,
+                    profile_data=profile_data,
+                    **context)
+
+            if return_profile_data:
+                results = results[0]
+
+            t = results[0]
+            context["input_t"] = t
+            context["input_dt"] = results[1]
+            output_states = results[2:]
+            context[self.state_name] = output_states
+
+            result = (t, self.component_getter(output_states))
+            if return_profile_data:
+                result += (profile_data,)
+
+            yield result
+
+
+class RK4TimeStepper(RK4TimeStepperBase):
+
+    def __init__(self, queue, discr, field_var_name, grudge_bound_op,
+                 num_fields, component_getter, exec_mapper_factory=ExecutionMapper):
+        """Arguments:
+
+            field_var_name: The name of the simulation variable
+
+            grudge_bound_op: The BoundExpression for the right-hand side
+
+            num_fields: The number of components in the simulation variable
+
+            component_getter: A function, which, given an object array
+               representing the simulation variable, splits the array into
+               its components
+
+        """
+        super().__init__(queue, component_getter)
+
+        # Construct sym_rhs to have the effect of replacing the RHS calls in the
+        # dagrt code with calls of the grudge operator.
+        from grudge.symbolic.primitives import FunctionSymbol, Variable
+        call = sym.cse(
+                FunctionSymbol("grudge_op")(*(
+                    (Variable("t", dd=sym.DD_SCALAR),)
+                    + tuple(
+                        Variable(field_var_name, dd=sym.DD_VOLUME)[i]
+                        for i in range(num_fields)))))
+
+        from pytools.obj_array import join_fields
+        sym_rhs = join_fields(*(call[i] for i in range(num_fields)))
+
+        self.queue = queue
+        self.grudge_bound_op = grudge_bound_op
+
+        from grudge.function_registry import register_external_function
+
+        freg = register_external_function(
+                base_function_registry,
+                "grudge_op",
+                implementation=self._bound_op,
+                dd=sym.DD_VOLUME)
+
+        self.set_up_stepper(
+                discr, field_var_name, sym_rhs, num_fields,
+                freg,
+                exec_mapper_factory)
+
+        self.component_getter = component_getter
+
+    def _bound_op(self, queue, t, *args, profile_data=None):
+        from pytools.obj_array import join_fields
+        context = {
+                "t": t,
+                self.field_var_name: join_fields(*args)}
+        result = self.grudge_bound_op(
+                queue, profile_data=profile_data, **context)
+        if profile_data is not None:
+            result = result[0]
+        return result
+
+    def get_initial_context(self, fields, t_start, dt):
+        context = super().get_initial_context(fields, t_start, dt)
+        context["grudge_op"] = self._bound_op
+        return context
+
+
+class FusedRK4TimeStepper(RK4TimeStepperBase):
+
+    def __init__(self, queue, discr, field_var_name, sym_rhs, num_fields,
+                 component_getter, exec_mapper_factory=ExecutionMapper):
+        super().__init__(queue, component_getter)
+        self.set_up_stepper(
+                discr, field_var_name, sym_rhs, num_fields,
+                base_function_registry,
+                exec_mapper_factory)
+
+# }}}
+
+
+# {{{ problem setup code
+
+def get_strong_wave_op_with_discr(cl_ctx, dims=2, order=4):
+    from meshmode.mesh.generation import generate_regular_rect_mesh
+    mesh = generate_regular_rect_mesh(
+            a=(-0.5,)*dims,
+            b=(0.5,)*dims,
+            n=(16,)*dims)
+
+    logger.debug("%d elements", mesh.nelements)
+
+    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order)
+
+    source_center = np.array([0.1, 0.22, 0.33])[:dims]
+    source_width = 0.05
+    source_omega = 3
+
+    sym_x = sym.nodes(mesh.dim)
+    sym_source_center_dist = sym_x - source_center
+    sym_t = sym.ScalarVariable("t")
+
+    from grudge.models.wave import StrongWaveOperator
+    from meshmode.mesh import BTAG_ALL, BTAG_NONE
+    op = StrongWaveOperator(-0.1, dims,
+            source_f=(
+                sym.sin(source_omega*sym_t)
+                * sym.exp(
+                    -np.dot(sym_source_center_dist, sym_source_center_dist)
+                    / source_width**2)),
+            dirichlet_tag=BTAG_NONE,
+            neumann_tag=BTAG_NONE,
+            radiation_tag=BTAG_ALL,
+            flux_type="upwind")
+
+    op.check_bc_coverage(mesh)
+
+    return (op, discr)
+
+
+def get_strong_wave_component(state_component):
+    return (state_component[0], state_component[1:])
+
+# }}}
+
+
+# {{{ equivalence check between fused and non-fused versions
+
+def test_stepper_equivalence(ctx_factory, order=4):
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+
+    dims = 2
+
+    op, discr = get_strong_wave_op_with_discr(cl_ctx, dims=dims, order=order)
+
+    if dims == 2:
+        dt = 0.04
+    elif dims == 3:
+        dt = 0.02
+
+    from pytools.obj_array import join_fields
+    ic = join_fields(discr.zeros(queue),
+            [discr.zeros(queue) for i in range(discr.dim)])
+
+    bound_op = bind(discr, op.sym_operator())
+
+    stepper = RK4TimeStepper(
+            queue, discr, "w", bound_op, 1 + discr.dim, get_strong_wave_component)
+
+    fused_stepper = FusedRK4TimeStepper(
+            queue, discr, "w", op.sym_operator(), 1 + discr.dim,
+            get_strong_wave_component)
+
+    t_start = 0
+    t_end = 0.5
+    nsteps = int(np.ceil((t_end + 1e-9) / dt))
+    print("dt=%g nsteps=%d" % (dt, nsteps))
+
+    step = 0
+
+    norm = bind(discr, sym.norm(2, sym.var("u_ref") - sym.var("u")))
+
+    fused_steps = fused_stepper.run(ic, t_start, dt, t_end)
+
+    for t_ref, (u_ref, v_ref) in stepper.run(ic, t_start, dt, t_end):
+        step += 1
+        logger.debug("step %d/%d", step, nsteps)
+        t, (u, v) = next(fused_steps)
+        assert t == t_ref, step
+        assert norm(queue, u=u, u_ref=u_ref) <= 1e-13, step
+
+# }}}
+
+
+# {{{ execution mapper wrapper
+
+class ExecutionMapperWrapper(Mapper):
+
+    def __init__(self, queue, context, bound_op):
+        self.inner_mapper = ExecutionMapper(queue, context, bound_op)
+        self.queue = queue
+        self.context = context
+        self.bound_op = bound_op
+
+    def map_variable(self, expr):
+        # Needed, because bound op execution can ask for variable values.
+        return self.inner_mapper.map_variable(expr)
+
+    def map_grudge_variable(self, expr):
+        # See map_variable()
+        return self.inner_mapper.map_grudge_variable(expr)
+
+# }}}
+
+
+# {{{ mem op counter implementation
+
+class ExecutionMapperWithMemOpCounting(ExecutionMapperWrapper):
+    # This is a skeleton implementation that only has just enough functionality
+    # for the wave-min example to work.
+
+    # {{{ expressions
+
+    def map_profiled_call(self, expr, profile_data):
+        args = [self.inner_mapper.rec(p) for p in expr.parameters]
+        return self.inner_mapper.function_registry[expr.function.name](
+                self.queue, *args, profile_data=profile_data)
+
+    def map_profiled_essentially_elementwise_linear(self, op, field_expr,
+                                                    profile_data):
+        result = getattr(self.inner_mapper, op.mapper_method)(op, field_expr)
+
+        if profile_data is not None:
+            # We model the cost to load the input and write the output.  In
+            # particular, we assume the elementwise matrices are negligible in
+            # size and thus ignorable.
+
+            field = self.inner_mapper.rec(field_expr)
+            profile_data["bytes_read"] = (
+                    profile_data.get("bytes_read", 0) + field.nbytes)
+            profile_data["bytes_written"] = (
+                    profile_data.get("bytes_written", 0) + result.nbytes)
+
+            if op.mapper_method == "map_interpolation":
+                profile_data["interp_bytes_read"] = (
+                        profile_data.get("interp_bytes_read", 0) + field.nbytes)
+                profile_data["interp_bytes_written"] = (
+                        profile_data.get("interp_bytes_written", 0) + result.nbytes)
+
+        return result
+
+    # }}}
+
+    # {{{ instruction mappings
+
+    def process_assignment_expr(self, expr, profile_data):
+        if isinstance(expr, p.Call):
+            assert expr.mapper_method == "map_call"
+            val = self.map_profiled_call(expr, profile_data)
+
+        elif isinstance(expr, sym.OperatorBinding):
+            if isinstance(
+                    expr.op,
+                    (
+                        # TODO: Not comprehensive.
+                        op.InterpolationOperator,
+                        op.RefFaceMassOperator,
+                        op.RefInverseMassOperator,
+                        op.OppositeInteriorFaceSwap)):
+                val = self.map_profiled_essentially_elementwise_linear(
+                        expr.op, expr.field, profile_data)
+
+            else:
+                assert False, ("unknown operator: %s" % expr.op)
+
+        else:
+            logger.debug("assignment not profiled: %s", expr)
+            val = self.inner_mapper.rec(expr)
+
+        return val
+
+    def map_insn_assign(self, insn, profile_data):
+        result = []
+        for name, expr in zip(insn.names, insn.exprs):
+            result.append((name, self.process_assignment_expr(expr, profile_data)))
+        return result, []
+
+    def map_insn_loopy_kernel(self, insn, profile_data):
+        kwargs = {}
+        kdescr = insn.kernel_descriptor
+        for name, expr in six.iteritems(kdescr.input_mappings):
+            val = self.inner_mapper.rec(expr)
+            kwargs[name] = val
+            assert not isinstance(val, np.ndarray)
+            if profile_data is not None and isinstance(val, pyopencl.array.Array):
+                profile_data["bytes_read"] = (
+                        profile_data.get("bytes_read", 0) + val.nbytes)
+                profile_data["bytes_read_by_scalar_assignments"] = (
+                        profile_data.get("bytes_read_by_scalar_assignments", 0)
+                        + val.nbytes)
+
+        discr = self.inner_mapper.discrwb.discr_from_dd(kdescr.governing_dd)
+        for name in kdescr.scalar_args():
+            v = kwargs[name]
+            if isinstance(v, (int, float)):
+                kwargs[name] = discr.real_dtype.type(v)
+            elif isinstance(v, complex):
+                kwargs[name] = discr.complex_dtype.type(v)
+            elif isinstance(v, np.number):
+                pass
+            else:
+                raise ValueError("unrecognized scalar type for variable '%s': %s"
+                        % (name, type(v)))
+
+        kwargs["grdg_n"] = discr.nnodes
+        evt, result_dict = kdescr.loopy_kernel(self.queue, **kwargs)
+
+        for val in result_dict.values():
+            assert not isinstance(val, np.ndarray)
+            if profile_data is not None and isinstance(val, pyopencl.array.Array):
+                profile_data["bytes_written"] = (
+                        profile_data.get("bytes_written", 0) + val.nbytes)
+                profile_data["bytes_written_by_scalar_assignments"] = (
+                        profile_data.get("bytes_written_by_scalar_assignments", 0)
+                        + val.nbytes)
+
+        return list(result_dict.items()), []
+
+    def map_insn_assign_to_discr_scoped(self, insn, profile_data=None):
+        assignments = []
+
+        for name, expr in zip(insn.names, insn.exprs):
+            logger.debug("assignment not profiled: %s <- %s", name, expr)
+            inner_mapper = self.inner_mapper
+            value = inner_mapper.rec(expr)
+            inner_mapper.discrwb._discr_scoped_subexpr_name_to_value[name] = value
+            assignments.append((name, value))
+
+        return assignments, []
+
+    def map_insn_assign_from_discr_scoped(self, insn, profile_data=None):
+        return [(
+            insn.name,
+            self.inner_mapper.
+                discrwb._discr_scoped_subexpr_name_to_value[insn.name])], []
+
+    def map_insn_rank_data_swap(self, insn, profile_data):
+        raise NotImplementedError("no profiling for instruction: %s" % insn)
+
+    def map_insn_diff_batch_assign(self, insn, profile_data):
+        assignments, futures = self.inner_mapper.map_insn_diff_batch_assign(insn)
+
+        if profile_data is not None:
+            # We model the cost to load the input and write the output.  In
+            # particular, we assume the elementwise matrices are negligible in
+            # size and thus ignorable.
+
+            field = self.inner_mapper.rec(insn.field)
+            profile_data["bytes_read"] = (
+                    profile_data.get("bytes_read", 0) + field.nbytes)
+
+            for _, value in assignments:
+                profile_data["bytes_written"] = (
+                        profile_data.get("bytes_written", 0) + value.nbytes)
+
+        return assignments, futures
+
+    # }}}
+
+# }}}
+
+
+# {{{ mem op counter check
+
+def test_assignment_memory_model(ctx_factory):
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+
+    _, discr = get_strong_wave_op_with_discr(cl_ctx, dims=2, order=3)
+
+    # Assignment instruction
+    bound_op = bind(
+            discr,
+            sym.Variable("input0", sym.DD_VOLUME)
+            + sym.Variable("input1", sym.DD_VOLUME),
+            exec_mapper_factory=ExecutionMapperWithMemOpCounting)
+
+    input0 = discr.zeros(queue)
+    input1 = discr.zeros(queue)
+
+    result, profile_data = bound_op(
+            queue,
+            profile_data={},
+            input0=input0,
+            input1=input1)
+
+    assert profile_data["bytes_read"] == input0.nbytes + input1.nbytes
+    assert profile_data["bytes_written"] == result.nbytes
+
+
+@pytest.mark.parametrize("use_fusion", (True, False))
+def test_stepper_mem_ops(ctx_factory, use_fusion):
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(cl_ctx)
+
+    dims = 2
+
+    op, discr = get_strong_wave_op_with_discr(cl_ctx, dims=dims, order=3)
+
+    t_start = 0
+    dt = 0.04
+    t_end = 0.2
+
+    from pytools.obj_array import join_fields
+    ic = join_fields(discr.zeros(queue),
+            [discr.zeros(queue) for i in range(discr.dim)])
+
+    if not use_fusion:
+        bound_op = bind(
+                discr, op.sym_operator(),
+                exec_mapper_factory=ExecutionMapperWithMemOpCounting)
+
+        stepper = RK4TimeStepper(
+                queue, discr, "w", bound_op, 1 + discr.dim,
+                get_strong_wave_component,
+                exec_mapper_factory=ExecutionMapperWithMemOpCounting)
+
+    else:
+        stepper = FusedRK4TimeStepper(
+                queue, discr, "w", op.sym_operator(), 1 + discr.dim,
+                get_strong_wave_component,
+                exec_mapper_factory=ExecutionMapperWithMemOpCounting)
+
+    step = 0
+
+    nsteps = int(np.ceil((t_end + 1e-9) / dt))
+    for (_, _, profile_data) in stepper.run(
+            ic, t_start, dt, t_end, return_profile_data=True):
+        step += 1
+        logger.info("step %d/%d", step, nsteps)
+
+    logger.info("using fusion? %s", use_fusion)
+    logger.info("bytes read: %d", profile_data["bytes_read"])
+    logger.info("bytes written: %d", profile_data["bytes_written"])
+    logger.info("bytes total: %d",
+            profile_data["bytes_read"] + profile_data["bytes_written"])
+
+# }}}
+
+
+# {{{ execution mapper with timing
+
+SECONDS_PER_NANOSECOND = 10**9
+
+
+class TimingFuture(object):
+
+    def __init__(self, start_event, stop_event):
+        self.start_event = start_event
+        self.stop_event = stop_event
+
+    def elapsed(self):
+        cl.wait_for_events([self.start_event, self.stop_event])
+        return (
+                self.stop_event.profile.end
+                - self.start_event.profile.end) / SECONDS_PER_NANOSECOND
+
+
+from collections.abc import MutableSequence
+
+
+class TimingFutureList(MutableSequence):
+
+    def __init__(self, *args, **kwargs):
+        self._list = list(*args, **kwargs)
+
+    def __len__(self):
+        return len(self._list)
+
+    def __getitem__(self, idx):
+        return self._list[idx]
+
+    def __setitem__(self, idx, val):
+        self._list[idx] = val
+
+    def __delitem__(self, idx):
+        del self._list[idx]
+
+    def insert(self, idx, val):
+        self._list.insert(idx, val)
+
+    def elapsed(self):
+        return sum(future.elapsed() for future in self._list)
+
+
+def time_insn(f):
+    time_field_name = "time_%s" % f.__name__
+
+    def wrapper(self, insn, profile_data):
+        if profile_data is None:
+            return f(self, insn, profile_data)
+
+        start = cl.enqueue_marker(self.queue)
+        retval = f(self, insn, profile_data)
+        end = cl.enqueue_marker(self.queue)
+        profile_data\
+                .setdefault(time_field_name, TimingFutureList())\
+                .append(TimingFuture(start, end))
+
+        return retval
+
+    return wrapper
+
+
+class ExecutionMapperWithTiming(ExecutionMapperWrapper):
+
+    def map_profiled_call(self, expr, profile_data):
+        args = [self.inner_mapper.rec(p) for p in expr.parameters]
+        return self.inner_mapper.function_registry[expr.function.name](
+                self.queue, *args, profile_data=profile_data)
+
+    def map_profiled_operator_binding(self, expr, profile_data):
+        if profile_data is None:
+            return self.inner_mapper.map_operator_binding(expr)
+
+        start = cl.enqueue_marker(self.queue)
+        retval = self.inner_mapper.map_operator_binding(expr)
+        end = cl.enqueue_marker(self.queue)
+        time_field_name = "time_op_%s" % expr.op.mapper_method
+        profile_data\
+                .setdefault(time_field_name, TimingFutureList())\
+                .append(TimingFuture(start, end))
+
+        return retval
+
+    def map_insn_assign_to_discr_scoped(self, insn, profile_data):
+        return self.inner_mapper.map_insn_assign_to_discr_scoped(insn, profile_data)
+
+    def map_insn_assign_from_discr_scoped(self, insn, profile_data):
+        return self.\
+            inner_mapper.map_insn_assign_from_discr_scoped(insn, profile_data)
+
+    @time_insn
+    def map_insn_loopy_kernel(self, *args, **kwargs):
+        return self.inner_mapper.map_insn_loopy_kernel(*args, **kwargs)
+
+    def map_insn_assign(self, insn, profile_data):
+        if len(insn.exprs) == 1:
+            if isinstance(insn.exprs[0], p.Call):
+                assert insn.exprs[0].mapper_method == "map_call"
+                val = self.map_profiled_call(insn.exprs[0], profile_data)
+                return [(insn.names[0], val)], []
+            elif isinstance(insn.exprs[0], sym.OperatorBinding):
+                assert insn.exprs[0].mapper_method == "map_operator_binding"
+                val = self.map_profiled_operator_binding(insn.exprs[0], profile_data)
+                return [(insn.names[0], val)], []
+
+        return self.inner_mapper.map_insn_assign(insn, profile_data)
+
+    @time_insn
+    def map_insn_diff_batch_assign(self, insn, profile_data):
+        return self.inner_mapper.map_insn_diff_batch_assign(insn, profile_data)
+
+# }}}
+
+
+# {{{ timing check
+
+@pytest.mark.parametrize("use_fusion", (True, False))
+def test_stepper_timing(ctx_factory, use_fusion):
+    cl_ctx = ctx_factory()
+    queue = cl.CommandQueue(
+            cl_ctx,
+            properties=cl.command_queue_properties.PROFILING_ENABLE)
+
+    dims = 3
+
+    op, discr = get_strong_wave_op_with_discr(cl_ctx, dims=dims, order=3)
+
+    t_start = 0
+    dt = 0.04
+    t_end = 0.1
+
+    from pytools.obj_array import join_fields
+    ic = join_fields(discr.zeros(queue),
+            [discr.zeros(queue) for i in range(discr.dim)])
+
+    if not use_fusion:
+        bound_op = bind(
+                discr, op.sym_operator(),
+                exec_mapper_factory=ExecutionMapperWithTiming)
+
+        stepper = RK4TimeStepper(
+                queue, discr, "w", bound_op, 1 + discr.dim,
+                get_strong_wave_component,
+                exec_mapper_factory=ExecutionMapperWithTiming)
+
+    else:
+        stepper = FusedRK4TimeStepper(
+                queue, discr, "w", op.sym_operator(), 1 + discr.dim,
+                get_strong_wave_component,
+                exec_mapper_factory=ExecutionMapperWithTiming)
+
+    step = 0
+
+    import time
+    t = time.time()
+    nsteps = int(np.ceil((t_end + 1e-9) / dt))
+    for (_, _, profile_data) in stepper.run(
+            ic, t_start, dt, t_end, return_profile_data=True):
+        step += 1
+        tn = time.time()
+        logger.info("step %d/%d: %f", step, nsteps, tn - t)
+        t = tn
+
+    logger.info("fusion? %s", use_fusion)
+    for key, value in profile_data.items():
+        if isinstance(value, TimingFutureList):
+            print(key, value.elapsed())
+
+# }}}
+
+
+# {{{ paper outputs
+
+def get_example_stepper(queue, dims=2, order=3, use_fusion=True,
+                        exec_mapper_factory=ExecutionMapper,
+                        return_ic=False):
+    op, discr = get_strong_wave_op_with_discr(queue.context, dims=dims, order=3)
+
+    if not use_fusion:
+        bound_op = bind(
+                discr, op.sym_operator(),
+                exec_mapper_factory=exec_mapper_factory)
+
+        stepper = RK4TimeStepper(
+                queue, discr, "w", bound_op, 1 + discr.dim,
+                get_strong_wave_component,
+                exec_mapper_factory=exec_mapper_factory)
+
+    else:
+        stepper = FusedRK4TimeStepper(
+                queue, discr, "w", op.sym_operator(), 1 + discr.dim,
+                get_strong_wave_component,
+                exec_mapper_factory=exec_mapper_factory)
+
+    if return_ic:
+        from pytools.obj_array import join_fields
+        ic = join_fields(discr.zeros(queue),
+                [discr.zeros(queue) for i in range(discr.dim)])
+        return stepper, ic
+
+    return stepper
+
+
+def latex_table(table_format, header, rows):
+    result = []
+    _ = result.append
+    _(rf"\begin{{tabular}}{{{table_format}}}")
+    _(r"\toprule")
+    _(" & ".join(rf"\multicolumn{{1}}{{c}}{{{item}}}" for item in header) + r" \\")
+    _(r"\midrule")
+    for row in rows:
+        _(" & ".join(row) + r" \\")
+    _(r"\bottomrule")
+    _(r"\end{tabular}")
+    return "\n".join(result)
+
+
+def ascii_table(table_format, header, rows):
+    from pytools import Table
+    table = Table()
+    table.add_row(header)
+
+    for input_row in rows:
+        row = []
+        for item in input_row:
+            if item.startswith(r"\num{"):
+                # Strip \num{...} formatting
+                row.append(item[5:-1])
+            else:
+                row.append(item)
+        table.add_row(row)
+
+    return str(table)
+
+
+if PRINT_RESULTS_TO_STDOUT:
+    table = ascii_table
+else:
+    table = latex_table
+
+
+def problem_stats(order=3):
+    cl_ctx = cl.create_some_context()
+
+    _, dg_discr_2d = get_strong_wave_op_with_discr(cl_ctx, dims=2, order=order)
+    print("Number of 2D elements:", dg_discr_2d.mesh.nelements)
+    vol_discr_2d = dg_discr_2d.discr_from_dd("vol")
+    dofs_2d = {group.nunit_nodes for group in vol_discr_2d.groups}
+    from pytools import one
+    print("Number of DOFs per 2D element:", one(dofs_2d))
+
+    _, dg_discr_3d = get_strong_wave_op_with_discr(cl_ctx, dims=3, order=order)
+    print("Number of 3D elements:", dg_discr_3d.mesh.nelements)
+    vol_discr_3d = dg_discr_3d.discr_from_dd("vol")
+    dofs_3d = {group.nunit_nodes for group in vol_discr_3d.groups}
+    from pytools import one
+    print("Number of DOFs per 3D element:", one(dofs_3d))
+
+
+def statement_counts_table():
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+
+    fused_stepper = get_example_stepper(queue, use_fusion=True)
+    stepper = get_example_stepper(queue, use_fusion=False)
+
+    if PRINT_RESULTS_TO_STDOUT:
+        print("==== Statement Counts ====")
+        outf = sys.stdout
+    else:
+        out_path = "statement-counts.tex"
+        outf = open(out_path, "w")
+
+    print(
+        table(
+            "lr",
+            ("Operator", "Grudge Node Count"),
+            (
+                ("Time integration: baseline",
+                 r"\num{%d}" % len(stepper.bound_op.eval_code.instructions)),
+                ("Right-hand side: baseline",
+                 r"\num{%d}" % len(stepper.grudge_bound_op.eval_code.instructions)),
+                ("Inlined operator",
+                 r"\num{%d}" % len(fused_stepper.bound_op.eval_code.instructions))
+            )),
+        file=outf)
+
+
+@memoize(key=lambda queue, dims: dims)
+def mem_ops_results(queue, dims):
+    fused_stepper = get_example_stepper(
+            queue,
+            dims=dims,
+            use_fusion=True,
+            exec_mapper_factory=ExecutionMapperWithMemOpCounting)
+
+    stepper, ic = get_example_stepper(
+            queue,
+            dims=dims,
+            use_fusion=False,
+            exec_mapper_factory=ExecutionMapperWithMemOpCounting,
+            return_ic=True)
+
+    t_start = 0
+    dt = 0.02
+    t_end = 0.02
+
+    result = {}
+
+    for (_, _, profile_data) in stepper.run(
+            ic, t_start, dt, t_end, return_profile_data=True):
+        pass
+
+    result["nonfused_bytes_read"] = profile_data["bytes_read"]
+    result["nonfused_bytes_written"] = profile_data["bytes_written"]
+    result["nonfused_bytes_total"] = \
+            result["nonfused_bytes_read"] \
+            + result["nonfused_bytes_written"]
+
+    result["nonfused_bytes_read_by_scalar_assignments"] = \
+            profile_data["bytes_read_by_scalar_assignments"]
+    result["nonfused_bytes_written_by_scalar_assignments"] = \
+            profile_data["bytes_written_by_scalar_assignments"]
+    result["nonfused_bytes_total_by_scalar_assignments"] = \
+            result["nonfused_bytes_read_by_scalar_assignments"] \
+            + result["nonfused_bytes_written_by_scalar_assignments"]
+
+    for (_, _, profile_data) in fused_stepper.run(
+            ic, t_start, dt, t_end, return_profile_data=True):
+        pass
+
+    result["fused_bytes_read"] = profile_data["bytes_read"]
+    result["fused_bytes_written"] = profile_data["bytes_written"]
+    result["fused_bytes_total"] = \
+            result["fused_bytes_read"] \
+            + result["fused_bytes_written"]
+
+    result["fused_bytes_read_by_scalar_assignments"] = \
+            profile_data["bytes_read_by_scalar_assignments"]
+    result["fused_bytes_written_by_scalar_assignments"] = \
+            profile_data["bytes_written_by_scalar_assignments"]
+    result["fused_bytes_total_by_scalar_assignments"] = \
+            result["fused_bytes_read_by_scalar_assignments"] \
+            + result["fused_bytes_written_by_scalar_assignments"]
+
+    return result
+
+
+def scalar_assignment_percent_of_total_mem_ops_table():
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+
+    result2d = mem_ops_results(queue, 2)
+    result3d = mem_ops_results(queue, 3)
+
+    if PRINT_RESULTS_TO_STDOUT:
+        print("==== Scalar Assigment % of Total Mem Ops ====")
+        outf = sys.stdout
+    else:
+        out_path = "scalar-assignments-mem-op-percentage.tex"
+        outf = open(out_path, "w")
+
+    print(
+        table(
+            "lr",
+            ("Operator",
+             r"\parbox{1in}{\centering \% Memory Ops. Due to Scalar Assignments}"),
+            (
+                ("2D: Baseline",
+                 "%.1f" % (
+                        100 * result2d["nonfused_bytes_total_by_scalar_assignments"]
+                        / result2d["nonfused_bytes_total"])),
+                ("2D: Inlined",
+                 "%.1f" % (
+                        100 * result2d["fused_bytes_total_by_scalar_assignments"]
+                        / result2d["fused_bytes_total"])),
+                ("3D: Baseline",
+                 "%.1f" % (
+                        100 * result3d["nonfused_bytes_total_by_scalar_assignments"]
+                        / result3d["nonfused_bytes_total"])),
+                ("3D: Inlined",
+                  "%.1f" % (
+                        100 * result3d["fused_bytes_total_by_scalar_assignments"]
+                        / result3d["fused_bytes_total"])),
+            )),
+        file=outf)
+
+
+def scalar_assignment_effect_of_fusion_mem_ops_table():
+    cl_ctx = cl.create_some_context()
+    queue = cl.CommandQueue(cl_ctx)
+
+    result2d = mem_ops_results(queue, 2)
+    result3d = mem_ops_results(queue, 3)
+
+    if PRINT_RESULTS_TO_STDOUT:
+        print("==== Scalar Assigment Inlining Impact ====")
+        outf = sys.stdout
+    else:
+        out_path = "scalar-assignments-mem-op-percentage.tex"
+        outf = open(out_path, "w")
+
+    print(
+        table(
+            "lrrrr",
+            ("Operator",
+             r"Bytes Read",
+             r"Bytes Written",
+             r"Total",
+             r"\% of Baseline"),
+            (
+                ("2D: Baseline",
+                 r"\num{%d}" % (
+                     result2d["nonfused_bytes_read_by_scalar_assignments"]),
+                 r"\num{%d}" % (
+                     result2d["nonfused_bytes_written_by_scalar_assignments"]),
+                 r"\num{%d}" % (
+                     result2d["nonfused_bytes_total_by_scalar_assignments"]),
+                 "100"),
+                ("2D: Inlined",
+                 r"\num{%d}" % (
+                     result2d["fused_bytes_read_by_scalar_assignments"]),
+                 r"\num{%d}" % (
+                     result2d["fused_bytes_written_by_scalar_assignments"]),
+                 r"\num{%d}" % (
+                     result2d["fused_bytes_total_by_scalar_assignments"]),
+                 r"%.1f" % (
+                     100 * result2d["fused_bytes_total_by_scalar_assignments"]
+                     / result2d["nonfused_bytes_total_by_scalar_assignments"])),
+                ("3D: Baseline",
+                 r"\num{%d}" % (
+                     result3d["nonfused_bytes_read_by_scalar_assignments"]),
+                 r"\num{%d}" % (
+                     result3d["nonfused_bytes_written_by_scalar_assignments"]),
+                 r"\num{%d}" % (
+                     result3d["nonfused_bytes_total_by_scalar_assignments"]),
+                 "100"),
+                ("3D: Inlined",
+                 r"\num{%d}" % (
+                     result3d["fused_bytes_read_by_scalar_assignments"]),
+                 r"\num{%d}" % (
+                     result3d["fused_bytes_written_by_scalar_assignments"]),
+                 r"\num{%d}" % (
+                     result3d["fused_bytes_total_by_scalar_assignments"]),
+                 r"%.1f" % (
+                     100 * result3d["fused_bytes_total_by_scalar_assignments"]
+                     / result3d["nonfused_bytes_total_by_scalar_assignments"])),
+            )),
+        file=outf)
+
+# }}}
+
+
+def main():
+    if 1:
+        # Run tests.
+        from py.test import main
+        result = main([__file__])
+        assert result == 0
+
+    # Run examples.
+    problem_stats()
+    statement_counts_table()
+    scalar_assignment_percent_of_total_mem_ops_table()
+    scalar_assignment_effect_of_fusion_mem_ops_table()
+
+
+if __name__ == "__main__":
+    main()
diff --git a/examples/geometry.py b/examples/geometry.py
index e10254bfb9b10323b61c150dd5f9105a2525df79..6e146f21dd31545b392d8006d45d94bed7a9db02 100644
--- a/examples/geometry.py
+++ b/examples/geometry.py
@@ -26,7 +26,7 @@ THE SOFTWARE.
 
 import numpy as np  # noqa
 import pyopencl as cl
-from grudge import sym, bind, Discretization, shortcuts
+from grudge import sym, bind, DGDiscretizationWithBoundaries, shortcuts
 
 
 def main(write_output=True):
@@ -36,14 +36,14 @@ def main(write_output=True):
     from meshmode.mesh.generation import generate_warped_rect_mesh
     mesh = generate_warped_rect_mesh(dim=2, order=4, n=6)
 
-    discr = Discretization(cl_ctx, mesh, order=4)
+    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=4)
 
     sym_op = sym.normal(sym.BTAG_ALL, mesh.dim)
     #sym_op = sym.nodes(mesh.dim, where=sym.BTAG_ALL)
     print(sym.pretty(sym_op))
     op = bind(discr, sym_op)
     print()
-    print(op.code)
+    print(op.eval_code)
 
     vec = op(queue)
 
diff --git a/examples/maxwell/cavities.py b/examples/maxwell/cavities.py
index 85a7c3dec0cef99b4a72fe9c23439c4212c0c32d..a58f27399ccfb6e32a16d6cf594ed2a3c78076b2 100644
--- a/examples/maxwell/cavities.py
+++ b/examples/maxwell/cavities.py
@@ -28,11 +28,14 @@ THE SOFTWARE.
 import numpy as np
 import pyopencl as cl
 from grudge.shortcuts import set_up_rk4
-from grudge import sym, bind, Discretization
+from grudge import sym, bind, DGDiscretizationWithBoundaries
 
 from grudge.models.em import get_rectangular_cavity_mode
 
 
+STEPS = 60
+
+
 def main(dims, write_output=True, order=4):
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
@@ -43,7 +46,7 @@ def main(dims, write_output=True, order=4):
             b=(1.0,)*dims,
             n=(5,)*dims)
 
-    discr = Discretization(cl_ctx, mesh, order=order)
+    discr = DGDiscretizationWithBoundaries(cl_ctx, mesh, order=order)
 
     if 0:
         epsilon0 = 8.8541878176e-12  # C**2 / (N m**2)
@@ -84,7 +87,7 @@ def main(dims, write_output=True, order=4):
 
     dt_stepper = set_up_rk4("w", dt, fields, rhs)
 
-    final_t = dt * 600
+    final_t = dt * STEPS
     nsteps = int(final_t/dt)
 
     print("dt=%g nsteps=%d" % (dt, nsteps))
@@ -102,7 +105,7 @@ def main(dims, write_output=True, order=4):
     e, h = op.split_eh(fields)
 
     if 1:
-        vis.write_vtk_file("fld-%04d.vtu" % step,
+        vis.write_vtk_file("fld-cavities-%04d.vtu" % step,
                 [
                     ("e", e),
                     ("h", h),
@@ -119,7 +122,7 @@ def main(dims, write_output=True, order=4):
                     time()-t_last_step)
             if step % 10 == 0:
                 e, h = op.split_eh(event.state_component)
-                vis.write_vtk_file("fld-%04d.vtu" % step,
+                vis.write_vtk_file("fld-cavities-%04d.vtu" % step,
                         [
                             ("e", e),
                             ("h", h),
diff --git a/examples/wave/var-propagation-speed.py b/examples/wave/var-propagation-speed.py
index f161d0346c1349e508b5ed5a32737e1cccf61312..a7a634f7a8b79f3006525059e70cc4379581127c 100644
--- a/examples/wave/var-propagation-speed.py
+++ b/examples/wave/var-propagation-speed.py
@@ -35,7 +35,7 @@ def main(write_output=True, order=4):
     cl_ctx = cl.create_some_context()
     queue = cl.CommandQueue(cl_ctx)
 
-    dims = 3
+    dims = 2
     from meshmode.mesh.generation import generate_regular_rect_mesh
     mesh = generate_regular_rect_mesh(
             a=(-0.5,)*dims,
@@ -90,7 +90,7 @@ def main(write_output=True, order=4):
 
     dt_stepper = set_up_rk4("w", dt, fields, rhs)
 
-    final_t = 10
+    final_t = 1
     nsteps = int(final_t/dt)
     print("dt=%g nsteps=%d" % (dt, nsteps))
 
@@ -113,7 +113,7 @@ def main(write_output=True, order=4):
             print(step, event.t, norm(queue, u=event.state_component[0]),
                     time()-t_last_step)
             if step % 10 == 0:
-                vis.write_vtk_file("fld-%04d.vtu" % step,
+                vis.write_vtk_file("fld-var-propogation-speed-%04d.vtu" % step,
                         [
                             ("u", event.state_component[0]),
                             ("v", event.state_component[1:]),
diff --git a/examples/wave/wave-min-mpi.py b/examples/wave/wave-min-mpi.py
index 04d0b8a371bef57013441932343085a22659782b..08705655cf874bf7701bb1c3f998c0d6ea7a6e06 100644
--- a/examples/wave/wave-min-mpi.py
+++ b/examples/wave/wave-min-mpi.py
@@ -134,7 +134,7 @@ def main(write_output=True, order=4):
                     time()-t_last_step)
             if step % 10 == 0:
                 vis.write_vtk_file(
-                        "fld-%03d-%04d.vtu" % (
+                        "fld-wave-min-mpi-%03d-%04d.vtu" % (
                             rank,
                             step,
                             ),
diff --git a/examples/wave/wave-min.py b/examples/wave/wave-min.py
index d7ebffd33a7732c621d159622804fed064f365b3..d793e3a09716b883d91c90f1fddfb2f96bacf1a2 100644
--- a/examples/wave/wave-min.py
+++ b/examples/wave/wave-min.py
@@ -113,7 +113,7 @@ def main(write_output=True, order=4):
             print(step, event.t, norm(queue, u=event.state_component[0]),
                     time()-t_last_step)
             if step % 10 == 0:
-                vis.write_vtk_file("fld-%04d.vtu" % step,
+                vis.write_vtk_file("fld-wave-min-%04d.vtu" % step,
                         [
                             ("u", event.state_component[0]),
                             ("v", event.state_component[1:]),
diff --git a/grudge/execution.py b/grudge/execution.py
index 9e48aba39cd6e3aa5481f0bf9f31f3340381e4f6..4fe592dda96a69d410d4ec0d2b2e1ac28614415d 100644
--- a/grudge/execution.py
+++ b/grudge/execution.py
@@ -287,7 +287,7 @@ class ExecutionMapper(mappers.Evaluator,
 
     # {{{ instruction execution functions
 
-    def map_insn_rank_data_swap(self, insn):
+    def map_insn_rank_data_swap(self, insn, profile_data=None):
         local_data = self.rec(insn.field).get(self.queue)
         comm = self.discrwb.mpi_communicator
 
@@ -302,7 +302,7 @@ class ExecutionMapper(mappers.Evaluator,
                 MPIRecvFuture(recv_req, insn.name, remote_data_host, self.queue),
                 MPISendFuture(send_req)]
 
-    def map_insn_loopy_kernel(self, insn):
+    def map_insn_loopy_kernel(self, insn, profile_data=None):
         kwargs = {}
         kdescr = insn.kernel_descriptor
         for name, expr in six.iteritems(kdescr.input_mappings):
@@ -325,11 +325,11 @@ class ExecutionMapper(mappers.Evaluator,
         evt, result_dict = kdescr.loopy_kernel(self.queue, **kwargs)
         return list(result_dict.items()), []
 
-    def map_insn_assign(self, insn):
+    def map_insn_assign(self, insn, profile_data=None):
         return [(name, self.rec(expr))
                 for name, expr in zip(insn.names, insn.exprs)], []
 
-    def map_insn_assign_to_discr_scoped(self, insn):
+    def map_insn_assign_to_discr_scoped(self, insn, profile_data=None):
         assignments = []
         for name, expr in zip(insn.names, insn.exprs):
             value = self.rec(expr)
@@ -338,11 +338,11 @@ class ExecutionMapper(mappers.Evaluator,
 
         return assignments, []
 
-    def map_insn_assign_from_discr_scoped(self, insn):
+    def map_insn_assign_from_discr_scoped(self, insn, profile_data=None):
         return [(insn.name,
             self.discrwb._discr_scoped_subexpr_name_to_value[insn.name])], []
 
-    def map_insn_diff_batch_assign(self, insn):
+    def map_insn_diff_batch_assign(self, insn, profile_data=None):
         field = self.rec(insn.field)
         repr_op = insn.operators[0]
         # FIXME: There's no real reason why differentiation is special,
@@ -440,7 +440,7 @@ class MPISendFuture(object):
 class BoundOperator(object):
 
     def __init__(self, discrwb, discr_code, eval_code, debug_flags,
-            function_registry, allocator=None):
+            function_registry, exec_mapper_factory, allocator=None):
         self.discrwb = discrwb
         self.discr_code = discr_code
         self.eval_code = eval_code
@@ -448,6 +448,7 @@ class BoundOperator(object):
         self.debug_flags = debug_flags
         self.function_registry = function_registry
         self.allocator = allocator
+        self.exec_mapper_factory = exec_mapper_factory
 
     def __str__(self):
         sep = 75 * "=" + "\n"
@@ -481,7 +482,7 @@ class BoundOperator(object):
             # need to do discrwb-scope evaluation
             discrwb_eval_context = {}
             self.discr_code.execute(
-                    ExecutionMapper(queue, discrwb_eval_context, self))
+                    self.exec_mapper_factory(queue, discrwb_eval_context, self))
 
         # }}}
 
@@ -490,7 +491,7 @@ class BoundOperator(object):
             new_context[name] = with_object_array_or_scalar(replace_queue, var)
 
         return self.eval_code.execute(
-                ExecutionMapper(queue, new_context, self),
+                self.exec_mapper_factory(queue, new_context, self),
                 profile_data=profile_data,
                 log_quantities=log_quantities)
 
@@ -593,8 +594,9 @@ def process_sym_operator(discrwb, sym_operator, post_bind_mapper=None,
 
 
 def bind(discr, sym_operator, post_bind_mapper=lambda x: x,
-        debug_flags=set(), allocator=None,
-        function_registry=base_function_registry):
+        function_registry=base_function_registry,
+        exec_mapper_factory=ExecutionMapper,
+        debug_flags=frozenset(), allocator=None):
     # from grudge.symbolic.mappers import QuadratureUpsamplerRemover
     # sym_operator = QuadratureUpsamplerRemover(self.quad_min_degrees)(
     #         sym_operator)
@@ -621,7 +623,9 @@ def bind(discr, sym_operator, post_bind_mapper=lambda x: x,
 
     bound_op = BoundOperator(discr, discr_code, eval_code,
             function_registry=function_registry,
-            debug_flags=debug_flags, allocator=allocator)
+            exec_mapper_factory=exec_mapper_factory,
+            debug_flags=debug_flags,
+            allocator=allocator)
 
     if "dump_op_code" in debug_flags:
         from grudge.tools import open_unique_debug_file
diff --git a/grudge/symbolic/compiler.py b/grudge/symbolic/compiler.py
index 498e5172254ca7acb4a55fb25057e837d024fcfb..3aac28bb3c3a8f74d6db255863c811314d464d36 100644
--- a/grudge/symbolic/compiler.py
+++ b/grudge/symbolic/compiler.py
@@ -106,10 +106,13 @@ class LoopyKernelInstruction(Instruction):
     @memoize_method
     def get_dependencies(self):
         from pymbolic.primitives import Variable, Subscript
-        return set(
-                v
-                for v in self.kernel_descriptor.input_mappings.values()
-                if isinstance(v, (Variable, Subscript)))
+        result = set()
+        for v in self.kernel_descriptor.input_mappings.values():
+            if isinstance(v, Variable):
+                result.add(v)
+            elif isinstance(v, Subscript):
+                result.add(v.aggregate)
+        return result
 
     def __str__(self):
         knl_str = "\n".join(
@@ -523,7 +526,7 @@ class Code(object):
                                 log_quantities["rank_data_swap_timer"],
                                 log_quantities["rank_data_swap_counter"])
 
-                assignments, new_futures = mapper_method(insn)
+                assignments, new_futures = mapper_method(insn, profile_data)
 
                 for target, value in assignments:
                     if pre_assign_check is not None:
@@ -580,10 +583,6 @@ class Code(object):
                             break
 
         if len(done_insns) < len(self.instructions):
-            print("Unreachable instructions:")
-            for insn in set(self.instructions) - done_insns:
-                print("    ", insn)
-
             raise RuntimeError("not all instructions are reachable"
                     "--did you forget to pass a value for a placeholder?")
 
@@ -591,7 +590,7 @@ class Code(object):
             exec_sub_timer.stop().submit()
         from pytools.obj_array import with_object_array_or_scalar
         if profile_data is not None:
-            profile_data['total_time'] += time() - start_time
+            profile_data['total_time'] = time() - start_time
             return (with_object_array_or_scalar(exec_mapper, self.result),
                     profile_data)
         return with_object_array_or_scalar(exec_mapper, self.result)
diff --git a/grudge/symbolic/mappers/__init__.py b/grudge/symbolic/mappers/__init__.py
index df53cadf86e69cf3ca97081cf9c3618570120450..0e9b8e02d67b6f2b86d217d8c230263281d49a19 100644
--- a/grudge/symbolic/mappers/__init__.py
+++ b/grudge/symbolic/mappers/__init__.py
@@ -1230,6 +1230,7 @@ class CollectorMixin(OperatorReducerMixin, LocalOpReducerMixin, FluxOpReducerMix
     def map_constant(self, expr, *args, **kwargs):
         return OrderedSet()
 
+    map_variable = map_constant
     map_grudge_variable = map_constant
     map_function_symbol = map_constant
 
@@ -1275,6 +1276,33 @@ class FluxExchangeCollector(CSECachingMapperMixin, CollectorMixin, CombineMapper
 class Evaluator(pymbolic.mapper.evaluator.EvaluationMapper):
     pass
 
+
+class SymbolicEvaluator(pymbolic.mapper.evaluator.EvaluationMapper):
+    def map_operator_binding(self, expr, *args, **kwargs):
+        return expr.op(self.rec(expr.field, *args, **kwargs))
+
+    def map_node_coordinate_component(self, expr, *args, **kwargs):
+        return expr
+
+    def map_call(self, expr, *args, **kwargs):
+        return type(expr)(
+                expr.function,
+                tuple(self.rec(child, *args, **kwargs)
+                    for child in expr.parameters))
+
+    def map_call_with_kwargs(self, expr, *args, **kwargs):
+        return type(expr)(
+                expr.function,
+                tuple(self.rec(child, *args, **kwargs)
+                    for child in expr.parameters),
+                dict(
+                    (key, self.rec(val, *args, **kwargs))
+                    for key, val in six.iteritems(expr.kw_parameters))
+                    )
+
+    def map_common_subexpression(self, expr):
+        return type(expr)(self.rec(expr.child), expr.prefix, expr.scope)
+
 # }}}
 
 
diff --git a/grudge/symbolic/primitives.py b/grudge/symbolic/primitives.py
index c59f1518b5034a9b69c3aca4a22c0c4bab92d7a6..63adf3dbf7bfa776f35a0691e63967488515fb00 100644
--- a/grudge/symbolic/primitives.py
+++ b/grudge/symbolic/primitives.py
@@ -76,6 +76,7 @@ Symbols
 .. autoclass:: ScalarVariable
 .. autoclass:: make_sym_array
 .. autoclass:: make_sym_mv
+.. autoclass:: FunctionSymbol
 
 .. function :: sqrt(arg)
 .. function :: exp(arg)
diff --git a/setup.py b/setup.py
index 1f9ecbd04b99dbc64502f4a1bd4f2dc4a8376cdf..e975a5cbb9c9cf0f03f97099b2844d2eb51bbde3 100644
--- a/setup.py
+++ b/setup.py
@@ -53,8 +53,8 @@ def main():
               "pymbolic>=2013.2",
               "loo.py>=2013.1beta",
               "cgen>=2013.1.2",
-              "leap>=2015.1",
-              "dagrt>=2015.1",
+              "leap>=2019.1",
+              "dagrt>=2019.1",
 
               "six>=1.8",
               ])
diff --git a/examples/advection/advection.py b/unported-examples/advection/advection.py
similarity index 100%
rename from examples/advection/advection.py
rename to unported-examples/advection/advection.py
diff --git a/examples/benchmark_grudge/benchmark_mpi.py b/unported-examples/benchmark_grudge/benchmark_mpi.py
similarity index 100%
rename from examples/benchmark_grudge/benchmark_mpi.py
rename to unported-examples/benchmark_grudge/benchmark_mpi.py
diff --git a/examples/benchmark_grudge/run_benchmark.sh b/unported-examples/benchmark_grudge/run_benchmark.sh
similarity index 100%
rename from examples/benchmark_grudge/run_benchmark.sh
rename to unported-examples/benchmark_grudge/run_benchmark.sh
diff --git a/examples/burgers/burgers.py b/unported-examples/burgers/burgers.py
similarity index 100%
rename from examples/burgers/burgers.py
rename to unported-examples/burgers/burgers.py
diff --git a/examples/gas_dynamics/box-in-box.py b/unported-examples/gas_dynamics/box-in-box.py
similarity index 100%
rename from examples/gas_dynamics/box-in-box.py
rename to unported-examples/gas_dynamics/box-in-box.py
diff --git a/examples/gas_dynamics/euler/sine-wave.py b/unported-examples/gas_dynamics/euler/sine-wave.py
similarity index 100%
rename from examples/gas_dynamics/euler/sine-wave.py
rename to unported-examples/gas_dynamics/euler/sine-wave.py
diff --git a/examples/gas_dynamics/euler/sod-2d.py b/unported-examples/gas_dynamics/euler/sod-2d.py
similarity index 100%
rename from examples/gas_dynamics/euler/sod-2d.py
rename to unported-examples/gas_dynamics/euler/sod-2d.py
diff --git a/examples/gas_dynamics/euler/vortex-adaptive-grid.py b/unported-examples/gas_dynamics/euler/vortex-adaptive-grid.py
similarity index 100%
rename from examples/gas_dynamics/euler/vortex-adaptive-grid.py
rename to unported-examples/gas_dynamics/euler/vortex-adaptive-grid.py
diff --git a/examples/gas_dynamics/euler/vortex-sources.py b/unported-examples/gas_dynamics/euler/vortex-sources.py
similarity index 100%
rename from examples/gas_dynamics/euler/vortex-sources.py
rename to unported-examples/gas_dynamics/euler/vortex-sources.py
diff --git a/examples/gas_dynamics/euler/vortex.py b/unported-examples/gas_dynamics/euler/vortex.py
similarity index 100%
rename from examples/gas_dynamics/euler/vortex.py
rename to unported-examples/gas_dynamics/euler/vortex.py
diff --git a/examples/gas_dynamics/gas_dynamics_initials.py b/unported-examples/gas_dynamics/gas_dynamics_initials.py
similarity index 100%
rename from examples/gas_dynamics/gas_dynamics_initials.py
rename to unported-examples/gas_dynamics/gas_dynamics_initials.py
diff --git a/examples/gas_dynamics/lbm-simple.py b/unported-examples/gas_dynamics/lbm-simple.py
similarity index 100%
rename from examples/gas_dynamics/lbm-simple.py
rename to unported-examples/gas_dynamics/lbm-simple.py
diff --git a/examples/gas_dynamics/naca.py b/unported-examples/gas_dynamics/naca.py
similarity index 100%
rename from examples/gas_dynamics/naca.py
rename to unported-examples/gas_dynamics/naca.py
diff --git a/examples/gas_dynamics/navierstokes/shearflow.py b/unported-examples/gas_dynamics/navierstokes/shearflow.py
similarity index 100%
rename from examples/gas_dynamics/navierstokes/shearflow.py
rename to unported-examples/gas_dynamics/navierstokes/shearflow.py
diff --git a/examples/gas_dynamics/square.py b/unported-examples/gas_dynamics/square.py
similarity index 100%
rename from examples/gas_dynamics/square.py
rename to unported-examples/gas_dynamics/square.py
diff --git a/examples/gas_dynamics/wing.py b/unported-examples/gas_dynamics/wing.py
similarity index 100%
rename from examples/gas_dynamics/wing.py
rename to unported-examples/gas_dynamics/wing.py
diff --git a/examples/heat/heat.py b/unported-examples/heat/heat.py
similarity index 100%
rename from examples/heat/heat.py
rename to unported-examples/heat/heat.py
diff --git a/examples/maxwell/.gitignore b/unported-examples/maxwell/.gitignore
similarity index 100%
rename from examples/maxwell/.gitignore
rename to unported-examples/maxwell/.gitignore
diff --git a/examples/maxwell/dipole.py b/unported-examples/maxwell/dipole.py
similarity index 100%
rename from examples/maxwell/dipole.py
rename to unported-examples/maxwell/dipole.py
diff --git a/examples/maxwell/generate-bessel-zeros.py b/unported-examples/maxwell/generate-bessel-zeros.py
similarity index 100%
rename from examples/maxwell/generate-bessel-zeros.py
rename to unported-examples/maxwell/generate-bessel-zeros.py
diff --git a/examples/maxwell/inhom-cavity.py b/unported-examples/maxwell/inhom-cavity.py
similarity index 100%
rename from examples/maxwell/inhom-cavity.py
rename to unported-examples/maxwell/inhom-cavity.py
diff --git a/examples/maxwell/inhomogeneous_waveguide.mac b/unported-examples/maxwell/inhomogeneous_waveguide.mac
similarity index 100%
rename from examples/maxwell/inhomogeneous_waveguide.mac
rename to unported-examples/maxwell/inhomogeneous_waveguide.mac
diff --git a/examples/maxwell/maxwell-2d.py b/unported-examples/maxwell/maxwell-2d.py
similarity index 100%
rename from examples/maxwell/maxwell-2d.py
rename to unported-examples/maxwell/maxwell-2d.py
diff --git a/examples/maxwell/maxwell-pml.py b/unported-examples/maxwell/maxwell-pml.py
similarity index 100%
rename from examples/maxwell/maxwell-pml.py
rename to unported-examples/maxwell/maxwell-pml.py
diff --git a/examples/maxwell/notes.tm b/unported-examples/maxwell/notes.tm
similarity index 100%
rename from examples/maxwell/notes.tm
rename to unported-examples/maxwell/notes.tm
diff --git a/examples/poisson/helmholtz.py b/unported-examples/poisson/helmholtz.py
similarity index 100%
rename from examples/poisson/helmholtz.py
rename to unported-examples/poisson/helmholtz.py
diff --git a/examples/poisson/poisson.py b/unported-examples/poisson/poisson.py
similarity index 100%
rename from examples/poisson/poisson.py
rename to unported-examples/poisson/poisson.py
diff --git a/examples/wave/wiggly.py b/unported-examples/wave/wiggly.py
similarity index 100%
rename from examples/wave/wiggly.py
rename to unported-examples/wave/wiggly.py