diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1caef802b7340c7308f1b6655711481b91f0d889..ea69114d6b21e1306f07cdf0684ac1a025bfbaac 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -12,6 +12,10 @@ Python 2.7 POCL:
   - pocl
   except:
   - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
 
 Python 2.7 with legacy PyOpenCL:
   script:
@@ -29,6 +33,10 @@ Python 2.7 with legacy PyOpenCL:
   except:
   - tags
   retry: 2
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
 
 Python 3.6 POCL:
   script:
@@ -43,6 +51,10 @@ Python 3.6 POCL:
   - pocl
   except:
   - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
 
 Python 3.6 POCL Twice With Cache:
   script:
@@ -59,6 +71,10 @@ Python 3.6 POCL Twice With Cache:
   - pocl
   except:
   - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
 
 # PyPy POCL:
 #   script:
@@ -77,7 +93,7 @@ Python 3.6 POCL Examples:
   script:
   - export PY_EXE=python3.6
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib jupyter nbconvert"
+  - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert"
   - ". ./build-py-project-and-run-examples.sh"
   tags:
   - python3.6
@@ -87,6 +103,7 @@ Python 3.6 POCL Examples:
   except:
   - tags
 
+
 CentOS binary:
   script:
   - (cd build-helpers; ./make-linux-build-docker.sh --nodate)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 0000000000000000000000000000000000000000..601df74bd9a655b3b29decdbdf499d55b25b6385
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018 Andreas Klöckner and contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/build-helpers/make-linux-build-docker-inner-part-2.sh b/build-helpers/make-linux-build-docker-inner-part-2.sh
index 1e35a1e1b9949b37f95b05ebeef223c8a5955ff8..035634b16072e0188270abd8736dab99ce31dada 100755
--- a/build-helpers/make-linux-build-docker-inner-part-2.sh
+++ b/build-helpers/make-linux-build-docker-inner-part-2.sh
@@ -23,6 +23,10 @@ git clone --recursive git://github.com/inducer/loopy
 cd loopy
 
 grep -v pyopencl requirements.txt > myreq.txt
+
+# needed for pyinstaller package to be usable
+echo packaging >> myreq.txt
+
 pip install -r myreq.txt
 python setup.py install
 
diff --git a/doc/ref_call.rst b/doc/ref_call.rst
index 4ff1ef2fc343f845cac372fe6149cd786796ef90..5a59e84282119209cc89eb18e3a4eda97725edf0 100644
--- a/doc/ref_call.rst
+++ b/doc/ref_call.rst
@@ -4,6 +4,8 @@ Calling Loopy Kernels and External Functions
 Goals of a function interface
 -----------------------------
 
+- *FIXME:* Needs to change after the new design of program.
+
 - Must be able to have complete information of the function just through the
   epxression node.
 - Must adhere to :mod:`loopy` semantics of immutability.
@@ -30,7 +32,7 @@ kernel, whose name has been resolved by the kernel. The process of matching a
 function idenitifier with the function definition is called "resolving".
 
 A pymbolic ``Call`` node can be converted to a ``ResolvedFunction`` if it
-is "resolved" by one of the ``function_scoper`` in a
+is "resolved" by one of the ``function_id_to_in_knl_callable_mapper`` in a
 :attr:`LoopKernel.scoped_functions`
 
 -  Functions already registered by the target. Some examples include --
@@ -41,11 +43,11 @@ is "resolved" by one of the ``function_scoper`` in a
 -  Functions registered as ``CallableKernels`` using
    ``lp.register_callable_kernel(...)``.
 -  Functions that have been provided through
-   ``lp.register_function_scoper(...)``
+   ``lp.register_function_id_to_in_knl_callable_mapper(...)``
 -  Functions that can be made known from the user through
    ``lp.register_function_mangler``. This is planned to be deprecated,
    as its functionality is superseded by
-   ``lp.register_function_scoper(...)``.
+   ``lp.register_function_id_to_in_knl_callable_mapper(...)``.
 
 Expressions after a function is scoped
 --------------------------------------
@@ -180,7 +182,7 @@ Changes on the target side to accommodate the new function interface
 --------------------------------------------------------------------
 
 The earlier "function\_mangler" as a member method of the class
-``lp.ASTBuilderBase`` will be replaced by ``function_scopers``. The
+``lp.ASTBuilderBase`` will be replaced by ``function_id_in_knl_callable_mapper``. The
 function scopers would return a list of functions with the signature
 ``(target, identifier)->lp.InKernelCallable``.
 
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index aaa3228526d242741044b779829957f0a90cdc5c..25082f88a10a7e3276c1ac73251633ee9ac93e29 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -334,7 +334,7 @@ that these dependencies show up there, too:
 
 .. doctest::
 
-    >>> print(knl.stringify(with_dependencies=True))
+    >>> print(knl.root_kernel.stringify(with_dependencies=True))
     ---------------------------------------------------------------------------
     KERNEL: loopy_kernel
     ---------------------------------------------------------------------------
@@ -1145,7 +1145,7 @@ the right by 1 in parallel:
    ...     end
    ...     """,
    ...      [
-   ...         lp.ArrayArg("arr", shape=("n",), dtype=np.int32),
+   ...         lp.GlobalArg("arr", shape=("n",), dtype=np.int32),
    ...          "...",
    ...      ],
    ...     name="rotate_v1",
@@ -1179,7 +1179,7 @@ Let us start with an example. Consider the kernel from above with a
 
 .. doctest::
 
-   >>> knl = lp.make_kernel(
+   >>> prog = lp.make_kernel(
    ...     "[n] -> {[i] : 0<=i<n}",
    ...     """
    ...     for i
@@ -1189,16 +1189,16 @@ Let us start with an example. Consider the kernel from above with a
    ...     end
    ...     """,
    ...      [
-   ...         lp.ArrayArg("arr", shape=("n",), dtype=np.int32),
+   ...         lp.GlobalArg("arr", shape=("n",), dtype=np.int32),
    ...          "...",
    ...      ],
    ...     name="rotate_v2",
    ...     assumptions="n mod 16 = 0")
-   >>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0")
+   >>> prog = lp.split_iname(prog, "i", 16, inner_tag="l.0", outer_tag="g.0")
 
 Here is what happens when we try to generate code for the kernel:
 
-   >>> cgr = lp.generate_code_v2(knl)
+   >>> cgr = lp.generate_code_v2(prog)
    Traceback (most recent call last):
    ...
    loopy.diagnostic.MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?)
@@ -1207,8 +1207,10 @@ This happens due to the kernel splitting done by :mod:`loopy`. The splitting
 happens when the instruction schedule is generated. To see the schedule, we
 should call :func:`loopy.get_one_scheduled_kernel`:
 
-   >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
-   >>> print(knl)
+   >>> prog = lp.preprocess_kernel(prog)
+   >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table)
+   >>> prog = prog.with_root_kernel(knl)
+   >>> print(prog)
    ---------------------------------------------------------------------------
    KERNEL: rotate_v2
    ---------------------------------------------------------------------------
@@ -1237,10 +1239,10 @@ function adds instructions to the kernel without scheduling them. That means
 that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to
 put those instructions into the schedule.
 
-   >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
-   >>> knl = lp.save_and_reload_temporaries(knl)
-   >>> knl = lp.get_one_scheduled_kernel(knl)  # Schedule added instructions
-   >>> print(knl)
+   >>> prog = lp.save_and_reload_temporaries(prog)
+   >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table)  # Schedule added instructions
+   >>> prog = prog.with_root_kernel(knl)
+   >>> print(prog)
    ---------------------------------------------------------------------------
    KERNEL: rotate_v2
    ---------------------------------------------------------------------------
@@ -1279,7 +1281,7 @@ does in more detail:
 
 The kernel translates into two OpenCL kernels.
 
-   >>> cgr = lp.generate_code_v2(knl)
+   >>> cgr = lp.generate_code_v2(prog)
    >>> print(cgr.device_code())
    #define lid(N) ((int) get_local_id(N))
    #define gid(N) ((int) get_group_id(N))
@@ -1321,8 +1323,8 @@ tagged, as in the following example::
             "{ [i]: 0<=i<n }",
             "out[i%20] = out[i%20] + 2*a[i] {atomic}",
             [
-                lp.ArrayArg("out", dtype, shape=lp.auto, for_atomic=True),
-                lp.ArrayArg("a", dtype, shape=lp.auto),
+                lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True),
+                lp.GlobalArg("a", dtype, shape=lp.auto),
                 "..."
                 ],
             assumptions="n>0")
@@ -1551,13 +1553,13 @@ information provided. Now we will count the operations:
 
 .. doctest::
 
-    >>> op_map = lp.get_op_map(knl)
+    >>> op_map = lp.get_op_map(knl, subgroup_size=32)
     >>> print(lp.stringify_stats_mapping(op_map))
-    Op(np:dtype('float32'), add, workitem) : ...
+    Op(np:dtype('float32'), add, subgroup) : ...
 
 Each line of output will look roughly like::
 
-    Op(np:dtype('float32'), add, workitem) : [l, m, n] -> { l * m * n : l > 0 and m > 0 and n > 0 }
+    Op(np:dtype('float32'), add, subgroup) : [l, m, n] -> { l * m * n : l > 0 and m > 0 and n > 0 }
 
 :func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{**
 :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. A
@@ -1579,12 +1581,12 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`:
 
     >>> param_dict = {'n': 256, 'm': 256, 'l': 8}
     >>> from loopy.statistics import CountGranularity as CG
-    >>> f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(param_dict)
-    >>> f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(param_dict)
-    >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(param_dict)
-    >>> f64add = op_map[lp.Op(np.float64, 'add', CG.WORKITEM)].eval_with_dict(param_dict)
-    >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.WORKITEM)].eval_with_dict(param_dict)
-    >>> i32add = op_map[lp.Op(np.int32, 'add', CG.WORKITEM)].eval_with_dict(param_dict)
+    >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict)
+    >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(param_dict)
+    >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict)
+    >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(param_dict)
+    >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict)
+    >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict)
     >>> print("%i\n%i\n%i\n%i\n%i\n%i" %
     ...     (f32add, f32div, f32mul, f64add, f64mul, i32add))
     524288
diff --git a/examples/python/call-external.py b/examples/python/call-external.py
index 68618a7ecaaaba9c9c93495a05c2313a7660b377..c13d99bd06295096c26d6e113841c853f80645fc 100644
--- a/examples/python/call-external.py
+++ b/examples/python/call-external.py
@@ -7,14 +7,14 @@ from loopy.target.c import CTarget
 # {{{ blas callable
 
 class BLASCallable(lp.ScalarCallable):
-    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, kernel, callables_table):
         for i in range(0, 2):
             if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None:
                 # the types provided aren't mature enough to specialize the
                 # callable
                 return (
                         self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                        program_callables_info)
+                        callables_table)
 
         mat_dtype = arg_id_to_dtype[0].numpy_dtype
         vec_dtype = arg_id_to_dtype[1].numpy_dtype
@@ -34,7 +34,7 @@ class BLASCallable(lp.ScalarCallable):
         from loopy.types import NumpyType
         return self.copy(name_in_target=name_in_target,
                 arg_id_to_dtype={0: NumpyType(vec_dtype), 1: NumpyType(vec_dtype),
-                    -1: NumpyType(vec_dtype)}), program_callables_info
+                    -1: NumpyType(vec_dtype)}), callables_table
 
     def emit_call_insn(self, insn, target, expression_to_code_mapper):
         assert self.is_ready_for_codegen()
diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py
index cc4926feeeb3815da1c66a2548bf3235df8f1fcc..be22e268c85fe985a98763426faf8cfadf73c5fb 100644
--- a/examples/python/global_barrier_removal.py
+++ b/examples/python/global_barrier_removal.py
@@ -1,7 +1,5 @@
 import numpy as np
 import loopy as lp
-import pyopencl as cl
-import pyopencl.array
 
 knl = lp.make_kernel(
         "{ [i,k]: 0<=i<n and 0<=k<3 }",
@@ -24,7 +22,7 @@ from loopy.preprocess import preprocess_kernel
 knl = preprocess_kernel(knl)
 
 from loopy.schedule import get_one_scheduled_kernel
-knl = get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info)
+knl = get_one_scheduled_kernel(knl.root_kernel, knl.callables_table)
 
 # map schedule onto host or device
 print(knl)
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 6ed21500051fb231f44c4b10b0c22ade3e164cd1..51d01b78e879a67e723a0b0a3a5fc54060630dc6 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -51,7 +51,7 @@ from loopy.kernel.data import (
 from loopy.kernel.function_interface import (
         CallableKernel, ScalarCallable)
 from loopy.program import (
-        Program, make_program_from_kernel)
+        Program, make_program)
 
 from loopy.kernel import LoopKernel, KernelState, kernel_state
 from loopy.kernel.tools import (
@@ -176,7 +176,7 @@ __all__ = [
 
         "ScalarCallable", "CallableKernel",
 
-        "Program", "make_program_from_kernel",
+        "Program", "make_program",
 
         "KernelArgument",
         "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg",
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index 5ce80ed882dff7fc79f69c47a8f0c13c00308512..7e23ef06f669f660a6af57f8594be4fa8a45061f 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -118,7 +118,7 @@ def make_ref_args(program, impl_arg_info, queue, parameters):
             shape = evaluate_shape(arg.unvec_shape, parameters)
             dtype = kernel_arg.dtype
 
-            is_output = arg.base_name in program.root_kernel.get_written_variables()
+            is_output = kernel_arg.is_output_only
 
             if arg.arg_class is ImageArg:
                 storage_array = ary = cl_array.empty(
diff --git a/loopy/check.py b/loopy/check.py
index d6fa037c003689d14f1e85a0b6bbf4d6eef2ba2b..659e210fc614550f2bbfc942cdbe1ba5ee4b9b16 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -68,10 +68,6 @@ class UnscopedCallCollector(CombineMapper):
     :returns:
         An :class:`frozenset` of function names that are not scoped in
         the kernel.
-
-    .. note::
-        :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are
-        never scoped in the pipeline.
     """
 
     def combine(self, values):
@@ -85,8 +81,7 @@ class UnscopedCallCollector(CombineMapper):
             kw_parameters={}))
 
     def map_call_with_kwargs(self, expr):
-        from loopy.library.reduction import ArgExtOp
-        if not isinstance(expr.function, (ResolvedFunction, ArgExtOp)):
+        if not isinstance(expr.function, ResolvedFunction):
             return (frozenset([expr.function.name]) |
                     self.combine((self.rec(child) for child in expr.parameters
                         + tuple(expr.kw_parameters.values()))))
@@ -103,7 +98,7 @@ class UnscopedCallCollector(CombineMapper):
     map_type_cast = map_constant
 
 
-def check_functions_are_scoped(kernel):
+def check_functions_are_resolved(kernel):
     """ Checks if all the calls in the instruction expression have been scoped,
     otherwise indicates to what all calls we await signature. Refer
     :class:`loopy.symbolic.ResolvedFunction` for a detailed explanation of a
@@ -125,7 +120,7 @@ def check_functions_are_scoped(kernel):
             pass
         else:
             raise NotImplementedError(
-                    "Unknown type of instruction %s" % type(insn).__name__)
+                    "Unsupported instruction type %s." % type(insn).__name__)
 
 # }}}
 
@@ -186,8 +181,8 @@ def check_loop_priority_inames_known(kernel):
 
 
 def _get_all_unique_iname_tags(kernel):
-    """Returns a set of all the iname tags used in *kernel* that
-    inherit from :class:`loopy.kernel.data.UniqueTag`.
+    """Returns an instance of :class:`set` of all the iname tags used in
+    *kernel* that inherit from :class:`loopy.kernel.data.UniqueTag`.
     """
     from loopy.kernel.data import UniqueTag
     iname_tags = [kernel.iname_to_tag.get(iname) for iname in
@@ -211,7 +206,7 @@ def check_multiple_tags_allowed(kernel):
                                  "tags: {1}".format(iname, tags))
 
 
-def check_for_double_use_of_hw_axes(kernel, program_callables_info):
+def check_for_double_use_of_hw_axes(kernel, callables_table):
     from loopy.kernel.data import UniqueTag
     from loopy.kernel.instruction import CallInstruction
     from loopy.kernel.function_interface import CallableKernel
@@ -229,7 +224,7 @@ def check_for_double_use_of_hw_axes(kernel, program_callables_info):
 
         # check usage of iname tags in the callee kernel
         if isinstance(insn, CallInstruction):
-            in_knl_callable = program_callables_info[
+            in_knl_callable = callables_table[
                     insn.expression.function.name]
             if isinstance(in_knl_callable, CallableKernel):
                 # check for collision in iname_tag keys in the instruction
@@ -719,13 +714,13 @@ def check_variable_access_ordered(kernel):
 # }}}
 
 
-def pre_schedule_checks(kernel, program_callables_info):
+def pre_schedule_checks(kernel, callables_table):
     try:
         logger.debug("%s: pre-schedule check: start" % kernel.name)
 
         check_for_duplicate_insn_ids(kernel)
         check_for_orphaned_user_hardware_axes(kernel)
-        check_for_double_use_of_hw_axes(kernel, program_callables_info)
+        check_for_double_use_of_hw_axes(kernel, callables_table)
         check_insn_attributes(kernel)
         check_loop_priority_inames_known(kernel)
         check_multiple_tags_allowed(kernel)
@@ -734,8 +729,8 @@ def pre_schedule_checks(kernel, program_callables_info):
         check_for_data_dependent_parallel_bounds(kernel)
         check_bounds(kernel)
         check_write_destinations(kernel)
-        # check_has_schedulable_iname_nesting(kernel)
-        # check_variable_access_ordered(kernel)
+        check_has_schedulable_iname_nesting(kernel)
+        check_variable_access_ordered(kernel)
 
         logger.debug("%s: pre-schedule check: done" % kernel.name)
     except KeyboardInterrupt:
@@ -753,7 +748,7 @@ def pre_schedule_checks(kernel, program_callables_info):
 
 # {{{ check for unused hw axes
 
-def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info,
+def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table,
         sched_index=None):
     from loopy.schedule import (CallKernel, RunInstruction,
             Barrier, EnterLoop, LeaveLoop, ReturnFromKernel,
@@ -770,7 +765,7 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info,
         _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
         group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                 get_insn_ids_for_block_at(kernel.schedule, sched_index),
-                program_callables_info)
+                callables_table)
 
         group_axes = set(ax for ax, length in enumerate(group_size))
         local_axes = set(ax for ax, length in enumerate(local_size))
@@ -788,7 +783,7 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info,
         sched_item = kernel.schedule[i]
         if isinstance(sched_item, CallKernel):
             i = _check_for_unused_hw_axes_in_kernel_chunk(kernel,
-                    program_callables_info, i)
+                    callables_table, i)
 
         elif isinstance(sched_item, RunInstruction):
             insn = kernel.id_to_insn[sched_item.insn_id]
@@ -839,10 +834,10 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info,
     return past_end_i
 
 
-def check_for_unused_hw_axes_in_insns(kernel, program_callables_info):
+def check_for_unused_hw_axes_in_insns(kernel, callables_table):
     if kernel.schedule:
         _check_for_unused_hw_axes_in_kernel_chunk(kernel,
-                program_callables_info)
+                callables_table)
 
 # }}}
 
@@ -996,15 +991,15 @@ def check_that_shapes_and_strides_are_arguments(kernel):
 # }}}
 
 
-def pre_codegen_checks(kernel, program_callables_info):
+def pre_codegen_checks(kernel, callables_table):
     try:
         logger.debug("pre-codegen check %s: start" % kernel.name)
 
-        check_for_unused_hw_axes_in_insns(kernel, program_callables_info)
+        check_for_unused_hw_axes_in_insns(kernel, callables_table)
         check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel)
         check_that_temporaries_are_defined_in_subkernels_where_used(kernel)
         check_that_all_insns_are_scheduled(kernel)
-        kernel.target.pre_codegen_check(kernel, program_callables_info)
+        kernel.target.pre_codegen_check(kernel, callables_table)
         check_that_shapes_and_strides_are_arguments(kernel)
 
         logger.debug("pre-codegen check %s: done" % kernel.name)
diff --git a/loopy/cli.py b/loopy/cli.py
index a92922b1845d76dd7a700a93c05de3eecf8c28dd..060340d59e4414dd06c714bcd423a97029ca281b 100644
--- a/loopy/cli.py
+++ b/loopy/cli.py
@@ -205,7 +205,7 @@ def main():
         new_kernels = []
         for kernel in kernels:
             new_args = [
-                    lp.ArrayArg("occa_info", np.int32, shape=None)
+                    lp.GlobalArg("occa_info", np.int32, shape=None)
                     ] + kernel.args
             new_kernels.append(kernel.copy(args=new_args))
 
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index a0681c49026f314b44c98eabf73e8370bd9ccddc..8f3e15f28954cc73582e6b2126e7e045cca85dea 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -32,8 +32,8 @@ from pytools.persistent_dict import WriteOncePersistentDict
 from loopy.tools import LoopyKeyBuilder
 from loopy.version import DATA_MODEL_VERSION
 
-from loopy.symbolic import CombineMapper
 
+from loopy.symbolic import CombineMapper
 from functools import reduce
 
 from loopy.kernel.function_interface import CallableKernel
@@ -197,14 +197,16 @@ class CodeGenerationState(object):
 
     .. attribute:: schedule_index_end
 
-    .. attribute:: program_callables_info
+    .. attribute:: callables_table
+
+        An instance of :class:`loopy.CallablesTable`.
     """
 
     def __init__(self, kernel, target,
             implemented_data_info, implemented_domain, implemented_predicates,
             seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map,
             allow_complex,
-            program_callables_info,
+            callables_table,
             vectorization_info=None, var_name_generator=None,
             is_generating_device_code=None,
             gen_program_name=None,
@@ -219,7 +221,7 @@ class CodeGenerationState(object):
         self.seen_atomic_dtypes = seen_atomic_dtypes
         self.var_subst_map = var_subst_map.copy()
         self.allow_complex = allow_complex
-        self.program_callables_info = program_callables_info
+        self.callables_table = callables_table
         self.vectorization_info = vectorization_info
         self.var_name_generator = var_name_generator
         self.is_generating_device_code = is_generating_device_code
@@ -271,7 +273,7 @@ class CodeGenerationState(object):
                 seen_atomic_dtypes=self.seen_atomic_dtypes,
                 var_subst_map=var_subst_map or self.var_subst_map,
                 allow_complex=self.allow_complex,
-                program_callables_info=self.program_callables_info,
+                callables_table=self.callables_table,
                 vectorization_info=vectorization_info,
                 var_name_generator=self.var_name_generator,
                 is_generating_device_code=is_generating_device_code,
@@ -419,15 +421,19 @@ class PreambleInfo(ImmutableRecord):
 
 # {{{ main code generation entrypoint
 
-def generate_code_for_a_single_kernel(kernel, program_callables_info, target):
+def generate_code_for_a_single_kernel(kernel, callables_table, target):
     """
     :returns: a :class:`CodeGenerationResult`
+
+    :param kernel: An instance of :class:`loopy.LoopKernel`.
+    :param callables_table: An instance of
+        :class:`loopy.CallablesTable`.
     """
 
     from loopy.kernel import KernelState
     if kernel.schedule is None:
         from loopy.schedule import get_one_scheduled_kernel
-        kernel = get_one_scheduled_kernel(kernel, program_callables_info)
+        kernel = get_one_scheduled_kernel(kernel, callables_table)
 
     if kernel.state != KernelState.SCHEDULED:
         raise LoopyError("cannot generate code for a kernel that has not been "
@@ -449,7 +455,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info, target):
     # }}}
 
     from loopy.check import pre_codegen_checks
-    pre_codegen_checks(kernel, program_callables_info)
+    pre_codegen_checks(kernel, callables_table)
 
     logger.info("%s: generate code: start" % kernel.name)
 
@@ -509,9 +515,9 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info, target):
             gen_program_name=(
                 target.host_program_name_prefix
                 + kernel.name
-                + target.host_program_name_suffix),
+                + kernel.target.host_program_name_suffix),
             schedule_index_end=len(kernel.schedule),
-            program_callables_info=program_callables_info)
+            callables_table=callables_table)
 
     from loopy.codegen.result import generate_host_or_device_program
 
@@ -567,11 +573,16 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info, target):
 
 
 def generate_code_v2(program):
+    """
+    Returns an instance of :class:`CodeGenerationResult`.
+
+    :param program: An instance of :class:`loopy.Program`.
+    """
     from loopy.kernel import LoopKernel
-    from loopy.program import make_program_from_kernel
+    from loopy.program import make_program
 
     if isinstance(program, LoopKernel):
-        program = make_program_from_kernel(program)
+        program = make_program(program)
 
     from loopy.kernel import KernelState
     if program.root_kernel.state == KernelState.INITIAL:
@@ -583,24 +594,26 @@ def generate_code_v2(program):
 
     codegen_results = {}
 
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             codegen_results[func_id] = (
                     generate_code_for_a_single_kernel(in_knl_callable.subkernel,
-                        program.program_callables_info, program.target))
+                        program.callables_table, program.target))
             if not in_knl_callable.subkernel.is_called_from_host:
                 assert codegen_results[func_id].host_program is None
 
-    device_preambles = set()
+    device_preambles = []
     for cgr in codegen_results.values():
-        device_preambles.update(cgr.device_preambles)
+        device_preambles.extend(cgr.device_preambles)
 
-    for in_knl_callable in program.program_callables_info.values():
+    # collect the device preambles generated by each in-kernel callable
+    for in_knl_callable in program.callables_table.values():
         for preamble in in_knl_callable.generate_preambles(program.target):
-            device_preambles.update([preamble])
+            device_preambles.append(preamble)
 
     collective_device_program = codegen_results[program.name].device_programs[0]
     callee_fdecls = []
+
     for func_id, callee_cgr in codegen_results.items():
         if func_id != program.name:
             assert len(callee_cgr.device_programs) == 1
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index bb62961c535a851ac4dc9e03724db3685395fe7c..5dfd9cb4343f0e4c2824380b1de1ab346344231d 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -116,7 +116,7 @@ def generate_code_for_sched_index(codegen_state, sched_index):
 
         glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                 get_insn_ids_for_block_at(kernel.schedule, sched_index),
-                codegen_state.program_callables_info)
+                codegen_state.callables_table)
         if kernel.is_called_from_host:
             return merge_codegen_results(codegen_state, [
                 codegen_result,
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 39cf20c7db3c36d7ad098f6b494d5a4c0020b97e..c282de79bf1d62b07c6177f5f3fd0472e59e1775 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -248,7 +248,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
         return next_func(codegen_state)
 
     global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
-            insn_ids_for_block, codegen_state.program_callables_info)
+            insn_ids_for_block, codegen_state.callables_table)
 
     hw_inames_left = hw_inames_left[:]
     iname = hw_inames_left.pop()
diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py
index b1df6e3d01317a315354cf10a55b9312090dc61f..6939bb6ad8727b3e9f11966285b6a22de84bb032 100644
--- a/loopy/frontend/fortran/tree.py
+++ b/loopy/frontend/fortran/tree.py
@@ -53,7 +53,7 @@ class FTreeWalkerBase(object):
 
     ENTITY_RE = re.compile(
             r"^(?P<name>[_0-9a-zA-Z]+)"
-            "(\((?P<shape>[-+*0-9:a-zA-Z, \t]+)\))?$")
+            r"(\((?P<shape>[-+*0-9:a-zA-Z, \t]+)\))?$")
 
     def parse_dimension_specs(self, node, dim_decls):
         def parse_bounds(bounds_str):
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index d2723c57ff6872d613adeff2e30f2e74b2616bb5..26db6ec4e6ac8b81ab7f4e0dcf1a000bd9e9fd3c 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -38,7 +38,7 @@ import re
 from pytools import UniqueNameGenerator, generate_unique_names
 
 from loopy.diagnostic import CannotBranchDomainTree, LoopyError
-from loopy.tools import natsorted
+from loopy.tools import natsorted, update_persistent_hash
 from loopy.diagnostic import StaticValueFindingError
 from loopy.kernel.data import filter_iname_tags_by_type
 from warnings import warn
@@ -222,10 +222,10 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         A subclass of :class:`loopy.TargetBase`.
 
     .. attribute:: is_called_from_host
-
         An instance of :class:`bool`. Will be set *False* for the kernel which
-        would be called from another top level kernels. Default value is
+        would be called from other top level kernels. Default value is
         *True*.
+
     """
 
     # {{{ constructor
@@ -253,7 +253,6 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
             state=KernelState.INITIAL,
             target=None,
-
             is_called_from_host=True,
 
             overridden_get_grid_sizes_for_insn_ids=None,
@@ -1036,8 +1035,9 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 self.get_iname_bounds(iname, constants_only=True).size,
                 constants_only=True)))
 
+    @memoize_method
     def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids,
-            program_callables_info, ignore_auto=False):
+            callables_table, ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
         could accommodate execution of all instructions whose IDs are given
         in *insn_ids*.
@@ -1052,7 +1052,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         from loopy.kernel.tools import get_direct_callee_kernels
         callee_kernels = get_direct_callee_kernels(self,
-                program_callables_info, insn_ids)
+                callables_table, insn_ids)
 
         # }}}
 
@@ -1073,7 +1073,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         for callee_kernel in callee_kernels:
             gsize, lsize = callee_kernel.get_grid_sizes_for_insn_ids_as_dicts(
                     frozenset(insn.id for insn in callee_kernel.instructions),
-                    program_callables_info, ignore_auto)
+                    callables_table, ignore_auto)
 
             global_sizes.update(gsize)
             local_sizes.update(lsize)
@@ -1120,7 +1120,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         return global_sizes, local_sizes
 
-    def get_grid_sizes_for_insn_ids(self, insn_ids, program_callables_info,
+    @memoize_method
+    def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table,
             ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
         could accommodate execution of all instructions whose IDs are given
@@ -1134,14 +1135,14 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         if self.overridden_get_grid_sizes_for_insn_ids:
             return self.overridden_get_grid_sizes_for_insn_ids(
                     insn_ids,
-                    program_callables_info=program_callables_info,
+                    callables_table=callables_table,
                     ignore_auto=ignore_auto)
 
         assert self.is_called_from_host, ("Callee kernels do not have sufficient "
                 "information to compute grid sizes.")
 
         global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts(
-                insn_ids, program_callables_info, ignore_auto=ignore_auto)
+                insn_ids, callables_table, ignore_auto=ignore_auto)
 
         def to_dim_tuple(size_dict, which, forced_sizes={}):
             forced_sizes = forced_sizes.copy()
@@ -1173,7 +1174,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes))
 
     def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids,
-            program_callables_info, ignore_auto=False):
+            callables_table, ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
         could accommodate execution of all instructions whose IDs are given
         in *insn_ids*.
@@ -1184,7 +1185,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         """
 
         grid_size, group_size = self.get_grid_sizes_for_insn_ids(
-                insn_ids, program_callables_info, ignore_auto)
+                insn_ids, callables_table, ignore_auto)
 
         def tup_to_exprs(tup):
             from loopy.symbolic import pw_aff_to_expr
@@ -1192,7 +1193,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         return tup_to_exprs(grid_size), tup_to_exprs(group_size)
 
-    def get_grid_size_upper_bounds(self, program_callables_info, ignore_auto=False):
+    def get_grid_size_upper_bounds(self, callables_table, ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
         could accommodate execution of *all* instructions in the kernel.
 
@@ -1200,10 +1201,10 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         """
         return self.get_grid_sizes_for_insn_ids(
                 frozenset(insn.id for insn in self.instructions),
-                program_callables_info,
+                callables_table,
                 ignore_auto=ignore_auto)
 
-    def get_grid_size_upper_bounds_as_exprs(self, program_callables_info,
+    def get_grid_size_upper_bounds_as_exprs(self, callables_table,
             ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
         could accommodate execution of *all* instructions in the kernel.
@@ -1213,7 +1214,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         return self.get_grid_sizes_for_insn_ids_as_exprs(
                 frozenset(insn.id for insn in self.instructions),
-                program_callables_info,
+                callables_table,
                 ignore_auto=ignore_auto)
 
     # }}}
@@ -1407,9 +1408,10 @@ class LoopKernel(ImmutableRecordWithoutPickling):
     # {{{ direct execution
 
     def __call__(self, *args, **kwargs):
-        # FIXME: scream and then convert to a program
-        from loopy.program import make_program_from_kernel
-        program = make_program_from_kernel(self)
+        warn("Calling a LoopKernel is deprecated, call a Program "
+                "instead.", DeprecationWarning, stacklevel=2)
+        from loopy.program import make_program
+        program = make_program(self)
         return program(*args, **kwargs)
 
     # }}}
@@ -1514,14 +1516,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             "symbol_manglers",
             )
 
-    def update_persistent_hash(self, key_hash, key_builder):
-        """Custom hash computation function for use with
-        :class:`pytools.persistent_dict.PersistentDict`.
-
-        Only works in conjunction with :class:`loopy.tools.KeyBuilder`.
-        """
-        for field_name in self.hash_fields:
-            key_builder.rec(key_hash, getattr(self, field_name))
+    update_persistent_hash = update_persistent_hash
 
     def __hash__(self):
         from loopy.tools import LoopyKeyBuilder
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 186597c64734b54b8d08f0db43b57826d79f9567..0ed1f940131238a123616d95b8b99a426e10bfc7 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -833,6 +833,8 @@ class ArrayBase(ImmutableRecord):
                 dim_names=dim_names,
                 order=order,
                 alignment=alignment,
+                for_atomic=for_atomic,
+                target=target,
                 **kwargs)
 
     def __eq__(self, other):
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 674eaca3fcd3f11b92d9e5e9c5a946364714e519..52e299b6140428515a0aab764c4b6e2dab0c0827 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -2120,7 +2120,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     target = kwargs.pop("target", None)
     seq_dependencies = kwargs.pop("seq_dependencies", False)
     fixed_parameters = kwargs.pop("fixed_parameters", {})
-    make_program = kwargs.pop("make_program", True)
+    is_callee_kernel = kwargs.pop("is_callee_kernel", False)
 
     if defines:
         from warnings import warn
@@ -2146,7 +2146,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
 
     # {{{ handle kernel language version
 
-    if make_program:
+    if not is_callee_kernel:
         from loopy.version import LANGUAGE_VERSION_SYMBOLS
 
         version_to_symbol = dict(
@@ -2337,19 +2337,19 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     check_for_duplicate_names(knl)
     check_written_variable_names(knl)
 
-    from loopy.kernel.tools import infer_arg_is_output_only
-    knl = infer_arg_is_output_only(knl)
+    from loopy.kernel.tools import infer_args_are_output_only
+    knl = infer_args_are_output_only(knl)
 
     from loopy.preprocess import prepare_for_caching
     knl = prepare_for_caching(knl)
 
     creation_plog.done()
 
-    if make_program:
-        from loopy.program import make_program_from_kernel
-        return make_program_from_kernel(knl)
-    else:
+    if is_callee_kernel:
         return knl
+    else:
+        from loopy.program import make_program
+        return make_program(knl)
 
 
 def make_function(*args, **kwargs):
@@ -2358,7 +2358,7 @@ def make_function(*args, **kwargs):
         raise LoopyError("lang_version should be set for program, not "
                 "functions.")
 
-    kwargs['make_program'] = False
+    kwargs['is_callee_kernel'] = True
     return make_kernel(*args, **kwargs)
 
 # }}}
diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
index 2ea260656b8d89031849d43364977404e1ca6690..3e628f5c9a60f188c0033efa40c5fd762e53ba11 100644
--- a/loopy/kernel/function_interface.py
+++ b/loopy/kernel/function_interface.py
@@ -23,20 +23,26 @@ THE SOFTWARE.
 """
 
 
-import re
-import six
-
 from six.moves import zip
 
 from pytools import ImmutableRecord
 from loopy.diagnostic import LoopyError
 
-from loopy.symbolic import parse_tagged_name
+from loopy.tools import update_persistent_hash
+from loopy.kernel import LoopKernel
+
+__doc__ = """
 
-from loopy.symbolic import (ResolvedFunction, SubstitutionRuleMappingContext,
-        RuleAwareIdentityMapper, SubstitutionRuleExpander)
+.. currentmodule:: loopy
 
-from loopy.kernel import LoopKernel
+.. autoclass:: ValueArgDescriptor
+.. autoclass:: ArrayArgDescriptor
+.. autoclass:: InKernelCallable
+.. autoclass:: CallableKernel
+.. autoclass:: ScalarCallable
+.. autoclass:: ManglerCallable
+
+"""
 
 
 # {{{ argument descriptors
@@ -44,7 +50,7 @@ from loopy.kernel import LoopKernel
 class ValueArgDescriptor(ImmutableRecord):
     hash_fields = ()
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    update_persistent_hash = update_persistent_hash
 
 
 class ArrayArgDescriptor(ImmutableRecord):
@@ -94,7 +100,7 @@ class ArrayArgDescriptor(ImmutableRecord):
             "address_space",
             "dim_tags")
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    update_persistent_hash = update_persistent_hash
 
 # }}}
 
@@ -106,8 +112,6 @@ def get_kw_pos_association(kernel):
     Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in
     *kernel*.
     """
-    from loopy.kernel.tools import infer_arg_is_output_only
-    kernel = infer_arg_is_output_only(kernel)
     kw_to_pos = {}
     pos_to_kw = {}
 
@@ -131,7 +135,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord):
     """
     Helper class to set the
     :attr:`loopy.kernel.LoopKernel.override_get_grid_size_for_insn_ids` of the
-    callee kernels. Refer
+    callee kernels. Refer to
     :func:`loopy.kernel.function_interface.GridOverrideForCalleeKernel.__call__`,
     :func:`loopy.kernel.function_interface.CallbleKernel.with_hw_axes_sizes`.
 
@@ -145,7 +149,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord):
 
     .. note::
 
-        This class acts as a pseduo-callable and its significance lies in
+        This class acts as a pseudo-callable and its significance lies in
         solving picklability issues.
     """
     fields = set(["local_size", "global_size"])
@@ -154,7 +158,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord):
         self.local_size = local_size
         self.global_size = global_size
 
-    def __call__(self, insn_ids, program_callables_info, ignore_auto=True):
+    def __call__(self, insn_ids, callables_table, ignore_auto=True):
         return self.local_size, self.global_size
 
 # }}}
@@ -168,22 +172,26 @@ class InKernelCallable(ImmutableRecord):
 
     .. attribute:: name
 
-        The name of the callable which can be encountered within a kernel.
+        The name of the callable which can be encountered within expressions in
+        a kernel.
 
     .. attribute:: arg_id_to_dtype
 
-        A mapping which indicates the arguments types and result types it would
-        be handling. This would be set once the callable is type specialized.
+        A mapping which indicates the argument types and result types of the
+        callable.
 
     .. attribute:: arg_id_to_descr
 
         A mapping which gives indicates the argument shape and ``dim_tags`` it
-        would be responsible for generating code. These parameters would be set,
-        once it is shape and stride(``dim_tags``) specialized.
+        would be responsible for generating code.
 
     .. note::
+        - ``arg_id`` can either be an instance of :class:`int`
+          corresponding to the position of the argument or an instance of
+          :class:`str` corresponding to the name of a keyword argument accepted
+          by the function.
 
-        Negative "id" values ``-i`` in the mapping attributes indicate
+        - Negative "arg_id" values ``-i`` in the mapping attributes indicate
         return value with (0-based) index *i*.
 
     .. automethod:: __init__
@@ -209,9 +217,9 @@ class InKernelCallable(ImmutableRecord):
     def __getinitargs__(self):
         return (self.arg_id_to_dtype, self.arg_id_to_descr)
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    update_persistent_hash = update_persistent_hash
 
-    def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, caller_kernel, callables_table):
         """
         :arg arg_id_to_type: a mapping from argument identifiers
             (integers for positional arguments, names for keyword
@@ -228,12 +236,10 @@ class InKernelCallable(ImmutableRecord):
             Any argument information exists both by its positional and
             its keyword identifier.
         """
-        # FIXME: In all these with_** functions add that also passes a
-        # program_callables_info
 
         raise NotImplementedError()
 
-    def with_descrs(self, arg_id_to_descr, program_callables_info):
+    def with_descrs(self, arg_id_to_descr, callables_table):
         """
         :arg arg_id_to_descr: a mapping from argument identifiers
             (integers for positional arguments, names for keyword
@@ -284,7 +290,7 @@ class InKernelCallable(ImmutableRecord):
     def with_hw_axes_sizes(self, local_size, global_size):
         """
         Returns a copy of *self* with modifications to comply with the grid
-        sizes ``(local_size, global_size)`` of the kernel in which it is
+        sizes ``(local_size, global_size)`` of the program in which it is
         supposed to be called.
 
         :arg local_size: An instance of :class:`islpy.PwAff`.
@@ -298,7 +304,8 @@ class InKernelCallable(ImmutableRecord):
                 self.arg_id_to_descr is not None)
 
     def generate_preambles(self, target):
-        """ Yields the target specific preamble.
+        """
+        Yields the target specific preamble.
         """
         raise NotImplementedError()
 
@@ -333,12 +340,12 @@ class InKernelCallable(ImmutableRecord):
 
 class ScalarCallable(InKernelCallable):
     """
-    An abstranct interface the to a scalar callable encountered in a kernel.
+    An abstract interface to a scalar callable encountered in a kernel.
 
     .. note::
 
         The :meth:`ScalarCallable.with_types` is intended to assist with type
-        specialization of the funciton and is expected to be supplemented in the
+        specialization of the function and is expected to be supplemented in the
         derived subclasses.
     """
 
@@ -361,16 +368,16 @@ class ScalarCallable(InKernelCallable):
         return (self.arg_id_to_dtype, self.arg_id_to_descr,
                 self.name_in_target)
 
-    def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, caller_kernel, callables_table):
         raise LoopyError("No type inference information present for "
                 "the function %s." % (self.name))
 
-    def with_descrs(self, arg_id_to_descr, program_callables_info):
+    def with_descrs(self, arg_id_to_descr, callables_table):
 
         arg_id_to_descr[-1] = ValueArgDescriptor()
         return (
                 self.copy(arg_id_to_descr=arg_id_to_descr),
-                program_callables_info)
+                callables_table)
 
     def with_hw_axes_sizes(self, global_size, local_size):
         return self.copy()
@@ -525,7 +532,7 @@ class CallableKernel(InKernelCallable):
         return self.subkernel.name
 
     def with_types(self, arg_id_to_dtype, caller_kernel,
-            program_callables_info):
+            callables_table):
         kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)
 
         new_args = []
@@ -548,10 +555,10 @@ class CallableKernel(InKernelCallable):
 
         # infer the types of the written variables based on the knowledge
         # of the types of the arguments supplied
-        specialized_kernel, program_callables_info = (
+        specialized_kernel, callables_table = (
                 infer_unknown_types_for_a_single_kernel(
                     pre_specialized_subkernel,
-                    program_callables_info,
+                    callables_table,
                     expect_completion=True))
 
         new_arg_id_to_dtype = {}
@@ -564,9 +571,9 @@ class CallableKernel(InKernelCallable):
         # Return the kernel call with specialized subkernel and the corresponding
         # new arg_id_to_dtype
         return self.copy(subkernel=specialized_kernel,
-                arg_id_to_dtype=new_arg_id_to_dtype), program_callables_info
+                arg_id_to_dtype=new_arg_id_to_dtype), callables_table
 
-    def with_descrs(self, arg_id_to_descr, program_callables_info):
+    def with_descrs(self, arg_id_to_descr, callables_table):
 
         # tune the subkernel so that we have the matching shapes and
         # dim_tags
@@ -595,15 +602,15 @@ class CallableKernel(InKernelCallable):
                         type(descr))
         descriptor_specialized_knl = self.subkernel.copy(args=new_args)
         from loopy.preprocess import traverse_to_infer_arg_descr
-        descriptor_specialized_knl, program_callables_info = (
+        descriptor_specialized_knl, callables_table = (
                 traverse_to_infer_arg_descr(descriptor_specialized_knl,
-                    program_callables_info))
+                    callables_table))
 
         return (
                 self.copy(
                     subkernel=descriptor_specialized_knl,
                     arg_id_to_descr=arg_id_to_descr),
-                program_callables_info)
+                callables_table)
 
     def with_packing_for_args(self):
         from loopy.kernel.data import AddressSpace
@@ -696,7 +703,7 @@ class CallableKernel(InKernelCallable):
 
 class ManglerCallable(ScalarCallable):
     """
-    A callable whose characateristic is defined by a function mangler.
+    A callable whose characteristic is defined by a function mangler.
 
     .. attribute:: function_mangler
 
@@ -725,7 +732,7 @@ class ManglerCallable(ScalarCallable):
         return (self.name, self.function_mangler, self.arg_id_to_dtype,
                 self.arg_id_to_descr, self.name_in_target)
 
-    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, kernel, callables_table):
         if self.arg_id_to_dtype is not None:
             # specializing an already specialized function.
             for arg_id, dtype in arg_id_to_dtype.items():
@@ -749,7 +756,7 @@ class ManglerCallable(ScalarCallable):
             return (
                     self.copy(name_in_target=mangle_result.target_name,
                         arg_id_to_dtype=new_arg_id_to_dtype),
-                    program_callables_info)
+                    callables_table)
         else:
             # The function mangler does not agree with the arg id to dtypes
             # provided. Indicating that is illegal.
@@ -769,99 +776,4 @@ class ManglerCallable(ScalarCallable):
 
 # }}}
 
-
-# {{{ new pymbolic calls to scoped functions
-
-def next_indexed_variable(function):
-    """
-    Returns an instance of :class:`str` with the next indexed-name in the
-    sequence for the name of *function*.
-
-    *Example:* ``Variable('sin_0')`` will return ``'sin_1'``.
-
-    :arg function: Either an instance of :class:`pymbolic.primitives.Variable`
-        or :class:`loopy.reduction.ArgExtOp` or
-        :class:`loopy.reduction.SegmentedOp`.
-    """
-    from loopy.library.reduction import ArgExtOp, SegmentedOp
-    if isinstance(function, (ArgExtOp, SegmentedOp)):
-        return function.copy()
-    func_name = re.compile(r"^(?P<alpha>\S+?)_(?P<num>\d+?)$")
-
-    match = func_name.match(function.name)
-
-    if match is None:
-        if function.name[-1] == '_':
-            return "{old_name}0".format(old_name=function.name)
-        else:
-            return "{old_name}_0".format(old_name=function.name)
-
-    return "{alpha}_{num}".format(alpha=match.group('alpha'),
-            num=int(match.group('num'))+1)
-
-
-class FunctionNameChanger(RuleAwareIdentityMapper):
-    """
-    Changes the names of scoped functions in calls of expressions according to
-    the mapping ``calls_to_new_functions``
-    """
-
-    def __init__(self, rule_mapping_context, calls_to_new_names,
-            subst_expander):
-        super(FunctionNameChanger, self).__init__(rule_mapping_context)
-        self.calls_to_new_names = calls_to_new_names
-        self.subst_expander = subst_expander
-
-    def map_call(self, expr, expn_state):
-        name, tag = parse_tagged_name(expr.function)
-
-        if name not in self.rule_mapping_context.old_subst_rules:
-            expanded_expr = self.subst_expander(expr)
-            if expr in self.calls_to_new_names:
-                return type(expr)(
-                        ResolvedFunction(self.calls_to_new_names[expr]),
-                        tuple(self.rec(child, expn_state)
-                            for child in expr.parameters))
-            elif expanded_expr in self.calls_to_new_names:
-                # FIXME: this is horribly wrong logic.
-                # investigate how to make edits to a substitution rule
-                return type(expr)(
-                        ResolvedFunction(self.calls_to_new_names[expanded_expr]),
-                        tuple(self.rec(child, expn_state)
-                            for child in expanded_expr.parameters))
-            else:
-                return super(FunctionNameChanger, self).map_call(
-                        expr, expn_state)
-        else:
-            return self.map_substitution(name, tag, expr.parameters, expn_state)
-
-    def map_call_with_kwargs(self, expr, expn_state):
-
-        if expr in self.calls_to_new_names:
-            return type(expr)(
-                ResolvedFunction(self.calls_to_new_names[expr]),
-                tuple(self.rec(child, expn_state)
-                    for child in expr.parameters),
-                dict(
-                    (key, self.rec(val, expn_state))
-                    for key, val in six.iteritems(expr.kw_parameters))
-                    )
-        else:
-            return super(FunctionNameChanger, self).map_call_with_kwargs(
-                    expr, expn_state)
-
-
-def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names):
-    rule_mapping_context = SubstitutionRuleMappingContext(
-                    kernel.substitutions, kernel.get_var_name_generator())
-    subst_expander = SubstitutionRuleExpander(kernel.substitutions)
-    name_changer = FunctionNameChanger(rule_mapping_context,
-            pymbolic_calls_to_new_names, subst_expander)
-
-    return rule_mapping_context.finish_kernel(
-            name_changer.map_kernel(kernel))
-
-# }}}
-
-
 # vim: foldmethod=marker
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 8e238badb8cb9dbd2df270c56c28e6b923732f53..26856d64fc189191d818bd2ea3eac82c7bfa406e 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -755,7 +755,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn):
 # }}}
 
 
-def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=None):
+def assign_automatic_axes(kernel, callables_table, axis=0, local_size=None):
     logger.debug("%s: assign automatic axes" % kernel.name)
     # TODO: do the tag removal rigorously, might be easier after switching
     # to set() from tuple()
@@ -769,7 +769,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non
 
     if local_size is None:
         _, local_size = kernel.get_grid_size_upper_bounds_as_exprs(
-                program_callables_info, ignore_auto=True)
+                callables_table, ignore_auto=True)
 
     # {{{ axis assignment helper function
 
@@ -797,7 +797,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non
 
             return assign_automatic_axes(
                     kernel.copy(iname_to_tags=new_iname_to_tags),
-                    program_callables_info,
+                    callables_table,
                     axis=recursion_axis)
 
         if axis is None:
@@ -849,7 +849,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non
                             iname, inner_length=local_size[axis],
                             outer_tag=None, inner_tag=new_tag,
                             do_tagged_check=False),
-                        program_callables_info=program_callables_info,
+                        callables_table=callables_table,
                         axis=recursion_axis, local_size=local_size)
 
         if not kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase):
@@ -871,7 +871,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non
             del new_iname_to_tags[iname]
 
         return assign_automatic_axes(kernel.copy(iname_to_tags=new_iname_to_tags),
-                program_callables_info, axis=recursion_axis, local_size=local_size)
+                callables_table, axis=recursion_axis, local_size=local_size)
 
     # }}}
 
@@ -940,7 +940,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non
         return kernel
     else:
         return assign_automatic_axes(kernel,
-                program_callables_info=program_callables_info, axis=axis+1,
+                callables_table=callables_table, axis=axis+1,
                 local_size=local_size)
 
 # }}}
@@ -1253,7 +1253,7 @@ def draw_dependencies_as_unicode_arrows(
         for dep in insn.depends_on:
             reverse_deps.setdefault(dep, set()).add(insn.id)
 
-    # mapping of (from_id, to_id) tuples to column_index
+    # mapping of to_id tuples to column_index
     dep_to_column = {}
 
     # {{{ find column assignments
@@ -1330,7 +1330,7 @@ def draw_dependencies_as_unicode_arrows(
 
             elif insn.id in starts:
                 starts.remove(insn.id)
-                if starts:
+                if starts or pointed_at_insn_id not in processed_ids:
                     # will continue downward
                     row[col] = do_flag_downward(u"├", pointed_at_insn_id)
 
@@ -1866,7 +1866,7 @@ def find_aliasing_equivalence_classes(kernel):
 
 # {{{ callee kernel tools
 
-def get_direct_callee_kernels(kernel, program_callables_info, insn_ids=None,):
+def get_direct_callee_kernels(kernel, callables_table, insn_ids=None,):
     """
     Returns an instance of :class:`frozenset` of all the callee kernels
     called in instructions in the *kernel* whose IDs are given in *insn_ids*.
@@ -1892,8 +1892,8 @@ def get_direct_callee_kernels(kernel, program_callables_info, insn_ids=None,):
         from loopy.kernel.instruction import (CallInstruction,
                 MultiAssignmentBase, CInstruction, _DataObliviousInstruction)
         if isinstance(insn, CallInstruction):
-            if insn.expression.function.name in program_callables_info:
-                in_knl_callable = program_callables_info[
+            if insn.expression.function.name in callables_table:
+                in_knl_callable = callables_table[
                         insn.expression.function.name]
                 if isinstance(in_knl_callable, CallableKernel):
                     return in_knl_callable.subkernel
@@ -1914,7 +1914,7 @@ def get_direct_callee_kernels(kernel, program_callables_info, insn_ids=None,):
 
 # {{{ direction helper tools
 
-def infer_arg_is_output_only(kernel):
+def infer_args_are_output_only(kernel):
     """
     Returns a copy of *kernel* with the attribute ``is_output_only`` set.
 
diff --git a/loopy/library/function.py b/loopy/library/function.py
index 8338875d0ec9f57dcce702a603293d038a9fbd02..f225b62f9f77b889c7137d69ff7e3944268641fa 100644
--- a/loopy/library/function.py
+++ b/loopy/library/function.py
@@ -26,44 +26,54 @@ from loopy.kernel.function_interface import ScalarCallable
 
 
 class MakeTupleCallable(ScalarCallable):
-    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, kernel, callables_table):
         new_arg_id_to_dtype = arg_id_to_dtype.copy()
         for i in range(len(arg_id_to_dtype)):
             if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None:
                 new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i]
 
         return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
-            name_in_target="loopy_make_tuple"), program_callables_info)
+            name_in_target="loopy_make_tuple"), callables_table)
 
-    def with_descrs(self, arg_id_to_descr, program_callables_info):
+    def with_descrs(self, arg_id_to_descr, callables_table):
         from loopy.kernel.function_interface import ValueArgDescriptor
         new_arg_id_to_descr = dict(((id, ValueArgDescriptor()),
             (-id-1, ValueArgDescriptor())) for id in arg_id_to_descr.keys())
 
         return (
                 self.copy(arg_id_to_descr=new_arg_id_to_descr),
-                program_callables_info)
+                callables_table)
 
 
 class IndexOfCallable(ScalarCallable):
-    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, kernel, callables_table):
         new_arg_id_to_dtype = dict((i, dtype) for i, dtype in
                 arg_id_to_dtype.items() if dtype is not None)
         new_arg_id_to_dtype[-1] = kernel.index_dtype
 
         return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype),
-                program_callables_info)
+                callables_table)
 
 
-def loopy_specific_callable_scopers(target, identifier):
+def loopy_specific_callable_func_id_to_knl_callable_mappers(target, identifier):
+    """
+    Returns an instance of :class:`InKernelCallable` for the *identifier*
+    which is not present in *target*, but whose interface is given by
+    :mod:`loo.py`. Callables that fall in this category are --
+
+    - reductions leading to function calls like ``argmin``, ``argmax``.
+    - callables that have a predefined meaning in :mod:`loo.py` like
+      ``make_tuple``, ``indexof``, ``indexof_vec``.
+    """
     if identifier == "make_tuple":
         return MakeTupleCallable(name="make_tuple")
 
     if identifier in ["indexof", "indexof_vec"]:
         return IndexOfCallable(name=identifier)
 
-    from loopy.library.reduction import reduction_scoper
-    return reduction_scoper(target, identifier)
+    from loopy.library.reduction import (
+            reduction_func_id_to_in_knl_callable_mapper)
+    return reduction_func_id_to_in_knl_callable_mapper(target, identifier)
 
 
 # vim: foldmethod=marker
diff --git a/loopy/library/random123.py b/loopy/library/random123.py
index 59ca72df1c9f4e16ec94ee0d38eed30a9420f309..e59a892bb4c7b3bd7222bf61b29e0ade92195240 100644
--- a/loopy/library/random123.py
+++ b/loopy/library/random123.py
@@ -169,14 +169,14 @@ class Random123Callable(ScalarCallable):
     Records information about for the random123 functions.
     """
 
-    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, kernel, callables_table):
 
         if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
                 arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
             # the types provided aren't mature enough to specialize the
             # callable
             return (self.copy(),
-                    program_callables_info)
+                    callables_table)
 
         name = self.name
         target = kernel.target
@@ -195,7 +195,7 @@ class Random123Callable(ScalarCallable):
             return (
                     self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
                         name_in_target=fn+"_gen"),
-                    program_callables_info)
+                    callables_table)
 
         elif name == fn + "_f32":
             new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32),
@@ -203,7 +203,7 @@ class Random123Callable(ScalarCallable):
                     -2: ctr_dtype, 0: ctr_dtype, 1:
                     key_dtype}
             return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
-                    name_in_target=name), program_callables_info
+                    name_in_target=name), callables_table
 
         elif name == fn + "_f64":
             new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64),
@@ -211,10 +211,10 @@ class Random123Callable(ScalarCallable):
                     -2: ctr_dtype, 0: ctr_dtype, 1:
                     key_dtype}
             return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
-                    name_in_target=name), program_callables_info
+                    name_in_target=name), callables_table
 
         return (self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                program_callables_info)
+                callables_table)
 
     def generate_preambles(self, target):
         rng_variant = FUNC_NAMES_TO_RNG[self.name]
@@ -231,7 +231,7 @@ class Random123Callable(ScalarCallable):
         return
 
 
-def random123_function_scoper(target, identifier):
+def random123_function_id_to_in_knl_callable_mapper(target, identifier):
     if identifier in FUNC_NAMES_TO_RNG:
         return Random123Callable(name=identifier)
 
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index 6ec8e4b219d93e717ebdc4a1965531c28171c84f..dd0e1e3e9e73792629dbefeafac63e6b70f9a4ef 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -31,7 +31,7 @@ import numpy as np
 from loopy.symbolic import FunctionIdentifier
 from loopy.diagnostic import LoopyError
 from loopy.types import NumpyType
-from loopy.kernel import LoopKernel
+from loopy.tools import update_persistent_hash
 
 
 class ReductionOperation(object):
@@ -227,8 +227,7 @@ class ReductionOpFunction(FunctionIdentifier):
     hash_fields = (
             "reduction_op",)
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
-
+    update_persistent_hash = update_persistent_hash
 
 # }}}
 
@@ -286,7 +285,7 @@ class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation):
             "which",
             "op",)
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    update_persistent_hash = update_persistent_hash
 
 
 class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation):
@@ -299,7 +298,7 @@ class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation):
             "op",
             "base_reduction_class",)
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    update_persistent_hash = update_persistent_hash
 
 # }}}
 
@@ -355,7 +354,7 @@ class ArgMaxReductionOperation(_ArgExtremumReductionOperation):
             "update_comparison",
             "neutral_sign",)
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    update_persistent_hash = update_persistent_hash
 
 
 class ArgMinReductionOperation(_ArgExtremumReductionOperation):
@@ -367,7 +366,7 @@ class ArgMinReductionOperation(_ArgExtremumReductionOperation):
             "update_comparison",
             "neutral_sign",)
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    update_persistent_hash = update_persistent_hash
 
 # }}}
 
@@ -425,7 +424,7 @@ def parse_reduction_op(name):
 # {{{ reduction specific callables
 
 class ReductionCallable(ScalarCallable):
-    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, kernel, callables_table):
         scalar_dtype = arg_id_to_dtype[0]
         index_dtype = arg_id_to_dtype[1]
         result_dtypes = self.name.reduction_op.result_dtypes(kernel, scalar_dtype,
@@ -437,15 +436,15 @@ class ReductionCallable(ScalarCallable):
                 index_dtype) + "_op"
 
         return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
-                name_in_target=name_in_target), program_callables_info
+                name_in_target=name_in_target), callables_table
 
-    def with_descr(self, arg_id_to_descr, program_callables_info):
+    def with_descr(self, arg_id_to_descr, callables_table):
         from loopy.library.kernel.function_interface import ValueArgDescriptor
         new_arg_id_to_descr = arg_id_to_descr.copy()
         new_arg_id_to_descr[-1] = ValueArgDescriptor()
         return (
                 self.copy(arg_id_to_descr=arg_id_to_descr),
-                program_callables_info)
+                callables_table)
 
     def generate_preambles(self, target):
         if isinstance(self.name, ArgExtOp):
@@ -503,8 +502,8 @@ class ReductionCallable(ScalarCallable):
         return
 
 
-def reduction_scoper(target, identifier):
-    if isinstance(identifier, (ArgExtOp, SegmentedOp)):
+def reduction_func_id_to_in_knl_callable_mapper(target, identifier):
+    if isinstance(identifier, ReductionOpFunction):
         return ReductionCallable(name=identifier)
 
     return None
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index f6e6181cb39b713c6ae0f4d727859de31e9b178a..aa536d7aec5394debf9ec69383101de149b1085a 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -62,7 +62,7 @@ def prepare_for_caching(kernel):
                 and not isinstance(dtype, OpaqueType)
                 and dtype is not lp.auto
                 and dtype.target is not tgt):
-            arg = arg.copy(dtype=dtype.with_target(kernel.target))
+            arg = arg.copy(dtype=dtype.with_target(tgt), target=tgt)
 
         new_args.append(arg)
 
@@ -70,7 +70,7 @@ def prepare_for_caching(kernel):
     for name, temp in six.iteritems(kernel.temporary_variables):
         dtype = temp.dtype
         if dtype is not None and dtype is not lp.auto and dtype.target is not tgt:
-            temp = temp.copy(dtype=dtype.with_target(tgt))
+            temp = temp.copy(dtype=dtype.with_target(tgt), target=tgt)
 
         new_temporary_variables[name] = temp
 
@@ -897,7 +897,7 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain):
 # }}}
 
 
-def realize_reduction_for_single_kernel(kernel, program_callables_info,
+def realize_reduction_for_single_kernel(kernel, callables_table,
         insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False,
         force_scan=False, force_outer_iname_for_scan=None):
     """Rewrites reductions into their imperative form. With *insn_id_filter*
@@ -1019,7 +1019,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
 
     # {{{ sequential
 
-    def map_reduction_seq(expr, rec, program_callables_info, nresults, arg_dtypes,
+    def map_reduction_seq(expr, rec, callables_table, nresults, arg_dtypes,
             reduction_dtypes):
         outer_insn_inames = temp_kernel.insn_inames(insn)
 
@@ -1137,7 +1137,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
                 v[iname].lt_set(v[0] + ubound)).get_basic_sets()
         return bs
 
-    def map_reduction_local(expr, rec, program_callables_info, nresults, arg_dtypes,
+    def map_reduction_local(expr, rec, callables_table, nresults, arg_dtypes,
             reduction_dtypes):
         red_iname, = expr.inames
 
@@ -1377,7 +1377,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
 
     # {{{ sequential scan
 
-    def map_scan_seq(expr, rec, program_callables_info, nresults, arg_dtypes,
+    def map_scan_seq(expr, rec, callables_table, nresults, arg_dtypes,
             reduction_dtypes, sweep_iname, scan_iname, sweep_min_value,
             scan_min_value, stride):
         outer_insn_inames = temp_kernel.insn_inames(insn)
@@ -1466,7 +1466,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
 
     # {{{ local-parallel scan
 
-    def map_scan_local(expr, rec, program_callables_info, nresults, arg_dtypes,
+    def map_scan_local(expr, rec, callables_table, nresults, arg_dtypes,
             reduction_dtypes, sweep_iname, scan_iname, sweep_min_value,
             scan_min_value, stride):
 
@@ -1475,7 +1475,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
         assert scan_size > 0
 
         if scan_size == 1:
-            return map_reduction_seq(expr, rec, program_callables_info,
+            return map_reduction_seq(expr, rec, callables_table,
                     nresults, arg_dtypes, reduction_dtypes)
 
         outer_insn_inames = temp_kernel.insn_inames(insn)
@@ -1675,15 +1675,15 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
 
     # {{{ seq/par dispatch
 
-    def map_reduction(expr, rec, program_callables_info, nresults=1):
+    def map_reduction(expr, rec, callables_table, nresults=1):
         # Only expand one level of reduction at a time, going from outermost to
         # innermost. Otherwise we get the (iname + insn) dependencies wrong.
 
         from loopy.type_inference import (
                 infer_arg_and_reduction_dtypes_for_reduction_expression)
-        arg_dtypes, reduction_dtypes, program_callables_info = (
+        arg_dtypes, reduction_dtypes, callables_table = (
                 infer_arg_and_reduction_dtypes_for_reduction_expression(
-                    temp_kernel, expr, program_callables_info, unknown_types_ok))
+                    temp_kernel, expr, callables_table, unknown_types_ok))
 
         outer_insn_inames = temp_kernel.insn_inames(insn)
         bad_inames = frozenset(expr.inames) & outer_insn_inames
@@ -1792,7 +1792,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
                             for tag in temp_kernel.iname_tags(sweep_iname))))
                 elif parallel:
                     return map_scan_local(
-                            expr, rec, program_callables_info, nresults,
+                            expr, rec, callables_table, nresults,
                             arg_dtypes, reduction_dtypes,
                             sweep_iname, scan_param.scan_iname,
                             scan_param.sweep_lower_bound,
@@ -1800,7 +1800,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
                             scan_param.stride)
                 elif sequential:
                     return map_scan_seq(
-                            expr, rec, program_callables_info, nresults,
+                            expr, rec, callables_table, nresults,
                             arg_dtypes, reduction_dtypes, sweep_iname,
                             scan_param.scan_iname,
                             scan_param.sweep_lower_bound,
@@ -1821,12 +1821,12 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
 
         if n_sequential:
             assert n_local_par == 0
-            return map_reduction_seq(expr, rec, program_callables_info,
+            return map_reduction_seq(expr, rec, callables_table,
                     nresults, arg_dtypes, reduction_dtypes)
         else:
             assert n_local_par > 0
             return map_reduction_local(
-                    expr, rec, program_callables_info, nresults, arg_dtypes,
+                    expr, rec, callables_table, nresults, arg_dtypes,
                     reduction_dtypes)
 
     # }}}
@@ -1861,12 +1861,12 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
         from loopy.symbolic import Reduction
         if isinstance(insn.expression, Reduction) and nresults > 1:
             new_expressions = cb_mapper(insn.expression,
-                    program_callables_info=program_callables_info,
+                    callables_table=callables_table,
                     nresults=nresults)
         else:
             new_expressions = (
                     cb_mapper(insn.expression,
-                        program_callables_info=program_callables_info),)
+                        callables_table=callables_table),)
 
         if generated_insns:
             # An expansion happened, so insert the generated stuff plus
@@ -1960,10 +1960,10 @@ def realize_reduction(program, *args, **kwargs):
     assert isinstance(program, Program)
 
     new_resolved_functions = {}
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             new_subkernel = realize_reduction_for_single_kernel(
-                    in_knl_callable.subkernel, program.program_callables_info,
+                    in_knl_callable.subkernel, program.callables_table,
                     *args, **kwargs)
             in_knl_callable = in_knl_callable.copy(
                     subkernel=new_subkernel)
@@ -1976,9 +1976,9 @@ def realize_reduction(program, *args, **kwargs):
 
         new_resolved_functions[func_id] = in_knl_callable
 
-    new_program_callables_info = program.program_callables_info.copy(
+    new_callables_table = program.callables_table.copy(
             resolved_functions=new_resolved_functions)
-    return program.copy(program_callables_info=new_program_callables_info)
+    return program.copy(callables_table=new_callables_table)
 
 # }}}
 
@@ -2157,18 +2157,15 @@ def check_atomic_loads(kernel):
 
 class ArgDescrInferenceMapper(RuleAwareIdentityMapper):
     """
-    Returns a set of instances of :class:`tuple` (expr,
-    in_kernel_callable). The mapped `in_kernel_callable` of the
-    :class:`InKernelCallable` are descriptor specialized for the given
-    arguments.
+    Infers the :attr:`loopy.InKernelCallable.arg_id_to_descr` of the callees.
     """
 
     def __init__(self, rule_mapping_context, caller_kernel,
-            program_callables_info):
+            callables_table):
         super(ArgDescrInferenceMapper, self).__init__(
                 rule_mapping_context)
         self.caller_kernel = caller_kernel
-        self.program_callables_info = program_callables_info
+        self.callables_table = callables_table
 
     def map_call(self, expr, expn_state, **kwargs):
         from pymbolic.primitives import Call, CallWithKwargs
@@ -2209,12 +2206,12 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper):
         combined_arg_id_to_descr.update(assignee_id_to_descr)
 
         # specializing the function according to the parameter description
-        in_knl_callable = self.program_callables_info[expr.function.name]
-        new_in_knl_callable, self.program_callables_info = (
+        in_knl_callable = self.callables_table[expr.function.name]
+        new_in_knl_callable, self.callables_table = (
                 in_knl_callable.with_descrs(
-                    combined_arg_id_to_descr, self.program_callables_info))
-        self.program_callables_info, new_func_id = (
-                self.program_callables_info.with_callable(
+                    combined_arg_id_to_descr, self.callables_table))
+        self.callables_table, new_func_id = (
+                self.callables_table.with_callable(
                     expr.function.function,
                     new_in_knl_callable))
 
@@ -2258,44 +2255,53 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper):
         return kernel.copy(instructions=new_insns)
 
 
-def traverse_to_infer_arg_descr(kernel, program_callables_info):
+def traverse_to_infer_arg_descr(kernel, callables_table):
     """
     Returns a copy of *kernel* with the argument shapes and strides matching for
     scoped functions in the *kernel*. Refer
     :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`.
-    """
-    # FIXME: update this docs, once the design is finalized
 
+    .. note::
+
+        Initiates a walk starting from *kernel* to all its callee kernels.
+    """
     from loopy.symbolic import SubstitutionRuleMappingContext
 
     rule_mapping_context = SubstitutionRuleMappingContext(
             kernel.substitutions, kernel.get_var_name_generator())
 
     arg_descr_inf_mapper = ArgDescrInferenceMapper(rule_mapping_context,
-            kernel, program_callables_info)
+            kernel, callables_table)
 
     descr_inferred_kernel = rule_mapping_context.finish_kernel(
             arg_descr_inf_mapper.map_kernel(kernel))
 
-    return descr_inferred_kernel, arg_descr_inf_mapper.program_callables_info
+    return descr_inferred_kernel, arg_descr_inf_mapper.callables_table
 
 
 def infer_arg_descr(program):
-    root_kernel_callable = program.program_callables_info[program.name]
-    program_callables_info = (
-            program.program_callables_info.with_edit_callables_mode())
+    """
+    Returns a copy of *program* with the
+    :attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the
+    callables.
+    """
+    root_kernel_callable = program.callables_table[program.name]
+    old_callables_count = program.callables_table.callables_count
+    callables_table = (
+            program.callables_table.with_edit_callables_mode())
     root_kernel = program.root_kernel
 
-    new_root_kernel, program_callables_info = traverse_to_infer_arg_descr(
-            root_kernel, program_callables_info)
+    new_root_kernel, callables_table = traverse_to_infer_arg_descr(
+            root_kernel, callables_table)
     new_root_kernel_callable = root_kernel_callable.copy(
             subkernel=new_root_kernel)
-    program_callables_info, _ = program_callables_info.with_callable(program.name,
+    callables_table, _ = callables_table.with_callable(program.name,
             new_root_kernel_callable)
 
-    program_callables_info = program_callables_info.with_exit_edit_callables_mode()
+    callables_table = callables_table.with_exit_edit_callables_mode(
+            old_callables_count)
 
-    return program.copy(program_callables_info=program_callables_info)
+    return program.copy(callables_table=callables_table)
 
 # }}}
 
@@ -2305,7 +2311,7 @@ preprocess_cache = WriteOncePersistentDict(
         key_builder=LoopyKeyBuilder())
 
 
-def preprocess_single_kernel(kernel, program_callables_info, device=None):
+def preprocess_single_kernel(kernel, callables_table, device=None):
     from loopy.kernel import KernelState
     if kernel.state >= KernelState.PREPROCESSED:
         return kernel
@@ -2363,7 +2369,7 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None):
     #   because it manipulates the depends_on field, which could prevent
     #   defaults from being applied.
     kernel = realize_reduction_for_single_kernel(kernel,
-            program_callables_info, unknown_types_ok=False)
+            callables_table, unknown_types_ok=False)
 
     # Ordering restriction:
     # add_axes_to_temporaries_for_ilp because reduction accumulators
@@ -2408,80 +2414,93 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None):
     return kernel
 
 
-def preprocess_kernel(kernel, device=None):
-    # FIXME: error message?
-    return preprocess_program(kernel, device)
+# {{{ hw axes inference
+
+def infer_hw_axes_sizes(program):
+    """
+    Returns a copy of *program* with the hardware axes sizes inferred.
+
+    .. note::
+
+        - Firstly, computes the collective hardware axes sizes from all the
+          callable kernels.
+        - Then, overrides the grid sizes of all the callable kernels to the
+          collective value.
+    """
+
+    local_size, global_size = program.get_grid_size_upper_bounds()
+
+    resolved_function_with_hw_axes_sizes_inferred = {}
+
+    for func_id, in_knl_callable in (
+            program.callables_table.items()):
+        if func_id == program.name:
+            resolved_function_with_hw_axes_sizes_inferred[func_id] = (
+                    in_knl_callable)
+        else:
+            resolved_function_with_hw_axes_sizes_inferred[func_id] = (
+                    in_knl_callable.with_hw_axes_sizes(local_size, global_size))
+
+    new_callables_table = (
+            program.callables_table.copy(
+                resolved_functions=resolved_function_with_hw_axes_sizes_inferred))
+
+    return program.copy(callables_table=new_callables_table)
+
+# }}}
 
 
 def preprocess_program(program, device=None):
 
     if device is not None:
+        # FIXME: Time to remove this? (Git blame shows 5 years ago)
         from warnings import warn
         warn("passing 'device' to preprocess_kernel() is deprecated",
                 DeprecationWarning, stacklevel=2)
 
     program = infer_unknown_types(program, expect_completion=False)
 
-    # {{{ preprocess the root kernel
+    # {{{ preprocess callable kernels
 
     # Callable editing restrictions:
     #
-    # - cannot edit program_callables_info in :meth:`preprocess_single_kernel`
-    #   as we are iterating over it.
+    # - should not edit callables_table in :meth:`preprocess_single_kernel`
+    #   as we are iterating over it.[1]
     #
-    # Refer: https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects
+    # [1] https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects
 
     new_resolved_functions = {}
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             new_subkernel = preprocess_single_kernel(
-                    in_knl_callable.subkernel, program.program_callables_info,
+                    in_knl_callable.subkernel, program.callables_table,
                     device)
             in_knl_callable = in_knl_callable.copy(
                     subkernel=new_subkernel)
         elif isinstance(in_knl_callable, ScalarCallable):
             pass
         else:
-            raise NotImplementedError("Unknown type of callable %s." % (
+            raise NotImplementedError("Unknown callable type %s." % (
                 type(in_knl_callable).__name__))
 
         new_resolved_functions[func_id] = in_knl_callable
 
-    new_program_callables_info = program.program_callables_info.copy(
+    new_callables_table = program.callables_table.copy(
             resolved_functions=new_resolved_functions)
-    program = program.copy(program_callables_info=new_program_callables_info)
+    program = program.copy(callables_table=new_callables_table)
 
     # }}}
 
     # infer arg descrs of the callables
     program = infer_arg_descr(program)
 
-    # {{{ hw axes inference
+    program = infer_hw_axes_sizes(program)
 
-    # FIXME: think of wrapping this in a function?
-
-    local_size, global_size = program.get_grid_size_upper_bounds()
-
-    resolved_function_with_hw_axes_sizes_set = {}
-
-    for func_id, in_knl_callable in (
-            program.program_callables_info.items()):
-        if func_id == program.name:
-            resolved_function_with_hw_axes_sizes_set[func_id] = (
-                    in_knl_callable)
-        else:
-            resolved_function_with_hw_axes_sizes_set[func_id] = (
-                    in_knl_callable.with_hw_axes_sizes(local_size, global_size))
-
-    new_program_callables_info = (
-            program.program_callables_info.copy(
-                resolved_functions=resolved_function_with_hw_axes_sizes_set))
-
-    program = program.copy(program_callables_info=new_program_callables_info)
+    return program
 
-    # }}}
 
-    return program
+# FIXME: Do we add a deprecation warning?
+preprocess_kernel = preprocess_program
 
 
 # vim: foldmethod=marker
diff --git a/loopy/program.py b/loopy/program.py
index 096bd1eca86c130a488fbbbea2527f17315ac731..c8534f0511353da45977ab282df18a585b63e632 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -1,6 +1,6 @@
 from __future__ import division, absolute_import
 
-__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni"
 
 __license__ = """
 Permission is hereby granted, free of charge, to any person obtaining a copy
@@ -29,12 +29,31 @@ from pytools import ImmutableRecord, memoize_method
 from pymbolic.primitives import Variable
 from functools import wraps
 
-from loopy.symbolic import RuleAwareIdentityMapper, ResolvedFunction
+from loopy.symbolic import (RuleAwareIdentityMapper, ResolvedFunction,
+        CombineMapper, SubstitutionRuleExpander)
 from loopy.kernel.function_interface import (
         CallableKernel, ScalarCallable)
+from loopy.kernel.instruction import (
+        MultiAssignmentBase, CInstruction, _DataObliviousInstruction)
 from loopy.diagnostic import LoopyError
+from loopy.library.reduction import ReductionOpFunction
 
 from loopy.kernel import LoopKernel
+from loopy.tools import update_persistent_hash
+from collections import Counter
+from pymbolic.primitives import Call, CallWithKwargs
+
+__doc__ = """
+
+.. currentmodule:: loopy
+
+.. autoclass:: Program
+.. autoclass:: CallablesTable
+
+.. autofunction:: make_program
+.. autofunction:: iterate_over_kernels_if_given_program
+
+"""
 
 
 class ResolvedFunctionMarker(RuleAwareIdentityMapper):
@@ -55,12 +74,11 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper):
     :arg function_ids: A container with instances of :class:`str` indicating
         the function identifiers to look for while scoping functions.
     """
-    def __init__(self, rule_mapping_context, kernel, program_callables_info,
+    def __init__(self, rule_mapping_context, kernel, callables_table,
             function_id_to_in_knl_callable_mappers):
         super(ResolvedFunctionMarker, self).__init__(rule_mapping_context)
         self.kernel = kernel
-        self.program_callables_info = program_callables_info
-        # FIXME: function_resolvesrs looks like a very bad name change it
+        self.callables_table = callables_table
         self.function_id_to_in_knl_callable_mappers = (
                 function_id_to_in_knl_callable_mappers)
 
@@ -71,7 +89,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper):
         :arg:`identifier` is known to any kernel function scoper, otherwise returns
         *None*.
         """
-        # FIXME change docs
         for func_id_to_in_knl_callable_mapper in (
                 self.function_id_to_in_knl_callable_mappers):
             # fixme: do we really need to given target for the function
@@ -83,7 +100,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper):
         return None
 
     def map_call(self, expr, expn_state):
-        from pymbolic.primitives import Call, CallWithKwargs
         from loopy.symbolic import parse_tagged_name
 
         name, tag = parse_tagged_name(expr.function)
@@ -108,9 +124,9 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper):
                 # associate the newly created ResolvedFunction with the
                 # resolved in-kernel callable
 
-                self.program_callables_info, new_func_id = (
-                        self.program_callables_info.with_callable(expr.function,
-                            in_knl_callable, True))
+                self.callables_table, new_func_id = (
+                        self.callables_table.with_added_callable(
+                            expr.function, in_knl_callable))
                 return type(expr)(
                         ResolvedFunction(new_func_id),
                         tuple(self.rec(child, expn_state)
@@ -129,56 +145,104 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper):
                 expr.operation.get_scalar_callables()):
             in_knl_callable = self.find_in_knl_callable_from_identifier(func_id)
             assert in_knl_callable is not None
-            self.program_callables_info, _ = (
-                    self.program_callables_info.with_callable(func_id,
-                        in_knl_callable, True))
+            self.callables_table, _ = (
+                    self.callables_table.with_added_callable(func_id,
+                        in_knl_callable))
         return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state)
 
 
-def initialize_program_callables_info_from_kernel(
-        kernel, func_id_to_kernel_callable_mappers):
-    program_callables_info = ProgramCallablesInfo({})
-    program_callables_info = program_callables_info.with_edit_callables_mode()
+def _default_func_id_to_kernel_callable_mappers(target):
+    """
+    Returns a list of functions that are provided through *target* by default.
+    """
+    from loopy.library.function import (
+            loopy_specific_callable_func_id_to_knl_callable_mappers)
+    return (
+            [loopy_specific_callable_func_id_to_knl_callable_mappers] + (
+                target.get_device_ast_builder().function_id_in_knl_callable_mapper(
+                    )))
+
+
+def initialize_callables_table_from_kernel(kernel):
+    """
+    Returns an instance of :class:`loopy.CallablesTable`, by resolving
+    the functions based on :mod:`loopy`'s default function resolvers.
+    """
+    # collect the default function resolvers
+    func_id_to_kernel_callable_mappers = (
+            _default_func_id_to_kernel_callable_mappers(kernel.target))
+    callables_table = CallablesTable({})
 
     from loopy.symbolic import SubstitutionRuleMappingContext
     rule_mapping_context = SubstitutionRuleMappingContext(
             kernel.substitutions, kernel.get_var_name_generator())
 
     resolved_function_marker = ResolvedFunctionMarker(
-            rule_mapping_context, kernel, program_callables_info,
+            rule_mapping_context, kernel, callables_table,
             func_id_to_kernel_callable_mappers)
 
-    # scoping fucntions and collecting the scoped functions
+    # mark the functions as "Resolved" in the expression nodes.
     kernel_with_functions_resolved = rule_mapping_context.finish_kernel(
             resolved_function_marker.map_kernel(kernel))
-    program_callables_info = resolved_function_marker.program_callables_info
+    # collect the updated callables_table
+    callables_table = resolved_function_marker.callables_table
 
     callable_kernel = CallableKernel(kernel_with_functions_resolved)
-    program_callables_info, _ = program_callables_info.with_callable(
-            Variable(kernel.name), callable_kernel, True)
-    program_callables_info = (
-            program_callables_info.with_exit_edit_callables_mode())
 
-    return program_callables_info
+    # add the callable kernel to the callables_table
+    callables_table, _ = callables_table.with_added_callable(
+            Variable(kernel.name), callable_kernel)
+
+    return callables_table
 
 
 # {{{ program definition
 
 class Program(ImmutableRecord):
+    """
+    Records the information about all the callables in a :mod:`loopy` program.
+
+    .. attribute:: name
+
+        An instance of :class:`str`, also the name of the top-most level
+        :class:`loopy.LoopKernel`.
+
+    .. attribute:: callables_table
+
+        An instance of :class:`loopy.program.CallablesTable`.
+
+    .. attribute:: target
+
+        An instance of :class:`loopy.target.TargetBase`.
+
+    .. attribute:: func_id_to_in_knl_callable_mappers
+
+        A list of functions of the signature ``(target: TargetBase,
+        function_identifier: str)`` that would return an instance of
+        :class:`loopy.kernel.function_interface.InKernelCallable` or *None*.
+
+    .. note::
+
+        - To create an instance of :class:`loopy.Program`, it is recommended to
+          go through :func:`loopy.make_kernel`.
+        - This data structure and its attributes should be considered
+          immutable; any modifications should be done through :meth:`copy`.
+
+    .. automethod:: __init__
+    .. automethod:: with_root_kernel
+    """
     def __init__(self,
             name,
-            program_callables_info,
+            callables_table,
             target,
             func_id_to_in_knl_callable_mappers):
-        assert isinstance(program_callables_info, ProgramCallablesInfo)
+        assert isinstance(callables_table, CallablesTable)
 
-        # FIXME: check if all sanity checks have been covered?
-        # FIXME: The comments over here may need some attention.
-        assert name in program_callables_info
+        assert name in callables_table
 
         super(Program, self).__init__(
                 name=name,
-                program_callables_info=program_callables_info,
+                callables_table=callables_table,
                 target=target,
                 func_id_to_in_knl_callable_mappers=(
                     func_id_to_in_knl_callable_mappers))
@@ -187,18 +251,19 @@ class Program(ImmutableRecord):
 
     hash_fields = (
             "name",
-            "program_callables_info",
+            "callables_table",
             "target",)
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    update_persistent_hash = update_persistent_hash
 
     def copy(self, **kwargs):
         if 'target' in kwargs:
+            # target attribute of all the callable kernels should be updated.
             target = kwargs['target']
             new_self = super(Program, self).copy(**kwargs)
             new_resolved_functions = {}
             for func_id, in_knl_callable in (
-                    new_self.program_callables_info.items()):
+                    new_self.callables_table.items()):
                 if isinstance(in_knl_callable, CallableKernel):
                     subkernel = in_knl_callable.subkernel
                     new_resolved_functions[func_id] = in_knl_callable.copy(
@@ -206,11 +271,11 @@ class Program(ImmutableRecord):
                 else:
                     new_resolved_functions[func_id] = in_knl_callable
 
-            program_callables_info = new_self.program_callables_info.copy(
+            callables_table = new_self.callables_table.copy(
                     resolved_functions=new_resolved_functions)
 
             return super(Program, new_self).copy(
-                    program_callables_info=program_callables_info)
+                    callables_table=callables_table)
         else:
             return super(Program, self).copy(**kwargs)
 
@@ -221,7 +286,7 @@ class Program(ImmutableRecord):
         *global_size* and *local_size* are :class:`islpy.PwAff` objects.
         """
         return self.root_kernel.get_grid_size_upper_bounds(
-                self.program_callables_info,
+                self.callables_table,
                 ignore_auto=ignore_auto)
 
     def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False):
@@ -231,7 +296,7 @@ class Program(ImmutableRecord):
         *global_size* and *local_size* are :mod:`pymbolic` expressions
         """
         return self.root_kernel.get_grid_size_upper_bounds_as_exprs(
-                self.program_callables_info,
+                self.callables_table,
                 ignore_auto=ignore_auto)
 
     # {{{ implementation arguments
@@ -266,27 +331,53 @@ class Program(ImmutableRecord):
 
     @property
     def root_kernel(self):
-        return self.program_callables_info[self.name].subkernel
+        """
+        Returns an instance of :class:`loopy.LoopKernel` denoting the topmost
+        level kernel.
+
+        .. note::
+
+            Syntactic sugar.
+        """
+        return self.callables_table[self.name].subkernel
 
     @property
     def arg_dict(self):
+        """
+        Returns ``arg_dict`` of the ``root_kernel``.
+
+        .. note::
+
+            Syntactic sugar.
+        """
         return self.root_kernel.arg_dict
 
+    @property
+    def args(self):
+        """
+        Returns ``args`` of the ``root_kernel``.
+
+        .. note::
+
+            Syntactic sugar.
+        """
+        return self.root_kernel.args[:]
+
     def with_root_kernel(self, root_kernel):
-        new_in_knl_callable = self.program_callables_info[
+        """
+        Returns a copy of *self* with the topmost level kernel as
+        *root_kernel*.
+        """
+        new_in_knl_callable = self.callables_table[
                 self.name].copy(subkernel=root_kernel)
         new_resolved_functions = (
-                self.program_callables_info.resolved_functions.copy())
+                self.callables_table.resolved_functions.copy())
         new_resolved_functions[self.name] = new_in_knl_callable
 
         return self.copy(
-                program_callables_info=self.program_callables_info.copy(
+                callables_table=self.callables_table.copy(
                     resolved_functions=new_resolved_functions))
 
-    @property
-    def args(self):
-        return self.root_kernel.args[:]
-
     def __call__(self, *args, **kwargs):
         key = self.target.get_kernel_executor_cache_key(*args, **kwargs)
         try:
@@ -298,51 +389,46 @@ class Program(ImmutableRecord):
         return pex(*args, **kwargs)
 
     def __str__(self):
-        # FIXME: make this better
-        print(self.program_callables_info.num_times_callables_called)
-        return (
-                (self.program_callables_info[
-                    self.name].subkernel).__str__() +
-                '\nResolved Functions: ' +
-                (self.program_callables_info.resolved_functions.keys()).__str__() +
-                '\n' + 75*'-' + '\n')
+        return self.root_kernel.__str__()
 
 # }}}
 
 
-def next_indexed_function_identifier(function):
+def next_indexed_function_identifier(function_id):
     """
     Returns an instance of :class:`str` with the next indexed-name in the
     sequence for the name of *function*.
 
-    *Example:* ``Variable('sin_0')`` will return ``'sin_1'``.
+    *Example:* ``'sin_0'`` will return ``'sin_1'``.
 
-    :arg function: Either an instance of :class:`pymbolic.primitives.Variable`
-        or :class:`loopy.reduction.ArgExtOp` or
-        :class:`loopy.reduction.SegmentedOp`.
+    :arg function_id: An instance of :class:`str`.
     """
-    from loopy.library.reduction import ArgExtOp, SegmentedOp
-    if isinstance(function, (ArgExtOp, SegmentedOp)):
-        return function.copy()
-    elif isinstance(function, str):
-        function = Variable(function)
 
-    assert isinstance(function, Variable)
+    # {{{ sanity checks
+
+    assert isinstance(function_id, str)
+
+    # }}}
+
     func_name = re.compile(r"^(?P<alpha>\S+?)_(?P<num>\d+?)$")
 
-    match = func_name.match(function.name)
+    match = func_name.match(function_id)
 
     if match is None:
-        if function.name[-1] == '_':
-            return "{old_name}0".format(old_name=function.name)
+        if function_id[-1] == '_':
+            return "{old_name}0".format(old_name=function_id)
         else:
-            return "{old_name}_0".format(old_name=function.name)
+            return "{old_name}_0".format(old_name=function_id)
 
     return "{alpha}_{num}".format(alpha=match.group('alpha'),
             num=int(match.group('num'))+1)
 
 
 class ResolvedFunctionRenamer(RuleAwareIdentityMapper):
+    """
+    Mapper to rename the resolved functions in an expression according to
+    *renaming_dict*.
+    """
     def __init__(self, rule_mapping_context, renaming_dict):
         super(ResolvedFunctionRenamer, self).__init__(
                 rule_mapping_context)
@@ -358,6 +444,10 @@ class ResolvedFunctionRenamer(RuleAwareIdentityMapper):
 
 def rename_resolved_functions_in_a_single_kernel(kernel,
         renaming_dict):
+    """
+    Returns a copy of *kernel* with the instances of :class:`ResolvedFunction`
+    renamed according to *renaming_dict*.
+    """
     from loopy.symbolic import SubstitutionRuleMappingContext
     rule_mapping_context = SubstitutionRuleMappingContext(
                 kernel.substitutions, kernel.get_var_name_generator())
@@ -368,188 +458,421 @@ def rename_resolved_functions_in_a_single_kernel(kernel,
                 resolved_function_renamer.map_kernel(kernel)))
 
 
+# {{{ counting helpers
+
+class CallablesCountingMapper(CombineMapper):
+    """
+    Maps an expression to an instance of :class:`collections.Counter` with the
+    count of the callables registered in *callables_table* that it calls.
+
+    .. attribute:: callables_table
+
+        An instance of :class:`loopy.program.CallablesTable`.
+    """
+    def __init__(self, callables_table):
+        self.callables_table = callables_table
+
+    def combine(self, values):
+        return sum(values, Counter())
+
+    def map_call(self, expr):
+        # plain Call nodes are treated as having no keyword arguments.
+        if isinstance(expr, CallWithKwargs):
+            kw_parameters = expr.kw_parameters
+        else:
+            assert isinstance(expr, Call)
+            kw_parameters = {}
+
+        if isinstance(expr.function, (ResolvedFunction)):
+            in_knl_callable = self.callables_table[expr.function.name]
+            if isinstance(in_knl_callable, ScalarCallable):
+                return (Counter([expr.function.name]) +
+                        self.combine((self.rec(child) for child in expr.parameters
+                            + tuple(kw_parameters.values()))))
+
+            elif isinstance(in_knl_callable, CallableKernel):
+                # callable kernels have more callables in them, these are
+                # counted in addition to the call itself and its arguments.
+                callables_count_in_subkernel = (
+                        count_callables_in_kernel(
+                            in_knl_callable.subkernel,
+                            self.callables_table))
+
+                return (Counter([expr.function.name]) +
+                        self.combine((self.rec(child) for child in expr.parameters
+                            + tuple(kw_parameters.values())))) + (
+                                    callables_count_in_subkernel)
+            else:
+                raise NotImplementedError("Unknown callable type %s." % (
+                    type(in_knl_callable).__name__))
+        else:
+            return (
+                    self.combine((self.rec(child) for child in expr.parameters
+                        + tuple(kw_parameters.values()))))
+
+    map_call_with_kwargs = map_call
+
+    def map_reduction(self, expr):
+        return Counter(expr.operation.get_scalar_callables()) + (
+                super(CallablesCountingMapper, self).map_reduction(expr))
+
+    def map_constant(self, expr):
+        return Counter()
+
+    map_variable = map_constant
+    map_function_symbol = map_constant
+    map_tagged_variable = map_constant
+    map_type_cast = map_constant
+
+
+@memoize_method
+def count_callables_in_kernel(kernel, callables_table):
+    """
+    Counts the callables registered in *callables_table* that are called by
+    the instructions of *kernel*, after expanding substitution rules, and
+    returns the result as an instance of :class:`collections.Counter`.
+    """
+    assert isinstance(kernel, LoopKernel)
+    expand_substs = SubstitutionRuleExpander(kernel.substitutions)
+    count_in_expr = CallablesCountingMapper(callables_table)
+    total = Counter()
+
+    for insn in kernel.instructions:
+        if isinstance(insn, MultiAssignmentBase):
+            # count on the expression with substitution rules expanded
+            total += count_in_expr(
+                    expand_substs(insn.expression))
+            continue
+
+        # these instruction kinds are skipped, anything else is an error
+        if not isinstance(insn, (_DataObliviousInstruction, CInstruction)):
+            raise NotImplementedError("Unknown instruction type %s." % (
+                type(insn)))
+
+    return total
+
+# }}}
+
+
 # {{{ program callables info
 
-class ProgramCallablesInfo(ImmutableRecord):
-    # FIXME: dont evalutate num_times_called, rahter compute it from the
-    # resolved_functions
-    # FIXME: make the edit callables thing a ContextManager.
-    def __init__(self, resolved_functions, num_times_callables_called=None,
-            history=None, is_being_edited=False,
-            num_times_hit_during_editing={},
-            renames_needed_after_editing={}):
-
-        if num_times_callables_called is None:
-            num_times_callables_called = dict((func_id, 1) for func_id in
-                    resolved_functions)
+class CallablesTable(ImmutableRecord):
+    # FIXME: is CallablesTable a better name?(similar to symbol table in
+    # compilers.)
+    """
+    Records the information of all the callables called in a :class:`loopy.Program`.
+
+    .. attribute:: resolved_functions
+
+        An instance of :class:`dict` that contains a mapping from function
+        identifier to instances of
+        :class:`loopy.kernel.function_interface.InKernelCallable`
+
+    .. attribute:: history
+
+        An instance of :class:`dict` that contains a mapping from function
+        identifier to an instance of :class:`frozenset` that contains all the
+        names taken by a function up to the current name. (For example: one
+        possibility could be ``{'sin_1': frozenset(['sin', 'sin_0', 'sin_1'])}``)
+
+    .. attribute:: is_being_edited
+
+        An instance of :class:`bool` which is intended to aid the working of
+        :meth:`with_edit_callables_mode`, :meth:`with_callable` and
+        :meth:`with_exit_edit_callables_mode`.
+
+    .. automethod:: __init__
+    .. automethod:: callables_count
+    .. automethod:: with_added_callable
+    .. automethod:: with_edit_callables_mode
+    .. automethod:: with_callable
+    .. automethod:: with_exit_edit_callables_mode
+    """
+    def __init__(self, resolved_functions,
+            history=None, is_being_edited=False):
+
         if history is None:
-            history = dict((func_id, set([func_id])) for func_id in
+            history = dict((func_id, frozenset([func_id])) for func_id in
                     resolved_functions)
 
-        super(ProgramCallablesInfo, self).__init__(
+        super(CallablesTable, self).__init__(
                 resolved_functions=resolved_functions,
-                num_times_callables_called=num_times_callables_called,
                 history=history,
-                is_being_edited=is_being_edited,
-                num_times_hit_during_editing=num_times_hit_during_editing,
-                renames_needed_after_editing=renames_needed_after_editing)
+                is_being_edited=is_being_edited)
 
     hash_fields = (
             "resolved_functions",
-            "num_times_callables_called",
             "is_being_edited",
-            "num_times_hit_during_editing",
-            "renames_needed_after_editing",
             "history")
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    def __hash__(self):
+        return hash((
+            frozenset(six.iteritems(self.resolved_functions)),
+            frozenset(six.iteritems(self.history)),
+            self.is_being_edited
+            ))
+
+    update_persistent_hash = update_persistent_hash
+
+    @property
+    @memoize_method
+    def callables_count(self):
+        """
+        Returns an instance of :class:`collections.Counter` representing the
+        number of times each callable is called in the callables table.
+        """
+        root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable
+                in self.values() if
+                isinstance(in_knl_callable, CallableKernel) and
+                in_knl_callable.subkernel.is_called_from_host]
+
+        from collections import Counter
+        callables_count = Counter([root_kernel_name])
+        callables_count += (
+                count_callables_in_kernel(self[
+                    root_kernel_name].subkernel, self))
+
+        return callables_count
+
+    # {{{ interface to perform edits on callables
+
+    def with_added_callable(self, function, in_kernel_callable):
+        """
+        Returns an instance of :class:`tuple` of ``(new_self, new_function)``.
+        ``new_self`` is a copy of *self* with the *function* associated with the
+        *in_kernel_callable*. ``new_function`` is the function identifier that
+        should be noted in the expression node so that it could be associated
+        with an instance of :class:`InKernelCallable`.
+
+        .. note::
+
+            - Always checks whether the
+              :attr:`loopy.CallablesTable.resolved_functions` has
+              *in_kernel_callable*, does not introduce copies.
+
+            - The difference between
+              :meth:`loopy.CallablesTable.with_added_callable`
+              and :meth:`CallablesTable.with_callable` being that
+              the former has no support for renaming the callable back i.e.
+              ``with_callable`` supports renaming from ``sin_0`` to ``sin``,
+              if possible, through the member method
+              ``loopy.CallablesTable.with_exit_edit_callables_mode``
+
+              This subtle difference makes --
+
+              - :meth:`loopy.CallablesTable.with_added_callable` suitable
+                for usage while resolving the functions first time, where no
+                renaming is needed.
+
+              - :meth:`loopy.CallablesTable.with_callable` suitable for
+                implementing edits in callables during inference-walks.
+        """
+
+        # {{{ sanity checks
+
+        if isinstance(function, str):
+            function = Variable(function)
+
+        assert isinstance(function, (Variable, ReductionOpFunction))
+
+        # }}}
+
+        history = self.history.copy()
+
+        if in_kernel_callable in self.resolved_functions.values():
+            # the callable already exists, implies return the function
+            # identifier corresponding to that callable.
+            for func_id, in_knl_callable in self.resolved_functions.items():
+                if in_knl_callable == in_kernel_callable:
+                    history[func_id] = history[func_id] | frozenset([function.name])
+                    return (
+                            self.copy(
+                                history=history),
+                            func_id)
+        else:
+
+            # {{{ handle ReductionOpFunction
+
+            if isinstance(function, ReductionOpFunction):
+                unique_function_identifier = function.copy()
+                updated_resolved_functions = self.resolved_functions.copy()
+                updated_resolved_functions[unique_function_identifier] = (
+                        in_kernel_callable)
+                history[unique_function_identifier] = frozenset(
+                        [unique_function_identifier])
+
+                return (
+                        self.copy(
+                            history=history,
+                            resolved_functions=updated_resolved_functions),
+                        unique_function_identifier)
+
+            # }}}
+
+            unique_function_identifier = function.name
+
+            if isinstance(in_kernel_callable, CallableKernel) and (
+                    in_kernel_callable.subkernel.is_called_from_host):
+                # do not rename root kernel
+                pass
+            else:
+                while unique_function_identifier in self.resolved_functions:
+                    unique_function_identifier = (
+                            next_indexed_function_identifier(
+                                unique_function_identifier))
+
+        updated_resolved_functions = self.resolved_functions.copy()
+        updated_resolved_functions[unique_function_identifier] = (
+                in_kernel_callable)
+
+        history[unique_function_identifier] = frozenset(
+                [unique_function_identifier])
+
+        return (
+                self.copy(
+                    history=history,
+                    resolved_functions=updated_resolved_functions),
+                Variable(unique_function_identifier))
 
     def with_edit_callables_mode(self):
-        return self.copy(is_being_edited=True,
-                num_times_hit_during_editing=dict((func_id, 0) for func_id in
-                    self.resolved_functions))
+        """
+        Returns a copy of *self* for a walk traversal through all the callables.
+        """
+        return self.copy(
+                is_being_edited=True)
 
-    def with_callable(self, function, in_kernel_callable,
-            resolved_for_the_first_time=False):
+    def with_callable(self, function, in_kernel_callable):
         """
+        Returns an instance of :class:`tuple` ``(new_self, new_function)``.
+        Also refer -- :meth:`loopy.CallablesTable.with_added_callable`
+
+
         :arg function: An instance of :class:`pymbolic.primitives.Variable` or
             :class:`loopy.library.reduction.ReductionOpFunction`.
 
-        :arg in_kernel_callables: An instance of
+        :arg in_kernel_callable: An instance of
             :class:`loopy.InKernelCallable`.
 
         .. note::
 
-            Assumes that each callable is touched atmost once, the internal
-            working of this function fails if that is violated.
+            - Use :meth:`with_added_callable` if a callable is being resolved for the
+              first time.
         """
-        # FIXME: add a note about using enter and exit. ~KK
-        # FIXME: think about a better idea of "with_added_callable" this would
-        # be more convenient for developer-faced usage. ~KK
+
+        # {{{ non-edit mode
 
         if not self.is_being_edited:
             if function.name in self.resolved_functions and (
                     self.resolved_functions[function.name] == in_kernel_callable):
+                # if not being edited, check that the given function is
+                # equal to the old version of the callable.
                 return self, function
             else:
                 print('Old: ', self.resolved_functions[function.name])
                 print('New: ', in_kernel_callable)
-                raise LoopyError("Use 'enter_edit_callables_mode' first.")
+                raise LoopyError("Use 'with_enter_edit_callables_mode' first.")
 
-        from loopy.library.reduction import ArgExtOp, SegmentedOp
+        # }}}
 
         # {{{ sanity checks
 
         if isinstance(function, str):
             function = Variable(function)
 
-        assert isinstance(function, (Variable, ArgExtOp, SegmentedOp))
+        assert isinstance(function, (Variable, ReductionOpFunction))
 
         # }}}
 
-        renames_needed_after_editing = self.renames_needed_after_editing.copy()
-        num_times_hit_during_editing = self.num_times_hit_during_editing.copy()
-        num_times_callables_called = self.num_times_callables_called.copy()
         history = self.history.copy()
 
-        if not resolved_for_the_first_time:
-            if isinstance(function, (ArgExtOp, SegmentedOp)):
-                num_times_hit_during_editing[function] += 1
-            else:
-                num_times_hit_during_editing[function.name] += 1
-
-        if isinstance(function, (ArgExtOp, SegmentedOp)):
-            unique_function_identifier = function.copy()
-            if not resolved_for_the_first_time:
-                num_times_callables_called[function] -= 1
-
-            num_times_callables_called[unique_function_identifier] = 1
-
-            updated_resolved_functions = self.resolved_functions.copy()
-            updated_resolved_functions[unique_function_identifier] = (
-                    in_kernel_callable)
-
-            return (
-                    self.copy(
-                        resolved_functions=updated_resolved_functions,
-                        num_times_callables_called=num_times_callables_called,
-                        num_times_hit_during_editing=(
-                            num_times_hit_during_editing),
-                        renames_needed_after_editing=(
-                            renames_needed_after_editing)),
-                    unique_function_identifier)
-
         if in_kernel_callable in self.resolved_functions.values():
-            # the callable already exists, implies return the function
-            # identifier corresposing to that callable.
+
+            # the callable already exists, hence return the function
+            # identifier corresponding to that callable.
             for func_id, in_knl_callable in self.resolved_functions.items():
                 if in_knl_callable == in_kernel_callable:
-                    num_times_callables_called[func_id] += 1
-                    if not resolved_for_the_first_time:
-                        num_times_callables_called[function.name] -= 1
-                        if num_times_callables_called[function.name] == 0:
-                            renames_needed_after_editing[func_id] = function.name
-
-                        history[func_id] = history[func_id] | set([function.name])
+                    history[func_id] = history[func_id] | frozenset([function.name])
                     return (
                             self.copy(
-                                history=history,
-                                num_times_hit_during_editing=(
-                                    num_times_hit_during_editing),
-                                num_times_callables_called=(
-                                    num_times_callables_called),
-                                renames_needed_after_editing=(
-                                    renames_needed_after_editing)),
+                                history=history),
                             func_id)
         else:
-            # FIXME: maybe deal with the history over here?
-            # FIXME: once the code logic is running beautify this part.
-            # many "ifs" can be avoided
+            # {{{ handle ReductionOpFunction
+
+            if isinstance(function, ReductionOpFunction):
+                unique_function_identifier = function.copy()
+                updated_resolved_functions = self.resolved_functions.copy()
+                updated_resolved_functions[unique_function_identifier] = (
+                        in_kernel_callable)
+
+                return (
+                        self.copy(
+                            resolved_functions=updated_resolved_functions),
+                        unique_function_identifier)
+
+            # }}}
             unique_function_identifier = function.name
-            if (resolved_for_the_first_time or
-                    self.num_times_callables_called[function.name] > 1):
+
+            if isinstance(in_kernel_callable, CallableKernel) and (
+                    in_kernel_callable.subkernel.is_called_from_host):
+                # do not rename root kernel
+                pass
+            else:
                 while unique_function_identifier in self.resolved_functions:
                     unique_function_identifier = (
                             next_indexed_function_identifier(
                                 unique_function_identifier))
 
-            if not resolved_for_the_first_time:
-                num_times_callables_called[function.name] -= 1
-
-            num_times_callables_called[unique_function_identifier] = 1
-
         updated_resolved_functions = self.resolved_functions.copy()
         updated_resolved_functions[unique_function_identifier] = (
                 in_kernel_callable)
 
-        if not resolved_for_the_first_time:
-            history[unique_function_identifier] = (
-                    history[function.name] | set([unique_function_identifier]))
-        else:
-            history[unique_function_identifier] = set(
-                    [unique_function_identifier])
+        history[unique_function_identifier] = (
+                history[function.name] | frozenset([unique_function_identifier]))
 
         return (
                 self.copy(
                     history=history,
-                    resolved_functions=updated_resolved_functions,
-                    num_times_callables_called=num_times_callables_called,
-                    num_times_hit_during_editing=num_times_hit_during_editing,
-                    renames_needed_after_editing=renames_needed_after_editing),
+                    resolved_functions=updated_resolved_functions),
                 Variable(unique_function_identifier))
 
-    def with_exit_edit_callables_mode(self):
+    def with_exit_edit_callables_mode(self, old_callables_count):
+        """
+        Returns a copy of *self* with renaming of the callables done whenever
+        possible.
+
+        *For example:* If all the ``sin`` got diverged as ``sin_0, sin_1``,
+        then all the renaming is done such that one of flavors of the callable
+        is renamed back to ``sin``.
+        """
+
         assert self.is_being_edited
 
-        num_times_callables_called = {}
-        resolved_functions = {}
-        history = self.history.copy()
+        new_callables_count = self.callables_count
 
-        for func_id, in_knl_callable in self.resolved_functions.items():
+        # {{{ calculate the renames needed
+
+        renames_needed = {}
+        for old_func_id in old_callables_count-new_callables_count:
+            # this implies that all the function instances having the name
+            # "old_func_id" have been renamed to something else.
+            for new_func_id in (
+                    six.viewkeys(new_callables_count)-six.viewkeys(renames_needed)):
+                if old_func_id in self.history[new_func_id]:
+                    renames_needed[new_func_id] = old_func_id
+                    break
+        # }}}
+
+        new_resolved_functions = {}
+        new_history = {}
+
+        for func_id in new_callables_count:
+            in_knl_callable = self.resolved_functions[func_id]
             if isinstance(in_knl_callable, CallableKernel):
+                # if callable kernel, perform renames inside its expressions.
                 old_subkernel = in_knl_callable.subkernel
                 new_subkernel = rename_resolved_functions_in_a_single_kernel(
-                        old_subkernel, self.renames_needed_after_editing)
+                        old_subkernel, renames_needed)
                 in_knl_callable = (
                         in_knl_callable.copy(subkernel=new_subkernel))
             elif isinstance(in_knl_callable, ScalarCallable):
@@ -558,45 +881,23 @@ class ProgramCallablesInfo(ImmutableRecord):
                 raise NotImplementedError("Unknown callable type %s." %
                         type(in_knl_callable).__name__)
 
-            if func_id in self.renames_needed_after_editing:
-                history.pop(func_id)
-
-                new_func_id = self.renames_needed_after_editing[func_id]
-                resolved_functions[new_func_id] = (
+            if func_id in renames_needed:
+                new_func_id = renames_needed[func_id]
+                new_resolved_functions[new_func_id] = (
                         in_knl_callable)
-                num_times_callables_called[new_func_id] = (
-                        self.num_times_callables_called[func_id])
-
+                new_history[new_func_id] = self.history[func_id]
             else:
-                resolved_functions[func_id] = in_knl_callable
-                num_times_callables_called[func_id] = (
-                        self.num_times_callables_called[func_id])
+                new_resolved_functions[func_id] = in_knl_callable
+                new_history[func_id] = self.history[func_id]
 
         return self.copy(
                 is_being_edited=False,
-                resolved_functions=resolved_functions,
-                num_times_callables_called=num_times_callables_called,
-                num_times_hit_during_editing={},
-                renames_needed_after_editing={})
-
-    def with_deleted_callable(self, func_id, instances=1):
-        num_times_callables_called = self.num_times_callables_called.copy()
-        history = self.history.copy()
-        resolved_functions = self.resolved_functions.copy()
-
-        assert instances <= num_times_callables_called[func_id]
+                resolved_functions=new_resolved_functions,
+                history=new_history)
 
-        num_times_callables_called[func_id] -= instances
+    # }}}
 
-        if num_times_callables_called[func_id] == 0:
-            num_times_callables_called.pop(func_id)
-            history.pop(func_id)
-            resolved_functions.pop(func_id)
-
-        return self.copy(
-                resolved_functions=resolved_functions,
-                num_times_callables_called=num_times_callables_called,
-                history=history)
+    # {{{ behave like a dict(syntactic sugar)
 
     def __getitem__(self, item):
         return self.resolved_functions[item]
@@ -605,44 +906,53 @@ class ProgramCallablesInfo(ImmutableRecord):
         return item in self.resolved_functions
 
     def items(self):
-        return self.resolved_functions.items()
+        return six.iteritems(self.resolved_functions)
 
     def values(self):
-        return self.resolved_functions.values()
-
+        return six.itervalues(self.resolved_functions)
 
-# }}}
+    def keys(self):
+        return six.iterkeys(self.resolved_functions)
 
+    # }}}
 
-def default_func_id_to_kernel_callable_mappers(target):
+# }}}
 
-    from loopy.library.function import loopy_specific_callable_scopers
-    return (
-            [loopy_specific_callable_scopers] + (
-                target.get_device_ast_builder().function_scopers()))
 
+# {{{ helper functions
 
-def make_program_from_kernel(kernel):
+def make_program(kernel):
+    """
+    Returns an instance of :class:`loopy.Program` with the *kernel* as the root
+    kernel.
+    """
 
-    program_callables_info = initialize_program_callables_info_from_kernel(kernel,
-            default_func_id_to_kernel_callable_mappers(kernel.target))
+    # build the callables table from the kernel
+    callables_table = initialize_callables_table_from_kernel(kernel)
 
+    # assemble the program from the kernel's name, callables table and target
     program = Program(
             name=kernel.name,
-            program_callables_info=program_callables_info,
+            callables_table=callables_table,
             func_id_to_in_knl_callable_mappers=(
-                default_func_id_to_kernel_callable_mappers(kernel.target)),
+                _default_func_id_to_kernel_callable_mappers(kernel.target)),
             target=kernel.target)
 
     return program
 
 
 def iterate_over_kernels_if_given_program(transform_for_single_kernel):
+    """
+    Function wrapper for transformations of the type ``transform(kernel:
+    LoopKernel, *args, **kwargs): LoopKernel``. Returns a function with the
+    ``transform`` being implemented on all of the callable kernels in a
+    :class:`loopy.Program`.
+    """
     def _collective_transform(program_or_kernel, *args, **kwargs):
         if isinstance(program_or_kernel, Program):
             program = program_or_kernel
             new_resolved_functions = {}
-            for func_id, in_knl_callable in program.program_callables_info.items():
+            for func_id, in_knl_callable in program.callables_table.items():
                 if isinstance(in_knl_callable, CallableKernel):
                     new_subkernel = transform_for_single_kernel(
                             in_knl_callable.subkernel, *args, **kwargs)
@@ -657,9 +967,9 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel):
 
                 new_resolved_functions[func_id] = in_knl_callable
 
-            new_program_callables_info = program.program_callables_info.copy(
+            new_callables_table = program.callables_table.copy(
                     resolved_functions=new_resolved_functions)
-            return program.copy(program_callables_info=new_program_callables_info)
+            return program.copy(callables_table=new_callables_table)
         else:
             assert isinstance(program_or_kernel, LoopKernel)
             kernel = program_or_kernel
@@ -667,17 +977,6 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel):
 
     return wraps(transform_for_single_kernel)(_collective_transform)
 
-
-# {{{ ingoring this for now
-
-# if False and isinstance(function, (ArgExtOp, SegmentedOp)):
-# FIXME: ignoring this casse for now
-# FIXME: If a kernel has two flavors of ArgExtOp then they are
-# overwritten and hence not supported.(for now).
-# updated_resolved_functions = self.scoped_functions.copy()
-# updated_resolved_functions[function] = in_kernel_callable
-# return self.copy(updated_resolved_functions), function.copy()
-
 # }}}
 
 
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 201bcc2562754a30d20d3903ecf01d0d8984b11e..3dc1c0bbe71e43c134360650c134e5d6d9940cc7 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -794,9 +794,13 @@ def generate_loop_schedules_internal(
 
         if not is_ready:
             if debug_mode:
-                print("instruction '%s' is missing insn depedencies '%s'" % (
-                        format_insn(kernel, insn.id), ",".join(
-                            insn.depends_on - sched_state.scheduled_insn_ids)))
+                # Missing-dependency messages are not that interesting when
+                # understanding scheduler failures, so they are disabled.
+
+                # print("instruction '%s' is missing insn depedencies '%s'" % (
+                #         format_insn(kernel, insn.id), ",".join(
+                #             insn.depends_on - sched_state.scheduled_insn_ids)))
+                pass
             continue
 
         want = kernel.insn_inames(insn) - sched_state.parallel_inames
@@ -1832,7 +1836,7 @@ class MinRecursionLimitForScheduling(MinRecursionLimit):
 
 # {{{ main scheduling entrypoint
 
-def generate_loop_schedules(kernel, program_callables_info, debug_args={}):
+def generate_loop_schedules(kernel, callables_table, debug_args={}):
     """
     .. warning::
 
@@ -1846,18 +1850,18 @@ def generate_loop_schedules(kernel, program_callables_info, debug_args={}):
 
     with MinRecursionLimitForScheduling(kernel):
         for sched in generate_loop_schedules_inner(kernel,
-                program_callables_info, debug_args=debug_args):
+                callables_table, debug_args=debug_args):
             yield sched
 
 
-def generate_loop_schedules_inner(kernel, program_callables_info, debug_args={}):
+def generate_loop_schedules_inner(kernel, callables_table, debug_args={}):
     from loopy.kernel import KernelState
     if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED):
         raise LoopyError("cannot schedule a kernel that has not been "
                 "preprocessed")
 
     from loopy.check import pre_schedule_checks
-    pre_schedule_checks(kernel, program_callables_info)
+    pre_schedule_checks(kernel, callables_table)
 
     schedule_count = 0
 
@@ -1971,7 +1975,7 @@ def generate_loop_schedules_inner(kernel, program_callables_info, debug_args={})
                     kernel, gen_sched)
 
             gsize, lsize = (
-                    kernel.get_grid_size_upper_bounds(program_callables_info))
+                    kernel.get_grid_size_upper_bounds(callables_table))
 
             if (gsize or lsize):
                 if not kernel.options.disable_global_barriers:
@@ -2028,7 +2032,7 @@ schedule_cache = WriteOncePersistentDict(
         key_builder=LoopyKeyBuilder())
 
 
-def _get_one_scheduled_kernel_inner(kernel, program_callables_info):
+def _get_one_scheduled_kernel_inner(kernel, callables_table):
     # This helper function exists to ensure that the generator chain is fully
     # out of scope after the function returns. This allows it to be
     # garbage-collected in the exit handler of the
@@ -2038,10 +2042,10 @@ def _get_one_scheduled_kernel_inner(kernel, program_callables_info):
     #
     # See https://gitlab.tiker.net/inducer/sumpy/issues/31 for context.
 
-    return next(iter(generate_loop_schedules(kernel, program_callables_info)))
+    return next(iter(generate_loop_schedules(kernel, callables_table)))
 
 
-def get_one_scheduled_kernel(kernel, program_callables_info):
+def get_one_scheduled_kernel(kernel, callables_table):
     from loopy import CACHING_ENABLED
 
     sched_cache_key = kernel
@@ -2060,7 +2064,7 @@ def get_one_scheduled_kernel(kernel, program_callables_info):
         with ProcessLogger(logger, "%s: schedule" % kernel.name):
             with MinRecursionLimitForScheduling(kernel):
                 result = _get_one_scheduled_kernel_inner(kernel,
-                        program_callables_info)
+                        callables_table)
 
     if CACHING_ENABLED and not from_cache:
         schedule_cache.store_if_not_present(sched_cache_key, result)
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 08b7f89e91d1997a5583923571fbfa0140bb591d..c621ea727a0c25348370e78ccac08935ee2d18e6 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -32,8 +32,10 @@ from functools import reduce
 from loopy.kernel.data import (
         MultiAssignmentBase, TemporaryVariable, AddressSpace)
 from loopy.diagnostic import warn_with_kernel, LoopyError
-from pytools import Record
+from pytools import Record, memoize_method
 from loopy.kernel.function_interface import ScalarCallable, CallableKernel
+from loopy.kernel import LoopKernel
+from loopy.program import make_program
 
 
 __doc__ = """
@@ -61,12 +63,12 @@ __doc__ = """
 
 
 # FIXME: this is broken for the callable kernel design.
-# Qns:
-# - The variable name, what if multiple kernels use the same name?
+# - The variable name: what if multiple kernels use the same name? (Needs a
+#   different MemAccessInfo.)
 # - We should also add the cumulative effect on the arguments of callee kernels
-# into the caller kernel.
-# FIXME: add an error that there is only one callable kernel. disable for
-# multiple callable kernels.
+# into the caller kernel
+# - Make changes to MemAccessInfo to include the effect of several kernels.
+# - Renovate `count`.
 
 # {{{ GuardedPwQPolynomial
 
@@ -648,11 +650,11 @@ class MemAccess(Record):
 # {{{ counter base
 
 class CounterBase(CombineMapper):
-    def __init__(self, knl, program_callables_info):
+    def __init__(self, knl, callables_table):
         self.knl = knl
-        self.program_callables_info = program_callables_info
+        self.callables_table = callables_table
         from loopy.type_inference import TypeInferenceMapper
-        self.type_inf = TypeInferenceMapper(knl, program_callables_info)
+        self.type_inf = TypeInferenceMapper(knl, callables_table)
 
     def combine(self, values):
         return sum(values)
@@ -707,11 +709,12 @@ class CounterBase(CombineMapper):
 # {{{ ExpressionOpCounter
 
 class ExpressionOpCounter(CounterBase):
-    def __init__(self, knl, program_callables_info):
+    def __init__(self, knl, callables_table, count_within_subscripts=True):
         self.knl = knl
-        self.program_callables_info = program_callables_info
+        self.callables_table = callables_table
+        self.count_within_subscripts = count_within_subscripts
         from loopy.type_inference import TypeInferenceMapper
-        self.type_inf = TypeInferenceMapper(knl, program_callables_info)
+        self.type_inf = TypeInferenceMapper(knl, callables_table)
 
     def combine(self, values):
         return sum(values)
@@ -725,7 +728,7 @@ class ExpressionOpCounter(CounterBase):
     def map_call(self, expr):
         from loopy.symbolic import ResolvedFunction
         if isinstance(expr.function, ResolvedFunction):
-            function_identifier = self.program_callables_info[
+            function_identifier = self.callables_table[
                     expr.function.name].name
         else:
             function_identifier = expr.function.name
@@ -733,18 +736,21 @@ class ExpressionOpCounter(CounterBase):
         return ToCountMap(
                     {Op(dtype=self.type_inf(expr),
                         name='func:'+function_identifier,
-                        count_granularity=CountGranularity.WORKITEM): 1}
+                        count_granularity=CountGranularity.SUBGROUP): 1}
                     ) + self.rec(expr.parameters)
 
     def map_subscript(self, expr):
-        return self.rec(expr.index)
+        if self.count_within_subscripts:
+            return self.rec(expr.index)
+        else:
+            return ToCountMap()
 
     def map_sum(self, expr):
         assert expr.children
         return ToCountMap(
                     {Op(dtype=self.type_inf(expr),
                         name='add',
-                        count_granularity=CountGranularity.WORKITEM):
+                        count_granularity=CountGranularity.SUBGROUP):
                      len(expr.children)-1}
                     ) + sum(self.rec(child) for child in expr.children)
 
@@ -753,18 +759,18 @@ class ExpressionOpCounter(CounterBase):
         assert expr.children
         return sum(ToCountMap({Op(dtype=self.type_inf(expr),
                                   name='mul',
-                                  count_granularity=CountGranularity.WORKITEM): 1})
+                                  count_granularity=CountGranularity.SUBGROUP): 1})
                    + self.rec(child)
                    for child in expr.children
                    if not is_zero(child + 1)) + \
                    ToCountMap({Op(dtype=self.type_inf(expr),
                                   name='mul',
-                                  count_granularity=CountGranularity.WORKITEM): -1})
+                                  count_granularity=CountGranularity.SUBGROUP): -1})
 
     def map_quotient(self, expr, *args):
         return ToCountMap({Op(dtype=self.type_inf(expr),
                               name='div',
-                              count_granularity=CountGranularity.WORKITEM): 1}) \
+                              count_granularity=CountGranularity.SUBGROUP): 1}) \
                                 + self.rec(expr.numerator) \
                                 + self.rec(expr.denominator)
 
@@ -774,14 +780,14 @@ class ExpressionOpCounter(CounterBase):
     def map_power(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
                               name='pow',
-                              count_granularity=CountGranularity.WORKITEM): 1}) \
+                              count_granularity=CountGranularity.SUBGROUP): 1}) \
                                 + self.rec(expr.base) \
                                 + self.rec(expr.exponent)
 
     def map_left_shift(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
                               name='shift',
-                              count_granularity=CountGranularity.WORKITEM): 1}) \
+                              count_granularity=CountGranularity.SUBGROUP): 1}) \
                                 + self.rec(expr.shiftee) \
                                 + self.rec(expr.shift)
 
@@ -790,13 +796,13 @@ class ExpressionOpCounter(CounterBase):
     def map_bitwise_not(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
                               name='bw',
-                              count_granularity=CountGranularity.WORKITEM): 1}) \
+                              count_granularity=CountGranularity.SUBGROUP): 1}) \
                                 + self.rec(expr.child)
 
     def map_bitwise_or(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
                               name='bw',
-                              count_granularity=CountGranularity.WORKITEM):
+                              count_granularity=CountGranularity.SUBGROUP):
                            len(expr.children)-1}) \
                                 + sum(self.rec(child) for child in expr.children)
 
@@ -820,7 +826,7 @@ class ExpressionOpCounter(CounterBase):
     def map_min(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
                               name='maxmin',
-                              count_granularity=CountGranularity.WORKITEM):
+                              count_granularity=CountGranularity.SUBGROUP):
                            len(expr.children)-1}) \
                + sum(self.rec(child) for child in expr.children)
 
@@ -936,7 +942,7 @@ class LocalMemAccessCounter(MemAccessCounter):
                     sub_map[MemAccess(
                                 mtype='local',
                                 dtype=dtype,
-                                count_granularity=CountGranularity.WORKITEM)
+                                count_granularity=CountGranularity.SUBGROUP)
                             ] = 1
                     return sub_map
 
@@ -956,7 +962,7 @@ class LocalMemAccessCounter(MemAccessCounter):
                         lid_strides=dict(sorted(six.iteritems(lid_strides))),
                         gid_strides=dict(sorted(six.iteritems(gid_strides))),
                         variable=name,
-                        count_granularity=CountGranularity.WORKITEM)] = 1
+                        count_granularity=CountGranularity.SUBGROUP)] = 1
 
         return sub_map
 
@@ -1111,7 +1117,7 @@ def count(kernel, set, space=None):
     from loopy.program import Program
     if isinstance(kernel, Program):
         if len([in_knl_callable for in_knl_callable in
-            kernel.program_callables_info.values() if isinstance(in_knl_callable,
+            kernel.callables_table.values() if isinstance(in_knl_callable,
                 CallableKernel)]) != 1:
             raise NotImplementedError("Currently only supported for program with "
                 "only one CallableKernel.")
@@ -1216,10 +1222,10 @@ def count(kernel, set, space=None):
     return add_assumptions_guard(kernel, count)
 
 
-def get_unused_hw_axes_factor(knl, program_callables_info, insn,
+def get_unused_hw_axes_factor(knl, callables_table, insn,
         disregard_local_axes, space=None):
     # FIXME: Multi-kernel support
-    gsize, lsize = knl.get_grid_size_upper_bounds(program_callables_info)
+    gsize, lsize = knl.get_grid_size_upper_bounds(callables_table)
 
     g_used = set()
     l_used = set()
@@ -1257,7 +1263,7 @@ def get_unused_hw_axes_factor(knl, program_callables_info, insn,
     return add_assumptions_guard(knl, result)
 
 
-def count_insn_runs(knl, program_callables_info, insn, count_redundant_work,
+def count_insn_runs(knl, callables_table, insn, count_redundant_work,
         disregard_local_axes=False):
 
     insn_inames = knl.insn_inames(insn)
@@ -1278,39 +1284,100 @@ def count_insn_runs(knl, program_callables_info, insn, count_redundant_work,
     c = count(knl, domain, space=space)
 
     if count_redundant_work:
-        unused_fac = get_unused_hw_axes_factor(knl, program_callables_info,
+        unused_fac = get_unused_hw_axes_factor(knl, callables_table,
                 insn, disregard_local_axes=disregard_local_axes, space=space)
         return c * unused_fac
     else:
         return c
 
+
+@memoize_method
+def _get_insn_count(knl, callables_table, insn_id, subgroup_size,
+        count_redundant_work, count_granularity=CountGranularity.WORKITEM):
+    """Return how often *insn_id* of *knl* runs at *count_granularity*;
+    *None* is treated as work-item granularity (with a warning)."""
+    insn = knl.id_to_insn[insn_id]
+
+    if count_granularity is None:
+        warn_with_kernel(knl, "get_insn_count_assumes_granularity",
+                         "get_insn_count: No count granularity passed, "
+                         "assuming %s granularity."
+                         % (CountGranularity.WORKITEM))
+        count_granularity = CountGranularity.WORKITEM
+
+    if count_granularity == CountGranularity.WORKITEM:
+        return count_insn_runs(
+            knl, callables_table, insn, disregard_local_axes=False,
+            count_redundant_work=count_redundant_work)
+
+    ct_disregard_local = count_insn_runs(
+            knl, callables_table, insn, disregard_local_axes=True,
+            count_redundant_work=count_redundant_work)
+
+    if count_granularity == CountGranularity.WORKGROUP:
+        return ct_disregard_local
+    elif count_granularity == CountGranularity.SUBGROUP:
+        # work-group size = product of the (integer) local size upper bounds
+        from loopy.symbolic import aff_to_expr
+        _, local_size = knl.get_grid_size_upper_bounds(callables_table)
+        workgroup_size = 1
+        for size in local_size or ():
+            s = aff_to_expr(size)
+            if not isinstance(s, int):
+                raise LoopyError("Cannot count insn with %s granularity, "
+                                 "work-group size is not integer: %s"
+                                 % (CountGranularity.SUBGROUP, local_size))
+            workgroup_size *= s
+
+        warn_with_kernel(knl, "insn_count_subgroups_upper_bound",
+                "get_insn_count: when counting instruction %s with "
+                "count_granularity=%s, using upper bound for work-group size "
+                "(%d work-items) to compute sub-groups per work-group. When "
+                "multiple device programs present, actual sub-group count may be "
+                "lower." % (insn_id, CountGranularity.SUBGROUP, workgroup_size))
+
+        from pytools import div_ceil
+        return ct_disregard_local*div_ceil(workgroup_size, subgroup_size)
+    else:
+        # this should not happen since this is enforced in Op/MemAccess
+        raise ValueError("get_insn_count: count_granularity '%s' is "
+                "not allowed. count_granularity options: %s"
+                % (count_granularity, CountGranularity.ALL+[None]))
+
 # }}}
 
 
 # {{{ get_op_map
 
-
-def get_op_map_for_single_kernel(knl, program_callables_info,
+def get_op_map_for_single_kernel(knl, callables_table,
         numpy_types=True, count_redundant_work=False,
-               subgroup_size=None):
+        count_within_subscripts=True, subgroup_size=None):
 
     if not knl.options.ignore_boostable_into:
         raise LoopyError("Kernel '%s': Using operation counting requires the option "
                 "ignore_boostable_into to be set." % knl.name)
 
+    subgroup_size = _process_subgroup_size(knl, subgroup_size)
+
+    op_map = ToCountMap()
+    op_counter = ExpressionOpCounter(knl, callables_table,
+            count_within_subscripts)
+
     from loopy.kernel.instruction import (
             CallInstruction, CInstruction, Assignment,
             NoOpInstruction, BarrierInstruction)
 
-    op_map = ToCountMap()
-    op_counter = ExpressionOpCounter(knl,
-            program_callables_info=program_callables_info)
     for insn in knl.instructions:
         if isinstance(insn, (CallInstruction, CInstruction, Assignment)):
             ops = op_counter(insn.assignee) + op_counter(insn.expression)
-            op_map = op_map + ops*count_insn_runs(
-                    knl, program_callables_info, insn,
-                    count_redundant_work=count_redundant_work)
+            for key, val in six.iteritems(ops.count_map):
+                op_map = (
+                        op_map
+                        + ToCountMap({key: val})
+                        * _get_insn_count(knl, callables_table, insn.id,
+                            subgroup_size, count_redundant_work,
+                            key.count_granularity))
+
         elif isinstance(insn, (NoOpInstruction, BarrierInstruction)):
             pass
         else:
@@ -1333,7 +1400,7 @@ def get_op_map_for_single_kernel(knl, program_callables_info,
 
 
 def get_op_map(program, numpy_types=True, count_redundant_work=False,
-               subgroup_size=None):
+               count_within_subscripts=True, subgroup_size=None):
 
     """Count the number of operations in a loopy kernel.
 
@@ -1349,6 +1416,9 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False,
         (Likely desirable for performance modeling, but undesirable for code
         optimization.)
 
+    :arg count_within_subscripts: A :class:`bool` specifying whether to
+        count operations inside array indices.
+
     :arg subgroup_size: (currently unused) An :class:`int`, :class:`str`
         ``'guess'``, or *None* that specifies the sub-group size. An OpenCL
         sub-group is an implementation-dependent grouping of work-items within
@@ -1390,23 +1460,26 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False,
 
     """
 
+    if isinstance(program, LoopKernel):
+        program = make_program(program)
+
     from loopy.preprocess import preprocess_program, infer_unknown_types
     program = infer_unknown_types(program, expect_completion=True)
     program = preprocess_program(program)
 
     op_map = ToCountMap()
 
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    callables_count = (
+                program.callables_table.callables_count)
+
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
-            num_times_called = (
-                    program.program_callables_info.num_times_callables_called[
-                        func_id])
             knl = in_knl_callable.subkernel
             knl_op_map = get_op_map_for_single_kernel(knl,
-                        program.program_callables_info, numpy_types,
-                        count_redundant_work, subgroup_size)
+                    program.callables_table, numpy_types, count_redundant_work,
+                    count_within_subscripts, subgroup_size)
 
-            for i in range(num_times_called):
+            for i in range(callables_count[func_id]):
                 op_map += knl_op_map
         elif isinstance(in_knl_callable, ScalarCallable):
             pass
@@ -1433,21 +1506,16 @@ def _find_subgroup_size_for_knl(knl):
         return None
 
 
-# {{{ get_mem_access_map
-
-
-def get_access_map_for_single_kernel(knl, program_callables_info,
-        numpy_types=True, count_redundant_work=False, subgroup_size=None):
+@memoize_method
+def _process_subgroup_size(knl, subgroup_size_requested):
 
-    if not knl.options.ignore_boostable_into:
-        raise LoopyError("Kernel '%s': Using operation counting requires the option "
-                "ignore_boostable_into to be set." % knl.name)
-
-    if not isinstance(subgroup_size, int):
+    if isinstance(subgroup_size_requested, int):
+        return subgroup_size_requested
+    else:
         # try to find subgroup_size
         subgroup_size_guess = _find_subgroup_size_for_knl(knl)
 
-        if subgroup_size is None:
+        if subgroup_size_requested is None:
             if subgroup_size_guess is None:
                 # 'guess' was not passed and either no target device found
                 # or get_simd_group_size returned None
@@ -1457,84 +1525,39 @@ def get_access_map_for_single_kernel(knl, program_callables_info,
                                  "and kernel.target.device is set, or (3) pass "
                                  "subgroup_size='guess' and hope for the best.")
             else:
-                subgroup_size = subgroup_size_guess
+                return subgroup_size_guess
 
-        elif subgroup_size == 'guess':
+        elif subgroup_size_requested == 'guess':
             if subgroup_size_guess is None:
                 # unable to get subgroup_size from device, so guess
-                subgroup_size = 32
-                warn_with_kernel(knl, "get_mem_access_map_guessing_subgroup_size",
-                                 "get_mem_access_map: 'guess' sub-group size "
-                                 "passed, no target device found, wildly guessing "
-                                 "that sub-group size is %d." % (subgroup_size))
+                subgroup_size_guess = 32
+                warn_with_kernel(knl, "get_x_map_guessing_subgroup_size",
+                                 "'guess' sub-group size passed, no target device "
+                                 "found, wildly guessing that sub-group size is %d."
+                                 % (subgroup_size_guess))
+                return subgroup_size_guess
             else:
-                subgroup_size = subgroup_size_guess
+                return subgroup_size_guess
         else:
             raise ValueError("Invalid value for subgroup_size: %s. subgroup_size "
                              "must be integer, 'guess', or, if you're feeling "
-                             "lucky, None." % (subgroup_size))
+                             "lucky, None." % (subgroup_size_requested))
 
-    class CacheHolder(object):
-        pass
 
-    cache_holder = CacheHolder()
-    from pytools import memoize_in
-
-    @memoize_in(cache_holder, "insn_count")
-    def get_insn_count(knl, insn_id, count_granularity=CountGranularity.WORKITEM):
-        insn = knl.id_to_insn[insn_id]
-
-        if count_granularity is None:
-            warn_with_kernel(knl, "get_insn_count_assumes_granularity",
-                             "get_insn_count: No count granularity passed for "
-                             "MemAccess, assuming %s granularity."
-                             % (CountGranularity.WORKITEM))
-            count_granularity == CountGranularity.WORKITEM
-
-        if count_granularity == CountGranularity.WORKITEM:
-            return count_insn_runs(
-                knl, program_callables_info, insn,
-                count_redundant_work=count_redundant_work,
-                disregard_local_axes=False)
-
-        ct_disregard_local = count_insn_runs(
-                knl, program_callables_info, insn, disregard_local_axes=True,
-                count_redundant_work=count_redundant_work)
-
-        if count_granularity == CountGranularity.WORKGROUP:
-            return ct_disregard_local
-        elif count_granularity == CountGranularity.SUBGROUP:
-            # get the group size
-            from loopy.symbolic import aff_to_expr
-            _, local_size = knl.get_grid_size_upper_bounds(program_callables_info)
-            workgroup_size = 1
-            if local_size:
-                for size in local_size:
-                    s = aff_to_expr(size)
-                    if not isinstance(s, int):
-                        raise LoopyError("Cannot count insn with %s granularity, "
-                                         "work-group size is not integer: %s"
-                                         % (CountGranularity.SUBGROUP, local_size))
-                    workgroup_size *= s
-
-            warn_with_kernel(knl, "insn_count_subgroups_upper_bound",
-                    "get_insn_count: when counting instruction %s with "
-                    "count_granularity=%s, using upper bound for work-group size "
-                    "(%d work-items) to compute sub-groups per work-group. When "
-                    "multiple device programs present, actual sub-group count may be"
-                    "lower." % (insn_id, CountGranularity.SUBGROUP, workgroup_size))
-
-            from pytools import div_ceil
-            return ct_disregard_local*div_ceil(workgroup_size, subgroup_size)
-        else:
-            # this should not happen since this is enforced in MemAccess
-            raise ValueError("get_insn_count: count_granularity '%s' is"
-                    "not allowed. count_granularity options: %s"
-                    % (count_granularity, CountGranularity.ALL+[None]))
+# {{{ get_mem_access_map
+
+def get_mem_access_map_for_single_kernel(knl, callables_table,
+        numpy_types=True, count_redundant_work=False, subgroup_size=None):
+
+    if not knl.options.ignore_boostable_into:
+        raise LoopyError("Kernel '%s': Using operation counting requires the option "
+                "ignore_boostable_into to be set." % knl.name)
+
+    subgroup_size = _process_subgroup_size(knl, subgroup_size)
 
     access_map = ToCountMap()
-    access_counter_g = GlobalMemAccessCounter(knl, program_callables_info)
-    access_counter_l = LocalMemAccessCounter(knl, program_callables_info)
+    access_counter_g = GlobalMemAccessCounter(knl, callables_table)
+    access_counter_l = LocalMemAccessCounter(knl, callables_table)
 
     from loopy.kernel.instruction import (
             CallInstruction, CInstruction, Assignment,
@@ -1557,14 +1580,18 @@ def get_access_map_for_single_kernel(knl, program_callables_info,
                 access_map = (
                         access_map
                         + ToCountMap({key: val})
-                        * get_insn_count(knl, insn.id, key.count_granularity))
+                        * _get_insn_count(knl, callables_table, insn.id,
+                            subgroup_size, count_redundant_work,
+                            key.count_granularity))
 
             for key, val in six.iteritems(access_assignee.count_map):
 
                 access_map = (
                         access_map
                         + ToCountMap({key: val})
-                        * get_insn_count(knl, insn.id, key.count_granularity))
+                        * _get_insn_count(knl, callables_table, insn.id,
+                            subgroup_size, count_redundant_work,
+                            key.count_granularity))
 
         elif isinstance(insn, (NoOpInstruction, BarrierInstruction)):
             pass
@@ -1684,18 +1711,17 @@ def get_mem_access_map(program, numpy_types=True, count_redundant_work=False,
 
     access_map = ToCountMap()
 
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    callables_count = program.callables_table.callables_count
+
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
-            num_times_called = (
-                    program.program_callables_info.num_times_callables_called[
-                        func_id])
             knl = in_knl_callable.subkernel
-            knl_access_map = get_access_map_for_single_kernel(knl,
-                        program.program_callables_info, numpy_types,
+            knl_access_map = get_mem_access_map_for_single_kernel(knl,
+                        program.callables_table, numpy_types,
                         count_redundant_work, subgroup_size)
 
             # FIXME: didn't see any easy way to multiply
-            for i in range(num_times_called):
+            for i in range(callables_count[func_id]):
                 access_map += knl_access_map
         elif isinstance(in_knl_callable, ScalarCallable):
             pass
@@ -1705,13 +1731,12 @@ def get_mem_access_map(program, numpy_types=True, count_redundant_work=False,
 
     return access_map
 
-
 # }}}
 
 
 # {{{ get_synchronization_map
 
-def get_synchronization_map_for_single_kernel(knl, program_callables_info,
+def get_synchronization_map_for_single_kernel(knl, callables_table,
         subgroup_size=None):
 
     """Count the number of synchronization events each work-item encounters in
@@ -1757,7 +1782,7 @@ def get_synchronization_map_for_single_kernel(knl, program_callables_info,
     from loopy.schedule import (EnterLoop, LeaveLoop, Barrier,
             CallKernel, ReturnFromKernel, RunInstruction)
     from operator import mul
-    knl = lp.get_one_scheduled_kernel(knl, program_callables_info)
+    knl = lp.get_one_scheduled_kernel(knl, callables_table)
     iname_list = []
 
     result = ToCountMap()
@@ -1809,18 +1834,16 @@ def get_synchronization_map(program, subgroup_size=None):
     program = preprocess_program(program)
 
     sync_map = ToCountMap()
+    callables_count = program.callables_table.callables_count
 
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
-            num_times_called = (
-                    program.program_callables_info.num_times_callables_called[
-                        func_id])
             knl = in_knl_callable.subkernel
             knl_sync_map = get_synchronization_map_for_single_kernel(knl,
-                    program.program_callables_info, subgroup_size)
+                    program.callables_table, subgroup_size)
 
             # FIXME: didn't see any easy way to multiply
-            for i in range(num_times_called):
+            for i in range(callables_count[func_id]):
                 sync_map += knl_sync_map
         elif isinstance(in_knl_callable, ScalarCallable):
             pass
@@ -1874,7 +1897,7 @@ def gather_access_footprints_for_single_kernel(kernel, ignore_uncountable=False)
 def gather_access_footprints(program, ignore_uncountable=False):
     # FIMXE: works only for one callable kernel till now.
     if len([in_knl_callable for in_knl_callable in
-        program.program_callables_info.values() if isinstance(in_knl_callable,
+        program.callables_table.values() if isinstance(in_knl_callable,
             CallableKernel)]) != 1:
         raise NotImplementedError("Currently only supported for program with "
             "only one CallableKernel.")
@@ -1887,18 +1910,17 @@ def gather_access_footprints(program, ignore_uncountable=False):
     write_footprints = []
     read_footprints = []
 
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    callables_count = program.callables_table.callables_count
+
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
-            num_times_called = (
-                    program.program_callables_info.num_times_callables_called[
-                        func_id])
             knl = in_knl_callable.subkernel
             knl_write_footprints, knl_read_footprints = (
                     gather_access_footprints_for_single_kernel(knl,
                         ignore_uncountable))
 
             # FIXME: didn't see any easy way to multiply
-            for i in range(num_times_called):
+            for i in range(callables_count[func_id]):
                 write_footprints.extend(knl_write_footprints)
                 read_footprints.extend(knl_read_footprints)
 
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 54dd61966050de1149bff41f8845531b6c3687f8..5721c58effa6addd7cb79d1f373a9b27562cbd16 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -706,16 +706,16 @@ class ResolvedFunction(p.Expression):
     def __init__(self, function):
         if isinstance(function, str):
             function = p.Variable(function)
-        from loopy.library.reduction import ArgExtOp, SegmentedOp
-        assert isinstance(function, (p.Variable, ArgExtOp, SegmentedOp))
+        from loopy.library.reduction import ReductionOpFunction
+        assert isinstance(function, (p.Variable, ReductionOpFunction))
         self.function = function
 
     @property
     def name(self):
-        from loopy.library.reduction import ArgExtOp, SegmentedOp
+        from loopy.library.reduction import ReductionOpFunction
         if isinstance(self.function, p.Variable):
             return self.function.name
-        elif isinstance(self.function, (ArgExtOp, SegmentedOp)):
+        elif isinstance(self.function, ReductionOpFunction):
             return self.function
         else:
             raise LoopyError("Unexpected function type %s in ResolvedFunction." %
@@ -1902,7 +1902,7 @@ def get_access_range(domain, subscript, assumptions, shape=None,
             if shape is not None:
                 try:
                     shape_aff = guarded_aff_from_expr(access_map.space, shape[idim])
-                except ExpressionToAffineConversionError as sub_err:
+                except ExpressionToAffineConversionError:
                     pass
 
             if shape_aff is None:
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index e3b4853c3878106896a736d9f49a23665b5ff81b..f27ee4e96f11f686250bddf57ec87422c717373e 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -80,7 +80,7 @@ class TargetBase(object):
     def preprocess(self, kernel):
         return kernel
 
-    def pre_codegen_check(self, kernel, program_callables_info):
+    def pre_codegen_check(self, kernel, callables_table):
         pass
 
     # }}}
@@ -150,7 +150,7 @@ class ASTBuilderBase(object):
 
     # {{{ library
 
-    def function_scopers(self):
+    def function_id_in_knl_callable_mapper(self):
         """
         Returns an instance of list of the functions of signature
         ``(target, identifiers)`` returning either an instance of
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 52437ef8024fbd08261522362494924cd0e23d1b..58051e42f8898d374ded6b1e7dcd70eb3989f100 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -364,7 +364,7 @@ class CMathCallable(ScalarCallable):
     C-Target.
     """
 
-    def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, caller_kernel, callables_table):
         name = self.name
 
         if name in ["abs", "min", "max"]:
@@ -383,7 +383,7 @@ class CMathCallable(ScalarCallable):
                 # callable
                 return (
                         self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                        program_callables_info)
+                        callables_table)
 
             dtype = arg_id_to_dtype[0]
             dtype = dtype.numpy_dtype
@@ -411,7 +411,7 @@ class CMathCallable(ScalarCallable):
                     self.copy(name_in_target=name,
                         arg_id_to_dtype={0: NumpyType(dtype), -1:
                             NumpyType(dtype)}),
-                    program_callables_info)
+                    callables_table)
 
         # binary functions
         if name in ["fmax", "fmin", "pow", "atan2"]:
@@ -426,7 +426,7 @@ class CMathCallable(ScalarCallable):
                 # callable
                 return (
                         self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                        program_callables_info)
+                        callables_table)
 
             dtype = np.find_common_type(
                 [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items()
@@ -435,7 +435,7 @@ class CMathCallable(ScalarCallable):
             if dtype.kind == "c":
                 raise LoopyTypeError("%s does not support complex numbers")
 
-            elif dtype.kind == "f" and name in ["fmax", "fmin"]:
+            elif dtype.kind == "f":
                 from loopy.target.opencl import OpenCLTarget
                 if not isinstance(caller_kernel.target, OpenCLTarget):
                     if dtype == np.float64:
@@ -451,11 +451,11 @@ class CMathCallable(ScalarCallable):
             return (
                     self.copy(name_in_target=name,
                         arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}),
-                    program_callables_info)
+                    callables_table)
 
         return (
                 self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                program_callables_info)
+                callables_table)
 
 
 def scope_c_math_functions(target, identifier):
@@ -488,9 +488,9 @@ class CASTBuilder(ASTBuilderBase):
                     _preamble_generator,
                     ])
 
-    def function_scopers(self):
+    def function_id_in_knl_callable_mapper(self):
         return (
-                super(CASTBuilder, self).function_scopers() + [
+                super(CASTBuilder, self).function_id_in_knl_callable_mapper() + [
                     scope_c_math_functions])
 
     # }}}
@@ -897,7 +897,7 @@ class CASTBuilder(ASTBuilderBase):
 
         ecm = codegen_state.expression_to_code_mapper
         func_id = insn.expression.function.name
-        in_knl_callable = codegen_state.program_callables_info[func_id]
+        in_knl_callable = codegen_state.callables_table[func_id]
 
         if isinstance(in_knl_callable, ScalarCallable) and (
                 in_knl_callable.name_in_target == 'loopy_make_tuple'):
diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py
index b3c304d585f7282f5d8c6446e9503fecdb4b5a7e..d443739ac455d6a6d3f203286b84c2fe618e20ab 100644
--- a/loopy/target/c/c_execution.py
+++ b/loopy/target/c/c_execution.py
@@ -403,7 +403,7 @@ class CKernelExecutor(KernelExecutorBase):
         if self.program.root_kernel.options.write_cl:
             output = all_code
             if self.program.root_kernel.options.highlight_cl:
-                output = get_highlighted_code(code=output)
+                output = get_highlighted_code(output)
 
             if self.program.root_kernel.options.write_cl is True:
                 print(output)
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 2908c4efa4638ec8e35db781eb1c07997a795ace..97b6b43bbfc5b68ffa5bbb4af706d2f2caaf8121 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -55,7 +55,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
 
         if type_inf_mapper is None:
             type_inf_mapper = TypeInferenceMapper(self.kernel,
-                    self.codegen_state.program_callables_info)
+                    self.codegen_state.callables_table)
         self.type_inf_mapper = type_inf_mapper
 
         self.allow_complex = codegen_state.allow_complex
@@ -393,7 +393,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
         # {{{ implement indexof, indexof_vec
 
         identifier_name = (
-                self.codegen_state.program_callables_info[expr.function.name].name)
+                self.codegen_state.callables_table[expr.function.name].name)
         if identifier_name in ["indexof", "indexof_vec"]:
             if len(expr.parameters) != 1:
                 raise LoopyError("%s takes exactly one argument" % identifier_name)
@@ -436,11 +436,11 @@ class ExpressionToCExpressionMapper(IdentityMapper):
         # }}}
 
         from loopy.kernel.function_interface import ManglerCallable
-        if isinstance(self.codegen_state.program_callables_info[expr.function.name],
+        if isinstance(self.codegen_state.callables_table[expr.function.name],
                 ManglerCallable):
             from loopy.codegen import SeenFunction
             in_knl_callable = (
-                    self.codegen_state.program_callables_info[
+                    self.codegen_state.callables_table[
                         expr.function.name])
             mangle_result = in_knl_callable.mangle_result(self.kernel)
             self.codegen_state.seen_functions.add(
@@ -449,7 +449,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                         mangle_result.arg_dtypes))
 
         return (
-                self.codegen_state.program_callables_info[
+                self.codegen_state.callables_table[
                     expr.function.name].emit_call(
                         expression_to_code_mapper=self,
                     expression=expr,
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 89cbfd034e6dfdddebd8811f3e2620c90cc285ad..6b4385bff2a580979eb1c551606760ee00b5e49a 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -123,7 +123,7 @@ _CUDA_SPECIFIC_FUNCTIONS = {
 class CudaCallable(ScalarCallable):
 
     def cuda_with_types(self, arg_id_to_dtype, caller_kernel,
-            program_callables_info):
+            callables_table):
 
         name = self.name
 
@@ -138,7 +138,7 @@ class CudaCallable(ScalarCallable):
                 # callable
                 return (
                         self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                        program_callables_info)
+                        callables_table)
 
             dtype = arg_id_to_dtype[0]
             scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["x"]
@@ -146,7 +146,7 @@ class CudaCallable(ScalarCallable):
                     self.copy(name_in_target=name, arg_id_to_dtype={-1:
                         NumpyType(scalar_dtype),
                         0: dtype, 1: dtype}),
-                    program_callables_info)
+                    callables_table)
 
         if name in _CUDA_SPECIFIC_FUNCTIONS:
             num_args = _CUDA_SPECIFIC_FUNCTIONS[name]
@@ -161,7 +161,7 @@ class CudaCallable(ScalarCallable):
                     # callable
                     return (
                             self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                            program_callables_info)
+                            callables_table)
 
             dtype = np.find_common_type(
                     [], [dtype.numpy_dtype for id, dtype in
@@ -177,11 +177,11 @@ class CudaCallable(ScalarCallable):
             return (
                     self.copy(name_in_target=name,
                         arg_id_to_dtype=updated_arg_id_to_dtype),
-                    program_callables_info)
+                    callables_table)
 
         return (
                 self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                program_callables_info)
+                callables_table)
 
 
 def scope_cuda_functions(target, identifier):
@@ -274,9 +274,9 @@ class CUDACASTBuilder(CASTBuilder):
 
     # {{{ library
 
-    def function_scopers(self):
+    def function_id_in_knl_callable_mapper(self):
         return [scope_cuda_functions] + (
-                super(CUDACASTBuilder, self).function_scopers())
+                super(CUDACASTBuilder, self).function_id_in_knl_callable_mapper())
 
     # }}}
 
@@ -303,7 +303,7 @@ class CUDACASTBuilder(CASTBuilder):
                 codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
                         get_insn_ids_for_block_at(
                             codegen_state.kernel.schedule, schedule_index),
-                        codegen_state.program_callables_info)
+                        codegen_state.callables_table)
 
         from loopy.symbolic import get_dependencies
         if not get_dependencies(local_grid_size):
@@ -344,7 +344,7 @@ class CUDACASTBuilder(CASTBuilder):
     _VEC_AXES = "xyzw"
 
     def add_vector_access(self, access_expr, index):
-        return access_expr.a(self._VEC_AXES[index])
+        return access_expr.attr(self._VEC_AXES[index])
 
     def emit_barrier(self, synchronization_kind, mem_kind, comment):
         """
diff --git a/loopy/target/execution.py b/loopy/target/execution.py
index 43963ddb2f47fb80475b7060b7929cf21d34142e..c067bc4b901ef3922236ac3dc1a8245aaf28c8aa 100644
--- a/loopy/target/execution.py
+++ b/loopy/target/execution.py
@@ -763,7 +763,7 @@ class KernelExecutorBase(object):
             from loopy.schedule import get_one_scheduled_kernel
             program = program.with_root_kernel(
                     get_one_scheduled_kernel(program.root_kernel,
-                        program.program_callables_info))
+                        program.callables_table))
 
         return program
 
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 5396318338e4b36148d4397390a33d024283150d..94a81a65a15bcc0da7530b51d5943d4b94bf74ac 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -172,9 +172,9 @@ class ISPCTarget(CTarget):
     host_program_name_suffix = ""
     device_program_name_suffix = "_inner"
 
-    def pre_codegen_check(self, kernel, program_callables_info):
+    def pre_codegen_check(self, kernel, callables_table):
         gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs(
-                program_callables_info)
+                callables_table)
         if len(lsize) > 1:
             for i, ls_i in enumerate(lsize[1:]):
                 if ls_i != 1:
@@ -445,9 +445,9 @@ class ISPCASTBuilder(CASTBuilder):
                 else:
                     for dep in get_dependencies(term):
                         if filter_iname_tags_by_type(
-                                kernel.iname_to_tags[dep], LocalIndexTag):
+                                kernel.iname_to_tags.get(dep, []), LocalIndexTag):
                             tag, = filter_iname_tags_by_type(
-                                kernel.iname_to_tags[dep], LocalIndexTag, 1)
+                                kernel.iname_to_tags.get(dep, []), LocalIndexTag, 1)
                             if tag.axis == 0:
                                 raise LoopyError(
                                     "streaming store must have stride 1 in "
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 44f782a726d7daa1b9371e50e2e9cbefc62eb7d9..36e59c3c8cffc6b8031cdacf5de82b2eeb1e96ec 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -172,7 +172,7 @@ class OpenCLCallable(ScalarCallable):
     :class:`loopy.target.c.CMathCallable`.
     """
 
-    def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, caller_kernel, callables_table):
         name = self.name
 
         if name in ["max", "min"]:
@@ -182,8 +182,7 @@ class OpenCLCallable(ScalarCallable):
             if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype:
                 return (
                         self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                        program_callables_info)
-
+                        callables_table)
             dtype = np.find_common_type(
                     [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items()
                         if (id >= 0 and dtype is not None)])
@@ -195,7 +194,7 @@ class OpenCLCallable(ScalarCallable):
                 return (
                         self.copy(name_in_target=name,
                             arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}),
-                        program_callables_info)
+                        callables_table)
             else:
                 # Unsupported type.
                 raise LoopyError("%s function not supported for the types %s" %
@@ -212,14 +211,14 @@ class OpenCLCallable(ScalarCallable):
                 # callable
                 return (
                         self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                        program_callables_info)
+                        callables_table)
 
             dtype = arg_id_to_dtype[0]
             scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"]
             return (
                     self.copy(name_in_target=name, arg_id_to_dtype={-1:
                         NumpyType(scalar_dtype), 0: dtype, 1: dtype}),
-                    program_callables_info)
+                    callables_table)
 
         if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS:
             num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name]
@@ -234,7 +233,7 @@ class OpenCLCallable(ScalarCallable):
                     # callable
                     return (
                             self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                            program_callables_info)
+                            callables_table)
 
             dtype = np.find_common_type(
                     [], [dtype.numpy_dtype for id, dtype in
@@ -250,7 +249,7 @@ class OpenCLCallable(ScalarCallable):
             return (
                     self.copy(name_in_target=name,
                         arg_id_to_dtype=updated_arg_id_to_dtype),
-                    program_callables_info)
+                    callables_table)
 
         if name in VECTOR_LITERAL_FUNCS:
             base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name]
@@ -266,7 +265,7 @@ class OpenCLCallable(ScalarCallable):
                     # callable
                     return (
                             self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                            program_callables_info)
+                            callables_table)
 
             updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in
                     range(count))
@@ -276,13 +275,13 @@ class OpenCLCallable(ScalarCallable):
             return (
                     self.copy(name_in_target="(%s%d) " % (base_tp_name, count),
                         arg_id_to_dtype=updated_arg_id_to_dtype),
-                    program_callables_info)
+                    callables_table)
 
         # does not satisfy any of the conditions needed for specialization.
         # hence just returning a copy of the callable.
         return (
                 self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                program_callables_info)
+                callables_table)
 
 
 def scope_opencl_functions(target, identifier):
@@ -442,10 +441,10 @@ class OpenCLTarget(CTarget):
 class OpenCLCASTBuilder(CASTBuilder):
     # {{{ library
 
-    def function_scopers(self):
+    def function_id_in_knl_callable_mapper(self):
         return (
                 [scope_opencl_functions] + super(
-                    OpenCLCASTBuilder, self).function_scopers())
+                    OpenCLCASTBuilder, self).function_id_in_knl_callable_mapper())
 
     def symbol_manglers(self):
         return (
@@ -484,7 +483,7 @@ class OpenCLCASTBuilder(CASTBuilder):
         _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
                 get_insn_ids_for_block_at(
                     codegen_state.kernel.schedule, schedule_index),
-                codegen_state.program_callables_info)
+                codegen_state.callables_table)
 
         from loopy.symbolic import get_dependencies
         if not get_dependencies(local_sizes):
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 03ba26930b677dc8f407b5ec73fa02722ebffec8..5ef56457223f8a89de4c37aed2f5a3e0ea5aa87b 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -125,7 +125,8 @@ def adjust_local_temp_var_storage(kernel, device):
 
             new_storage_shape = storage_shape
 
-        new_temp_vars[temp_var.name] = temp_var.copy(storage_shape=new_storage_shape)
+        new_temp_vars[temp_var.name] = temp_var.copy(
+                storage_shape=tuple(new_storage_shape))
 
     return kernel.copy(temporary_variables=new_temp_vars)
 
@@ -134,7 +135,7 @@ def adjust_local_temp_var_storage(kernel, device):
 
 # {{{ check sizes against device properties
 
-def check_sizes(kernel, program_callables_info, device):
+def check_sizes(kernel, callables_table, device):
     import loopy as lp
 
     from loopy.diagnostic import LoopyAdvisory, LoopyError
@@ -152,7 +153,7 @@ def check_sizes(kernel, program_callables_info, device):
             parameters[arg.name] = arg.approximately
 
     glens, llens = (
-            kernel.get_grid_size_upper_bounds_as_exprs(program_callables_info))
+            kernel.get_grid_size_upper_bounds_as_exprs(callables_table))
 
     if (max(len(glens), len(llens))
             > device.max_work_item_dimensions):
@@ -207,7 +208,7 @@ class PyOpenCLCallable(ScalarCallable):
     Records information about the callables which are not covered by
     :class:`loopy.target.opencl.OpenCLCallable`
     """
-    def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, caller_kernel, callables_table):
 
         name = self.name
 
@@ -221,7 +222,7 @@ class PyOpenCLCallable(ScalarCallable):
             # callable
             return (
                     self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                    program_callables_info)
+                    callables_table)
 
         dtype = arg_id_to_dtype[0]
 
@@ -238,7 +239,7 @@ class PyOpenCLCallable(ScalarCallable):
                         self.copy(name_in_target="%s_%s" % (tpname, name),
                             arg_id_to_dtype={0: dtype, -1: NumpyType(
                                 np.dtype(dtype.numpy_dtype.type(0).real))}),
-                        program_callables_info)
+                        callables_table)
 
         if name in ["sqrt", "exp", "log",
                 "sin", "cos", "tan",
@@ -256,7 +257,7 @@ class PyOpenCLCallable(ScalarCallable):
                 return (
                         self.copy(name_in_target="%s_%s" % (tpname, name),
                             arg_id_to_dtype={0: dtype, -1: dtype}),
-                        program_callables_info)
+                        callables_table)
             else:
                 # function calls for floating parameters.
                 numpy_dtype = dtype.numpy_dtype
@@ -267,14 +268,14 @@ class PyOpenCLCallable(ScalarCallable):
                 return (
                         self.copy(name_in_target=name,
                             arg_id_to_dtype={0: dtype, -1: dtype}),
-                        program_callables_info)
+                        callables_table)
 
         return (
                 self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                program_callables_info)
+                callables_table)
 
 
-def pyopencl_function_scoper(target, identifier):
+def pyopencl_function_id_to_in_knl_callable_mapper(target, identifier):
     if identifier in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh",
             "tanh", "conj", "real", "imag", "abs"]:
         return PyOpenCLCallable(name=identifier)
@@ -397,8 +398,8 @@ class PyOpenCLTarget(OpenCLTarget):
             kernel = adjust_local_temp_var_storage(kernel, self.device)
         return kernel
 
-    def pre_codegen_check(self, kernel, program_callables_info):
-        check_sizes(kernel, program_callables_info, self.device)
+    def pre_codegen_check(self, kernel, callables_table):
+        check_sizes(kernel, callables_table, self.device)
 
     def get_host_ast_builder(self):
         return PyOpenCLPythonASTBuilder(self)
@@ -792,11 +793,13 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder):
 
     # {{{ library
 
-    def function_scopers(self):
-        from loopy.library.random123 import random123_function_scoper
+    def function_id_in_knl_callable_mapper(self):
+        from loopy.library.random123 import (
+                random123_function_id_to_in_knl_callable_mapper)
         return (
-                [pyopencl_function_scoper, random123_function_scoper] + super(
-                    PyOpenCLCASTBuilder, self).function_scopers())
+                [pyopencl_function_id_to_in_knl_callable_mapper,
+                    random123_function_id_to_in_knl_callable_mapper] + super(
+                    PyOpenCLCASTBuilder, self).function_id_in_knl_callable_mapper())
 
     def preamble_generators(self):
         return ([
diff --git a/loopy/target/python.py b/loopy/target/python.py
index cd6e611673754034238fbc6e8775c43eb3c4c2f4..1f83112ff8fd9f32f2e48f3c76a3de0abaad92fd 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -45,7 +45,7 @@ class ExpressionToPythonMapper(StringifyMapper):
 
         if type_inf_mapper is None:
             type_inf_mapper = TypeInferenceMapper(self.kernel,
-                    self.codegen_state.program_callables_info)
+                    self.codegen_state.callables_table)
         self.type_inf_mapper = type_inf_mapper
 
     def handle_unsupported_expression(self, victim, enclosing_prec):
@@ -85,7 +85,7 @@ class ExpressionToPythonMapper(StringifyMapper):
     def map_call(self, expr, enclosing_prec):
         from pymbolic.mapper.stringifier import PREC_NONE
 
-        identifier_name = self.codegen_state.program_callables_info[
+        identifier_name = self.codegen_state.callables_table[
                 expr.function.name].name
 
         if identifier_name in ["indexof", "indexof_vec"]:
@@ -93,7 +93,7 @@ class ExpressionToPythonMapper(StringifyMapper):
                     "indexof, indexof_vec not yet supported in Python")
 
         from loopy.kernel.function_interface import ManglerCallable
-        in_knl_callable = self.codegen_state.program_callables_info[
+        in_knl_callable = self.codegen_state.callables_table[
                 expr.function.name]
         if isinstance(in_knl_callable, ManglerCallable):
             from loopy.codegen import SeenFunction
@@ -180,10 +180,11 @@ class PythonASTBuilderBase(ASTBuilderBase):
 
     # {{{ code generation guts
 
-    def function_scopers(self):
+    def function_id_in_knl_callable_mapper(self):
         from loopy.target.c import scope_c_math_functions
         return (
-                super(PythonASTBuilderBase, self).function_scopers() +
+                super(PythonASTBuilderBase,
+                    self).function_id_in_knl_callable_mapper() +
                 [scope_c_math_functions])
 
     def preamble_generators(self):
diff --git a/loopy/tools.py b/loopy/tools.py
index b243a79492dacc17f70e5afc7626c17a6ee03774..52fc7d3ce8c12d6c8491cd363d42836f5b2aca5a 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -43,6 +43,17 @@ else:
         return isinstance(obj, (int, np.integer))
 
 
+def update_persistent_hash(obj, key_hash, key_builder):
+    """
+    Custom hash computation function for use with
+    :class:`pytools.persistent_dict.PersistentDict`.
+
+    Only works in conjunction with :class:`loopy.tools.KeyBuilder`.
+    """
+    for field_name in obj.hash_fields:
+        key_builder.rec(key_hash, getattr(obj, field_name))
+
+
 # {{{ custom KeyBuilder subclass
 
 class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase):
@@ -79,6 +90,11 @@ class LoopyKeyBuilder(KeyBuilderBase):
 
     update_for_defaultdict = update_for_dict
 
+    def update_for_frozenset(self, key_hash, key):
+        for set_key in sorted(key,
+                key=lambda obj: type(obj).__name__ + str(obj)):
+            self.rec(key_hash, set_key)
+
     def update_for_BasicSet(self, key_hash, key):  # noqa
         from islpy import Printer
         prn = Printer.to_str(key.get_ctx())
diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py
index 57c4397f998a74222bc482127076dee809de2bac..2519b6a14765400c9aaba249c1010d07d820355e 100644
--- a/loopy/transform/buffer.py
+++ b/loopy/transform/buffer.py
@@ -133,7 +133,7 @@ buffer_array_cache = WriteOncePersistentDict(
 
 
 # Adding an argument? also add something to the cache_key below.
-def buffer_array_for_single_kernel(kernel, program_callables_info, var_name,
+def buffer_array_for_single_kernel(kernel, callables_table, var_name,
         buffer_inames, init_expression=None, store_expression=None,
         within=None, default_tag="l.auto", temporary_scope=None,
         temporary_is_local=None, fetch_bounding_box=False):
@@ -534,7 +534,7 @@ def buffer_array_for_single_kernel(kernel, program_callables_info, var_name,
     kernel = tag_inames(kernel, new_iname_to_tag)
 
     from loopy.kernel.tools import assign_automatic_axes
-    kernel = assign_automatic_axes(kernel, program_callables_info)
+    kernel = assign_automatic_axes(kernel, callables_table)
 
     if CACHING_ENABLED:
         from loopy.preprocess import prepare_for_caching
@@ -548,10 +548,10 @@ def buffer_array(program, *args, **kwargs):
     assert isinstance(program, Program)
 
     new_resolved_functions = {}
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             new_subkernel = buffer_array_for_single_kernel(
-                    in_knl_callable.subkernel, program.program_callables_info,
+                    in_knl_callable.subkernel, program.callables_table,
                     *args, **kwargs)
             in_knl_callable = in_knl_callable.copy(
                     subkernel=new_subkernel)
@@ -564,8 +564,8 @@ def buffer_array(program, *args, **kwargs):
 
         new_resolved_functions[func_id] = in_knl_callable
 
-    new_program_callables_info = program.program_callables_info.copy(
+    new_callables_table = program.callables_table.copy(
             resolved_functions=new_resolved_functions)
-    return program.copy(program_callables_info=new_program_callables_info)
+    return program.copy(callables_table=new_callables_table)
 
 # vim: foldmethod=marker
diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py
index dbda5d74f1b0465344288641ea70d6cf70c63ad6..749817bad2a07c57f3ab8086e6d45d92c559184d 100644
--- a/loopy/transform/callable.py
+++ b/loopy/transform/callable.py
@@ -31,7 +31,7 @@ from loopy.kernel import LoopKernel
 from pytools import ImmutableRecord
 from loopy.diagnostic import LoopyError
 from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase,
-        CInstruction, _DataObliviousInstruction)
+        Assignment, CInstruction, _DataObliviousInstruction)
 from loopy.symbolic import IdentityMapper, SubstitutionMapper, CombineMapper
 from loopy.isl_helpers import simplify_via_aff
 from loopy.kernel.function_interface import (get_kw_pos_association,
@@ -49,14 +49,22 @@ __doc__ = """
 
 # {{{ register function lookup
 
-def resolved_callables_from_function_lookup(program,
-        func_id_to_kernel_callable_mapper):
-    program_callables_info = program.program_callables_info
-    program_callables_info = program_callables_info.with_edit_callables_mode()
+def _resolved_callables_from_function_lookup(program,
+        func_id_to_in_kernel_callable_mapper):
+    """
+    Returns a copy of *program* with the expression nodes marked "Resolved"
+    if any match is found through the given
+    *func_id_to_in_kernel_callable_mapper*.
+
+    :arg func_id_to_in_kernel_callable_mapper: A function with signature
+        ``(target, identifier)`` that returns either an instance of
+        :class:`loopy.InKernelCallable` or *None*.
+    """
+    callables_table = program.callables_table
 
     callable_knls = dict(
             (func_id, in_knl_callable) for func_id, in_knl_callable in
-            program_callables_info.items() if isinstance(in_knl_callable,
+            callables_table.items() if isinstance(in_knl_callable,
                 CallableKernel))
     edited_callable_knls = {}
 
@@ -68,38 +76,34 @@ def resolved_callables_from_function_lookup(program,
                 kernel.substitutions, kernel.get_var_name_generator())
 
         resolved_function_marker = ResolvedFunctionMarker(
-                rule_mapping_context, kernel, program_callables_info,
-                [func_id_to_kernel_callable_mapper])
+                rule_mapping_context, kernel, callables_table,
+                [func_id_to_in_kernel_callable_mapper])
 
-        # scoping fucntions and collecting the scoped functions
         new_subkernel = rule_mapping_context.finish_kernel(
                 resolved_function_marker.map_kernel(kernel))
-        program_callables_info = resolved_function_marker.program_callables_info
+        callables_table = resolved_function_marker.callables_table
 
         edited_callable_knls[func_id] = in_knl_callable.copy(
                 subkernel=new_subkernel)
 
-    program_callables_info = (
-            program_callables_info.with_exit_edit_callables_mode())
-
     new_resolved_functions = {}
 
-    for func_id, in_knl_callable in program_callables_info.items():
+    for func_id, in_knl_callable in callables_table.items():
         if func_id in edited_callable_knls:
             new_resolved_functions[func_id] = edited_callable_knls[func_id]
         else:
             new_resolved_functions[func_id] = in_knl_callable
 
-    program_callables_info = program_callables_info.copy(
+    callables_table = callables_table.copy(
             resolved_functions=new_resolved_functions)
 
-    return program.copy(program_callables_info=program_callables_info)
+    return program.copy(callables_table=callables_table)
 
 
 def register_function_id_to_in_knl_callable_mapper(program,
         func_id_to_in_knl_callable_mapper):
     """
-    Returns a copy of *kernel* with the *function_lookup* registered.
+    Returns a copy of *program* with the given function lookup registered.
 
     :arg func_id_to_in_knl_callable_mapper: A function of signature ``(target,
         identifier)`` returning a
@@ -119,7 +123,7 @@ def register_function_id_to_in_knl_callable_mapper(program,
         new_func_id_mappers = program.func_id_to_in_knl_callable_mappers + (
                 [func_id_to_in_knl_callable_mapper])
 
-    program = resolved_callables_from_function_lookup(program,
+    program = _resolved_callables_from_function_lookup(program,
             func_id_to_in_knl_callable_mapper)
 
     new_program = program.copy(
@@ -169,7 +173,7 @@ def register_callable_kernel(program, callee_kernel):
     expected_num_assignees = len([arg for arg in callee_kernel.args if
         arg.is_output_only])
     expected_num_parameters = len(callee_kernel.args) - expected_num_assignees
-    for in_knl_callable in program.program_callables_info.values():
+    for in_knl_callable in program.callables_table.values():
         if isinstance(in_knl_callable, CallableKernel):
             caller_kernel = in_knl_callable.subkernel
             for insn in caller_kernel.instructions:
@@ -207,25 +211,20 @@ def register_callable_kernel(program, callee_kernel):
 
     # take the function resolvers from the Program and resolve the functions in
     # the callee kernel
-    program_callables_info = (
-            program.program_callables_info.with_edit_callables_mode())
-
     from loopy.symbolic import SubstitutionRuleMappingContext
     rule_mapping_context = SubstitutionRuleMappingContext(
             callee_kernel.substitutions,
             callee_kernel.get_var_name_generator())
 
     resolved_function_marker = ResolvedFunctionMarker(
-            rule_mapping_context, callee_kernel, program_callables_info,
+            rule_mapping_context, callee_kernel, program.callables_table,
             program.func_id_to_in_knl_callable_mappers)
 
     callee_kernel = rule_mapping_context.finish_kernel(
             resolved_function_marker.map_kernel(callee_kernel))
-    program_callables_info = resolved_function_marker.program_callables_info
+    callables_table = resolved_function_marker.callables_table.copy()
 
-    program_callables_info = (
-            program_callables_info.with_exit_edit_callables_mode())
-    program = program.copy(program_callables_info=program_callables_info)
+    program = program.copy(callables_table=callables_table)
 
     # making the target of the child kernel to be same as the target of parent
     # kernel.
@@ -456,15 +455,25 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction):
                 type(atomicity)(var_map[p.Variable(atomicity.var_name)].name)
                 for atomicity in insn.atomicity)
 
-        insn = insn.copy(
-            id=insn_id[insn.id],
-            within_inames=within_inames,
-            # TODO: probaby need to keep priority in callee kernel
-            priority=instruction.priority,
-            depends_on=depends_on,
-            tags=insn.tags | instruction.tags,
-            atomicity=new_atomicity
-        )
+        if isinstance(insn, Assignment):
+            insn = insn.copy(
+                id=insn_id[insn.id],
+                within_inames=within_inames,
+                # TODO: probably need to keep priority in callee kernel
+                priority=instruction.priority,
+                depends_on=depends_on,
+                tags=insn.tags | instruction.tags,
+                atomicity=new_atomicity
+            )
+        else:
+            insn = insn.copy(
+                id=insn_id[insn.id],
+                within_inames=within_inames,
+                # TODO: probably need to keep priority in callee kernel
+                priority=instruction.priority,
+                depends_on=depends_on,
+                tags=insn.tags | instruction.tags,
+            )
         inner_insns.append(insn)
 
     inner_insns.append(noop_end)
@@ -488,27 +497,22 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction):
 # {{{ inline callable kernel
 
 def _inline_single_callable_kernel(caller_kernel, function_name,
-        program_callables_info):
+        callables_table):
     old_insns = caller_kernel.instructions
     for insn in old_insns:
         if isinstance(insn, CallInstruction):
             # FIXME This seems to use identifiers across namespaces. Why not
             # check whether the function is a scoped function first? ~AK
-            if insn.expression.function.name in program_callables_info:
-                history_of_identifier = program_callables_info.history[
+            if insn.expression.function.name in callables_table:
+                history_of_identifier = callables_table.history[
                         insn.expression.function.name]
 
                 if function_name in history_of_identifier:
-                    in_knl_callable = program_callables_info[
+                    in_knl_callable = callables_table[
                             insn.expression.function.name]
                     assert isinstance(in_knl_callable, CallableKernel)
                     caller_kernel = _inline_call_instruction(
                             caller_kernel, in_knl_callable.subkernel, insn)
-                    program_callables_info = (
-                            program_callables_info.with_deleted_callable(
-                                insn.expression.function.name,
-                                program_callables_info.num_times_callables_called[
-                                    caller_kernel.name]))
         elif isinstance(insn, (MultiAssignmentBase, CInstruction,
                 _DataObliviousInstruction)):
             pass
@@ -517,7 +521,7 @@ def _inline_single_callable_kernel(caller_kernel, function_name,
                     "Unknown instruction type %s"
                     % type(insn).__name__)
 
-    return caller_kernel, program_callables_info
+    return caller_kernel, callables_table
 
 
 # FIXME This should take a 'within' parameter to be able to only inline
@@ -529,33 +533,33 @@ def inline_callable_kernel(program, function_name):
     """
     from loopy.preprocess import infer_arg_descr
     program = infer_arg_descr(program)
-    program_callables_info = program.program_callables_info
-    old_program_callables_info = program_callables_info.copy()
+    callables_table = program.callables_table
+    old_callables_table = callables_table.copy()
 
     edited_callable_kernels = {}
 
-    for func_id, in_knl_callable in old_program_callables_info.items():
-        if function_name not in old_program_callables_info.history[func_id] and (
+    for func_id, in_knl_callable in old_callables_table.items():
+        if function_name not in old_callables_table.history[func_id] and (
                 isinstance(in_knl_callable, CallableKernel)):
             caller_kernel = in_knl_callable.subkernel
-            caller_kernel, program_callables_info = (
+            caller_kernel, callables_table = (
                     _inline_single_callable_kernel(caller_kernel,
                         function_name,
-                        program_callables_info))
+                        callables_table))
             edited_callable_kernels[func_id] = in_knl_callable.copy(
                     subkernel=caller_kernel)
 
     new_resolved_functions = {}
-    for func_id, in_knl_callable in program_callables_info.items():
+    for func_id, in_knl_callable in callables_table.items():
         if func_id in edited_callable_kernels:
             new_resolved_functions[func_id] = edited_callable_kernels[func_id]
         else:
             new_resolved_functions[func_id] = in_knl_callable
 
-    program_callables_info = program_callables_info.copy(
+    callables_table = callables_table.copy(
             resolved_functions=new_resolved_functions)
 
-    return program.copy(program_callables_info=program_callables_info)
+    return program.copy(callables_table=callables_table)
 
 # }}}
 
@@ -725,20 +729,20 @@ def _match_caller_callee_argument_dimension_(program, callee_function_name):
             callee_function_name).map_kernel
 
     caller_knl,  = [in_knl_callable.subkernel for in_knl_callable in
-            program.program_callables_info.values() if isinstance(in_knl_callable,
+            program.callables_table.values() if isinstance(in_knl_callable,
                 CallableKernel) and
             is_invoking_callee(in_knl_callable.subkernel)]
 
-    old_callee_knl = program.program_callables_info[
+    old_callee_knl = program.callables_table[
             callee_function_name].subkernel
     new_callee_kernel = _match_caller_callee_argument_dimension_for_single_kernel(
             caller_knl, old_callee_knl)
 
-    new_program_callables_info = program.program_callables_info.copy()
-    new_program_callables_info.resolved_functions[callee_function_name] = (
-            new_program_callables_info[callee_function_name].copy(
+    new_callables_table = program.callables_table.copy()
+    new_callables_table.resolved_functions[callee_function_name] = (
+            new_callables_table[callee_function_name].copy(
                 subkernel=new_callee_kernel))
-    return program.copy(program_callables_info=new_program_callables_info)
+    return program.copy(callables_table=new_callables_table)
 
 # }}}
 
diff --git a/loopy/transform/data.py b/loopy/transform/data.py
index 5f4f2f2a77b927e4a4352077ed94492249ef75a0..888bedc1de4ba1993d505a2c660390d41e286a99 100644
--- a/loopy/transform/data.py
+++ b/loopy/transform/data.py
@@ -143,7 +143,7 @@ class _not_provided:  # noqa: N801
     pass
 
 
-def add_prefetch_for_single_kernel(kernel, program_callables_info, var_name,
+def add_prefetch_for_single_kernel(kernel, callables_table, var_name,
         sweep_inames=[], dim_arg_names=None,
 
         # "None" is a valid value here, distinct from the default.
@@ -334,7 +334,7 @@ def add_prefetch_for_single_kernel(kernel, program_callables_info, var_name,
     # warning message.
 
     from loopy.transform.precompute import precompute_for_single_kernel
-    new_kernel = precompute_for_single_kernel(kernel, program_callables_info,
+    new_kernel = precompute_for_single_kernel(kernel, callables_table,
             subst_use, sweep_inames, precompute_inames=dim_arg_names,
             default_tag=default_tag, dtype=arg.dtype,
             fetch_bounding_box=fetch_bounding_box,
@@ -373,10 +373,10 @@ def add_prefetch(program, *args, **kwargs):
     assert isinstance(program, Program)
 
     new_resolved_functions = {}
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             new_subkernel = add_prefetch_for_single_kernel(
-                    in_knl_callable.subkernel, program.program_callables_info,
+                    in_knl_callable.subkernel, program.callables_table,
                     *args, **kwargs)
             in_knl_callable = in_knl_callable.copy(
                     subkernel=new_subkernel)
@@ -389,9 +389,9 @@ def add_prefetch(program, *args, **kwargs):
 
         new_resolved_functions[func_id] = in_knl_callable
 
-    new_program_callables_info = program.program_callables_info.copy(
+    new_callables_table = program.callables_table.copy(
             resolved_functions=new_resolved_functions)
-    return program.copy(program_callables_info=new_program_callables_info)
+    return program.copy(callables_table=new_callables_table)
 
 # }}}
 
diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py
index d43ce025b3b9a232ebf14a35a47da2dad645a872..9b83f242bde7923a3932a00b42f442954cf9a7db 100644
--- a/loopy/transform/fusion.py
+++ b/loopy/transform/fusion.py
@@ -292,50 +292,6 @@ def _fuse_two_kernels(knla, knlb):
 
 
 def fuse_loop_kernels(kernels, suffixes=None, data_flow=None):
-    """Return a kernel that performs all the operations in all entries
-    of *kernels*.
-
-    :arg kernels: A list of :class:`loopy.LoopKernel` instances to be fused.
-    :arg suffixes: If given, must be a list of strings of a length matching
-        that of *kernels*. This will be used to disambiguate the names
-        of temporaries, as described below.
-    :arg data_flow: A list of data dependencies
-        ``[(var_name, from_kernel, to_kernel), ...]``.
-        Based on this, the fuser will create dependencies between all
-        writers of *var_name* in ``kernels[from_kernel]`` to
-        readers of *var_name* in ``kernels[to_kernel]``.
-        *from_kernel* and *to_kernel* are indices into *kernels*.
-
-    The components of the kernels are fused as follows:
-
-    *   The resulting kernel will have a domain involving all the inames
-        and parameters occurring across *kernels*.
-        Inames with matching names across *kernels* are fused in such a way
-        that they remain a single iname in the fused kernel.
-        Use :func:`loopy.rename_iname` if this is not desired.
-
-    *   The projection of the domains of each pair of kernels onto their
-        common subset of inames must match in order for fusion to
-        succeed.
-
-    *   Assumptions are fused by taking their conjunction.
-
-    *   If kernel arguments with matching names are encountered across
-        *kernels*, their declarations must match in order for fusion to
-        succeed.
-
-    *   Temporaries are automatically renamed to remain uniquely associated
-        with each instruction stream.
-
-    *   The resulting kernel will contain all instructions from each entry
-        of *kernels*. Clashing instruction IDs will be renamed to ensure
-        uniqueness.
-
-    .. versionchanged:: 2016.2
-
-        *data_flow* was added in version 2016.2
-    """
-
     assert all(isinstance(knl, LoopKernel) for knl in kernels)
     kernels = list(kernels)
 
@@ -419,29 +375,80 @@ def fuse_loop_kernels(kernels, suffixes=None, data_flow=None):
 
 
 def fuse_kernels(programs, suffixes=None, data_flow=None):
+    """Return a program whose root kernel performs all the operations in the
+    root kernels of all entries of *programs* (called *kernels* below).
+
+    :arg programs: A list of programs whose root kernels are to be fused.
+    :arg suffixes: If given, must be a list of strings of a length matching
+        that of *kernels*. This will be used to disambiguate the names
+        of temporaries, as described below.
+    :arg data_flow: A list of data dependencies
+        ``[(var_name, from_kernel, to_kernel), ...]``.
+        Based on this, the fuser will create dependencies between all
+        writers of *var_name* in ``kernels[from_kernel]`` to
+        readers of *var_name* in ``kernels[to_kernel]``.
+        *from_kernel* and *to_kernel* are indices into *kernels*.
+
+    The components of the kernels are fused as follows:
+
+    *   The resulting kernel will have a domain involving all the inames
+        and parameters occurring across *kernels*.
+        Inames with matching names across *kernels* are fused in such a way
+        that they remain a single iname in the fused kernel.
+        Use :func:`loopy.rename_iname` if this is not desired.
+
+    *   The projection of the domains of each pair of kernels onto their
+        common subset of inames must match in order for fusion to
+        succeed.
+
+    *   Assumptions are fused by taking their conjunction.
+
+    *   If kernel arguments with matching names are encountered across
+        *kernels*, their declarations must match in order for fusion to
+        succeed.
+
+    *   Temporaries are automatically renamed to remain uniquely associated
+        with each instruction stream.
+
+    *   The resulting kernel will contain all instructions from each entry
+        of *kernels*. Clashing instruction IDs will be renamed to ensure
+        uniqueness.
+
+    .. versionchanged:: 2016.2
+
+        *data_flow* was added in version 2016.2
+    """
+
+    # all the resolved functions in programs must be registered in
+    # main_prog_callables_info
     main_prog_callables_info = (
-            programs[0].program_callables_info.with_edit_callables_mode())
+            programs[0].callables_table)
     old_root_kernel_callable = (
-            programs[0].program_callables_info[programs[0].name])
+            programs[0].callables_table[programs[0].name])
     kernels = [programs[0].root_kernel]
 
     # removing the callable collisions that maybe present
     for prog in programs[1:]:
         root_kernel = prog.root_kernel
         renames_needed = {}
-        for old_func_id, in_knl_callable in prog.program_callables_info.items():
+        for old_func_id, in_knl_callable in prog.callables_table.items():
             if isinstance(in_knl_callable, CallableKernel):
-                if in_knl_callable.name != prog.name:
+                # Fusing programs with multiple callable kernels is tough.
+                # Reason: one first needs to figure out the order in which
+                # the callable kernels must be resolved into
+                # main_prog_callables_info, because renaming needs to be done
+                # in the callable kernels before registering them.
+                # Hence this is disabled until required.
+                if in_knl_callable.subkernel.name != prog.name:
                     raise LoopyError("fuse_kernels cannot fuse programs with "
                             "multiple callable kernels.")
+
+                # root kernels are dealt with at the end, after all the
+                # renaming has been performed.
                 continue
-            num_times_called = (
-                    prog.program_callables_info.num_times_callables_called[
-                        old_func_id])
-            for i in range(num_times_called):
-                main_prog_callables_info, new_func_id = (
-                        main_prog_callables_info.with_callables(var(old_func_id),
-                            in_knl_callable, True))
+            main_prog_callables_info, new_func_id = (
+                    main_prog_callables_info.with_added_callable(var(old_func_id),
+                        in_knl_callable))
 
             if old_func_id != new_func_id:
                 renames_needed[old_func_id] = new_func_id
@@ -456,13 +463,11 @@ def fuse_kernels(programs, suffixes=None, data_flow=None):
     new_root_kernel_callable = old_root_kernel_callable.copy(
             subkernel=new_root_kernel.copy(name=programs[0].name))
 
-    main_prog_callables_info, _ = main_prog_callables_info.with_callable(
+    # TODO: change the name of the final root kernel.
+    main_prog_callables_info, _ = main_prog_callables_info.with_added_callable(
             var(programs[0].name), new_root_kernel_callable)
 
-    main_prog_callables_info = (
-            main_prog_callables_info.with_exit_edit_callables_mode())
-
     return programs[0].copy(
-            program_callables_info=main_prog_callables_info)
+            callables_table=main_prog_callables_info)
 
 # vim: foldmethod=marker
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index 93f6c53e84513c8e703c9cc51c66f5eb9f491084..fb6682f4866317c0852c3814268ada904b2391b8 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -146,8 +146,7 @@ class _InameSplitter(RuleAwareIdentityMapper):
                 and self.split_iname not in expn_state.arg_context
                 and self.within(
                     expn_state.kernel,
-                    expn_state.instruction,
-                    expn_state.stack)):
+                    expn_state.instruction)):
             new_inames = list(expr.inames)
             new_inames.remove(self.split_iname)
             new_inames.extend([self.outer_iname, self.inner_iname])
@@ -164,8 +163,7 @@ class _InameSplitter(RuleAwareIdentityMapper):
                 and self.split_iname not in expn_state.arg_context
                 and self.within(
                     expn_state.kernel,
-                    expn_state.instruction,
-                    expn_state.stack)):
+                    expn_state.instruction)):
             return self.replacement_index
         else:
             return super(_InameSplitter, self).map_variable(expr, expn_state)
@@ -184,6 +182,22 @@ def _split_iname_backend(kernel, split_iname,
         for syntax.
     """
 
+    from loopy.match import parse_match
+    within = parse_match(within)
+
+    # {{{ return the same kernel if no instruction matches *within*
+
+    def _no_instruction_matches_within():
+        # True if *within* rejects every instruction of *kernel*.
+        return not any(
+                within(kernel, insn) for insn in kernel.instructions)
+
+    if _no_instruction_matches_within():
+        # nothing to split: hand the kernel back untouched
+        return kernel
+
+    # }}}
+
     existing_tags = kernel.iname_tags(split_iname)
     from loopy.kernel.data import ForceSequentialTag, filter_iname_tags_by_type
     if (do_tagged_check and existing_tags
@@ -237,10 +251,15 @@ def _split_iname_backend(kernel, split_iname,
         name_dim_type, name_idx = space.get_var_dict()[split_iname]
         s = s.intersect(fixed_constraint_set)
 
-        if within is None:
-            s = s.project_out(name_dim_type, name_idx, 1)
+        def _project_out_only_if_all_instructions_in_within():
+            for insn in kernel.instructions:
+                if split_iname in insn.within_inames and (
+                        not within(kernel, insn)):
+                    return s
 
-        return s
+            return s.project_out(name_dim_type, name_idx, 1)
+
+        return _project_out_only_if_all_instructions_in_within()
 
     new_domains = [process_set(dom) for dom in kernel.domains]
 
@@ -256,7 +275,8 @@ def _split_iname_backend(kernel, split_iname,
 
     new_insns = []
     for insn in kernel.instructions:
-        if split_iname in insn.within_inames:
+        if split_iname in insn.within_inames and (
+                within(kernel, insn)):
             new_within_inames = (
                     (insn.within_inames.copy()
                     - frozenset([split_iname]))
@@ -291,9 +311,6 @@ def _split_iname_backend(kernel, split_iname,
             applied_iname_rewrites=applied_iname_rewrites,
             loop_priority=frozenset(new_priorities))
 
-    from loopy.match import parse_stack_match
-    within = parse_stack_match(within)
-
     rule_mapping_context = SubstitutionRuleMappingContext(
             kernel.substitutions, kernel.get_var_name_generator())
     ins = _InameSplitter(rule_mapping_context, within,
@@ -338,7 +355,7 @@ def split_iname(kernel, split_iname, inner_length,
     :arg inner_tag: The iname tag (see :ref:`iname-tags`) to apply to
         *inner_iname*.
     :arg within: a stack match as understood by
-        :func:`loopy.match.parse_stack_match`.
+        :func:`loopy.match.parse_match`.
     """
     assert isinstance(kernel, LoopKernel)
 
@@ -1078,7 +1095,7 @@ def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=Fals
 
 
 def get_iname_duplication_options(program, use_boostable_into=False):
-    for in_knl_callable in program.program_callables_info.values():
+    for in_knl_callable in program.callables_table.values():
         if isinstance(in_knl_callable, CallableKernel):
             for option in get_iname_duplication_options_for_single_kernel(
                     in_knl_callable.subkernel, use_boostable_into):
@@ -1104,7 +1121,7 @@ def has_schedulable_iname_nesting_for_single_kernel(knl):
 def has_schedulable_iname_nesting(program):
     return all(has_schedulable_iname_nesting_for_single_kernel(
         in_knl_callable.subkernel) for in_knl_callable in
-        program.program_callables_info.values() if isinstance(in_knl_callable,
+        program.callables_table.values() if isinstance(in_knl_callable,
             CallableKernel))
 
 # }}}
diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 93cf932b1bf555b64d502b2c7a8c2fcc658d6a25..f73110ecdff79d7c029c0dd0d895ef71ea68326b 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -42,7 +42,7 @@ def find_instructions_in_single_kernel(kernel, insn_match):
 def find_instructions(program, insn_match):
     assert isinstance(program, Program)
     insns = []
-    for in_knl_callable in program.program_callables_info.values():
+    for in_knl_callable in program.callables_table.values():
         if isinstance(in_knl_callable, CallableKernel):
             insns += (find_instructions_in_single_kernel(
                 in_knl_callable.subkernel, insn_match))
diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py
index 734072574cf970cd6093fef56105578b95e76c5f..e5ed850c655df726e79bd487c4ea6c603d94520b 100644
--- a/loopy/transform/pack_and_unpack_args.py
+++ b/loopy/transform/pack_and_unpack_args.py
@@ -37,7 +37,7 @@ __doc__ = """
 
 
 def pack_and_unpack_args_for_call_for_single_kernel(kernel,
-        program_callables_info, call_name, args_to_pack=None,
+        callables_table, call_name, args_to_pack=None,
         args_to_unpack=None):
     """
     Returns a a copy of *kernel* with instructions appended to copy the
@@ -63,10 +63,10 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel,
         if not isinstance(insn, CallInstruction):
             # pack and unpack call only be done for CallInstructions.
             continue
-        if insn.expression.function.name not in program_callables_info:
+        if insn.expression.function.name not in callables_table:
             continue
 
-        in_knl_callable = program_callables_info[
+        in_knl_callable = callables_table[
                 insn.expression.function.name]
 
         if in_knl_callable.name != call_name:
@@ -324,10 +324,10 @@ def pack_and_unpack_args_for_call(program, *args, **kwargs):
     assert isinstance(program, Program)
 
     new_resolved_functions = {}
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             new_subkernel = pack_and_unpack_args_for_call_for_single_kernel(
-                    in_knl_callable.subkernel, program.program_callables_info,
+                    in_knl_callable.subkernel, program.callables_table,
                     *args, **kwargs)
             in_knl_callable = in_knl_callable.copy(
                     subkernel=new_subkernel)
@@ -340,8 +340,8 @@ def pack_and_unpack_args_for_call(program, *args, **kwargs):
 
         new_resolved_functions[func_id] = in_knl_callable
 
-    new_program_callables_info = program.program_callables_info.copy(
+    new_callables_table = program.callables_table.copy(
             resolved_functions=new_resolved_functions)
-    return program.copy(program_callables_info=new_program_callables_info)
+    return program.copy(callables_table=new_callables_table)
 
 # vim: foldmethod=marker
diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index 66c7114aee2a987c3a7810252ba3cc4f4d293c83..71b11fa24c49437032537d22aa7275b7e9e3d7f4 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -261,7 +261,7 @@ class _not_provided(object):  # noqa: N801
     pass
 
 
-def precompute_for_single_kernel(kernel, program_callables_info, subst_use,
+def precompute_for_single_kernel(kernel, callables_table, subst_use,
         sweep_inames=[], within=None, storage_axes=None, temporary_name=None,
         precompute_inames=None, precompute_outer_inames=None,
         storage_axis_to_tag={},
@@ -1047,7 +1047,7 @@ def precompute_for_single_kernel(kernel, program_callables_info, subst_use,
 
     if filter_iname_tags_by_type(new_iname_to_tag.values(), AutoFitLocalIndexTag):
         from loopy.kernel.tools import assign_automatic_axes
-        kernel = assign_automatic_axes(kernel, program_callables_info)
+        kernel = assign_automatic_axes(kernel, callables_table)
 
     return kernel
 
@@ -1056,10 +1056,10 @@ def precompute(program, *args, **kwargs):
     assert isinstance(program, Program)
 
     new_resolved_functions = {}
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             new_subkernel = precompute_for_single_kernel(
-                    in_knl_callable.subkernel, program.program_callables_info,
+                    in_knl_callable.subkernel, program.callables_table,
                     *args, **kwargs)
             in_knl_callable = in_knl_callable.copy(
                     subkernel=new_subkernel)
@@ -1072,8 +1072,8 @@ def precompute(program, *args, **kwargs):
 
         new_resolved_functions[func_id] = in_knl_callable
 
-    new_program_callables_info = program.program_callables_info.copy(
+    new_callables_table = program.callables_table.copy(
             resolved_functions=new_resolved_functions)
-    return program.copy(program_callables_info=new_program_callables_info)
+    return program.copy(callables_table=new_callables_table)
 
 # vim: foldmethod=marker
diff --git a/loopy/transform/save.py b/loopy/transform/save.py
index 4b957b033fd4659732970b12ee3986b4028811cf..e463353ef9ea0860188b49fcb8f2f06bb96b0f41 100644
--- a/loopy/transform/save.py
+++ b/loopy/transform/save.py
@@ -235,9 +235,9 @@ class TemporarySaver(object):
         def new_shape(self):
             return self.hw_dims + self.non_hw_dims
 
-    def __init__(self, kernel, program_callables_info):
+    def __init__(self, kernel, callables_table):
         self.kernel = kernel
-        self.program_callables_info = program_callables_info
+        self.callables_table = callables_table
         self.var_name_gen = kernel.get_var_name_generator()
         self.insn_name_gen = kernel.get_instruction_id_generator()
 
@@ -441,7 +441,7 @@ class TemporarySaver(object):
 
         group_sizes, local_sizes = (
             self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids,
-                self.program_callables_info))
+                self.callables_table))
 
         if temporary.address_space == lp.AddressSpace.LOCAL:
             # Elide local axes in the save slot for local temporaries.
@@ -630,7 +630,7 @@ class TemporarySaver(object):
                     kernel = lp.add_nosync(kernel, "global", source, sink)
 
         from loopy.kernel.tools import assign_automatic_axes
-        return assign_automatic_axes(kernel, self.program_callables_info)
+        return assign_automatic_axes(kernel, self.callables_table)
 
     def save(self, temporary, subkernel):
         self.save_or_reload_impl(temporary, subkernel, "save")
@@ -754,12 +754,12 @@ def save_and_reload_temporaries(program):
         program = lp.preprocess_program(program)
         from loopy.schedule import get_one_scheduled_kernel
         knl = get_one_scheduled_kernel(program.root_kernel,
-                program.program_callables_info)
+                program.callables_table)
 
     assert knl.schedule is not None
 
     liveness = LivenessAnalysis(knl)
-    saver = TemporarySaver(knl, program.program_callables_info)
+    saver = TemporarySaver(knl, program.callables_table)
 
     from loopy.schedule.tools import (
         temporaries_read_in_subkernel, temporaries_written_in_subkernel)
diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py
index 6a93e0bd99bc779f66fa3fb0aea67b55ea246740..56ae123b78fde7bbf51c27317d74a50ecad0dd5b 100644
--- a/loopy/transform/subst.py
+++ b/loopy/transform/subst.py
@@ -511,7 +511,7 @@ def find_rules_matching(knl, pattern):
 
 def find_one_rule_matching(program, pattern):
     rules = []
-    for in_knl_callable in program.program_callables_info.values():
+    for in_knl_callable in program.callables_table.values():
         if isinstance(in_knl_callable, CallableKernel):
             knl = in_knl_callable.subkernel
             rules.extend(find_rules_matching(knl, pattern))
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index b434f03b146a7eb76ced59ddd5fa670fea4e7f09..5047dcc2743b8d47a16beaf67c69e54ac9afb554 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -35,8 +35,11 @@ from loopy.diagnostic import (
         TypeInferenceFailure, DependencyTypeInferenceFailure)
 from loopy.kernel.instruction import _DataObliviousInstruction
 
-from loopy.program import ProgramCallablesInfo
-from loopy.symbolic import SubArrayRef, LinearSubscript
+from loopy.program import CallablesTable
+from loopy.symbolic import (
+        LinearSubscript, parse_tagged_name, RuleAwareIdentityMapper,
+        SubstitutionRuleExpander, ResolvedFunction,
+        SubstitutionRuleMappingContext, SubArrayRef)
 from pymbolic.primitives import Variable, Subscript, Lookup
 
 import logging
@@ -52,7 +55,7 @@ def _debug(kernel, s, *args):
 def get_return_types_as_tuple(arg_id_to_dtype):
     """Returns the types of arguments in  a tuple format.
 
-    :param arg_id_to_dtype: An instance of :class:`dict` which denotes a
+    :arg arg_id_to_dtype: An instance of :class:`dict` which denotes a
                             mapping from the arguments to their inferred types.
     """
     return_arg_id_to_dtype = dict((id, dtype) for id, dtype in
@@ -62,10 +65,139 @@ def get_return_types_as_tuple(arg_id_to_dtype):
     return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos)
 
 
+# {{{ renaming helpers
+
+class FunctionNameChanger(RuleAwareIdentityMapper):
+    """
+    Changes the names of scoped functions in calls of expressions according to
+    the mapping ``calls_to_new_names``.
+    """
+
+    def __init__(self, rule_mapping_context, calls_to_new_names,
+            subst_expander):
+        super(FunctionNameChanger, self).__init__(rule_mapping_context)
+        self.calls_to_new_names = calls_to_new_names
+        self.subst_expander = subst_expander
+
+    def map_call(self, expr, expn_state):
+        name, tag = parse_tagged_name(expr.function)
+
+        if name not in self.rule_mapping_context.old_subst_rules:
+            expanded_expr = self.subst_expander(expr)
+            if expr in self.calls_to_new_names:
+                return type(expr)(
+                        ResolvedFunction(self.calls_to_new_names[expr]),
+                        tuple(self.rec(child, expn_state)
+                            for child in expr.parameters))
+            elif expanded_expr in self.calls_to_new_names:
+                # FIXME: This is killing the substitution.
+                # Maybe using a RuleAwareIdentityMapper for TypeInferenceMapper
+                # would help.
+                return type(expr)(
+                        ResolvedFunction(self.calls_to_new_names[expanded_expr]),
+                        tuple(self.rec(child, expn_state)
+                            for child in expanded_expr.parameters))
+            else:
+                return super(FunctionNameChanger, self).map_call(
+                        expr, expn_state)
+        else:
+            return self.map_substitution(name, tag, expr.parameters, expn_state)
+
+    def map_call_with_kwargs(self, expr, expn_state):
+
+        if expr in self.calls_to_new_names:
+            return type(expr)(
+                ResolvedFunction(self.calls_to_new_names[expr]),
+                tuple(self.rec(child, expn_state)
+                    for child in expr.parameters),
+                dict(
+                    (key, self.rec(val, expn_state))
+                    for key, val in six.iteritems(expr.kw_parameters))
+                    )
+        else:
+            return super(FunctionNameChanger, self).map_call_with_kwargs(
+                    expr, expn_state)
+
+
+def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names):
+    """
+    Returns a copy of *kernel* with the names of pymbolic calls changed
+    according to the mapping given by *pymbolic_calls_to_new_names*.
+
+    :arg pymbolic_calls_to_new_names: A mapping from instances of
+        :class:`pymbolic.primitives.Call` to :class:`str`.
+
+    **Example:**
+
+        - Given a *kernel* --
+
+        .. code::
+
+            -------------------------------------------------------------
+            KERNEL: loopy_kernel
+            -------------------------------------------------------------
+            ARGUMENTS:
+            x: type: <auto/runtime>, shape: (10), dim_tags: (N0:stride:1)
+            y: type: <auto/runtime>, shape: (10), dim_tags: (N0:stride:1)
+            -------------------------------------------------------------
+            DOMAINS:
+            { [i] : 0 <= i <= 9 }
+            -------------------------------------------------------------
+            INAME IMPLEMENTATION TAGS:
+            i: None
+            -------------------------------------------------------------
+            INSTRUCTIONS:
+            for i
+                y[i] = ResolvedFunction('sin')(x[i])
+            end i
+            -------------------------------------------------------------
+
+        - And given a *pymbolic_calls_to_new_names* --
+
+        .. code::
+
+            {Call(ResolvedFunction(Variable('sin')), (Subscript(Variable('x'),
+            Variable('i')),)): 'sin_1'}
+
+        - The following *kernel* is returned --
+
+        .. code::
+
+            -------------------------------------------------------------
+            KERNEL: loopy_kernel
+            -------------------------------------------------------------
+            ARGUMENTS:
+            x: type: <auto/runtime>, shape: (10), dim_tags: (N0:stride:1)
+            y: type: <auto/runtime>, shape: (10), dim_tags: (N0:stride:1)
+            -------------------------------------------------------------
+            DOMAINS:
+            { [i] : 0 <= i <= 9 }
+            -------------------------------------------------------------
+            INAME IMPLEMENTATION TAGS:
+            i: None
+            -------------------------------------------------------------
+            INSTRUCTIONS:
+            for i
+                y[i] = ResolvedFunction('sin_1')(x[i])
+            end i
+            -------------------------------------------------------------
+    """
+    rule_mapping_context = SubstitutionRuleMappingContext(
+                    kernel.substitutions, kernel.get_var_name_generator())
+    subst_expander = SubstitutionRuleExpander(kernel.substitutions)
+    name_changer = FunctionNameChanger(rule_mapping_context,
+            pymbolic_calls_to_new_names, subst_expander)
+
+    return rule_mapping_context.finish_kernel(
+            name_changer.map_kernel(kernel))
+
+# }}}
+
+
 # {{{ type inference mapper
 
 class TypeInferenceMapper(CombineMapper):
-    def __init__(self, kernel, program_callables_info, new_assignments=None):
+    def __init__(self, kernel, callables_table, new_assignments=None):
         """
         :arg new_assignments: mapping from names to either
             :class:`loopy.kernel.data.TemporaryVariable`
@@ -74,12 +206,12 @@ class TypeInferenceMapper(CombineMapper):
             instances
         """
         self.kernel = kernel
-        assert isinstance(program_callables_info, ProgramCallablesInfo)
+        assert isinstance(callables_table, CallablesTable)
         if new_assignments is None:
             new_assignments = {}
         self.new_assignments = new_assignments
         self.symbols_with_unknown_types = set()
-        self.program_callables_info = program_callables_info
+        self.callables_table = callables_table
         self.old_calls_to_new_calls = {}
 
     def __call__(self, expr, return_tuple=False, return_dtype_set=False):
@@ -113,16 +245,16 @@ class TypeInferenceMapper(CombineMapper):
     # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x)
     # are Python-equal (for many common constants such as integers).
 
-    def copy(self, program_callables_info=None):
-        if program_callables_info is None:
-            program_callables_info = self.program_callables_info
-        return type(self)(self.kernel, program_callables_info,
+    def copy(self, callables_table=None):
+        if callables_table is None:
+            callables_table = self.callables_table
+        return type(self)(self.kernel, callables_table,
                 self.new_assignments)
 
     def with_assignments(self, names_to_vars):
         new_ass = self.new_assignments.copy()
         new_ass.update(names_to_vars)
-        return type(self)(self.kernel, self.program_callables_info, new_ass)
+        return type(self)(self.kernel, self.callables_table, new_ass)
 
     @staticmethod
     def combine(dtype_sets):
@@ -276,7 +408,6 @@ class TypeInferenceMapper(CombineMapper):
     def map_call(self, expr, return_tuple=False):
 
         from pymbolic.primitives import Variable, CallWithKwargs, Call
-        from loopy.symbolic import ResolvedFunction
 
         if isinstance(expr, CallWithKwargs):
             kw_parameters = expr.kw_parameters
@@ -300,7 +431,7 @@ class TypeInferenceMapper(CombineMapper):
 
         # specializing the known function wrt type
         if isinstance(expr.function, ResolvedFunction):
-            in_knl_callable = self.program_callables_info[expr.function.name]
+            in_knl_callable = self.callables_table[expr.function.name]
 
             # {{{ checking that there is no overwriting of types of in_knl_callable
 
@@ -326,8 +457,11 @@ class TypeInferenceMapper(CombineMapper):
                                         np.int64):
                             continue
 
+                        if np.can_cast(arg_id_to_dtype[id].dtype.type,
+                                in_knl_callable.arg_id_to_dtype[id].dtype.type):
+                            continue
+
                         # }}}
-                        continue
 
                         raise LoopyError("Overwriting a specialized function "
                                 "is illegal--maybe start with new instance of "
@@ -335,17 +469,17 @@ class TypeInferenceMapper(CombineMapper):
 
             # }}}
 
-            in_knl_callable, self.program_callables_info = (
+            in_knl_callable, self.callables_table = (
                     in_knl_callable.with_types(
                         arg_id_to_dtype, self.kernel,
-                        self.program_callables_info))
+                        self.callables_table))
 
             in_knl_callable = in_knl_callable.with_target(self.kernel.target)
 
             # storing the type specialized function so that it can be used for
             # later use
-            self.program_callables_info, new_function_id = (
-                    self.program_callables_info.with_callable(
+            self.callables_table, new_function_id = (
+                    self.callables_table.with_callable(
                         expr.function.function,
                         in_knl_callable))
 
@@ -408,9 +542,9 @@ class TypeInferenceMapper(CombineMapper):
                 in_knl_callable = ManglerCallable(
                         identifier, function_mangler, arg_id_to_dtype,
                         arg_id_to_descr, mangle_result.target_name)
-                self.program_callables_info, new_function_id = (
-                        self.program_callables_info.with_callable(
-                            expr.function, in_knl_callable, True))
+                self.callables_table, new_function_id = (
+                        self.callables_table.with_added_callable(
+                            expr.function, in_knl_callable))
 
                 if isinstance(expr, Call):
                     self.old_calls_to_new_calls[expr] = new_function_id
@@ -566,7 +700,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
 
     if var_name in kernel.all_params():
         return [kernel.index_dtype], [], {}, (
-                type_inf_mapper.program_callables_info)
+                type_inf_mapper.callables_table)
 
     from functools import partial
     debug = partial(_debug, kernel)
@@ -613,13 +747,13 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
     if not dtype_sets:
         return (
                 None, type_inf_mapper.symbols_with_unknown_types, None,
-                type_inf_mapper.program_callables_info)
+                type_inf_mapper.callables_table)
 
     result = type_inf_mapper.combine(dtype_sets)
 
     return (result, type_inf_mapper.symbols_with_unknown_types,
             type_inf_mapper.old_calls_to_new_calls,
-            type_inf_mapper.program_callables_info)
+            type_inf_mapper.callables_table)
 
 # }}}
 
@@ -646,7 +780,7 @@ class _DictUnionView:
 
 # {{{ infer_unknown_types
 
-def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
+def infer_unknown_types_for_a_single_kernel(kernel, callables_table,
         expect_completion=False):
     """Infer types on temporaries and arguments."""
 
@@ -709,7 +843,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
             new_temp_vars,
             new_arg_dict
             ])
-    type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info,
+    type_inf_mapper = TypeInferenceMapper(kernel, callables_table,
             item_lookup)
 
     from loopy.symbolic import SubstitutionRuleExpander
@@ -745,11 +879,11 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
             debug("inferring type for %s %s", type(item).__name__, item.name)
 
             (result, symbols_with_unavailable_types,
-                    new_old_calls_to_new_calls, program_callables_info) = (
+                    new_old_calls_to_new_calls, callables_table) = (
                     _infer_var_type(
                             kernel, item.name, type_inf_mapper, subst_expander))
             type_inf_mapper = type_inf_mapper.copy(
-                    program_callables_info=program_callables_info)
+                    callables_table=callables_table)
 
             failed = not result
             if not failed:
@@ -768,9 +902,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
                         new_arg_dict[name] = item.copy(dtype=new_dtype)
                     else:
                         raise LoopyError("unexpected item type in type inference")
-                # TODO: I dont like in-place updates. Change this to something
-                # else. Perhaps add a function for doing this, which does it
-                # using a bunch of copies?
                 old_calls_to_new_calls.update(new_old_calls_to_new_calls)
             else:
                 debug("     failure")
@@ -859,8 +990,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
         if isinstance(insn, lp.MultiAssignmentBase):
             # just a dummy run over the expression, to pass over all the
             # functions
-            # FIXME: need a check over here which checks the instruction for
-            # unseen cases
             if _instruction_missed_during_inference(insn):
                 type_inf_mapper(insn.expression, return_tuple=isinstance(insn,
                     lp.CallInstruction), return_dtype_set=True)
@@ -871,7 +1000,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
             raise NotImplementedError("Unknown instructions type %s." % (
                 type(insn).__name__))
 
-    program_callables_info = type_inf_mapper.program_callables_info
+    callables_table = type_inf_mapper.callables_table
     old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls)
 
     end_time = time.time()
@@ -883,9 +1012,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
             args=[new_arg_dict[arg.name] for arg in kernel.args],
             )
 
-    # this has to be subsitutition
-    from loopy.kernel.function_interface import (
-            change_names_of_pymbolic_calls)
     type_specialized_kernel = change_names_of_pymbolic_calls(
             pre_type_specialized_knl, old_calls_to_new_calls)
 
@@ -895,47 +1021,42 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
     if expect_completion:
         # if completion is expected, then it is important that all the
         # callables are scoped.
-        from loopy.check import check_functions_are_scoped
-        check_functions_are_scoped(type_specialized_kernel)
+        from loopy.check import check_functions_are_resolved
+        check_functions_are_resolved(type_specialized_kernel)
 
-    return type_specialized_kernel, program_callables_info
+    return type_specialized_kernel, callables_table
 
 
 def infer_unknown_types(program, expect_completion=False):
     """Infer types on temporaries and arguments."""
-    from loopy.kernel import LoopKernel
-    if isinstance(program, LoopKernel):
-        # FIXME: deprecate warning needed here
-        from loopy.program import make_program_from_kernel
-        program = make_program_from_kernel(program)
 
-    program_callables_info = program.program_callables_info
+    callables_table = program.callables_table
 
     type_uninferred_knl_callable = (
-            program_callables_info[program.name])
+            callables_table[program.name])
     type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel
 
-    program_callables_info = (
-            program.program_callables_info.with_edit_callables_mode())
-    root_kernel, program_callables_info = (
+    old_callables_count = callables_table.callables_count
+    callables_table = (
+            program.callables_table.with_edit_callables_mode())
+    root_kernel, callables_table = (
             infer_unknown_types_for_a_single_kernel(
                 type_uninferred_root_kernel,
-                program_callables_info, expect_completion))
+                callables_table, expect_completion))
 
     type_inferred_knl_callable = type_uninferred_knl_callable.copy(
             subkernel=root_kernel)
 
-    program_callables_info, _ = (
-            program_callables_info.with_callable(
+    callables_table, _ = (
+            callables_table.with_callable(
                 program.name,
                 type_inferred_knl_callable))
 
-    program_callables_info = (
-            program_callables_info.with_exit_edit_callables_mode())
+    callables_table = (
+            callables_table.with_exit_edit_callables_mode(
+                old_callables_count))
 
-    # FIXME: maybe put all of this in a function?
-    # need to infer functions that were left out during inference
-    return program.copy(program_callables_info=program_callables_info)
+    return program.copy(callables_table=callables_table)
 
 # }}}
 
@@ -943,8 +1064,8 @@ def infer_unknown_types(program, expect_completion=False):
 # {{{ reduction expression helper
 
 def infer_arg_and_reduction_dtypes_for_reduction_expression(
-        kernel, expr, program_callables_info, unknown_types_ok):
-    type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info)
+        kernel, expr, callables_table, unknown_types_ok):
+    type_inf_mapper = TypeInferenceMapper(kernel, callables_table)
     import loopy as lp
 
     if expr.is_tuple_typed:
@@ -976,7 +1097,7 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression(
             for dt in reduction_dtypes)
 
     return tuple(arg_dtypes), reduction_dtypes, (
-            type_inf_mapper.program_callables_info)
+            type_inf_mapper.callables_table)
 
 # }}}
 
diff --git a/requirements.txt b/requirements.txt
index a3e88cfea99e7413211c35d11464932f98e23758..97c2024764715d0a715520800e2e1dd467183479 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,4 @@ git+https://github.com/inducer/codepy.git
 git+https://github.com/inducer/f2py
 
 # Optional, needed for using the C preprocessor on Fortran
-ply>=3.6
-
-# This is needed for the pyinstaller executable to be usable.
-packaging
+ply>=3.6
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index b939ce0cf8b680bb1eb3501ed6d7f563e9c1c7b6..eec3dfd1f52ed97c58f5281716eac8fc18980094 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [flake8]
-ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814
+ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814,W504
 max-line-length=85
 exclude=
     loopy/target/c/compyte/ndarray,
diff --git a/test/test_diff.py b/test/test_diff.py
index 7e14a7ab5274b8e40fbc5d792a7303daf171dc17..d001233c0eced5ecaf9342b90da0487faefb21f3 100644
--- a/test/test_diff.py
+++ b/test/test_diff.py
@@ -66,7 +66,7 @@ def test_diff(ctx_factory):
 
     from loopy.transform.diff import diff_kernel
     dknl, diff_map = diff_kernel(knl, "z", "x")
-    dknl = lp.make_program_from_kernel(dknl)
+    dknl = lp.make_program(dknl)
     dknl = lp.remove_unused_arguments(dknl)
 
     dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a")
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 9dc74b94f72347e3b4287e244f06292ce60527b4..2c10b11820ef88a6984a505f42eb0e2b1a95f707 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -408,11 +408,14 @@ def test_ilp_write_race_detection_global(ctx_factory):
 
     knl = lp.tag_inames(knl, dict(j="ilp"))
 
+    knl = lp.preprocess_kernel(knl)
+
     with lp.CacheMode(False):
         from loopy.diagnostic import WriteRaceConditionWarning
         from warnings import catch_warnings
         with catch_warnings(record=True) as warn_list:
-            lp.generate_code_v2(knl)
+            list(lp.generate_loop_schedules(knl.root_kernel,
+                    knl.callables_table))
 
             assert any(isinstance(w.message, WriteRaceConditionWarning)
                     for w in warn_list)
@@ -1267,7 +1270,7 @@ def save_and_reload_temporaries_test(queue, prog, out_expect, debug=False):
     from loopy.transform.save import save_and_reload_temporaries
     prog = save_and_reload_temporaries(prog)
     prog = prog.with_root_kernel(lp.get_one_scheduled_kernel(prog.root_kernel,
-        prog.program_callables_info))
+        prog.callables_table))
 
     if debug:
         print(prog)
@@ -2220,7 +2223,7 @@ def test_unscheduled_insn_detection():
         "...")
 
     prog = lp.preprocess_kernel(prog)
-    knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info)
+    knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table)
     prog = prog.with_root_kernel(knl)
     insn1, = lp.find_instructions(prog, "id:insn1")
     insns = prog.root_kernel.instructions[:]
@@ -2390,7 +2393,7 @@ def test_barrier_insertion_near_top_of_loop():
     prog = lp.set_temporary_scope(prog, "a", "local")
     prog = lp.set_temporary_scope(prog, "b", "local")
     prog = lp.preprocess_kernel(prog)
-    knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info)
+    knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table)
 
     print(knl)
 
@@ -2418,7 +2421,7 @@ def test_barrier_insertion_near_bottom_of_loop():
     prog = lp.set_temporary_scope(prog, "a", "local")
     prog = lp.set_temporary_scope(prog, "b", "local")
     prog = lp.preprocess_kernel(prog)
-    knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info)
+    knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table)
 
     print(knl)
 
@@ -2477,7 +2480,7 @@ def test_multi_argument_reduction_type_inference():
             allow_simultaneous=True)
 
     t_inf_mapper = TypeInferenceMapper(prog.root_kernel,
-            prog.program_callables_info)
+            prog.callables_table)
 
     assert (
             t_inf_mapper(expr, return_tuple=True, return_dtype_set=True)
@@ -2834,7 +2837,7 @@ def test_no_barriers_for_nonoverlapping_access(second_index, expect_barrier):
     prog = lp.preprocess_kernel(prog)
 
     knl = lp.get_one_scheduled_kernel(prog.root_kernel,
-            prog.program_callables_info)
+            prog.callables_table)
 
     assert barrier_between(knl, "first", "second") == expect_barrier
 
@@ -2888,6 +2891,25 @@ def test_dep_cycle_printing_and_error():
         print(lp.generate_code(knl).device_code())
 
 
+def test_backwards_dep_printing_and_error():
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<n}",
+            """
+            c[i] = a[i] + b[i]                       {id=insn1}
+            c[i] = 2*c[i]                            {id=insn2, dep=insn1}
+            c[i] = 7*c[i] + a[i]*a[i] + b[i]*b[i]    {id=insn3, dep=insn2}
+            b[i] = b[i] + c[i]                                 {id=insn4, dep=insn3}
+            d[i] = 7*a[i ]                                     {id=insn5, dep=insn4}
+            a[i] = a[i] + d[i]                                 {id=insn6, dep=insn5}
+            """, [
+                lp.GlobalArg('a, b', dtype=np.float64),
+                "..."
+            ])
+
+    # Regression check: printing this kernel used to crash with KeyError
+    print(knl)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index 4f802f8bff3ba00763825bc09dbc6051ff1ac527..1ba44e77e13a88ecbc05f4eecc6b9c7e397eb656 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -47,8 +47,8 @@ __all__ = [
 from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa
 
 
-@pytest.mark.parametrize("Nq", [7])
 @pytest.mark.parametrize("ilp_multiple", [1, 2])
+@pytest.mark.parametrize("Nq", [7])
 @pytest.mark.parametrize("opt_level", [11])
 def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
     ctx = ctx_factory()
@@ -231,7 +231,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
 
     if 1:
         print("OPS")
-        op_map = lp.get_op_map(hsv)
+        op_map = lp.get_op_map(hsv, subgroup_size=32)
         print(lp.stringify_stats_mapping(op_map))
 
         print("MEM")
diff --git a/test/test_reduction.py b/test/test_reduction.py
index 96dab405a98bc710eb7a50ec3f276e248cf44cf9..aaf11ee296341b8d8f4653e41bf8640c4e34583c 100644
--- a/test/test_reduction.py
+++ b/test/test_reduction.py
@@ -219,32 +219,38 @@ def test_local_parallel_reduction(ctx_factory, size):
 def test_global_parallel_reduction(ctx_factory, size):
     ctx = ctx_factory()
 
-    prog = lp.make_kernel(
+    knl = lp.make_kernel(
             "{[i]: 0 <= i < n }",
             """
             # Using z[0] instead of z works around a bug in ancient PyOpenCL.
-            z[0] = sum(i, i/13)
+            z[0] = sum(i, a[i])
             """)
 
-    ref_prog = prog
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
+    ref_knl = knl
 
     gsize = 128
-    prog = lp.split_iname(prog, "i", gsize * 20)
-    prog = lp.split_iname(prog, "i_inner", gsize, outer_tag="l.0")
-    prog = lp.split_reduction_inward(prog, "i_inner_inner")
-    prog = lp.split_reduction_inward(prog, "i_inner_outer")
+    knl = lp.split_iname(knl, "i", gsize * 20)
+    knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0")
+    knl = lp.split_reduction_outward(knl, "i_outer")
+    knl = lp.split_reduction_inward(knl, "i_inner_outer")
     from loopy.transform.data import reduction_arg_to_subst_rule
-    prog = reduction_arg_to_subst_rule(prog, "i_outer")
-    prog = lp.precompute(prog, "red_i_outer_arg", "i_outer",
+    knl = reduction_arg_to_subst_rule(knl, "i_outer")
+
+    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer",
             temporary_scope=lp.temp_var_scope.GLOBAL,
             default_tag="l.auto")
-    prog = lp.realize_reduction(prog)
-    prog = lp.add_dependency(
-            prog, "writes:acc_i_outer",
+    knl = lp.realize_reduction(knl)
+    knl = lp.tag_inames(knl, "i_outer_0:g.0")
+
+    # Keep the i_outer accumulator on the correct (lower) side of the barrier;
+    # otherwise, useless save/reload code will be generated.
+    knl = lp.add_dependency(
+            knl, "writes:acc_i_outer",
             "id:red_i_outer_arg_barrier")
 
     lp.auto_test_vs_ref(
-            ref_prog, ctx, prog, parameters={"n": size},
+            ref_knl, ctx, knl, parameters={"n": size},
             print_ref_code=True)
 
 
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 79c5ec7da0971b534588be3bfcd58a9f5fc8405a..41b44b5a7e9bbfe8f371e6a605ccfa8068a563b6 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -39,6 +39,9 @@ from pymbolic.primitives import Variable
 from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa
 
 
+SGS = 32  # Subgroup size
+
+
 def test_op_counter_basic():
 
     knl = lp.make_kernel(
@@ -54,21 +57,27 @@ def test_op_counter_basic():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    op_map = lp.get_op_map(knl, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+                           count_within_subscripts=True)
+    n_workgroups = 1
+    group_size = 1
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
     n = 512
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params)
-    f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params)
-    f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(params)
-    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.WORKITEM)
+    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
+    f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params)
+    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    assert f32add == f32mul == f32div == n*m*ell
-    assert f64mul == n*m
-    assert i32add == n*m*2
+    # (count-per-sub-group)*n_subgroups
+    assert f32add == f32mul == f32div == n*m*ell*n_subgroups
+    assert f64mul == n*m*n_subgroups
+    assert i32add == n*m*2*n_subgroups
 
 
 def test_op_counter_reduction():
@@ -81,15 +90,20 @@ def test_op_counter_reduction():
             name="matmul_serial", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    op_map = lp.get_op_map(knl, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    n_workgroups = 1
+    group_size = 1
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
     n = 512
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params)
-    f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.WORKITEM)
+    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    assert f32add == f32mul == n*m*ell
+    # (count-per-sub-group)*n_subgroups
+    assert f32add == f32mul == n*m*ell*n_subgroups
 
     op_map_dtype = op_map.group_by('dtype')
     f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
@@ -111,21 +125,26 @@ def test_op_counter_logic():
             name="logic", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    op_map = lp.get_op_map(knl, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    n_workgroups = 1
+    group_size = 1
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
     n = 512
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params)
-    f64add = op_map[lp.Op(np.float64, 'add', CG.WORKITEM)].eval_with_dict(params)
-    f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.WORKITEM)
+    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
+    f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(params)
+    f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    assert f32mul == n*m
-    assert f64div == 2*n*m  # TODO why?
-    assert f64add == n*m
-    assert i32add == n*m
+    # (count-per-sub-group)*n_subgroups
+    assert f32mul == n*m*n_subgroups
+    assert f64div == 2*n*m*n_subgroups  # TODO why?
+    assert f64add == n*m*n_subgroups
+    assert i32add == n*m*n_subgroups
 
 
 def test_op_counter_specialops():
@@ -143,27 +162,33 @@ def test_op_counter_specialops():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    op_map = lp.get_op_map(knl, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+                           count_within_subscripts=True)
+    n_workgroups = 1
+    group_size = 1
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
     n = 512
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params)
-    f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(params)
-    f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params)
-    f64pow = op_map[lp.Op(np.float64, 'pow', CG.WORKITEM)].eval_with_dict(params)
-    f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.WORKITEM)
+    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
+    f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params)
+    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
+    f64pow = op_map[lp.Op(np.float64, 'pow', CG.SUBGROUP)].eval_with_dict(params)
+    f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.WORKITEM)
+    f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.WORKITEM)
+    f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    assert f32div == 2*n*m*ell
-    assert f32mul == f32add == n*m*ell
-    assert f64add == 3*n*m
-    assert f64pow == i32add == f64rsq == f64sin == n*m
+    # (count-per-sub-group)*n_subgroups
+    assert f32div == 2*n*m*ell*n_subgroups
+    assert f32mul == f32add == n*m*ell*n_subgroups
+    assert f64add == 3*n*m*n_subgroups
+    assert f64pow == i32add == f64rsq == f64sin == n*m*n_subgroups
 
 
 def test_op_counter_bitwise():
@@ -183,26 +208,32 @@ def test_op_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int64, h=np.int64))
 
-    op_map = lp.get_op_map(knl, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+                           count_within_subscripts=False)
+    n_workgroups = 1
+    group_size = 1
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
     n = 512
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    i32add = op_map[lp.Op(np.int32, 'add', CG.WORKITEM)].eval_with_dict(params)
-    i32bw = op_map[lp.Op(np.int32, 'bw', CG.WORKITEM)].eval_with_dict(params)
-    i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', CG.WORKITEM)
+    i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(params)
+    i32bw = op_map[lp.Op(np.int32, 'bw', CG.SUBGROUP)].eval_with_dict(params)
+    i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', CG.SUBGROUP)
                    ].eval_with_dict(params)
-    i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', CG.WORKITEM)
+    i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    i64add = op_map[lp.Op(np.dtype(np.int64), 'add', CG.WORKITEM)
+    i64add = op_map[lp.Op(np.dtype(np.int64), 'add', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.WORKITEM)
+    i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP)
                       ].eval_with_dict(params)
-    assert i32add == n*m+n*m*ell
-    assert i32bw == 2*n*m*ell
-    assert i64bw == 2*n*m
-    assert i64add == i64mul == n*m
-    assert i64shift == 2*n*m
+    # (count-per-sub-group)*n_subgroups
+    assert i32add == n*m*ell*n_subgroups
+    assert i32bw == 2*n*m*ell*n_subgroups
+    assert i64bw == 2*n*m*n_subgroups
+    assert i64add == i64mul == n*m*n_subgroups
+    assert i64shift == 2*n*m*n_subgroups
 
 
 def test_op_counter_triangular_domain():
@@ -228,15 +259,21 @@ def test_op_counter_triangular_domain():
 
     op_map = lp.get_op_map(
                     knl,
+                    subgroup_size=SGS,
                     count_redundant_work=True
-                    )[lp.Op(np.float64, 'mul', CG.WORKITEM)]
+                    )[lp.Op(np.float64, 'mul', CG.SUBGROUP)]
     value_dict = dict(m=13, n=200)
     flops = op_map.eval_with_dict(value_dict)
 
+    n_workgroups = 1
+    group_size = 1
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
+
     if expect_fallback:
-        assert flops == 144
+        assert flops == 144*n_subgroups
     else:
-        assert flops == 78
+        assert flops == 78*n_subgroups
 
 
 def test_mem_access_counter_basic():
@@ -254,10 +291,8 @@ def test_mem_access_counter_basic():
     knl = lp.add_and_infer_dtypes(knl,
                     dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
 
-    subgroup_size = 32
-
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                    subgroup_size=subgroup_size)
+                                    subgroup_size=SGS)
 
     n = 512
     m = 256
@@ -266,7 +301,8 @@ def test_mem_access_counter_basic():
 
     n_workgroups = 1
     group_size = 1
-    subgroups_per_group = div_ceil(group_size, subgroup_size)
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
 
     f32l = mem_map[lp.MemAccess('global', np.float32,
                         lid_strides={}, gid_strides={},
@@ -289,9 +325,9 @@ def test_mem_access_counter_basic():
                         count_granularity=CG.SUBGROUP)
                     ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f32l == (3*n*m*ell)*n_workgroups*subgroups_per_group
-    assert f64l == (2*n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f32l == (3*n*m*ell)*n_subgroups
+    assert f64l == (2*n*m)*n_subgroups
 
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                         lid_strides={}, gid_strides={},
@@ -304,9 +340,9 @@ def test_mem_access_counter_basic():
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f32s == (n*m*ell)*n_workgroups*subgroups_per_group
-    assert f64s == (n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f32s == (n*m*ell)*n_subgroups
+    assert f64s == (n*m)*n_subgroups
 
 
 def test_mem_access_counter_reduction():
@@ -320,10 +356,8 @@ def test_mem_access_counter_reduction():
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
 
-    subgroup_size = 32
-
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                    subgroup_size=subgroup_size)
+                                    subgroup_size=SGS)
     n = 512
     m = 256
     ell = 128
@@ -331,7 +365,8 @@ def test_mem_access_counter_reduction():
 
     n_workgroups = 1
     group_size = 1
-    subgroups_per_group = div_ceil(group_size, subgroup_size)
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
 
     f32l = mem_map[lp.MemAccess('global', np.float32,
                         lid_strides={}, gid_strides={},
@@ -344,8 +379,8 @@ def test_mem_access_counter_reduction():
                         count_granularity=CG.SUBGROUP)
                     ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f32l == (2*n*m*ell)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f32l == (2*n*m*ell)*n_subgroups
 
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                         lid_strides={}, gid_strides={},
@@ -353,8 +388,8 @@ def test_mem_access_counter_reduction():
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f32s == (n*ell)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f32s == (n*ell)*n_subgroups
 
     ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
                                  ).to_bytes().eval_and_sum(params)
@@ -379,10 +414,8 @@ def test_mem_access_counter_logic():
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
 
-    subgroup_size = 32
-
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                    subgroup_size=subgroup_size)
+                                    subgroup_size=SGS)
     n = 512
     m = 256
     ell = 128
@@ -390,7 +423,8 @@ def test_mem_access_counter_logic():
 
     n_workgroups = 1
     group_size = 1
-    subgroups_per_group = div_ceil(group_size, subgroup_size)
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
 
     reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
 
@@ -404,10 +438,10 @@ def test_mem_access_counter_logic():
                                        direction='store')
                           ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f32_g_l == (2*n*m)*n_workgroups*subgroups_per_group
-    assert f64_g_l == (n*m)*n_workgroups*subgroups_per_group
-    assert f64_g_s == (n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f32_g_l == (2*n*m)*n_subgroups
+    assert f64_g_l == (n*m)*n_subgroups
+    assert f64_g_s == (n*m)*n_subgroups
 
 
 def test_mem_access_counter_specialops():
@@ -425,10 +459,8 @@ def test_mem_access_counter_specialops():
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                             g=np.float64, h=np.float64))
 
-    subgroup_size = 32
-
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                    subgroup_size=subgroup_size)
+                                    subgroup_size=SGS)
     n = 512
     m = 256
     ell = 128
@@ -436,7 +468,8 @@ def test_mem_access_counter_specialops():
 
     n_workgroups = 1
     group_size = 1
-    subgroups_per_group = div_ceil(group_size, subgroup_size)
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
 
     f32 = mem_map[lp.MemAccess('global', np.float32,
                         lid_strides={}, gid_strides={},
@@ -459,9 +492,9 @@ def test_mem_access_counter_specialops():
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f32 == (2*n*m*ell)*n_workgroups*subgroups_per_group
-    assert f64 == (2*n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f32 == (2*n*m*ell)*n_subgroups
+    assert f64 == (2*n*m)*n_subgroups
 
     f32 = mem_map[lp.MemAccess('global', np.float32,
                         lid_strides={}, gid_strides={},
@@ -474,16 +507,16 @@ def test_mem_access_counter_specialops():
                         count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f32 == (n*m*ell)*n_workgroups*subgroups_per_group
-    assert f64 == (n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f32 == (n*m*ell)*n_subgroups
+    assert f64 == (n*m)*n_subgroups
 
     filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'],
                          count_granularity=CG.SUBGROUP)
     tot = filtered_map.eval_and_sum(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert tot == (n*m*ell + n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert tot == (n*m*ell + n*m)*n_subgroups
 
 
 def test_mem_access_counter_bitwise():
@@ -503,10 +536,8 @@ def test_mem_access_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int32, h=np.int32))
 
-    subgroup_size = 32
-
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                    subgroup_size=subgroup_size)
+                                    subgroup_size=SGS)
     n = 512
     m = 256
     ell = 128
@@ -514,7 +545,8 @@ def test_mem_access_counter_bitwise():
 
     n_workgroups = 1
     group_size = 1
-    subgroups_per_group = div_ceil(group_size, subgroup_size)
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
 
     i32 = mem_map[lp.MemAccess('global', np.int32,
                         lid_strides={}, gid_strides={},
@@ -537,8 +569,8 @@ def test_mem_access_counter_bitwise():
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert i32 == (4*n*m+2*n*m*ell)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert i32 == (4*n*m+2*n*m*ell)*n_subgroups
 
     i32 = mem_map[lp.MemAccess('global', np.int32,
                         lid_strides={}, gid_strides={},
@@ -551,8 +583,8 @@ def test_mem_access_counter_bitwise():
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert i32 == (n*m+n*m*ell)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert i32 == (n*m+n*m*ell)*n_subgroups
 
 
 def test_mem_access_counter_mixed():
@@ -571,7 +603,6 @@ def test_mem_access_counter_mixed():
                 x=np.float32))
 
     group_size_0 = 65
-    subgroup_size = 32
 
     knl = lp.split_iname(knl, "j", group_size_0)
     knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})
@@ -583,10 +614,11 @@ def test_mem_access_counter_mixed():
 
     n_workgroups = div_ceil(ell, group_size_0)
     group_size = group_size_0
-    subgroups_per_group = div_ceil(group_size, subgroup_size)
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
 
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                    subgroup_size=subgroup_size)
+                                    subgroup_size=SGS)
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
                                 lid_strides={}, gid_strides={},
                                 direction='load', variable='g',
@@ -617,9 +649,9 @@ def test_mem_access_counter_mixed():
                                 count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f64uniform == (2*n*m)*n_workgroups*subgroups_per_group
-    assert f32uniform == (m*n)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f64uniform == (2*n*m)*n_subgroups
+    assert f32uniform == (m*n)*n_subgroups
 
     expect_fallback = False
     import islpy as isl
@@ -651,8 +683,8 @@ def test_mem_access_counter_mixed():
                                 count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f64uniform == m*n*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f64uniform == m*n*n_subgroups
 
     if expect_fallback:
         if ell < group_size_0:
@@ -681,7 +713,7 @@ def test_mem_access_counter_nonconsec():
     knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})
 
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                    subgroup_size=32)  # noqa
+                                    subgroup_size=SGS)  # noqa
     n = 512
     m = 256
     ell = 128
@@ -939,30 +971,35 @@ def test_all_counters_parallel_matmul():
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
+    group_size = bsize*bsize
+    n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize)
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
 
     sync_map = lp.get_synchronization_map(knl)
     assert len(sync_map) == 2
     assert sync_map["kernel_launch"].eval_with_dict(params) == 1
     assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize
 
-    op_map = lp.get_op_map(knl, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
     f32mul = op_map[
-                        lp.Op(np.float32, 'mul', CG.WORKITEM)
+                        lp.Op(np.float32, 'mul', CG.SUBGROUP)
                         ].eval_with_dict(params)
     f32add = op_map[
-                        lp.Op(np.float32, 'add', CG.WORKITEM)
+                        lp.Op(np.float32, 'add', CG.SUBGROUP)
                         ].eval_with_dict(params)
     i32ops = op_map[
-                        lp.Op(np.int32, 'add', CG.WORKITEM)
+                        lp.Op(np.int32, 'add', CG.SUBGROUP)
                         ].eval_with_dict(params)
     i32ops += op_map[
-                        lp.Op(np.dtype(np.int32), 'mul', CG.WORKITEM)
+                        lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP)
                         ].eval_with_dict(params)
 
-    assert f32mul+f32add == n*m*ell*2
+    # (count-per-sub-group)*n_subgroups
+    assert f32mul+f32add == m*2*n_subgroups
 
     mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                           subgroup_size=32)
+                                           subgroup_size=SGS)
 
     f32s1lb = mem_access_map[lp.MemAccess('global', np.float32,
                              lid_strides={0: 1, 1: Variable('ell')},
@@ -991,33 +1028,36 @@ def test_all_counters_parallel_matmul():
 
     local_mem_map = lp.get_mem_access_map(knl,
                         count_redundant_work=True,
-                        subgroup_size=32).filter_by(mtype=['local'])
+                        subgroup_size=SGS).filter_by(mtype=['local'])
 
     local_mem_l = local_mem_map.filter_by(direction=['load']
                                           ).eval_and_sum(params)
-    assert local_mem_l == n*m*ell*2
+    # (count-per-sub-group)*n_subgroups
+    assert local_mem_l == m*2*n_subgroups
 
     local_mem_l_a = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                                direction='load',
                                                lid_strides={1: 16},
                                                gid_strides={},
                                                variable='a_fetch',
-                                               count_granularity=CG.WORKITEM)
+                                               count_granularity=CG.SUBGROUP)
                                   ].eval_with_dict(params)
     local_mem_l_b = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                                direction='load',
                                                lid_strides={0: 1},
                                                gid_strides={},
                                                variable='b_fetch',
-                                               count_granularity=CG.WORKITEM)
+                                               count_granularity=CG.SUBGROUP)
                                   ].eval_with_dict(params)
 
-    assert local_mem_l_a == local_mem_l_b == n*m*ell
+    # (count-per-sub-group)*n_subgroups
+    assert local_mem_l_a == local_mem_l_b == m*n_subgroups
 
     local_mem_s = local_mem_map.filter_by(direction=['store']
                                           ).eval_and_sum(params)
 
-    assert local_mem_s == n*m*ell*2/bsize
+    # (count-per-sub-group)*n_subgroups
+    assert local_mem_s == m*2/bsize*n_subgroups
 
 
 def test_gather_access_footprint():
@@ -1067,8 +1107,6 @@ def test_summations_and_filters():
     knl = lp.add_and_infer_dtypes(knl,
                     dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
 
-    subgroup_size = 32
-
     n = 512
     m = 256
     ell = 128
@@ -1076,24 +1114,25 @@ def test_summations_and_filters():
 
     n_workgroups = 1
     group_size = 1
-    subgroups_per_group = div_ceil(group_size, subgroup_size)
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
 
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                    subgroup_size=subgroup_size)
+                                    subgroup_size=SGS)
 
     loads_a = mem_map.filter_by(direction=['load'], variable=['a'],
                                 count_granularity=[CG.SUBGROUP]
                                 ).eval_and_sum(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert loads_a == (2*n*m*ell)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert loads_a == (2*n*m*ell)*n_subgroups
 
     global_stores = mem_map.filter_by(mtype=['global'], direction=['store'],
                                       count_granularity=[CG.SUBGROUP]
                                       ).eval_and_sum(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert global_stores == (n*m*ell + n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert global_stores == (n*m*ell + n*m)*n_subgroups
 
     ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'],
                                  count_granularity=[CG.SUBGROUP]
@@ -1102,9 +1141,9 @@ def test_summations_and_filters():
                                  count_granularity=[CG.SUBGROUP]
                                  ).to_bytes().eval_and_sum(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_workgroups*subgroups_per_group
-    assert st_bytes == (4*n*m*ell + 8*n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_subgroups
+    assert st_bytes == (4*n*m*ell + 8*n*m)*n_subgroups
 
     # ignore stride and variable names in this map
     reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
@@ -1113,11 +1152,12 @@ def test_summations_and_filters():
     f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load')
                           ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f32lall == (3*n*m*ell)*n_workgroups*subgroups_per_group
-    assert f64lall == (2*n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f32lall == (3*n*m*ell)*n_subgroups
+    assert f64lall == (2*n*m)*n_subgroups
 
-    op_map = lp.get_op_map(knl, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+                           count_within_subscripts=True)
     #for k, v in op_map.items():
     #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)
 
@@ -1149,8 +1189,8 @@ def test_summations_and_filters():
                key.direction == 'load'
     f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f64l == (2*n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f64l == (2*n*m)*n_subgroups
 
 
 def test_strided_footprint():
diff --git a/test/test_target.py b/test/test_target.py
index 7b9d4f40ae83dadc8e8bd20946cadfbbad6c2f70..095bf0939d7439be440f7300f70f1ad4491f0fb9 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -323,6 +323,50 @@ def test_target_invalid_type_cast():
         lp.TypeCast(dtype, 1)
 
 
+def test_ispc_streaming_stores():
+    """Check that ISPC code generation succeeds for the ``stream_triad``
+    kernel after its instructions are tagged with ``!streaming_store``."""
+    stream_dtype = np.float32
+    index_dtype = np.int32
+    # Named ``arg_names`` (not ``vars``) so the builtin is not shadowed.
+    arg_names = ["a", "b", "c", "scalar"]
+
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<n}",
+            "a[i] = b[i] + scalar * c[i]",
+            target=lp.ISPCTarget(), index_dtype=index_dtype,
+            name="stream_triad")
+    knl = lp.assume(knl, "n>0")
+
+    # Split "i" by 2**18 (outer tagged g.0), then "i_inner" by 8 (inner l.0).
+    knl = lp.split_iname(
+        knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
+    knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")
+    knl = lp.tag_instructions(knl, "!streaming_store")
+
+    knl = lp.add_and_infer_dtypes(knl, dict.fromkeys(arg_names, stream_dtype))
+    knl = lp.set_argument_order(knl, arg_names + ["n"])
+
+    lp.generate_code_v2(knl).all_code()
+
+
+def test_cuda_short_vector():
+    """Generate and print CUDA device code for a kernel whose arrays ``a``
+    and ``out`` are split by 4 along axis 0 and tagged ``C,vec``."""
+    vec_width = 4
+    knl = lp.make_kernel(
+        "{ [i]: 0<=i<n }", "out[i] = 2*a[i]", target=lp.CudaTarget())
+
+    knl = lp.set_options(knl, write_code=True)
+    knl = lp.split_iname(knl, "i", vec_width, slabs=(0, 1), inner_tag="vec")
+    knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=vec_width)
+    knl = lp.tag_array_axes(knl, "a,out", "C,vec")
+    knl = lp.set_options(knl, write_wrapper=True)
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
+
+    print(lp.generate_code_v2(knl).device_code())
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_transform.py b/test/test_transform.py
index d54a820a8a7dbced4ed8b3aab9af3f53a92499bc..04162331ddc92a4ef13ef0086fee2866a5b9f0af 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -543,6 +543,23 @@ def test_uniquify_instruction_ids():
     assert all(isinstance(id, str) for id in insn_ids)
 
 
+def test_split_iname_only_if_in_within():
+    """Check that ``split_iname(..., within=...)`` only rewrites the inames of
+    the instruction matched by ``within``."""
+    instructions = """
+            c[i] = 3*d[i] {id=to_split}
+            a[i] = 2*b[i] {id=not_to_split}
+            """
+    prog = lp.make_kernel("{[i]: 0<=i<10}", instructions)
+    prog = lp.split_iname(prog, "i", 4, within='id:to_split')
+
+    expected = {'to_split': frozenset({'i_outer', 'i_inner'}),
+                'not_to_split': frozenset({'i'})}
+    for insn in prog.root_kernel.instructions:
+        if insn.id in expected:
+            assert insn.within_inames == expected[insn.id]
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/testlib.py b/test/testlib.py
index eebc792d00297adf4785f850582b415ca1d20439..853e2584a1e10732b3ec49cd737016734cdea5fa 100644
--- a/test/testlib.py
+++ b/test/testlib.py
@@ -9,9 +9,9 @@ class GridOverride(object):
         self.clean = clean
         self.vecsize = vecsize
 
-    def __call__(self, insn_ids, program_callables_info, ignore_auto=True):
+    def __call__(self, insn_ids, callables_table, ignore_auto=True):
         gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids,
-                program_callables_info, ignore_auto)
+                callables_table, ignore_auto)
         return gsize, (self.vecsize,)
 
 # }}}
@@ -139,14 +139,14 @@ class SeparateTemporariesPreambleTestPreambleGenerator(
 
 class Log2Callable(lp.ScalarCallable):
 
-    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, kernel, callables_table):
 
         if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None:
             # the types provided aren't mature enough to specialize the
             # callable
             return (
                     self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                    program_callables_info)
+                    callables_table)
 
         dtype = arg_id_to_dtype[0].numpy_dtype
 
@@ -168,7 +168,7 @@ class Log2Callable(lp.ScalarCallable):
                 self.copy(name_in_target=name_in_target,
                     arg_id_to_dtype={0: NumpyType(dtype), -1:
                         NumpyType(dtype)}),
-                program_callables_info)
+                callables_table)
 
 
 def register_log2_lookup(target, identifier):