diff --git a/doc/index.rst b/doc/index.rst
index 7baff3249a25e69019c06802901538500c1af971..8ab62928dcdddd72902994d72f1796c9bd47b3b5 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -43,6 +43,7 @@ Please check :ref:`installation` to get started.
     ref_creation
     ref_kernel
     ref_transform
+    ref_call
     ref_other
     misc
    ref_internals
diff --git a/doc/ref_call.rst b/doc/ref_call.rst
new file mode 100644
index 0000000000000000000000000000000000000000..5a59e84282119209cc89eb18e3a4eda97725edf0
--- /dev/null
+++ b/doc/ref_call.rst
@@ -0,0 +1,193 @@
+Calling Loopy Kernels and External Functions
+============================================
+
+Goals of a function interface
+-----------------------------
+
+- *FIXME:* Needs to change after the new design of program.
+
+- It must be possible to obtain complete information about the function
+  through the expression node alone.
+- Must adhere to :mod:`loopy` semantics of immutability.
+- Must have a class instance linked with the expression node which records
+  the properties of the function.
+- Must indicate in the expression whether the function is known to the
+  kernel. (This is intended to be done by making the function expression node
+  an instance of ``ResolvedFunction`` as soon as the function definition is
+  resolved by the kernel.)
+- Function overloading is discouraged in :mod:`loopy`, as it leads to
+  ambiguity when debugging with the help of the kernel's intermediate
+  representation. Hence, if two expression nodes point to different function
+  instances, they must differ in their representation. For example, ``float
+  sin(float)`` and ``double sin(double)`` should diverge by receiving
+  different identifiers as soon as the data type of the argument is inferred.
+- Must have an interface to register external functions.
+
+
+Scoped functions and resolving
+------------------------------
+
+``ResolvedFunction``\ s are pymbolic nodes within expressions in a ``Loo.py``
+kernel whose names have been resolved by the kernel. The process of matching
+a function identifier with its function definition is called "resolving".
+
+A pymbolic ``Call`` node can be converted to a ``ResolvedFunction`` if it is
+resolved by one of the ``function_id_to_in_knl_callable_mapper``\ s in
+:attr:`LoopKernel.scoped_functions`. Functions that can be resolved include:
+
+- Functions already registered by the target. Some examples include
+  ``sin()``, ``cos()``, ``exp()``, ``max()`` (for C targets).
+- Functions that are defined in ``Loo.py`` and are realized into a
+  different set of instructions during code generation. Some examples
+  include ``make_tuple``, ``ArgExtOp``, ``index_of``, ...
+- Functions registered as ``CallableKernel``\ s using
+  ``lp.register_callable_kernel(...)``.
+- Functions that have been provided through
+  ``lp.register_function_id_to_in_knl_callable_mapper(...)`` (see the sketch
+  below).
+- Functions that can be made known by the user through
+  ``lp.register_function_mangler``. This is planned to be deprecated,
+  as its functionality is superseded by
+  ``lp.register_function_id_to_in_knl_callable_mapper(...)``.
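+
+As a minimal sketch of the registration interface, using the
+``(target, identifier) -> lp.InKernelCallable`` mapper signature described
+in this document (the names ``my_func`` and ``my_func_resolver`` are
+illustrative, not part of the API):
+
+::
+
+    import loopy as lp
+
+    def my_func_resolver(target, identifier):
+        # Resolve "my_func" to a ScalarCallable; returning None lets other
+        # registered mappers (or the target itself) try to resolve it.
+        if identifier == "my_func":
+            return lp.ScalarCallable(name="my_func")
+        return None
+
+    knl = lp.register_function_id_to_in_knl_callable_mapper(
+            knl, my_func_resolver)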
+
+Expressions after a function is scoped
+--------------------------------------
+
+Consider the following expression:
+
+::
+
+    sin(a[i]) + unknown_func(b[i]) + callable_knl_func(c[i])*mangler_call(d[i])
+
+During the kernel creation phase, the kernel knows that ``sin`` is a function
+known to the target and should hence be scoped. As expected, after
+``make_kernel`` has been called, the above expression is converted to:
+
+::
+
+    ResolvedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) +
+    callable_knl_func(c[i])*mangler_call(d[i])
+
+This also makes an entry in the kernel's ``scoped_functions`` dictionary:
+
+::
+
+    {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None,
+    arg_id_to_descr=None, name_in_target=None)}
+
+Note that at this step, functions are scoped only through their names,
+without any information about their types.
+
+Once the user calls the transformation
+``lp.register_callable_kernel(knl, 'callable_knl_func', callee_knl)``,
+the expression is converted to:
+
+::
+
+    ResolvedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) +
+    ResolvedFunction('callable_knl_func')(c[i])*mangler_call(d[i])
+
+This also makes an entry in the ``scoped_functions`` dictionary:
+
+::
+
+    {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None,
+    arg_id_to_descr=None, name_in_target=None),
+    Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...),
+    arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None)}
+
+Now, if the user calls ``register_function_mangler(knl, 'mangler_call')``,
+one might expect the ``mangler_call`` function to get scoped, but that does
+**not** happen, because the "old" ``function_manglers`` return a match only
+if all the parameters of the function match, viz. name, argument arity and
+argument types. Hence, the ``scoped_functions`` dictionary remains unchanged.
+
+``ResolvedFunction``\ s and specializations
+---------------------------------------------
+
+Consider the same ``ResolvedFunction('sin')`` as above. This function,
+although scoped, does not yet know its types, i.e. it does not yet know
+whether, for a C target, it should emit ``sin``, ``sinf`` or ``sinl``.
+Hence, right now the function can be regarded as a "type-generic" function,
+as further down the pipeline it may take on any one of the above
+definitions. The functions go through a "specialization" process at various
+points in the pipeline, where the attributes of the callables are resolved:
+
+- During type inference, the functions go through type specialization,
+  wherein the ``arg_id_to_dtype`` of the functions is realized.
+- During descriptor inference, the functions go through a description
+  specialization, where ``arg_id_to_descr`` is populated. The
+  ``arg_id_to_descr`` contains important information regarding the shape,
+  strides and scope of the arguments, which forms an important part of
+  ``CallableKernel``, as this information helps to generate the function
+  signature and to adapt the data access patterns of the variables in the
+  callee kernel.
+- Whenever a ``ResolvedFunction`` goes through a specialization, this is
+  indicated by changing the name in the ``pymbolic`` node.
+
+If, during type inference, the type of ``a[i]`` is inferred to be
+``np.float32``, the new ``pymbolic`` node would be:
+
+::
+
+    ResolvedFunction('sin_0')(a[i]) + ...
+
+This name change indicates that the node now points to a different
+``ScalarCallable`` in the dictionary. Hence, a new entry is added to the
+``scoped_functions`` dictionary:
+
+::
+
+    {'sin': ScalarCallable(name='sin', arg_id_to_dtype=None,
+    arg_id_to_descr=None, name_in_target=None),
+    Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...),
+    arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None),
+    'sin_0': ScalarCallable(name='sin', arg_id_to_dtype={0: np.float32,
+    -1: np.float32}, arg_id_to_descr=None, name_in_target='sinf')}
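+
+To make the type-specialization step concrete, here is a minimal sketch of a
+``ScalarCallable`` subclass overriding ``with_types``, modeled on the
+``CBLASGEMV`` example in ``examples/python/call-external.py`` added alongside
+this document (``MySin`` is an illustrative name):
+
+::
+
+    import numpy as np
+    import loopy as lp
+
+    class MySin(lp.ScalarCallable):
+        def with_types(self, arg_id_to_dtype, callables_table):
+            dtype = arg_id_to_dtype.get(0)
+            if dtype is None:
+                # Types are not specialized enough yet; stay type-generic.
+                return self, callables_table
+
+            # Pick the target-specific name based on the inferred dtype.
+            name_in_target = (
+                    "sinf" if dtype.numpy_dtype == np.float32 else "sin")
+            return (self.copy(name_in_target=name_in_target,
+                    arg_id_to_dtype={0: dtype, -1: dtype}),
+                    callables_table)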
+
+Description Inference
+---------------------
+
+Although this step has no significance for a ``ScalarCallable``, it forms a
+very important part of ``CallableKernel``: during this step, the
+``dim_tags``, ``shape`` and ``address_space`` of the arguments of the
+callable kernel are altered.
+
+- The ``dim_tags`` attribute helps to ensure that the memory layout
+  between the caller and the callee kernel is coherent.
+- The ``address_space`` attribute ensures that, while writing the device
+  code, we emit the appropriate scope qualifiers for the function
+  declaration arguments.
+- The ``shape`` attribute helps with:
+
+  - Storage allocation.
+  - Memory layout.
+  - Catching out-of-bounds accesses in ``Loo.py``.
+
+Hence, over the ``Loo.py`` pipeline, one might expect the following
+evolution of the ``sin`` pymbolic call expression node:
+
+::
+
+    sin -> (Kernel creation) -> ResolvedFunction(Variable('sin')) ->
+    (Type Inference) -> ResolvedFunction(Variable('sin_0')) ->
+    (Descriptor Inference) -> ResolvedFunction(Variable('sin_1'))
+
+Changes on the target side to accommodate the new function interface
+--------------------------------------------------------------------
+
+The earlier ``function_mangler`` member method of the class
+``lp.ASTBuilderBase`` is replaced by ``function_id_in_knl_callable_mapper``.
+The function resolvers return a list of functions with the signature
+``(target, identifier) -> lp.InKernelCallable``.
+
+An example: Calling BLAS
+------------------------
+
+.. literalinclude:: ../examples/python/call-external.py
+
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index e48fcb31c3c5632459078db499a1068e114f9021..d93be3e58aaeafbe9298dae0c4856873a866651a 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -333,7 +333,7 @@ an explicit dependency:
     ...     """
     ...     out[j,i] = a[i,j] {id=transpose}
     ...     out[i,j] = 2*out[i,j] {dep=transpose}
-    ...     """)
+    ...     """, name="transpose_and_dbl")
 
 ``{id=transpose}`` assigns the identifier *transpose* to the first
 instruction, and ``{dep=transpose}`` declares a dependency of the second
@@ -342,9 +342,9 @@ that these dependencies show up there, too:
 
 .. doctest::
 
-    >>> print(knl.stringify(with_dependencies=True))
+    >>> print(knl["transpose_and_dbl"].stringify(with_dependencies=True))
     ---------------------------------------------------------------------------
-    KERNEL: loopy_kernel
+    KERNEL: transpose_and_dbl
     ---------------------------------------------------------------------------
     ...
--------------------------------------------------------------------------- @@ -394,7 +394,7 @@ Let us take a look at the generated code for the above kernel: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) transpose_and_dbl(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) { for (int i = 0; i <= -1 + n; ++i) for (int j = 0; j <= -1 + n; ++j) @@ -743,7 +743,7 @@ those for us: .. doctest:: - >>> glob, loc = knl.get_grid_size_upper_bounds() + >>> glob, loc = knl["loopy_kernel"].get_grid_size_upper_bounds(knl.callables_table) >>> print(glob) (Aff("[n] -> { [(floor((127 + n)/128))] }"),) >>> print(loc) @@ -1165,7 +1165,7 @@ this, :mod:`loopy` will complain that global barrier needs to be inserted: >>> cgr = lp.generate_code_v2(knl) Traceback (most recent call last): ... - loopy.diagnostic.MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) + loopy.diagnostic.MissingBarrierError: rotate_v1: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) The syntax for a inserting a global barrier instruction is ``... gbarrier``. :mod:`loopy` also supports manually inserting local @@ -1186,7 +1186,7 @@ Let us start with an example. Consider the kernel from above with a .. doctest:: - >>> knl = lp.make_kernel( + >>> prog = lp.make_kernel( ... "[n] -> {[i] : 0<=i>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0") + >>> prog = lp.split_iname(prog, "i", 16, inner_tag="l.0", outer_tag="g.0") Here is what happens when we try to generate code for the kernel: - >>> cgr = lp.generate_code_v2(knl) + >>> cgr = lp.generate_code_v2(prog) Traceback (most recent call last): ... loopy.diagnostic.MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?) @@ -1214,8 +1214,10 @@ This happens due to the kernel splitting done by :mod:`loopy`. The splitting happens when the instruction schedule is generated. To see the schedule, we should call :func:`loopy.get_one_linearized_kernel`: - >>> knl = lp.get_one_linearized_kernel(lp.preprocess_kernel(knl)) - >>> print(knl) + >>> prog = lp.preprocess_kernel(prog) + >>> knl = lp.get_one_linearized_kernel(prog["rotate_v2"], prog.callables_table) + >>> prog = prog.with_kernel(knl) + >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 --------------------------------------------------------------------------- @@ -1244,10 +1246,10 @@ function adds instructions to the kernel without scheduling them. That means that :func:`loopy.get_one_linearized_kernel` needs to be called one more time to put those instructions into the schedule. 
- >>> knl = lp.get_one_linearized_kernel(lp.preprocess_kernel(knl)) - >>> knl = lp.save_and_reload_temporaries(knl) - >>> knl = lp.get_one_linearized_kernel(knl) # Schedule added instructions - >>> print(knl) + >>> prog = lp.save_and_reload_temporaries(prog) + >>> knl = lp.get_one_linearized_kernel(prog["rotate_v2"], prog.callables_table) # Schedule added instructions + >>> prog = prog.with_kernel(knl) + >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 --------------------------------------------------------------------------- @@ -1286,7 +1288,7 @@ does in more detail: The kernel translates into two OpenCL kernels. - >>> cgr = lp.generate_code_v2(knl) + >>> cgr = lp.generate_code_v2(prog) >>> print(cgr.device_code()) #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) @@ -1312,7 +1314,7 @@ Now we can execute the kernel. >>> arr = cl.array.arange(queue, 16, dtype=np.int32) >>> print(arr) [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] - >>> evt, (out,) = knl(queue, arr=arr) + >>> evt, (out,) = prog(queue, arr=arr) >>> print(arr) [15 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14] @@ -1549,7 +1551,7 @@ containing different types of data: ... """ ... c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k] ... e[i, k] = g[i,k]*(2+h[i,k+1]) - ... """) + ... """, name="stats_knl") >>> knl = lp.add_and_infer_dtypes(knl, ... dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) @@ -1560,7 +1562,7 @@ information provided. Now we will count the operations: >>> op_map = lp.get_op_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add, subgroup) : ... + Op(np:dtype('float32'), add, subgroup, stats_knl) : ... Each line of output will look roughly like:: @@ -1586,12 +1588,12 @@ One way to evaluate these polynomials is with :meth:`islpy.PwQPolynomial.eval_wi >>> param_dict = {'n': 256, 'm': 256, 'l': 8} >>> from loopy.statistics import CountGranularity as CG - >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict) - >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict) + >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... (f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 @@ -1648,15 +1650,15 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ... 
+ MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, stats_knl) : ... Each line of output will look roughly like:: - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, stats_knl) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup, stats_knl) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup, stats_knl) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1691,13 +1693,13 @@ We can evaluate these polynomials using :meth:`islpy.PwQPolynomial.eval_with_dic .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1715,13 +1717,13 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, stats_knl) : ... >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) - MemAccess(None, None, None, None, load, None, None, None) : ... - MemAccess(None, None, None, None, store, None, None, None) : ... + MemAccess(None, None, None, None, load, None, None, None, None) : ... + MemAccess(None, None, None, None, store, None, None, None, None) : ... >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') ... ].eval_with_dict(param_dict) @@ -1758,12 +1760,12 @@ this time. ... outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem) : ... 
- MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem, stats_knl) : ... With this parallelization, consecutive work-items will access consecutive array @@ -1773,13 +1775,13 @@ array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1799,12 +1801,12 @@ we'll switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem, stats_knl) : ... 
+ MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem, stats_knl) : ... With this parallelization, consecutive work-items will access *nonconsecutive* @@ -1813,13 +1815,13 @@ changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1853,14 +1855,14 @@ kernel from the previous example: >>> sync_map = lp.get_synchronization_map(knl) >>> print(lp.stringify_stats_mapping(sync_map)) - kernel_launch : { 1 } + Sync(kernel_launch, stats_knl) : [l, m, n] -> { 1 } We can evaluate this polynomial using :meth:`islpy.PwQPolynomial.eval_with_dict`: .. doctest:: - >>> launch_count = sync_map["kernel_launch"].eval_with_dict(param_dict) + >>> launch_count = sync_map[lp.Sync("kernel_launch", "stats_knl")].eval_with_dict(param_dict) >>> print("Kernel launch count: %s" % launch_count) Kernel launch count: 1 @@ -1913,8 +1915,8 @@ count the barriers using :func:`loopy.get_synchronization_map`: >>> sync_map = lp.get_synchronization_map(knl) >>> print(lp.stringify_stats_mapping(sync_map)) - barrier_local : { 1000 } - kernel_launch : { 1 } + Sync(barrier_local, loopy_kernel) : { 1000 } + Sync(kernel_launch, loopy_kernel) : { 1 } Based on the kernel code printed above, we would expect each work-item to diff --git a/examples/fortran/ipython-integration-demo.ipynb b/examples/fortran/ipython-integration-demo.ipynb index 7a5c8257bf80fdfcc3d3b978a7dca2d401c48271..d9ac1f1b22a92b138e4f6432315f281b2a894aed 100644 --- a/examples/fortran/ipython-integration-demo.ipynb +++ b/examples/fortran/ipython-integration-demo.ipynb @@ -49,7 +49,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(fill)" + "print(prog)" ] }, { @@ -91,10 +91,10 @@ "\n", "!$loopy begin\n", "!\n", - "! tr_fill, = lp.parse_fortran(SOURCE)\n", + "! tr_fill = lp.parse_fortran(SOURCE)\n", "! tr_fill = lp.split_iname(tr_fill, \"i\", split_amount,\n", "! outer_tag=\"g.0\", inner_tag=\"l.0\")\n", - "! RESULT = [tr_fill]\n", + "! 
RESULT = tr_fill\n", "!\n", "!$loopy end" ] @@ -105,7 +105,7 @@ "metadata": {}, "outputs": [], "source": [ - "print(tr_fill)" + "print(prog)" ] }, { diff --git a/examples/fortran/matmul.floopy b/examples/fortran/matmul.floopy index 23840f09a46ab97902a8d1ed7e078a7c70d36dec..733cdaac4d9153803dcb54d5c114a33871403bbf 100644 --- a/examples/fortran/matmul.floopy +++ b/examples/fortran/matmul.floopy @@ -13,7 +13,7 @@ subroutine dgemm(m,n,l,alpha,a,b,c) end subroutine !$loopy begin -! dgemm, = lp.parse_fortran(SOURCE, FILENAME) +! dgemm = lp.parse_fortran(SOURCE, FILENAME) ! dgemm = lp.split_iname(dgemm, "i", 16, ! outer_tag="g.0", inner_tag="l.1") ! dgemm = lp.split_iname(dgemm, "j", 8, @@ -28,5 +28,5 @@ end subroutine ! dgemm = lp.precompute(dgemm, "b_acc", "j_inner,k_inner", ! precompute_outer_inames="i_outer, j_outer, k_outer", ! default_tag="l.auto") -! RESULT = [dgemm] +! RESULT = dgemm !$loopy end diff --git a/examples/fortran/sparse.floopy b/examples/fortran/sparse.floopy index 18542e6b0403a7ab475b3e357f18489847367c3d..2b156bdd709e8f4258492d258adb888ad16fbccd 100644 --- a/examples/fortran/sparse.floopy +++ b/examples/fortran/sparse.floopy @@ -23,11 +23,11 @@ subroutine sparse(rowstarts, colindices, values, m, n, nvals, x, y) end !$loopy begin -! sparse, = lp.parse_fortran(SOURCE, FILENAME) +! sparse = lp.parse_fortran(SOURCE, FILENAME) ! sparse = lp.split_iname(sparse, "i", 128) ! sparse = lp.tag_inames(sparse, {"i_outer": "g.0"}) ! sparse = lp.tag_inames(sparse, {"i_inner": "l.0"}) ! sparse = lp.split_iname(sparse, "j", 4) ! sparse = lp.tag_inames(sparse, {"j_inner": "unr"}) -! RESULT = [sparse] +! RESULT = sparse !$loopy end diff --git a/examples/fortran/tagging.floopy b/examples/fortran/tagging.floopy index 87aacba68ae2fc6f3b7052325fcd2378e9880e47..c7ebb75667142a8bb470b32f1d92177e135db9b2 100644 --- a/examples/fortran/tagging.floopy +++ b/examples/fortran/tagging.floopy @@ -23,13 +23,13 @@ end ! "factor 4.0", ! "real_type real*8", ! ]) -! fill, = lp.parse_fortran(SOURCE, FILENAME) +! fill = lp.parse_fortran(SOURCE, FILENAME) ! fill = lp.add_barrier(fill, "tag:init", "tag:mult", "gb1") ! fill = lp.split_iname(fill, "i", 128, ! outer_tag="g.0", inner_tag="l.0") ! fill = lp.split_iname(fill, "i_1", 128, ! outer_tag="g.0", inner_tag="l.0") -! RESULT = [fill] +! RESULT = fill ! !$loopy end diff --git a/examples/fortran/volumeKernel.floopy b/examples/fortran/volumeKernel.floopy index c5784b63492063bfd2a9604c42dbf65b2ecb86bf..211c38049076cbe065ce847f948d724c293a032c 100644 --- a/examples/fortran/volumeKernel.floopy +++ b/examples/fortran/volumeKernel.floopy @@ -67,7 +67,7 @@ end subroutine volumeKernel !$loopy begin ! -! volumeKernel, = lp.parse_fortran(SOURCE, FILENAME) +! volumeKernel = lp.parse_fortran(SOURCE, FILENAME) ! volumeKernel = lp.split_iname(volumeKernel, ! "e", 32, outer_tag="g.1", inner_tag="g.0") ! volumeKernel = lp.fix_parameters(volumeKernel, @@ -76,6 +76,6 @@ end subroutine volumeKernel ! i="l.0", j="l.1", k="l.2", ! i_1="l.0", j_1="l.1", k_1="l.2" ! )) -! RESULT = [volumeKernel] +! RESULT = volumeKernel ! 
!$loopy end
diff --git a/examples/python/call-external.py b/examples/python/call-external.py
new file mode 100644
index 0000000000000000000000000000000000000000..49b25d6e015780789c5e56af46d47a14e4611cf8
--- /dev/null
+++ b/examples/python/call-external.py
@@ -0,0 +1,99 @@
+import loopy as lp
+import numpy as np
+from loopy.diagnostic import LoopyError
+from loopy.target.c import CTarget
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa: F401
+
+
+# {{{ blas callable
+
+class CBLASGEMV(lp.ScalarCallable):
+    def with_types(self, arg_id_to_dtype, callables_table):
+        mat_dtype = arg_id_to_dtype.get(0)
+        vec_dtype = arg_id_to_dtype.get(1)
+
+        if mat_dtype is None or vec_dtype is None:
+            # types aren't specialized enough to be resolved
+            return self, callables_table
+
+        if mat_dtype != vec_dtype:
+            raise LoopyError("GEMV requires same dtypes for matrix and "
+                    "vector")
+
+        if vec_dtype.numpy_dtype == np.float32:
+            name_in_target = "cblas_sgemv"
+        elif vec_dtype.numpy_dtype == np.float64:
+            name_in_target = "cblas_dgemv"
+        else:
+            raise LoopyError("GEMV is only supported for float32 and float64 "
+                    "types")
+
+        return (self.copy(name_in_target=name_in_target,
+                arg_id_to_dtype={0: vec_dtype,
+                    1: vec_dtype,
+                    -1: vec_dtype}),
+                callables_table)
+
+    def with_descrs(self, arg_id_to_descr, callables_table):
+        mat_descr = arg_id_to_descr.get(0)
+        vec_descr = arg_id_to_descr.get(1)
+        res_descr = arg_id_to_descr.get(-1)
+
+        if mat_descr is None or vec_descr is None or res_descr is None:
+            # shapes aren't specialized enough to be resolved
+            return self, callables_table
+
+        assert mat_descr.shape[1] == vec_descr.shape[0]
+        assert mat_descr.shape[0] == res_descr.shape[0]
+        assert len(vec_descr.shape) == len(res_descr.shape) == 1
+        # handling only the easy case when stride == 1
+        assert vec_descr.dim_tags[0].stride == 1
+        assert mat_descr.dim_tags[1].stride == 1
+        assert res_descr.dim_tags[0].stride == 1
+
+        return self.copy(arg_id_to_descr=arg_id_to_descr), callables_table
+
+    def emit_call_insn(self, insn, target, expression_to_code_mapper):
+        from pymbolic import var
+        mat_descr = self.arg_id_to_descr[0]
+        m, n = mat_descr.shape
+        ecm = expression_to_code_mapper
+        mat, vec = insn.expression.parameters
+        result, = insn.assignees
+
+        c_parameters = [var("CblasRowMajor"),
+                var("CblasNoTrans"),
+                m, n,
+                1,
+                ecm(mat).expr,
+                1,
+                ecm(vec).expr,
+                1,
+                ecm(result).expr,
+                1]
+        return (var(self.name_in_target)(*c_parameters),
+                False  # cblas_gemv does not return anything
+                )
+
+    def generate_preambles(self, target):
+        assert isinstance(target, CTarget)
+        yield ("99_cblas", "#include <cblas.h>")
+        return
+
+# }}}
+
+
+n = 10
+
+knl = lp.make_kernel(
+        "{:}",
+        """
+        y[:] = gemv(A[:, :], x[:])
+        """, [
+            lp.GlobalArg("A", dtype=np.float64, shape=(n, n)),
+            lp.GlobalArg("x", dtype=np.float64, shape=(n, )),
+            lp.GlobalArg("y", shape=(n, )), ...],
+        target=CTarget())
+
+knl = lp.register_callable(knl, "gemv", CBLASGEMV(name="gemv"))
+print(lp.generate_code_v2(knl).device_code())
diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py
index 7f80175ebe82b8412a38708a5b1d32042d8061fe..d97fc3fa67adb22c17d4f60c2e4283aed727af8a 100644
--- a/examples/python/global_barrier_removal.py
+++ b/examples/python/global_barrier_removal.py
@@ -22,7 +22,9 @@ from loopy.preprocess import preprocess_kernel
 knl = preprocess_kernel(knl)
 
 from loopy.schedule import get_one_scheduled_kernel
-knl = get_one_scheduled_kernel(knl)
+knl = 
knl.with_kernel(get_one_scheduled_kernel(knl["loopy_kernel"], + knl.callables_table)) + # map schedule onto host or device print(knl) diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py index 41fddfdee2ddf3b670bf9770ad8c4b3ec9ea7da1..ce40487b1f41a6a591134a21eeb14113fd8be4fa 100644 --- a/examples/python/ispc-stream-harness.py +++ b/examples/python/ispc-stream-harness.py @@ -29,8 +29,6 @@ def transform(knl, vars, stream_dtype): def gen_code(knl): - knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) codegen_result = lp.generate_code_v2(knl) return codegen_result.device_code() + "\n" + codegen_result.host_code() diff --git a/examples/python/sparse.py b/examples/python/sparse.py index 0e56df1bc3085976bfadd783f976fa912af45da8..b4dd07df40007db16ab588c26dfefb4aadb4b7eb 100644 --- a/examples/python/sparse.py +++ b/examples/python/sparse.py @@ -11,9 +11,9 @@ k = lp.make_kernel([ <> length = rowend - rowstart y[i] = sum(j, values[rowstart+j] * x[colindices[rowstart + j]]) end - """) + """, name="spmv") k = lp.add_and_infer_dtypes(k, { - "values,x": np.float64, "rowstarts,colindices": k.index_dtype + "values,x": np.float64, "rowstarts,colindices": k["spmv"].index_dtype }) -print(lp.generate_code(k)[0]) +print(lp.generate_code_v2(k).device_code()) diff --git a/loopy/__init__.py b/loopy/__init__.py index 9c4bfa6d0781677d0d3da8ddcd5ac44d2f90fee5..6cabbf614e0aa3ef938972a0e8af5c168467249a 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -24,13 +24,10 @@ THE SOFTWARE. from loopy.symbolic import ( TaggedVariable, Reduction, LinearSubscript, TypeCast) from loopy.diagnostic import LoopyError, LoopyWarning - +from loopy.program import iterate_over_kernels_if_given_program # {{{ imported user interface -from loopy.library.function import ( - default_function_mangler, single_arg_function_mangler) - from loopy.kernel.instruction import ( LegacyStringInstructionTag, UseStreamingStoreTag, MemoryOrdering, memory_ordering, @@ -47,6 +44,10 @@ from loopy.kernel.data import ( TemporaryVariable, SubstitutionRule, CallMangleInfo) +from loopy.kernel.function_interface import ( + CallableKernel, ScalarCallable) +from loopy.program import ( + Program, make_program) from loopy.kernel import LoopKernel, KernelState, kernel_state from loopy.kernel.tools import ( @@ -59,7 +60,7 @@ from loopy.kernel.tools import ( get_subkernels, get_subkernel_to_insn_id_map) from loopy.types import to_loopy_type -from loopy.kernel.creation import make_kernel, UniqueName +from loopy.kernel.creation import make_kernel, UniqueName, make_function from loopy.library.reduction import register_reduction_parser # {{{ import transforms @@ -115,16 +116,21 @@ from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries from loopy.transform.add_barrier import add_barrier +from loopy.transform.callable import (register_callable, + merge, inline_callable_kernel, rename_callable) +from loopy.transform.pack_and_unpack_args import pack_and_unpack_args_for_call + # }}} from loopy.type_inference import infer_unknown_types -from loopy.preprocess import preprocess_kernel, realize_reduction +from loopy.preprocess import (preprocess_kernel, realize_reduction, + preprocess_program, infer_arg_descr) from loopy.schedule import ( generate_loop_schedules, get_one_scheduled_kernel, get_one_linearized_kernel) -from loopy.statistics import (ToCountMap, CountGranularity, +from loopy.statistics import 
(ToCountMap, ToCountPolynomialMap, CountGranularity, stringify_stats_mapping, Op, MemAccess, get_op_map, get_mem_access_map, get_synchronization_map, gather_access_footprints, - gather_access_footprint_bytes) + gather_access_footprint_bytes, Sync) from loopy.codegen import ( PreambleInfo, generate_code, generate_code_v2, generate_body) @@ -167,6 +173,10 @@ __all__ = [ "CallInstruction", "CInstruction", "NoOpInstruction", "BarrierInstruction", + "ScalarCallable", "CallableKernel", + + "Program", "make_program", + "KernelArgument", "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg", "AddressSpace", "temp_var_scope", # temp_var_scope is deprecated @@ -174,9 +184,7 @@ __all__ = [ "SubstitutionRule", "CallMangleInfo", - "default_function_mangler", "single_arg_function_mangler", - - "make_kernel", "UniqueName", + "make_kernel", "UniqueName", "make_function", "register_reduction_parser", @@ -229,6 +237,13 @@ __all__ = [ "add_barrier", + "register_callable", + "merge", + + "inline_callable_kernel", "rename_callable", + + "pack_and_unpack_args_for_call", + # }}} "get_dot_dependency_graph", @@ -244,17 +259,20 @@ __all__ = [ "infer_unknown_types", - "preprocess_kernel", "realize_reduction", + "preprocess_kernel", "realize_reduction", "preprocess_program", + "infer_arg_descr", + "generate_loop_schedules", "get_one_scheduled_kernel", "get_one_linearized_kernel", "GeneratedProgram", "CodeGenerationResult", "PreambleInfo", "generate_code", "generate_code_v2", "generate_body", - "ToCountMap", "CountGranularity", "stringify_stats_mapping", "Op", - "MemAccess", "get_op_map", "get_mem_access_map", - "get_synchronization_map", "gather_access_footprints", - "gather_access_footprint_bytes", + "ToCountMap", "ToCountPolynomialMap", "CountGranularity", + "stringify_stats_mapping", "Op", "MemAccess", "get_op_map", + "get_mem_access_map", "get_synchronization_map", + "gather_access_footprints", "gather_access_footprint_bytes", + "Sync", "CompiledKernel", @@ -294,6 +312,7 @@ __all__ = [ # {{{ set_options +@iterate_over_kernels_if_given_program def set_options(kernel, *args, **kwargs): """Return a new kernel with the options given as keyword arguments, or from a string representation passed in as the first (and only) positional @@ -301,6 +320,7 @@ def set_options(kernel, *args, **kwargs): See also :class:`Options`. 
""" + assert isinstance(kernel, LoopKernel) if args and kwargs: raise TypeError("cannot pass both positional and keyword arguments") @@ -332,6 +352,7 @@ def set_options(kernel, *args, **kwargs): # {{{ library registration +@iterate_over_kernels_if_given_program def register_preamble_generators(kernel, preamble_generators): """ :arg manglers: list of functions of signature ``(preamble_info)`` @@ -356,6 +377,7 @@ def register_preamble_generators(kernel, preamble_generators): return kernel.copy(preamble_generators=new_pgens) +@iterate_over_kernels_if_given_program def register_symbol_manglers(kernel, manglers): from loopy.tools import unpickles_equally @@ -373,6 +395,7 @@ def register_symbol_manglers(kernel, manglers): return kernel.copy(symbol_manglers=new_manglers) +@iterate_over_kernels_if_given_program def register_function_manglers(kernel, manglers): """ :arg manglers: list of functions of signature ``(kernel, name, arg_dtypes)`` @@ -438,7 +461,7 @@ class CacheMode: # {{{ make copy kernel def make_copy_kernel(new_dim_tags, old_dim_tags=None): - """Returns a :class:`LoopKernel` that changes the data layout + """Returns a :class:`loopy.Program` that changes the data layout of a variable (called "input") to the new layout specified by *new_dim_tags* from the one specified by *old_dim_tags*. *old_dim_tags* defaults to an all-C layout of the same rank diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 988e83f88c7a1f7a065813f3c1f9319695b0d97c..e3e41beef89c6796a4bef226b5f5f933f286478e 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -27,6 +27,7 @@ from pytools import Record import numpy as np import loopy as lp + from loopy.diagnostic import LoopyError, AutomaticTestFailure @@ -115,7 +116,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters): shape = evaluate_shape(arg.unvec_shape, parameters) dtype = kernel_arg.dtype - is_output = arg.base_name in kernel.get_written_variables() + is_output = kernel_arg.is_output if arg.arg_class is ImageArg: storage_array = ary = cl_array.empty( @@ -302,12 +303,10 @@ def _default_check_result(result, ref_result): if not np.allclose(ref_result, result, rtol=1e-3, atol=1e-3): l2_err = ( np.sum(np.abs(ref_result-result)**2) - / - np.sum(np.abs(ref_result)**2)) + / np.sum(np.abs(ref_result)**2)) linf_err = ( np.max(np.abs(ref_result-result)) - / - np.max(np.abs(ref_result-result))) + / np.max(np.abs(ref_result-result))) return (False, "results do not match -- (rel) l_2 err: %g, l_inf err: %g" % (l2_err, linf_err)) @@ -366,12 +365,13 @@ def _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors, need_image_support # {{{ main automatic testing entrypoint def auto_test_vs_ref( - ref_knl, ctx, test_knl=None, op_count=[], op_label=[], parameters={}, + ref_prog, ctx, test_prog=None, op_count=[], op_label=[], parameters={}, print_ref_code=False, print_code=True, warmup_rounds=2, dump_binary=False, fills_entire_output=None, do_check=True, check_result=None, max_test_kernel_count=1, - quiet=False, blacklist_ref_vendors=[]): + quiet=False, blacklist_ref_vendors=[], ref_entrypoint=None, + test_entrypoint=None): """Compare results of `ref_knl` to the kernels generated by scheduling *test_knl*. 
@@ -383,24 +383,37 @@ def auto_test_vs_ref(
 
     import pyopencl as cl
 
-    if test_knl is None:
-        test_knl = ref_knl
+    if test_prog is None:
+        test_prog = ref_prog
         do_check = False
 
-    if len(ref_knl.args) != len(test_knl.args):
-        raise LoopyError("ref_knl and test_knl do not have the same number "
+    if ref_entrypoint is None:
+        if len(ref_prog.entrypoints) != 1:
+            raise LoopyError("Unable to guess entrypoint for ref_prog.")
+        ref_entrypoint = list(ref_prog.entrypoints)[0]
+
+    if test_entrypoint is None:
+        if len(test_prog.entrypoints) != 1:
+            raise LoopyError("Unable to guess entrypoint for test_prog.")
+        test_entrypoint = list(test_prog.entrypoints)[0]
+
+    ref_prog = lp.preprocess_kernel(ref_prog)
+    test_prog = lp.preprocess_kernel(test_prog)
+
+    if len(ref_prog[ref_entrypoint].args) != len(test_prog[test_entrypoint].args):
+        raise LoopyError("ref_prog and test_prog do not have the same number "
                 "of arguments")
 
-    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
+    for i, (ref_arg, test_arg) in enumerate(zip(ref_prog[ref_entrypoint].args,
+            test_prog[test_entrypoint].args)):
         if ref_arg.name != test_arg.name:
-            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
-                    "%d (1-based)" % (i+1))
+            raise LoopyError("ref_prog and test_prog argument lists disagree at "
+                    "index %d (1-based)" % (i+1))
 
         if ref_arg.dtype != test_arg.dtype:
-            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
-                    "%d (1-based)" % (i+1))
+            raise LoopyError("ref_prog and test_prog argument lists disagree at "
+                    "index %d (1-based)" % (i+1))
 
-    from loopy.compiled import CompiledKernel
     from loopy.target.execution import get_highlighted_code
 
     if isinstance(op_count, (int, float)):
@@ -421,14 +434,15 @@ def auto_test_vs_ref(
     # {{{ compile and run reference code
 
     from loopy.type_inference import infer_unknown_types
-    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)
+    ref_prog = infer_unknown_types(ref_prog, expect_completion=True)
 
     found_ref_device = False
 
     ref_errors = []
 
     from loopy.kernel.data import ImageArg
-    need_ref_image_support = any(isinstance(arg, ImageArg) for arg in ref_knl.args)
+    need_ref_image_support = any(isinstance(arg, ImageArg)
+            for arg in ref_prog[ref_entrypoint].args)
 
     for dev in _enumerate_cl_devices_for_ref_test(
             blacklist_ref_vendors, need_ref_image_support):
@@ -436,30 +450,26 @@ def auto_test_vs_ref(
         ref_ctx = cl.Context([dev])
         ref_queue = cl.CommandQueue(ref_ctx,
                 properties=cl.command_queue_properties.PROFILING_ENABLE)
+        ref_codegen_result = lp.generate_code_v2(ref_prog)
 
-        pp_ref_knl = lp.preprocess_kernel(ref_knl)
-
-        for knl in lp.generate_loop_schedules(pp_ref_knl):
-            ref_sched_kernel = knl
-            break
+        ref_implemented_data_info = ref_codegen_result.implemented_data_infos[
+                ref_entrypoint]
 
         logger.info("{} (ref): trying {} for the reference calculation".format(
-            ref_knl.name, dev))
+            ref_entrypoint, dev))
 
-        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
         if not quiet and print_ref_code:
             print(75*"-")
             print("Reference Code:")
             print(75*"-")
-            print(get_highlighted_code(ref_compiled.get_code()))
+            print(get_highlighted_code(
+                ref_codegen_result.device_code()))
             print(75*"-")
 
-        ref_kernel_info = ref_compiled.kernel_info(frozenset())
-
         try:
             ref_args, ref_arg_data = \
-                    make_ref_args(ref_sched_kernel,
-                            ref_kernel_info.implemented_data_info,
+                    make_ref_args(ref_prog[ref_entrypoint],
+                            ref_implemented_data_info,
                             ref_queue, parameters)
             ref_args["out_host"] = False
         except cl.RuntimeError as e:
@@ -484,13 +494,13 @@ def 
auto_test_vs_ref( ref_queue.finish() logger.info("{} (ref): using {} for the reference calculation".format( - ref_knl.name, dev)) - logger.info("%s (ref): run" % ref_knl.name) + ref_entrypoint, dev)) + logger.info("%s (ref): run" % ref_entrypoint) ref_start = time() if not AUTO_TEST_SKIP_RUN: - ref_evt, _ = ref_compiled(ref_queue, **ref_args) + ref_evt, _ = ref_prog(ref_queue, **ref_args) else: ref_evt = cl.enqueue_marker(ref_queue) @@ -498,7 +508,7 @@ def auto_test_vs_ref( ref_stop = time() ref_elapsed_wall = ref_stop-ref_start - logger.info("%s (ref): run done" % ref_knl.name) + logger.info("%s (ref): run done" % ref_entrypoint) ref_evt.wait() ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START) @@ -521,160 +531,144 @@ def auto_test_vs_ref( from loopy.kernel import KernelState from loopy.target.pyopencl import PyOpenCLTarget - if test_knl.state not in [ + if test_prog[test_entrypoint].state not in [ KernelState.PREPROCESSED, KernelState.LINEARIZED]: - if isinstance(test_knl.target, PyOpenCLTarget): - test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0])) - - test_knl = lp.preprocess_kernel(test_knl) + if isinstance(test_prog.target, PyOpenCLTarget): + test_prog = test_prog.copy(target=PyOpenCLTarget(ctx.devices[0])) - if not test_knl.schedule: - test_kernels = lp.generate_loop_schedules(test_knl) - else: - test_kernels = [test_knl] - - test_kernel_count = 0 + test_prog = lp.preprocess_kernel(test_prog) from loopy.type_inference import infer_unknown_types - for i, kernel in enumerate(test_kernels): - test_kernel_count += 1 - if test_kernel_count > max_test_kernel_count: - break - - kernel = infer_unknown_types(kernel, expect_completion=True) - compiled = CompiledKernel(ctx, kernel) + test_prog = infer_unknown_types(test_prog, expect_completion=True) + test_prog_codegen_result = lp.generate_code_v2(test_prog) + + args = make_args(test_prog[test_entrypoint], + test_prog_codegen_result.implemented_data_infos[ + test_entrypoint], + queue, ref_arg_data, parameters) + args["out_host"] = False + + if not quiet: + print(75*"-") + print("Kernel:") + print(75*"-") + if print_code: + print(get_highlighted_code( + test_prog_codegen_result.device_code())) + print(75*"-") + if dump_binary: + print(type(test_prog_codegen_result.cl_program)) + print(test_prog_codegen_result.cl_program.binaries[0]) + print(75*"-") - kernel_info = compiled.kernel_info(frozenset()) + logger.info("%s: run warmup" % (test_entrypoint)) - args = make_args(kernel, - kernel_info.implemented_data_info, - queue, ref_arg_data, parameters) + for i in range(warmup_rounds): + if not AUTO_TEST_SKIP_RUN: + test_prog(queue, **args) - args["out_host"] = False + if need_check and not AUTO_TEST_SKIP_RUN: + for arg_desc in ref_arg_data: + if arg_desc is None: + continue + if not arg_desc.needs_checking: + continue - if not quiet: - print(75*"-") - print("Kernel #%d:" % i) - print(75*"-") - if print_code: - print(compiled.get_highlighted_code()) - print(75*"-") - if dump_binary: - # {{{ find cl program + from pyopencl.compyte.array import as_strided + ref_ary = as_strided( + arg_desc.ref_storage_array.get(), + shape=arg_desc.ref_shape, + strides=arg_desc.ref_numpy_strides).flatten() + test_ary = as_strided( + arg_desc.test_storage_array.get(), + shape=arg_desc.test_shape, + strides=arg_desc.test_numpy_strides).flatten() + common_len = min(len(ref_ary), len(test_ary)) + ref_ary = ref_ary[:common_len] + test_ary = test_ary[:common_len] - for name in dir(kernel_info.cl_kernels): - if name.startswith("__"): - continue - 
cl_kernel = getattr(kernel_info.cl_kernels, name) - cl_program = cl_kernel.get_info(cl.kernel_info.PROGRAM) - break - else: - assert False, "could not find cl_program" + error_is_small, error = check_result(test_ary, ref_ary) + if not error_is_small: + raise AutomaticTestFailure(error) - # }}} + need_check = False - print(type(cl_program)) - if hasattr(cl_program, "binaries"): - print(cl_program.binaries[0]) + events = [] + queue.finish() - print(75*"-") + logger.info("%s: warmup done" % (test_entrypoint)) - logger.info("%s: run warmup" % (knl.name)) + logger.info("%s: timing run" % (test_entrypoint)) - for i in range(warmup_rounds): - if not AUTO_TEST_SKIP_RUN: - compiled(queue, **args) - - if need_check and not AUTO_TEST_SKIP_RUN: - for arg_desc in ref_arg_data: - if arg_desc is None: - continue - if not arg_desc.needs_checking: - continue - - from pyopencl.compyte.array import as_strided - ref_ary = as_strided( - arg_desc.ref_storage_array.get(), - shape=arg_desc.ref_shape, - strides=arg_desc.ref_numpy_strides).flatten() - test_ary = as_strided( - arg_desc.test_storage_array.get(), - shape=arg_desc.test_shape, - strides=arg_desc.test_numpy_strides).flatten() - common_len = min(len(ref_ary), len(test_ary)) - ref_ary = ref_ary[:common_len] - test_ary = test_ary[:common_len] - - error_is_small, error = check_result(test_ary, ref_ary) - if not error_is_small: - raise AutomaticTestFailure(error) - - need_check = False - - events = [] - queue.finish() + timing_rounds = max(warmup_rounds, 1) - logger.info("%s: warmup done" % (knl.name)) + while True: + from time import time + start_time = time() - logger.info("%s: timing run" % (knl.name)) + evt_start = cl.enqueue_marker(queue) - timing_rounds = max(warmup_rounds, 1) + for i in range(timing_rounds): + if not AUTO_TEST_SKIP_RUN: + evt, _ = test_prog(queue, **args) + events.append(evt) + else: + events.append(cl.enqueue_marker(queue)) - while True: - from time import time - start_time = time() + evt_end = cl.enqueue_marker(queue) - evt_start = cl.enqueue_marker(queue) + queue.finish() + stop_time = time() - for i in range(timing_rounds): - if not AUTO_TEST_SKIP_RUN: - evt, _ = compiled(queue, **args) - events.append(evt) - else: - events.append(cl.enqueue_marker(queue)) + for evt in events: + evt.wait() + evt_start.wait() + evt_end.wait() - evt_end = cl.enqueue_marker(queue) + elapsed_event = (1e-9*events[-1].profile.END + - 1e-9*events[0].profile.START) \ + / timing_rounds + try: + elapsed_event_marker = ((1e-9*evt_end.profile.START + - 1e-9*evt_start.profile.START) + / timing_rounds) + except cl.RuntimeError: + elapsed_event_marker = None - queue.finish() - stop_time = time() + elapsed_wall = (stop_time-start_time)/timing_rounds - for evt in events: - evt.wait() - evt_start.wait() - evt_end.wait() + if elapsed_wall * timing_rounds < 0.3: + timing_rounds *= 4 + else: + break - elapsed_event = (1e-9*events[-1].profile.END - - 1e-9*events[0].profile.START) \ - / timing_rounds - try: - elapsed_event_marker = ((1e-9*evt_end.profile.START - - 1e-9*evt_start.profile.START) - / timing_rounds) - except cl.RuntimeError: - elapsed_event_marker = None + logger.info("%s: timing run done" % (test_entrypoint)) - elapsed_wall = (stop_time-start_time)/timing_rounds + rates = "" + for cnt, lbl in zip(op_count, op_label): + rates += " {:g} {}/s".format(cnt/elapsed_wall, lbl) - if elapsed_wall * timing_rounds < 0.3: - timing_rounds *= 4 + if not quiet: + def format_float_or_none(v): + if v is None: + return "" else: - break + return "%g" % v - logger.info("%s: 
timing run done" % (knl.name)) + print("elapsed: %s s event, %s s marker-event %s s wall " + "(%d rounds)%s" % ( + format_float_or_none(elapsed_event), + format_float_or_none(elapsed_event_marker), + format_float_or_none(elapsed_wall), timing_rounds, rates)) - rates = "" + if do_check: + ref_rates = "" for cnt, lbl in zip(op_count, op_label): rates += " {:g} {}/s".format(cnt/elapsed_wall, lbl) if not quiet: - def format_float_or_none(v): - if v is None: - return "" - else: - return "%g" % v - print("elapsed: %s s event, %s s marker-event %s s wall " "(%d rounds)%s" % ( format_float_or_none(elapsed_event), diff --git a/loopy/check.py b/loopy/check.py index 5804c514f95483a511a90e62dd083dcbe1ae0a74..4656abbd00cd34f3ab465fe6401b96df1e731711 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -23,15 +23,18 @@ THE SOFTWARE. from islpy import dim_type import islpy as isl -from loopy.symbolic import WalkMapper -from loopy.diagnostic import LoopyError, WriteRaceConditionWarning, warn_with_kernel -from loopy.type_inference import TypeInferenceMapper +from loopy.symbolic import WalkMapper, CombineMapper, ResolvedFunction +from loopy.diagnostic import (LoopyError, WriteRaceConditionWarning, + warn_with_kernel) +from loopy.type_inference import TypeReader from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction, CInstruction, _DataObliviousInstruction) from pytools import memoize_method from collections import defaultdict +from functools import reduce + import logging logger = logging.getLogger(__name__) @@ -87,6 +90,65 @@ def check_identifiers_in_subst_rules(knl): % (knl.name, rule.name, ", ".join(deps-rule_allowed_identifiers))) + +class UnscopedCallCollector(CombineMapper): + """ + Collects all the unscoped calls within a kernel. + + :returns: + An :class:`frozenset` of function names that are not scoped in + the kernel. + """ + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_call(self, expr): + from pymbolic.primitives import CallWithKwargs + return self.rec(CallWithKwargs( + function=expr.function, parameters=expr.parameters, + kw_parameters={})) + + def map_call_with_kwargs(self, expr): + if not isinstance(expr.function, ResolvedFunction): + return (frozenset([expr.function.name]) | + self.combine(self.rec(child) for child in expr.parameters + + tuple(expr.kw_parameters.values()))) + else: + return self.combine(self.rec(child) for child in + expr.parameters+tuple(expr.kw_parameters.values())) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def check_functions_are_resolved(kernel): + """ Checks if all call nodes in the *kernel* expression have been + resolved. + """ + from loopy.symbolic import SubstitutionRuleExpander + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + unscoped_calls = UnscopedCallCollector()(subst_expander( + insn.expression)) + if unscoped_calls: + raise LoopyError("Unknown function '%s' -- register a " + "callable corresponding to it." % + set(unscoped_calls).pop()) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError( + "Unsupported instruction type %s." 
% type(insn).__name__) + # }}} @@ -97,7 +159,7 @@ def check_identifiers_in_subst_rules(knl): VALID_NOSYNC_SCOPES = frozenset(["local", "global", "any"]) -class SubscriptIndicesIsIntChecker(TypeInferenceMapper): +class SubscriptIndicesIsIntChecker(TypeReader): def map_subscript(self, expr): for idx in expr.index_tuple: type_inf_result = self.rec(idx) @@ -113,12 +175,12 @@ class SubscriptIndicesIsIntChecker(TypeInferenceMapper): return self.rec(expr.aggregate) -def check_for_integer_subscript_indices(kernel): +def check_for_integer_subscript_indices(kernel, callables_table): """ Checks is every array access is of type :class:`int`. """ from pymbolic.primitives import Subscript - idx_int_checker = SubscriptIndicesIsIntChecker(kernel) + idx_int_checker = SubscriptIndicesIsIntChecker(kernel, callables_table) for insn in kernel.instructions: if isinstance(insn, MultiAssignmentBase): idx_int_checker(insn.expression, return_tuple=isinstance(insn, @@ -191,6 +253,19 @@ def check_loop_priority_inames_known(kernel): raise LoopyError("unknown iname '%s' in loop priorities" % iname) +def _get_all_unique_iname_tags(kernel): + """Returns an instance of :class:`set` of all the iname tags used in + *kernel* that inherit from :class:`loopy.kernel.data.UniqueTag`. + """ + from loopy.kernel.data import UniqueTag + from itertools import chain + iname_tags = list(chain(*(kernel.iname_to_tags.get(iname, []) for iname in + kernel.all_inames()))) + return { + tag for tag in iname_tags if + isinstance(tag, UniqueTag)} + + def check_multiple_tags_allowed(kernel): """ Checks if a multiple tags of an iname are compatible. @@ -208,12 +283,14 @@ def check_multiple_tags_allowed(kernel): "tags: {}".format(iname.name, iname.tags)) -def check_for_double_use_of_hw_axes(kernel): +def check_for_double_use_of_hw_axes(kernel, callables_table): """ Check if any instruction of *kernel* is within multiple inames tagged with the same hw axis tag. 
""" from loopy.kernel.data import UniqueTag + from loopy.kernel.instruction import CallInstruction + from loopy.kernel.function_interface import CallableKernel for insn in kernel.instructions: insn_tag_keys = set() @@ -226,6 +303,21 @@ def check_for_double_use_of_hw_axes(kernel): insn_tag_keys.add(key) + # check usage of iname tags in the callee kernel + if isinstance(insn, CallInstruction): + in_knl_callable = callables_table[ + insn.expression.function.name] + if isinstance(in_knl_callable, CallableKernel): + # check for collision in iname_tag keys in the instruction + # due to the callee kernel + common_iname_tags = [tag for tag in + _get_all_unique_iname_tags(in_knl_callable.subkernel) + if tag.key in insn_tag_keys] + if common_iname_tags: + raise LoopyError("instruction '%s' has multiple " + "inames tagged '%s'" % (insn.id, + common_iname_tags.pop())) + def check_for_inactive_iname_access(kernel): """ @@ -237,9 +329,11 @@ def check_for_inactive_iname_access(kernel): if not expression_inames <= insn.within_inames: raise LoopyError( "instruction '%s' references " - "inames '%s' that the instruction does not depend on" + "inames '%s' that the instruction does not depend on in " + "the kernel '%s'" % (insn.id, - ", ".join(expression_inames - insn.within_inames))) + ", ".join(expression_inames + - insn.within_inames), kernel.name)) def check_for_unused_inames(kernel): @@ -529,7 +623,7 @@ def check_write_destinations(kernel): def check_has_schedulable_iname_nesting(kernel): from loopy.transform.iname import (has_schedulable_iname_nesting, - get_iname_duplication_options) + get_iname_duplication_options) if not has_schedulable_iname_nesting(kernel): import itertools as it opt = get_iname_duplication_options(kernel) @@ -834,14 +928,21 @@ def check_variable_access_ordered(kernel): # }}} -def pre_schedule_checks(kernel): +def pre_schedule_checks(kernel, callables_table): try: logger.debug("%s: pre-schedule check: start" % kernel.name) - check_for_integer_subscript_indices(kernel) + from loopy.kernel.data import auto + if all(arg.dtype not in [None, auto] for arg in kernel.args) and ( + all(tv.dtype not in [None, auto] for tv in + kernel.temporary_variables.values())): + # only check if all types are known + check_for_integer_subscript_indices(kernel, callables_table) + + check_functions_are_resolved(kernel) check_for_duplicate_insn_ids(kernel) check_for_orphaned_user_hardware_axes(kernel) - check_for_double_use_of_hw_axes(kernel) + check_for_double_use_of_hw_axes(kernel, callables_table) check_insn_attributes(kernel) check_loop_priority_inames_known(kernel) check_multiple_tags_allowed(kernel) @@ -870,7 +971,8 @@ def pre_schedule_checks(kernel): # {{{ check for unused hw axes -def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): +def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table, + sched_index=None): from loopy.schedule import (CallKernel, RunInstruction, Barrier, EnterLoop, LeaveLoop, ReturnFromKernel, get_insn_ids_for_block_at, gather_schedule_block) @@ -885,7 +987,8 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): assert isinstance(kernel.schedule[sched_index], CallKernel) _, past_end_i = gather_schedule_block(kernel.schedule, sched_index) group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.schedule, sched_index)) + get_insn_ids_for_block_at(kernel.schedule, sched_index), + callables_table) group_axes = {ax for ax, length in enumerate(group_size)} local_axes = {ax for ax, 
length in enumerate(local_size)} @@ -902,7 +1005,8 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): while i < loop_end_i: sched_item = kernel.schedule[i] if isinstance(sched_item, CallKernel): - i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, i) + i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, + callables_table, i) elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] @@ -957,9 +1061,10 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None): return past_end_i -def check_for_unused_hw_axes_in_insns(kernel): +def check_for_unused_hw_axes_in_insns(kernel, callables_table): if kernel.schedule: - _check_for_unused_hw_axes_in_kernel_chunk(kernel) + _check_for_unused_hw_axes_in_kernel_chunk(kernel, + callables_table) # }}} @@ -1109,15 +1214,19 @@ def check_that_shapes_and_strides_are_arguments(kernel): # }}} -def pre_codegen_checks(kernel): +def pre_codegen_checks(kernel, callables_table): try: logger.debug("pre-codegen check %s: start" % kernel.name) - check_for_unused_hw_axes_in_insns(kernel) + # FIXME `check_for_unused_hw_axes_in_insns` currently flags a problem + # in the callee if a caller kernel, at a call site, uses hardware axes + # (say `g.0` and `g.1`). It does not seem that that knowledge is + # propagated to the callee. + # check_for_unused_hw_axes_in_insns(kernel, callables_table) check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) check_that_temporaries_are_defined_in_subkernels_where_used(kernel) check_that_all_insns_are_scheduled(kernel) - kernel.target.pre_codegen_check(kernel) + kernel.target.pre_codegen_check(kernel, callables_table) check_that_shapes_and_strides_are_arguments(kernel) logger.debug("pre-codegen check %s: done" % kernel.name) diff --git a/loopy/cli.py b/loopy/cli.py index 4230b74967fc0fa7dcb0064bb712ee9ab140b299..a7d209ae87b2120f90a8d360c3ff9eb13bc925f5 100644 --- a/loopy/cli.py +++ b/loopy/cli.py @@ -63,11 +63,9 @@ def main(): parser.add_argument("--target", choices=( "opencl", "ispc", "ispc-occa", "c", "c-fortran", "cuda"), default="opencl") - parser.add_argument("--name") parser.add_argument("--transform") parser.add_argument("--edit-code", action="store_true") parser.add_argument("--occa-defines") - parser.add_argument("--occa-add-dummy-arg", action="store_true") parser.add_argument("--print-ir", action="store_true") args = parser.parse_args() @@ -106,9 +104,11 @@ def main(): ".loopy": "loopy", ".floopy": "fortran", ".f90": "fortran", + ".F90": "fortran", ".fpp": "fortran", ".f": "fortran", ".f77": "fortran", + ".F77": "fortran", }.get(ext) with open(args.infile) as infile_fd: infile_content = infile_fd.read() @@ -159,10 +159,7 @@ def main(): raise RuntimeError("loopy-lang requires 'lp_knl' " "to be defined on exit") - if args.name is not None: - kernel = kernel.copy(name=args.name) - - kernels = [kernel] + prg = [kernel] elif lang in ["fortran", "floopy", "fpp"]: pre_transform_code = None @@ -179,69 +176,31 @@ def main(): defines_to_python_code(defines_fd.read()) + pre_transform_code) - kernels = lp.parse_transformed_fortran( + prg = lp.parse_transformed_fortran( infile_content, pre_transform_code=pre_transform_code, filename=args.infile) - if args.name is not None: - kernels = [kernel for kernel in kernels - if kernel.name == args.name] - - if not kernels: - raise RuntimeError("no kernels found (name specified: %s)" - % args.name) - else: raise RuntimeError("unknown language: '%s'" % args.lang) + if not isinstance(prg, lp.Program): + # FIXME + assert 
isinstance(prg, list) # of kernels + raise NotImplementedError("convert list of kernels to Program") + if args.print_ir: - for kernel in kernels: - print(kernel, file=sys.stderr) - - if args.occa_add_dummy_arg: - new_kernels = [] - for kernel in kernels: - new_args = [ - lp.ArrayArg("occa_info", np.int32, shape=None) - ] + kernel.args - new_kernels.append(kernel.copy(args=new_args)) - - kernels = new_kernels - del new_kernels - - codes = [] - from loopy.codegen import generate_code - for kernel in kernels: - kernel = lp.preprocess_kernel(kernel) - code, impl_arg_info = generate_code(kernel) - codes.append(code) + print(prg, file=sys.stderr) + + prg = lp.preprocess_kernel(prg) + cgr = lp.generate_code_v2(prg) if args.outfile is not None: outfile = args.outfile else: outfile = "-" - code = "\n\n".join(codes) - - # {{{ edit code if requested - - import os - edit_kernel_env = os.environ.get("LOOPY_EDIT_KERNEL") - need_edit = args.edit_code - if not need_edit and edit_kernel_env is not None: - # Do not replace with "any()"--Py2.6/2.7 bug doesn't like - # comprehensions in functions with exec(). - - for k in kernels: - if edit_kernel_env.lower() in k.name.lower(): - need_edit = True - - if need_edit: - from pytools import invoke_editor - code = invoke_editor(code, filename="edit.cl") - - # }}} + code = cgr.device_code() if outfile == "-": sys.stdout.write(code) diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 9bc8e079ccf0fc9239e38976f6e6e89db9aa9ff6..3c02a724b9b5c3bf5e3b3907df960fd7a1a2d178 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -20,16 +20,26 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -from loopy.diagnostic import LoopyError, warn -from pytools import ImmutableRecord, ProcessLogger +import logging +logger = logging.getLogger(__name__) + import islpy as isl +from collections import OrderedDict +from loopy.diagnostic import LoopyError, warn +from pytools import ImmutableRecord + from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder from loopy.version import DATA_MODEL_VERSION -import logging -logger = logging.getLogger(__name__) + +from loopy.symbolic import CombineMapper +from functools import reduce + +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + +from pytools import ProcessLogger, memoize_method __doc__ = """ .. currentmodule:: loopy.codegen @@ -165,6 +175,7 @@ class SeenFunction(ImmutableRecord): class CodeGenerationState: """ .. attribute:: kernel + .. attribute:: target .. attribute:: implemented_data_info a list of :class:`ImplementedDataInfo` objects. @@ -207,21 +218,34 @@ class CodeGenerationState: .. attribute:: schedule_index_end + .. attribute:: callables_table + + A mapping from callable names to instances of + :class:`loopy.kernel.function_interface.InKernelCallable`. + + .. attribute:: is_entrypoint + + A :class:`bool` to indicate if the code is being generated for an + entrypoint kernel + .. attribute:: codegen_cache_manager An instance of :class:`loopy.codegen.tools.CodegenOperationCacheManager`. 
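# Illustrative sketch (not part of the patch): the rewritten CLI above
# drives everything through the Program-based pipeline. Assuming a
# trivial kernel, the same flow looks like this:

import loopy as lp

knl = lp.make_kernel(
        "{ [i]: 0 <= i < n }",
        "out[i] = 2*a[i]",
        lang_version=(2018, 2))

prg = lp.preprocess_kernel(knl)
cgr = lp.generate_code_v2(prg)   # a CodeGenerationResult
print(cgr.device_code())         # one string, replacing the old
                                 # per-kernel generate_code() loop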
""" - def __init__(self, kernel, + def __init__(self, kernel, target, implemented_data_info, implemented_domain, implemented_predicates, seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map, allow_complex, + callables_table, + is_entrypoint, vectorization_info=None, var_name_generator=None, is_generating_device_code=None, gen_program_name=None, schedule_index_end=None, codegen_cachemanager=None): self.kernel = kernel + self.target = target self.implemented_data_info = implemented_data_info self.implemented_domain = implemented_domain self.implemented_predicates = implemented_predicates @@ -230,6 +254,8 @@ class CodeGenerationState: self.seen_atomic_dtypes = seen_atomic_dtypes self.var_subst_map = var_subst_map.copy() self.allow_complex = allow_complex + self.callables_table = callables_table + self.is_entrypoint = is_entrypoint self.vectorization_info = vectorization_info self.var_name_generator = var_name_generator self.is_generating_device_code = is_generating_device_code @@ -239,19 +265,24 @@ class CodeGenerationState: # {{{ copy helpers - def copy(self, kernel=None, implemented_data_info=None, + def copy(self, kernel=None, target=None, implemented_data_info=None, implemented_domain=None, implemented_predicates=frozenset(), - var_subst_map=None, vectorization_info=None, - is_generating_device_code=None, - gen_program_name=None, + var_subst_map=None, is_entrypoint=None, vectorization_info=None, + is_generating_device_code=None, gen_program_name=None, schedule_index_end=None): if kernel is None: kernel = self.kernel + if target is None: + target = self.target + if implemented_data_info is None: implemented_data_info = self.implemented_data_info + if is_entrypoint is None: + is_entrypoint = self.is_entrypoint + if vectorization_info is False: vectorization_info = None @@ -269,6 +300,7 @@ class CodeGenerationState: return CodeGenerationState( kernel=kernel, + target=target, implemented_data_info=implemented_data_info, implemented_domain=implemented_domain or self.implemented_domain, implemented_predicates=( @@ -278,6 +310,8 @@ class CodeGenerationState: seen_atomic_dtypes=self.seen_atomic_dtypes, var_subst_map=var_subst_map or self.var_subst_map, allow_complex=self.allow_complex, + callables_table=self.callables_table, + is_entrypoint=is_entrypoint, vectorization_info=vectorization_info, var_name_generator=self.var_name_generator, is_generating_device_code=is_generating_device_code, @@ -389,6 +423,32 @@ code_gen_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) +class InKernelCallablesCollector(CombineMapper): + """ + Returns an instance of :class:`frozenset` containing instances of + :class:`loopy.kernel.function_interface.InKernelCallable` in the + :attr:``kernel`. + """ + def __init__(self, kernel): + self.kernel = kernel + + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_resolved_function(self, expr): + return frozenset([self.kernel.scoped_functions[ + expr.name]]) + + def map_constant(self, expr): + return frozenset() + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + class PreambleInfo(ImmutableRecord): """ .. 
attribute:: kernel @@ -401,44 +461,21 @@ class PreambleInfo(ImmutableRecord): # {{{ main code generation entrypoint -def generate_code_v2(kernel): +def generate_code_for_a_single_kernel(kernel, callables_table, target, + is_entrypoint): """ :returns: a :class:`CodeGenerationResult` + + :param kernel: An instance of :class:`loopy.LoopKernel`. """ from loopy.kernel import KernelState - if kernel.state == KernelState.INITIAL: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) - - if kernel.schedule is None: - from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) - - if kernel.state != KernelState.LINEARIZED: + if kernel.state != KernelState.SCHEDULED: raise LoopyError("cannot generate code for a kernel that has not been " "scheduled") - # {{{ cache retrieval - - from loopy import CACHING_ENABLED - - if CACHING_ENABLED: - input_kernel = kernel - try: - result = code_gen_cache[input_kernel] - logger.debug("%s: code generation cache hit" % kernel.name) - return result - except KeyError: - pass - - # }}} - - from loopy.type_inference import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) - from loopy.check import pre_codegen_checks - pre_codegen_checks(kernel) + pre_codegen_checks(kernel, callables_table) codegen_plog = ProcessLogger(logger, f"{kernel.name}: generate code") @@ -454,13 +491,13 @@ def generate_code_v2(kernel): if isinstance(arg, ArrayBase): implemented_data_info.extend( arg.decl_info( - kernel.target, + target, is_written=is_written, index_dtype=kernel.index_dtype)) elif isinstance(arg, ValueArg): implemented_data_info.append(ImplementedDataInfo( - target=kernel.target, + target=target, name=arg.name, dtype=arg.dtype, arg_class=ValueArg, @@ -486,6 +523,7 @@ def generate_code_v2(kernel): codegen_state = CodeGenerationState( kernel=kernel, + target=target, implemented_data_info=implemented_data_info, implemented_domain=initial_implemented_domain, implemented_predicates=frozenset(), @@ -497,14 +535,17 @@ def generate_code_v2(kernel): var_name_generator=kernel.get_var_name_generator(), is_generating_device_code=False, gen_program_name=( - kernel.target.host_program_name_prefix + target.host_program_name_prefix + kernel.name + kernel.target.host_program_name_suffix), schedule_index_end=len(kernel.schedule), + callables_table=callables_table, + is_entrypoint=is_entrypoint, codegen_cachemanager=CodegenOperationCacheManager.from_kernel(kernel), ) from loopy.codegen.result import generate_host_or_device_program + codegen_result = generate_host_or_device_program( codegen_state, schedule_index=0) @@ -539,7 +580,7 @@ def generate_code_v2(kernel): ) preamble_generators = (kernel.preamble_generators - + kernel.target.get_device_ast_builder().preamble_generators()) + + target.get_device_ast_builder().preamble_generators()) for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) @@ -555,10 +596,160 @@ def generate_code_v2(kernel): codegen_plog.done() + return codegen_result + + +def diverge_callee_entrypoints(program): + """ + If a kernel is both an entrypoint and a callee, then rename the callee. 
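# Illustrative sketch (not part of the patch): the renaming scheme of
# diverge_callee_entrypoints on plain dicts. A name that is both an
# entrypoint and a callee gets a fresh alias for its callee role, so the
# two roles can be code-generated differently. The real version also
# rewrites the ResolvedFunction nodes inside each subkernel.

from pytools import UniqueNameGenerator

callables = {"main": "<kernel main>", "helper": "<kernel helper>"}
entrypoints = {"main"}
callee_ids = {"main", "helper"}    # "main" is also called somewhere

vng = UniqueNameGenerator(set(callables))
renames = {name: vng(based_on=name)
        for name in callee_ids & entrypoints}

for old, new in renames.items():
    callables[new] = callables[old] + " (callee copy)"

print(renames)             # {'main': 'main_0'}
print(sorted(callables))   # ['helper', 'main', 'main_0']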
+ """ + from loopy.program import _get_callable_ids + from pytools import UniqueNameGenerator + callable_ids = _get_callable_ids(program.callables_table, + program.entrypoints) + + new_callables = {} + renames = {} + + vng = UniqueNameGenerator(set(program.callables_table.keys())) + + for clbl_id in callable_ids & program.entrypoints: + renames[clbl_id] = vng(based_on=clbl_id) + + for name, clbl in program.callables_table.items(): + if isinstance(clbl, CallableKernel): + from loopy.program import ( + rename_resolved_functions_in_a_single_kernel) + knl = rename_resolved_functions_in_a_single_kernel( + clbl.subkernel, renames) + new_callables[name] = clbl.copy(subkernel=knl) + elif isinstance(clbl, ScalarCallable): + new_callables[name] = clbl + else: + raise NotImplementedError(type(clbl)) + + for clbl_id in callable_ids & program.entrypoints: + knl = new_callables[clbl_id].subkernel.copy(name=renames[clbl_id]) + new_callables[renames[clbl_id]] = new_callables[clbl_id].copy( + subkernel=knl) + + return program.copy(callables_table=new_callables) + + +@memoize_method +def generate_code_v2(program): + """ + Returns an instance of :class:`CodeGenerationResult`. + + :param program: An instance of :class:`loopy.Program`. + """ + + from loopy.kernel import LoopKernel + from loopy.program import make_program + from loopy.codegen.result import CodeGenerationResult + + # {{{ cache retrieval + + from loopy import CACHING_ENABLED + from loopy.preprocess import prepare_for_caching + if CACHING_ENABLED: - code_gen_cache.store_if_not_present(input_kernel, codegen_result) + input_program = prepare_for_caching(program) + try: + result = code_gen_cache[input_program] + logger.debug(f"Program with entrypoints {program.entrypoints}:" + " code generation cache hit") + return result + except KeyError: + pass - return codegen_result + # }}} + + if isinstance(program, LoopKernel): + program = make_program(program) + + from loopy.kernel import KernelState + if program.state < KernelState.PREPROCESSED: + # Note that we cannot have preprocessing separately for everyone. + # Since, now the preprocessing of each one depends on the other. + # So we check if any one of the callable kernels are not preprocesses + # then, we have to do the preprocessing of every other kernel. + from loopy.preprocess import preprocess_program + program = preprocess_program(program) + + from loopy.type_inference import infer_unknown_types + program = infer_unknown_types(program, expect_completion=True) + + new_callables = {} + + for name, clbl in program.callables_table.items(): + if isinstance(clbl, CallableKernel): + from loopy.schedule import get_one_scheduled_kernel + knl = clbl.subkernel + if knl.schedule is None: + knl = get_one_scheduled_kernel( + knl, program.callables_table) + new_callables[name] = clbl.copy(subkernel=knl) + elif isinstance(clbl, ScalarCallable): + new_callables[name] = clbl + else: + raise NotImplementedError(type(clbl)) + + program = program.copy(callables_table=new_callables) + + program = diverge_callee_entrypoints(program) + + host_programs = OrderedDict() + device_programs = [] + device_preambles = [] + callee_fdecls = [] + implemented_data_infos = OrderedDict() + + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + #FIXME: + # 1. Diverge the kernels which are both entrypoint and callees at this + # point. By diverge we should rename the callees in kernels. + # 2. 
Then pass the callee versions by saying is_entrypoint=False + cgr = generate_code_for_a_single_kernel(in_knl_callable.subkernel, + program.callables_table, program.target, func_id in + program.entrypoints) + if func_id in program.entrypoints: + assert len(cgr.host_programs) == 1 + host_programs[func_id] = cgr.host_programs[func_id] + implemented_data_infos[func_id] = cgr.implemented_data_info + else: + # FIXME: This assertion should be valid + # assert cgr.host_programs == [] + assert len(cgr.device_programs) == 1 + #FIXME: + # if isinstance(callee_prog_ast, Collection): + # for entry in callee_prog_ast.contents: + # if isinstance(entry, FunctionBody): + # callee_fdecls.append(entry.fdecl) + callee_fdecls.append(cgr.device_programs[0].ast.fdecl) + + device_programs.extend(cgr.device_programs) + device_preambles.extend(cgr.device_preambles) + + device_preambles.extend(list(in_knl_callable.generate_preambles( + program.target))) + + # adding the callee fdecls to the device_programs + device_programs = ([device_programs[0].copy( + ast=program.target.get_device_ast_builder().ast_module.Collection( + callee_fdecls+[device_programs[0].ast]))] + + device_programs[1:]) + cgr = CodeGenerationResult( + host_programs=host_programs, + device_programs=device_programs, + device_preambles=device_preambles, + implemented_data_infos=implemented_data_infos) + + if CACHING_ENABLED: + code_gen_cache.store_if_not_present(input_program, cgr) + + return cgr def generate_code(kernel, device=None): @@ -572,8 +763,13 @@ def generate_code(kernel, device=None): if len(codegen_result.device_programs) > 1: raise LoopyError("kernel passed to generate_code yielded multiple " "device programs. Use generate_code_v2.") + if len(codegen_result.host_programs) > 1: + raise LoopyError("kernel passed to generate_code yielded multiple " + "host programs. Use generate_code_v2.") + + _, implemented_data_info = codegen_result.implemented_data_infos.popitem() - return codegen_result.device_code(), codegen_result.implemented_data_info + return codegen_result.device_code(), implemented_data_info # }}} diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index f7e953d9481aee705b785406462725ea25d860fe..f65c397424b7a498ec979369f6d9ed56e9c06aeb 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -23,6 +23,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
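# Illustrative sketch (not part of the patch): the assembly loop above in
# miniature. Every callable kernel is code-generated, but only entrypoints
# contribute a host program; forward declarations of the callees are
# hoisted in front of the first device program. Plain strings stand in
# for the ASTs here.

from collections import OrderedDict

generated = OrderedDict([      # func_id -> (host ast, device ast, fdecl)
    ("main", ("host_main(){...}", "kernel_main(){...}", None)),
    ("helper", (None, "helper(){...}", "void helper();")),
])
entrypoints = {"main"}

host_programs = OrderedDict()
device_programs = []
callee_fdecls = []

for func_id, (host, device, fdecl) in generated.items():
    if func_id in entrypoints:
        host_programs[func_id] = host
    else:
        callee_fdecls.append(fdecl)
    device_programs.append(device)

device_programs[0] = "\n".join(callee_fdecls + [device_programs[0]])
print("\n\n".join(device_programs + list(host_programs.values())))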
""" +from collections import OrderedDict from loopy.codegen.result import merge_codegen_results, wrap_in_if import islpy as isl from loopy.schedule import ( @@ -89,17 +90,21 @@ def generate_code_for_sched_index(codegen_state, sched_index): new_codegen_state, sched_index) glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs( - get_insn_ids_for_block_at(kernel.schedule, sched_index)) - - return merge_codegen_results(codegen_state, [ - codegen_result, - - codegen_state.ast_builder.get_kernel_call( - codegen_state, - sched_item.kernel_name, - glob_grid, loc_grid, - extra_args), - ]) + get_insn_ids_for_block_at(kernel.schedule, sched_index), + codegen_state.callables_table) + if codegen_state.is_entrypoint: + return merge_codegen_results(codegen_state, [ + codegen_result, + + codegen_state.ast_builder.get_kernel_call( + codegen_state, + sched_item.kernel_name, + glob_grid, loc_grid, + extra_args), + ]) + else: + # do not generate host code for non-entrypoint kernels + return codegen_result elif isinstance(sched_item, EnterLoop): tags = kernel.iname_tags(sched_item.iname) @@ -149,7 +154,7 @@ def generate_code_for_sched_index(codegen_state, sched_index): if sched_item.synchronization_kind in ["global", "local"]: # host code is assumed globally and locally synchronous return CodeGenerationResult( - host_program=None, + host_programs=OrderedDict(), device_programs=[], implemented_domains={}, implemented_data_info=codegen_state.implemented_data_info) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index d149eb95ab7c115b38cc1b819b1c24f7b4597170..c343483f0c60497f43cc9fde2981b3e5598b00b5 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -251,7 +251,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func, return next_func(codegen_state) global_size, local_size = kernel.get_grid_sizes_for_insn_ids( - insn_ids_for_block) + insn_ids_for_block, codegen_state.callables_table) hw_inames_left = hw_inames_left[:] iname = hw_inames_left.pop() diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 685df8fdec9ef0ea9e45223ceae563d943a69d79..358088922a31fe4a33d5d060c5b7194514d518c0 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -21,6 +21,7 @@ THE SOFTWARE. """ from pytools import ImmutableRecord +from collections import OrderedDict def process_preambles(preambles): @@ -76,7 +77,11 @@ class GeneratedProgram(ImmutableRecord): class CodeGenerationResult(ImmutableRecord): """ - .. attribute:: host_program + .. attribute:: host_programs + + A mapping from entrypoints of a translation unit to instances of + :class:`GeneratedProgram` intended to be run on host. + .. attribute:: device_programs A list of :class:`GeneratedProgram` instances @@ -97,7 +102,7 @@ class CodeGenerationResult(ImmutableRecord): .. attribute:: implemented_data_info a list of :class:`loopy.codegen.ImplementedDataInfo` objects. - Only added at the very end of code generation. 
+ Only added at the very end of code generation """ @staticmethod @@ -109,12 +114,12 @@ class CodeGenerationResult(ImmutableRecord): if codegen_state.is_generating_device_code: kwargs = { - "host_program": None, "device_programs": [prg], + "host_programs": OrderedDict() } else: kwargs = { - "host_program": prg, + "host_programs": OrderedDict({codegen_state.kernel.name: prg}), "device_programs": [], } @@ -128,8 +133,9 @@ class CodeGenerationResult(ImmutableRecord): return ( "".join(preamble_codes) - + - str(self.host_program.ast)) + + "\n" + + "\n\n".join(str(hp.ast) + for hp in self.host_programs.values())) def device_code(self): preamble_codes = process_preambles(getattr(self, "device_preambles", [])) @@ -151,7 +157,8 @@ class CodeGenerationResult(ImmutableRecord): + "\n" + "\n\n".join(str(dp.ast) for dp in self.device_programs) + "\n\n" - + str(self.host_program.ast)) + + "\n\n".join(str(hp.ast) for hp in + self.host_programs.values())) def current_program(self, codegen_state): if codegen_state.is_generating_device_code: @@ -160,7 +167,11 @@ class CodeGenerationResult(ImmutableRecord): else: result = None else: - result = self.host_program + if self.host_programs: + host_programs = self.host_programs.copy() + _, result = host_programs.popitem() + else: + result = None if result is None: ast = codegen_state.ast_builder.ast_block_class([]) @@ -184,7 +195,15 @@ class CodeGenerationResult(ImmutableRecord): else: assert program.name == codegen_state.gen_program_name assert not program.is_device_program - return self.copy(host_program=program) + host_programs = self.host_programs.copy() + if host_programs: + e, _ = host_programs.popitem() + assert codegen_state.kernel.name == e + host_programs[e] = program + else: + host_programs[codegen_state.kernel.name] = program + return self.copy( + host_programs=host_programs) def current_ast(self, codegen_state): return self.current_program(codegen_state).ast @@ -205,7 +224,7 @@ def merge_codegen_results(codegen_state, elements, collapse=True): if not elements: return CodeGenerationResult( - host_program=None, + host_programs=OrderedDict(), device_programs=[], implemented_domains={}, implemented_data_info=codegen_state.implemented_data_info) @@ -302,28 +321,35 @@ def generate_host_or_device_program(codegen_state, schedule_index): else: codegen_result = build_loop_nest(codegen_state, schedule_index) - codegen_result = merge_codegen_results( - codegen_state, - ast_builder.generate_top_of_body(codegen_state) - + temp_decls - + [codegen_result], - collapse=False) + if (codegen_state.is_generating_device_code) or ( + codegen_state.is_entrypoint): + codegen_result = merge_codegen_results( + codegen_state, + ast_builder.generate_top_of_body(codegen_state) + + temp_decls + + [codegen_result], + collapse=False) - cur_prog = codegen_result.current_program(codegen_state) - body_ast = cur_prog.ast - fdecl_ast = ast_builder.get_function_declaration( - codegen_state, codegen_result, schedule_index) + cur_prog = codegen_result.current_program(codegen_state) + body_ast = cur_prog.ast + fdecl_ast = ast_builder.get_function_declaration( + codegen_state, codegen_result, schedule_index) - fdef_ast = ast_builder.get_function_definition( - codegen_state, codegen_result, - schedule_index, fdecl_ast, body_ast) + fdef_ast = ast_builder.get_function_definition( + codegen_state, codegen_result, + schedule_index, fdecl_ast, body_ast) - codegen_result = codegen_result.with_new_program( - codegen_state, - cur_prog.copy( - ast=ast_builder.process_ast(fdef_ast), - 
body_ast=ast_builder.process_ast(body_ast))) + codegen_result = codegen_result.with_new_program( + codegen_state, + cur_prog.copy( + ast=ast_builder.process_ast(fdef_ast), + body_ast=ast_builder.process_ast(body_ast))) + else: + codegen_result = codegen_result.copy( + host_programs=OrderedDict()) return codegen_result # }}} + +# vim: foldmethod=marker diff --git a/loopy/compiled.py b/loopy/compiled.py index f9313c6c95612ddba6566d7c8175d998e8312147..0fa18eacbc3a16059e06c33202c91f89cc39ef64 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -31,11 +31,11 @@ class CompiledKernel(PyOpenCLKernelExecutor): """ .. automethod:: __call__ """ - def __init__(self, context, kernel): + def __init__(self, context, kernel, entrypoint): from warnings import warn warn("CompiledKernel is deprecated. Use LoopKernel.__call__ directly.", DeprecationWarning, stacklevel=2) - super().__init__(context, kernel) + super().__init__(context, kernel, entrypoint) # }}} diff --git a/loopy/frontend/fortran/__init__.py b/loopy/frontend/fortran/__init__.py index 00dc837e16ad2a414a13c1ceaf4f36f3f3fb3049..4ad7cd21c48f7abd396afb977267e30cb9f2d501 100644 --- a/loopy/frontend/fortran/__init__.py +++ b/loopy/frontend/fortran/__init__.py @@ -20,7 +20,11 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import logging +logger = logging.getLogger(__name__) + from loopy.diagnostic import LoopyError +from pytools import ProcessLogger def c_preprocess(source, defines=None, filename=None, include_paths=None): @@ -152,8 +156,9 @@ def parse_transformed_fortran(source, free_form=True, strict=True, :func:`parse_fortran`. * ``FILENAME``: the file name of the code being processed - The transform code must define ``RESULT``, conventionally a list of - kernels, which is returned from this function unmodified. + The transform code must define ``RESULT``, conventionally a list of kernels + or a :class:`loopy.Program`, which is returned from this function + unmodified. An example of *source* may look as follows:: @@ -234,11 +239,64 @@ def parse_transformed_fortran(source, free_form=True, strict=True, return proc_dict["RESULT"] -def parse_fortran(source, filename="", free_form=True, strict=True, - seq_dependencies=None, auto_dependencies=None, target=None): +def _add_assignees_to_calls(knl, all_kernels): """ - :returns: a list of :class:`loopy.LoopKernel` objects + Returns a copy of *knl* coming from the fortran parser adjusted to the + loopy specification that written variables of a call must appear in the + assignee. + + :param knl: An instance of :class:`loopy.LoopKernel`, which have incorrect + calls to the kernels in *all_kernels* by stuffing both the input and + output arguments into parameters. + + :param all_kernels: An instance of :class:`list` of loopy kernels which + may be called by *kernel*. 
""" + new_insns = [] + subroutine_dict = {kernel.name: kernel for kernel in all_kernels} + from loopy.kernel.instruction import (Assignment, CallInstruction, + CInstruction, _DataObliviousInstruction, + modify_assignee_for_array_call) + from pymbolic.primitives import Call, Variable + + for insn in knl.instructions: + if isinstance(insn, CallInstruction): + if isinstance(insn.expression, Call) and ( + insn.expression.function.name in subroutine_dict): + assignees = [] + new_params = [] + subroutine = subroutine_dict[insn.expression.function.name] + for par, arg in zip(insn.expression.parameters, subroutine.args): + if arg.name in subroutine.get_written_variables(): + par = modify_assignee_for_array_call(par) + assignees.append(par) + if arg.name in subroutine.get_read_variables(): + new_params.append(par) + if arg.name not in (subroutine.get_written_variables() | + subroutine.get_read_variables()): + new_params.append(par) + + new_insns.append( + insn.copy( + assignees=tuple(assignees), + expression=Variable( + insn.expression.function.name)(*new_params))) + else: + new_insns.append(insn) + pass + elif isinstance(insn, (Assignment, CInstruction, + _DataObliviousInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError(type(insn).__name__) + + return knl.copy(instructions=new_insns) + + +def parse_fortran(source, filename="", free_form=None, strict=None, + seq_dependencies=None, auto_dependencies=None, target=None): + + parse_plog = ProcessLogger(logger, "parsing fortran file '%s'" % filename) if seq_dependencies is not None and auto_dependencies is not None: raise TypeError( @@ -251,6 +309,10 @@ def parse_fortran(source, filename="", free_form=True, strict=True, if seq_dependencies is None: seq_dependencies = True + if free_form is None: + free_form = True + if strict is None: + strict = True import logging console = logging.StreamHandler() @@ -271,7 +333,23 @@ def parse_fortran(source, filename="", free_form=True, strict=True, f2loopy = F2LoopyTranslator(filename, target=target) f2loopy(tree) - return f2loopy.make_kernels(seq_dependencies=seq_dependencies) + kernels = f2loopy.make_kernels(seq_dependencies=seq_dependencies) + + from loopy.transform.callable import merge + prog = merge(kernels) + all_kernels = [clbl.subkernel + for clbl in prog.callables_table.values()] + + for knl in all_kernels: + prog.with_kernel(_add_assignees_to_calls(knl, all_kernels)) + + if len(all_kernels) == 1: + # guesssing in the case of only one function + prog = prog.with_entrypoints(all_kernels[0].name) + + parse_plog.done() + + return prog # vim: foldmethod=marker diff --git a/loopy/frontend/fortran/expression.py b/loopy/frontend/fortran/expression.py index 354a769a0f4b4762cc3e39befa8fb27723be5e72..cc93e914d0470c423812b69913a7185dca9c7b67 100644 --- a/loopy/frontend/fortran/expression.py +++ b/loopy/frontend/fortran/expression.py @@ -42,6 +42,25 @@ _and = intern("and") _or = intern("or") +def tuple_to_complex_literal(expr): + if len(expr) != 2: + raise TranslationError("complex literals must have " + "two entries") + + r, i = expr + + r = np.array(r)[()] + i = np.array(i)[()] + + dtype = (r.dtype.type(0) + i.dtype.type(0)) + if dtype == np.float32: + dtype = np.complex64 + else: + dtype = np.complex128 + + return dtype(float(r) + float(i)*1j) + + # {{{ expression parser class FortranExpressionParser(ExpressionParserBase): @@ -176,24 +195,31 @@ class FortranExpressionParser(ExpressionParserBase): left_exp, did_something = ExpressionParserBase.parse_postfix( self, pstate, min_precedence, 
left_exp) - if isinstance(left_exp, tuple) and min_precedence < self._PREC_FUNC_ARGS: - # this must be a complex literal - if len(left_exp) != 2: - raise TranslationError("complex literals must have " - "two entries") + return left_exp, did_something - r, i = left_exp + def parse_expression(self, pstate, min_precedence=0): + left_exp = self.parse_prefix(pstate) - dtype = (r.dtype.type(0) + i.dtype.type(0)) - if dtype == np.float32: - dtype = np.complex64 - else: - dtype = np.complex128 + did_something = True + while did_something: + did_something = False + if pstate.is_at_end(): + return left_exp - left_exp = dtype(float(r) + float(i)*1j) + result = self.parse_postfix( + pstate, min_precedence, left_exp) + left_exp, did_something = result - return left_exp, did_something + from pymbolic.parser import FinalizedTuple + if isinstance(left_exp, FinalizedTuple): + # View all tuples that survive parsing as complex literals + # "FinalizedTuple" indicates that this tuple was enclosed + # in parens. + return tuple_to_complex_literal(left_exp) + + return left_exp # }}} + # vim: foldmethod=marker diff --git a/loopy/frontend/fortran/translator.py b/loopy/frontend/fortran/translator.py index 78eddfb549c4ebabbf933abd7832a449a44dbeec..22e532c6e4bf44a4989ac6ff90d75b5c939aab3c 100644 --- a/loopy/frontend/fortran/translator.py +++ b/loopy/frontend/fortran/translator.py @@ -35,12 +35,14 @@ from islpy import dim_type from loopy.symbolic import IdentityMapper from loopy.diagnostic import LoopyError from loopy.kernel.instruction import LegacyStringInstructionTag -from pymbolic.primitives import Wildcard +from pymbolic.primitives import (Wildcard, Slice) # {{{ subscript base shifter -class SubscriptIndexBaseShifter(IdentityMapper): +class SubscriptIndexAdjuster(IdentityMapper): + """Adjust base indices of subscripts and lengths of slices.""" + def __init__(self, scope): self.scope = scope @@ -58,21 +60,63 @@ class SubscriptIndexBaseShifter(IdentityMapper): if not isinstance(subscript, tuple): subscript = (subscript,) - subscript = list(subscript) - if len(dims) != len(subscript): raise TranslationError("inconsistent number of indices " "to '%s'" % name) + new_subscript = [] for i in range(len(dims)): if len(dims[i]) == 2: - # has a base index - subscript[i] -= dims[i][0] + # has an explicit base index + base_index, end_index = dims[i] elif len(dims[i]) == 1: - # base index is 1 implicitly - subscript[i] -= 1 + base_index = 1 + end_index, = dims[i] + + sub_i = subscript[i] + if isinstance(sub_i, Slice): + start = sub_i.start + if start is None: + start = base_index + + step = sub_i.step + if step is None: + step = 1 + + stop = sub_i.stop + if stop is None: + stop = end_index + + if step == 1: + sub_i = Slice(( + start - base_index, + + # FIXME This is only correct for unit strides + stop - base_index + 1, + + step + )) + elif step == -1: + sub_i = Slice(( + start - base_index, + + # FIXME This is only correct for unit strides + stop - base_index - 1, + + step + )) + + else: + # FIXME + raise NotImplementedError("Fortran slice processing for " + "non-unit strides") + + else: + sub_i = sub_i - base_index - return expr.aggregate[self.rec(tuple(subscript))] + new_subscript.append(sub_i) + + return expr.aggregate[self.rec(tuple(new_subscript))] # }}} @@ -83,9 +127,6 @@ class Scope: def __init__(self, subprogram_name, arg_names=set()): self.subprogram_name = subprogram_name - # map name to data - self.data_statements = {} - # map first letter to type self.implicit_types = {} @@ -96,7 +137,7 @@ class Scope: 
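# Illustrative sketch (not part of the patch): the dtype-promotion rule
# used by tuple_to_complex_literal, standalone. A parenthesized pair such
# as (1., 2.) becomes complex64 only if both halves are single precision,
# and complex128 otherwise.

import numpy as np


def to_complex_literal(r, i):
    r = np.array(r)[()]
    i = np.array(i)[()]
    if (r.dtype.type(0) + i.dtype.type(0)).dtype == np.float32:
        return np.complex64(float(r) + float(i)*1j)
    return np.complex128(float(r) + float(i)*1j)


print(type(to_complex_literal(np.float32(1), np.float32(2))))  # complex64
print(type(to_complex_literal(1.0, 2.0)))                      # complex128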
self.type_map = {} # map name to data - self.data = {} + self.data_map = {} self.arg_names = arg_names @@ -185,7 +226,7 @@ class Scope: expr = submap(expr) - subshift = SubscriptIndexBaseShifter(self) + subshift = SubscriptIndexAdjuster(self) expr = subshift(expr) return expr @@ -216,11 +257,16 @@ class F2LoopyTranslator(FTreeWalkerBase): self.block_nest = [] + def add_instruction(self, insn): + scope = self.scope_stack[-1] + + scope.previous_instruction_id = insn.id + scope.instructions.append(insn) + def add_expression_instruction(self, lhs, rhs): scope = self.scope_stack[-1] - new_id = intern("insn%d" % self.insn_id_counter) - self.insn_id_counter += 1 + new_id = self.get_insn_id() from loopy.kernel.data import Assignment insn = Assignment( @@ -231,8 +277,13 @@ class F2LoopyTranslator(FTreeWalkerBase): predicates=frozenset(self.conditions), tags=tuple(self.instruction_tags)) - scope.previous_instruction_id = new_id - scope.instructions.append(insn) + self.add_instruction(insn) + + def get_insn_id(self): + new_id = intern("insn%d" % self.insn_id_counter) + self.insn_id_counter += 1 + + return new_id # {{{ map_XXX functions @@ -326,7 +377,8 @@ class F2LoopyTranslator(FTreeWalkerBase): tp = self.dtype_from_stmt(node) - for name, shape in self.parse_dimension_specs(node, node.entity_decls): + for name, shape, initializer in self.parse_dimension_specs( + node, node.entity_decls): if shape is not None: assert name not in scope.dim_map scope.dim_map[name] = shape @@ -335,6 +387,9 @@ class F2LoopyTranslator(FTreeWalkerBase): assert name not in scope.type_map scope.type_map[name] = tp + assert name not in scope.data_map + scope.data_map[name] = initializer + return [] map_Logical = map_type_decl # noqa: N815 @@ -346,7 +401,10 @@ class F2LoopyTranslator(FTreeWalkerBase): def map_Dimension(self, node): scope = self.scope_stack[-1] - for name, shape in self.parse_dimension_specs(node, node.items): + for name, shape, initializer in self.parse_dimension_specs(node, node.items): + if initializer is not None: + raise LoopyError("initializer in dimension statement") + if shape is not None: assert name not in scope.dim_map scope.dim_map[name] = shape @@ -435,7 +493,23 @@ class F2LoopyTranslator(FTreeWalkerBase): raise NotImplementedError("goto") def map_Call(self, node): - raise NotImplementedError("call") + scope = self.scope_stack[-1] + + new_id = self.get_insn_id() + + from pymbolic import var + + from loopy.kernel.data import CallInstruction + insn = CallInstruction( + (), var(node.designator)(*(scope.process_expression_for_loopy( + self.parse_expr(node, item)) for item in node.items)), + within_inames=frozenset( + scope.active_loopy_inames), + id=new_id, + predicates=frozenset(self.conditions), + tags=tuple(self.instruction_tags)) + + self.add_instruction(insn) def map_Return(self, node): raise NotImplementedError("return") @@ -443,11 +517,6 @@ class F2LoopyTranslator(FTreeWalkerBase): def map_ArithmeticIf(self, node): raise NotImplementedError("arithmetic-if") - def map_If(self, node): - raise NotImplementedError("if") - # node.expr - # node.content[0] - def realize_conditional(self, node, context_cond=None): scope = self.scope_stack[-1] @@ -474,6 +543,15 @@ class F2LoopyTranslator(FTreeWalkerBase): self.conditions.append(cond_expr) + def map_If(self, node): + self.realize_conditional(node, None) + + for c in node.content: + self.rec(c) + + self.conditions_data.pop() + self.conditions.pop() + def map_IfThen(self, node): self.block_nest.append("if") self.realize_conditional(node, None) @@ -672,6 
+750,10 @@ class F2LoopyTranslator(FTreeWalkerBase):
         for arg_name in sub.arg_names:
             dims = sub.dim_map.get(arg_name)
 
+            if sub.data_map.get(arg_name) is not None:
+                raise NotImplementedError(
+                        "initializer for argument %s" % arg_name)
+
             if dims is not None:
                 # default order is set to "F" in kernel creation below
                 kernel_data.append(
@@ -697,15 +779,22 @@
             if sub.implicit_types is None and dtype is None:
                 continue
 
+            kwargs = {}
+            if sub.data_map.get(var_name) is not None:
+                kwargs["read_only"] = True
+                kwargs["address_space"] = lp.AddressSpace.PRIVATE
+                kwargs["initializer"] = np.array(
+                        sub.data_map[var_name], dtype=dtype)
+
             kernel_data.append(
                     lp.TemporaryVariable(
                         var_name, dtype=dtype,
-                        shape=sub.get_loopy_shape(var_name)))
+                        shape=sub.get_loopy_shape(var_name),
+                        **kwargs))
 
         # }}}
 
-        from loopy.version import MOST_RECENT_LANGUAGE_VERSION
-        knl = lp.make_kernel(
+        knl = lp.make_function(
                 sub.index_sets,
                 sub.instructions,
                 kernel_data,
@@ -714,11 +803,10 @@
                 index_dtype=self.index_dtype,
                 target=self.target,
                 seq_dependencies=seq_dependencies,
-                lang_version=MOST_RECENT_LANGUAGE_VERSION
                 )
 
-        from loopy.loop import fuse_loop_domains
-        knl = fuse_loop_domains(knl)
+        from loopy.loop import merge_loop_domains
+        knl = merge_loop_domains(knl)
 
         knl = lp.fold_constants(knl)
         result.append(knl)
diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py
index c33578dc844d1a77b084d8cf83cb5009cccc489d..0dc426572f69b7a8ce16dbc97a70f874f17954c4 100644
--- a/loopy/frontend/fortran/tree.py
+++ b/loopy/frontend/fortran/tree.py
@@ -52,7 +52,9 @@ class FTreeWalkerBase:
 
     ENTITY_RE = re.compile(
             r"^(?P<name>[_0-9a-zA-Z]+)\s*"
-            r"(\((?P<shape>[-+*/0-9:a-zA-Z, \t]+)\))?$")
+            r"(\((?P<shape>[-+*/0-9:a-zA-Z, \t]+)\))?"
+            r"(\s*=\s*(?P<initializer>.+))?"
+ "$") def parse_dimension_specs(self, node, dim_decls): def parse_bounds(bounds_str): @@ -75,7 +77,31 @@ class FTreeWalkerBase: else: shape = None - yield name, shape + init_str = groups["initializer"] + if init_str: + init_str = init_str.replace("(/", "[") + init_str = init_str.replace("/)", "]") + init_expr = self.parse_expr(node, init_str) + + from numbers import Number + if isinstance(init_expr, Number): + initializer = init_expr + elif isinstance(init_expr, list): + for i, item in enumerate(init_expr): + if not isinstance(item, Number): + raise LoopyError("unexpected type of " + "item %d in initializer: %s" + % (i+1, type(init_expr).__name__)) + initializer = init_expr + + else: + raise LoopyError("unexpected type of initializer: %s" + % type(init_expr).__name__) + + else: + initializer = None + + yield name, shape, initializer def __call__(self, expr, *args, **kwargs): return self.rec(expr, *args, **kwargs) diff --git a/loopy/ipython_ext.py b/loopy/ipython_ext.py index 7f9177e0ef8430cc450cb462641b12ed1a9f9b28..a469b46489786b39516ccda58a20130de4d0e7ea 100644 --- a/loopy/ipython_ext.py +++ b/loopy/ipython_ext.py @@ -8,9 +8,7 @@ class LoopyMagics(Magics): @cell_magic def fortran_kernel(self, line, cell): result = lp.parse_fortran(cell) - - for knl in result: - self.shell.user_ns[knl.name] = knl + self.shell.user_ns["prog"] = result @cell_magic def transformed_fortran_kernel(self, line, cell): @@ -18,8 +16,7 @@ class LoopyMagics(Magics): cell, transform_code_context=self.shell.user_ns) - for knl in result: - self.shell.user_ns[knl.name] = knl + self.shell.user_ns["prog"] = result def load_ipython_extension(ip): diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index ec0be6f5c8bff088a6e5f1e661bf0048c3a79c83..8ed4d3d437ccb5216f2464d999bd5b24e530bae8 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -24,7 +24,7 @@ THE SOFTWARE. """ -from loopy.diagnostic import StaticValueFindingError +from loopy.diagnostic import StaticValueFindingError, LoopyError import islpy as isl from islpy import dim_type @@ -60,7 +60,30 @@ def dump_space(ls): # {{{ make_slab -def make_slab(space, iname, start, stop): +def make_slab(space, iname, start, stop, step=1): + """ + Returns an instance of :class:`islpy._isl.BasicSet`, which satisfies the + constraint ``start <= step*iname < stop``. + + :arg space: An instance of :class:`islpy._isl.Space`. + + :arg iname: + Either an instance of :class:`str` as a name of the ``iname`` or a + tuple of ``(iname_dt, iname_dx)`` indicating the *iname* in the space. + + :arg start: + An instance of :class:`int` or an instance of + :class:`islpy._isl.Aff` indicating the lower bound of + ``step*iname``(inclusive). + + :arg stop: + An instance of :class:`int` or an instance of + :class:`islpy._isl.Aff` indicating the upper bound of + ``step*iname``. + + :arg step: + An instance of :class:`int`. 
+ """ zero = isl.Aff.zero_on_domain(space) if isinstance(start, (isl.Aff, isl.PwAff)): @@ -89,13 +112,25 @@ def make_slab(space, iname, start, stop): iname_aff = zero.add_coefficient_val(iname_dt, iname_idx, 1) - result = (isl.BasicSet.universe(space) - # start <= iname - .add_constraint(isl.Constraint.inequality_from_aff( - iname_aff - start)) - # iname < stop - .add_constraint(isl.Constraint.inequality_from_aff( - stop-1 - iname_aff))) + if step > 0: + result = (isl.BasicSet.universe(space) + # start <= step*iname + .add_constraint(isl.Constraint.inequality_from_aff( + step*iname_aff - start)) + # step*iname < stop + .add_constraint(isl.Constraint.inequality_from_aff( + stop-1 - step*iname_aff))) + elif step < 0: + result = (isl.BasicSet.universe(space) + # start >= (-step)*iname + .add_constraint(isl.Constraint.inequality_from_aff( + step*iname_aff + start)) + # (-step)*iname > stop + .add_constraint(isl.Constraint.inequality_from_aff( + -stop-1 - step*iname_aff))) + else: + # step = 0 + raise LoopyError("0 step not allowed in make_slab.") return result @@ -427,11 +462,16 @@ def boxify(cache_manager, domain, box_inames, context): def simplify_via_aff(expr): - from loopy.symbolic import aff_from_expr, aff_to_expr, get_dependencies + from loopy.symbolic import aff_to_expr, guarded_aff_from_expr, get_dependencies + from loopy.diagnostic import ExpressionToAffineConversionError + deps = get_dependencies(expr) - return aff_to_expr(aff_from_expr( - isl.Space.create_from_names(isl.DEFAULT_CONTEXT, list(deps)), - expr)) + try: + return aff_to_expr(guarded_aff_from_expr( + isl.Space.create_from_names(isl.DEFAULT_CONTEXT, list(deps)), + expr)) + except ExpressionToAffineConversionError: + return expr def project_out(set, inames): @@ -578,7 +618,7 @@ def find_max_of_pwaff_with_params(pw_aff, n_allowed_params): def set_dim_name(obj, dt, pos, name): assert isinstance(name, str) - if isinstance(obj, isl.PwQPolynomial): + if isinstance(obj, (isl.PwQPolynomial, isl.BasicSet)): return obj.set_dim_name(dt, pos, name) elif isinstance(obj, isl.PwAff): # work around missing isl_pw_aff_set_dim_name for now. diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index f563c3d8ce995771e6a2ab9c31f776c018616761..e6c05c8782b13c6b861843f20d785baba3d7937b 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -34,11 +34,8 @@ import re from pytools import UniqueNameGenerator, generate_unique_names, natsorted -from loopy.library.function import ( - default_function_mangler, - single_arg_function_mangler) - from loopy.diagnostic import CannotBranchDomainTree, LoopyError +from loopy.tools import update_persistent_hash from loopy.diagnostic import StaticValueFindingError from loopy.kernel.data import filter_iname_tags_by_type, Iname from warnings import warn @@ -108,8 +105,9 @@ class _deprecated_KernelState_SCHEDULED: # noqa class KernelState: # noqa INITIAL = 0 - PREPROCESSED = 1 - LINEARIZED = 2 + CALLS_RESOLVED = 1 + PREPROCESSED = 2 + LINEARIZED = 3 @_deprecated_KernelState_SCHEDULED def SCHEDULED(): # pylint:disable=no-method-argument @@ -163,8 +161,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. attribute:: domains - a list of :class:`islpy.BasicSet` instances - representing the :ref:`domain-tree`. + a list of :class:`islpy.BasicSet` instances representing the + :ref:`domain-tree`. .. attribute:: instructions @@ -193,7 +191,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): :class:`loopy.TemporaryVariable` instances. - .. attribute:: function_manglers .. 
attribute:: symbol_manglers .. attribute:: substitutions @@ -259,7 +256,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): inames=None, iname_to_tags=None, substitutions=None, - function_manglers=None, symbol_manglers=[], iname_slab_increments=None, @@ -268,7 +264,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): applied_iname_rewrites=None, cache_manager=None, - index_dtype=np.int32, + index_dtype=None, options=None, state=KernelState.INITIAL, @@ -296,16 +292,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): temporary_variables = {} if substitutions is None: substitutions = {} - if function_manglers is None: - function_manglers = [ - default_function_mangler, - single_arg_function_mangler, - ] - if symbol_manglers is None: - function_manglers = [ - default_function_mangler, - single_arg_function_mangler, - ] if iname_slab_increments is None: iname_slab_increments = {} @@ -338,6 +324,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): name: inames.get(name, Iname(name, frozenset())) for name in _get_inames_from_domains(domains)} + if index_dtype is None: + index_dtype = np.int32 + # }}} # {{{ process assumptions @@ -372,6 +361,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): if state not in [ KernelState.INITIAL, + KernelState.CALLS_RESOLVED, KernelState.PREPROCESSED, KernelState.LINEARIZED, ]: @@ -415,7 +405,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): substitutions=substitutions, cache_manager=cache_manager, applied_iname_rewrites=applied_iname_rewrites, - function_manglers=function_manglers, symbol_manglers=symbol_manglers, index_dtype=index_dtype, options=options, @@ -429,51 +418,6 @@ class LoopKernel(ImmutableRecordWithoutPickling): # }}} - # {{{ function mangling - - def mangle_function(self, identifier, arg_dtypes, ast_builder=None): - if ast_builder is None: - ast_builder = self.target.get_device_ast_builder() - - manglers = ast_builder.function_manglers() + self.function_manglers - - for mangler in manglers: - mangle_result = mangler(self, identifier, arg_dtypes) - if mangle_result is not None: - from loopy.kernel.data import CallMangleInfo - if isinstance(mangle_result, CallMangleInfo): - assert len(mangle_result.arg_dtypes) == len(arg_dtypes) - return mangle_result - - assert isinstance(mangle_result, tuple) - - from warnings import warn - warn("'%s' returned a tuple instead of a CallMangleInfo instance. " - "This is deprecated." 
% mangler.__name__, - DeprecationWarning) - - if len(mangle_result) == 2: - result_dtype, target_name = mangle_result - return CallMangleInfo( - target_name=target_name, - result_dtypes=(result_dtype,), - arg_dtypes=None) - - elif len(mangle_result) == 3: - result_dtype, target_name, actual_arg_dtypes = mangle_result - return CallMangleInfo( - target_name=target_name, - result_dtypes=(result_dtype,), - arg_dtypes=actual_arg_dtypes) - - else: - raise ValueError("unexpected size of tuple returned by '%s'" - % mangler.__name__) - - return None - - # }}} - # {{{ symbol mangling def mangle_symbol(self, ast_builder, identifier): @@ -547,6 +491,21 @@ class LoopKernel(ImmutableRecordWithoutPickling): except KeyError: pass + if name in self.all_inames(): + from loopy import TemporaryVariable + return TemporaryVariable( + name=name, + dtype=self.index_dtype, + shape=()) + + try: + dtype, name = self.mangle_symbol(self.target.get_device_ast_builder(), + name) + from loopy import ValueArg + return ValueArg(name, dtype) + except TypeError: + pass + raise ValueError("nothing known about variable '%s'" % name) @property @@ -1087,21 +1046,13 @@ class LoopKernel(ImmutableRecordWithoutPickling): constants_only=True))) @memoize_method - def get_grid_sizes_for_insn_ids(self, insn_ids, ignore_auto=False): - """Return a tuple (global_size, local_size) containing a grid that - could accommodate execution of all instructions whose IDs are given - in *insn_ids*. - - :arg insn_ids: a :class:`frozenset` of instruction IDs - - *global_size* and *local_size* are :class:`islpy.PwAff` objects. + def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids, + callables_table, ignore_auto=False): + """ + Returns a tuple of (global_sizes, local_sizes), where global_sizes, + local_sizes are the grid sizes accommodating all of *insn_ids*. The grid + sizes are a dict from the axis index to the corresponding grid size. 
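# Illustrative sketch (not part of the patch): the per-axis merge of
# caller and callee grid sizes, with plain integers standing in for the
# PwAff sizes that the real code combines with .max().

def merge_grid_sizes(caller, callee):
    merged = dict(caller)
    for axis, size in callee.items():
        merged[axis] = max(merged.get(axis, size), size)
    return merged


caller_local = {0: 16}          # caller uses l.0 = 16
callee_local = {0: 32, 1: 4}    # callee needs l.0 = 32 and l.1 = 4
print(merge_grid_sizes(caller_local, callee_local))   # {0: 32, 1: 4}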
""" - - if self.overridden_get_grid_sizes_for_insn_ids: - return self.overridden_get_grid_sizes_for_insn_ids( - insn_ids, - ignore_auto=ignore_auto) - all_inames_by_insns = set() for insn_id in insn_ids: all_inames_by_insns |= self.insn_inames(insn_id) @@ -1112,9 +1063,49 @@ class LoopKernel(ImmutableRecordWithoutPickling): % (", ".join(sorted(all_inames_by_insns)), ", ".join(sorted(self.all_inames())))) + # {{{ include grid constraints due to callees + global_sizes = {} local_sizes = {} + from loopy.kernel.data import ValueArg + from loopy.kernel.instruction import CallInstruction + from loopy.kernel.function_interface import (CallableKernel, + get_kw_pos_association) + from loopy.isl_helpers import subst_into_pwaff + from loopy.symbolic import ResolvedFunction + + for insn in self.instructions: + if isinstance(insn, CallInstruction) and isinstance( + insn.expression.function, ResolvedFunction): + clbl = callables_table[insn.expression.function.name] + if isinstance(clbl, CallableKernel): + _, pos_to_kw = get_kw_pos_association(clbl.subkernel) + subst_dict = { + pos_to_kw[i]: param + for i, param in enumerate(insn.expression.parameters) + if isinstance(clbl.subkernel.arg_dict[pos_to_kw[i]], + ValueArg)} + + gsize, lsize = ( + clbl.subkernel.get_grid_sizes_for_insn_ids_as_dicts( + frozenset(insn.id + for insn in clbl.subkernel.instructions), + callables_table, ignore_auto)) + + for tgt_dict, tgt_size in [(global_sizes, gsize), + (local_sizes, lsize)]: + + for iaxis, size in tgt_size.items(): + size = subst_into_pwaff(self.assumptions.space, + size, subst_dict) + if iaxis in tgt_dict: + tgt_dict[iaxis] = tgt_dict[iaxis].max(size) + else: + tgt_dict[iaxis] = size + + # }}} + from loopy.kernel.data import ( GroupIndexTag, LocalIndexTag, AutoLocalIndexTagBase) @@ -1156,6 +1147,32 @@ class LoopKernel(ImmutableRecordWithoutPickling): tgt_dict[tag.axis] = size + return global_sizes, local_sizes + + @memoize_method + def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table, + ignore_auto=False, return_dict=False): + """Return a tuple (global_size, local_size) containing a grid that + could accommodate execution of all instructions whose IDs are given + in *insn_ids*. + + :arg insn_ids: a :class:`frozenset` of instruction IDs + + *global_size* and *local_size* are :class:`islpy.PwAff` objects. + """ + + if self.overridden_get_grid_sizes_for_insn_ids: + return self.overridden_get_grid_sizes_for_insn_ids( + insn_ids, + callables_table=callables_table, + ignore_auto=ignore_auto) + + global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts( + insn_ids, callables_table, ignore_auto=ignore_auto) + + if return_dict: + return global_sizes, local_sizes + def to_dim_tuple(size_dict, which, forced_sizes={}): forced_sizes = forced_sizes.copy() @@ -1185,7 +1202,9 @@ class LoopKernel(ImmutableRecordWithoutPickling): return (to_dim_tuple(global_sizes, "global"), to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes)) - def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, ignore_auto=False): + @memoize_method + def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, + callables_table, ignore_auto=False, return_dict=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of all instructions whose IDs are given in *insn_ids*. 
@@ -1196,7 +1215,15 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ grid_size, group_size = self.get_grid_sizes_for_insn_ids( - insn_ids, ignore_auto) + insn_ids, callables_table, ignore_auto, return_dict) + + if return_dict: + def dict_to_exprs(d): + from loopy.symbolic import pw_aff_to_expr + return {k: pw_aff_to_expr(v, int_ok=True) + for k, v in d.items()} + + return dict_to_exprs(grid_size), dict_to_exprs(group_size) def tup_to_exprs(tup): from loopy.symbolic import pw_aff_to_expr @@ -1204,7 +1231,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): return tup_to_exprs(grid_size), tup_to_exprs(group_size) - def get_grid_size_upper_bounds(self, ignore_auto=False): + def get_grid_size_upper_bounds(self, callables_table, ignore_auto=False, + return_dict=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. @@ -1212,18 +1240,19 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ return self.get_grid_sizes_for_insn_ids( frozenset(insn.id for insn in self.instructions), - ignore_auto=ignore_auto) + callables_table, ignore_auto=ignore_auto) - def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False): + def get_grid_size_upper_bounds_as_exprs(self, callables_table, + ignore_auto=False, return_dict=False): """Return a tuple (global_size, local_size) containing a grid that could accommodate execution of *all* instructions in the kernel. *global_size* and *local_size* are :mod:`pymbolic` expressions """ - return self.get_grid_sizes_for_insn_ids_as_exprs( frozenset(insn.id for insn in self.instructions), - ignore_auto=ignore_auto) + callables_table, ignore_auto=ignore_auto, + return_dict=return_dict) # }}} @@ -1454,14 +1483,11 @@ class LoopKernel(ImmutableRecordWithoutPickling): """ Execute the :class:`LoopKernel`. """ - key = self.target.get_kernel_executor_cache_key(*args, **kwargs) - try: - kex = self._kernel_executor_cache[key] - except KeyError: - kex = self.target.get_kernel_executor(self, *args, **kwargs) - self._kernel_executor_cache[key] = kex - - return kex(*args, **kwargs) + warn("Calling a LoopKernel is deprecated, call a Program " + "instead.", DeprecationWarning, stacklevel=2) + from loopy.program import make_program + program = make_program(self) + return program(*args, **kwargs) # }}} @@ -1558,18 +1584,10 @@ class LoopKernel(ImmutableRecordWithoutPickling): # resolve hash conflicts. "preamble_generators", - "function_manglers", "symbol_manglers", ) - def update_persistent_hash(self, key_hash, key_builder): - """Custom hash computation function for use with - :class:`pytools.persistent_dict.PersistentDict`. - - Only works in conjunction with :class:`loopy.tools.KeyBuilder`. - """ - for field_name in self.hash_fields: - key_builder.rec(key_hash, getattr(self, field_name)) + update_persistent_hash = update_persistent_hash @memoize_method def __hash__(self): diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 9fd166ab8f15bdc97006c94c7d03977b64c08292..8fdcb1386ecd2873d8f511095cf1914e0dff292b 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -88,6 +88,9 @@ class _StrideArrayDimTagBase(ArrayDimImplementationTag): :class:`ComputedStrideArrayDimTag` instances may occur. 
""" + def depends_on(self): + raise NotImplementedError() + class FixedStrideArrayDimTag(_StrideArrayDimTagBase): """An arg dimension implementation tag for a fixed (potentially @@ -145,6 +148,14 @@ class FixedStrideArrayDimTag(_StrideArrayDimTagBase): return self.copy(stride=mapper(self.stride)) + def depends_on(self): + from loopy.kernel.data import auto + from loopy.symbolic import DependencyMapper + if self.stride is auto: + return frozenset() + + return DependencyMapper(composite_leaves=auto)(self.stride) + class ComputedStrideArrayDimTag(_StrideArrayDimTagBase): """ @@ -179,6 +190,9 @@ class ComputedStrideArrayDimTag(_StrideArrayDimTagBase): def map_expr(self, mapper): return self + def depends_on(self): + return frozenset() + class SeparateArrayArrayDimTag(ArrayDimImplementationTag): def stringify(self, include_target_axis): @@ -190,6 +204,9 @@ class SeparateArrayArrayDimTag(ArrayDimImplementationTag): def map_expr(self, mapper): return self + def depends_on(self): + return frozenset() + class VectorArrayDimTag(ArrayDimImplementationTag): def stringify(self, include_target_axis): @@ -201,6 +218,9 @@ class VectorArrayDimTag(ArrayDimImplementationTag): def map_expr(self, mapper): return self + def depends_on(self): + return frozenset() + NESTING_LEVEL_RE = re.compile(r"^N([-0-9]+)(?::(.*)|)$") PADDED_STRIDE_TAG_RE = re.compile(r"^([a-zA-Z]*)\(pad=(.*)\)$") @@ -864,6 +884,7 @@ class ArrayBase(ImmutableRecord, Taggable): order=order, alignment=alignment, for_atomic=for_atomic, + target=target, tags=tags, **kwargs) diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py index a9129cd9317366fd10a9c1d44cf2410e90ba554d..8a2e9cde1b670936b6545a88ff89c412463f78fd 100644 --- a/loopy/kernel/creation.py +++ b/loopy/kernel/creation.py @@ -23,16 +23,18 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" - import numpy as np from pymbolic.mapper import CSECachingMapperMixin +from pymbolic.primitives import Slice, Variable, Subscript, Call from loopy.tools import intern_frozenset_of_ids, Optional -from loopy.symbolic import IdentityMapper, WalkMapper +from loopy.symbolic import ( + IdentityMapper, WalkMapper, SubArrayRef) from loopy.kernel.data import ( InstructionBase, MultiAssignmentBase, Assignment, - SubstitutionRule) + SubstitutionRule, AddressSpace, ValueArg) +from loopy.program import iterate_over_kernels_if_given_program from loopy.diagnostic import LoopyError, warn_with_kernel import islpy as isl from islpy import dim_type @@ -530,9 +532,11 @@ def parse_insn(groups, insn_options): assignee_names.append(inner_lhs_i.name) elif isinstance(inner_lhs_i, (Subscript, LinearSubscript)): assignee_names.append(inner_lhs_i.aggregate.name) + elif isinstance(inner_lhs_i, SubArrayRef): + assignee_names.append(inner_lhs_i.subscript.aggregate.name) else: raise LoopyError("left hand side of assignment '%s' must " - "be variable or subscript" % (lhs_i,)) + "be variable, subscript or a SubArrayRef" % (lhs_i,)) new_lhs.append(lhs_i) @@ -1080,6 +1084,9 @@ def parse_domains(domains, defines): result.append(dom) + if result == []: + result = [isl.BasicSet("{:}")] + return result # }}} @@ -1168,8 +1175,7 @@ class ArgumentGuesser: def make_new_arg(self, arg_name): arg_name = arg_name.strip() import loopy as lp - - from loopy.kernel.data import ValueArg, ArrayArg, AddressSpace + from loopy.kernel.data import ValueArg, ArrayArg if arg_name in self.all_params: return ValueArg(arg_name) @@ -1720,7 +1726,7 @@ def _is_wildcard(s): def _resolve_dependencies(what, knl, insn, deps): - from loopy import find_instructions + from loopy.transform.instruction import find_instructions from loopy.match import MatchExpressionBase new_deps = [] @@ -1814,6 +1820,7 @@ def add_inferred_inames(knl): # {{{ apply single-writer heuristic +@iterate_over_kernels_if_given_program def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): logger.debug("%s: default deps" % kernel.name) @@ -1882,9 +1889,211 @@ def apply_single_writer_depencency_heuristic(kernel, warn_if_used=True): # }}} +# {{{ slice to sub array ref + +def normalize_slice_params(slice, dimension_length): + """ + Returns the normalized slice parameters ``(start, stop, step)``. + + :arg slice: An instance of :class:`pymbolic.primitives.Slice`. + :arg dimension_length: Length of the axis being sliced. + """ + from pymbolic.primitives import Slice + assert isinstance(slice, Slice) + start, stop, step = slice.start, slice.stop, slice.step + + # {{{ defaulting parameters + + if step is None: + step = 1 + + if step == 0: + raise LoopyError("Slice cannot have 0 step size.") + + if start is None: + if step > 0: + start = 0 + else: + start = dimension_length-1 + + if stop is None: + if step > 0: + stop = dimension_length + else: + stop = -1 + + # }}} + + return start, stop, step + + +class SliceToInameReplacer(IdentityMapper): + """ + Converts slices to instances of :class:`loopy.symbolic.SubArrayRef`. + + .. attribute:: var_name_gen + + Variable name generator, in order to generate unique inames within the + kernel domain. + + .. attribute:: knl + + An instance of :class:`loopy.LoopKernel` + + .. 
attribute:: subarray_ref_bounds
+
+        A :class:`list` (one entry for each :class:`SubArrayRef` to be created)
+        of :class:`dict` instances to store the slices encountered in the
+        expressions as a mapping from ``iname`` to a tuple of ``(start, stop,
+        step)``, which describes the boxy (i.e. affine) constraints imposed on
+        the ``iname`` by the corresponding slice notation it is intended to
+        replace.
+    """
+    def __init__(self, knl):
+        self.subarray_ref_bounds = []
+        self.knl = knl
+        self.var_name_gen = knl.get_var_name_generator()
+
+    def map_subscript(self, expr):
+        subscript_iname_bounds = {}
+
+        new_index = []
+        swept_inames = []
+        for i, index in enumerate(expr.index_tuple):
+            if isinstance(index, Slice):
+                unique_var_name = self.var_name_gen(based_on="i")
+                if expr.aggregate.name in self.knl.arg_dict:
+                    shape = self.knl.arg_dict[expr.aggregate.name].shape
+                else:
+                    assert expr.aggregate.name in self.knl.temporary_variables
+                    shape = self.knl.temporary_variables[
+                            expr.aggregate.name].shape
+                if shape is None or shape[i] is None:
+                    raise LoopyError("Slice notation is only supported for "
+                            "variables whose shapes are known at creation time "
+                            "-- maybe add the shape for '{}'.".format(
+                                expr.aggregate.name))
+
+                domain_length = shape[i]
+                start, stop, step = normalize_slice_params(index, domain_length)
+                subscript_iname_bounds[unique_var_name] = (start, stop, step)
+                new_index.append(start+step*Variable(unique_var_name))
+                swept_inames.append(Variable(unique_var_name))
+            else:
+                new_index.append(index)
+
+        if swept_inames:
+            self.subarray_ref_bounds.append(subscript_iname_bounds)
+            result = SubArrayRef(tuple(swept_inames), Subscript(
+                self.rec(expr.aggregate),
+                self.rec(tuple(new_index))))
+        else:
+            result = super().map_subscript(expr)
+
+        return result
+
+    def map_call(self, expr):
+        from pymbolic.primitives import CallWithKwargs
+        new_expr = self.rec(CallWithKwargs(expr.function, expr.parameters, {}))
+        return Call(new_expr.function, new_expr.parameters)
+
+    def map_call_with_kwargs(self, expr):
+        def _convert_array_to_slices(arg):
+            # FIXME: We do not support something like A[1] pointing to the
+            # second row of a 3 x 3 array 'A'.
+            if isinstance(arg, Variable):
+                from loopy.kernel.data import auto
+                if arg.name in self.knl.temporary_variables:
+                    if self.knl.temporary_variables[arg.name].shape in [
+                            auto, None]:
+                        # do not convert arrays with unknown shapes to slices.
+                        # (If an array of unknown shape was passed in error,
+                        # this will be caught and raised during preprocessing.)
+                        array_arg_shape = ()
+                    else:
+                        array_arg_shape = (
+                                self.knl.temporary_variables[arg.name].shape)
+                elif arg.name in self.knl.arg_dict:
+                    if isinstance(self.knl.arg_dict[arg.name], ValueArg):
+                        array_arg_shape = ()
+                    else:
+                        if self.knl.arg_dict[arg.name].shape in [
+                                auto, None]:
+                            # do not convert arrays with unknown shapes to slices.
+                            # (If an array of unknown shape was passed in error,
+                            # this will be caught and raised during
+                            # preprocessing.)
+ array_arg_shape = () + else: + array_arg_shape = ( + self.knl.arg_dict[arg.name].shape) + else: + assert arg.name in self.knl.all_inames() + array_arg_shape = () + + if array_arg_shape != (): + return Subscript(arg, tuple(Slice(()) + for _ in array_arg_shape)) + return arg + + from pymbolic.primitives import CallWithKwargs + return CallWithKwargs(expr.function, + tuple(self.rec(_convert_array_to_slices(par)) + for par in expr.parameters), + {kw: self.rec(_convert_array_to_slices(par)) + for kw, par in expr.kw_parameters.items()}) + + def get_iname_domain_as_isl_set(self): + """ + Returns the extra domain constraints imposed by the slice inames, + recorded in :attr:`iname_domains`. + """ + subarray_ref_domains = [] + for sar_bounds in self.subarray_ref_bounds: + ctx = self.knl.isl_context + space = isl.Space.create_from_names(ctx, + set=list(sar_bounds.keys())) + from loopy.symbolic import get_dependencies + args_as_params_for_domains = set() + for slice_ in sar_bounds.values(): + args_as_params_for_domains |= get_dependencies(slice_) + + space = space.add_dims(dim_type.param, len(args_as_params_for_domains)) + for i, arg in enumerate(args_as_params_for_domains): + space = space.set_dim_name(dim_type.param, i, arg) + + iname_set = isl.BasicSet.universe(space) + + from loopy.isl_helpers import make_slab + for iname, (start, stop, step) in sar_bounds.items(): + iname_set = iname_set & make_slab(space, iname, start, stop, step) + + subarray_ref_domains.append(iname_set) + + return subarray_ref_domains + + +def realize_slices_array_inputs_as_sub_array_refs(kernel): + """ + Returns a kernel with the instances of :class:`pymbolic.primitives.Slice` + encountered in expressions replaced as `loopy.symbolic.SubArrayRef`. + """ + slice_replacer = SliceToInameReplacer(kernel) + new_insns = [insn.with_transformed_expressions(slice_replacer) + for insn in kernel.instructions] + + return kernel.copy( + domains=( + kernel.domains + + slice_replacer.get_iname_domain_as_isl_set()), + instructions=new_insns) + +# }}} + + # {{{ kernel creation top-level -def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): +def make_function(domains, instructions, kernel_data=["..."], **kwargs): """User-facing kernel creation entrypoint. :arg domains: @@ -2047,7 +2256,11 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): # This *is* gross. But it seems like the right thing interface-wise. import inspect - caller_globals = inspect.currentframe().f_back.f_globals + if inspect.currentframe().f_back.f_code.co_name == "make_kernel": + # if caller is "make_kernel", read globals from make_kernel's caller + caller_globals = inspect.currentframe().f_back.f_back.f_globals + else: + caller_globals = inspect.currentframe().f_back.f_globals for ver_sym in LANGUAGE_VERSION_SYMBOLS: try: @@ -2064,7 +2277,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): from loopy.version import ( MOST_RECENT_LANGUAGE_VERSION, FALLBACK_LANGUAGE_VERSION) - warn("'lang_version' was not passed to make_kernel(). " + warn("'lang_version' was not passed to make_function(). " "To avoid this warning, pass " "lang_version={ver} in this invocation. 
" "(Or say 'from loopy.version import " @@ -2180,6 +2393,10 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_nonexistent_iname_deps(knl) knl = create_temporaries(knl, default_order) + + # convert slices to iname domains + knl = realize_slices_array_inputs_as_sub_array_refs(knl) + # ------------------------------------------------------------------------- # Ordering dependency: # ------------------------------------------------------------------------- @@ -2217,15 +2434,25 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs): check_for_duplicate_names(knl) check_written_variable_names(knl) + from loopy.kernel.tools import infer_args_are_input_output + knl = infer_args_are_input_output(knl) + from loopy.preprocess import prepare_for_caching knl = prepare_for_caching(knl) creation_plog.done() - from loopy.kernel.tools import infer_arg_is_output_only - knl = infer_arg_is_output_only(knl) + from loopy.program import make_program + return make_program(knl) + + +def make_kernel(*args, **kwargs): + tunit = make_function(*args, **kwargs) + name, = [name for name in tunit.callables_table] + return tunit.with_entrypoints(name) + - return knl +make_kernel.__doc__ = make_function.__doc__ # }}} diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index 620211cf29b464e297119ede597bb1abadaff193..377d13e61218df92ee4236988ab6d31af9a0289e 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -363,6 +363,8 @@ class KernelArgument(ImmutableRecord): dtype = None kwargs["dtype"] = dtype + kwargs["is_output"] = kwargs.pop("is_output", None) + kwargs["is_input"] = kwargs.pop("is_input", None) ImmutableRecord.__init__(self, **kwargs) @@ -375,21 +377,39 @@ class ArrayArg(ArrayBase, KernelArgument): An attribute of :class:`AddressSpace` defining the address space in which the array resides. - .. attribute:: is_output_only + .. attribute:: is_output - An instance of :class:`bool`. If set to *True*, recorded to be - returned from the kernel. + An instance of :class:`bool`. If set to *True*, the array is used to + return information to the caller. If set to *False*, the callee does not + write to the array during a call. + + .. attribute:: is_input + + An instance of :class:`bool`. If set to *True*, expected to be provided + by the caller. If *False*, the callee does not depend on the array + at kernel entry. """) allowed_extra_kwargs = [ "address_space", - "is_output_only", + "is_output", + "is_input", "tags"] def __init__(self, *args, **kwargs): if "address_space" not in kwargs: raise TypeError("'address_space' must be specified") - kwargs["is_output_only"] = kwargs.pop("is_output_only", False) + + is_output_only = kwargs.pop("is_output_only", None) + if is_output_only is not None: + warn("'is_output_only' is deprecated. Use 'is_output', 'is_input'" + " instead.", DeprecationWarning, stacklevel=2) + kwargs["is_output"] = is_output_only + kwargs["is_input"] = not is_output_only + else: + kwargs["is_output"] = kwargs.pop("is_output", None) + kwargs["is_input"] = kwargs.pop("is_input", None) + super().__init__(*args, **kwargs) min_target_axes = 0 @@ -416,7 +436,8 @@ class ArrayArg(ArrayBase, KernelArgument): """ super().update_persistent_hash(key_hash, key_builder) key_builder.rec(key_hash, self.address_space) - key_builder.rec(key_hash, self.is_output_only) + key_builder.rec(key_hash, self.is_output) + key_builder.rec(key_hash, self.is_input) # Making this a function prevents incorrect use in isinstance. 
@@ -433,6 +454,17 @@ def GlobalArg(*args, **kwargs): class ConstantArg(ArrayBase, KernelArgument): __doc__ = ArrayBase.__doc__ + + def __init__(self, *args, **kwargs): + if kwargs.pop("address_space", AddressSpace.GLOBAL) != AddressSpace.GLOBAL: + raise LoopyError("'address_space' for ConstantArg must be GLOBAL.") + super().__init__(*args, **kwargs) + + # Constant Arg cannot be an output + is_output = False + is_input = True + address_space = AddressSpace.GLOBAL + min_target_axes = 0 max_target_axes = 1 @@ -466,7 +498,7 @@ class ImageArg(ArrayBase, KernelArgument): class ValueArg(KernelArgument, Taggable): def __init__(self, name, dtype=None, approximately=1000, target=None, - is_output_only=False, tags=None): + is_output=False, is_input=True, tags=None): """ :arg tags: A an instance of or Iterable of instances of :class:`pytools.tag.Tag` intended for consumption by an @@ -477,7 +509,9 @@ class ValueArg(KernelArgument, Taggable): dtype=dtype, approximately=approximately, target=target, - is_output_only=is_output_only, tags=tags) + is_output=is_output, + is_input=is_input, + tags=tags) def __str__(self): import loopy as lp diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py new file mode 100644 index 0000000000000000000000000000000000000000..6779a1bc75e50a56d2efa2f4a5dbaeb2e9b59e21 --- /dev/null +++ b/loopy/kernel/function_interface.py @@ -0,0 +1,936 @@ +__copyright__ = "Copyright (C) 2018 Andreas Kloeckner, Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +from pytools import ImmutableRecord +from loopy.diagnostic import LoopyError + +from loopy.tools import update_persistent_hash +from loopy.kernel import LoopKernel +from loopy.kernel.array import ArrayBase +from loopy.kernel.data import ValueArg, ArrayArg +from loopy.symbolic import DependencyMapper, WalkMapper + +__doc__ = """ + +.. currentmodule:: loopy + +.. autoclass:: ValueArgDescriptor +.. autoclass:: ArrayArgDescriptor +.. autoclass:: InKernelCallable +.. autoclass:: CallableKernel +.. autoclass:: ScalarCallable + +""" + + +# {{{ argument descriptors + +class ValueArgDescriptor(ImmutableRecord): + hash_fields = () + + def map_expr(self, subst_mapper): + return self.copy() + + def depends_on(self): + return frozenset() + + update_persistent_hash = update_persistent_hash + + +class ArrayArgDescriptor(ImmutableRecord): + """ + Records information about an array argument to an in-kernel callable. 
To be + passed to and returned from + :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`, used for + matching shape and address space of caller and callee kernels. + + ..attribute:: shape + + Shape of the array. + + .. attribute:: address_space + + An attribute of :class:`loopy.kernel.data.AddressSpace`. + + .. attribute:: dim_tags + + A tuple of instances of + :class:`loopy.kernel.array.ArrayDimImplementationTag` + + .. automethod:: map_expr + .. automethod:: depends_on + """ + + fields = {"shape", "address_space", "dim_tags"} + + def __init__(self, shape, address_space, dim_tags): + + # {{{ sanity checks + + from loopy.kernel.array import ArrayDimImplementationTag + from loopy.kernel.data import auto + + assert isinstance(shape, tuple) or shape in [None, auto] + assert isinstance(dim_tags, tuple) or dim_tags is None + + if dim_tags: + # FIXME at least vector dim tags should be supported + assert all(isinstance(dim_tag, ArrayDimImplementationTag) for dim_tag in + dim_tags) + + # }}} + + super().__init__( + shape=shape, + address_space=address_space, + dim_tags=dim_tags) + + def map_expr(self, f): + """ + Returns an instance of :class:`ArrayArgDescriptor` with its shapes, strides, + mapped by *f*. + """ + if self.shape is not None: + new_shape = tuple(f(axis_len) for axis_len in self.shape) + else: + new_shape = None + + if self.dim_tags is not None: + new_dim_tags = tuple(dim_tag.map_expr(f) for dim_tag in self.dim_tags) + else: + new_dim_tags = None + + return self.copy(shape=new_shape, dim_tags=new_dim_tags) + + def depends_on(self): + """ + Returns class:`frozenset` of all the variable names the + :class:`ArrayArgDescriptor` depends on. + """ + from loopy.kernel.data import auto + result = set() + + if self.shape: + dep_mapper = DependencyMapper(composite_leaves=False) + for axis_len in self.shape: + if axis_len not in [None, auto]: + result |= dep_mapper(axis_len) + + if self.dim_tags: + for dim_tag in self.dim_tags: + result |= dim_tag.depends_on() + + return frozenset(var.name for var in result) + + def update_persistent_hash(self, key_hash, key_builder): + key_builder.update_for_pymbolic_expression(key_hash, self.shape) + key_builder.rec(key_hash, self.address_space) + key_builder.rec(key_hash, self.dim_tags) + + +class ExpressionIsScalarChecker(WalkMapper): + def __init__(self, kernel): + self.kernel = kernel + + def map_sub_array_ref(self, expr): + raise LoopyError("Sub-array refs can only be used as call's parameters" + f" or assignees. 
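For concreteness, a descriptor for a ``16 x 16`` row-major array residing in
global memory could be built as follows (a sketch; the shape and stride values
are illustrative)::

    from loopy.kernel.array import FixedStrideArrayDimTag
    from loopy.kernel.data import AddressSpace
    from loopy.kernel.function_interface import ArrayArgDescriptor

    descr = ArrayArgDescriptor(
            shape=(16, 16),
            address_space=AddressSpace.GLOBAL,
            dim_tags=(FixedStrideArrayDimTag(16), FixedStrideArrayDimTag(1)))

    # no kernel variables occur in the shape or the strides
    assert descr.depends_on() == frozenset()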
+
+class ExpressionIsScalarChecker(WalkMapper):
+    def __init__(self, kernel):
+        self.kernel = kernel
+
+    def map_sub_array_ref(self, expr):
+        raise LoopyError("Sub-array refs can only be used as a call's parameters"
+                f" or assignees. '{expr}' violates this.")
+
+    def map_call(self, expr):
+        for child in expr.parameters:
+            self.rec(child)
+
+    def map_call_with_kwargs(self, expr):
+        for child in expr.parameters + tuple(expr.kw_parameters.values()):
+            self.rec(child)
+
+    def map_subscript(self, expr):
+        for child in expr.index_tuple:
+            self.rec(child)
+
+    def map_variable(self, expr):
+        from loopy.kernel.data import TemporaryVariable, ArrayArg, auto
+        if expr.name in self.kernel.all_inames():
+            # inames are scalar
+            return
+
+        var = self.kernel.arg_dict.get(expr.name, None) or (
+                self.kernel.temporary_variables.get(expr.name, None))
+
+        if var is not None:
+            if isinstance(var, (ArrayArg, TemporaryVariable)) and (
+                    var.shape != () and var.shape is not auto):
+                raise LoopyError("Array regions can only be passed as "
+                        "sub-array refs.")
+
+    def map_slice(self, expr):
+        raise LoopyError("Array regions can only be passed as sub-array refs.")
+
+
+def get_arg_descriptor_for_expression(kernel, expr):
+    """
+    :returns: a :class:`ArrayArgDescriptor` or a :class:`ValueArgDescriptor`
+        describing the argument expression *expr* which occurs
+        in a call in the code of *kernel*.
+    """
+    from loopy.symbolic import (SubArrayRef, pw_aff_to_expr,
+            SweptInameStrideCollector)
+    from loopy.kernel.data import TemporaryVariable, ArrayArg
+
+    if isinstance(expr, SubArrayRef):
+        name = expr.subscript.aggregate.name
+        arg = kernel.get_var_descriptor(name)
+
+        if not isinstance(arg, (TemporaryVariable, ArrayArg)):
+            raise LoopyError("unsupported argument type "
+                    "'%s' of '%s' in call statement"
+                    % (type(arg).__name__, expr.name))
+
+        aspace = arg.address_space
+
+        from loopy.kernel.array import FixedStrideArrayDimTag as DimTag
+        sub_dim_tags = []
+        sub_shape = []
+
+        # FIXME This blindly assumes that dim_tag has a stride and
+        # will not work for non-stride dim tags (e.g. vec or sep).
+
+        # (AK) FIXME: This will almost always be nonlinear--when does this
+        # actually help? Maybe remove this?
+        # (KK) Reply: This helps in identifying identities like
+        # "2*(i//2) + i%2" := "i"
+        # See the kernel in
+        # test_callables.py::test_shape_translation_through_sub_array_refs
+
+        from loopy.symbolic import simplify_using_aff
+        linearized_index = simplify_using_aff(
+                kernel,
+                sum(dim_tag.stride*iname for dim_tag, iname in
+                    zip(arg.dim_tags, expr.subscript.index_tuple)))
+
+        strides_as_dict = SweptInameStrideCollector(
+                tuple(iname.name for iname in expr.swept_inames)
+                )(linearized_index)
+        sub_dim_tags = tuple(
+                # Not all swept inames necessarily occur in the expression.
+                DimTag(strides_as_dict.get(iname, 0))
+                for iname in expr.swept_inames)
+        sub_shape = tuple(
+                pw_aff_to_expr(
+                    kernel.get_iname_bounds(iname.name).upper_bound_pw_aff
+                    - kernel.get_iname_bounds(iname.name).lower_bound_pw_aff)+1
+                for iname in expr.swept_inames)
+
+        return ArrayArgDescriptor(
+                address_space=aspace,
+                dim_tags=sub_dim_tags,
+                shape=sub_shape)
+    else:
+        ExpressionIsScalarChecker(kernel)(expr)
+        return ValueArgDescriptor()
+
+# }}}
+
+
+# {{{ helper function for in-kernel callables
+
+def get_kw_pos_association(kernel):
+    """
+    Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in
+    *kernel*.
+    """
+    kw_to_pos = {}
+    pos_to_kw = {}
+
+    read_count = 0
+    write_count = -1
+
+    for arg in kernel.args:
+        if arg.is_output:
+            kw_to_pos[arg.name] = write_count
+            pos_to_kw[write_count] = arg.name
+            write_count -= 1
+        if arg.is_input:
+            # if an argument is both input and output, then kw_to_pos is
+            # overwritten with its expected position in the parameters
+            kw_to_pos[arg.name] = read_count
+            pos_to_kw[read_count] = arg.name
+            read_count += 1
+
+    return kw_to_pos, pos_to_kw
+
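For example, for a callee kernel with arguments ``a`` (input), ``b`` (input)
and ``out`` (output only), in that order, the two maps come out as follows (a
sketch; ``callee_knl`` and the argument names are illustrative)::

    kw_to_pos, pos_to_kw = get_kw_pos_association(callee_knl)

    assert kw_to_pos == {"a": 0, "b": 1, "out": -1}
    assert pos_to_kw == {0: "a", 1: "b", -1: "out"}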
+ """ + kw_to_pos = {} + pos_to_kw = {} + + read_count = 0 + write_count = -1 + + for arg in kernel.args: + if arg.is_output: + kw_to_pos[arg.name] = write_count + pos_to_kw[write_count] = arg.name + write_count -= 1 + if arg.is_input: + # if an argument is both input and output then kw_to_pos is + # overwritten with its expected position in the parameters + kw_to_pos[arg.name] = read_count + pos_to_kw[read_count] = arg.name + read_count += 1 + + return kw_to_pos, pos_to_kw + + +class GridOverrideForCalleeKernel(ImmutableRecord): + """ + Helper class to set the + :attr:`loopy.kernel.LoopKernel.override_get_grid_size_for_insn_ids` of the + callee kernels. Refer to + :meth:`loopy.kernel.function_interface.GridOverrideForCalleeKernel.__call__`, + :meth:`loopy.kernel.function_interface.CallbleKernel.with_hw_axes_sizes`. + + .. attribute:: global_size + + The global work group size that to be set in the callee kernel. + + .. attribute:: local_size + + The local work group size that has to be set in the callee kernel. + + .. note:: + + This class acts as a pseudo-callable and its significance lies in + solving picklability issues. + """ + fields = {"local_size", "global_size"} + + def __init__(self, global_size, local_size): + self.global_size = global_size + self.local_size = local_size + + def __call__(self, insn_ids, callables_table, ignore_auto=True): + return self.global_size, self.local_size + +# }}} + + +# {{{ template class + +class InKernelCallable(ImmutableRecord): + """ + An abstract interface to define a callable encountered in a kernel. + + .. attribute:: name + + The name of the callable which can be encountered within expressions in + a kernel. + + .. attribute:: arg_id_to_dtype + + A mapping which indicates the arguments types and result types of the + callable. + + .. attribute:: arg_id_to_descr + + A mapping which gives indicates the argument shape and ``dim_tags`` it + would be responsible for generating code. + + .. note:: + - "``arg_id`` can either be an instance of :class:`int` integer + corresponding to the position of the argument or an instance of + :class:`str` corresponding to the name of keyword argument accepted + by the function. + + - Negative "arg_id" values ``-i`` in the mapping attributes indicate + return value with (0-based) index *i*. + + .. automethod:: __init__ + .. automethod:: with_types + .. automethod:: with_descrs + .. automethod:: with_target + .. automethod:: with_hw_axes_sizes + .. automethod:: generate_preambles + .. automethod:: emit_call + .. automethod:: emit_call_insn + .. automethod:: is_ready_for_codegen + """ + + fields = {"arg_id_to_dtype", "arg_id_to_descr"} + init_arg_names = ("arg_id_to_dtype", "arg_id_to_descr") + + def __init__(self, arg_id_to_dtype=None, arg_id_to_descr=None): + + super().__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + def __getinitargs__(self): + return (self.arg_id_to_dtype, self.arg_id_to_descr) + + update_persistent_hash = update_persistent_hash + + def with_types(self, arg_id_to_dtype, callables_table): + """ + :arg arg_id_to_type: a mapping from argument identifiers + (integers for positional arguments, names for keyword + arguments) to :class:`loopy.types.LoopyType` instances. + Unspecified/unknown types are not represented in *arg_id_to_type*. + + Return values are denoted by negative integers, with the + first returned value identified as *-1*. 
+
+# {{{ template class
+
+class InKernelCallable(ImmutableRecord):
+    """
+    An abstract interface to define a callable encountered in a kernel.
+
+    .. attribute:: name
+
+        The name of the callable which can be encountered within expressions in
+        a kernel.
+
+    .. attribute:: arg_id_to_dtype
+
+        A mapping which indicates the argument types and result types of the
+        callable.
+
+    .. attribute:: arg_id_to_descr
+
+        A mapping which indicates the argument shapes and ``dim_tags`` the
+        callable is responsible for generating code for.
+
+    .. note::
+
+        - ``arg_id`` can either be an instance of :class:`int`, corresponding
+          to the position of the argument, or an instance of :class:`str`,
+          corresponding to the name of the keyword argument accepted by the
+          function.
+
+        - Negative ``arg_id`` values ``-i`` in the mapping attributes indicate
+          the return value with (0-based) index *i*.
+
+    .. automethod:: __init__
+    .. automethod:: with_types
+    .. automethod:: with_descrs
+    .. automethod:: with_target
+    .. automethod:: with_hw_axes_sizes
+    .. automethod:: generate_preambles
+    .. automethod:: emit_call
+    .. automethod:: emit_call_insn
+    .. automethod:: is_ready_for_codegen
+    """
+
+    fields = {"arg_id_to_dtype", "arg_id_to_descr"}
+    init_arg_names = ("arg_id_to_dtype", "arg_id_to_descr")
+
+    def __init__(self, arg_id_to_dtype=None, arg_id_to_descr=None):
+
+        super().__init__(
+                arg_id_to_dtype=arg_id_to_dtype,
+                arg_id_to_descr=arg_id_to_descr)
+
+    def __getinitargs__(self):
+        return (self.arg_id_to_dtype, self.arg_id_to_descr)
+
+    update_persistent_hash = update_persistent_hash
+
+    def with_types(self, arg_id_to_dtype, callables_table):
+        """
+        :arg arg_id_to_dtype: a mapping from argument identifiers
+            (integers for positional arguments, names for keyword
+            arguments) to :class:`loopy.types.LoopyType` instances.
+            Unspecified/unknown types are not represented in *arg_id_to_dtype*.
+
+            Return values are denoted by negative integers, with the
+            first returned value identified as *-1*.
+
+        :returns: a tuple ``(new_self, arg_id_to_dtype)``, where *new_self* is a
+            new :class:`InKernelCallable` specialized for the given types,
+            and *arg_id_to_dtype* is a mapping of the same form as the
+            argument above, however it may have more information present.
+            Any argument information exists both by its positional and
+            its keyword identifier.
+        """
+
+        raise NotImplementedError()
+
+    def with_descrs(self, arg_id_to_descr, callables_table):
+        """
+        :arg arg_id_to_descr: a mapping from argument identifiers (integers for
+            positional arguments, names for keyword arguments) to
+            :class:`loopy.ArrayArgDescriptor` instances. Unspecified/unknown
+            descriptors are not represented in *arg_id_to_descr*.
+
+            All the expressions in *arg_id_to_descr* must have variables that
+            belong to the callable's namespace.
+
+            Return values are denoted by negative integers, with the
+            first returned value identified as *-1*.
+
+        :returns: a copy of *self* which is a new instance of
+            :class:`InKernelCallable` specialized for the given descriptors, and
+            *arg_id_to_descr* is a mapping of the same form as the argument above,
+            however it may have more information present. Any argument information
+            exists both by its positional and its keyword identifier.
+        """
+
+        raise NotImplementedError()
+
+    def with_target(self, target):
+        """
+        Returns a copy of *self* with all the ``dtypes`` in
+        ``in_knl_callable.arg_id_to_dtype`` associated with the *target*. Refer
+        to :meth:`loopy.types.LoopyType.with_target`.
+
+        :arg target: An instance of :class:`loopy.target.TargetBase`.
+        """
+
+        if target is None:
+            raise LoopyError("target cannot be None for with_target")
+
+        def with_target_if_not_None(dtype):
+            """
+            Returns a copy of *dtype* associated with the target. If *dtype*
+            is *None*, returns *None*.
+            """
+            if dtype:
+                return dtype.with_target(target)
+            else:
+                return None
+
+        new_arg_id_to_dtype = None
+        if self.arg_id_to_dtype is not None:
+            new_arg_id_to_dtype = {id: with_target_if_not_None(dtype)
+                    for id, dtype in self.arg_id_to_dtype.items()}
+
+        return self.copy(arg_id_to_dtype=new_arg_id_to_dtype)
+
+    def with_hw_axes_sizes(self, global_size, local_size):
+        """
+        Returns a copy of *self* with modifications to comply with the grid
+        sizes ``(local_size, global_size)`` of the program in which it is
+        supposed to be called.
+
+        :arg local_size: An instance of :class:`islpy.PwAff`.
+        :arg global_size: An instance of :class:`islpy.PwAff`.
+        """
+        raise NotImplementedError()
+
+    def is_ready_for_codegen(self):
+
+        return (self.arg_id_to_dtype is not None and
+                self.arg_id_to_descr is not None)
+
+    def generate_preambles(self, target):
+        """
+        Yields the target-specific preamble.
+        """
+        raise NotImplementedError()
+
+    def emit_call(self, expression_to_code_mapper, expression, target):
+
+        raise NotImplementedError()
+
+    def emit_call_insn(self, insn, target, expression_to_code_mapper):
+        """
+        Returns a tuple ``(call, assignee_is_returned)``, the target-facing
+        function call that would be seen in the generated code. ``call`` is an
+        instance of ``pymbolic.primitives.Call``, and ``assignee_is_returned``
+        is an instance of :class:`bool` indicating whether the assignee is
+        returned by value for C-type targets.
+
+        *Example:* If ``assignee_is_returned=True``, then ``a, b = f(c, d)`` is
+        interpreted in the target as ``a = f(c, d, &b)``. If
+        ``assignee_is_returned=False``, then ``a, b = f(c, d)`` is interpreted
+        in the target as the statement ``f(c, d, &a, &b)``.
+        """
+
+        raise NotImplementedError()
+
+    def __hash__(self):
+
+        return hash(tuple(self.fields))
+
+    def with_added_arg(self, arg_dtype, arg_descr):
+        """
+        Registers a new argument to the callable and returns the name of the
+        argument in the callable's namespace.
+        """
+        raise NotImplementedError()
+
+# }}}
+
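To illustrate the ``arg_id`` convention, for a callable invoked as
``c, d = f(a, b)``, a fully specialized ``arg_id_to_dtype`` would take roughly
the following shape (an illustrative sketch; the keyword names assume the
callee declares arguments ``a`` and ``b``)::

    import numpy as np
    from loopy.types import NumpyType

    f32 = NumpyType(np.dtype(np.float32))

    arg_id_to_dtype = {
            0: f32, "a": f32,     # first positional argument / its keyword
            1: f32, "b": f32,     # second positional argument / its keyword
            -1: f32,              # first return value (c)
            -2: f32,              # second return value (d)
            }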
+ """ + + raise NotImplementedError() + + def __hash__(self): + + return hash(tuple(self.fields)) + + def with_added_arg(self, arg_dtype, arg_descr): + """ + Registers a new argument to the callable and returns the name of the + argument in the callable's namespace. + """ + raise NotImplementedError() + +# }}} + + +# {{{ scalar callable + +class ScalarCallable(InKernelCallable): + """ + An abstract interface the to a scalar callable encountered in a kernel. + + .. note:: + + The :meth:`ScalarCallable.with_types` is intended to assist with type + specialization of the function and sub-classes must define it. + """ + + fields = {"name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"} + init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr", + "name_in_target") + hash_fields = ("name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target") + + def __init__(self, name, arg_id_to_dtype=None, + arg_id_to_descr=None, name_in_target=None): + + super().__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + self.name = name + self.name_in_target = name_in_target + + def __getinitargs__(self): + return (self.arg_id_to_dtype, self.arg_id_to_descr, + self.name_in_target) + + def with_types(self, arg_id_to_dtype, callables_table): + raise LoopyError("No type inference information present for " + "the function %s." % (self.name)) + + def with_descrs(self, arg_id_to_descr, callables_table): + + arg_id_to_descr[-1] = ValueArgDescriptor() + return ( + self.copy(arg_id_to_descr=arg_id_to_descr), + callables_table) + + def with_hw_axes_sizes(self, global_size, local_size): + return self.copy() + + def is_ready_for_codegen(self): + + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None) + + # {{{ code generation + + def emit_call(self, expression_to_code_mapper, expression, target): + + assert self.is_ready_for_codegen() + + # must have single assignee + assert len(expression.parameters) == len(self.arg_id_to_dtype) - 1 + arg_dtypes = tuple(self.arg_id_to_dtype[id] for id in + range(len(self.arg_id_to_dtype)-1)) + + par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in + expression.parameters) + + from loopy.expression import dtype_to_type_context + # processing the parameters with the required dtypes + processed_parameters = tuple( + expression_to_code_mapper.rec(par, + dtype_to_type_context(target, tgt_dtype), + tgt_dtype) + for par, par_dtype, tgt_dtype in zip( + expression.parameters, par_dtypes, arg_dtypes)) + + from pymbolic import var + return var(self.name_in_target)(*processed_parameters) + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + """ + :arg insn: An instance of :class:`loopy.kernel.instructions.CallInstruction`. + :arg target: An instance of :class:`loopy.target.TargetBase`. + :arg expression_to_code_mapper: An instance of :class:`IdentityMapper` + responsible for code mapping from :mod:`loopy` syntax to the + **target syntax**. + + :returns: A tuple of the call to be generated and an instance of + :class:`bool` whether the first assignee is a part of the LHS in + the assignment instruction. + + .. note:: + + The default implementation returns the first assignees and the + references of the rest of the assignees are appended to the + arguments of the call. + + *Example:* ``c, d = f(a, b)`` is returned as ``c = f(a, b, &d)`` + """ + + # Currently this is formulated such that the first argument is returned + # and rest all are passed by reference as arguments to the function. 
+
+# {{{ callable kernel
+
+class CallableKernel(InKernelCallable):
+    """
+    Records information about a callee kernel. Also provides an interface,
+    through member methods, to make the callee kernel compatible with calls
+    from a caller kernel. :meth:`loopy.register_callable_kernel` should be
+    called in order to initiate the association between a function in the
+    caller kernel and the callee kernel.
+
+    :meth:`CallableKernel.with_types` should be called in order to match
+    the ``dtypes`` of the arguments that are shared between the caller and the
+    callee kernel.
+
+    :meth:`CallableKernel.with_descrs` should be called in order to match
+    :attr:`ArrayArgDescriptor.dim_tags`, :attr:`ArrayArgDescriptor.shape` and
+    :attr:`ArrayArgDescriptor.address_space` of the arguments shared between
+    the caller and the callee kernel.
+
+    :meth:`CallableKernel.with_hw_axes_sizes` should be called to set the grid
+    sizes for the :attr:`subkernel` of the callable.
+ """ + + fields = {"subkernel", "arg_id_to_dtype", "arg_id_to_descr"} + init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr") + hash_fields = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr") + + def __init__(self, subkernel, arg_id_to_dtype=None, + arg_id_to_descr=None): + assert isinstance(subkernel, LoopKernel) + + super().__init__( + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr) + + self.subkernel = subkernel + + def __getinitargs__(self): + return (self.subkernel, self.arg_id_to_dtype, + self.arg_id_to_descr) + + @property + def name(self): + return self.subkernel.name + + def with_types(self, arg_id_to_dtype, callables_table): + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + new_args = [] + for arg in self.subkernel.args: + kw = arg.name + if kw in arg_id_to_dtype: + # id exists as kw + new_args.append(arg.copy(dtype=arg_id_to_dtype[kw])) + elif kw_to_pos[kw] in arg_id_to_dtype: + # id exists as positional argument + new_args.append(arg.copy( + dtype=arg_id_to_dtype[kw_to_pos[kw]])) + else: + new_args.append(arg) + + from loopy.type_inference import ( + infer_unknown_types_for_a_single_kernel) + pre_specialized_subkernel = self.subkernel.copy( + args=new_args) + + # infer the types of the written variables based on the knowledge + # of the types of the arguments supplied + specialized_kernel, callables_table = ( + infer_unknown_types_for_a_single_kernel( + pre_specialized_subkernel, + callables_table)) + + new_arg_id_to_dtype = {} + for pos, kw in pos_to_kw.items(): + arg = specialized_kernel.arg_dict[kw] + if arg.dtype: + new_arg_id_to_dtype[kw] = arg.dtype + new_arg_id_to_dtype[pos] = arg.dtype + + # Return the kernel call with specialized subkernel and the corresponding + # new arg_id_to_dtype + return self.copy(subkernel=specialized_kernel, + arg_id_to_dtype=new_arg_id_to_dtype), callables_table + + def with_descrs(self, arg_id_to_descr, callables_table): + + # arg_id_to_descr expressions provided are from the caller's namespace, + # need to register + + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + kw_to_callee_idx = {arg.name: i + for i, arg in enumerate(self.subkernel.args)} + + new_args = self.subkernel.args[:] + + for arg_id, descr in arg_id_to_descr.items(): + if isinstance(arg_id, int): + arg_id = pos_to_kw[arg_id] + + callee_arg = new_args[kw_to_callee_idx[arg_id]] + + # {{{ checks + + if isinstance(callee_arg, ValueArg) and ( + isinstance(descr, ArrayArgDescriptor)): + raise LoopyError(f"In call to {self.subkernel.name}, '{arg_id}' " + "expected to be a scalar, got an array region.") + + if isinstance(callee_arg, ArrayArg) and ( + isinstance(descr, ValueArgDescriptor)): + raise LoopyError(f"In call to {self.subkernel.name}, '{arg_id}' " + "expected to be an array, got a scalar.") + + if (isinstance(descr, ArrayArgDescriptor) + and isinstance(callee_arg.shape, tuple) + and len(callee_arg.shape) != len(descr.shape)): + raise LoopyError(f"In call to {self.subkernel.name}, '{arg_id}'" + " has a dimensionality mismatch, expected " + f"{len(callee_arg.shape)}, got {len(descr.shape)}") + + # }}} + + if isinstance(descr, ArrayArgDescriptor): + callee_arg = callee_arg.copy(shape=descr.shape, + dim_tags=descr.dim_tags, + address_space=descr.address_space) + else: + # do nothing for a scalar arg. 
+ assert isinstance(descr, ValueArgDescriptor) + + new_args[kw_to_callee_idx[arg_id]] = callee_arg + + subkernel = self.subkernel.copy(args=new_args) + + from loopy.preprocess import traverse_to_infer_arg_descr + subkernel, callables_table = ( + traverse_to_infer_arg_descr(subkernel, + callables_table)) + + # {{{ update the arg descriptors + + for arg in subkernel.args: + kw = arg.name + if isinstance(arg, ArrayBase): + arg_id_to_descr[kw] = ( + ArrayArgDescriptor(shape=arg.shape, + dim_tags=arg.dim_tags, + address_space=arg.address_space)) + else: + assert isinstance(arg, ValueArg) + arg_id_to_descr[kw] = ValueArgDescriptor() + + arg_id_to_descr[kw_to_pos[kw]] = arg_id_to_descr[kw] + + # }}} + + return (self.copy(subkernel=subkernel, + arg_id_to_descr=arg_id_to_descr), + callables_table) + + def with_added_arg(self, arg_dtype, arg_descr): + var_name = self.subkernel.get_var_name_generator()(based_on="_lpy_arg") + + if isinstance(arg_descr, ValueArgDescriptor): + subknl = self.subkernel.copy( + args=self.subkernel.args+[ + ValueArg(var_name, arg_dtype, self.subkernel.target)]) + + kw_to_pos, pos_to_kw = get_kw_pos_association(subknl) + + if self.arg_id_to_dtype is None: + arg_id_to_dtype = {} + else: + arg_id_to_dtype = self.arg_id_to_dtype.copy() + if self.arg_id_to_descr is None: + arg_id_to_descr = {} + else: + arg_id_to_descr = self.arg_id_to_descr.copy() + + arg_id_to_dtype[var_name] = arg_dtype + arg_id_to_descr[var_name] = arg_descr + arg_id_to_dtype[kw_to_pos[var_name]] = arg_dtype + arg_id_to_descr[kw_to_pos[var_name]] = arg_descr + + return (self.copy(subkernel=subknl, + arg_id_to_dtype=arg_id_to_dtype, + arg_id_to_descr=arg_id_to_descr), + var_name) + + else: + # don't think this should ever be needed + raise NotImplementedError("with_added_arg not implemented for array" + " types arguments.") + + def with_packing_for_args(self): + from loopy.kernel.data import AddressSpace + kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel) + + arg_id_to_descr = {} + + for pos, kw in pos_to_kw.items(): + arg = self.subkernel.arg_dict[kw] + arg_id_to_descr[pos] = ArrayArgDescriptor( + shape=arg.shape, + dim_tags=arg.dim_tags, + address_space=AddressSpace.GLOBAL) + + return self.copy(subkernel=self.subkernel, + arg_id_to_descr=arg_id_to_descr) + + def with_hw_axes_sizes(self, gsize, lsize): + return self.copy( + subkernel=self.subkernel.copy( + overridden_get_grid_sizes_for_insn_ids=( + GridOverrideForCalleeKernel(gsize, lsize)))) + + def is_ready_for_codegen(self): + return (self.arg_id_to_dtype is not None and + self.arg_id_to_descr is not None) + + def generate_preambles(self, target): + """ Yields the *target* specific preambles. + """ + # FIXME Check that this is correct. 
+
+        return
+        yield
+
+    def emit_call_insn(self, insn, target, expression_to_code_mapper):
+
+        assert self.is_ready_for_codegen()
+
+        from loopy.kernel.instruction import CallInstruction
+        from pymbolic.primitives import CallWithKwargs
+
+        assert isinstance(insn, CallInstruction)
+
+        parameters = insn.expression.parameters
+        kw_parameters = {}
+        if isinstance(insn.expression, CallWithKwargs):
+            kw_parameters = insn.expression.kw_parameters
+
+        assignees = insn.assignees
+
+        parameters = list(parameters)
+        par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)]
+        kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)
+        for i in range(len(parameters), len(parameters)+len(kw_parameters)):
+            parameters.append(kw_parameters[pos_to_kw[i]])
+            par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]])
+
+        # insert the assignees at the required positions
+        assignee_write_count = -1
+        for i, arg in enumerate(self.subkernel.args):
+            if arg.is_output:
+                if not arg.is_input:
+                    assignee = assignees[-assignee_write_count-1]
+                    parameters.insert(i, assignee)
+                    par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count])
+
+                assignee_write_count -= 1
+
+        # no type casting in array calls: sub-array refs and scalar parameters
+        # are mapped uniformly here
+        from loopy.expression import dtype_to_type_context
+        from pymbolic.mapper.stringifier import PREC_NONE
+        from pymbolic import var
+
+        c_parameters = [
+                expression_to_code_mapper(par, PREC_NONE,
+                    dtype_to_type_context(target, par_dtype),
+                    par_dtype).expr
+                for par, par_dtype in zip(
+                    parameters, par_dtypes)]
+
+        return var(self.subkernel.name)(*c_parameters), False
+
+# }}}
+
+
+# vim: foldmethod=marker
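With these classes in place, the intended end-to-end usage looks roughly as
follows (a sketch: the names and shapes are illustrative, and the registration
call follows the interface described earlier in this patch)::

    import numpy as np
    import loopy as lp

    callee = lp.make_function(
            "{[i]: 0 <= i < 16}",
            "b[i] = 2*a[i]",
            name="twice", lang_version=(2018, 2))

    caller = lp.make_kernel(
            "{[j]: 0 <= j < 16}",
            "y[:] = twice(x[:])",
            [lp.GlobalArg("x,y", np.float64, shape=(16,))],
            lang_version=(2018, 2))

    caller = lp.register_callable_kernel(caller, "twice", callee)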
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 0e56d41612cb6903100718f5dc6c20076294a1a0..6b428ae9330695a8a6883377a9d30766d8c3b3d1 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -468,7 +468,7 @@ class InstructionBase(ImmutableRecord, Taggable):
 
 def _get_assignee_var_name(expr):
     from pymbolic.primitives import Variable, Subscript, Lookup
-    from loopy.symbolic import LinearSubscript
+    from loopy.symbolic import LinearSubscript, SubArrayRef
 
     if isinstance(expr, Lookup):
         expr = expr.aggregate
@@ -487,13 +487,20 @@ def _get_assignee_var_name(expr):
         assert isinstance(agg, Variable)
 
         return agg.name
+
+    elif isinstance(expr, SubArrayRef):
+        agg = expr.subscript.aggregate
+        assert isinstance(agg, Variable)
+
+        return agg.name
+
     else:
         raise RuntimeError("invalid lvalue '%s'" % expr)
 
 
 def _get_assignee_subscript_deps(expr):
     from pymbolic.primitives import Variable, Subscript, Lookup
-    from loopy.symbolic import LinearSubscript, get_dependencies
+    from loopy.symbolic import LinearSubscript, get_dependencies, SubArrayRef
 
     if isinstance(expr, Lookup):
         expr = expr.aggregate
@@ -504,6 +511,9 @@ def _get_assignee_subscript_deps(expr):
         return get_dependencies(expr.index)
     elif isinstance(expr, LinearSubscript):
         return get_dependencies(expr.index)
+    elif isinstance(expr, SubArrayRef):
+        return get_dependencies(expr.subscript.index) - (
+                frozenset(iname.name for iname in expr.swept_inames))
     else:
         raise RuntimeError("invalid lvalue '%s'" % expr)
 
@@ -1034,9 +1044,10 @@ class CallInstruction(MultiAssignmentBase):
                 predicates=predicates,
                 tags=tags)
 
-        from pymbolic.primitives import Call
+        from pymbolic.primitives import Call, CallWithKwargs
         from loopy.symbolic import Reduction
-        if not isinstance(expression, (Call, Reduction)) and expression is not None:
+        if not isinstance(expression, (Call, CallWithKwargs, Reduction)) and (
+                expression is not None):
             raise LoopyError("'expression' argument to CallInstruction "
                     "must be a function call")
 
@@ -1052,9 +1063,10 @@ class CallInstruction(MultiAssignmentBase):
             expression = parse(expression)
 
         from pymbolic.primitives import Variable, Subscript
-        from loopy.symbolic import LinearSubscript
+        from loopy.symbolic import LinearSubscript, SubArrayRef
         for assignee in assignees:
-            if not isinstance(assignee, (Variable, Subscript, LinearSubscript)):
+            if not isinstance(assignee, (Variable, Subscript, LinearSubscript,
+                    SubArrayRef)):
                 raise LoopyError("invalid lvalue '%s'" % assignee)
 
         self.assignees = assignees
@@ -1123,6 +1135,22 @@ class CallInstruction(MultiAssignmentBase):
             result += "\n" + 10*" " + "if (%s)" % " && ".join(self.predicates)
         return result
 
+    def arg_id_to_val(self):
+        """
+        :returns: a :class:`dict` mapping argument identifiers (non-negative
+            numbers for positional arguments, strings for keyword arguments,
+            and negative numbers for assignees) to their respective values
+        """
+
+        from pymbolic.primitives import CallWithKwargs
+        arg_id_to_val = dict(enumerate(self.expression.parameters))
+        if isinstance(self.expression, CallWithKwargs):
+            for kw, val in self.expression.kw_parameters.items():
+                arg_id_to_val[kw] = val
+        for i, arg in enumerate(self.assignees):
+            arg_id_to_val[-i-1] = arg
+
+        return arg_id_to_val
+
     @property
     def atomicity(self):
         # Function calls can impossibly be atomic, and even the result assignment
@@ -1133,34 +1161,118 @@ class CallInstruction(MultiAssignmentBase):
 
 # }}}
 
 
+def subscript_contains_slice(subscript):
+    """Return *True* if the *subscript* contains an instance of
+    :class:`pymbolic.primitives.Slice` as one of its indices.
+    """
+    from pymbolic.primitives import Subscript, Slice
+    assert isinstance(subscript, Subscript)
+    return any(isinstance(index, Slice) for index in subscript.index_tuple)
+
+
+def is_array_call(assignees, expression):
+    """
+    Returns *True* if the instruction is an array call.
+
+    An array call is a function call applied to array type objects. If any of
+    the arguments or assignees to the function is an array,
+    :func:`is_array_call` will return *True*.
+    """
+    from pymbolic.primitives import Call, CallWithKwargs, Subscript
+    from loopy.symbolic import SubArrayRef
+
+    if not isinstance(expression, (Call, CallWithKwargs)):
+        return False
+
+    for par in expression.parameters+assignees:
+        if isinstance(par, SubArrayRef):
+            return True
+        elif isinstance(par, Subscript):
+            if subscript_contains_slice(par):
+                return True
+
+    # did not encounter a SubArrayRef/Slice, hence must be a normal call
+    return False
+
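Given the helpers above, the distinction can be exercised directly on
:mod:`pymbolic` expressions (a minimal sketch)::

    from pymbolic import var
    from pymbolic.primitives import Call, Subscript, Slice
    from loopy.kernel.instruction import is_array_call

    a, x, f = var("a"), var("x"), var("f")

    # a slice-subscripted argument makes this an array call
    assert is_array_call((Subscript(a, Slice(())),),
                         Call(f, (Subscript(x, Slice(())),)))

    # plain scalar arguments do not
    assert not is_array_call((a,), Call(f, (x,)))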
+ """ + from pymbolic.primitives import Subscript, Variable + from loopy.symbolic import SubArrayRef + if isinstance(assignee, SubArrayRef): + return assignee + elif isinstance(assignee, Subscript): + if subscript_contains_slice(assignee): + # Slice subscripted array are treated as SubArrayRef in the kernel + # Hence, making the behavior similar to that of `SubArrayref` + return assignee + else: + return SubArrayRef((), assignee) + elif isinstance(assignee, Variable): + return SubArrayRef((), Subscript(assignee, 0)) + else: + raise LoopyError("ArrayCall only takes Variable, Subscript or " + "SubArrayRef as its inputs") + + def make_assignment(assignees, expression, temp_var_types=None, **kwargs): + if temp_var_types is None: temp_var_types = (Optional(),) * len(assignees) - if len(assignees) == 1: + if len(assignees) != 1 or is_array_call(assignees, expression): + atomicity = kwargs.pop("atomicity", ()) + if atomicity: + raise LoopyError("atomic operations with more than one " + "left-hand side not supported") + + from pymbolic.primitives import Call, CallWithKwargs + from loopy.symbolic import Reduction + if not isinstance(expression, (Call, CallWithKwargs, Reduction)): + raise LoopyError("right-hand side in multiple assignment must be " + "function call or reduction, got: '%s'" % expression) + + if not is_array_call(assignees, expression): + return CallInstruction( + assignees=assignees, + expression=expression, + temp_var_types=temp_var_types, + **kwargs) + else: + # In the case of an array call, it is important to have each + # assignee as an instance of SubArrayRef. If not given as a + # SubArrayRef + return CallInstruction( + assignees=tuple(modify_assignee_for_array_call( + assignee) for assignee in assignees), + expression=expression, + temp_var_types=temp_var_types, + **kwargs) + else: + def _is_array(expr): + from loopy.symbolic import SubArrayRef + from pymbolic.primitives import (Subscript, Slice) + if isinstance(expr, SubArrayRef): + return True + if isinstance(expr, Subscript): + return any(isinstance(idx, Slice) for idx in + expr.index_tuple) + return False + + from loopy.symbolic import DependencyMapper + if any(_is_array(dep) for dep in DependencyMapper()((assignees, + expression))): + raise LoopyError("Array calls only supported as instructions" + " with function call as RHS for now.") + return Assignment( assignee=assignees[0], expression=expression, temp_var_type=temp_var_types[0], **kwargs) - atomicity = kwargs.pop("atomicity", ()) - if atomicity: - raise LoopyError("atomic operations with more than one " - "left-hand side not supported") - - from pymbolic.primitives import Call - from loopy.symbolic import Reduction - if not isinstance(expression, (Call, Reduction)): - raise LoopyError("right-hand side in multiple assignment must be " - "function call or reduction, got: '%s'" % expression) - - return CallInstruction( - assignees=assignees, - expression=expression, - temp_var_types=temp_var_types, - **kwargs) - # {{{ c instruction diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index e16cb59f3d3d2e3f6b116f8d13a0ea93e5268f65..5cae76192fb2d8201f3a2881a2fdb4c1d4f57438 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -32,26 +32,45 @@ import islpy as isl from islpy import dim_type from loopy.diagnostic import LoopyError, warn_with_kernel from pytools import memoize_on_first_arg, natsorted - +from loopy.kernel import LoopKernel +from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.kernel.function_interface import 
 
 # {{{ c instruction
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index e16cb59f3d3d2e3f6b116f8d13a0ea93e5268f65..5cae76192fb2d8201f3a2881a2fdb4c1d4f57438 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -32,26 +32,45 @@ import islpy as isl
 from islpy import dim_type
 from loopy.diagnostic import LoopyError, warn_with_kernel
 from pytools import memoize_on_first_arg, natsorted
-
+from loopy.kernel import LoopKernel
+from loopy.program import Program, iterate_over_kernels_if_given_program
+from loopy.kernel.function_interface import CallableKernel
 
 import logging
 logger = logging.getLogger(__name__)
 
 
 # {{{ add and infer argument dtypes
 
-def add_dtypes(kernel, dtype_dict):
+def add_dtypes(prog_or_kernel, dtype_dict):
     """Specify remaining unspecified argument/temporary variable types.
 
     :arg dtype_dict: a mapping from variable names to :class:`numpy.dtype`
        instances
     """
-    dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes(kernel, dtype_dict)
+    if isinstance(prog_or_kernel, Program):
+        kernel_names = [clbl.subkernel.name for clbl in
+                prog_or_kernel.callables_table.values() if isinstance(clbl,
+                    CallableKernel)]
+        if len(kernel_names) != 1:
+            raise LoopyError("add_dtypes may not take a Program with more than"
+                    " one callable kernel. Please provide individual kernels"
+                    " instead.")
+
+        kernel_name, = kernel_names
+
+        return prog_or_kernel.with_kernel(
+                add_dtypes(prog_or_kernel[kernel_name], dtype_dict))
+
+    assert isinstance(prog_or_kernel, LoopKernel)
+
+    dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes(
+            prog_or_kernel, dtype_dict)
 
     if dtype_dict_remainder:
         raise RuntimeError("unused argument dtypes: %s"
                 % ", ".join(dtype_dict_remainder))
 
-    return kernel.copy(args=new_args, temporary_variables=new_temp_vars)
+    return prog_or_kernel.copy(args=new_args, temporary_variables=new_temp_vars)
 
 
 def _add_dtypes_overdetermined(kernel, dtype_dict):
@@ -103,7 +122,18 @@ def get_arguments_with_incomplete_dtype(kernel):
             if arg.dtype is None]
 
 
-def add_and_infer_dtypes(kernel, dtype_dict, expect_completion=False):
+def add_and_infer_dtypes(prog, dtype_dict, expect_completion=False,
+        kernel_name=None):
+    assert isinstance(prog, Program)
+    if kernel_name is None:
+        kernel_names = [clbl.subkernel.name for clbl in
+                prog.callables_table.values() if isinstance(clbl,
+                    CallableKernel)]
+        if len(kernel_names) != 1:
+            raise LoopyError("Provide a 'kernel_name' argument.")
+
+        kernel_name, = kernel_names
+
     processed_dtype_dict = {}
 
     for k, v in dtype_dict.items():
@@ -112,10 +142,10 @@ def add_and_infer_dtypes(kernel, dtype_dict, expect_completion=False):
             if subkey:
                 processed_dtype_dict[subkey] = v
 
-    kernel = add_dtypes(kernel, processed_dtype_dict)
+    prog = prog.with_kernel(add_dtypes(prog[kernel_name], processed_dtype_dict))
 
     from loopy.type_inference import infer_unknown_types
-    return infer_unknown_types(kernel, expect_completion=expect_completion)
+    return infer_unknown_types(prog, expect_completion=expect_completion)
 
 
 def _add_and_infer_dtypes_overdetermined(kernel, dtype_dict):
@@ -463,8 +493,10 @@ class DomainChanger:
 
 # {{{ graphviz / dot export
 
-def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False):
-    """Return a string in the `dot <https://graphviz.org>`__ language depicting
+@iterate_over_kernels_if_given_program
+def get_dot_dependency_graph(kernel, callables_table, iname_cluster=True,
+        use_insn_id=False):
+    """Return a string in the `dot <https://graphviz.org>`_ language depicting
     dependencies among kernel instructions.
""" @@ -475,7 +507,7 @@ def get_dot_dependency_graph(kernel, iname_cluster=True, use_insn_id=False): if iname_cluster and not kernel.schedule: try: from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + kernel = get_one_scheduled_kernel(kernel, callables_table) except RuntimeError as e: iname_cluster = False from warnings import warn @@ -756,7 +788,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn): # }}} -def assign_automatic_axes(kernel, axis=0, local_size=None): +def assign_automatic_axes(kernel, callables_table, axis=0, local_size=None): logger.debug("%s: assign automatic axes" % kernel.name) # TODO: do the tag removal rigorously, might be easier after switching # to set() from tuple() @@ -770,7 +802,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if local_size is None: _, local_size = kernel.get_grid_size_upper_bounds_as_exprs( - ignore_auto=True) + callables_table, ignore_auto=True) # {{{ axis assignment helper function @@ -793,6 +825,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if not isinstance(tag, AutoLocalIndexTagBase))) return assign_automatic_axes( kernel.copy(inames=new_inames), + callables_table, axis=recursion_axis) if axis is None: @@ -832,7 +865,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): else: new_tag = LocalIndexTag(axis) if desired_length > local_size[axis]: - from loopy import split_iname, untag_inames + from loopy import untag_inames + from loopy.transform.iname import split_iname # Don't be tempted to switch the outer tag to unroll--this may # generate tons of code on some examples. @@ -843,6 +877,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): iname, inner_length=local_size[axis], outer_tag=None, inner_tag=new_tag, do_tagged_check=False), + callables_table=callables_table, axis=recursion_axis, local_size=local_size) if not kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase): @@ -859,7 +894,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): new_inames = kernel.inames.copy() new_inames[iname] = kernel.inames[iname].copy(tags=new_tags) return assign_automatic_axes(kernel.copy(inames=new_inames), - axis=recursion_axis, local_size=local_size) + callables_table, axis=recursion_axis, local_size=local_size) # }}} @@ -927,7 +962,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None): if axis >= len(local_size): return kernel else: - return assign_automatic_axes(kernel, axis=axis+1, + return assign_automatic_axes(kernel, + callables_table=callables_table, axis=axis+1, local_size=local_size) # }}} @@ -1855,35 +1891,105 @@ def find_aliasing_equivalence_classes(kernel): # }}} +# {{{ callee kernel tools + +def get_direct_callee_kernels(kernel, callables_table, insn_ids=None,): + """ + Returns an instance of :class:`frozenset` of all the callee kernels + called in instructions in the *kernel* whose IDs are given in *insn_ids*. + + :arg kernel: An instance of :class:`LoopKernel`. + :arg insn_ids: An instance of :class:`frozenset`. + + If *insn_ids* is *None* returns all the callee kernels called by *kernel*. + """ + #FIXME: explain what "direct" means + + if insn_ids is None: + insn_ids = frozenset(insn.id for insn in kernel.instructions) + + def _get_callee_kernel_if_insn_has_callable_kernel(insn_id): + """Returns callee kernel if the instruction has a call to a + :class:`loopy.kernel.function_interface.CallableKernel`. Otherwise + returns *None*. 
+ """ + insn = kernel.id_to_insn[insn_id] + from loopy.kernel.instruction import (CallInstruction, + MultiAssignmentBase, CInstruction, _DataObliviousInstruction) + from pymbolic.primitives import Call + if isinstance(insn, CallInstruction): + if isinstance(insn.expression, Call) and ( + insn.expression.function.name in callables_table): + in_knl_callable = callables_table[ + insn.expression.function.name] + if isinstance(in_knl_callable, CallableKernel): + return in_knl_callable.subkernel + elif isinstance(insn, (MultiAssignmentBase, + CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("Unknown type of instruction %s." % + type(insn)) + + return None + + return frozenset([_get_callee_kernel_if_insn_has_callable_kernel(insn_id) + for insn_id in insn_ids]) - frozenset([None]) + +# }}} + + # {{{ direction helper tools -def infer_arg_is_output_only(kernel): +def infer_args_are_input_output(kernel): """ - Returns a copy of *kernel* with the attribute ``is_output_only`` set. + Returns a copy of *kernel* with the attributes ``is_input`` and + ``is_output`` of the arguments set. .. note:: - If the attribute ``is_output_only`` is not supplied from an user, then - infers it as an output argument if it is written at some point in the - kernel. + If the :attr:`~loopy.ArrayArg.is_output` is not supplied from a user, + then the array is inferred as an output argument if it is written at + some point in the kernel. + + If the :attr:`~loopy.ArrayArg.is_input` is not supplied from a user, + then the array is inferred as an input argument if it is either read at + some point in the kernel or it is neither read nor written. """ from loopy.kernel.data import ArrayArg, ValueArg, ConstantArg, ImageArg new_args = [] + for arg in kernel.args: if isinstance(arg, ArrayArg): - if arg.is_output_only is not None: - assert isinstance(arg.is_output_only, bool) - new_args.append(arg) + if arg.is_output is not None: + assert isinstance(arg.is_output, bool) else: if arg.name in kernel.get_written_variables(): - new_args.append(arg.copy(is_output_only=True)) + arg = arg.copy(is_output=True) + else: + arg = arg.copy(is_output=False) + + if arg.is_input is not None: + assert isinstance(arg.is_input, bool) + else: + if arg.name in kernel.get_read_variables() or ( + (arg.name not in kernel.get_read_variables()) and ( + arg.name not in kernel.get_written_variables())): + arg = arg.copy(is_input=True) else: - new_args.append(arg.copy(is_output_only=False)) + arg = arg.copy(is_input=False) elif isinstance(arg, (ConstantArg, ImageArg, ValueArg)): - new_args.append(arg) + pass else: raise NotImplementedError("Unkonwn argument type %s." % type(arg)) + if not (arg.is_input or arg.is_output): + raise LoopyError("Kernel argument must be either input or output." + " '{}' in '{}' does not follow it.".format(arg.name, + kernel.name)) + + new_args.append(arg) + return kernel.copy(args=new_args) # }}} diff --git a/loopy/library/function.py b/loopy/library/function.py index 99af08169c0ea053a1671e0ab087f24a86c16e3b..d7558960ab0c7e2c4f045655a068fc67d0785797 100644 --- a/loopy/library/function.py +++ b/loopy/library/function.py @@ -20,38 +20,109 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" - -def default_function_mangler(kernel, name, arg_dtypes): - from loopy.library.reduction import reduction_function_mangler - - manglers = [reduction_function_mangler, tuple_function_mangler] - for mangler in manglers: - result = mangler(kernel, name, arg_dtypes) - if result is not None: - return result - - return None - - -def single_arg_function_mangler(kernel, name, arg_dtypes): - if len(arg_dtypes) == 1: - dtype, = arg_dtypes - - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo(name, (dtype,), (dtype,)) - - return None - - -def tuple_function_mangler(kernel, name, arg_dtypes): - if name == "make_tuple": - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="loopy_make_tuple", - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) - - return None +from loopy.kernel.function_interface import ScalarCallable +from loopy.diagnostic import LoopyError +from loopy.types import NumpyType +import numpy as np + + +class MakeTupleCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, callables_table): + new_arg_id_to_dtype = arg_id_to_dtype.copy() + for i in range(len(arg_id_to_dtype)): + if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None: + new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i] + + return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype, + name_in_target="loopy_make_tuple"), callables_table) + + def with_descrs(self, arg_id_to_descr, callables_table): + from loopy.kernel.function_interface import ValueArgDescriptor + new_arg_id_to_descr = {(id, ValueArgDescriptor()): + (-id-1, ValueArgDescriptor()) for id in arg_id_to_descr.keys()} + + return ( + self.copy(arg_id_to_descr=new_arg_id_to_descr), + callables_table) + + +class IndexOfCallable(ScalarCallable): + def with_types(self, arg_id_to_dtype, callables_table): + new_arg_id_to_dtype = {i: dtype + for i, dtype in arg_id_to_dtype.items() + if dtype is not None} + new_arg_id_to_dtype[-1] = NumpyType(np.int32) + + return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype), + callables_table) + + def emit_call(self, expression_to_code_mapper, expression, target): + from pymbolic.primitives import Subscript + + if len(expression.parameters) != 1: + raise LoopyError("%s takes exactly one argument" % self.name) + arg, = expression.parameters + if not isinstance(arg, Subscript): + raise LoopyError( + "argument to %s must be a subscript" % self.name) + + ary = expression_to_code_mapper.find_array(arg) + + from loopy.kernel.array import get_access_info + from pymbolic import evaluate + access_info = get_access_info(expression_to_code_mapper.kernel.target, + ary, arg.index, lambda expr: evaluate(expr, + expression_to_code_mapper.codegen_state.var_subst_map), + expression_to_code_mapper.codegen_state.vectorization_info) + + from loopy.kernel.data import ImageArg + if isinstance(ary, ImageArg): + raise LoopyError("%s does not support images" % self.name) + + if self.name == "indexof": + return access_info.subscripts[0] + elif self.name == "indexof_vec": + from loopy.kernel.array import VectorArrayDimTag + ivec = None + for iaxis, dim_tag in enumerate(ary.dim_tags): + if isinstance(dim_tag, VectorArrayDimTag): + ivec = iaxis + + if ivec is None: + return access_info.subscripts[0] + else: + return ( + access_info.subscripts[0]*ary.shape[ivec] + + access_info.vector_index) + + else: + raise RuntimeError("should not get here") + + def emit_call_insn(self, insn, target, expression_to_code_mapper): + return self.emit_call( + expression_to_code_mapper, + insn.expression, + target), True 
+
+
+def get_loopy_callables():
+    """
+    Returns a mapping from function ids to corresponding
+    :class:`loopy.kernel.function_interface.InKernelCallable` for functions
+    whose interface is provided by :mod:`loopy`. Callables that fall in this
+    category are --
+
+    - reductions leading to function calls like ``argmin``, ``argmax``.
+    - callables that have a predefined meaning in :mod:`loo.py` like
+      ``make_tuple``, ``indexof``, ``indexof_vec``.
+    """
+    known_callables = {
+            "make_tuple": MakeTupleCallable(name="make_tuple"),
+            "indexof": IndexOfCallable(name="indexof"),
+            "indexof_vec": IndexOfCallable(name="indexof_vec"),
+            }
+
+    return known_callables
 
 
 # vim: foldmethod=marker
diff --git a/loopy/library/random123.py b/loopy/library/random123.py
index 7f24dd3a0e3699fb0bb55ac1d4022645dedac854..2d4f82205904aa7dcaf27c803a56f2f3442c59be 100644
--- a/loopy/library/random123.py
+++ b/loopy/library/random123.py
@@ -26,6 +26,7 @@ THE SOFTWARE.
 
 from pytools import ImmutableRecord
 from mako.template import Template
+from loopy.kernel.function_interface import ScalarCallable
 
 import numpy as np
 
@@ -162,60 +163,86 @@ double${ width } ${ name }_f64(
 # }}}
 
 
-def random123_preamble_generator(preamble_info):
-    for f in preamble_info.seen_functions:
-        try:
-            rng_variant = FUNC_NAMES_TO_RNG[f.name]
-        except KeyError:
-            continue
+class Random123Callable(ScalarCallable):
+    """
+    Records information about the random123 functions.
+    """
+    fields = ScalarCallable.fields | {"target"}
+
+    def __init__(self, name, arg_id_to_dtype=None,
+            arg_id_to_descr=None, name_in_target=None, target=None):
+
+        super().__init__(
+                name=name,
+                arg_id_to_dtype=arg_id_to_dtype,
+                arg_id_to_descr=arg_id_to_descr,
+                name_in_target=name_in_target)
+
+        self.target = target
+
+    def with_types(self, arg_id_to_dtype, callables_table):
+
+        if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
+                arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
+            # the types provided aren't mature enough to specialize the
+            # callable
+            return (self.copy(),
+                    callables_table)
+
+        name = self.name
+        target = self.target
+
+        rng_variant = FUNC_NAMES_TO_RNG[name]
+
+        from loopy.types import NumpyType
+        base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits]
+        ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width)
+        key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width)
+
+        fn = rng_variant.full_name
+        if name == fn:
+            new_arg_id_to_dtype = {-1: ctr_dtype, -2: ctr_dtype, 0: ctr_dtype, 1:
+                    key_dtype}
+            return (
+                    self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
+                        name_in_target=fn+"_gen"),
+                    callables_table)
+
+        elif name == fn + "_f32":
+            new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32),
+                rng_variant.width),
+                    -2: ctr_dtype, 0: ctr_dtype, 1:
+                    key_dtype}
+            return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
+                    name_in_target=name), callables_table
+
+        elif name == fn + "_f64":
+            new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64),
+                rng_variant.width),
+                    -2: ctr_dtype, 0: ctr_dtype, 1:
+                    key_dtype}
+            return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
+                    name_in_target=name), callables_table
+
+        return (self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                callables_table)
+
+    def generate_preambles(self, target):
+        rng_variant = FUNC_NAMES_TO_RNG[self.name]
 
         from loopy.target.pyopencl import PyOpenCLTarget
         yield ("90-random123-"+rng_variant.full_name,
                 PREAMBLE_TEMPLATE.render(
                     is_pyopencl_target=isinstance(
-                        preamble_info.kernel.target,
+                        target,
                        PyOpenCLTarget),
rng_variant=rng_variant, )) + return -def random123_function_mangler(kernel, name, arg_dtypes): - try: - rng_variant = FUNC_NAMES_TO_RNG[name] - except KeyError: - return None - - from loopy.types import NumpyType - target = kernel.target - base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits] - ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width) - key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width) - - from loopy.kernel.data import CallMangleInfo - fn = rng_variant.full_name - if name == fn: - return CallMangleInfo( - target_name=fn+"_gen", - result_dtypes=(ctr_dtype, ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f32": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float32), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - elif name == fn + "_f64": - return CallMangleInfo( - target_name=name, - result_dtypes=( - target.vector_dtype(NumpyType(np.float64), rng_variant.width), - ctr_dtype), - arg_dtypes=(ctr_dtype, key_dtype)) - - else: - return None + +def get_random123_callables(target): + return {id_: Random123Callable(id_, target=target) for id_ in FUNC_NAMES_TO_RNG} # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index 6ca763442d3bb7e4f9044b738cb67e70aca703b1..1d53d06b063619726837f467ca12de11599a819c 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -22,11 +22,14 @@ THE SOFTWARE. from pymbolic import var +from loopy.symbolic import ResolvedFunction +from loopy.kernel.function_interface import ScalarCallable import numpy as np from loopy.symbolic import FunctionIdentifier from loopy.diagnostic import LoopyError from loopy.types import NumpyType +from loopy.tools import update_persistent_hash __doc__ = """ .. currentmodule:: loopy.library.reduction @@ -50,7 +53,7 @@ class ReductionOperation: equality-comparable. """ - def result_dtypes(self, target, *arg_dtypes): + def result_dtypes(self, *arg_dtypes): """ :arg arg_dtypes: may be None if not known :returns: None if not known, otherwise the returned type @@ -62,7 +65,7 @@ class ReductionOperation: def arg_count(self): raise NotImplementedError - def neutral_element(self, *dtypes): + def neutral_element(self, dtypes, callables_table, target): raise NotImplementedError def __hash__(self): @@ -109,10 +112,11 @@ class ScalarReductionOperation(ReductionOperation): def arg_count(self): return 1 - def result_dtypes(self, kernel, arg_dtype): + def result_dtypes(self, arg_dtype): if self.forced_result_type is not None: - return (self.parse_result_type( - kernel.target, self.forced_result_type),) + raise NotImplementedError() + # return (self.parse_result_type( + # kernel.target, self.forced_result_type),) if arg_dtype is None: return None @@ -136,29 +140,43 @@ class ScalarReductionOperation(ReductionOperation): class SumReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype): + def neutral_element(self, dtype, callables_table, target): # FIXME: Document that we always use an int here. 
-        return 0
+        from loopy import auto
+        if dtype not in [None, auto] and dtype.numpy_dtype.kind == "f":
+            return 0.0, callables_table
 
-    def __call__(self, dtype, operand1, operand2):
-        return operand1 + operand2
+        return 0, callables_table
+
+    def __call__(self, dtype, operand1, operand2, callables_table, target):
+        return operand1 + operand2, callables_table
 
 
 class ProductReductionOperation(ScalarReductionOperation):
-    def neutral_element(self, dtype):
+    def neutral_element(self, dtype, callables_table, target):
         # FIXME: Document that we always use an int here.
-        return 1
+        from loopy import auto
+        if dtype not in [None, auto] and dtype.numpy_dtype.kind == "f":
+            return 1.0, callables_table
 
-    def __call__(self, dtype, operand1, operand2):
-        return operand1 * operand2
+        return 1, callables_table
+
+    def __call__(self, dtype, operand1, operand2, callables_table, target):
+        return operand1 * operand2, callables_table
 
 
 def get_le_neutral(dtype):
    """Return a number y that satisfies (x <= y) for all x."""
 
     if dtype.numpy_dtype.kind == "f":
-        # OpenCL 1.1, section 6.11.2
-        return var("INFINITY")
+        # OpenCL 1.2, section 6.12.2
+        if dtype.numpy_dtype.itemsize == 4:
+            # float
+            return var("INFINITY")
+        elif dtype.numpy_dtype.itemsize == 8:
+            # double
+            return var("HUGE_VAL")
+
     elif dtype.numpy_dtype.kind == "i":
         # OpenCL 1.1, section 6.11.3
         if dtype.numpy_dtype.itemsize == 4:
@@ -175,8 +193,13 @@ def get_ge_neutral(dtype):
    """Return a number y that satisfies (x >= y) for all x."""
 
     if dtype.numpy_dtype.kind == "f":
-        # OpenCL 1.1, section 6.11.2
-        return -var("INFINITY")
+        # OpenCL 1.2, section 6.12.2
+        if dtype.numpy_dtype.itemsize == 4:
+            # float
+            return -var("INFINITY")
+        elif dtype.numpy_dtype.itemsize == 8:
+            # double
+            return -var("HUGE_VAL")
     elif dtype.numpy_dtype.kind == "i":
         # OpenCL 1.1, section 6.11.3
         if dtype.numpy_dtype.itemsize == 4:
@@ -190,19 +213,47 @@ class MaxReductionOperation(ScalarReductionOperation):
-    def neutral_element(self, dtype):
-        return get_ge_neutral(dtype)
+    def neutral_element(self, dtype, callables_table, target):
+        return get_ge_neutral(dtype), callables_table
 
-    def __call__(self, dtype, operand1, operand2):
-        return var("max")(operand1, operand2)
+    def __call__(self, dtype, operand1, operand2, callables_table, target):
+        dtype, = dtype
+        from loopy.program import update_table
+
+        # get the callable 'max' from the target
+        max_scalar_callable = target.get_device_ast_builder().known_callables["max"]
+
+        # type-specialize the callable
+        max_scalar_callable, callables_table = max_scalar_callable.with_types(
+                {0: dtype, 1: dtype}, callables_table)
+
+        # populate callables_table
+        func_id, callables_table = update_table(callables_table, "max",
+                max_scalar_callable)
+
+        return ResolvedFunction(func_id)(operand1, operand2), callables_table
 
 
 class MinReductionOperation(ScalarReductionOperation):
-    def neutral_element(self, dtype):
-        return get_le_neutral(dtype)
+    def neutral_element(self, dtype, callables_table, target):
+        return get_le_neutral(dtype), callables_table
 
-    def __call__(self, dtype, operand1, operand2):
-        return var("min")(operand1, operand2)
+    def __call__(self, dtype, operand1, operand2, callables_table, target):
+        dtype, = dtype
+        from loopy.program import update_table
+
+        # get the callable 'min' from the target
+        min_scalar_callable = target.get_device_ast_builder().known_callables["min"]
+
+        # type-specialize the callable
+        min_scalar_callable, callables_table = min_scalar_callable.with_types(
+                {0: dtype, 1: dtype}, callables_table)
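+
+        # The recurring pattern here and in MaxReductionOperation above, in
+        # brief: (1) fetch the target's ScalarCallable for the function,
+        # (2) type-specialize it via with_types, (3) register the
+        # specialization via update_table and call it through
+        # ResolvedFunction. A rough, runnable stand-in for step (3) -- not
+        # loopy's actual update_table implementation:
+        #
+        #     def update_table(table, func_id, clbl):
+        #         # reuse an existing entry if this exact specialization is
+        #         # already known, else store it under an unused id
+        #         for id_, existing in table.items():
+        #             if existing == clbl:
+        #                 return id_, table
+        #         while func_id in table:
+        #             func_id = func_id + "_0"
+        #         table[func_id] = clbl
+        #         return func_id, table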
+
+        # populate callables_table
+        func_id, callables_table = update_table(callables_table, "min",
+                min_scalar_callable)
+
+        return ResolvedFunction(func_id)(operand1, operand2), callables_table
 
 
 # {{{ base class for symbolic reduction ops
@@ -226,6 +277,10 @@ class ReductionOpFunction(FunctionIdentifier):
 
         return type(self)(reduction_op)
 
+    hash_fields = (
+            "reduction_op",)
+
+    update_persistent_hash = update_persistent_hash
 
 # }}}
 
@@ -257,13 +312,30 @@ class _SegmentedScalarReductionOperation(ReductionOperation):
                 scalar_dtype.numpy_dtype.type.__name__,
                 segment_flag_dtype.numpy_dtype.type.__name__)
 
-    def neutral_element(self, scalar_dtype, segment_flag_dtype):
-        scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype)
-        return var("make_tuple")(scalar_neutral_element,
-                segment_flag_dtype.numpy_dtype.type(0))
+    def neutral_element(self, scalar_dtype, segment_flag_dtype,
+            callables_table, target):
+        from loopy.library.function import MakeTupleCallable
+        from loopy.program import update_table
 
-    def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype):
-        return (self.inner_reduction.result_dtypes(kernel, scalar_dtype)
+        scalar_neutral_element, callables_table = (
+                self.inner_reduction.neutral_element(
+                    scalar_dtype, callables_table, target))
+
+        make_tuple_callable = MakeTupleCallable(
+                name="make_tuple")
+
+        make_tuple_callable, callables_table = make_tuple_callable.with_types(
+                dict(enumerate([scalar_dtype, segment_flag_dtype])),
+                callables_table)
+
+        func_id, callables_table = update_table(
+                callables_table, "make_tuple", make_tuple_callable)
+
+        return ResolvedFunction(func_id)(scalar_neutral_element,
+                segment_flag_dtype.numpy_dtype.type(0)), callables_table
+
+    def result_dtypes(self, scalar_dtype, segment_flag_dtype):
+        return (self.inner_reduction.result_dtypes(scalar_dtype)
                 + (segment_flag_dtype,))
 
     def __str__(self):
@@ -273,10 +345,26 @@ class _SegmentedScalarReductionOperation(ReductionOperation):
         return hash(type(self))
 
     def __eq__(self, other):
-        return type(self) == type(other)
+        return type(self) == type(other) and (self.inner_reduction ==
+                other.inner_reduction)
 
-    def __call__(self, dtypes, operand1, operand2):
-        return SegmentedOp(self)(*(operand1 + operand2))
+    def __call__(self, dtypes, operand1, operand2, callables_table, target):
+        segmented_scalar_callable = ReductionCallable(
+                SegmentedOp(self))
+
+        # type-specialize the callable
+        segmented_scalar_callable, callables_table = (
+                segmented_scalar_callable.with_types(
+                    {0: dtypes[0], 1: dtypes[1], 2: dtypes[0], 3: dtypes[1]},
+                    callables_table))
+
+        # populate callables_table
+        from loopy.program import update_table
+        func_id, callables_table = update_table(
+                callables_table, SegmentedOp(self), segmented_scalar_callable)
+
+        return (ResolvedFunction(func_id)(*(operand1 + operand2)),
+                callables_table)
 
 
 class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation):
@@ -284,34 +372,24 @@
     which = "sum"
     op = "((%s) + (%s))"
 
+    hash_fields = (
+            "which",
+            "op",)
+
+    update_persistent_hash = update_persistent_hash
+
 
 class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation):
     base_reduction_class = ProductReductionOperation
     op = "((%s) * (%s))"
     which = "product"
 
+    hash_fields = (
+            "which",
+            "op",
+            "base_reduction_class",)
 
-def get_segmented_function_preamble(kernel, func_id, arg_dtypes):
-    op = func_id.reduction_op
-    scalar_dtype = arg_dtypes[0]
-    segment_flag_dtype = arg_dtypes[1]
-    prefix
= op.prefix(scalar_dtype, segment_flag_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, - %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, - %(segment_flag_t)s *segment_flag_out) - { - *segment_flag_out = segment_flag1 | segment_flag2; - return segment_flag2 ? op2 : %(combined)s; - } - """ % dict( - scalar_t=kernel.target.dtype_to_typename(scalar_dtype), - prefix=prefix, - segment_flag_t=kernel.target.dtype_to_typename(segment_flag_dtype), - combined=op.op % ("op1", "op2"), - )) + update_persistent_hash = update_persistent_hash # }}} @@ -337,15 +415,30 @@ class _ArgExtremumReductionOperation(ReductionOperation): scalar_dtype.numpy_dtype.type.__name__, index_dtype.numpy_dtype.type.__name__) - def result_dtypes(self, kernel, scalar_dtype, index_dtype): + def result_dtypes(self, scalar_dtype, index_dtype): return (scalar_dtype, index_dtype) - def neutral_element(self, scalar_dtype, index_dtype): + def neutral_element(self, scalar_dtype, index_dtype, callables_table, + target): scalar_neutral_func = ( get_ge_neutral if self.neutral_sign < 0 else get_le_neutral) scalar_neutral_element = scalar_neutral_func(scalar_dtype) - return var("make_tuple")(scalar_neutral_element, - index_dtype.numpy_dtype.type(-1)) + + from loopy.library.function import MakeTupleCallable + from loopy.program import update_table + make_tuple_callable = MakeTupleCallable( + name="make_tuple") + + make_tuple_callable, callables_table = make_tuple_callable.with_types( + dict(enumerate([scalar_dtype, index_dtype])), + callables_table) + + # populate callables_table + func_id, callables_table = update_table(callables_table, "make_tuple", + make_tuple_callable) + + return ResolvedFunction(func_id)(scalar_neutral_element, + index_dtype.numpy_dtype.type(-1)), callables_table def __str__(self): return self.which @@ -360,8 +453,22 @@ class _ArgExtremumReductionOperation(ReductionOperation): def arg_count(self): return 2 - def __call__(self, dtypes, operand1, operand2): - return ArgExtOp(self)(*(operand1 + operand2)) + def __call__(self, dtypes, operand1, operand2, callables_table, target): + arg_ext_scalar_callable = ReductionCallable(ArgExtOp(self)) + + # type specialize the callable + arg_ext_scalar_callable, callables_table = ( + arg_ext_scalar_callable.with_types( + {0: dtypes[0], 1: dtypes[1], 2: dtypes[0], 3: dtypes[1]}, + callables_table)) + + # populate callables_table + from loopy.program import update_table + func_id, callables_table = update_table( + callables_table, ArgExtOp(self), arg_ext_scalar_callable) + + return (ResolvedFunction(func_id)(*(operand1 + operand2)), + callables_table) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): @@ -369,43 +476,23 @@ class ArgMaxReductionOperation(_ArgExtremumReductionOperation): update_comparison = ">=" neutral_sign = -1 + hash_fields = ("which", + "update_comparison", + "neutral_sign",) + + update_persistent_hash = update_persistent_hash + class ArgMinReductionOperation(_ArgExtremumReductionOperation): which = "min" update_comparison = "<=" neutral_sign = +1 + hash_fields = ("which", + "update_comparison", + "neutral_sign",) -def get_argext_preamble(kernel, func_id, arg_dtypes): - op = func_id.reduction_op - scalar_dtype = arg_dtypes[0] - index_dtype = arg_dtypes[1] - - prefix = op.prefix(scalar_dtype, index_dtype) - - return (prefix, """ - inline %(scalar_t)s %(prefix)s_op( - %(scalar_t)s op1, %(index_t)s index1, - %(scalar_t)s op2, %(index_t)s index2, - %(index_t)s *index_out) - { - if 
(op2 %(comp)s op1)
-        {
-            *index_out = index2;
-            return op2;
-        }
-        else
-        {
-            *index_out = index1;
-            return op1;
-        }
-    }
-    """ % dict(
-            scalar_t=kernel.target.dtype_to_typename(scalar_dtype),
-            prefix=prefix,
-            index_t=kernel.target.dtype_to_typename(index_dtype),
-            comp=op.update_comparison,
-            ))
+    update_persistent_hash = update_persistent_hash
 
 # }}}
 
@@ -460,70 +547,86 @@ def parse_reduction_op(name):
 # }}}
 
 
-def reduction_function_mangler(kernel, func_id, arg_dtypes):
-    if isinstance(func_id, ArgExtOp):
-        from loopy.target.opencl import CFamilyTarget
-        if not isinstance(kernel.target, CFamilyTarget):
-            raise LoopyError("%s: only C-like targets supported for now" % func_id)
-
-        op = func_id.reduction_op
-        scalar_dtype = arg_dtypes[0]
-        index_dtype = arg_dtypes[1]
-
-        from loopy.kernel.data import CallMangleInfo
-        return CallMangleInfo(
-                target_name="%s_op" % op.prefix(
-                    scalar_dtype, index_dtype),
-                result_dtypes=op.result_dtypes(
-                    kernel, scalar_dtype, index_dtype),
-                arg_dtypes=(
-                    scalar_dtype,
-                    index_dtype,
-                    scalar_dtype,
-                    index_dtype),
-                )
-
-    elif isinstance(func_id, SegmentedOp):
-        from loopy.target.opencl import CFamilyTarget
-        if not isinstance(kernel.target, CFamilyTarget):
-            raise LoopyError("%s: only C-like targets supported for now" % func_id)
-
-        op = func_id.reduction_op
-        scalar_dtype = arg_dtypes[0]
-        segment_flag_dtype = arg_dtypes[1]
-
-        from loopy.kernel.data import CallMangleInfo
-        return CallMangleInfo(
-                target_name="%s_op" % op.prefix(
-                    scalar_dtype, segment_flag_dtype),
-                result_dtypes=op.result_dtypes(
-                    kernel, scalar_dtype, segment_flag_dtype),
-                arg_dtypes=(
-                    scalar_dtype,
-                    segment_flag_dtype,
-                    scalar_dtype,
-                    segment_flag_dtype),
-                )
-
-    return None
+# {{{ reduction specific callables
+
+class ReductionCallable(ScalarCallable):
+    def with_types(self, arg_id_to_dtype, callables_table):
+        scalar_dtype = arg_id_to_dtype[0]
+        index_dtype = arg_id_to_dtype[1]
+        result_dtypes = self.name.reduction_op.result_dtypes(scalar_dtype,
+                index_dtype)
+        new_arg_id_to_dtype = arg_id_to_dtype.copy()
+        new_arg_id_to_dtype[-1] = result_dtypes[0]
+        new_arg_id_to_dtype[-2] = result_dtypes[1]
+        name_in_target = self.name.reduction_op.prefix(scalar_dtype,
+                index_dtype) + "_op"
+
+        return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
+                name_in_target=name_in_target), callables_table
+
+    def with_descrs(self, arg_id_to_descr, callables_table):
+        from loopy.kernel.function_interface import ValueArgDescriptor
+        new_arg_id_to_descr = arg_id_to_descr.copy()
+        new_arg_id_to_descr[-1] = ValueArgDescriptor()
+        return (
+                self.copy(arg_id_to_descr=new_arg_id_to_descr),
+                callables_table)
+
+    def generate_preambles(self, target):
+        if isinstance(self.name, ArgExtOp):
+            op = self.name.reduction_op
+            scalar_dtype = self.arg_id_to_dtype[-1]
+            index_dtype = self.arg_id_to_dtype[-2]
+
+            prefix = op.prefix(scalar_dtype, index_dtype)
+
+            yield (prefix, """
+            inline {scalar_t} {prefix}_op(
+                {scalar_t} op1, {index_t} index1,
+                {scalar_t} op2, {index_t} index2,
+                {index_t} *index_out)
+            {{
+                if (op2 {comp} op1)
+                {{
+                    *index_out = index2;
+                    return op2;
+                }}
+                else
+                {{
+                    *index_out = index1;
+                    return op1;
+                }}
+            }}
+            """.format(
+                    scalar_t=target.dtype_to_typename(scalar_dtype),
+                    prefix=prefix,
+                    index_t=target.dtype_to_typename(index_dtype),
+                    comp=op.update_comparison,
+                    ))
+        elif isinstance(self.name, SegmentedOp):
+            op = self.name.reduction_op
+            scalar_dtype = self.arg_id_to_dtype[-1]
+            segment_flag_dtype = self.arg_id_to_dtype[-2]
+            prefix = op.prefix(scalar_dtype, segment_flag_dtype)
+
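+            # For example (a hypothetical instantiation, assuming a segmented
+            # sum over float32 data with int32 segment flags on a C-family
+            # target), the template below would render roughly as:
+            #
+            #     inline float loopy_segmented_sum_float32_int32_op(
+            #         float op1, int segment_flag1,
+            #         float op2, int segment_flag2,
+            #         int *segment_flag_out)
+            #     {
+            #         *segment_flag_out = segment_flag1 | segment_flag2;
+            #         return segment_flag2 ? op2 : ((op1) + (op2));
+            #     }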
+ yield (prefix, """ + inline {scalar_t} {prefix}_op( + {scalar_t} op1, {segment_flag_t} segment_flag1, + {scalar_t} op2, {segment_flag_t} segment_flag2, + {segment_flag_t} *segment_flag_out) + {{ + *segment_flag_out = segment_flag1 | segment_flag2; + return segment_flag2 ? op2 : {combined}; + }} + """.format( + scalar_t=target.dtype_to_typename(scalar_dtype), + prefix=prefix, + segment_flag_t=target.dtype_to_typename(segment_flag_dtype), + combined=op.op % ("op1", "op2"), + )) + + return - -def reduction_preamble_generator(preamble_info): - from loopy.target.opencl import OpenCLTarget - - for func in preamble_info.seen_functions: - if isinstance(func.name, ArgExtOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_argext_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) - - elif isinstance(func.name, SegmentedOp): - if not isinstance(preamble_info.kernel.target, OpenCLTarget): - raise LoopyError("only OpenCL supported for now") - - yield get_segmented_function_preamble(preamble_info.kernel, func.name, - func.arg_dtypes) +# }}} # vim: fdm=marker diff --git a/loopy/loop.py b/loopy/loop.py index 7f5744b482fa2fb6cfbed64ee27486af9cb36e40..73ca8d72824071b36bf91798ba9a1ea14e624db7 100644 --- a/loopy/loop.py +++ b/loopy/loop.py @@ -22,13 +22,15 @@ THE SOFTWARE. import islpy as isl +from loopy.program import iterate_over_kernels_if_given_program def potential_loop_nest_map(kernel): """Returns a dictionary mapping inames to other inames that *could* be nested around them. - :seealso: :func:`loopy.schedule.loop_nest_map` + * :seealso: :func:`loopy.schedule.loop_nest_map` + * :seealso: :func:`loopy.schedule.find_loop_nest_around_map` """ result = {} @@ -52,7 +54,9 @@ def potential_loop_nest_map(kernel): return result -def fuse_loop_domains(kernel): +@iterate_over_kernels_if_given_program +def merge_loop_domains(kernel): + # FIXME: This should be moved to loopy.transforms.iname from loopy.kernel.tools import is_domain_dependent_on_inames while True: @@ -60,11 +64,13 @@ def fuse_loop_domains(kernel): parents_per_domain = kernel.parents_per_domain() all_parents_per_domain = kernel.all_parents_per_domain() + iname_to_insns = kernel.iname_to_insns() + new_domains = None for inner_iname, outer_inames in lnm.items(): for outer_iname in outer_inames: - # {{{ check if it's safe to fuse + # {{{ check if it's safe to merge inner_domain_idx = kernel.get_home_domain_index(inner_iname) outer_domain_idx = kernel.get_home_domain_index(outer_iname) @@ -72,12 +78,28 @@ def fuse_loop_domains(kernel): if inner_domain_idx == outer_domain_idx: break + if (not iname_to_insns[inner_iname] + or not iname_to_insns[outer_iname]): + # Inames without instructions occur when used in + # a SubArrayRef. We don't want monster SubArrayRef domains, + # so refuse to merge those. + continue + + if iname_to_insns[inner_iname] != iname_to_insns[outer_iname]: + # The two inames are imperfectly nested. Domain fusion + # might be invalid when the inner loop is empty, leading to + # the outer loop also being empty. + + # FIXME: Not fully correct, does not consider reductions + # https://gitlab.tiker.net/inducer/loopy/issues/172 + continue + if ( outer_domain_idx in all_parents_per_domain[inner_domain_idx] and not outer_domain_idx == parents_per_domain[inner_domain_idx]): # Outer domain is not a direct parent of the inner - # domain. Unable to fuse. + # domain. Unable to merge. 
continue outer_dom = kernel.domains[outer_domain_idx] @@ -87,7 +109,7 @@ def fuse_loop_domains(kernel): if is_domain_dependent_on_inames(kernel, inner_domain_idx, outer_inames): # Bounds of inner domain depend on outer domain. - # Unable to fuse. + # Unable to merge. continue # }}} diff --git a/loopy/match.py b/loopy/match.py index 9160402b48c81e4126f0f73f8fde6f6f5406e8b4..7ecbfcfaef925890f2de9951e70feb9bf3fbbf6f 100644 --- a/loopy/match.py +++ b/loopy/match.py @@ -50,6 +50,7 @@ Match expressions .. autoclass:: Tagged .. autoclass:: Writes .. autoclass:: Reads +.. autoclass:: InKernel .. autoclass:: Iname """ @@ -74,6 +75,7 @@ _id = intern("_id") _tag = intern("_tag") _writes = intern("_writes") _reads = intern("_reads") +_in_kernel = intern("_in_kernel") _iname = intern("_iname") _whitespace = intern("_whitespace") @@ -93,13 +95,14 @@ _LEX_TABLE = [ (_tag, RE(r"tag:([\w?*]+)")), (_writes, RE(r"writes:([\w?*]+)")), (_reads, RE(r"reads:([\w?*]+)")), + (_in_kernel, RE(r"in_kernel:([\w?*]+)")), (_iname, RE(r"iname:([\w?*]+)")), (_whitespace, RE("[ \t]+")), ] -_TERMINALS = ([_id, _tag, _writes, _reads, _iname]) +_TERMINALS = ([_id, _tag, _writes, _reads, _in_kernel, _iname]) # {{{ operator precedence @@ -293,6 +296,11 @@ class Reads(GlobMatchExpressionBase): for name in matchable.read_dependency_names()) +class InKernel(GlobMatchExpressionBase): + def __call__(self, kernel, matchable): + return self.re.match(kernel.name) + + class Iname(GlobMatchExpressionBase): def __call__(self, kernel, matchable): return any(self.re.match(name) @@ -330,6 +338,10 @@ def parse_match(expr): result = Reads(pstate.next_match_obj().group(1)) pstate.advance() return result + elif next_tag is _in_kernel: + result = InKernel(pstate.next_match_obj().group(1)) + pstate.advance() + return result elif next_tag is _iname: result = Iname(pstate.next_match_obj().group(1)) pstate.advance() diff --git a/loopy/preprocess.py b/loopy/preprocess.py index e59c275d29c96775c143942e6c2477b78a8a2c07..90e527ae4a29715a81608079d4fdd88025cc0abf 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -20,11 +20,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. 
""" +import logging +logger = logging.getLogger(__name__) from loopy.diagnostic import ( LoopyError, WriteRaceConditionWarning, warn_with_kernel, LoopyAdvisory) - import islpy as isl from pytools.persistent_dict import WriteOncePersistentDict @@ -35,23 +36,34 @@ from loopy.kernel.data import make_assignment, filter_iname_tags_by_type from loopy.kernel.tools import kernel_has_global_barriers # for the benefit of loopy.statistics, for now from loopy.type_inference import infer_unknown_types -from loopy.transform.iname import remove_any_newly_unused_inames +from loopy.symbolic import RuleAwareIdentityMapper, ReductionCallbackMapper +# from loopy.transform.iname import remove_any_newly_unused_inames -import logging -logger = logging.getLogger(__name__) +from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction, + CallInstruction, _DataObliviousInstruction) +from loopy.kernel import LoopKernel +from loopy.program import Program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable + +from pytools import ProcessLogger +from functools import partial # {{{ prepare for caching -def prepare_for_caching(kernel): +def prepare_for_caching_inner(kernel): import loopy as lp + from loopy.types import OpaqueType new_args = [] tgt = kernel.target for arg in kernel.args: dtype = arg.dtype - if dtype is not None and dtype is not lp.auto and dtype.target is not tgt: + if (dtype is not None + and not isinstance(dtype, OpaqueType) + and dtype is not lp.auto + and dtype.target is not tgt): arg = arg.copy(dtype=dtype.with_target(tgt), target=tgt) new_args.append(arg) @@ -70,6 +82,32 @@ def prepare_for_caching(kernel): return kernel + +def prepare_for_caching(program): + if isinstance(program, LoopKernel): + return prepare_for_caching_inner(program) + + assert isinstance(program, Program) + tgt = program.target + + new_clbls = {} + for name, clbl in program.callables_table.items(): + if clbl.arg_id_to_dtype is not None: + arg_id_to_dtype = {id: dtype.with_target(tgt) + for id, dtype in clbl.arg_id_to_dtype.items()} + clbl = clbl.copy(arg_id_to_dtype=arg_id_to_dtype) + if isinstance(clbl, ScalarCallable): + pass + elif isinstance(clbl, CallableKernel): + subknl = prepare_for_caching_inner(clbl.subkernel) + clbl = clbl.copy(subkernel=subknl) + else: + raise NotImplementedError(type(clbl)) + + new_clbls[name] = clbl + + return program.copy(callables_table=new_clbls) + # }}} @@ -244,15 +282,11 @@ def find_temporary_address_space(kernel): desired_aspace_per_insn.append(desired_aspace) if not desired_aspace_per_insn: - if temp_var.initializer is None: - warn_with_kernel(kernel, "temp_to_write(%s)" % temp_var.name, - "temporary variable '%s' never written, eliminating" - % temp_var.name, LoopyAdvisory) - else: - raise LoopyError("temporary variable '%s': never written, " - "cannot automatically determine address space" - % temp_var.name) + warn_with_kernel(kernel, "temp_to_write(%s)" % temp_var.name, + "cannot automatically determine address space of '%s'" + % temp_var.name, LoopyAdvisory) + new_temp_vars[temp_var.name] = temp_var continue overall_aspace = max(desired_aspace_per_insn) @@ -741,7 +775,7 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): # }}} - from loopy.kernel.instruction import CallInstruction + from loopy.kernel.instruction import CallInstruction, is_array_call for insn in kernel.instructions: if not isinstance(insn, CallInstruction): continue @@ -749,6 +783,9 @@ def 
_hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): if len(insn.assignees) <= 1: continue + if is_array_call(insn.assignees, insn.expression): + continue + assignees = insn.assignees assignee_var_names = insn.assignee_var_names() @@ -882,10 +919,21 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): # }}} -@remove_any_newly_unused_inames -def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, - automagic_scans_ok=False, force_scan=False, - force_outer_iname_for_scan=None): +class RealizeReductionCallbackMapper(ReductionCallbackMapper): + def __init__(self, callback, callables_table): + super().__init__(callback) + self.callables_table = callables_table + + def map_reduction(self, expr, **kwargs): + result, self.callables_table = self.callback(expr, self.rec, + **kwargs) + return result + + +# @remove_any_newly_unused_inames +def realize_reduction_for_single_kernel(kernel, callables_table, + insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False, + force_scan=False, force_outer_iname_for_scan=None): """Rewrites reductions into their imperative form. With *insn_id_filter* specified, operate only on the instruction with an instruction id matching *insn_id_filter*. @@ -1005,7 +1053,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ sequential - def map_reduction_seq(expr, rec, nresults, arg_dtypes, + def map_reduction_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes): outer_insn_inames = insn.within_inames @@ -1037,13 +1085,16 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, init_id = insn_id_gen( "{}_{}_init".format(insn.id, "_".join(expr.inames))) + expression, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, target=kernel.target) + init_insn = make_assignment( id=init_id, assignees=acc_vars, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=expr.operation.neutral_element(*arg_dtypes) + expression=expression, # Do not inherit predicates: Those might read variables # that may not yet be set, and we don't have a great way @@ -1087,13 +1138,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, else: reduction_expr = expr.expr + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + reduction_expr, + callables_table, + kernel.target) + reduction_insn = make_assignment( id=update_id, assignees=acc_vars, - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, acc_vars), - reduction_expr), + expression=expression, depends_on=frozenset(reduction_insn_depends_on) | insn.depends_on, within_inames=update_insn_iname_deps, within_inames_is_final=insn.within_inames_is_final, @@ -1105,9 +1160,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0] + return acc_vars[0], callables_table else: - return acc_vars + return acc_vars, callables_table # }}} @@ -1139,7 +1194,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, v[iname].lt_set(v[0] + ubound)).get_basic_sets() return bs - def map_reduction_local(expr, rec, nresults, arg_dtypes, + def map_reduction_local(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes): red_iname, = expr.inames @@ -1190,7 +1245,8 @@ def 
realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, base_iname_deps = outer_insn_inames - frozenset(expr.inames) - neutral = expr.operation.neutral_element(*arg_dtypes) + neutral, callables_table = expr.operation.neutral_element(*arg_dtypes, + callables_table=callables_table, target=kernel.target) init_id = insn_id_gen(f"{insn.id}_{red_iname}_init") init_insn = make_assignment( id=init_id, @@ -1250,17 +1306,20 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, reduction_expr = expr.expr transfer_id = insn_id_gen(f"{insn.id}_{red_iname}_transfer") + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar( + neutral_var_names, + tuple(var(nvn) for nvn in neutral_var_names)), + reduction_expr, + callables_table, + kernel.target) transfer_insn = make_assignment( id=transfer_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(red_iname),)] for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar( - neutral_var_names, - tuple(var(nvn) for nvn in neutral_var_names)), - reduction_expr), + expression=expression, within_inames=( (outer_insn_inames - frozenset(expr.inames)) | frozenset([red_iname])), @@ -1289,22 +1348,26 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, new_iname_tags[stage_exec_iname] = kernel.iname_tags(red_iname) stage_id = insn_id_gen("red_%s_stage_%d" % (red_iname, istage)) + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars)), + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + ( + var(stage_exec_iname) + new_size,)] + for acc_var in acc_vars)), + callables_table, + kernel.target) + stage_insn = make_assignment( id=stage_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars)), - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + ( - var(stage_exec_iname) + new_size,)] - for acc_var in acc_vars))), + expression=expression, within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, @@ -1325,9 +1388,10 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0][outer_local_iname_vars + (0,)] + return acc_vars[0][outer_local_iname_vars + (0,)], callables_table else: - return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars] + return [acc_var[outer_local_iname_vars + (0,)] for acc_var in + acc_vars], callables_table # }}} # {{{ utils (stateful) @@ -1386,7 +1450,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # {{{ sequential scan - def map_scan_seq(expr, rec, nresults, arg_dtypes, + def map_scan_seq(expr, rec, callables_table, nresults, arg_dtypes, reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, scan_min_value, stride): outer_insn_inames = insn.within_inames @@ -1423,6 +1487,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if global_barrier is not None: init_insn_depends_on |= frozenset([global_barrier]) + expression, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, target=kernel.target) 
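+
+        # The init/update instruction pair built below follows the shape of a
+        # plain sequential scan; a minimal Python model of that shape (plain
+        # values and a Python loop stand in for loopy expressions and inames):
+        #
+        #     def realize_scan_seq(values, op, neutral):
+        #         acc = neutral            # init insn: the neutral element
+        #         out = []
+        #         for v in values:         # the sequential sweep iname
+        #             acc = op(acc, v)     # update insn: expr.operation(...)
+        #             out.append(acc)
+        #         return out
+        #
+        #     assert realize_scan_seq([1, 2, 3], lambda a, b: a + b, 0) == [1, 3, 6]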
+ init_insn = make_assignment( id=init_id, assignees=acc_vars, @@ -1430,7 +1497,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, (sweep_iname,) + expr.inames), within_inames_is_final=insn.within_inames_is_final, depends_on=init_insn_depends_on, - expression=expr.operation.neutral_element(*arg_dtypes), + expression=expression, # Do not inherit predicates: Those might read variables # that may not yet be set, and we don't have a great way # of figuring out what the dependencies of the accumulator @@ -1456,13 +1523,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if insn.within_inames_is_final: update_insn_iname_deps = insn.within_inames | {track_iname} + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + _strip_if_scalar(acc_vars, updated_inner_exprs), + callables_table, + kernel.target) + scan_insn = make_assignment( id=update_id, assignees=acc_vars, - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, acc_vars), - _strip_if_scalar(acc_vars, updated_inner_exprs)), + expression=expression, depends_on=frozenset(update_insn_depends_on), within_inames=update_insn_iname_deps, no_sync_with=insn.no_sync_with, @@ -1476,25 +1547,25 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0] + return acc_vars[0], callables_table else: - return acc_vars + return acc_vars, callables_table # }}} # {{{ local-parallel scan - def map_scan_local(expr, rec, nresults, arg_dtypes, - reduction_dtypes, sweep_iname, scan_iname, - sweep_min_value, scan_min_value, stride): + def map_scan_local(expr, rec, callables_table, nresults, arg_dtypes, + reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, + scan_min_value, stride): scan_size = _get_int_iname_size(sweep_iname) assert scan_size > 0 if scan_size == 1: - return map_reduction_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + return map_reduction_seq(expr, rec, callables_table, + nresults, arg_dtypes, reduction_dtypes) outer_insn_inames = insn.within_inames @@ -1552,7 +1623,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, base_iname_deps = (outer_insn_inames - frozenset(expr.inames) - frozenset([sweep_iname])) - neutral = expr.operation.neutral_element(*arg_dtypes) + neutral, callables_table = expr.operation.neutral_element( + *arg_dtypes, callables_table=callables_table, target=kernel.target) init_insn_depends_on = insn.depends_on @@ -1660,19 +1732,23 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, write_stage_id = insn_id_gen( "scan_%s_write_stage_%d" % (scan_iname, istage)) + + expression, callables_table = expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, read_vars), + _strip_if_scalar(acc_vars, tuple( + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars)), + callables_table, + kernel.target) + write_stage_insn = make_assignment( id=write_stage_id, assignees=tuple( acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] for acc_var in acc_vars), - expression=expr.operation( - arg_dtypes, - _strip_if_scalar(acc_vars, read_vars), - _strip_if_scalar(acc_vars, tuple( - acc_var[ - outer_local_iname_vars + (var(stage_exec_iname),)] - for acc_var in acc_vars)) - ), + expression=expression, within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, @@ -1693,16 +1769,17 @@ def 
realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if nresults == 1: assert len(acc_vars) == 1 - return acc_vars[0][outer_local_iname_vars + (output_idx,)] + return (acc_vars[0][outer_local_iname_vars + (output_idx,)], + callables_table) else: return [acc_var[outer_local_iname_vars + (output_idx,)] - for acc_var in acc_vars] + for acc_var in acc_vars], callables_table # }}} # {{{ seq/par dispatch - def map_reduction(expr, rec, nresults=1): + def map_reduction(expr, rec, callables_table, nresults=1): # Only expand one level of reduction at a time, going from outermost to # innermost. Otherwise we get the (iname + insn) dependencies wrong. @@ -1710,7 +1787,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, infer_arg_and_reduction_dtypes_for_reduction_expression) arg_dtypes, reduction_dtypes = ( infer_arg_and_reduction_dtypes_for_reduction_expression( - temp_kernel, expr, unknown_types_ok)) + temp_kernel, expr, callables_table, unknown_types_ok)) outer_insn_inames = insn.within_inames bad_inames = frozenset(expr.inames) & outer_insn_inames @@ -1790,7 +1867,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # to reduce over. It's rather similar to an array with () shape in # numpy.) - return expr.expr + return expr.expr, callables_table # }}} @@ -1819,15 +1896,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, for tag in temp_kernel.iname_tags(sweep_iname)))) elif parallel: return map_scan_local( - expr, rec, nresults, arg_dtypes, reduction_dtypes, + expr, rec, callables_table, nresults, + arg_dtypes, reduction_dtypes, sweep_iname, scan_param.scan_iname, scan_param.sweep_lower_bound, scan_param.scan_lower_bound, scan_param.stride) elif sequential: return map_scan_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes, - sweep_iname, scan_param.scan_iname, + expr, rec, callables_table, nresults, + arg_dtypes, reduction_dtypes, sweep_iname, + scan_param.scan_iname, scan_param.sweep_lower_bound, scan_param.scan_lower_bound, scan_param.stride) @@ -1846,17 +1925,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, if n_sequential: assert n_local_par == 0 - return map_reduction_seq( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + return map_reduction_seq(expr, rec, callables_table, + nresults, arg_dtypes, reduction_dtypes) else: assert n_local_par > 0 return map_reduction_local( - expr, rec, nresults, arg_dtypes, reduction_dtypes) + expr, rec, callables_table, nresults, arg_dtypes, + reduction_dtypes) # }}} - from loopy.symbolic import ReductionCallbackMapper - cb_mapper = ReductionCallbackMapper(map_reduction) + cb_mapper = RealizeReductionCallbackMapper(map_reduction, callables_table) insn_queue = kernel.instructions[:] insn_id_replacements = {} @@ -1885,9 +1964,12 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, # Run reduction expansion. 
from loopy.symbolic import Reduction if isinstance(insn.expression, Reduction) and nresults > 1: - new_expressions = cb_mapper(insn.expression, nresults=nresults) + new_expressions = cb_mapper(insn.expression, + callables_table=cb_mapper.callables_table, + nresults=nresults) else: - new_expressions = (cb_mapper(insn.expression),) + new_expressions = cb_mapper(insn.expression, + callables_table=cb_mapper.callables_table), if generated_insns: # An expansion happened, so insert the generated stuff plus @@ -1967,13 +2049,32 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, kernel = lp.replace_instruction_ids(kernel, insn_id_replacements) - kernel = lp.tag_inames(kernel, new_iname_tags) + from loopy.transform.iname import tag_inames + kernel = tag_inames(kernel, new_iname_tags) kernel = ( _hackily_ensure_multi_assignment_return_values_are_scoped_private( kernel)) - return kernel + return kernel, cb_mapper.callables_table + + +def realize_reduction(program, *args, **kwargs): + assert isinstance(program, Program) + + callables_table = dict(program.callables_table) + kernels_to_scan = [in_knl_callable.subkernel + for in_knl_callable in program.callables_table.values() + if isinstance(in_knl_callable, CallableKernel)] + + for knl in kernels_to_scan: + new_knl, callables_table = realize_reduction_for_single_kernel( + knl, callables_table, *args, **kwargs) + in_knl_callable = callables_table[knl.name].copy( + subkernel=new_knl) + callables_table[knl.name] = in_knl_callable + + return program.copy(callables_table=callables_table) # }}} @@ -2043,37 +2144,254 @@ def check_atomic_loads(kernel): # }}} -preprocess_cache = WriteOncePersistentDict( - "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, - key_builder=LoopyKeyBuilder()) +# {{{ arg_descr_inference +class ArgDescrInferenceMapper(RuleAwareIdentityMapper): + """ + Infers the :attr:`loopy` + """ -def preprocess_kernel(kernel, device=None): - if device is not None: - from warnings import warn - warn("passing 'device' to preprocess_kernel() is deprecated", - DeprecationWarning, stacklevel=2) + def __init__(self, rule_mapping_context, caller_kernel, + callables_table): + super().__init__( + rule_mapping_context) + self.caller_kernel = caller_kernel + self.callables_table = callables_table + + def map_call(self, expr, expn_state, assignees=None): + from pymbolic.primitives import Call, CallWithKwargs, Variable + from loopy.kernel.function_interface import ValueArgDescriptor + from loopy.symbolic import ResolvedFunction + from loopy.kernel.array import ArrayBase + from loopy.kernel.data import ValueArg + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper - from loopy.kernel import KernelState - if kernel.state >= KernelState.PREPROCESSED: - return kernel + if not isinstance(expr.function, ResolvedFunction): + # ignore if the call is not to a ResolvedFunction + return super().map_call(expr, expn_state) - # {{{ cache retrieval + arg_id_to_val = dict(enumerate(expr.parameters)) + if isinstance(expr, CallWithKwargs): + arg_id_to_val.update(expr.kw_parameters) - from loopy import CACHING_ENABLED - if CACHING_ENABLED: - input_kernel = kernel + if assignees is not None: + # If supplied with assignees then this is a CallInstruction + for i, arg in enumerate(assignees): + arg_id_to_val[-i-1] = arg - try: - result = preprocess_cache[kernel] - logger.debug("%s: preprocess cache hit" % kernel.name) - return result - except KeyError: - pass + from loopy.kernel.function_interface import 
get_arg_descriptor_for_expression + arg_id_to_descr = { + arg_id: get_arg_descriptor_for_expression( + self.caller_kernel, arg) + for arg_id, arg in arg_id_to_val.items()} + in_knl_callable = self.callables_table[expr.function.name] - # }}} + # {{{ translating descriptor expressions to the callable's namespace + + deps_as_params = [] + subst_map = {} + + deps = frozenset().union(*(descr.depends_on() + for descr in arg_id_to_descr.values())) + + assert deps <= self.caller_kernel.all_variable_names() + + for dep in deps: + caller_arg = self.caller_kernel.arg_dict.get(dep, None) + caller_arg = self.caller_kernel.temporary_variables.get(dep, caller_arg) + + if not (isinstance(caller_arg, ValueArg) or (isinstance(caller_arg, + ArrayBase) and caller_arg.shape == ())): + raise NotImplementedError(f"Obtained '{dep}' as a dependency for" + f" call '{expr.function.name}' which is not a scalar.") + + in_knl_callable, callee_name = in_knl_callable.with_added_arg( + caller_arg.dtype, ValueArgDescriptor()) + + subst_map[dep] = Variable(callee_name) + deps_as_params.append(Variable(dep)) + + mapper = SubstitutionMapper(make_subst_func(subst_map)) + arg_id_to_descr = {id_: descr.map_expr(mapper) + for id_, descr in arg_id_to_descr.items()} + + # }}} + + # specializing the function according to the parameter description + new_in_knl_callable, self.callables_table = ( + in_knl_callable.with_descrs( + arg_id_to_descr, self.callables_table)) + + # find the deps of the new in-kernel callable and add those arguments + # to the call + self.callables_table, new_func_id = ( + self.callables_table.with_callable( + expr.function.function, + new_in_knl_callable)) + + if isinstance(expr, Call): + return Call( + ResolvedFunction(new_func_id), + tuple(self.rec(child, expn_state) + for child in expr.parameters) + + tuple(deps_as_params)) + else: + # FIXME: Order for vars when kwargs are present? + assert isinstance(expr, CallWithKwargs) + return CallWithKwargs( + ResolvedFunction(new_func_id), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + { + key: self.rec(val, expn_state) + for key, val in expr.kw_parameters.items()} + ) + + map_call_with_kwargs = map_call + + def __call__(self, expr, kernel, insn, assignees=None): + from loopy.kernel.data import InstructionBase + from loopy.symbolic import IdentityMapper, ExpansionState + assert insn is None or isinstance(insn, InstructionBase) + + return IdentityMapper.__call__(self, expr, + ExpansionState( + kernel=kernel, + instruction=insn, + stack=(), + arg_context={}), assignees=assignees) + + def map_kernel(self, kernel): + + new_insns = [] + + for insn in kernel.instructions: + if isinstance(insn, CallInstruction): + # In call instructions the assignees play an important role in + # determining the arg_id_to_descr + mapper = partial(self, kernel=kernel, insn=insn, + assignees=insn.assignees) + new_insns.append(insn.with_transformed_expressions(mapper)) + elif isinstance(insn, MultiAssignmentBase): + mapper = partial(self, kernel=kernel, insn=insn) + new_insns.append(insn.with_transformed_expressions(mapper)) + elif isinstance(insn, (_DataObliviousInstruction, CInstruction)): + new_insns.append(insn) + else: + raise NotImplementedError("arg_descr_inference for %s instruction" % + type(insn)) + + return kernel.copy(instructions=new_insns) + + +def traverse_to_infer_arg_descr(kernel, callables_table): + """ + Returns a copy of *kernel* with the argument shapes and strides matched for + the resolved functions in *kernel*.
Refer + :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`. + + .. note:: + + Initiates a walk starting from *kernel* to all its callee kernels. + """ + from loopy.symbolic import SubstitutionRuleMappingContext + + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + + arg_descr_inf_mapper = ArgDescrInferenceMapper(rule_mapping_context, + kernel, callables_table) + + descr_inferred_kernel = rule_mapping_context.finish_kernel( + arg_descr_inf_mapper.map_kernel(kernel)) + + return descr_inferred_kernel, arg_descr_inf_mapper.callables_table + + +def infer_arg_descr(program): + """ + Returns a copy of *program* with the + :attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the + callables. + """ + from loopy.program import make_clbl_inf_ctx, resolve_callables + from loopy.kernel.array import ArrayBase + from loopy.kernel.function_interface import (ArrayArgDescriptor, + ValueArgDescriptor) + from loopy import auto, ValueArg - logger.info("%s: preprocess start" % kernel.name) + program = resolve_callables(program) + + clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, + program.entrypoints) + + renamed_entrypoints = set() + + for e in program.entrypoints: + def _tuple_or_None(s): + if isinstance(s, tuple): + return s + elif s in [None, auto]: + return s + else: + return s, + + arg_id_to_descr = {} + for arg in program[e].args: + if isinstance(arg, ArrayBase): + if arg.shape not in (None, auto): + arg_id_to_descr[arg.name] = ArrayArgDescriptor( + _tuple_or_None(arg.shape), arg.address_space, + arg.dim_tags) + elif isinstance(arg, ValueArg): + arg_id_to_descr[arg.name] = ValueArgDescriptor() + else: + raise NotImplementedError() + new_callable, clbl_inf_ctx = program.callables_table[e].with_descrs( + arg_id_to_descr, clbl_inf_ctx) + clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable) + renamed_entrypoints.add(new_name.name) + + return clbl_inf_ctx.finish_program(program, renamed_entrypoints) + +# }}} + + +# {{{ inline_kernels_with_gbarriers + + +def inline_kernels_with_gbarriers(program): + from loopy.kernel.instruction import BarrierInstruction + from loopy.transform.callable import inline_callable_kernel + + def has_gbarrier(knl): + return any((isinstance(insn, BarrierInstruction) + and insn.synchronization_kind == "global") + for insn in knl.instructions) + + callees_to_inline = [name for name, knl_clbl in program.callables_table.items() + if (isinstance(knl_clbl, CallableKernel) + and has_gbarrier(knl_clbl.subkernel))] + + for callee_to_inline in callees_to_inline: + program = inline_callable_kernel(program, callee_to_inline) + + return program + + +# }}} + + +preprocess_cache = WriteOncePersistentDict( + "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, + key_builder=LoopyKeyBuilder()) + + +def preprocess_single_kernel(kernel, callables_table, device=None): + from loopy.kernel import KernelState + + prepro_logger = ProcessLogger(logger, "%s: preprocess" % kernel.name) from loopy.check import check_identifiers_in_subst_rules check_identifiers_in_subst_rules(kernel) @@ -2089,20 +2407,82 @@ def preprocess_kernel(kernel, device=None): # }}} - from loopy.transform.subst import expand_subst - kernel = expand_subst(kernel) - # Ordering restriction: # Type inference and reduction iname uniqueness don't handle substitutions. # Get them out of the way. 
- kernel = infer_unknown_types(kernel, expect_completion=False) - check_for_writes_to_predicates(kernel) check_reduction_iname_uniqueness(kernel) + # Ordering restriction: + # add_axes_to_temporaries_for_ilp because reduction accumulators + # need to be duplicated by this. + + kernel = realize_ilp(kernel) + + kernel = find_temporary_address_space(kernel) + + # check for atomic loads, much easier to do here now that the dependencies + # have been established + kernel = check_atomic_loads(kernel) + + kernel = kernel.target.preprocess(kernel) + + kernel = kernel.copy( + state=KernelState.PREPROCESSED) + + prepro_logger.done() + + return kernel + + +def preprocess_program(program, device=None): + + # {{{ cache retrieval + + from loopy import CACHING_ENABLED + if CACHING_ENABLED: + input_program = program + + try: + result = preprocess_cache[program] + logger.debug(f"program with entrypoints: {program.entrypoints}" + " preprocess cache hit") + return result + except KeyError: + pass + + # }}} + + from loopy.kernel import KernelState + if program.state >= KernelState.PREPROCESSED: + return program + + if len([clbl for clbl in program.callables_table.values() if + isinstance(clbl, CallableKernel)]) == 1: + program = program.with_entrypoints(",".join(clbl.name for clbl in + program.callables_table.values() if isinstance(clbl, + CallableKernel))) + + if not program.entrypoints: + raise LoopyError("Translation unit did not receive any entrypoints") + + from loopy.program import resolve_callables + program = resolve_callables(program) + + if device is not None: + # FIXME: Time to remove this? (Git blame shows 5 years ago) + from warnings import warn + warn("passing 'device' to preprocess_kernel() is deprecated", + DeprecationWarning, stacklevel=2) + + program = infer_unknown_types(program, expect_completion=False) + + from loopy.transform.subst import expand_subst + program = expand_subst(program) + from loopy.kernel.creation import apply_single_writer_depencency_heuristic - kernel = apply_single_writer_depencency_heuristic(kernel) + program = apply_single_writer_depencency_heuristic(program) # Ordering restrictions: # @@ -2113,26 +2493,44 @@ def preprocess_kernel(kernel, device=None): # because it manipulates the depends_on field, which could prevent # defaults from being applied. - kernel = realize_reduction(kernel, unknown_types_ok=False) + program = realize_reduction(program, unknown_types_ok=False) - # Ordering restriction: - # add_axes_to_temporaries_for_ilp because reduction accumulators - # need to be duplicated by this. + # {{{ preprocess callable kernels - kernel = realize_ilp(kernel) + # Callable editing restrictions: + # + # - should not edit callables_table in :meth:`preprocess_single_kernel` + # as we are iterating over it.[1] + # + # [1] https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects + + new_callables = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = preprocess_single_kernel( + in_knl_callable.subkernel, program.callables_table, + device) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." 
% ( + type(in_knl_callable).__name__)) - kernel = find_temporary_address_space(kernel) + new_callables[func_id] = in_knl_callable - # check for atomic loads, much easier to do here now that the dependencies - # have been established - kernel = check_atomic_loads(kernel) + program = program.copy(callables_table=new_callables) - kernel = kernel.target.preprocess(kernel) + # }}} - logger.info("%s: preprocess done" % kernel.name) + # infer arg descrs of the callables + program = infer_arg_descr(program) - kernel = kernel.copy( - state=KernelState.PREPROCESSED) + # Ordering restriction: + # callees with gbarrier in them must be inlined after inferring arg_descr. + # inline_kernels_with_gbarriers does not recursively inline the callees. + program = inline_kernels_with_gbarriers(program) # {{{ prepare for caching @@ -2142,15 +2540,20 @@ def preprocess_kernel(kernel, device=None): # this target information. if CACHING_ENABLED: - input_kernel = prepare_for_caching(input_kernel) + input_program = prepare_for_caching(input_program) - kernel = prepare_for_caching(kernel) + program = prepare_for_caching(program) # }}} if CACHING_ENABLED: - preprocess_cache.store_if_not_present(input_kernel, kernel) + preprocess_cache.store_if_not_present(input_program, program) + + return program + + +# FIXME: Do we add a deprecation warning? +preprocess_kernel = preprocess_program - return kernel # vim: foldmethod=marker diff --git a/loopy/program.py b/loopy/program.py new file mode 100644 index 0000000000000000000000000000000000000000..792abe59aa5f2b3660453d9f54d7c3d3dec94500 --- /dev/null +++ b/loopy/program.py @@ -0,0 +1,793 @@ +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import re +import collections + +from pytools import ImmutableRecord +from pymbolic.primitives import Variable +from functools import wraps + +from loopy.symbolic import (RuleAwareIdentityMapper, ResolvedFunction, + CombineMapper, SubstitutionRuleMappingContext) +from loopy.kernel.function_interface import ( + CallableKernel, ScalarCallable) +from loopy.kernel.instruction import ( + MultiAssignmentBase, CInstruction, _DataObliviousInstruction) +from loopy.diagnostic import LoopyError +from loopy.library.reduction import ReductionOpFunction + +from loopy.kernel import LoopKernel +from loopy.tools import update_persistent_hash +from pymbolic.primitives import Call, CallWithKwargs +from functools import reduce +from pyrsistent import pmap, PMap + +__doc__ = """ + +..
currentmodule:: loopy + +.. autoclass:: Program + +.. autofunction:: make_program +.. autofunction:: iterate_over_kernels_if_given_program + +""" + + +def find_in_knl_callable_from_identifier( + function_id_to_in_knl_callable_mappers, target, identifier): + """ + Returns an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` if *identifier* + is known to any kernel function scoper, otherwise returns + *None*. + """ + for func_id_to_in_knl_callable_mapper in ( + function_id_to_in_knl_callable_mappers): + # FIXME: do we really need to pass the target here? + in_knl_callable = func_id_to_in_knl_callable_mapper( + target, identifier) + if in_knl_callable is not None: + return in_knl_callable + + return None + + +def _is_a_reduction_op(expr): + if isinstance(expr, ResolvedFunction): + return _is_a_reduction_op(expr.function) + + from loopy.library.reduction import ReductionOpFunction + return isinstance(expr, ReductionOpFunction) + + +class CallableResolver(RuleAwareIdentityMapper): + """ + Resolves callables in expressions and records the names of the calls + resolved. + + .. attribute:: known_callables + + An instance of :class:`frozenset` of the call names to be resolved. + + .. attribute:: rule_mapping_context + + An instance of :class:`loopy.symbolic.SubstitutionRuleMappingContext`. + """ + def __init__(self, rule_mapping_context, known_callables): + assert isinstance(known_callables, frozenset) + + super().__init__(rule_mapping_context) + + self.known_callables = known_callables + + # a record of the call names that were resolved + self.calls_resolved = set() + + def map_call(self, expr, expn_state): + from loopy.symbolic import parse_tagged_name + + if not _is_a_reduction_op(expr.function): + name, tag = parse_tagged_name(expr.function) + else: + if isinstance(expr.function, ResolvedFunction): + name = expr.function.function + else: + name = expr.function + + if name in self.known_callables: + params = tuple(self.rec(par, expn_state) for par in expr.parameters) + + # record that we resolved a call + self.calls_resolved.add(name) + + function = expr.function + + if not isinstance(expr.function, ResolvedFunction): + function = ResolvedFunction(expr.function) + + return Call(function, params) + + return super().map_call(expr, expn_state) + + def map_call_with_kwargs(self, expr, expn_state): + from loopy.symbolic import parse_tagged_name + name, tag = parse_tagged_name(expr.function) + + if name in self.known_callables: + params = tuple(self.rec(par, expn_state) for par in expr.parameters) + kw_params = {kw: self.rec(par, expn_state) + for kw, par in expr.kw_parameters.items()} + + # record that we resolved a call + self.calls_resolved.add(name) + + return CallWithKwargs(ResolvedFunction(expr.function), params, kw_params) + + return super().map_call_with_kwargs(expr, expn_state) + + +# {{{ program + +class Program(ImmutableRecord): + """ + Records the information about all the callables in a :mod:`loopy` program. + + .. attribute:: entrypoints + + A :class:`frozenset` of the names of the kernels which + could be called from the host. + + .. attribute:: callables_table + + An instance of :class:`pyrsistent.PMap` mapping the function + identifiers in a kernel to their associated instances of + :class:`loopy.kernel.function_interface.InKernelCallable`. + + .. attribute:: target + + An instance of :class:`loopy.target.TargetBase`. + + ..
attribute:: func_id_to_in_knl_callable_mappers + + A :class:`frozenset` of functions of the signature ``(target: + TargetBase, function_identifier: str)`` that would return an instance + of :class:`loopy.kernel.function_interface.InKernelCallable` or *None*. + + .. note:: + + - To create an instance of :class:`loopy.Program`, it is recommended to + go through :func:`loopy.make_kernel`. + - This data structure and its attributes should be considered + immutable, any modifications should be done through :meth:`copy`. + + .. automethod:: __init__ + .. method:: __getitem__ + + Look up the resolved callable with identifier *name*. + """ + def __init__(self, + entrypoints=frozenset(), + callables_table=pmap(), + target=None, + func_id_to_in_knl_callable_mappers=[]): + + # {{{ sanity checks + + assert isinstance(callables_table, collections.abc.Mapping) + assert isinstance(entrypoints, frozenset) + + if not isinstance(callables_table, PMap): + callables_table = pmap(callables_table) + + # }}} + + super().__init__( + entrypoints=entrypoints, + callables_table=pmap(callables_table), + target=target, + func_id_to_in_knl_callable_mappers=( + func_id_to_in_knl_callable_mappers)) + + self._program_executor_cache = {} + + hash_fields = ( + "entrypoints", + "callables_table", + "target",) + + update_persistent_hash = update_persistent_hash + + def copy(self, **kwargs): + target = kwargs.pop("target", None) + program = super().copy(**kwargs) + if target: + from loopy.kernel import KernelState + if max(callable_knl.subkernel.state + for callable_knl in self.callables_table.values() + if isinstance(callable_knl, CallableKernel)) > ( + KernelState.INITIAL): + if not isinstance(target, type(self.target)): + raise LoopyError("One of the kernels in the program has been " + "preprocessed, cannot modify target now.") + + new_callables = {} + for func_id, clbl in program.callables_table.items(): + if isinstance(clbl, CallableKernel): + knl = clbl.subkernel + knl = knl.copy(target=target) + clbl = clbl.copy(subkernel=knl) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError() + new_callables[func_id] = clbl + + program = super().copy( + callables_table=new_callables, target=target) + + return program + + def with_entrypoints(self, entrypoints): + """ + :param entrypoints: Either a comma-separated :class:`str` or + :class:`frozenset`. + """ + if isinstance(entrypoints, str): + entrypoints = frozenset([e.strip() for e in + entrypoints.split(",")]) + + assert isinstance(entrypoints, frozenset) + + return self.copy(entrypoints=entrypoints) + + @property + def state(self): + """ Returns an instance of :class:`loopy.kernel.KernelState`. """ + return min(callable_knl.subkernel.state for callable_knl in + self.callables_table.values() if + isinstance(callable_knl, CallableKernel)) + + def with_kernel(self, kernel): + """ + If *self* contains a callable kernel with *kernel*'s name, replaces its + subkernel and returns a copy of *self*. Else records a new callable + kernel with *kernel* as its subkernel. + + :arg kernel: An instance of :class:`loopy.kernel.LoopKernel`. + :returns: Copy of *self* with updated callable kernels.
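+
+        *Example (an illustrative sketch; the entrypoint name ``"main"`` is
+        hypothetical):* ::
+
+            knl = prog["main"]            # look up the subkernel
+            knl = knl.copy(name="main")   # ...or any other modification
+            prog = prog.with_kernel(knl)  # store the updated kernel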
+ """ + if kernel.name in self.callables_table: + # update the callable kernel + new_in_knl_callable = self.callables_table[kernel.name].copy( + subkernel=kernel) + new_callables = self.callables_table.remove(kernel.name).set( + kernel.name, new_in_knl_callable) + return self.copy(callables_table=new_callables) + else: + # add a new callable kernel + clbl = CallableKernel(kernel) + new_callables = self.callables_table.set(kernel.name, clbl) + return self.copy(callables_table=new_callables) + + def __getitem__(self, name): + result = self.callables_table[name] + if isinstance(result, CallableKernel): + return result.subkernel + else: + return result + + def __call__(self, *args, **kwargs): + entrypoint = kwargs.get("entrypoint", None) + + if entrypoint is None: + # did not receive an entrypoint for the program to execute + if len(self.entrypoints) == 1: + entrypoint, = list(self.entrypoints) + else: + raise TypeError("Program.__call__() missing 1 required" + " keyword argument: 'entrypoint'") + + if entrypoint not in self.entrypoints: + raise LoopyError("'{}' not in list possible entrypoints supplied to" + " the program. Maybe you want to invoke 'with_entrypoints'" + " before calling the program.".format(entrypoint)) + + kwargs["entrypoint"] = entrypoint + + key = self.target.get_kernel_executor_cache_key(*args, **kwargs) + try: + pex = self._program_executor_cache[key] + except KeyError: + pex = self.target.get_kernel_executor(self, *args, **kwargs) + self._program_executor_cache[key] = pex + + return pex(*args, **kwargs) + + def __str__(self): + # FIXME: do a topological sort by the call graph + + def strify_callable(clbl): + return str(clbl.subkernel) + + return "\n".join( + strify_callable(clbl) + for name, clbl in self.callables_table.items() + if isinstance(clbl, CallableKernel)) + + def __setstate__(self, state_obj): + super().__setstate__(state_obj) + + self._program_executor_cache = {} + + def __hash__(self): + from loopy.tools import LoopyKeyBuilder + from pytools.persistent_dict import new_hash + key_hash = new_hash() + self.update_persistent_hash(key_hash, LoopyKeyBuilder()) + return hash(key_hash.digest()) + +# }}} + + +def next_indexed_function_identifier(function_id): + """ + Returns an instance of :class:`str` with the next indexed-name in the + sequence for the name of *function*. + + *Example:* ``'sin_0'`` will return ``'sin_1'``. + + :arg function_id: Either an instance of :class:`str`. + """ + + # {{{ sanity checks + + assert isinstance(function_id, str) + + # }}} + + func_name = re.compile(r"^(?P\S+?)_(?P\d+?)$") + + match = func_name.match(function_id) + + if match is None: + if function_id[-1] == "_": + return f"{function_id}0" + else: + return f"{function_id}_0" + + return "{alpha}_{num}".format(alpha=match.group("alpha"), + num=int(match.group("num"))+1) + + +class ResolvedFunctionRenamer(RuleAwareIdentityMapper): + """ + Mapper to rename the resolved functions in an expression according to + *renaming_dict*. + """ + def __init__(self, rule_mapping_context, renaming_dict): + super().__init__( + rule_mapping_context) + self.renaming_dict = renaming_dict + + def map_resolved_function(self, expr, expn_state): + if expr.name in self.renaming_dict: + return ResolvedFunction(self.renaming_dict[expr.name]) + else: + return super().map_resolved_function( + expr, expn_state) + + +def rename_resolved_functions_in_a_single_kernel(kernel, + renaming_dict): + """ + Returns a copy of *kernel* with the instances of :class:`ResolvedFunction` + renames according to *renaming_dict*. 
+ """ + from loopy.symbolic import SubstitutionRuleMappingContext + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + resolved_function_renamer = ResolvedFunctionRenamer(rule_mapping_context, + renaming_dict) + return ( + rule_mapping_context.finish_kernel( + resolved_function_renamer.map_kernel(kernel))) + + +class CallablesIDCollector(CombineMapper): + """ + Returns an instance of :class:`frozenset` containing instances of + :class:`loopy.kernel.function_interface.InKernelCallable` in the + :attr:``kernel`. + """ + def combine(self, values): + import operator + return reduce(operator.or_, values, frozenset()) + + def map_resolved_function(self, expr): + return frozenset([expr.name]) + + def map_constant(self, expr): + return frozenset() + + def map_kernel(self, kernel): + callables_in_insn = frozenset() + + for insn in kernel.instructions: + if isinstance(insn, MultiAssignmentBase): + callables_in_insn = callables_in_insn | ( + self(insn.expression)) + elif isinstance(insn, (CInstruction, _DataObliviousInstruction)): + pass + else: + raise NotImplementedError(type(insn).__name__) + + for rule in kernel.substitutions.values(): + callables_in_insn = callables_in_insn | ( + self(rule.expression)) + + return callables_in_insn + + map_variable = map_constant + map_function_symbol = map_constant + map_tagged_variable = map_constant + map_type_cast = map_constant + + +def _get_callable_ids_for_knl(knl, callables): + clbl_id_collector = CallablesIDCollector() + + return frozenset().union(*( + _get_callable_ids_for_knl(callables[clbl].subkernel, callables) | + frozenset([clbl]) if isinstance(callables[clbl], CallableKernel) else + frozenset([clbl]) + for clbl in clbl_id_collector.map_kernel(knl))) + + +def _get_callable_ids(callables, entrypoints): + return frozenset().union(*( + _get_callable_ids_for_knl(callables[e].subkernel, callables) for e in + entrypoints)) + + +def make_clbl_inf_ctx(callables, entrypoints): + return CallablesInferenceContext(callables, _get_callable_ids(callables, + entrypoints)) + + +class CallablesInferenceContext(ImmutableRecord): + def __init__(self, callables, old_callable_ids, history={}): + assert isinstance(callables, collections.abc.Mapping) + callables = dict(callables) + + super().__init__( + callables=callables, + old_callable_ids=old_callable_ids, + history=history) + + # {{{ interface to perform edits on callables + + def with_callable(self, function, in_kernel_callable): + """ + Returns an instance of :class:`tuple` ``(new_self, new_function)``. + + :arg function: An instance of :class:`pymbolic.primitives.Variable` or + :class:`loopy.library.reduction.ReductionOpFunction`. + + :arg in_kernel_callable: An instance of + :class:`loopy.InKernelCallable`. + """ + + # {{{ sanity checks + + if isinstance(function, str): + function = Variable(function) + + assert isinstance(function, (Variable, ReductionOpFunction)) + + # }}} + + history = self.history.copy() + + if in_kernel_callable in self.callables.values(): + # the callable already exists, hence return the function + # identifier corresponding to that callable. 
+ for func_id, in_knl_callable in self.callables.items(): + if in_knl_callable == in_kernel_callable: + history[func_id] = function.name + if isinstance(func_id, str): + return ( + self.copy( + history=history), + Variable(func_id)) + else: + assert isinstance(func_id, ReductionOpFunction) + return ( + self.copy( + history=history), + func_id) + + assert False + else: + # {{{ handle ReductionOpFunction + + if isinstance(function, ReductionOpFunction): + # FIXME: Check that, if we have two ArgMax functions + # with different types in the same kernel, the generated code + # does not mess up the types. + unique_function_identifier = function.copy() + updated_callables = self.callables.copy() + updated_callables[unique_function_identifier] = ( + in_kernel_callable) + + return ( + self.copy( + callables=updated_callables), + unique_function_identifier) + + # }}} + + unique_function_identifier = function.name + + while unique_function_identifier in self.callables: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + updated_callables = self.callables.copy() + updated_callables[unique_function_identifier] = ( + in_kernel_callable) + + history[unique_function_identifier] = function.name + + return ( + self.copy( + history=history, + callables=updated_callables), + Variable(unique_function_identifier)) + + def finish_program(self, program, renamed_entrypoints): + """ + Returns a copy of *program* with renaming of the callables done whenever + needed. + + *For example:* if all the ``sin`` calls diverged as ``sin_0, sin_1``, + then the renaming is done such that one of the flavors of the callable + is renamed back to ``sin``. + + :param renamed_entrypoints: A :class:`frozenset` of the names of the + renamed callable kernels which correspond to the entrypoints in + *self.callables_table*. + """ + assert len(renamed_entrypoints) == len(program.entrypoints) + new_callable_ids = _get_callable_ids(self.callables, renamed_entrypoints) + + callees_with_entrypoint_names = (program.entrypoints & + new_callable_ids) - renamed_entrypoints + + renames = {} + new_callables = {} + + for c in callees_with_entrypoint_names: + unique_function_identifier = c + + while unique_function_identifier in self.callables: + unique_function_identifier = ( + next_indexed_function_identifier( + unique_function_identifier)) + + renames[c] = unique_function_identifier + + # we should perform a rewrite here. + + for e in renamed_entrypoints: + renames[e] = self.history[e] + assert renames[e] in program.entrypoints + + # {{{ calculate the renames needed + + for old_func_id in ((self.old_callable_ids-new_callable_ids) - + program.entrypoints): + # at this point we should not rename anything to the names of + # entrypoints + for new_func_id in (new_callable_ids-renames.keys()) & set( + self.history.keys()): + if old_func_id == self.history[new_func_id]: + renames[new_func_id] = old_func_id + break + # }}} + + for e in renamed_entrypoints: + new_subkernel = self.callables[e].subkernel.copy(name=self.history[e]) + new_subkernel = rename_resolved_functions_in_a_single_kernel( + new_subkernel, renames) + new_callables[self.history[e]] = self.callables[e].copy( + subkernel=new_subkernel) + + for func_id in new_callable_ids-renamed_entrypoints: + in_knl_callable = self.callables[func_id] + if isinstance(in_knl_callable, CallableKernel): + # if callable kernel, perform renames inside its expressions.
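+                # (Illustrative: with renames == {"sin_0": "sin"}, every
+                # ResolvedFunction("sin_0") inside this callee's expressions
+                # becomes ResolvedFunction("sin").)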
+ old_subkernel = in_knl_callable.subkernel + new_subkernel = rename_resolved_functions_in_a_single_kernel( + old_subkernel, renames) + in_knl_callable = ( + in_knl_callable.copy(subkernel=new_subkernel)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." % + type(in_knl_callable).__name__) + + if func_id in renames: + new_func_id = renames[func_id] + if isinstance(in_knl_callable, CallableKernel): + in_knl_callable = (in_knl_callable.copy( + subkernel=in_knl_callable.subkernel.copy( + name=new_func_id))) + new_callables[new_func_id] = in_knl_callable + else: + if isinstance(in_knl_callable, CallableKernel): + in_knl_callable = in_knl_callable.copy( + subkernel=in_knl_callable.subkernel.copy( + name=func_id)) + new_callables[func_id] = in_knl_callable + + return program.copy(callables_table=new_callables) + + # }}} + + def __getitem__(self, name): + result = self.callables[name] + return result + + +# {{{ helper functions + +def make_program(kernel): + """ + Returns an instance of :class:`loopy.Program` with *kernel* as the only + callable kernel. + """ + + program = Program( + callables_table={ + kernel.name: CallableKernel(kernel)}, + target=kernel.target) + + return program + + +def iterate_over_kernels_if_given_program(transform_for_single_kernel): + """ + Function wrapper for transformations of the type ``transform(kernel: + LoopKernel, *args, **kwargs): LoopKernel``. Returns a function with the + ``transform`` being implemented on all of the callable kernels in a + :class:`loopy.Program`. + """ + def _collective_transform(*args, **kwargs): + if "program" in kwargs: + program_or_kernel = kwargs.pop("program") + elif "kernel" in kwargs: + program_or_kernel = kwargs.pop("kernel") + else: + program_or_kernel = args[0] + args = args[1:] + + if isinstance(program_or_kernel, Program): + program = program_or_kernel + new_callables = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = transform_for_single_kernel( + in_knl_callable.subkernel, *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_callables[func_id] = in_knl_callable + + return program.copy(callables_table=new_callables) + else: + assert isinstance(program_or_kernel, LoopKernel) + kernel = program_or_kernel + return transform_for_single_kernel(kernel, *args, **kwargs) + + return wraps(transform_for_single_kernel)(_collective_transform) + + +def update_table(callables_table, clbl_id, clbl): + from loopy.kernel.function_interface import InKernelCallable + assert isinstance(clbl, InKernelCallable) + + for i, c in callables_table.items(): + if c == clbl: + return i, callables_table + + while clbl_id in callables_table: + clbl_id = next_indexed_function_identifier(clbl_id) + + callables_table[clbl_id] = clbl + + return clbl_id, callables_table + +# }}} + + +def resolve_callables(program): + """ + Returns a :class:`Program` with known :class:`pymbolic.primitives.Call` + expression nodes converted to :class:`loopy.symbolic.ResolvedFunction`. 
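+
+    *Example (an illustrative sketch):* a call ``sin(x[i])`` in an entrypoint
+    is rewritten as ``ResolvedFunction('sin')(x[i])``, and the returned
+    program's ``callables_table`` gains an entry mapping ``'sin'`` to the
+    matching :class:`loopy.kernel.function_interface.ScalarCallable`.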
+ """ + from loopy.library.function import get_loopy_callables + from loopy.kernel import KernelState + + if program.state >= KernelState.CALLS_RESOLVED: + # program's callables have been resolved + return program + + # get registered callables + known_callables = dict(program.callables_table) + # get target specific callables + known_callables.update(program.target.get_device_ast_builder().known_callables) + # get loopy specific callables + known_callables.update(get_loopy_callables()) + + callables_table = {} + + # callables: name of the calls seen in the program + callables = set(program.entrypoints) + + while callables: + clbl_name = callables.pop() + clbl = known_callables[clbl_name] + + if isinstance(clbl, CallableKernel): + knl = clbl.subkernel + + rule_mapping_context = SubstitutionRuleMappingContext( + knl.substitutions, knl.get_var_name_generator()) + clbl_resolver = CallableResolver(rule_mapping_context, + frozenset(known_callables)) + knl = rule_mapping_context.finish_kernel(clbl_resolver.map_kernel(knl)) + knl = knl.copy(state=KernelState.CALLS_RESOLVED) + + # add the updated callable kernel to the table + callables_table[clbl_name] = clbl.copy(subkernel=knl) + + # note the resolved callable for traversal + callables.update(clbl_resolver.calls_resolved - set(callables_table)) + elif isinstance(clbl, ScalarCallable): + # nothing to resolve within a scalar callable + callables_table[clbl_name] = clbl + else: + raise NotImplementedError(f"{type(clbl)}") + + return program.copy(callables_table=callables_table) + + +# vim: foldmethod=marker diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index c6a9ec3ac1ed27048321deffabc746617e600dd8..91f7cf70f773234f069acba91be84a8745f50440 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1748,16 +1748,17 @@ def _insn_ids_reaching_end(schedule, kind, reverse): return insn_ids_alive_at_scope[-1] -def append_barrier_or_raise_error(schedule, dep, verify_only): +def append_barrier_or_raise_error(kernel_name, schedule, dep, verify_only): if verify_only: from loopy.diagnostic import MissingBarrierError raise MissingBarrierError( - "Dependency '%s' (for variable '%s') " + "%s: Dependency '%s' (for variable '%s') " "requires synchronization " "by a %s barrier (add a 'no_sync_with' " "instruction option to state that no " "synchronization is needed)" % ( + kernel_name, dep.dep_descr.format( tgt=dep.target.id, src=dep.source.id), dep.variable, @@ -1828,7 +1829,8 @@ def insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0 for dep in chain.from_iterable( dep_tracker.gen_dependencies_with_target_at(insn) for insn in loop_head): - append_barrier_or_raise_error(result, dep, verify_only) + append_barrier_or_raise_error( + kernel.name, result, dep, verify_only) # This barrier gets inserted outside the loop, hence it is # executed unconditionally and so kills all sources before # the loop. 
@@ -1860,7 +1862,8 @@ def insert_barriers(kernel, schedule, synchronization_kind, verify_only, level=0 elif isinstance(sched_item, RunInstruction): for dep in dep_tracker.gen_dependencies_with_target_at( sched_item.insn_id): - append_barrier_or_raise_error(result, dep, verify_only) + append_barrier_or_raise_error( + kernel.name, result, dep, verify_only) dep_tracker.discard_all_sources() break result.append(sched_item) @@ -1926,7 +1929,7 @@ class MinRecursionLimitForScheduling(MinRecursionLimit): # {{{ main scheduling entrypoint -def generate_loop_schedules(kernel, debug_args={}): +def generate_loop_schedules(kernel, callables_table, debug_args={}): """ .. warning:: @@ -1939,17 +1942,18 @@ def generate_loop_schedules(kernel, debug_args={}): """ with MinRecursionLimitForScheduling(kernel): - yield from generate_loop_schedules_inner(kernel, debug_args=debug_args) + yield from generate_loop_schedules_inner(kernel, + callables_table, debug_args=debug_args) -def generate_loop_schedules_inner(kernel, debug_args={}): +def generate_loop_schedules_inner(kernel, callables_table, debug_args={}): from loopy.kernel import KernelState if kernel.state not in (KernelState.PREPROCESSED, KernelState.LINEARIZED): raise LoopyError("cannot schedule a kernel that has not been " "preprocessed") from loopy.check import pre_schedule_checks - pre_schedule_checks(kernel) + pre_schedule_checks(kernel, callables_table) schedule_count = 0 @@ -2061,7 +2065,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}): gen_sched = convert_barrier_instructions_to_barriers( kernel, gen_sched) - gsize, lsize = kernel.get_grid_size_upper_bounds() + gsize, lsize = ( + kernel.get_grid_size_upper_bounds(callables_table)) if (gsize or lsize): if not kernel.options.disable_global_barriers: @@ -2118,7 +2123,7 @@ schedule_cache = WriteOncePersistentDict( key_builder=LoopyKeyBuilder()) -def _get_one_scheduled_kernel_inner(kernel): +def _get_one_scheduled_kernel_inner(kernel, callables_table): # This helper function exists to ensure that the generator chain is fully # out of scope after the function returns. This allows it to be # garbage-collected in the exit handler of the @@ -2128,19 +2133,19 @@ def _get_one_scheduled_kernel_inner(kernel): # # See https://gitlab.tiker.net/inducer/sumpy/issues/31 for context. - return next(iter(generate_loop_schedules(kernel))) + return next(iter(generate_loop_schedules(kernel, callables_table))) -def get_one_scheduled_kernel(kernel): +def get_one_scheduled_kernel(kernel, callables_table): warn_with_kernel( kernel, "get_one_scheduled_kernel_deprecated", "get_one_scheduled_kernel is deprecated. 
" "Use get_one_linearized_kernel instead.", DeprecationWarning) - return get_one_linearized_kernel(kernel) + return get_one_linearized_kernel(kernel, callables_table) -def get_one_linearized_kernel(kernel): +def get_one_linearized_kernel(kernel, callables_table): from loopy import CACHING_ENABLED sched_cache_key = kernel @@ -2158,7 +2163,8 @@ def get_one_linearized_kernel(kernel): if not from_cache: with ProcessLogger(logger, "%s: schedule" % kernel.name): with MinRecursionLimitForScheduling(kernel): - result = _get_one_scheduled_kernel_inner(kernel) + result = _get_one_scheduled_kernel_inner(kernel, + callables_table) if CACHING_ENABLED and not from_cache: schedule_cache.store_if_not_present(sched_cache_key, result) diff --git a/loopy/statistics.py b/loopy/statistics.py index 0192aa27dad1ac279c1e6b877fca3d2c1cae7e2d..f5ecf5b757ceeca222a7bd77cbefe9d236c20c60 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -1,4 +1,9 @@ -__copyright__ = "Copyright (C) 2015 James Stevens" +__copyright__ = """ +Copyright (C) 2015 James Stevens +Copyright (C) 2018 Kaushik Kulkarni +Copyright (C) 2019 Andreas Kloeckner +""" + __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -24,12 +29,14 @@ import loopy as lp from islpy import dim_type import islpy as isl from pymbolic.mapper import CombineMapper -from functools import reduce from loopy.kernel.data import ( MultiAssignmentBase, TemporaryVariable, AddressSpace) from loopy.diagnostic import warn_with_kernel, LoopyError from loopy.symbolic import CoefficientCollector -from pytools import Record, memoize_method +from pytools import ImmutableRecord, memoize_method +from loopy.kernel.function_interface import CallableKernel +from loopy.program import Program +from functools import partial __doc__ = """ @@ -37,6 +44,7 @@ __doc__ = """ .. currentmodule:: loopy .. autoclass:: ToCountMap +.. autoclass:: ToCountPolynomialMap .. autoclass:: CountGranularity .. autoclass:: Op .. autoclass:: MemAccess @@ -56,6 +64,19 @@ __doc__ = """ """ +# FIXME: +# - The SUBGROUP granularity is completely broken if the root kernel +# contains the grid and the operations get counted in the callee. +# To test, most of those are set to WORKITEM instead below (marked +# with FIXMEs). This leads to value mismatches and key errors in +# the tests. +# - Currently, nothing prevents summation across different +# granularities, which is guaranteed to yield bogus results. 
+# - AccessFootprintGatherer needs to be redone to match get_op_map and +# get_mem_access_map style +# - Tests for the subkernel functionality need to be written + + def get_kernel_parameter_space(kernel): return isl.Space.create_from_names(kernel.isl_context, set=[], params=sorted(list(kernel.outer_params()))).params() @@ -69,11 +90,25 @@ def get_kernel_zero_pwqpolynomial(kernel): # {{{ GuardedPwQPolynomial +def _get_param_tuple(obj): + return tuple( + obj.get_dim_name(dim_type.param, i) + for i in range(obj.dim(dim_type.param))) + + class GuardedPwQPolynomial: def __init__(self, pwqpolynomial, valid_domain): + assert isinstance(pwqpolynomial, isl.PwQPolynomial) self.pwqpolynomial = pwqpolynomial self.valid_domain = valid_domain + assert (_get_param_tuple(pwqpolynomial.space) + == _get_param_tuple(valid_domain.space)) + + @property + def space(self): + return self.valid_domain.space + def __add__(self, other): if isinstance(other, GuardedPwQPolynomial): return GuardedPwQPolynomial( @@ -122,7 +157,7 @@ class GuardedPwQPolynomial: return str(self.pwqpolynomial) def __repr__(self): - return repr(self.pwqpolynomial) + return "Guarded" + repr(self.pwqpolynomial) # }}} @@ -130,7 +165,20 @@ class GuardedPwQPolynomial: # {{{ ToCountMap class ToCountMap: - """Maps any type of key to an arithmetic type. + """A map from work descriptors like :class:`Op` and :class:`MemAccess` + to any arithmetic type. + + .. automethod:: __getitem__ + .. automethod:: __str__ + .. automethod:: __repr__ + .. automethod:: __len__ + .. automethod:: get + .. automethod:: items + .. automethod:: keys + .. automethod:: values + + .. automethod:: copy + .. automethod:: with_set_attributes .. automethod:: filter_by .. automethod:: filter_by_func @@ -141,17 +189,20 @@ class ToCountMap: """ - def __init__(self, init_dict=None, val_type=GuardedPwQPolynomial): - if init_dict is None: - init_dict = {} - self.count_map = init_dict - self.val_type = val_type + def __init__(self, count_map=None): + if count_map is None: + count_map = {} + + self.count_map = count_map + + def _zero(self): + return 0 def __add__(self, other): result = self.count_map.copy() for k, v in other.count_map.items(): result[k] = self.count_map.get(k, 0) + v - return ToCountMap(result, self.val_type) + return self.copy(count_map=result) def __radd__(self, other): if other != 0: @@ -159,13 +210,14 @@ class ToCountMap: "to {} {}. ToCountMap may only be added to " "0 and other ToCountMap objects." .format(type(other), other)) + return self def __mul__(self, other): if isinstance(other, GuardedPwQPolynomial): - return ToCountMap({ - index: self.count_map[index]*other - for index in self.keys()}) + return self.copy({ + index: other*value + for index, value in self.count_map.items()}) else: raise ValueError("ToCountMap: Attempted to multiply " "ToCountMap by {} {}." @@ -174,21 +226,17 @@ class ToCountMap: __rmul__ = __mul__ def __getitem__(self, index): - try: - return self.count_map[index] - except KeyError: - #TODO what is the best way to handle this?
- if self.val_type is GuardedPwQPolynomial: - return GuardedPwQPolynomial.zero() - else: - return 0 - - def __setitem__(self, index, value): - self.count_map[index] = value + return self.count_map[index] def __repr__(self): return repr(self.count_map) + def __str__(self): + return "\n".join( + f"{k}: {v}" + for k, v in sorted(self.count_map.items(), + key=lambda k: str(k))) + def __len__(self): return len(self.count_map) @@ -201,17 +249,19 @@ class ToCountMap: def keys(self): return self.count_map.keys() - def pop(self, item): - return self.count_map.pop(item) + def values(self): + return self.count_map.values() + + def copy(self, count_map=None): + if count_map is None: + count_map = self.count_map - def copy(self): - return ToCountMap(dict(self.count_map), self.val_type) + return type(self)(count_map=count_map) def with_set_attributes(self, **kwargs): - return ToCountMap({ + return self.copy(count_map={ key.copy(**kwargs): val - for key, val in self.count_map.items()}, - self.val_type) + for key, val in self.count_map.items()}) def filter_by(self, **kwargs): """Remove items without specified key fields. @@ -238,28 +288,25 @@ class ToCountMap: """ - result_map = ToCountMap(val_type=self.val_type) + new_count_map = {} - from loopy.types import to_loopy_type - if "dtype" in kwargs.keys(): - kwargs["dtype"] = [to_loopy_type(d) for d in kwargs["dtype"]] + class _Sentinel: + pass - # for each item in self.count_map - for self_key, self_val in self.items(): - try: - # check to see if key attribute values match all filters - for arg_field, allowable_vals in kwargs.items(): - attr_val = getattr(self_key, arg_field) - # see if the value is in the filter list - if attr_val not in allowable_vals: - break - else: # loop terminated without break or error - result_map[self_key] = self_val - except(AttributeError): - # the field passed is not a field of this key - continue + new_kwargs = {} + for arg_field, allowable_vals in kwargs.items(): + if arg_field == "dtype": + from loopy.types import to_loopy_type + allowable_vals = [to_loopy_type(dtype) for dtype in allowable_vals] + + new_kwargs[arg_field] = allowable_vals - return result_map + for key, val in self.count_map.items(): + if all(getattr(key, arg_field, _Sentinel) in allowable_vals + for arg_field, allowable_vals in new_kwargs.items()): + new_count_map[key] = val + + return self.copy(count_map=new_count_map) def filter_by_func(self, func): """Keep items that pass a test. 
@@ -286,14 +333,13 @@ class ToCountMap: """ - result_map = ToCountMap(val_type=self.val_type) + new_count_map = {} - # for each item in self.count_map, call func on the key - for self_key, self_val in self.items(): + for self_key, self_val in self.count_map.items(): if func(self_key): - result_map[self_key] = self_val + new_count_map[self_key] = self_val - return result_map + return self.copy(count_map=new_count_map) def group_by(self, *args): """Group map items together, distinguishing by only the key fields @@ -341,7 +387,7 @@ class ToCountMap: """ - result_map = ToCountMap(val_type=self.val_type) + new_count_map = {} # make sure all item keys have same type if self.count_map: @@ -350,22 +396,17 @@ class ToCountMap: raise ValueError("ToCountMap: group_by() function may only " "be used on ToCountMaps with uniform keys") else: - return result_map - - # for each item in self.count_map - for self_key, self_val in self.items(): - new_key = key_type() + return self - # set all specified fields - for field in args: - setattr(new_key, field, getattr(self_key, field)) + for self_key, self_val in self.count_map.items(): + new_key = key_type( + **{ + field: getattr(self_key, field) + for field in args}) - if new_key in result_map.keys(): - result_map[new_key] += self_val - else: - result_map[new_key] = self_val + new_count_map[new_key] = new_count_map.get(new_key, 0) + self_val - return result_map + return self.copy(count_map=new_count_map) def to_bytes(self): """Convert counts to bytes using data type in map key. @@ -398,48 +439,74 @@ class ToCountMap: """ - result = self.copy() - - for key, val in self.items(): - bytes_processed = int(key.dtype.itemsize) * val - result[key] = bytes_processed + new_count_map = {} - #TODO again, is this okay? - result.val_type = int + for key, val in self.count_map.items(): + new_count_map[key] = int(key.dtype.itemsize) * val - return result + return self.copy(new_count_map) def sum(self): - """Add all counts in ToCountMap. + """:return: A sum of the values of the dictionary.""" - :return: An :class:`islpy.PwQPolynomial` or :class:`int` containing the - sum of counts. + total = self._zero() - """ + for k, v in self.count_map.items(): + total = v + total - if self.val_type is GuardedPwQPolynomial: - total = GuardedPwQPolynomial.zero() - else: - total = 0 - - for k, v in self.items(): - total += v return total - #TODO test and document - def eval(self, params): - result = self.copy() - for key, val in self.items(): - result[key] = val.eval_with_dict(params) - result.val_type = int - return result +# }}} + + +# {{{ ToCountPolynomialMap + +class ToCountPolynomialMap(ToCountMap): + """Maps any type of key to a :class:`islpy.PwQPolynomial` or a + :class:`GuardedPwQPolynomial`. + """ + + def __init__(self, space, count_map=None): + if not isinstance(space, isl.Space): + raise TypeError( + "first argument to ToCountPolynomialMap must be " + "of type islpy.Space") - def eval_and_sum(self, params): - """Add all counts in :class:`ToCountMap` and evaluate with provided - parameter dict. + assert space.is_params() + self.space = space - :return: An :class:`int` containing the sum of all counts in the - :class:`ToCountMap` evaluated with the parameters provided. 
+ space_param_tuple = _get_param_tuple(space) + + for key, val in count_map.items(): + if isinstance(val, isl.PwQPolynomial): + assert val.dim(dim_type.out) == 1 + elif isinstance(val, GuardedPwQPolynomial): + assert val.pwqpolynomial.dim(dim_type.out) == 1 + else: + raise TypeError("unexpected value type") + + assert _get_param_tuple(val.space) == space_param_tuple + + super().__init__(count_map) + + def _zero(self): + space = self.space.insert_dims(dim_type.out, 0, 1) + return isl.PwQPolynomial.zero(space) + + def copy(self, count_map=None, space=None): + if count_map is None: + count_map = self.count_map + + if space is None: + space = self.space + + return type(self)(space, count_map) + + def eval_and_sum(self, params=None): + """Add all counts and evaluate with provided parameter dict *params* + + :return: An :class:`int` containing the sum of all counts + evaluated with the parameters provided. Example usage:: @@ -454,18 +521,69 @@ class ToCountMap: # (now use these counts to, e.g., predict performance) """ + if params is None: + params = {} + return self.sum().eval_with_dict(params) # }}} +# {{{ subst_into_to_count_map + +def subst_into_guarded_pwqpolynomial(new_space, guarded_poly, subst_dict): + from loopy.isl_helpers import subst_into_pwqpolynomial, get_param_subst_domain + + poly = subst_into_pwqpolynomial( + new_space, guarded_poly.pwqpolynomial, subst_dict) + + valid_domain = guarded_poly.valid_domain + i_begin_subst_space = valid_domain.dim(dim_type.param) + + valid_domain, subst_domain, _ = get_param_subst_domain( + new_space, guarded_poly.valid_domain, subst_dict) + + valid_domain = valid_domain & subst_domain + valid_domain = valid_domain.project_out(dim_type.param, 0, i_begin_subst_space) + return GuardedPwQPolynomial(poly, valid_domain) + + +def subst_into_to_count_map(space, tcm, subst_dict): + from loopy.isl_helpers import subst_into_pwqpolynomial + new_count_map = {} + for key, value in tcm.count_map.items(): + if isinstance(value, GuardedPwQPolynomial): + new_count_map[key] = subst_into_guarded_pwqpolynomial( + space, value, subst_dict) + + elif isinstance(value, isl.PwQPolynomial): + new_count_map[key] = subst_into_pwqpolynomial(space, value, subst_dict) + + elif isinstance(value, int): + new_count_map[key] = value + + else: + raise ValueError("unexpected value type") + + return tcm.copy(space=space, count_map=new_count_map) + +# }}} + + def stringify_stats_mapping(m): + + from warnings import warn + warn("stringify_stats_mapping is deprecated and will be removed in 2020." + " Use ToCountMap.__str__() instead.", DeprecationWarning, stacklevel=2) + result = "" for key in sorted(m.keys(), key=lambda k: str(k)): result += ("{} : {}\n".format(key, m[key])) return result +# {{{ CountGranularity + class CountGranularity: """Strings specifying whether an operation should be counted once per *work-item*, *sub-group*, or *work-group*. @@ -492,10 +610,12 @@ class CountGranularity: WORKGROUP = "workgroup" ALL = [WORKITEM, SUBGROUP, WORKGROUP] +# }}} + # {{{ Op descriptor -class Op(Record): +class Op(ImmutableRecord): """A descriptor for a type of arithmetic operation. .. attribute:: dtype @@ -521,34 +641,41 @@ class Op(Record): implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. + .. attribute:: kernel_name + + A :class:`str` representing the kernel name where the operation occurred. 
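+
+    *Example (an illustrative sketch; the kernel name is hypothetical):* ::
+
+        Op(dtype=np.float32, name="add",
+           count_granularity=CountGranularity.SUBGROUP,
+           kernel_name="loopy_kernel")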
+ """ - def __init__(self, dtype=None, name=None, count_granularity=None): + def __init__(self, dtype=None, name=None, count_granularity=None, + kernel_name=None): if count_granularity not in CountGranularity.ALL+[None]: raise ValueError("Op.__init__: count_granularity '%s' is " "not allowed. count_granularity options: %s" % (count_granularity, CountGranularity.ALL+[None])) - if dtype is None: - Record.__init__(self, dtype=dtype, name=name, - count_granularity=count_granularity) - else: + + if dtype is not None: from loopy.types import to_loopy_type - Record.__init__(self, dtype=to_loopy_type(dtype), name=name, - count_granularity=count_granularity) + dtype = to_loopy_type(dtype) - def __hash__(self): - return hash(repr(self)) + super().__init__(dtype=dtype, name=name, + count_granularity=count_granularity, + kernel_name=kernel_name) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness - return f"Op({self.dtype}, {self.name}, {self.count_granularity})" + if self.kernel_name is not None: + return (f'Op("{self.dtype}", "{self.name}", "{self.count_granularity}",' + f' "{self.kernel_name}")') + else: + return f'Op("{self.dtype}", "{self.name}", "{self.count_granularity}")' # }}} # {{{ MemAccess descriptor -class MemAccess(Record): +class MemAccess(ImmutableRecord): """A descriptor for a type of memory access. .. attribute:: mtype @@ -608,12 +735,15 @@ class MemAccess(Record): implementation-dependent grouping of work-items within a work-group, analagous to an NVIDIA CUDA warp. + .. attribute:: kernel_name + + A :class:`str` representing the kernel name where the operation occurred. """ def __init__(self, mtype=None, dtype=None, lid_strides=None, gid_strides=None, direction=None, variable=None, *, variable_tags=None, variable_tag=None, - count_granularity=None): + count_granularity=None, kernel_name=None): if count_granularity not in CountGranularity.ALL+[None]: raise ValueError("Op.__init__: count_granularity '%s' is " @@ -638,18 +768,16 @@ class MemAccess(Record): # }}} - if dtype is None: - Record.__init__(self, mtype=mtype, dtype=dtype, lid_strides=lid_strides, + if dtype is not None: + from loopy.types import to_loopy_type + dtype = to_loopy_type(dtype) + + ImmutableRecord.__init__(self, mtype=mtype, dtype=dtype, + lid_strides=lid_strides, gid_strides=gid_strides, direction=direction, variable=variable, variable_tags=variable_tags, - count_granularity=count_granularity) - else: - from loopy.types import to_loopy_type - Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype), - lid_strides=lid_strides, gid_strides=gid_strides, - direction=direction, variable=variable, - variable_tags=variable_tags, - count_granularity=count_granularity) + count_granularity=count_granularity, + kernel_name=kernel_name) @property def variable_tag(self): @@ -666,13 +794,12 @@ class MemAccess(Record): return tag def __hash__(self): - # Note that this means lid_strides and gid_strides must be sorted - # in self.__repr__() + # dicts in gid_strides and lid_strides aren't natively hashable return hash(repr(self)) def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness - return "MemAccess({}, {}, {}, {}, {}, {}, {}, {})".format( + return "MemAccess({}, {}, {}, {}, {}, {}, {}, {}, {})".format( self.mtype, self.dtype, None if self.lid_strides is None else dict( @@ -682,33 +809,101 @@ class MemAccess(Record): self.direction, self.variable, self.variable_tags, - self.count_granularity) + self.count_granularity, + self.kernel_name) +# }}} + + +# 
{{{ Sync descriptor + +class Sync(ImmutableRecord): + """A descriptor for a type of synchronization. + + .. attribute:: kind + + A string describing the synchronization kind, e.g. ``"barrier_global"`` or + ``"barrier_local"`` or ``"kernel_launch"``. + + .. attribute:: kernel_name + + A :class:`str` representing the kernel name where the operation occurred. + """ + + def __init__(self, kind=None, kernel_name=None): + super().__init__(kind=kind, kernel_name=kernel_name) + + def __repr__(self): + # Record.__repr__ overridden for consistent ordering and conciseness + return f"Sync({self.kind}, {self.kernel_name})" # }}} -# {{{ counter base +# {{{ CounterBase class CounterBase(CombineMapper): - def __init__(self, knl): + def __init__(self, knl, callables_table, kernel_rec): self.knl = knl - from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) + self.callables_table = callables_table + self.kernel_rec = kernel_rec + + from loopy.type_inference import TypeReader + self.type_inf = TypeReader(knl, callables_table) + self.zero = get_kernel_zero_pwqpolynomial(self.knl) + self.one = self.zero + 1 + + @property + @memoize_method + def param_space(self): + return get_kernel_parameter_space(self.knl) + + def new_poly_map(self, count_map): + return ToCountPolynomialMap(self.param_space, count_map) + + def new_zero_poly_map(self): + return self.new_poly_map({}) def combine(self, values): return sum(values) def map_constant(self, expr): - return ToCountMap() + return self.new_zero_poly_map() def map_call(self, expr): - return self.rec(expr.parameters) + from loopy.symbolic import ResolvedFunction + assert isinstance(expr.function, ResolvedFunction) + clbl = self.callables_table[expr.function.name] + + from loopy.kernel.function_interface import (CallableKernel, + get_kw_pos_association) + from loopy.kernel.data import ValueArg + if isinstance(clbl, CallableKernel): + sub_result = self.kernel_rec(clbl.subkernel) + _, pos_to_kw = get_kw_pos_association(clbl.subkernel) + + subst_dict = { + pos_to_kw[i]: param + for i, param in enumerate(expr.parameters) + if isinstance(clbl.subkernel.arg_dict[pos_to_kw[i]], + ValueArg)} + + return subst_into_to_count_map( + self.param_space, + sub_result, subst_dict) \ + + self.rec(expr.parameters) + + else: + raise NotImplementedError() + + def map_call_with_kwargs(self, expr): + # FIXME + raise NotImplementedError() def map_sum(self, expr): if expr.children: return sum(self.rec(child) for child in expr.children) else: - return ToCountMap() + return self.new_zero_poly_map() map_product = map_sum @@ -737,8 +932,8 @@ class CounterBase(CombineMapper): map_derivative = map_common_subexpression map_slice = map_common_subexpression - # preprocessing should have removed these def map_reduction(self, expr): + # preprocessing should have removed these raise RuntimeError("%s encountered %s--not supposed to happen" % (type(self).__name__, type(expr).__name__)) @@ -748,60 +943,81 @@ class CounterBase(CombineMapper): # {{{ ExpressionOpCounter class ExpressionOpCounter(CounterBase): - def __init__(self, knl, count_within_subscripts=True): - self.knl = knl + def __init__(self, knl, callables_table, kernel_rec, + count_within_subscripts=True): + super().__init__( + knl, callables_table, kernel_rec) self.count_within_subscripts = count_within_subscripts - from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) + + arithmetic_count_granularity = CountGranularity.SUBGROUP def combine(self, values): return 
sum(values) def map_constant(self, expr): - return ToCountMap() + return self.new_zero_poly_map() map_tagged_variable = map_constant map_variable = map_constant def map_call(self, expr): - return ToCountMap( - {Op(dtype=self.type_inf(expr), - name="func:"+str(expr.function), - count_granularity=CountGranularity.SUBGROUP): 1} - ) + self.rec(expr.parameters) + from loopy.symbolic import ResolvedFunction + assert isinstance(expr.function, ResolvedFunction) + clbl = self.callables_table[expr.function.name] + + from loopy.kernel.function_interface import CallableKernel + if not isinstance(clbl, CallableKernel): + return self.new_poly_map( + {Op(dtype=self.type_inf(expr), + name="func:"+clbl.name, + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one} + ) + self.rec(expr.parameters) + else: + return super().map_call(expr) def map_subscript(self, expr): if self.count_within_subscripts: return self.rec(expr.index) else: - return ToCountMap() + return self.new_zero_poly_map() + + def map_sub_array_ref(self, expr): + # generates an array view, considered free + return self.new_zero_poly_map() def map_sum(self, expr): assert expr.children - return ToCountMap( + return self.new_poly_map( {Op(dtype=self.type_inf(expr), name="add", - count_granularity=CountGranularity.SUBGROUP): - len(expr.children)-1} + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): + self.zero + (len(expr.children)-1)} ) + sum(self.rec(child) for child in expr.children) def map_product(self, expr): from pymbolic.primitives import is_zero assert expr.children - return sum(ToCountMap({Op(dtype=self.type_inf(expr), + return sum(self.new_poly_map({Op(dtype=self.type_inf(expr), name="mul", - count_granularity=CountGranularity.SUBGROUP): 1}) + count_granularity=( + self.arithmetic_count_granularity), + kernel_name=self.knl.name): self.one}) + self.rec(child) for child in expr.children if not is_zero(child + 1)) + \ - ToCountMap({Op(dtype=self.type_inf(expr), + self.new_poly_map({Op(dtype=self.type_inf(expr), name="mul", - count_granularity=CountGranularity.SUBGROUP): -1}) + count_granularity=( + self.arithmetic_count_granularity), + kernel_name=self.knl.name): -self.one}) def map_quotient(self, expr, *args): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name="div", - count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.numerator) \ + self.rec(expr.denominator) @@ -809,32 +1025,36 @@ class ExpressionOpCounter(CounterBase): map_remainder = map_quotient def map_power(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name="pow", - count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.base) \ + self.rec(expr.exponent) def map_left_shift(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name="shift", - count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.shiftee) \ + self.rec(expr.shift) map_right_shift = map_left_shift def map_bitwise_not(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return 
self.new_poly_map({Op(dtype=self.type_inf(expr), name="bw", - count_granularity=CountGranularity.SUBGROUP): 1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): self.one}) \ + self.rec(expr.child) def map_bitwise_or(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name="bw", - count_granularity=CountGranularity.SUBGROUP): - len(expr.children)-1}) \ + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): + self.zero + (len(expr.children)-1)}) \ + sum(self.rec(child) for child in expr.children) map_bitwise_xor = map_bitwise_or @@ -855,9 +1075,10 @@ class ExpressionOpCounter(CounterBase): + self.rec(expr.else_) def map_min(self, expr): - return ToCountMap({Op(dtype=self.type_inf(expr), + return self.new_poly_map({Op(dtype=self.type_inf(expr), name="maxmin", - count_granularity=CountGranularity.SUBGROUP): + count_granularity=self.arithmetic_count_granularity, + kernel_name=self.knl.name): len(expr.children)-1}) \ + sum(self.rec(child) for child in expr.children) @@ -898,6 +1119,8 @@ class _IndexStrideCoefficientCollector(CoefficientCollector): # }}} +# {{{ _get_lid_and_gid_strides + def _get_lid_and_gid_strides(knl, array, index): # find all local and global index tags and corresponding inames from loopy.symbolic import get_dependencies @@ -982,28 +1205,49 @@ def _get_lid_and_gid_strides(knl, array, index): return get_iname_strides(lid_to_iname), get_iname_strides(gid_to_iname) +# }}} -class MemAccessCounter(CounterBase): - pass + +# {{{ MemAccessCounterBase + +class MemAccessCounterBase(CounterBase): + def map_sub_array_ref(self, expr): + # generates an array view, considered free + return self.new_zero_poly_map() + + def map_call(self, expr): + from loopy.symbolic import ResolvedFunction + assert isinstance(expr.function, ResolvedFunction) + clbl = self.callables_table[expr.function.name] + + from loopy.kernel.function_interface import CallableKernel + if not isinstance(clbl, CallableKernel): + return self.rec(expr.parameters) + else: + return super().map_call(expr) + +# }}} # {{{ LocalMemAccessCounter -class LocalMemAccessCounter(MemAccessCounter): +class LocalMemAccessCounter(MemAccessCounterBase): + local_mem_count_granularity = CountGranularity.SUBGROUP + def count_var_access(self, dtype, name, index): - sub_map = ToCountMap() + count_map = {} if name in self.knl.temporary_variables: array = self.knl.temporary_variables[name] if isinstance(array, TemporaryVariable) and ( array.address_space == AddressSpace.LOCAL): if index is None: # no subscript - sub_map[MemAccess( + count_map[MemAccess( mtype="local", dtype=dtype, - count_granularity=CountGranularity.SUBGROUP) - ] = 1 - return sub_map + count_granularity=self.local_mem_count_granularity, + kernel_name=self.knl.name)] = self.one + return self.new_poly_map(count_map) array = self.knl.temporary_variables[name] @@ -1015,15 +1259,16 @@ class LocalMemAccessCounter(MemAccessCounter): lid_strides, gid_strides = _get_lid_and_gid_strides( self.knl, array, index_tuple) - sub_map[MemAccess( + count_map[MemAccess( mtype="local", dtype=dtype, lid_strides=dict(sorted(lid_strides.items())), gid_strides=dict(sorted(gid_strides.items())), variable=name, - count_granularity=CountGranularity.SUBGROUP)] = 1 + count_granularity=self.local_mem_count_granularity, + kernel_name=self.knl.name)] = self.one - return sub_map + return self.new_poly_map(count_map) def map_variable(self, expr): return self.count_var_access( @@ 
-1042,7 +1287,7 @@ class LocalMemAccessCounter(MemAccessCounter): # {{{ GlobalMemAccessCounter -class GlobalMemAccessCounter(MemAccessCounter): +class GlobalMemAccessCounter(MemAccessCounterBase): def map_variable(self, expr): name = expr.name @@ -1050,17 +1295,18 @@ class GlobalMemAccessCounter(MemAccessCounter): array = self.knl.arg_dict[name] else: # this is a temporary variable - return ToCountMap() + return self.new_zero_poly_map() if not isinstance(array, lp.ArrayArg): # this array is not in global memory - return ToCountMap() + return self.new_zero_poly_map() - return ToCountMap({MemAccess(mtype="global", - dtype=self.type_inf(expr), lid_strides={}, - gid_strides={}, variable=name, - count_granularity=CountGranularity.WORKITEM): 1} - ) + self.rec(expr.index) + return self.new_poly_map({MemAccess(mtype="global", + dtype=self.type_inf(expr), lid_strides={}, + gid_strides={}, variable=name, + count_granularity=CountGranularity.WORKITEM, + kernel_name=self.knl.name): self.one} + ) + self.rec(expr.index) def map_subscript(self, expr): name = expr.aggregate.name @@ -1086,19 +1332,27 @@ class GlobalMemAccessCounter(MemAccessCounter): lid_strides, gid_strides = _get_lid_and_gid_strides( self.knl, array, index_tuple) - count_granularity = CountGranularity.WORKITEM if ( - 0 in lid_strides and lid_strides[0] != 0 - ) else CountGranularity.SUBGROUP + global_access_count_granularity = CountGranularity.SUBGROUP - return ToCountMap({MemAccess( + # Account for broadcasts once per subgroup + count_granularity = CountGranularity.WORKITEM if ( + # if the stride in lid.0 is known + 0 in lid_strides + and + # it is nonzero + lid_strides[0] != 0 + ) else global_access_count_granularity + + return self.new_poly_map({MemAccess( mtype="global", dtype=self.type_inf(expr), lid_strides=dict(sorted(lid_strides.items())), gid_strides=dict(sorted(gid_strides.items())), variable=name, variable_tags=var_tags, - count_granularity=count_granularity - ): 1} + count_granularity=count_granularity, + kernel_name=self.knl.name, + ): self.one} ) + self.rec(expr.index_tuple) # }}} @@ -1174,10 +1428,19 @@ class AccessFootprintGatherer(CombineMapper): # {{{ count def add_assumptions_guard(kernel, pwqpolynomial): - return GuardedPwQPolynomial(pwqpolynomial, kernel.assumptions) + return GuardedPwQPolynomial( + pwqpolynomial, + kernel.assumptions.align_params(pwqpolynomial.space)) def count(kernel, set, space=None): + if isinstance(kernel, Program): + kernel_names = [i for i, clbl in kernel.callables_table.items() + if isinstance(clbl, CallableKernel)] + if len(kernel_names) > 1: + raise LoopyError() + return count(kernel[kernel_names[0]], set, space) + try: if space is not None: set = set.align_params(space) @@ -1186,7 +1449,7 @@ def count(kernel, set, space=None): except AttributeError: pass - count = isl.PwQPolynomial.zero( + total_count = isl.PwQPolynomial.zero( set.space .drop_dims(dim_type.set, 0, set.dim(dim_type.set)) .add_dims(dim_type.set, 1)) @@ -1248,7 +1511,7 @@ def count(kernel, set, space=None): # }}} if bset_count is not None: - count += bset_count + total_count += bset_count is_subset = bset <= bset_rebuilt is_superset = bset >= bset_rebuilt @@ -1273,12 +1536,12 @@ def count(kernel, set, space=None): "number of integer points in your loop " "domain.") - return add_assumptions_guard(kernel, count) + return add_assumptions_guard(kernel, total_count) -def get_unused_hw_axes_factor(knl, insn, disregard_local_axes): +def get_unused_hw_axes_factor(knl, callables_table, insn, disregard_local_axes): # FIXME: 
Multi-kernel support - gsize, lsize = knl.get_grid_size_upper_bounds() + gsize, lsize = knl.get_grid_size_upper_bounds(callables_table) g_used = set() l_used = set() @@ -1327,29 +1590,29 @@ def count_inames_domain(knl, inames): return count(knl, domain, space=space) -def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False): +def count_insn_runs(knl, callables_table, insn, count_redundant_work, + disregard_local_axes=False): insn_inames = insn.within_inames if disregard_local_axes: from loopy.kernel.data import LocalIndexTag - insn_inames = [iname - for iname in insn_inames - if not knl.iname_tags_of_type(iname, LocalIndexTag)] + insn_inames = frozenset( + [iname for iname in insn_inames + if not knl.iname_tags_of_type(iname, LocalIndexTag)]) c = count_inames_domain(knl, insn_inames) if count_redundant_work: - unused_fac = get_unused_hw_axes_factor(knl, insn, - disregard_local_axes=disregard_local_axes) + unused_fac = get_unused_hw_axes_factor(knl, callables_table, + insn, disregard_local_axes=disregard_local_axes) return c * unused_fac else: return c -@memoize_method -def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, - count_granularity=CountGranularity.WORKITEM): +def _get_insn_count(knl, callables_table, insn_id, subgroup_size, + count_redundant_work, count_granularity=CountGranularity.WORKITEM): insn = knl.id_to_insn[insn_id] if count_granularity is None: @@ -1361,19 +1624,21 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, if count_granularity == CountGranularity.WORKITEM: return count_insn_runs( - knl, insn, count_redundant_work=count_redundant_work, + knl, callables_table, insn, + count_redundant_work=count_redundant_work, disregard_local_axes=False) ct_disregard_local = count_insn_runs( - knl, insn, disregard_local_axes=True, + knl, callables_table, insn, disregard_local_axes=True, count_redundant_work=count_redundant_work) if count_granularity == CountGranularity.WORKGROUP: return ct_disregard_local elif count_granularity == CountGranularity.SUBGROUP: - # get the group size + # {{{ compute workgroup_size + from loopy.symbolic import aff_to_expr - _, local_size = knl.get_grid_size_upper_bounds() + _, local_size = knl.get_grid_size_upper_bounds(callables_table) workgroup_size = 1 if local_size: for size in local_size: @@ -1393,15 +1658,18 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, % (CountGranularity.SUBGROUP, local_size)) workgroup_size *= s + # }}} + warn_with_kernel(knl, "insn_count_subgroups_upper_bound", "get_insn_count: when counting instruction %s with " "count_granularity=%s, using upper bound for work-group size " "(%d work-items) to compute sub-groups per work-group. When " - "multiple device programs present, actual sub-group count may be" + "multiple device programs present, actual sub-group count may be " "lower." 
% (insn_id, CountGranularity.SUBGROUP, workgroup_size)) from pytools import div_ceil return ct_disregard_local*div_ceil(workgroup_size, subgroup_size) + else: # this should not happen since this is enforced in Op/MemAccess raise ValueError("get_insn_count: count_granularity '%s' is" @@ -1413,17 +1681,52 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, # {{{ get_op_map -def get_op_map(knl, numpy_types=True, count_redundant_work=False, - count_within_subscripts=True, subgroup_size=None): +def _get_op_map_for_single_kernel(knl, callables_table, + count_redundant_work, + count_within_subscripts, subgroup_size): + + subgroup_size = _process_subgroup_size(knl, subgroup_size) + + kernel_rec = partial(_get_op_map_for_single_kernel, + callables_table=callables_table, + count_redundant_work=count_redundant_work, + count_within_subscripts=count_within_subscripts, + subgroup_size=subgroup_size) + + op_counter = ExpressionOpCounter(knl, callables_table, kernel_rec, + count_within_subscripts) + op_map = op_counter.new_zero_poly_map() + + from loopy.kernel.instruction import ( + CallInstruction, CInstruction, Assignment, + NoOpInstruction, BarrierInstruction) + + for insn in knl.instructions: + if isinstance(insn, (CallInstruction, CInstruction, Assignment)): + ops = op_counter(insn.assignees) + op_counter(insn.expression) + for key, val in ops.count_map.items(): + count = _get_insn_count(knl, callables_table, insn.id, + subgroup_size, count_redundant_work, + key.count_granularity) + op_map = op_map + ToCountMap({key: val}) * count + + elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + pass + else: + raise NotImplementedError("unexpected instruction item type: '%s'" + % type(insn).__name__) + + return op_map + + +def get_op_map(program, numpy_types=True, count_redundant_work=False, + count_within_subscripts=True, subgroup_size=None, + entrypoint=None): """Count the number of operations in a loopy kernel. :arg knl: A :class:`loopy.LoopKernel` whose operations are to be counted. - :arg numpy_types: A :class:`bool` specifying whether the types in the - returned mapping should be numpy types instead of - :class:`loopy.types.LoopyType`. - :arg count_redundant_work: Based on usage of hardware axes or other specifics, a kernel may perform work redundantly. This :class:`bool` flag indicates whether this work should be included in the count. 
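As a usage sketch of the interface above (the kernel, the ``subgroup_size``
value and the parameter value for ``n`` are illustrative assumptions, not part
of this patch)::

    import numpy as np
    import loopy as lp

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "out[i] = 2*a[i]")
    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))

    # count all operations, then restrict to f32 multiplies
    op_map = lp.get_op_map(knl, subgroup_size=32, count_redundant_work=True)
    f32_mul = op_map.filter_by(name=["mul"], dtype=[np.float32])
    print(f32_mul.eval_and_sum({"n": 512}))

With a single entrypoint, *entrypoint* may be omitted, as above.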
@@ -1474,53 +1777,37 @@
 
     """
 
-    subgroup_size = _process_subgroup_size(knl, subgroup_size)
+    if entrypoint is None:
+        if len(program.entrypoints) > 1:
+            raise LoopyError("Must provide entrypoint")
 
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    knl = infer_unknown_types(knl, expect_completion=True)
-    knl = preprocess_kernel(knl)
+        entrypoint = list(program.entrypoints)[0]
 
-    op_map = ToCountMap()
-    op_counter = ExpressionOpCounter(knl, count_within_subscripts)
+    assert entrypoint in program.entrypoints
 
-    from loopy.kernel.instruction import (
-            CallInstruction, CInstruction, Assignment,
-            NoOpInstruction, BarrierInstruction)
+    from loopy.preprocess import preprocess_program, infer_unknown_types
+    program = preprocess_program(program)
 
-    for insn in knl.instructions:
-        if isinstance(insn, (CallInstruction, CInstruction, Assignment)):
-            ops = op_counter(insn.assignee) + op_counter(insn.expression)
-            for key, val in ops.count_map.items():
-                op_map = (
-                        op_map
-                        + ToCountMap({key: val})
-                        * _get_insn_count(knl, insn.id, subgroup_size,
-                            count_redundant_work,
-                            key.count_granularity))
+    # Ordering restriction: preprocess might insert arguments to
+    # make strides valid. Those also need to go through type inference.
+    program = infer_unknown_types(program, expect_completion=True)
 
-        elif isinstance(insn, (NoOpInstruction, BarrierInstruction)):
-            pass
-        else:
-            raise NotImplementedError("unexpected instruction item type: '%s'"
-                    % type(insn).__name__)
+    if numpy_types is not None:
+        from warnings import warn
+        warn("numpy_types is being ignored and will be removed in 2020.",
+                DeprecationWarning, stacklevel=2)
 
-    if numpy_types:
-        return ToCountMap(
-                init_dict={
-                    Op(
-                        dtype=op.dtype.numpy_dtype,
-                        name=op.name,
-                        count_granularity=op.count_granularity):
-                    ct
-                    for op, ct in op_map.count_map.items()},
-                val_type=op_map.val_type
-                )
-    else:
-        return op_map
+    return _get_op_map_for_single_kernel(
+            program[entrypoint], program.callables_table,
+            count_redundant_work=count_redundant_work,
+            count_within_subscripts=count_within_subscripts,
+            subgroup_size=subgroup_size)
 
 # }}}
 
 
+# {{{ subgroup size finding
+
 def _find_subgroup_size_for_knl(knl):
     from loopy.target.pyopencl import PyOpenCLTarget
     if isinstance(knl.target, PyOpenCLTarget) and knl.target.device is not None:
@@ -1572,20 +1859,66 @@ def _process_subgroup_size(knl, subgroup_size_requested):
                     "must be integer, 'guess', or, if you're feeling "
                    "lucky, None."
% (subgroup_size_requested)) +# }}} + # {{{ get_mem_access_map -def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, - subgroup_size=None): +def _get_mem_access_map_for_single_kernel(knl, callables_table, + count_redundant_work, subgroup_size): + + subgroup_size = _process_subgroup_size(knl, subgroup_size) + + kernel_rec = partial(_get_mem_access_map_for_single_kernel, + callables_table=callables_table, + count_redundant_work=count_redundant_work, + subgroup_size=subgroup_size) + + access_counter_g = GlobalMemAccessCounter( + knl, callables_table, kernel_rec) + access_counter_l = LocalMemAccessCounter( + knl, callables_table, kernel_rec) + access_map = access_counter_g.new_zero_poly_map() + + from loopy.kernel.instruction import ( + CallInstruction, CInstruction, Assignment, + NoOpInstruction, BarrierInstruction) + + for insn in knl.instructions: + if isinstance(insn, (CallInstruction, CInstruction, Assignment)): + insn_access_map = ( + access_counter_g(insn.expression) + + access_counter_l(insn.expression) + ).with_set_attributes(direction="load") + for assignee in insn.assignees: + insn_access_map = insn_access_map + ( + access_counter_g(assignee) + + access_counter_l(assignee) + ).with_set_attributes(direction="store") + + for key, val in insn_access_map.count_map.items(): + count = _get_insn_count(knl, callables_table, insn.id, + subgroup_size, count_redundant_work, + key.count_granularity) + access_map = access_map + ToCountMap({key: val}) * count + + elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): + pass + + else: + raise NotImplementedError("unexpected instruction item type: '%s'" + % type(insn).__name__) + + return access_map + + +def get_mem_access_map(program, numpy_types=None, count_redundant_work=False, + subgroup_size=None, entrypoint=None): """Count the number of memory accesses in a loopy kernel. :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be counted. - :arg numpy_types: A :class:`bool` specifying whether the types in the - returned mapping should be numpy types instead of - :class:`loopy.types.LoopyType`. - :arg count_redundant_work: Based on usage of hardware axes or other specifics, a kernel may perform work redundantly. This :class:`bool` flag indicates whether this work should be included in the count. @@ -1662,72 +1995,86 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, """ - subgroup_size = _process_subgroup_size(knl, subgroup_size) + if entrypoint is None: + if len(program.entrypoints) > 1: + raise LoopyError("Must provide entrypoint") - from loopy.preprocess import preprocess_kernel, infer_unknown_types - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) + entrypoint = list(program.entrypoints)[0] - access_map = ToCountMap() - access_counter_g = GlobalMemAccessCounter(knl) - access_counter_l = LocalMemAccessCounter(knl) + assert entrypoint in program.entrypoints - from loopy.kernel.instruction import ( - CallInstruction, CInstruction, Assignment, - NoOpInstruction, BarrierInstruction) + from loopy.preprocess import preprocess_program, infer_unknown_types - for insn in knl.instructions: - if isinstance(insn, (CallInstruction, CInstruction, Assignment)): - insn_access_map = ( - access_counter_g(insn.expression) - + access_counter_l(insn.expression) - ).with_set_attributes(direction="load") + program = preprocess_program(program) + # Ordering restriction: preprocess might insert arguments to + # make strides valid. 
Those also need to go through type inference. + program = infer_unknown_types(program, expect_completion=True) - for assignee in insn.assignees: - insn_access_map += ( - access_counter_g(assignee) - + access_counter_l(assignee) - ).with_set_attributes(direction="store") - - for key, val in insn_access_map.count_map.items(): - access_map = ( - access_map - + ToCountMap({key: val}) - * _get_insn_count(knl, insn.id, subgroup_size, - count_redundant_work, - key.count_granularity)) - elif isinstance(insn, (NoOpInstruction, BarrierInstruction)): - pass - else: - raise NotImplementedError("unexpected instruction item type: '%s'" - % type(insn).__name__) + if numpy_types is not None: + from warnings import warn + warn("numpy_types is being ignored and will be removed in 2020.", + DeprecationWarning, stacklevel=2) - if numpy_types: - return ToCountMap( - init_dict={ - MemAccess( - mtype=mem_access.mtype, - dtype=mem_access.dtype.numpy_dtype, - lid_strides=mem_access.lid_strides, - gid_strides=mem_access.gid_strides, - direction=mem_access.direction, - variable=mem_access.variable, - variable_tags=mem_access.variable_tags, - count_granularity=mem_access.count_granularity): - ct - for mem_access, ct in access_map.count_map.items()}, - val_type=access_map.val_type - ) - else: - return access_map + return _get_mem_access_map_for_single_kernel( + program[entrypoint], program.callables_table, + count_redundant_work=count_redundant_work, + subgroup_size=subgroup_size) # }}} # {{{ get_synchronization_map -def get_synchronization_map(knl, subgroup_size=None): +def _get_synchronization_map_for_single_kernel(knl, callables_table, + subgroup_size=None): + + knl = lp.get_one_scheduled_kernel(knl, callables_table) + + from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, + CallKernel, ReturnFromKernel, RunInstruction) + + kernel_rec = partial(_get_synchronization_map_for_single_kernel, + callables_table=callables_table, + subgroup_size=subgroup_size) + sync_counter = CounterBase(knl, callables_table, kernel_rec) + sync_map = sync_counter.new_zero_poly_map() + + iname_list = [] + + for sched_item in knl.schedule: + if isinstance(sched_item, EnterLoop): + if sched_item.iname: # (if not empty) + iname_list.append(sched_item.iname) + elif isinstance(sched_item, LeaveLoop): + if sched_item.iname: # (if not empty) + iname_list.pop() + + elif isinstance(sched_item, Barrier): + sync_map = sync_map + ToCountMap( + {Sync( + "barrier_%s" % sched_item.synchronization_kind, + knl.name): count_inames_domain(knl, frozenset(iname_list))}) + + elif isinstance(sched_item, RunInstruction): + pass + + elif isinstance(sched_item, CallKernel): + sync_map = sync_map + ToCountMap( + {Sync("kernel_launch", knl.name): + count_inames_domain(knl, frozenset(iname_list))}) + + elif isinstance(sched_item, ReturnFromKernel): + pass + + else: + raise LoopyError("unexpected schedule item: %s" + % type(sched_item).__name__) + + return sync_map + + +def get_synchronization_map(program, subgroup_size=None, entrypoint=None): """Count the number of synchronization events each work-item encounters in a loopy kernel. 
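A minimal sketch of how the resulting map might be consumed (the parameter
value is an illustrative assumption); note that after this change the keys
are ``Sync`` descriptors rather than plain strings::

    sync_map = lp.get_synchronization_map(knl)

    # keys are Sync descriptors; values count events per work-item
    for sync_event, count in sync_map.count_map.items():
        print(sync_event.kind, count.eval_with_dict({"n": 512}))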
@@ -1763,79 +2110,30 @@ def get_synchronization_map(knl, subgroup_size=None): # (now use this count to, e.g., predict performance) """ + if entrypoint is None: + if len(program.entrypoints) > 1: + raise LoopyError("Must provide entrypoint") - from loopy.preprocess import preprocess_kernel, infer_unknown_types - from loopy.schedule import (EnterLoop, LeaveLoop, Barrier, - CallKernel, ReturnFromKernel, RunInstruction) - from operator import mul - knl = infer_unknown_types(knl, expect_completion=True) - knl = preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) - iname_list = [] - - result = ToCountMap() - - one = isl.PwQPolynomial("{ 1 }") + entrypoint = list(program.entrypoints)[0] - def get_count_poly(iname_list): - if iname_list: # (if iname_list is not empty) - ct = (count(knl, ( - knl.get_inames_domain(iname_list). - project_out_except(iname_list, [dim_type.set]) - )), ) - return reduce(mul, ct) - else: - return one + assert entrypoint in program.entrypoints + from loopy.preprocess import preprocess_program, infer_unknown_types - for sched_item in knl.schedule: - if isinstance(sched_item, EnterLoop): - if sched_item.iname: # (if not empty) - iname_list.append(sched_item.iname) - elif isinstance(sched_item, LeaveLoop): - if sched_item.iname: # (if not empty) - iname_list.pop() - - elif isinstance(sched_item, Barrier): - result = result + ToCountMap({"barrier_%s" % - sched_item.synchronization_kind: - get_count_poly(iname_list)}) - - elif isinstance(sched_item, CallKernel): - result = result + ToCountMap( - {"kernel_launch": get_count_poly(iname_list)}) - - elif isinstance(sched_item, (ReturnFromKernel, RunInstruction)): - pass - - else: - raise LoopyError("unexpected schedule item: %s" - % type(sched_item).__name__) + program = preprocess_program(program) + # Ordering restriction: preprocess might insert arguments to + # make strides valid. Those also need to go through type inference. + program = infer_unknown_types(program, expect_completion=True) - return result + return _get_synchronization_map_for_single_kernel( + program[entrypoint], program.callables_table, + subgroup_size=subgroup_size) # }}} # {{{ gather_access_footprints -def gather_access_footprints(kernel, ignore_uncountable=False): - """Return a dictionary mapping ``(var_name, direction)`` to - :class:`islpy.Set` instances capturing which indices of each the array - *var_name* are read/written (where *direction* is either ``read`` or - ``write``. - - :arg ignore_uncountable: If *False*, an error will be raised for accesses - on which the footprint cannot be determined (e.g. data-dependent or - nonlinear indices) - """ - - from loopy.preprocess import preprocess_kernel, infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) - - from loopy.kernel import KernelState - if kernel.state < KernelState.PREPROCESSED: - kernel = preprocess_kernel(kernel) - +def _gather_access_footprints_for_single_kernel(kernel, ignore_uncountable): write_footprints = [] read_footprints = [] @@ -1858,6 +2156,48 @@ def gather_access_footprints(kernel, ignore_uncountable=False): write_footprints.append(afg(insn.assignees)) read_footprints.append(afg(insn.expression)) + return write_footprints, read_footprints + + +def gather_access_footprints(program, ignore_uncountable=False, entrypoint=None): + """Return a dictionary mapping ``(var_name, direction)`` to + :class:`islpy.Set` instances capturing which indices of each the array + *var_name* are read/written (where *direction* is either ``read`` or + ``write``. 
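+
+    For example, the key ``("a", "read")`` maps to the set of indices of
+    array ``a`` that the kernel reads.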
+
+    :arg ignore_uncountable: If *False*, an error will be raised for accesses
+        on which the footprint cannot be determined (e.g. data-dependent or
+        nonlinear indices)
+    """
+
+    if entrypoint is None:
+        if len(program.entrypoints) > 1:
+            raise LoopyError("Must provide entrypoint")
+
+        entrypoint = list(program.entrypoints)[0]
+
+    assert entrypoint in program.entrypoints
+
+    # FIXME: works only for one callable kernel till now.
+    if len([in_knl_callable for in_knl_callable in
+            program.callables_table.values() if isinstance(in_knl_callable,
+                CallableKernel)]) != 1:
+        raise NotImplementedError("Currently only supported for program with "
+            "only one CallableKernel.")
+
+    from loopy.preprocess import preprocess_program, infer_unknown_types
+
+    program = preprocess_program(program)
+    # Ordering restriction: preprocess might insert arguments to
+    # make strides valid. Those also need to go through type inference.
+    program = infer_unknown_types(program, expect_completion=True)
+
+    write_footprints, read_footprints = _gather_access_footprints_for_single_kernel(
+            program[entrypoint], ignore_uncountable)
+
     write_footprints = AccessFootprintGatherer.combine(write_footprints)
     read_footprints = AccessFootprintGatherer.combine(read_footprints)
 
@@ -1872,7 +2212,7 @@ def gather_access_footprints(kernel, ignore_uncountable=False):
 
     return result
 
 
-def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
+def gather_access_footprint_bytes(program, ignore_uncountable=False):
     """Return a dictionary mapping ``(var_name, direction)`` to
     :class:`islpy.PwQPolynomial` instances capturing the number of bytes  are
     read/written (where *direction* is either ``read`` or ``write`` on array
@@ -1883,12 +2223,12 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
         nonlinear indices)
     """
 
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    kernel = infer_unknown_types(kernel, expect_completion=True)
+    from loopy.preprocess import preprocess_program, infer_unknown_types
+    kernel = infer_unknown_types(program, expect_completion=True)
 
     from loopy.kernel import KernelState
     if kernel.state < KernelState.PREPROCESSED:
-        kernel = preprocess_kernel(kernel)
+        kernel = preprocess_program(program)
 
     result = {}
     fp = gather_access_footprints(kernel,
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index e083a87d814e5e935edee1be795eac9d8244f693..82f7525dcc43bdd089042a2f72f511d054ab0101 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -56,7 +56,7 @@ from pymbolic.mapper.constant_folder import \
         ConstantFoldingMapper as ConstantFoldingMapperBase
 
 from pymbolic.parser import Parser as ParserBase
-
+from loopy.diagnostic import LoopyError
 from loopy.diagnostic import ExpressionToAffineConversionError
 
 import islpy as isl
@@ -138,7 +138,14 @@ class IdentityMapperMixin:
         return expr
 
     def map_type_annotation(self, expr, *args, **kwargs):
-        return type(expr)(expr.type, self.rec(expr.child))
+        return type(expr)(expr.type, self.rec(expr.child, *args, **kwargs))
+
+    def map_sub_array_ref(self, expr, *args, **kwargs):
+        return SubArrayRef(self.rec(expr.swept_inames, *args, **kwargs),
+                self.rec(expr.subscript, *args, **kwargs))
+
+    def map_resolved_function(self, expr, *args, **kwargs):
+        return ResolvedFunction(expr.function)
 
     map_type_cast = map_type_annotation
 
@@ -197,15 +204,34 @@ class WalkMapper(WalkMapperBase):
 
     map_rule_argument = map_group_hw_index
 
+    def map_sub_array_ref(self, expr, *args):
+        if not self.visit(expr):
+            return
+
+        
self.rec(expr.swept_inames, *args) + self.rec(expr.subscript, *args) + + def map_resolved_function(self, expr, *args): + if not self.visit(expr): + return + + self.rec(expr.function, *args) + class CallbackMapper(CallbackMapperBase, IdentityMapper): map_reduction = CallbackMapperBase.map_constant + map_resolved_function = CallbackMapperBase.map_constant class CombineMapper(CombineMapperBase): def map_reduction(self, expr, *args, **kwargs): return self.rec(expr.expr, *args, **kwargs) + def map_sub_array_ref(self, expr): + return self.combine(( + self.rec(expr.subscript), + self.combine(tuple(self.rec(idx) for idx in expr.swept_inames)))) + map_linear_subscript = CombineMapperBase.map_subscript @@ -265,6 +291,16 @@ class StringifyMapper(StringifyMapperBase): return "cast({}, {})".format( repr(expr.type), self.rec(expr.child, PREC_NONE)) + def map_resolved_function(self, expr, prec): + # underlining a resolved call + return "\u0332".join(str(expr.function)) + + def map_sub_array_ref(self, expr, prec): + return "[{inames}]: {subscr}".format( + inames=",".join(self.rec(iname, prec) for iname in + expr.swept_inames), + subscr=self.rec(expr.subscript, prec)) + class EqualityPreservingStringifyMapper(StringifyMapperBase): """ @@ -304,7 +340,7 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase): if not isinstance(other, type(expr)): return self.treat_mismatch(expr, other, unis) if (expr.inames != other.inames - or type(expr.operation) != type(other.operation)): # noqa + or type(expr.function) != type(other.function)): # noqa return [] return self.rec(expr.expr, other.expr, unis) @@ -339,6 +375,13 @@ class DependencyMapper(DependencyMapperBase): return self.combine( self.rec(child, *args, **kwargs) for child in expr.parameters) + def map_call_with_kwargs(self, expr, *args): + # Loopy does not have first-class functions. Do not descend + # into 'function' attribute of Call. + return self.combine( + self.rec(child, *args) for child in expr.parameters+tuple( + expr.kw_parameters.values())) + def map_reduction(self, expr, *args, **kwargs): deps = self.rec(expr.expr, *args, **kwargs) return deps - {p.Variable(iname) for iname in expr.inames} @@ -349,11 +392,18 @@ class DependencyMapper(DependencyMapperBase): def map_loopy_function_identifier(self, expr, *args, **kwargs): return set() + def map_sub_array_ref(self, expr, *args, **kwargs): + deps = self.rec(expr.subscript, *args, **kwargs) + return deps - set(expr.swept_inames) + map_linear_subscript = DependencyMapperBase.map_subscript def map_type_cast(self, expr, *args, **kwargs): return self.rec(expr.child, *args, **kwargs) + def map_resolved_function(self, expr): + return self.rec(expr.function) + def map_literal(self, expr): return set() @@ -621,7 +671,6 @@ class Reduction(LoopyExpressionBase): Represents a reduction operation on :attr:`expr` across :attr:`inames`. .. attribute:: operation - an instance of :class:`loopy.library.reduction.ReductionOperation` .. attribute:: inames @@ -748,6 +797,170 @@ class RuleArgument(LoopyExpressionBase): mapper_method = intern("map_rule_argument") + +class ResolvedFunction(LoopyExpressionBase): + """ + A function invocation whose definition is known in a :mod:`loopy` kernel. + Each instance of :class:`loopy.symbolic.ResolvedFunction` in an expression + points to an instance of + :class:`loopy.kernel.function_interface.InKernelCallable` through the + mapping :attr:`loopy.kernel.LoopKernel.scoped_functions`. Refer + :ref:`ref_scoped_function` for a slightly detailed explanation on scoped + functions. + + .. 
attribute:: function
+
+        An instance of :class:`pymbolic.primitives.Variable`,
+        :class:`loopy.library.reduction.ArgExtOp` or
+        :class:`loopy.library.reduction.SegmentedOp`.
+    """
+    init_arg_names = ("function", )
+
+    def __init__(self, function):
+        if isinstance(function, str):
+            function = p.Variable(function)
+        from loopy.library.reduction import ReductionOpFunction
+        assert isinstance(function, (p.Variable, ReductionOpFunction))
+        self.function = function
+
+    @property
+    def name(self):
+        from loopy.library.reduction import ReductionOpFunction
+        if isinstance(self.function, p.Variable):
+            return self.function.name
+        elif isinstance(self.function, ReductionOpFunction):
+            return self.function
+        else:
+            raise LoopyError("Unexpected function type %s in ResolvedFunction." %
+                    type(self.function))
+
+    def __getinitargs__(self):
+        return (self.function, )
+
+    def make_stringifier(self, originating_stringifier=None):
+        return StringifyMapper()
+
+    mapper_method = intern("map_resolved_function")
+
+
+class EvaluatorWithDeficientContext(PartialEvaluationMapper):
+    """Evaluation Mapper that does not need values of all the variables
+    involved in the expression.
+
+    Returns the expression with the values mapped from :attr:`context`.
+    """
+    def map_variable(self, expr):
+        if expr.name in self.context:
+            return self.context[expr.name]
+        else:
+            return expr
+
+
+class VariableInAnExpression(CombineMapper):
+    def __init__(self, variables_to_search):
+        assert(all(isinstance(variable, p.Variable) for variable in
+            variables_to_search))
+        self.variables_to_search = variables_to_search
+
+    def combine(self, values):
+        return any(values)
+
+    def map_variable(self, expr):
+        return expr in self.variables_to_search
+
+    def map_constant(self, expr):
+        return False
+
+
+class SweptInameStrideCollector(CoefficientCollectorBase):
+    """
+    Mapper to compute the coefficients of the swept inames for
+    :class:`SubArrayRef`.
+    """
+    def map_algebraic_leaf(self, expr):
+        # subscripts that are not involved in :attr:`target_names` are treated
+        # as constants.
+        if isinstance(expr, p.Subscript) and (self.target_names is None
+                or expr.aggregate.name not in self.target_names):
+            return {1: expr}
+
+        return super().map_algebraic_leaf(expr)
+
+
+def get_start_subscript_from_sar(sar, kernel):
+    """
+    Returns an instance of :class:`pymbolic.primitives.Subscript`, the
+    beginning subscript of the array swept by the *SubArrayRef*.
+
+    **Example:** Consider ``[i, k]: a[i, j, k, l]``. The beginning
+    subscript would be ``a[0, j, 0, l]``.
+    """
+
+    def _get_lower_bound(iname):
+        pwaff = kernel.get_iname_bounds(iname).lower_bound_pw_aff
+        return int(pw_aff_to_expr(pwaff))
+
+    swept_inames_to_zeros = {
+            swept_iname.name: _get_lower_bound(swept_iname.name) for
+            swept_iname in sar.swept_inames}
+
+    return EvaluatorWithDeficientContext(swept_inames_to_zeros)(
+            sar.subscript)
+
+
+class SubArrayRef(LoopyExpressionBase):
+    """
+    An algebraic expression to map an affine memory layout pattern (known as
+    a sub-array) as consecutive elements of the sweeping axes which are
+    defined using :attr:`SubArrayRef.swept_inames`.
+
+    .. attribute:: swept_inames
+
+        An instance of :class:`tuple` denoting the axes to which the
+        sub-array is supposed to be mapped.
+
+    .. attribute:: subscript
+
+        An instance of :class:`pymbolic.primitives.Subscript` denoting the
+        array in the kernel.
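+
+    **Example:** in ``[i, k]: a[i, j, k, l]``, the swept inames are
+    ``(i, k)`` and the subscript is ``a[i, j, k, l]``; the sub-array is
+    swept along ``i`` and ``k`` while ``j`` and ``l`` remain fixed.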
+ """ + + init_arg_names = ("swept_inames", "subscript") + + def __init__(self, swept_inames, subscript): + + # {{{ sanity checks + + if not isinstance(swept_inames, tuple): + assert isinstance(swept_inames, p.Variable) + swept_inames = (swept_inames,) + + assert isinstance(swept_inames, tuple) + + for iname in swept_inames: + assert isinstance(iname, p.Variable) + assert isinstance(subscript, p.Subscript) + + # }}} + + self.swept_inames = swept_inames + self.subscript = subscript + + def __getinitargs__(self): + return (self.swept_inames, self.subscript) + + def get_hash(self): + return hash((self.__class__, self.swept_inames, self.subscript)) + + def is_equal(self, other): + return (other.__class__ == self.__class__ + and other.subscript == self.subscript + and other.swept_inames == self.swept_inames) + + def make_stringifier(self, originating_stringifier=None): + return StringifyMapper() + + mapper_method = intern("map_sub_array_ref") + # }}} @@ -780,9 +993,12 @@ def get_reduction_inames(expr): # {{{ rule-aware mappers def parse_tagged_name(expr): + from loopy.library.reduction import ArgExtOp, SegmentedOp if isinstance(expr, TaggedVariable): return expr.name, expr.tags - elif isinstance(expr, p.Variable): + elif isinstance(expr, ResolvedFunction): + return parse_tagged_name(expr.function) + elif isinstance(expr, (p.Variable, ArgExtOp, SegmentedOp)): return expr.name, None else: raise RuntimeError("subst rule name not understood: %s" % expr) @@ -981,12 +1197,14 @@ class RuleAwareIdentityMapper(IdentityMapper): def __init__(self, rule_mapping_context): self.rule_mapping_context = rule_mapping_context - def map_variable(self, expr, expn_state): + def map_variable(self, expr, expn_state, *args, **kwargs): name, tags = parse_tagged_name(expr) if name not in self.rule_mapping_context.old_subst_rules: - return IdentityMapper.map_variable(self, expr, expn_state) + return IdentityMapper.map_variable(self, expr, expn_state, *args, + **kwargs) else: - return self.map_substitution(name, tags, (), expn_state) + return self.map_substitution(name, tags, (), expn_state, *args, + **kwargs) def map_call(self, expr, expn_state): if not isinstance(expr.function, p.Variable): @@ -1038,6 +1256,10 @@ class RuleAwareIdentityMapper(IdentityMapper): return sym def __call__(self, expr, kernel, insn): + """ + :arg insn: A :class:`~loopy.kernel.InstructionBase` of which *expr* is + a part of, or *None* if *expr*'s source is not an instruction. 
+ """ from loopy.kernel.data import InstructionBase assert insn is None or isinstance(insn, InstructionBase) @@ -1288,6 +1510,14 @@ class FunctionToPrimitiveMapper(IdentityMapper): else: return IdentityMapper.map_call(self, expr) + def map_call_with_kwargs(self, expr): + for par in expr.kw_parameters.values(): + if not isinstance(par, SubArrayRef): + raise LoopyError("Keyword Arguments is only supported for" + " array arguments--use positional order to specify" + " the order of the arguments in the call.") + return IdentityMapper.map_call_with_kwargs(self, expr) + # {{{ customization to pymbolic parser @@ -1318,8 +1548,10 @@ class LoopyParser(ParserBase): return float(val) # generic float def parse_prefix(self, pstate): - from pymbolic.parser import _PREC_UNARY, _less, _greater, _identifier + from pymbolic.parser import (_PREC_UNARY, _less, _greater, _identifier, + _openbracket, _closebracket, _colon) import loopy as lp + if pstate.is_next(_less): pstate.advance() if pstate.is_next(_greater): @@ -1335,6 +1567,26 @@ class LoopyParser(ParserBase): return TypeAnnotation( typename, self.parse_expression(pstate, _PREC_UNARY)) + + elif pstate.is_next(_openbracket): + rollback_pstate = pstate.copy() + pstate.advance() + pstate.expect_not_end() + if pstate.is_next(_closebracket): + swept_inames = () + else: + swept_inames = self.parse_expression(pstate) + + pstate.expect(_closebracket) + pstate.advance() + if pstate.is_next(_colon): + # pstate.expect(_colon): + pstate.advance() + subscript = self.parse_expression(pstate, _PREC_UNARY) + return SubArrayRef(swept_inames, subscript) + else: + pstate = rollback_pstate + return super().parse_prefix(rollback_pstate) else: return super().parse_prefix(pstate) @@ -2032,7 +2284,7 @@ def get_access_map(domain, subscript, assumptions=None, shape=None, except ExpressionToAffineConversionError as err: shape_aff = None - if shape is not None: + if shape is not None and shape[idim] is not None: try: shape_aff = guarded_aff_from_expr(access_map.space, shape[idim]) except ExpressionToAffineConversionError: @@ -2166,6 +2418,10 @@ class BatchedAccessMapMapper(WalkMapper): def map_type_cast(self, expr, inames): return self.rec(expr.child, inames) + def map_sub_array_ref(self, expr, inames): + total_inames = inames | {iname.name for iname in expr.swept_inames} + return self.rec(expr.subscript, total_inames) + class AccessRangeMapper: """**IMPORTANT** diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 8af47c41222416fbd2dbe3dc5a88d4090a4a06f0..8706c4a37728973c572cd7acc679e23da9c13932 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -87,7 +87,7 @@ class TargetBase: def preprocess(self, kernel): return kernel - def pre_codegen_check(self, kernel): + def pre_codegen_check(self, kernel, callables_table): pass # }}} @@ -157,8 +157,15 @@ class ASTBuilderBase: # {{{ library - def function_manglers(self): - return [] + @property + def known_callables(self): + """ + Returns a mapping from function ids to corresponding + :class:`loopy.kernel.function_interface.InKernelCallable` for the + function ids known to *self.target*. + """ + # FIXME: @inducer: Do we need to move this to TargetBase? 
+ return {} def symbol_manglers(self): return [] @@ -170,6 +177,10 @@ class ASTBuilderBase: # {{{ code generation guts + @property + def ast_module(self): + raise NotImplementedError() + def get_function_definition(self, codegen_state, codegen_result, schedule_index, function_decl, function_body): raise NotImplementedError diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 92b94d658fd24f44ff4b8b0ba748f3cd5212617a..a45965c800316c0922b3186053fd3f61c96e5e63 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -24,7 +24,6 @@ THE SOFTWARE. """ import numpy as np # noqa -from loopy.kernel.data import CallMangleInfo from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError, LoopyTypeError from cgen import Pointer, NestedDeclarator, Block @@ -32,6 +31,7 @@ from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import IdentityMapper from loopy.types import NumpyType +from loopy.kernel.function_interface import ScalarCallable import pymbolic.primitives as p from loopy.tools import remove_common_indentation @@ -72,11 +72,13 @@ class DTypeRegistryWrapper: return self.wrapped_registry.get_or_register_dtype(names, dtype) def dtype_to_ctype(self, dtype): - from loopy.types import LoopyType, NumpyType + from loopy.types import LoopyType, NumpyType, OpaqueType assert isinstance(dtype, LoopyType) if isinstance(dtype, NumpyType): return self.wrapped_registry.dtype_to_ctype(dtype) + elif isinstance(dtype, OpaqueType): + return dtype.name else: raise LoopyError( "unable to convert type '%s' to C" @@ -447,42 +449,60 @@ def c_symbol_mangler(kernel, name): # float NAN as defined in C99 standard if name == "NAN": return NumpyType(np.dtype(np.float32)), name + + if name in ["INT_MAX", "INT_MIN"]: + return NumpyType(np.dtype(np.int32)), name + return None # }}} -# {{{ function mangler +# {{{ function scoping -def c_math_mangler(target, name, arg_dtypes, modify_name=True): - # Function mangler for math functions defined in C standard - # Convert abs, min, max to fabs, fmin, fmax. - # If modify_name is set to True, function names are modified according to - # floating point types of the arguments (e.g. cos(double), cosf(float)) - # This should be set to True for C and Cuda, False for OpenCL - if not isinstance(name, str): - return None +class CMathCallable(ScalarCallable): + """ + An umbrella callable for all the math functions which can be seen in a + C-Target. 
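+
+    For example, with a ``double`` argument ``sin`` keeps its name, while a
+    ``float`` argument specializes it to ``sinf`` (see ``with_types`` below).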
+ """ - # {{{ (abs|max|min) -> (fabs|fmax|fmin) + def with_types(self, arg_id_to_dtype, callables_table): + name = self.name - if name in ["abs", "min", "max"]: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - if dtype.kind == "f": - name = "f" + name + # {{{ (abs|max|min) -> (fabs|fmax|fmin) - # }}} + if name in ["abs", "min", "max"]: + dtype = np.find_common_type( + [], [dtype.numpy_dtype for dtype in arg_id_to_dtype.values()]) + if dtype.kind == "f": + name = "f" + name + + # }}} - # unitary functions - if (name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", - "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"] - and len(arg_dtypes) == 1 - and arg_dtypes[0].numpy_dtype.kind in "fc"): + # unary functions + if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", + "erf", "erfc", "abs", "real", "imag"]: - dtype = arg_dtypes[0].numpy_dtype - real_dtype = np.empty(0, dtype=dtype).real.dtype + for id in arg_id_to_dtype: + if not -1 <= id <= 0: + raise LoopyError(f"'{name}' can take only one argument.") - if modify_name: + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0].numpy_dtype + real_dtype = np.empty(0, dtype=dtype).real.dtype + + if dtype.kind in ("u", "i"): + # ints and unsigned casted to float32 + dtype = np.float32 + + # for CUDA, C Targets the name must be modified if real_dtype == np.float64: pass # fabs elif real_dtype == np.float32: @@ -491,29 +511,45 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): and real_dtype == np.float128): # pylint:disable=no-member name = name + "l" # fabsl else: - raise LoopyTypeError(f"{name} does not support type {real_dtype}") + raise LoopyTypeError("{} does not support type {}".format(name, + dtype)) if dtype.kind == "c": name = "c" + name - return CallMangleInfo( - target_name=name, - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) + if name in ["abs", "real", "imag"]: + dtype = real_dtype + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: NumpyType(dtype), -1: + NumpyType(dtype)}), + callables_table) - # binary functions - if (name in ["fmax", "fmin", "copysign", "pow"] - and len(arg_dtypes) == 2): + # binary functions + elif name in ["fmax", "fmin", "pow", "atan2", "copysign"]: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - real_dtype = np.empty(0, dtype=dtype).real.dtype + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only two arguments." 
% name)
 
-    if name in ["fmax", "fmin", "copysign"] and dtype.kind == "c":
-        raise LoopyTypeError(f"{name} does not support complex numbers")
+            if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
+                    arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
+                # the types provided aren't mature enough to specialize the
+                # callable
+                return (
+                        self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                        callables_table)
 
-    elif real_dtype.kind in "fc":
-        if modify_name:
+            dtype = np.find_common_type(
+                [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items()
+                     if id >= 0])
+            real_dtype = np.empty(0, dtype=dtype).real.dtype
+
+            if name in ["fmax", "fmin", "copysign"] and dtype.kind == "c":
+                raise LoopyTypeError(f"{name} does not support complex numbers")
+
+            elif real_dtype.kind in "fc":
                if real_dtype == np.float64:
                    pass  # fmin
                elif real_dtype == np.float32:
@@ -523,50 +559,48 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True):
                    name = name + "l"  # fminl
                else:
                    raise LoopyTypeError("%s does not support type %s"
-                        % (name, real_dtype))
-
-        if dtype.kind == "c":
-            name = "c" + name  # cpow
-
-        result_dtype = NumpyType(dtype)
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=(result_dtype,),
-                arg_dtypes=2*(result_dtype,))
-
-    # complex functions
-    if (name in ["abs", "real", "imag"]
-            and len(arg_dtypes) == 1
-            and arg_dtypes[0].numpy_dtype.kind == "c"):
-        dtype = arg_dtypes[0].numpy_dtype
-        real_dtype = np.empty(0, dtype=dtype).real.dtype
-
-        if modify_name:
-            if real_dtype == np.float64:
-                pass  # fabs
-            elif real_dtype == np.float32:
-                name = name + "f"  # fabsf
-            elif (hasattr(np, "float128")
-                    and real_dtype == np.float128):  # pylint:disable=no-member
-                name = name + "l"  # fabsl
-            else:
-                raise LoopyTypeError(f"{name} does not support type {real_dtype}")
-
-        name = "c" + name
-
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=(NumpyType(real_dtype),),
-                arg_dtypes=arg_dtypes)
-
-    if (name == "isnan" and len(arg_dtypes) == 1
-            and arg_dtypes[0].numpy_dtype.kind == "f"):
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=(NumpyType(np.int32),),
-                arg_dtypes=arg_dtypes)
+                            % (name, dtype))
+            if dtype.kind == "c":
+                name = "c" + name  # cpow
+            dtype = NumpyType(dtype)
+            return (
+                    self.copy(name_in_target=name,
+                        arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}),
+                    callables_table)
+
+        elif name == "isnan":
+            for id in arg_id_to_dtype:
+                if not -1 <= id <= 0:
+                    raise LoopyError(f"'{name}' can take only one argument.")
+
+            if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None:
+                # the types provided aren't mature enough to specialize the
+                # callable
+                return (
+                        self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                        callables_table)
+
+            dtype = arg_id_to_dtype[0].numpy_dtype
+            return (
+                    self.copy(
+                        name_in_target=name,
+                        arg_id_to_dtype={
+                            0: NumpyType(dtype),
+                            -1: NumpyType(np.int32)}),
+                    callables_table)
+
+
+def get_c_callables():
+    """
+    Returns a mapping from the ids of the math functions known in C to the
+    corresponding :class:`InKernelCallable` instances.
+ """ + cmath_ids = ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", + "sinh", "pow", "atan2", "tanh", "exp", "log", "log10", + "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin", + "fabs", "tan", "erf", "erfc", "isnan", "real", "imag"] - return None + return {id_: CMathCallable(id_) for id_ in cmath_ids} # }}} @@ -574,12 +608,6 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): class CFamilyASTBuilder(ASTBuilderBase): # {{{ library - def function_manglers(self): - return ( - super().function_manglers() + [ - c_math_mangler - ]) - def symbol_manglers(self): return ( super().symbol_manglers() + [ @@ -592,6 +620,12 @@ class CFamilyASTBuilder(ASTBuilderBase): _preamble_generator, ]) + @property + def known_callables(self): + callables = super().known_callables + callables.update(get_c_callables()) + return callables + # }}} # {{{ code generation @@ -678,9 +712,13 @@ class CFamilyASTBuilder(ASTBuilderBase): if self.target.fortran_abi: name += "_" + if codegen_state.is_entrypoint: + name = Value("void", name) + else: + name = Value("static void", name) return FunctionDeclarationWrapper( FunctionDeclaration( - Value("void", name), + name, [self.idi_to_cgen_declarator(codegen_state.kernel, idi) for idi in codegen_state.implemented_data_info])) @@ -709,8 +747,8 @@ class CFamilyASTBuilder(ASTBuilderBase): temporaries_written_in_subkernel) subkernel = kernel.schedule[schedule_index].kernel_name sub_knl_temps = ( - temporaries_read_in_subkernel(kernel, subkernel) | - temporaries_written_in_subkernel(kernel, subkernel)) + temporaries_read_in_subkernel(kernel, subkernel) + | temporaries_written_in_subkernel(kernel, subkernel)) for tv in sorted( kernel.temporary_variables.values(), @@ -831,6 +869,11 @@ class CFamilyASTBuilder(ASTBuilderBase): # {{{ code generation guts + @property + def ast_module(self): + import cgen + return cgen + def get_expression_to_code_mapper(self, codegen_state): return self.get_expression_to_c_expression_mapper(codegen_state) @@ -993,83 +1036,33 @@ class CFamilyASTBuilder(ASTBuilderBase): return block_if_necessary(assignments) def emit_multiple_assignment(self, codegen_state, insn): - ecm = codegen_state.expression_to_code_mapper - - from pymbolic.primitives import Variable - from pymbolic.mapper.stringifier import PREC_NONE - - func_id = insn.expression.function - parameters = insn.expression.parameters - - if isinstance(func_id, Variable): - func_id = func_id.name - assignee_var_descriptors = [ - codegen_state.kernel.get_var_descriptor(a) - for a in insn.assignee_var_names()] - - par_dtypes = tuple(ecm.infer_type(par) for par in parameters) - - mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes) - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" - % func_id) - - assert mangle_result.arg_dtypes is not None + ecm = codegen_state.expression_to_code_mapper + func_id = insn.expression.function.name + in_knl_callable = codegen_state.callables_table[func_id] - if mangle_result.target_name == "loopy_make_tuple": - # This shorcut avoids actually having to emit a 'make_tuple' function. 
+ if isinstance(in_knl_callable, ScalarCallable) and ( + in_knl_callable.name_in_target == "loopy_make_tuple"): return self.emit_tuple_assignment(codegen_state, insn) - from loopy.expression import dtype_to_type_context - c_parameters = [ - ecm(par, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr - for par, par_dtype, tgt_dtype in zip( - parameters, par_dtypes, mangle_result.arg_dtypes)] - - from loopy.codegen import SeenFunction - codegen_state.seen_functions.add( - SeenFunction(func_id, - mangle_result.target_name, - mangle_result.arg_dtypes, - mangle_result.result_dtypes)) - - from pymbolic import var - for i, (a, tgt_dtype) in enumerate( - zip(insn.assignees[1:], mangle_result.result_dtypes[1:])): - if tgt_dtype != ecm.infer_type(a): - raise LoopyError("type mismatch in %d'th (1-based) left-hand " - "side of instruction '%s'" % (i+1, insn.id)) - c_parameters.append( - # TODO Yuck: The "where-at function": &(...) - var("&")( - ecm(a, PREC_NONE, - dtype_to_type_context(self.target, tgt_dtype), - tgt_dtype).expr)) + # takes "is_returned" to infer whether insn.assignees[0] is a part of + # LHS. + in_knl_callable_as_call, is_returned = in_knl_callable.emit_call_insn( + insn=insn, + target=self.target, + expression_to_code_mapper=ecm) - from pymbolic import var - result = var(mangle_result.target_name)(*c_parameters) - - # In case of no assignees, we are done - if len(mangle_result.result_dtypes) == 0: + if is_returned: + from cgen import Assign + lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) + return Assign(lhs_code, + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) + else: from cgen import ExpressionStatement return ExpressionStatement( - CExpression(self.get_c_expression_to_code_mapper(), result)) - - result = ecm.wrap_in_typecast_lazy( - lambda: mangle_result.result_dtypes[0], - assignee_var_descriptors[0].dtype, - result) - - lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None) - - from cgen import Assign - return Assign( - lhs_code, - CExpression(self.get_c_expression_to_code_mapper(), result)) + CExpression(self.get_c_expression_to_code_mapper(), + in_knl_callable_as_call)) def emit_sequential_loop(self, codegen_state, iname, iname_dtype, lbound, ubound, inner): @@ -1207,7 +1200,6 @@ class ExecutableCTarget(CTarget): """ An executable CFamilyTarget that uses (by default) JIT compilation of C-code """ - def __init__(self, compiler=None, fortran_abi=False): super().__init__(fortran_abi=fortran_abi) from loopy.target.c.c_execution import CCompiler @@ -1215,7 +1207,8 @@ class ExecutableCTarget(CTarget): def get_kernel_executor(self, knl, *args, **kwargs): from loopy.target.c.c_execution import CKernelExecutor - return CKernelExecutor(knl, compiler=self.compiler) + return CKernelExecutor(knl, entrypoint=kwargs.pop("entrypoint"), + compiler=self.compiler) def get_host_ast_builder(self): # enable host code generation diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index d73912460a2c99075c875375056be5922a98d692..1150a9f9b930148d28a7a5b658d3e2a17404b5b1 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -163,7 +163,8 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): % ", ".join(f'"{arg.name}": {arg.name}' for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) + if arg.base_name in + kernel.get_written_variables())) else: out_args = [arg for 
arg in implemented_data_info @@ -404,7 +405,7 @@ class CKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, kernel, compiler=None): + def __init__(self, program, entrypoint, compiler=None): """ :arg kernel: may be a loopy.LoopKernel, a generator returning kernels (a warning will be issued if more than one is returned). If the @@ -413,54 +414,57 @@ class CKernelExecutor(KernelExecutorBase): """ self.compiler = compiler if compiler else CCompiler() - super().__init__(kernel) + super().__init__(program, entrypoint) - def get_invoker_uncached(self, kernel, codegen_result): + def get_invoker_uncached(self, kernel, entrypoint, codegen_result): generator = CExecutionWrapperGenerator() - return generator(kernel, codegen_result) + return generator(kernel, entrypoint, codegen_result) def get_wrapper_generator(self): return CExecutionWrapperGenerator() @memoize_method - def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + def program_info(self, entrypoint, arg_to_dtype_set=frozenset(), + all_kwargs=None): + program = self.get_typed_and_scheduled_program(entrypoint, arg_to_dtype_set) from loopy.codegen import generate_code_v2 - codegen_result = generate_code_v2(kernel) + codegen_result = generate_code_v2(program) dev_code = codegen_result.device_code() host_code = codegen_result.host_code() all_code = "\n".join([dev_code, "", host_code]) - if self.kernel.options.write_cl: + if self.program[entrypoint].options.write_cl: output = all_code - if self.kernel.options.highlight_cl: + if self.program[entrypoint].options.highlight_cl: output = get_highlighted_code(output) - if self.kernel.options.write_cl is True: + if self.program[entrypoint].options.write_cl is True: print(output) else: - with open(self.kernel.options.write_cl, "w") as outf: + with open(self.program[entrypoint].options.write_cl, "w") as outf: outf.write(output) - if self.kernel.options.edit_cl: + if self.program[entrypoint].options.edit_cl: from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.c") # update code from editor all_code = "\n".join([dev_code, "", host_code]) c_kernels = [] + for dp in codegen_result.device_programs: c_kernels.append(CompiledCKernel(dp, - codegen_result.implemented_data_info, all_code, self.kernel.target, - self.compiler)) + codegen_result.implemented_data_infos[entrypoint], all_code, + self.program.target, self.compiler)) return _KernelInfo( - kernel=kernel, + program=program, c_kernels=c_kernels, - implemented_data_info=codegen_result.implemented_data_info, - invoker=self.get_invoker(kernel, codegen_result)) + implemented_data_info=codegen_result.implemented_data_infos[ + entrypoint], + invoker=self.get_invoker(program, entrypoint, codegen_result)) # }}} @@ -477,7 +481,9 @@ class CKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) + program_info = self.program_info(kwargs["entrypoint"], + self.arg_to_dtype_set(kwargs)) + kwargs.pop("entrypoint") - return kernel_info.invoker( - kernel_info.c_kernels, *args, **kwargs) + return program_info.invoker( + program_info.c_kernels, *args, **kwargs) diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 853722c3c4533865c7c313d09f16105b9c9c46cc..336028925f8aaf2eea7865d080ac88e1c398b033 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -35,9 +35,9 @@ from 
pymbolic import var from loopy.expression import dtype_to_type_context -from loopy.type_inference import TypeInferenceMapper +from loopy.type_inference import TypeReader -from loopy.diagnostic import LoopyError, LoopyWarning +from loopy.diagnostic import LoopyError from loopy.tools import is_integer from loopy.types import LoopyType from loopy.target.c import CExpression @@ -62,7 +62,8 @@ class ExpressionToCExpressionMapper(IdentityMapper): self.codegen_state = codegen_state if type_inf_mapper is None: - type_inf_mapper = TypeInferenceMapper(self.kernel) + type_inf_mapper = TypeReader(self.kernel, + self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper self.allow_complex = codegen_state.allow_complex @@ -176,6 +177,11 @@ class ExpressionToCExpressionMapper(IdentityMapper): def map_tagged_variable(self, expr, type_context): return var(expr.name) + def map_sub_array_ref(self, expr, type_context): + from loopy.symbolic import get_start_subscript_from_sar + return var("&")(self.rec(get_start_subscript_from_sar(expr, self.kernel), + type_context)) + def map_subscript(self, expr, type_context): def base_impl(expr, type_context): return self.rec(expr.aggregate, type_context)[self.rec(expr.index, "i")] @@ -439,104 +445,12 @@ class ExpressionToCExpressionMapper(IdentityMapper): "for constant '%s'" % expr) def map_call(self, expr, type_context): - from pymbolic.primitives import Variable, Subscript - - identifier = expr.function - - # {{{ implement indexof, indexof_vec - - if identifier.name in ["indexof", "indexof_vec"]: - if len(expr.parameters) != 1: - raise LoopyError("%s takes exactly one argument" % identifier.name) - arg, = expr.parameters - if not isinstance(arg, Subscript): - raise LoopyError( - "argument to %s must be a subscript" % identifier.name) - - ary = self.find_array(arg) - - from loopy.kernel.array import get_access_info - from pymbolic import evaluate - access_info = get_access_info(self.kernel.target, ary, arg.index, - lambda expr: evaluate(expr, self.codegen_state.var_subst_map), - self.codegen_state.vectorization_info) - - from loopy.kernel.data import ImageArg - if isinstance(ary, ImageArg): - raise LoopyError("%s does not support images" % identifier.name) - - if identifier.name == "indexof": - return access_info.subscripts[0] - elif identifier.name == "indexof_vec": - from loopy.kernel.array import VectorArrayDimTag - ivec = None - for iaxis, dim_tag in enumerate(ary.dim_tags): - if isinstance(dim_tag, VectorArrayDimTag): - ivec = iaxis - - if ivec is None: - return access_info.subscripts[0] - else: - return ( - access_info.subscripts[0]*ary.shape[ivec] - + access_info.vector_index) - - else: - raise RuntimeError("should not get here") - - # }}} - - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.infer_type(par) for par in expr.parameters) - - processed_parameters = None - - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" 
- % identifier) - - if len(mangle_result.result_dtypes) != 1: - raise LoopyError("functions with more or fewer than one return value " - "may not be used in an expression") - - if mangle_result.arg_dtypes is not None: - processed_parameters = tuple( - self.rec(par, - dtype_to_type_context(self.kernel.target, tgt_dtype), - tgt_dtype) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)) - - else: - # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to - # propagate the type context here. But for many others, it does - # not. Using the inferred type as a stopgap for now. - processed_parameters = tuple( - self.rec(par, - type_context=dtype_to_type_context( - self.kernel.target, par_dtype)) - for par, par_dtype in zip(expr.parameters, par_dtypes)) - - from warnings import warn - warn("Calling function '%s' with unknown C signature--" - "return CallMangleInfo.arg_dtypes" - % identifier, LoopyWarning) - - from loopy.codegen import SeenFunction - self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes, - mangle_result.result_dtypes)) - - return var(mangle_result.target_name)(*processed_parameters) + return ( + self.codegen_state.callables_table[ + expr.function.name].emit_call( + expression_to_code_mapper=self, + expression=expr, + target=self.kernel.target)) # {{{ deal with complex-valued variables @@ -563,6 +477,7 @@ class ExpressionToCExpressionMapper(IdentityMapper): def map_power(self, expr, type_context): tgt_dtype = self.infer_type(expr) + base_dtype = self.infer_type(expr.base) exponent_dtype = self.infer_type(expr.exponent) from pymbolic.primitives import is_constant, is_zero @@ -584,10 +499,21 @@ class ExpressionToCExpressionMapper(IdentityMapper): "int_pow", func_name, (tgt_dtype, exponent_dtype), (tgt_dtype, ))) + # FIXME: This need some more callables to be registered. 
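+            # A sketch of the lookup/specialize/emit pattern that the else
+            # branch below uses (interface names per this patch; the dtypes
+            # are illustrative):
+            #
+            #     clbl = self.codegen_state.ast_builder.known_callables["pow"]
+            #     clbl, _ = clbl.with_types({0: tgt_dtype, 1: exponent_dtype},
+            #             self.codegen_state.callables_table)
+            #     # e.g. clbl.name_in_target == "powf" for float32 operands
+            #     result = var(clbl.name_in_target)(base_code, exponent_code)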
return var(func_name)(self.rec(expr.base, type_context), self.rec(expr.exponent, type_context)) else: - return self.rec(var("pow")(expr.base, expr.exponent), type_context) + from loopy.codegen import SeenFunction + clbl = self.codegen_state.ast_builder.known_callables["pow"] + clbl = clbl.with_types({0: tgt_dtype, 1: exponent_dtype}, + self.codegen_state.callables_table)[0] + self.codegen_state.seen_functions.add( + SeenFunction( + clbl.name, clbl.name_in_target, + (base_dtype, exponent_dtype), + (tgt_dtype,))) + return var(clbl.name_in_target)(self.rec(expr.base, type_context), + self.rec(expr.exponent, type_context)) # }}} diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 67dc1fe249af91d9b73a7162867dcd98c7ef6bc7..63018189e7aaa729f6a008b4768d479f78e3cfeb 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -29,10 +29,11 @@ from pytools import memoize_method from loopy.target.c import CFamilyTarget, CFamilyASTBuilder from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper -from loopy.diagnostic import LoopyError +from loopy.diagnostic import LoopyError, LoopyTypeError from loopy.types import NumpyType from loopy.kernel.data import AddressSpace from pymbolic import var +from loopy.kernel.function_interface import ScalarCallable # {{{ vector types @@ -110,43 +111,82 @@ def _register_vector_types(dtype_registry): # }}} -# {{{ function mangler +# {{{ function scoper -def cuda_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +_CUDA_SPECIFIC_FUNCTIONS = { + "rsqrt": 1, + "atan2": 2, + } - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type([], arg_dtypes) - if dtype.kind == "c": - raise RuntimeError("min/max do not support complex numbers") +class CudaCallable(ScalarCallable): - if dtype.kind == "f": - name = "f" + name + def cuda_with_types(self, arg_id_to_dtype, callables_table): - return dtype, name + name = self.name - if name in ["pow"] and len(arg_dtypes) == 2: - dtype = np.find_common_type([], arg_dtypes) + if name in _CUDA_SPECIFIC_FUNCTIONS: + num_args = _CUDA_SPECIFIC_FUNCTIONS[name] - if dtype == np.float64: - pass # pow - elif dtype == np.float32: - name = name + "f" # powf - else: - raise RuntimeError(f"{name} does not support type {dtype}") + # {{{ sanity checks + + for id, dtype in arg_id_to_dtype.items(): + if not -1 <= id < num_args: + raise LoopyError("%s can take only %d arguments." % (name, + num_args)) + + if dtype is not None and dtype.kind == "c": + raise LoopyTypeError( + f"'{name}' does not support complex arguments.") + + # }}} + + for i in range(num_args): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in + arg_id_to_dtype.items() if id >= 0]) + + updated_arg_id_to_dtype = {id: NumpyType(dtype) + for id in range(-1, num_args)} + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype=updated_arg_id_to_dtype), + callables_table) - return dtype, name + if name == "dot": + # CUDA dot function: + # Performs dot product. Input types: vector and return type: scalar. 
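+            # A sketch (illustrative, not prescriptive) of how one of the
+            # _CUDA_SPECIFIC_FUNCTIONS above specializes, assuming a float64
+            # argument:
+            #
+            #     clbl = CudaCallable(name="rsqrt")
+            #     clbl, _ = clbl.cuda_with_types({0: NumpyType(np.float64)},
+            #             callables_table)
+            #     # clbl.name_in_target == "rsqrt"
+            #     # clbl.arg_id_to_dtype == {-1: float64, 0: float64}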
+ for i in range(2): + if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) - if name in "atan2" and len(arg_dtypes) == 2: - return arg_dtypes[0], name + input_dtype = arg_id_to_dtype[0] - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].fields["x"] - return scalar_dtype, name + scalar_dtype, offset, field_name = input_dtype.fields["x"] + return_dtype = scalar_dtype + return self.copy(arg_id_to_dtype={0: input_dtype, 1: input_dtype, + -1: return_dtype}) + + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) - return None + +def get_cuda_callables(): + cuda_func_ids = {"dot"} | set(_CUDA_SPECIFIC_FUNCTIONS) + return {id_: CudaCallable(name=id_) for id_ in cuda_func_ids} # }}} @@ -192,6 +232,9 @@ class CudaTarget(CFamilyTarget): super().__init__() + def split_kernel_at_global_barriers(self): + return True + def get_device_ast_builder(self): return CUDACASTBuilder(self) @@ -225,16 +268,51 @@ class CudaTarget(CFamilyTarget): # }}} +# {{{ preamable generator + +def cuda_preamble_generator(preamble_info): + from loopy.types import AtomicNumpyType + seen_64_bit_atomics = any( + isinstance(dtype, AtomicNumpyType) and dtype.numpy_dtype.itemsize == 8 + for dtype in preamble_info.seen_atomic_dtypes) + + if seen_64_bit_atomics: + # Source: + # docs.nvidia.com/cuda/cuda-c-programming-guide/index.html#atomic-functions + yield ("00_enable_64bit_atomics", """ + #if __CUDA_ARCH__ < 600 + __device__ double atomicAdd(double* address, double val) + { + unsigned long long int* address_as_ull = + (unsigned long long int*)address; + unsigned long long int old = *address_as_ull, assumed; + + do { + assumed = old; + old = atomicCAS(address_as_ull, assumed, + __double_as_longlong(val + + __longlong_as_double(assumed))); + + } while (assumed != old); + + return __longlong_as_double(old); + } + #endif + """) + +# }}} + + # {{{ ast builder class CUDACASTBuilder(CFamilyASTBuilder): # {{{ library - def function_manglers(self): - return ( - super().function_manglers() + [ - cuda_function_mangler - ]) + @property + def known_callables(self): + callables = super().known_callables + callables.update(get_cuda_callables()) + return callables # }}} @@ -260,7 +338,8 @@ class CUDACASTBuilder(CFamilyASTBuilder): _, local_grid_size = \ codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( - codegen_state.kernel.schedule, schedule_index)) + codegen_state.kernel.schedule, schedule_index), + codegen_state.callables_table) from loopy.symbolic import get_dependencies if not get_dependencies(local_grid_size): @@ -273,6 +352,12 @@ class CUDACASTBuilder(CFamilyASTBuilder): return FunctionDeclarationWrapper(fdecl) + def preamble_generators(self): + + return ( + super().preamble_generators() + [ + cuda_preamble_generator]) + # }}} # {{{ code generation guts @@ -350,6 +435,97 @@ class CUDACASTBuilder(CFamilyASTBuilder): return CudaConstant(arg_decl) + # {{{ code generation for atomic update + + def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, + lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): + + from pymbolic.primitives import Sum + from cgen import Statement + from pymbolic.mapper.stringifier import PREC_NONE + + if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [ + np.int32, np.int64, np.float32, np.float64]: + # atomicAdd + if isinstance(rhs_expr, Sum): + ecm = 
self.get_expression_to_code_mapper(codegen_state) + + new_rhs_expr = Sum(tuple(c for c in rhs_expr.children + if c != lhs_expr)) + lhs_expr_code = ecm(lhs_expr) + rhs_expr_code = ecm(new_rhs_expr) + + return Statement("atomicAdd(&{}, {})".format( + lhs_expr_code, rhs_expr_code)) + else: + from cgen import Block, DoWhile, Assign + from loopy.target.c import POD + old_val_var = codegen_state.var_name_generator("loopy_old_val") + new_val_var = codegen_state.var_name_generator("loopy_new_val") + + from loopy.kernel.data import TemporaryVariable + ecm = codegen_state.expression_to_code_mapper.with_assignments( + { + old_val_var: TemporaryVariable(old_val_var, lhs_dtype), + new_val_var: TemporaryVariable(new_val_var, lhs_dtype), + }) + + lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None) + + from pymbolic.mapper.substitutor import make_subst_func + from pymbolic import var + from loopy.symbolic import SubstitutionMapper + + subst = SubstitutionMapper( + make_subst_func({lhs_expr: var(old_val_var)})) + rhs_expr_code = ecm(subst(rhs_expr), prec=PREC_NONE, + type_context=rhs_type_context, + needed_dtype=lhs_dtype) + + cast_str = "" + old_val = old_val_var + new_val = new_val_var + + if lhs_dtype.numpy_dtype.kind == "f": + if lhs_dtype.numpy_dtype == np.float32: + ctype = "int" + elif lhs_dtype.numpy_dtype == np.float64: + ctype = "long" + else: + assert False + + old_val = "*(%s *) &" % ctype + old_val + new_val = "*(%s *) &" % ctype + new_val + cast_str = "(%s *) " % (ctype) + + return Block([ + POD(self, NumpyType(lhs_dtype.dtype, target=self.target), + old_val_var), + POD(self, NumpyType(lhs_dtype.dtype, target=self.target), + new_val_var), + DoWhile( + "atomicCAS(" + "%(cast_str)s&(%(lhs_expr)s), " + "%(old_val)s, " + "%(new_val)s" + ") != %(old_val)s" + % { + "cast_str": cast_str, + "lhs_expr": lhs_expr_code, + "old_val": old_val, + "new_val": new_val, + }, + Block([ + Assign(old_val_var, lhs_expr_code), + Assign(new_val_var, rhs_expr_code), + ]) + ) + ]) + else: + raise NotImplementedError("atomic update for '%s'" % lhs_dtype) + + # }}} + # }}} # }}} diff --git a/loopy/target/execution.py b/loopy/target/execution.py index d594f3e10a9fef7e3e16a2848d68eca7f29ebfc9..a8666f02bb152e008efc817da9461edce6a34d33 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -58,12 +58,13 @@ class SeparateArrayPackingController: It also repacks outgoing arrays of this type back into an object array. 
""" - def __init__(self, kernel): + def __init__(self, program, entrypoint): + # map from arg name self.packing_info = {} from loopy.kernel.array import ArrayBase - for arg in kernel.args: + for arg in program[entrypoint].args: if not isinstance(arg, ArrayBase): continue @@ -79,7 +80,8 @@ class SeparateArrayPackingController: name=arg.name, sep_shape=arg.sep_shape(), subscripts_and_names=subscripts_and_names, - is_written=arg.name in kernel.get_written_variables()) + is_written=arg.name in + program[entrypoint].get_written_variables()) def unpack(self, kernel_kwargs): if not self.packing_info: @@ -140,7 +142,7 @@ class ExecutionWrapperGeneratorBase: # {{{ integer arg finding from shapes def generate_integer_arg_finding_from_shapes( - self, gen, kernel, implemented_data_info): + self, gen, program, implemented_data_info): # a mapping from integer argument names to a list of tuples # (arg_name, expression), where expression is a # unary function of kernel.arg_dict[arg_name] @@ -165,7 +167,8 @@ class ExecutionWrapperGeneratorBase: if len(deps) == 1: integer_arg_var, = deps - if kernel.arg_dict[integer_arg_var.name].dtype.is_integral(): + if program.arg_dict[ + integer_arg_var.name].dtype.is_integral(): from pymbolic.algorithm import solve_affine_equations_for try: # friggin' overkill :) @@ -212,7 +215,7 @@ class ExecutionWrapperGeneratorBase: # {{{ integer arg finding from offsets def generate_integer_arg_finding_from_offsets(self, gen, kernel, - implemented_data_info): + implemented_data_info): options = kernel.options gen("# {{{ find integer arguments from offsets") @@ -616,7 +619,7 @@ class ExecutionWrapperGeneratorBase: def generate_host_code(self, gen, codegen_result): raise NotImplementedError - def __call__(self, kernel, codegen_result): + def __call__(self, program, entrypoint, codegen_result): """ Generates the wrapping python invoker for this execution target @@ -628,12 +631,14 @@ class ExecutionWrapperGeneratorBase: kernel """ - options = kernel.options - implemented_data_info = codegen_result.implemented_data_info + options = program[entrypoint].options + #FIXME: endswith is ugly maybe make + # codegen_result.implemented_data_infos a dict? + implemented_data_info = codegen_result.implemented_data_infos[entrypoint] from loopy.kernel.data import KernelArgument gen = PythonFunctionGenerator( - "invoke_%s_loopy_kernel" % kernel.name, + "invoke_%s_loopy_kernel" % entrypoint, self.system_args + [ "%s=None" % idi.name for idi in implemented_data_info @@ -648,21 +653,24 @@ class ExecutionWrapperGeneratorBase: self.initialize_system_args(gen) self.generate_integer_arg_finding_from_shapes( - gen, kernel, implemented_data_info) + gen, program[entrypoint], implemented_data_info) self.generate_integer_arg_finding_from_offsets( - gen, kernel, implemented_data_info) + gen, program[entrypoint], implemented_data_info) self.generate_integer_arg_finding_from_strides( - gen, kernel, implemented_data_info) + gen, program[entrypoint], implemented_data_info) self.generate_value_arg_check( - gen, kernel, implemented_data_info) - + gen, program[entrypoint], implemented_data_info) args = self.generate_arg_setup( - gen, kernel, implemented_data_info, options) + gen, program[entrypoint], implemented_data_info, options) + + #FIXME: should we make this as a dict as well. 
+ host_program_name = codegen_result.host_programs[entrypoint].name - self.generate_invocation(gen, codegen_result.host_program.name, args, - kernel, implemented_data_info) + self.generate_invocation(gen, host_program_name, args, + program[entrypoint], implemented_data_info) - self.generate_output_handler(gen, options, kernel, implemented_data_info) + self.generate_output_handler(gen, options, program[entrypoint], + implemented_data_info) if options.write_wrapper: output = gen.get() @@ -710,64 +718,66 @@ class KernelExecutorBase: .. automethod:: __call__ """ - def __init__(self, kernel): + def __init__(self, program, entrypoint): """ :arg kernel: a loopy.LoopKernel """ - self.kernel = kernel + self.program = program + self.entrypoint = entrypoint - self.packing_controller = SeparateArrayPackingController(kernel) + self.packing_controller = SeparateArrayPackingController(program, + entrypoint) - self.output_names = tuple(arg.name for arg in self.kernel.args - if arg.name in self.kernel.get_written_variables()) + self.output_names = tuple(arg.name for arg in self.program[entrypoint].args + if arg.is_output) self.has_runtime_typed_args = any( arg.dtype is None - for arg in kernel.args) + for arg in program[entrypoint].args) - def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set): + def get_typed_and_scheduled_program_uncached(self, entrypoint, arg_to_dtype_set): from loopy.kernel.tools import add_dtypes + from loopy.kernel import KernelState + from loopy.program import resolve_callables - kernel = self.kernel + program = resolve_callables(self.program) if arg_to_dtype_set: var_to_dtype = {} + entry_knl = program[entrypoint] for var, dtype in arg_to_dtype_set: - try: - dest_name = kernel.impl_arg_to_arg[var].name - except KeyError: + if var in entry_knl.impl_arg_to_arg: + dest_name = entry_knl.impl_arg_to_arg[var].name + else: dest_name = var - try: - var_to_dtype[dest_name] = dtype - except KeyError: - raise LoopyError("cannot set type for '%s': " - "no known variable/argument with that name" - % var) + var_to_dtype[dest_name] = dtype - kernel = add_dtypes(kernel, var_to_dtype) + program = program.with_kernel(add_dtypes(entry_knl, var_to_dtype)) from loopy.type_inference import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) + program = infer_unknown_types(program, expect_completion=True) - if kernel.schedule is None: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) + if program.state < KernelState.SCHEDULED: + from loopy.preprocess import preprocess_program + program = preprocess_program(program) from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) + for e in program.entrypoints: + program = program.with_kernel( + get_one_scheduled_kernel(program[e], program.callables_table)) - return kernel + return program - def get_typed_and_scheduled_kernel(self, arg_to_dtype_set): + def get_typed_and_scheduled_program(self, entrypoint, arg_to_dtype_set): from loopy import CACHING_ENABLED from loopy.preprocess import prepare_for_caching # prepare_for_caching() gets run by preprocess, but the kernel at this # stage is not guaranteed to be preprocessed. 
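+        # The uncached path above boils down to this per-entrypoint pipeline
+        # (sketch, eliding the dtype plumbing):
+        #
+        #     program = resolve_callables(self.program)
+        #     program = infer_unknown_types(program, expect_completion=True)
+        #     program = preprocess_program(program)
+        #     for e in program.entrypoints:
+        #         program = program.with_kernel(get_one_scheduled_kernel(
+        #                 program[e], program.callables_table))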
- cacheable_kernel = prepare_for_caching(self.kernel) - cache_key = (type(self).__name__, cacheable_kernel, arg_to_dtype_set) + cacheable_program = prepare_for_caching(self.program) + cache_key = (type(self).__name__, cacheable_program, arg_to_dtype_set) if CACHING_ENABLED: try: @@ -775,9 +785,11 @@ class KernelExecutorBase: except KeyError: pass - logger.debug("%s: typed-and-scheduled cache miss" % self.kernel.name) + logger.debug("%s: typed-and-scheduled cache miss" % + self.program.entrypoints) - kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set) + kernel = self.get_typed_and_scheduled_program_uncached(entrypoint, + arg_to_dtype_set) if CACHING_ENABLED: typed_and_scheduled_cache.store_if_not_present(cache_key, kernel) @@ -785,10 +797,13 @@ class KernelExecutorBase: return kernel def arg_to_dtype_set(self, kwargs): + kwargs = kwargs.copy() if not self.has_runtime_typed_args: return None - impl_arg_to_arg = self.kernel.impl_arg_to_arg + entrypoint = kwargs.pop("entrypoint") + + impl_arg_to_arg = self.program[entrypoint].impl_arg_to_arg arg_to_dtype = {} for arg_name, val in kwargs.items(): arg = impl_arg_to_arg.get(arg_name, None) @@ -809,18 +824,18 @@ class KernelExecutorBase: # {{{ debugging aids - def get_highlighted_code(self, arg_to_dtype=None, code=None): + def get_highlighted_code(self, entrypoint, arg_to_dtype=None, code=None): if code is None: - code = self.get_code(arg_to_dtype) + code = self.get_code(entrypoint, arg_to_dtype) return get_highlighted_code(code) - def get_code(self, arg_to_dtype=None): + def get_code(self, entrypoint, arg_to_dtype=None): def process_dtype(dtype): if isinstance(dtype, type) and issubclass(dtype, np.generic): dtype = np.dtype(dtype) if isinstance(dtype, np.dtype): from loopy.types import NumpyType - dtype = NumpyType(dtype, self.kernel.target) + dtype = NumpyType(dtype, self.program.target) return dtype @@ -828,22 +843,19 @@ class KernelExecutorBase: arg_to_dtype = frozenset( (k, process_dtype(v)) for k, v in arg_to_dtype.items()) - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype) + kernel = self.get_typed_and_scheduled_program(entrypoint, arg_to_dtype) from loopy.codegen import generate_code_v2 code = generate_code_v2(kernel) return code.device_code() - def get_invoker_uncached(self, kernel, *args): - raise NotImplementedError() - - def get_wrapper_generator(self): + def get_invoker_uncached(self, program, entrypoint, *args): raise NotImplementedError() - def get_invoker(self, kernel, *args): + def get_invoker(self, program, entrypoint, *args): from loopy import CACHING_ENABLED - cache_key = (self.__class__.__name__, kernel) + cache_key = (self.__class__.__name__, (program, entrypoint)) if CACHING_ENABLED: try: @@ -851,9 +863,9 @@ class KernelExecutorBase: except KeyError: pass - logger.debug("%s: invoker cache miss" % kernel.name) + logger.debug("%s: invoker cache miss" % entrypoint) - invoker = self.get_invoker_uncached(kernel, *args) + invoker = self.get_invoker_uncached(program, entrypoint, *args) if CACHING_ENABLED: invoker_cache.store_if_not_present(cache_key, invoker) diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index d047b6464c1fc86010d1943527af68be73278bbb..526d3855e023df4357c68ca9b59975cbb3fd908f 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -171,8 +171,9 @@ class ISPCTarget(CFamilyTarget): host_program_name_suffix = "" device_program_name_suffix = "_inner" - def pre_codegen_check(self, kernel): - gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs() + def 
pre_codegen_check(self, kernel, callables_table): + gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs( + callables_table) if len(lsize) > 1: for i, ls_i in enumerate(lsize[1:]): if ls_i != 1: diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 096b4de940dd487e8333f397197fe451239caf90..fa356068eeb617c606acdf4eef2dd9c762ee35ed 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -30,11 +30,11 @@ from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper from pytools import memoize_method from loopy.diagnostic import LoopyError, LoopyTypeError from loopy.types import NumpyType -from loopy.target.c import DTypeRegistryWrapper, c_math_mangler -from loopy.kernel.data import AddressSpace, CallMangleInfo +from loopy.target.c import DTypeRegistryWrapper +from loopy.kernel.data import AddressSpace +from loopy.kernel.function_interface import ScalarCallable from pymbolic import var -from functools import partial # {{{ dtype registry wrappers @@ -180,77 +180,225 @@ VECTOR_LITERAL_FUNCS = { } -def opencl_function_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +class OpenCLCallable(ScalarCallable): + """ + Records information about OpenCL functions which are not covered by + :class:`loopy.target.c.CMathCallable`. + """ - # OpenCL has min(), max() for integer types - if name in ["max", "min"] and len(arg_dtypes) == 2: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "i": - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) - - if name == "pow" and len(arg_dtypes) == 2: - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - if dtype == np.float64: - name = "powf64" - elif dtype == np.float32: - name = "powf32" - else: - raise LoopyTypeError(f"'pow' does not support type {dtype}.") - - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=2*(result_dtype,)) - - if name == "dot": - scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"] - return CallMangleInfo( - target_name=name, - result_dtypes=(NumpyType(scalar_dtype),), - arg_dtypes=(arg_dtypes[0],)*2) - - if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS: - num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name] - if len(arg_dtypes) != num_args: - raise LoopyError("%s takes %d arguments (%d received)" - % (name, num_args, len(arg_dtypes))) - - dtype = np.find_common_type( - [], [dtype.numpy_dtype for dtype in arg_dtypes]) - - if dtype.kind == "c": - raise LoopyError("%s does not support complex numbers" - % name) - - result_dtype = NumpyType(dtype) - return CallMangleInfo( - target_name=name, - result_dtypes=(result_dtype,), - arg_dtypes=(result_dtype,)*num_args) - - if name in VECTOR_LITERAL_FUNCS: - base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name] - - if count != len(arg_dtypes): - return None - - return CallMangleInfo( - target_name="(%s%d) " % (base_tp_name, count), - result_dtypes=(kernel.target.vector_dtype( - NumpyType(dtype), count),), - arg_dtypes=(NumpyType(dtype),)*count) - - return None + def with_types(self, arg_id_to_dtype, callables_table): + name = self.name + + # unary functions + if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh", + "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", + "erf", "erfc"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 0: + raise 
LoopyError(f"'{name}' can take only one argument.") + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0] + dtype = dtype.numpy_dtype + + if dtype.kind in ("u", "i"): + # ints and unsigned casted to float32 + dtype = np.float32 + elif dtype.kind == "c": + raise LoopyTypeError(f"{name} does not support type {dtype}") + + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: NumpyType(dtype), -1: + NumpyType(dtype)}), + callables_table) + # binary functions + elif name in ["fmax", "fmin", "atan2", "copysign"]: + + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + #FIXME: Do we need to raise here?: + # The pattern we generally follow is that if we don't find + # a function, then we just return None + raise LoopyError("%s can take only two arguments." % name) + + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or ( + arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None): + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if id >= 0]) + + if dtype.kind == "c": + raise LoopyTypeError("%s does not support complex numbers") + + dtype = NumpyType(dtype) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), + callables_table) + + elif name in ["max", "min"]: + for id in arg_id_to_dtype: + if not -1 <= id <= 1: + raise LoopyError("%s can take only 2 arguments." % name) + if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype: + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + common_dtype = np.find_common_type( + [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items() + if (id >= 0 and dtype is not None)]) + + if common_dtype.kind in ["u", "i", "f"]: + if common_dtype.kind == "f": + name = "f"+name + + dtype = NumpyType(common_dtype) + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}), + callables_table) + else: + # Unsupported type. 
+                raise LoopyError("%s function not supported for the types %s" %
+                        (name, common_dtype))
+
+        elif name == "dot":
+            for id in arg_id_to_dtype:
+                if not -1 <= id <= 1:
+                    raise LoopyError(f"'{name}' can take only 2 arguments.")
+
+            if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
+                    arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
+                # the types provided aren't mature enough to specialize the
+                # callable
+                return (
+                        self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                        callables_table)
+
+            dtype = arg_id_to_dtype[0]
+            scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"]
+            return (
+                    self.copy(name_in_target=name, arg_id_to_dtype={-1:
+                        NumpyType(scalar_dtype), 0: dtype, 1: dtype}),
+                    callables_table)
+
+        elif name == "pow":
+            for id in arg_id_to_dtype:
+                if not -1 <= id <= 1:
+                    raise LoopyError(f"'{name}' can take only 2 arguments.")
+
+            common_dtype = np.find_common_type(
+                    [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items()
+                         if (id >= 0 and dtype is not None)])
+
+            if common_dtype == np.float64:
+                name = "powf64"
+            elif common_dtype == np.float32:
+                name = "powf32"
+            else:
+                raise LoopyTypeError(
+                        f"'pow' does not support type {common_dtype}.")
+
+            result_dtype = NumpyType(common_dtype)
+
+            return (
+                    self.copy(name_in_target=name,
+                        arg_id_to_dtype={-1: result_dtype,
+                            0: result_dtype, 1: result_dtype}),
+                    callables_table)
+
+        elif name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS:
+            num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name]
+            for id in arg_id_to_dtype:
+                if not -1 <= id < num_args:
+                    raise LoopyError("%s can take only %d arguments." % (name,
+                            num_args))
+
+            for i in range(num_args):
+                if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None:
+                    # the types provided aren't mature enough to specialize the
+                    # callable
+                    return (
+                            self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                            callables_table)
+
+            dtype = np.find_common_type(
+                    [], [dtype.numpy_dtype for id, dtype in
+                         arg_id_to_dtype.items() if id >= 0])
+
+            if dtype.kind == "c":
+                raise LoopyError("%s does not support complex numbers"
+                        % name)
+
+            updated_arg_id_to_dtype = {id: NumpyType(dtype) for id in range(-1,
+                num_args)}
+
+            return (
+                    self.copy(name_in_target=name,
+                        arg_id_to_dtype=updated_arg_id_to_dtype),
+                    callables_table)
+
+        elif name in VECTOR_LITERAL_FUNCS:
+            base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name]
+
+            for id in arg_id_to_dtype:
+                if not -1 <= id < count:
+                    raise LoopyError("%s can take only %d arguments." % (name,
+                            count))
+
+            for i in range(count):
+                if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None:
+                    # the types provided aren't mature enough to specialize the
+                    # callable
+                    return (
+                            self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                            callables_table)
+
+            updated_arg_id_to_dtype = {id: NumpyType(dtype) for id in
+                range(count)}
+            updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype(
+                    NumpyType(dtype), count)
+
+            return (
+                    self.copy(name_in_target="(%s%d) " % (base_tp_name, count),
+                        arg_id_to_dtype=updated_arg_id_to_dtype),
+                    callables_table)
+
+        # does not satisfy any of the conditions needed for specialization.
+        # hence just returning a copy of the callable.
+        return (
+                self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                callables_table)
+
+
+def get_opencl_callables():
+    """
+    Returns a mapping from OpenCL-specific function identifiers to matching
+    :class:`OpenCLCallable` instances.
+ """ + opencl_function_ids = ( + {"max", "min", "dot", "pow", "abs", "acos", "asin", + "atan", "cos", "cosh", "sin", "sinh", "pow", "atan2", "tanh", "exp", + "log", "log10", "sqrt", "ceil", "floor", "max", "min", "fmax", "fmin", + "fabs", "tan", "erf", "erfc"} + | set(_CL_SIMPLE_MULTI_ARG_FUNCTIONS) + | set(VECTOR_LITERAL_FUNCS)) + + return {id_: OpenCLCallable(name=id_) for id_ in + opencl_function_ids} # }}} @@ -274,6 +422,8 @@ def opencl_symbol_mangler(kernel, name): return NumpyType(np.dtype(np.int32)), name elif name.startswith("LONG_"): return NumpyType(np.dtype(np.int64)), name + elif name == "HUGE_VAL": + return NumpyType(np.dtype(np.float64)), name else: return None @@ -310,6 +460,7 @@ def opencl_preamble_generator(preamble_info): from loopy.tools import remove_common_indentation kernel = preamble_info.kernel + yield ("00_declare_gid_lid", remove_common_indentation(""" #define lid(N) ((%(idx_ctype)s) get_local_id(N)) @@ -417,13 +568,11 @@ class OpenCLTarget(CFamilyTarget): class OpenCLCASTBuilder(CFamilyASTBuilder): # {{{ library - def function_manglers(self): - return ( - [ - opencl_function_mangler, - partial(c_math_mangler, modify_name=False) - ] + - super().function_manglers()) + @property + def known_callables(self): + callables = super().known_callables + callables.update(get_opencl_callables()) + return callables def symbol_manglers(self): return ( @@ -432,13 +581,10 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): ]) def preamble_generators(self): - from loopy.library.reduction import reduction_preamble_generator return ( super().preamble_generators() + [ - opencl_preamble_generator, - reduction_preamble_generator, - ]) + opencl_preamble_generator]) # }}} @@ -451,6 +597,11 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): from loopy.target.c import FunctionDeclarationWrapper assert isinstance(fdecl, FunctionDeclarationWrapper) + if not codegen_state.is_entrypoint: + # auxiliary kernels need not mention opencl speicific qualifiers + # for a functions signature + return fdecl + fdecl = fdecl.subdecl from cgen.opencl import CLKernel, CLRequiredWorkGroupSize @@ -459,7 +610,8 @@ class OpenCLCASTBuilder(CFamilyASTBuilder): from loopy.schedule import get_insn_ids_for_block_at _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs( get_insn_ids_for_block_at( - codegen_state.kernel.schedule, schedule_index)) + codegen_state.kernel.schedule, schedule_index), + codegen_state.callables_table) from loopy.symbolic import get_dependencies if not get_dependencies(local_sizes): diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index bb04ddc3d8f4b01b859242766bc0230c68dccb23..70663b2daffc43da19d69473ece5369244981fd6 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -25,13 +25,13 @@ THE SOFTWARE. 
import numpy as np import pymbolic.primitives as p -from loopy.kernel.data import CallMangleInfo from loopy.target.opencl import (OpenCLTarget, OpenCLCASTBuilder, ExpressionToOpenCLCExpressionMapper) from loopy.target.python import PythonASTBuilderBase from loopy.types import NumpyType -from loopy.diagnostic import LoopyError, warn_with_kernel +from loopy.diagnostic import LoopyError, warn_with_kernel, LoopyTypeError from warnings import warn +from loopy.kernel.function_interface import ScalarCallable import logging logger = logging.getLogger(__name__) @@ -130,7 +130,7 @@ def adjust_local_temp_var_storage(kernel, device): # {{{ check sizes against device properties -def check_sizes(kernel, device): +def check_sizes(kernel, callables_table, device): import loopy as lp from loopy.diagnostic import LoopyAdvisory, LoopyError @@ -147,7 +147,8 @@ def check_sizes(kernel, device): if isinstance(arg, lp.ValueArg) and arg.approximately is not None: parameters[arg.name] = arg.approximately - glens, llens = kernel.get_grid_size_upper_bounds_as_exprs() + glens, llens = ( + kernel.get_grid_size_upper_bounds_as_exprs(callables_table)) if (max(len(glens), len(llens)) > device.max_work_item_dimensions): @@ -195,36 +196,86 @@ def check_sizes(kernel, device): # }}} -def pyopencl_function_mangler(target, name, arg_dtypes): - if len(arg_dtypes) == 1 and isinstance(name, str): - arg_dtype, = arg_dtypes +# {{{ pyopencl function scopers - if arg_dtype.is_complex(): - if arg_dtype.numpy_dtype == np.complex64: - tpname = "cfloat" - elif arg_dtype.numpy_dtype == np.complex128: - tpname = "cdouble" +class PyOpenCLCallable(ScalarCallable): + """ + Records information about the callables which are not covered by + :class:`loopy.target.opencl.OpenCLCallable` + """ + def with_types(self, arg_id_to_dtype, callables_table): + + name = self.name + + for id in arg_id_to_dtype: + # since all the below functions are single arg. + if not -1 <= id <= 0: + raise LoopyError("%s can only take one argument." % name) + + if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None: + # the types provided aren't mature enough to specialize the + # callable + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) + + dtype = arg_id_to_dtype[0] + + if name in ["real", "imag", "abs"]: + if dtype.is_complex(): + if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return ( + self.copy(name_in_target=f"{tpname}_{name}", + arg_id_to_dtype={0: dtype, -1: NumpyType( + np.dtype(dtype.numpy_dtype.type(0).real))}), + callables_table) + + if name in ["sqrt", "exp", "log", + "sin", "cos", "tan", + "sinh", "cosh", "tanh", + "conj", "abs"]: + if dtype.is_complex(): + # function parameters are complex. + if dtype.numpy_dtype == np.complex64: + tpname = "cfloat" + elif dtype.numpy_dtype == np.complex128: + tpname = "cdouble" + else: + raise LoopyTypeError("unexpected complex type '%s'" % dtype) + + return ( + self.copy(name_in_target=f"{tpname}_{name}", + arg_id_to_dtype={0: dtype, -1: dtype}), + callables_table) else: - raise RuntimeError("unexpected complex type '%s'" % arg_dtype) + # function calls for floating parameters. 
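+                # For complex inputs the branch above rewrites to the
+                # pyopencl complex helpers; e.g. (sketch):
+                #
+                #     clbl = PyOpenCLCallable(name="sin")
+                #     clbl, _ = clbl.with_types({0: NumpyType(np.complex128)},
+                #             callables_table)
+                #     # clbl.name_in_target == "cdouble_sin"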
+ numpy_dtype = dtype.numpy_dtype + if numpy_dtype.kind in ("u", "i"): + dtype = dtype.copy(numpy_dtype=np.float32) + if name == "abs": + name = "fabs" + return ( + self.copy(name_in_target=name, + arg_id_to_dtype={0: dtype, -1: dtype}), + callables_table) + + return ( + self.copy(arg_id_to_dtype=arg_id_to_dtype), + callables_table) - if name in ["sqrt", "exp", "log", - "sin", "cos", "tan", - "sinh", "cosh", "tanh", - "conj"]: - return CallMangleInfo( - target_name=f"{tpname}_{name}", - result_dtypes=(arg_dtype,), - arg_dtypes=(arg_dtype,)) - if name in ["real", "imag", "abs"]: - return CallMangleInfo( - target_name=f"{tpname}_{name}", - result_dtypes=(NumpyType( - np.dtype(arg_dtype.numpy_dtype.type(0).real)), - ), - arg_dtypes=(arg_dtype,)) +def get_pyopencl_callables(): + pyopencl_ids = ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh", + "tanh", "conj", "real", "imag", "abs"] + return {id_: PyOpenCLCallable(name=id_) for id_ in pyopencl_ids} - return None +# }}} # {{{ preamble generator @@ -569,8 +620,8 @@ class PyOpenCLTarget(OpenCLTarget): kernel = adjust_local_temp_var_storage(kernel, self.device) return kernel - def pre_codegen_check(self, kernel): - check_sizes(kernel, self.device) + def pre_codegen_check(self, kernel, callables_table): + check_sizes(kernel, callables_table, self.device) def get_host_ast_builder(self): return PyOpenCLPythonASTBuilder(self) @@ -638,9 +689,10 @@ class PyOpenCLTarget(OpenCLTarget): def get_kernel_executor_cache_key(self, queue, **kwargs): return queue.context - def get_kernel_executor(self, kernel, queue, **kwargs): + def get_kernel_executor(self, program, queue, **kwargs): from loopy.target.pyopencl_execution import PyOpenCLKernelExecutor - return PyOpenCLKernelExecutor(queue.context, kernel) + return PyOpenCLKernelExecutor(queue.context, program, + entrypoint=kwargs.pop("entrypoint")) def with_device(self, device): return type(self)(device) @@ -973,21 +1025,20 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder): # {{{ library - def function_manglers(self): - from loopy.library.random123 import random123_function_mangler - return ( - [ - pyopencl_function_mangler, - random123_function_mangler - # order matters: e.g. prefer our abs() over that of the - # superclass - ] + super().function_manglers()) + @property + def known_callables(self): + from loopy.library.random123 import get_random123_callables + + # order matters: e.g. prefer our abs() over that of the + # superclass + callables = super().known_callables + callables.update(get_pyopencl_callables()) + callables.update(get_random123_callables(self.target)) + return callables def preamble_generators(self): - from loopy.library.random123 import random123_preamble_generator return ([ pyopencl_preamble_generator, - random123_preamble_generator, ] + super().preamble_generators()) # }}} diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index cdee5600bb5dd0dce3a3971583604f737c6913d9..5ac37e1520abc774b877d16741302ff7b79810af 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -257,7 +257,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, context, kernel): + def __init__(self, context, program, entrypoint): """ :arg context: a :class:`pyopencl.Context` :arg kernel: may be a loopy.LoopKernel, a generator returning kernels @@ -266,62 +266,69 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): specific arguments. 
""" - super().__init__(kernel) + super().__init__(program, entrypoint) self.context = context from loopy.target.pyopencl import PyOpenCLTarget - if isinstance(kernel.target, PyOpenCLTarget): - self.kernel = kernel.copy(target=( - kernel.target.with_device(context.devices[0]))) + if isinstance(program.target, PyOpenCLTarget): + self.program = program.copy(target=( + program.target.with_device(context.devices[0]))) - def get_invoker_uncached(self, kernel, codegen_result): + def get_invoker_uncached(self, program, entrypoint, codegen_result): generator = PyOpenCLExecutionWrapperGenerator() - return generator(kernel, codegen_result) + return generator(program, entrypoint, codegen_result) def get_wrapper_generator(self): return PyOpenCLExecutionWrapperGenerator() @memoize_method - def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + def program_info(self, entrypoint, arg_to_dtype_set=frozenset(), + all_kwargs=None): + program = self.get_typed_and_scheduled_program(entrypoint, + arg_to_dtype_set) + # FIXME: now just need to add the types to the arguments from loopy.codegen import generate_code_v2 from loopy.target.execution import get_highlighted_code - codegen_result = generate_code_v2(kernel) + codegen_result = generate_code_v2(program) dev_code = codegen_result.device_code() - if self.kernel.options.write_cl: + if program[entrypoint].options.write_cl: + #FIXME: redirect to "translation unit" level option as well. output = dev_code - if self.kernel.options.highlight_cl: + if self.program[entrypoint].options.highlight_cl: output = get_highlighted_code(output) - if self.kernel.options.write_cl is True: + if self.program[entrypoint].options.write_cl is True: print(output) else: - with open(self.kernel.options.write_cl, "w") as outf: + with open(self.program[entrypoint].options.write_cl, "w") as outf: outf.write(output) - if self.kernel.options.edit_cl: + if program[entrypoint].options.edit_cl: + #FIXME: redirect to "translation unit" level option as well. from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.cl") import pyopencl as cl + #FIXME: redirect to "translation unit" level option as well. 
cl_program = ( cl.Program(self.context, dev_code) - .build(options=kernel.options.cl_build_options)) + .build(options=program[entrypoint].options.cl_build_options)) cl_kernels = _Kernels() - for dp in codegen_result.device_programs: - setattr(cl_kernels, dp.name, getattr(cl_program, dp.name)) + for dp in cl_program.kernel_names.split(";"): + setattr(cl_kernels, dp, getattr(cl_program, dp)) return _KernelInfo( - kernel=kernel, + program=program, cl_kernels=cl_kernels, - implemented_data_info=codegen_result.implemented_data_info, - invoker=self.get_invoker(kernel, codegen_result)) + implemented_data_info=codegen_result.implemented_data_infos[ + entrypoint], + invoker=self.get_invoker(program, entrypoint, codegen_result)) def __call__(self, queue, **kwargs): """ @@ -356,10 +363,12 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) + program_info = self.program_info(kwargs["entrypoint"], + self.arg_to_dtype_set(kwargs)) + kwargs.pop("entrypoint") - return kernel_info.invoker( - kernel_info.cl_kernels, queue, allocator, wait_for, + return program_info.invoker( + program_info.cl_kernels, queue, allocator, wait_for, out_host, **kwargs) # }}} diff --git a/loopy/target/python.py b/loopy/target/python.py index a1557e47bdf8990e7aa89472b59f3c9fc3666a05..c7f20ff559cad5d23efed9f091a1a5407337277e 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -27,11 +27,11 @@ import numpy as np from pymbolic.mapper import Mapper from pymbolic.mapper.stringifier import StringifyMapper -from loopy.type_inference import TypeInferenceMapper +from loopy.type_inference import TypeReader from loopy.kernel.data import ValueArg from loopy.diagnostic import LoopyError # noqa from loopy.target import ASTBuilderBase -from genpy import Suite +from genpy import Suite, Collection # {{{ expression to code @@ -42,7 +42,8 @@ class ExpressionToPythonMapper(StringifyMapper): self.codegen_state = codegen_state if type_inf_mapper is None: - type_inf_mapper = TypeInferenceMapper(self.kernel) + type_inf_mapper = TypeReader(self.kernel, + self.codegen_state.callables_table) self.type_inf_mapper = type_inf_mapper def handle_unsupported_expression(self, victim, enclosing_prec): @@ -80,48 +81,37 @@ class ExpressionToPythonMapper(StringifyMapper): expr, enclosing_prec) def map_call(self, expr, enclosing_prec): - from pymbolic.primitives import Variable from pymbolic.mapper.stringifier import PREC_NONE - identifier = expr.function + identifier_name = self.codegen_state.callables_table[ + expr.function.name].name - if identifier.name in ["indexof", "indexof_vec"]: + if identifier_name in ["indexof", "indexof_vec"]: raise LoopyError( "indexof, indexof_vec not yet supported in Python") - if isinstance(identifier, Variable): - identifier = identifier.name - - par_dtypes = tuple(self.type_inf_mapper(par) for par in expr.parameters) + clbl = self.codegen_state.callables_table[ + expr.function.name] str_parameters = None + number_of_assignees = len([key for key in + clbl.arg_id_to_dtype.keys() if key < 0]) - mangle_result = self.kernel.mangle_function( - identifier, par_dtypes, - ast_builder=self.codegen_state.ast_builder) - - if mangle_result is None: - raise RuntimeError("function '%s' unknown--" - "maybe you need to register a function mangler?" 
- % identifier) - - if len(mangle_result.result_dtypes) != 1: + if number_of_assignees != 1: raise LoopyError("functions with more or fewer than one return value " "may not be used in an expression") - str_parameters = [ - self.rec(par, PREC_NONE) - for par, par_dtype, tgt_dtype in zip( - expr.parameters, par_dtypes, mangle_result.arg_dtypes)] + str_parameters = [self.rec(par, PREC_NONE) for par in expr.parameters] from loopy.codegen import SeenFunction self.codegen_state.seen_functions.add( - SeenFunction(identifier, - mangle_result.target_name, - mangle_result.arg_dtypes or par_dtypes, - mangle_result.result_dtypes)) + SeenFunction(clbl.name, + clbl.name_in_target, + clbl.input_dtypes, + clbl.result_dtypes)) - return "{}({})".format(mangle_result.target_name, ", ".join(str_parameters)) + return "{}({})".format(clbl.name_in_target, + ", ".join(str_parameters)) def map_group_hw_index(self, expr, enclosing_prec): raise LoopyError("plain Python does not have group hw axes") @@ -147,16 +137,6 @@ class ExpressionToPythonMapper(StringifyMapper): # }}} -# {{{ genpy extensions - -class Collection(Suite): - def generate(self): - for item in self.contents: - yield from item.generate() - -# }}} - - # {{{ ast builder def _numpy_single_arg_function_mangler(kernel, name, arg_dtypes): @@ -185,13 +165,12 @@ class PythonASTBuilderBase(ASTBuilderBase): """A Python host AST builder for integration with PyOpenCL. """ - # {{{ code generation guts - - def function_manglers(self): - return ( - super().function_manglers() + [ - _numpy_single_arg_function_mangler, - ]) + @property + def known_callables(self): + from loopy.target.c import get_c_callables + callables = super().known_callables + callables.update(get_c_callables()) + return callables def preamble_generators(self): return ( @@ -199,6 +178,13 @@ class PythonASTBuilderBase(ASTBuilderBase): _base_python_preamble_generator ]) + # {{{ code generation guts + + @property + def ast_module(self): + import genpy + return genpy + def get_function_declaration(self, codegen_state, codegen_result, schedule_index): return None diff --git a/loopy/tools.py b/loopy/tools.py index 5be4ca6b58f0a2e0dd5907eacf4749dd3aaf927b..644082ed61143798f3c01e5af820092aabd665af 100644 --- a/loopy/tools.py +++ b/loopy/tools.py @@ -35,6 +35,17 @@ def is_integer(obj): return isinstance(obj, (int, np.integer)) +def update_persistent_hash(obj, key_hash, key_builder): + """ + Custom hash computation function for use with + :class:`pytools.persistent_dict.PersistentDict`. + + Only works in conjunction with :class:`loopy.tools.KeyBuilder`. 
+ """ + for field_name in obj.hash_fields: + key_builder.rec(key_hash, getattr(obj, field_name)) + + # {{{ custom KeyBuilder subclass class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase): @@ -52,6 +63,13 @@ class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase): self.key_hash.update(type(expr.operation).__name__.encode("utf-8")) self.rec(expr.expr, *args) + def map_foreign(self, expr, *args, **kwargs): + """Mapper method dispatch for non-:mod:`pymbolic` objects.""" + if expr is None: + self.key_hash.update(b"") + else: + PersistentHashWalkMapperBase.map_foreign(self, expr, *args, **kwargs) + class LoopyKeyBuilder(KeyBuilderBase): """A custom :class:`pytools.persistent_dict.KeyBuilder` subclass @@ -72,6 +90,11 @@ class LoopyKeyBuilder(KeyBuilderBase): update_for_defaultdict = update_for_dict + def update_for_frozenset(self, key_hash, key): + for set_key in sorted(key, + key=lambda obj: type(obj).__name__ + str(obj)): + self.rec(key_hash, set_key) + def update_for_BasicSet(self, key_hash, key): # noqa from islpy import Printer prn = Printer.to_str(key.get_ctx()) @@ -99,6 +122,8 @@ class LoopyKeyBuilder(KeyBuilderBase): else: PersistentHashWalkMapper(key_hash)(key) + update_for_PMap = update_for_dict # noqa: N815 + class PymbolicExpressionHashWrapper: def __init__(self, expression): diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py index bc324d7fa96ad409c27e048a3258f6eca2d0f1b9..1e03ade94710b25cd56eecc7079afdadf567a82c 100644 --- a/loopy/transform/add_barrier.py +++ b/loopy/transform/add_barrier.py @@ -24,6 +24,8 @@ THE SOFTWARE. from loopy.kernel.instruction import BarrierInstruction from loopy.match import parse_match from loopy.transform.instruction import add_dependency +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel __doc__ = """ .. currentmodule:: loopy @@ -34,6 +36,7 @@ __doc__ = """ # {{{ add_barrier +@iterate_over_kernels_if_given_program def add_barrier(kernel, insn_before="", insn_after="", id_based_on=None, tags=None, synchronization_kind="global", mem_kind=None): """Takes in a kernel that needs to be added a barrier and returns a kernel @@ -53,6 +56,8 @@ def add_barrier(kernel, insn_before="", insn_after="", id_based_on=None, for "global" bariers. If not supplied, defaults to *synchronization_kind* """ + assert isinstance(kernel, LoopKernel) + if mem_kind is None: mem_kind = synchronization_kind diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py index 0a38790152f9e1325733a8bdc47d13f05d400c39..8376688198c3cff232d9f9006883d1b236efe367 100644 --- a/loopy/transform/arithmetic.py +++ b/loopy/transform/arithmetic.py @@ -23,9 +23,13 @@ THE SOFTWARE. 
from loopy.diagnostic import LoopyError +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel + # {{{ fold constants +@iterate_over_kernels_if_given_program def fold_constants(kernel): from loopy.symbolic import ConstantFoldingMapper cfm = ConstantFoldingMapper() @@ -49,7 +53,9 @@ def fold_constants(kernel): # {{{ collect_common_factors_on_increment # thus far undocumented +@iterate_over_kernels_if_given_program def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()): + assert isinstance(kernel, LoopKernel) # FIXME: Does not understand subst rules for now if kernel.substitutions: from loopy.transform.subst import expand_subst diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py index e27e5902644205e8a1643b4c243ba8ae6532fafa..5da142e3d400edf151ee755990d1fa4845aa147e 100644 --- a/loopy/transform/batch.py +++ b/loopy/transform/batch.py @@ -25,6 +25,9 @@ from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingCont from loopy.kernel.data import ValueArg, ArrayArg import islpy as isl +from loopy.program import iterate_over_kernels_if_given_program + + __doc__ = """ .. currentmodule:: loopy @@ -98,6 +101,7 @@ def _add_unique_dim_name(name, dim_names): return (ng(name),) + tuple(dim_names) +@iterate_over_kernels_if_given_program def to_batched(kernel, nbatches, batch_varying_args, batch_iname_prefix="ibatch", sequential=False): """Takes in a kernel that carries out an operation and returns a kernel diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py index 84503a618b067a147dc5181c2251d17d8b83eb44..e8c4bc2e9bd687d782a4d9f71dbc5e3a54eb639b 100644 --- a/loopy/transform/buffer.py +++ b/loopy/transform/buffer.py @@ -30,6 +30,9 @@ from pytools.persistent_dict import WriteOncePersistentDict from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper from loopy.version import DATA_MODEL_VERSION from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel +from loopy.program import Program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable from pymbolic import var @@ -127,10 +130,10 @@ buffer_array_cache = WriteOncePersistentDict( # Adding an argument? also add something to the cache_key below. -def buffer_array(kernel, var_name, buffer_inames, init_expression=None, - store_expression=None, within=None, default_tag="l.auto", - temporary_scope=None, temporary_is_local=None, - fetch_bounding_box=False): +def buffer_array_for_single_kernel(kernel, callables_table, var_name, + buffer_inames, init_expression=None, store_expression=None, + within=None, default_tag="l.auto", temporary_scope=None, + temporary_is_local=None, fetch_bounding_box=False): """Replace accesses to *var_name* with ones to a temporary, which is created and acts as a buffer. To perform this transformation, the access footprint to *var_name* is determined and a temporary of a suitable @@ -166,6 +169,20 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, fetched. 
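+
+    A rough usage sketch (the array name, iname, and expressions here are
+    illustrative)::
+
+        knl = lp.buffer_array(knl, "a", buffer_inames=["i_inner"],
+                init_expression="0", store_expression="base + buffer")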
""" + if isinstance(kernel, Program): + kernel_names = [i for i, clbl in + kernel.callables_table.items() if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise LoopyError() + + return kernel.with_kernel(buffer_array(kernel[kernel_names[0]], + var_name, buffer_inames, init_expression, store_expression, within, + default_tag, temporary_scope, temporary_is_local, + fetch_bounding_box, kernel.callables_table)) + + assert isinstance(kernel, LoopKernel) + # {{{ unify temporary_scope / temporary_is_local from loopy.kernel.data import AddressSpace @@ -237,7 +254,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, from loopy.preprocess import prepare_for_caching key_kernel = prepare_for_caching(kernel) - cache_key = (key_kernel, var_name, tuple(buffer_inames), + cache_key = (key_kernel, var_name, + tuple(buffer_inames), PymbolicExpressionHashWrapper(init_expression), PymbolicExpressionHashWrapper(store_expression), within, default_tag, temporary_scope, fetch_bounding_box) @@ -525,7 +543,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel) + kernel = assign_automatic_axes(kernel, callables_table) if CACHING_ENABLED: from loopy.preprocess import prepare_for_caching @@ -534,4 +552,25 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None, return kernel + +def buffer_array(program, *args, **kwargs): + assert isinstance(program, Program) + + new_callables = {} + + for func_id, clbl in program.callables_table.items(): + if isinstance(clbl, CallableKernel): + clbl = clbl.copy( + subkernel=buffer_array_for_single_kernel(clbl.subkernel, + program.callables_table, *args, **kwargs)) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError() + + new_callables[func_id] = clbl + + return program.copy(callables_table=new_callables) + + # vim: foldmethod=marker diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py new file mode 100644 index 0000000000000000000000000000000000000000..a5c4c52840541efcaf6971cfbe98e37d6367e772 --- /dev/null +++ b/loopy/transform/callable.py @@ -0,0 +1,741 @@ +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import islpy as isl + +from pytools import UniqueNameGenerator + +from loopy.kernel import LoopKernel +from loopy.diagnostic import LoopyError +from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase, + Assignment, CInstruction, _DataObliviousInstruction) +from loopy.symbolic import ( + RuleAwareSubstitutionMapper, + SubstitutionRuleMappingContext, CombineMapper, IdentityMapper) +from loopy.isl_helpers import simplify_via_aff +from loopy.kernel.function_interface import ( + CallableKernel, ScalarCallable) +from loopy.program import Program + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: register_callable + +.. autofunction:: merge +""" + + +def register_callable(translation_unit, function_identifier, callable_, + redefining_not_ok=True): + """ + :param translation_unit: A :class:`loopy.Program`. + :param callable_: A :class:`loopy.InKernelCallable`. + """ + + if isinstance(callable_, LoopKernel): + callable_ = CallableKernel(callable_) + + from loopy.kernel.function_interface import InKernelCallable + assert isinstance(callable_, InKernelCallable) + + if (function_identifier in translation_unit.callables_table) and ( + translation_unit.callables_table[function_identifier] != callable_ + and redefining_not_ok): + raise LoopyError("Redefining function identifier not allowed. Set the" + " option 'redefining_not_ok=False' to bypass this error.") + + new_callables = translation_unit.callables_table.set(function_identifier, + callable_) + + return translation_unit.copy( + callables_table=new_callables) + + +def merge(translation_units): + """ + :param translation_units: A list of :class:`loopy.Program`. + + :returns: An instance of :class:`loopy.Program` which contains all the + callables from each of the *translation_units. 
+ """ + + for i in range(1, len(translation_units)): + if translation_units[i].target != translation_units[i-1].target: + raise LoopyError("translation units to be merged should have the" + " same target.") + + # {{{ check for callable collision + + for i, prg_i in enumerate(translation_units): + for prg_j in translation_units[i+1:]: + for clbl_name in (set(prg_i.callables_table) + & set(prg_j.callables_table)): + if (prg_i.callables_table[clbl_name] + != prg_j.callables_table[clbl_name]): + # FIXME: generate unique names + rename for the colliding + # callables + raise NotImplementedError("Translation units to be merged" + " must have different callable names" + " for now.") + + # }}} + + callables_table = {} + for trans_unit in translation_units: + callables_table.update(trans_unit.callables_table.copy()) + + return Program( + entrypoints=frozenset().union(*( + t.entrypoints or frozenset() for t in translation_units)), + callables_table=callables_table, + target=translation_units[0].target) + + +# {{{ kernel inliner mapper + +class KernelInliner(RuleAwareSubstitutionMapper): + def __init__(self, rule_mapping_context, subst_func, caller_knl, + callee_knl, callee_arg_to_call_param): + super().__init__(rule_mapping_context, subst_func, lambda *args: True) + self.caller_knl = caller_knl + self.callee_knl = callee_knl + self.callee_arg_to_call_param = callee_arg_to_call_param + + def map_subscript(self, expr, expn_state): + if expr.aggregate.name in self.callee_knl.arg_dict: + from loopy.symbolic import get_start_subscript_from_sar + from loopy.isl_helpers import simplify_via_aff + from pymbolic.primitives import Subscript, Variable + + sar = self.callee_arg_to_call_param[expr.aggregate.name] # SubArrayRef + + callee_arg = self.callee_knl.arg_dict[expr.aggregate.name] + if sar.subscript.aggregate.name in self.caller_knl.arg_dict: + caller_arg = self.caller_knl.arg_dict[sar.subscript.aggregate.name] + else: + caller_arg = self.caller_knl.temporary_variables[ + sar.subscript.aggregate.name] + + # map inner inames to outer inames. 
+            outer_indices = self.map_tuple(expr.index_tuple, expn_state)
+
+            flatten_index = 0
+            for i, idx in enumerate(get_start_subscript_from_sar(sar,
+                    self.caller_knl).index_tuple):
+                flatten_index += idx*caller_arg.dim_tags[i].stride
+
+            flatten_index += sum(
+                    idx * tag.stride
+                    for idx, tag in zip(outer_indices, callee_arg.dim_tags))
+
+            flatten_index = simplify_via_aff(flatten_index)
+
+            new_indices = []
+            for dim_tag in caller_arg.dim_tags:
+                ind = flatten_index // dim_tag.stride
+                flatten_index -= (dim_tag.stride * ind)
+                new_indices.append(ind)
+
+            new_indices = tuple(simplify_via_aff(i) for i in new_indices)
+
+            return Subscript(Variable(sar.subscript.aggregate.name), new_indices)
+        else:
+            assert expr.aggregate.name in self.callee_knl.temporary_variables
+            return super().map_subscript(expr, expn_state)
+
+    def map_variable(self, expr, expn_state):
+        from loopy.kernel.data import ArrayArg, ValueArg
+        from loopy.symbolic import SubArrayRef
+        if expr.name in self.callee_knl.arg_dict:
+            arg = self.callee_knl.arg_dict[expr.name]
+            par = self.callee_arg_to_call_param[expr.name]
+            if isinstance(arg, ArrayArg):
+                assert arg.shape == ()
+                assert isinstance(par, SubArrayRef) and par.swept_inames == ()
+                return par.subscript.aggregate
+            else:
+                assert isinstance(arg, ValueArg)
+                return par
+        else:
+            return super().map_variable(expr, expn_state)
+
+# }}}
+
+
+# {{{ inlining of a single call instruction
+
+def substitute_into_domain(domain, param_name, expr, allowed_param_dims):
+    """
+    :arg allowed_param_dims: A :class:`list` of :class:`str` naming the
+        variables that may appear as parameter dimensions of the augmented
+        domain.
+    """
+    import pymbolic.primitives as prim
+    from loopy.symbolic import get_dependencies, isl_set_from_expr
+    if param_name not in domain.get_var_dict():
+        # param_name not in domain => domain will be unchanged
+        return domain
+
+    # {{{ rename 'param_name' to avoid namespace pollution with allowed_param_dims
+
+    dt, pos = domain.get_var_dict()[param_name]
+    domain = domain.set_dim_name(dt, pos, UniqueNameGenerator(
+        set(allowed_param_dims))(param_name))
+
+    # }}}
+
+    for dep in get_dependencies(expr):
+        if dep in allowed_param_dims:
+            domain = domain.add_dims(isl.dim_type.param, 1)
+            domain = domain.set_dim_name(
+                    isl.dim_type.param,
+                    domain.dim(isl.dim_type.param)-1,
+                    dep)
+        else:
+            raise ValueError("Augmenting caller's domain "
+                    f"with '{dep}' is not allowed.")
+
+    set_ = isl_set_from_expr(domain.space,
+            prim.Comparison(prim.Variable(param_name),
+                "==",
+                expr))
+
+    bset, = set_.get_basic_sets()
+    domain = domain & bset
+
+    return domain.project_out(dt, pos, 1)
+
+
+def rename_iname(domain, old_iname, new_iname):
+    if old_iname not in domain.get_var_dict():
+        return domain
+
+    dt, pos = domain.get_var_dict()[old_iname]
+    return domain.set_dim_name(dt, pos, new_iname)
+
+
+def get_valid_domain_param_names(knl):
+    from loopy.kernel.data import ValueArg
+    return ([arg.name for arg in knl.args if isinstance(arg, ValueArg)]
+            + [tv.name
+                for tv in knl.temporary_variables.values()
+                if tv.shape == ()]
+            + list(knl.all_inames())
+            )
+
+
+def _inline_call_instruction(caller_knl, callee_knl, call_insn):
+    """
+    Returns a copy of *caller_knl* with *call_insn* replaced by the inlined
+    body of *callee_knl*.
+ """ + import pymbolic.primitives as prim + from pymbolic.mapper.substitutor import make_subst_func + from loopy.kernel.data import ValueArg + + # {{{ sanity checks + + assert call_insn.expression.function.name == callee_knl.name + + # }}} + + callee_label = callee_knl.name[:4] + "_" + vng = caller_knl.get_var_name_generator() + ing = caller_knl.get_instruction_id_generator() + + # {{{ construct callee->caller name mappings + + # name_map: Mapping[str, str] + # A mapping from variable names in the callee kernel's namespace to + # the ones they would be referred by in the caller's namespace post inlining. + name_map = {} + + # only consider temporary variables and inames, arguments would be mapping + # according to the invocation in call_insn. + for name in (callee_knl.all_inames() + | set(callee_knl.temporary_variables.keys())): + new_name = vng(callee_label+name) + name_map[name] = new_name + + # }}} + + # {{{ iname_to_tags + + # new_iname_to_tags: caller's iname_to_tags post inlining + new_iname_to_tags = caller_knl.iname_to_tags + + for old_name, tags in callee_knl.iname_to_tags.items(): + new_iname_to_tags[name_map[old_name]] = tags + + # }}} + + # {{{ register callee's temps as caller's + + # new_temps: caller's temps post inlining + new_temps = caller_knl.temporary_variables.copy() + + for name, tv in callee_knl.temporary_variables.items(): + new_temps[name_map[name]] = tv.copy(name=name_map[name]) + + # }}} + + # {{{ get callee args -> parameters passed to the call + + arg_map = {} # callee arg name -> caller symbols (e.g. SubArrayRef) + + assignees = call_insn.assignees # writes + parameters = call_insn.expression.parameters # reads + + # add keyword parameters + from pymbolic.primitives import CallWithKwargs + + from loopy.kernel.function_interface import get_kw_pos_association + kw_to_pos, pos_to_kw = get_kw_pos_association(callee_knl) + if isinstance(call_insn.expression, CallWithKwargs): + kw_parameters = call_insn.expression.kw_parameters + else: + kw_parameters = {} + + for kw, par in kw_parameters.items(): + arg_map[kw] = par + + for i, par in enumerate(parameters): + arg_map[pos_to_kw[i]] = par + + for i, assignee in enumerate(assignees): + arg_map[pos_to_kw[-i-1]] = assignee + + # }}} + + # {{{ domains/assumptions + + new_domains = callee_knl.domains.copy() + for old_iname in callee_knl.all_inames(): + new_domains = [rename_iname(dom, old_iname, name_map[old_iname]) + for dom in new_domains] + + new_assumptions = callee_knl.assumptions + + for callee_arg_name, param_expr in arg_map.items(): + if isinstance(callee_knl.arg_dict[callee_arg_name], + ValueArg): + new_domains = [ + substitute_into_domain( + dom, + callee_arg_name, + param_expr, get_valid_domain_param_names(caller_knl)) + for dom in new_domains] + + new_assumptions = substitute_into_domain( + new_assumptions, + callee_arg_name, + param_expr, get_valid_domain_param_names(caller_knl)) + + # }}} + + # {{{ map callee's expressions to get expressions after inlining + + rule_mapping_context = SubstitutionRuleMappingContext( + callee_knl.substitutions, vng) + smap = KernelInliner(rule_mapping_context, + make_subst_func({old_name: prim.Variable(new_name) + for old_name, new_name in name_map.items()}), + caller_knl, callee_knl, arg_map) + + callee_knl = rule_mapping_context.finish_kernel(smap.map_kernel( + callee_knl)) + + # }}} + + # {{{ generate new ids for instructions + + insn_id_map = {} + for insn in callee_knl.instructions: + insn_id_map[insn.id] = ing(callee_label+insn.id) + + # }}} + + # {{{ use NoOp to mark the 
start and end of callee kernel + + from loopy.kernel.instruction import NoOpInstruction + + noop_start = NoOpInstruction( + id=ing(callee_label+"_start"), + within_inames=call_insn.within_inames, + depends_on=call_insn.depends_on + ) + noop_end = NoOpInstruction( + id=call_insn.id, + within_inames=call_insn.within_inames, + depends_on=frozenset(insn_id_map.values()) + ) + + # }}} + + # {{{ map callee's instruction ids + + inlined_insns = [noop_start] + + for insn in callee_knl.instructions: + new_within_inames = (frozenset(name_map[iname] + for iname in insn.within_inames) + | call_insn.within_inames) + new_depends_on = (frozenset(insn_id_map[dep] for dep in insn.depends_on) + | {noop_start.id}) + new_no_sync_with = frozenset((insn_id_map[id], scope) + for id, scope in insn.no_sync_with) + new_id = insn_id_map[insn.id] + + if isinstance(insn, Assignment): + new_atomicity = tuple( + type(atomicity)(name_map[atomicity.var_name]) + for atomicity in insn.atomicity) + insn = insn.copy( + id=insn_id_map[insn.id], + within_inames=new_within_inames, + depends_on=new_depends_on, + tags=insn.tags | call_insn.tags, + atomicity=new_atomicity, + no_sync_with=new_no_sync_with + ) + else: + insn = insn.copy( + id=new_id, + within_inames=new_within_inames, + depends_on=new_depends_on, + tags=insn.tags | call_insn.tags, + no_sync_with=new_no_sync_with + ) + inlined_insns.append(insn) + + inlined_insns.append(noop_end) + + # }}} + + # {{{ swap out call_insn with inlined_instructions + + idx = caller_knl.instructions.index(call_insn) + new_insns = (caller_knl.instructions[:idx] + + inlined_insns + + caller_knl.instructions[idx+1:]) + + # }}} + + old_assumptions, new_assumptions = isl.align_two( + caller_knl.assumptions, new_assumptions) + + return caller_knl.copy(instructions=new_insns, + temporary_variables=new_temps, + domains=caller_knl.domains+new_domains, + assumptions=old_assumptions.params() & new_assumptions.params(), + iname_to_tags=new_iname_to_tags) + +# }}} + + +# {{{ inline callable kernel + +def _inline_single_callable_kernel(caller_kernel, callee_kernel, + callables_table): + for insn in caller_kernel.instructions: + if isinstance(insn, CallInstruction): + # FIXME This seems to use identifiers across namespaces. Why not + # check whether the function is a scoped function first? ~AK + if insn.expression.function.name == callee_kernel.name: + caller_kernel = _inline_call_instruction( + caller_kernel, callee_kernel, insn) + elif isinstance(insn, (MultiAssignmentBase, CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError( + "Unknown instruction type %s" + % type(insn).__name__) + + return caller_kernel + + +# FIXME This should take a 'within' parameter to be able to only inline +# *some* calls to a kernel, but not others. +def inline_callable_kernel(program, function_name): + """ + Returns a copy of *kernel* with the callable kernel addressed by + (scoped) name *function_name* inlined. 
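+
+    A rough usage sketch (assuming *program* contains a callable kernel
+    registered under the name ``"callee"``)::
+
+        import loopy as lp
+
+        program = lp.inline_callable_kernel(program, "callee")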
+ """ + from loopy.preprocess import infer_arg_descr + from loopy.program import resolve_callables + program = resolve_callables(program) + program = infer_arg_descr(program) + callables_table = program.callables_table + new_callables = {} + callee = program[function_name] + + for func_id, in_knl_callable in callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + caller = in_knl_callable.subkernel + in_knl_callable = in_knl_callable.copy( + subkernel=_inline_single_callable_kernel(caller, + callee, program.callables_table)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError() + + new_callables[func_id] = in_knl_callable + + return program.copy(callables_table=new_callables) + +# }}} + + +# {{{ tools to match caller to callee args by (guessed) automatic reshaping + +# (This is undocumented and not recommended, but it is currently needed +# to support Firedrake.) + +class DimChanger(IdentityMapper): + """ + Mapper to change the dimensions of an argument. + + .. attribute:: callee_arg_dict + + A mapping from the argument name (:class:`str`) to instances of + :class:`loopy.kernel.array.ArrayBase`. + + .. attribute:: desried_shape + + A mapping from argument name (:class:`str`) to an instance of + :class:`tuple`. + """ + def __init__(self, callee_arg_dict, desired_shape): + self.callee_arg_dict = callee_arg_dict + self.desired_shape = desired_shape + + def map_subscript(self, expr): + if expr.aggregate.name not in self.callee_arg_dict: + return super().map_subscript(expr) + callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags + flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in + zip(callee_arg_dim_tags, expr.index_tuple)) + new_indices = [] + + from operator import mul + from functools import reduce + stride = reduce(mul, self.desired_shape[expr.aggregate.name], 1) + + for length in self.desired_shape[expr.aggregate.name]: + stride /= length + ind = flattened_index // int(stride) + flattened_index -= (int(stride) * ind) + new_indices.append(simplify_via_aff(ind)) + + return expr.aggregate.index(tuple(new_indices)) + + +def _match_caller_callee_argument_dimension_for_single_kernel( + caller_knl, callee_knl): + """ + :returns: a copy of *caller_knl* with the instance of + :class:`loopy.kernel.function_interface.CallableKernel` addressed by + *callee_function_name* in the *caller_knl* aligned with the argument + dimensions required by *caller_knl*. + """ + for insn in caller_knl.instructions: + if not isinstance(insn, CallInstruction) or ( + insn.expression.function.name != + callee_knl.name): + # Call to a callable kernel can only occur through a + # CallInstruction. 
+ continue + + def _shape_1_if_empty(shape): + assert isinstance(shape, tuple) + if shape == (): + return (1, ) + else: + return shape + + from loopy.kernel.function_interface import ( + ArrayArgDescriptor, get_arg_descriptor_for_expression, + get_kw_pos_association) + _, pos_to_kw = get_kw_pos_association(callee_knl) + arg_id_to_shape = {} + for arg_id, arg in insn.arg_id_to_val().items(): + arg_id = pos_to_kw[arg_id] + + arg_descr = get_arg_descriptor_for_expression(caller_knl, arg) + if isinstance(arg_descr, ArrayArgDescriptor): + arg_id_to_shape[arg_id] = _shape_1_if_empty(arg_descr.shape) + else: + arg_id_to_shape[arg_id] = (1, ) + + dim_changer = DimChanger( + callee_knl.arg_dict, + arg_id_to_shape) + + new_callee_insns = [] + for callee_insn in callee_knl.instructions: + if isinstance(callee_insn, MultiAssignmentBase): + new_callee_insns.append(callee_insn.copy(expression=dim_changer( + callee_insn.expression), + assignee=dim_changer(callee_insn.assignee))) + + elif isinstance(callee_insn, (CInstruction, + _DataObliviousInstruction)): + pass + else: + raise NotImplementedError("Unknown instruction %s." % + type(insn)) + + # subkernel with instructions adjusted according to the new dimensions + new_callee_knl = callee_knl.copy(instructions=new_callee_insns) + + return new_callee_knl + + +class _FunctionCalledChecker(CombineMapper): + def __init__(self, func_name): + self.func_name = func_name + + def combine(self, values): + return any(values) + + def map_call(self, expr): + if expr.function.name == self.func_name: + return True + return self.combine( + tuple( + self.rec(child) for child in expr.parameters) + ) + + map_call_with_kwargs = map_call + + def map_constant(self, expr): + return False + + def map_algebraic_leaf(self, expr): + return False + + def map_kernel(self, kernel): + return any(self.rec(insn.expression) for insn in kernel.instructions if + isinstance(insn, MultiAssignmentBase)) + + +def _match_caller_callee_argument_dimension_(program, callee_function_name): + """ + Returns a copy of *program* with the instance of + :class:`loopy.kernel.function_interface.CallableKernel` addressed by + *callee_function_name* in the *program* aligned with the argument + dimensions required by *caller_knl*. + + .. note:: + + The callee kernel addressed by *callee_function_name*, should be + called at only one location throughout the program, as multiple + invocations would demand complex renaming logic which is not + implemented yet. 
+ """ + + # {{{ sanity checks + + assert isinstance(program, Program) + assert isinstance(callee_function_name, str) + assert callee_function_name not in program.entrypoints + assert callee_function_name in program.callables_table + + # }}} + + is_invoking_callee = _FunctionCalledChecker( + callee_function_name).map_kernel + + caller_knl, = [in_knl_callable.subkernel for in_knl_callable in + program.callables_table.values() if isinstance(in_knl_callable, + CallableKernel) and + is_invoking_callee(in_knl_callable.subkernel)] + + from pymbolic.primitives import Call + assert len([insn for insn in caller_knl.instructions if (isinstance(insn, + CallInstruction) and isinstance(insn.expression, Call) and + insn.expression.function.name == callee_function_name)]) == 1 + new_callee_kernel = _match_caller_callee_argument_dimension_for_single_kernel( + caller_knl, program[callee_function_name]) + return program.with_kernel(new_callee_kernel) + +# }}} + + +def rename_callable(program, old_name, new_name=None, existing_ok=False): + """ + :arg program: An instance of :class:`loopy.Program` + :arg old_name: The callable to be renamed + :arg new_name: New name for the callable to be renamed + :arg existing_ok: An instance of :class:`bool` + """ + from loopy.symbolic import ( + RuleAwareSubstitutionMapper, + SubstitutionRuleMappingContext) + from pymbolic import var + + assert isinstance(program, Program) + assert isinstance(old_name, str) + + if (new_name in program.callables_table) and not existing_ok: + raise LoopyError(f"callables named '{new_name}' already exists") + + if new_name is None: + namegen = UniqueNameGenerator(program.callables_table.keys()) + new_name = namegen(old_name) + + assert isinstance(new_name, str) + + new_callables_table = {} + + for name, clbl in program.callables_table.items(): + if name == old_name: + name = new_name + + if isinstance(clbl, CallableKernel): + knl = clbl.subkernel + rule_mapping_context = SubstitutionRuleMappingContext( + knl.substitutions, knl.get_var_name_generator()) + smap = RuleAwareSubstitutionMapper(rule_mapping_context, + {var(old_name): var(new_name)}.get, + within=lambda *args: True) + knl = rule_mapping_context.finish_kernel(smap.map_kernel(knl)) + clbl = clbl.copy(subkernel=knl.copy(name=name)) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError(f"{type(clbl)}") + + new_callables_table[name] = clbl + + new_entrypoints = program.entrypoints.copy() + if old_name in new_entrypoints: + new_entrypoints = ((new_entrypoints | frozenset([new_name])) + - frozenset([old_name])) + + return program.copy(callables_table=new_callables_table, + entrypoints=new_entrypoints) + + +# vim: foldmethod=marker diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 4851ffdece47dc092011991c5b7218d96ea953c0..185af24c47eaa569abf8b2ac617b7fb07fa47939 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -26,6 +26,9 @@ from islpy import dim_type from loopy.kernel.data import ImageArg from pytools import MovedFunctionDeprecationWrapper +from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable # {{{ convenience: add_prefetch @@ -136,7 +139,8 @@ class _not_provided: # noqa: N801 pass -def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, +def add_prefetch_for_single_kernel(kernel, callables_table, var_name, + sweep_inames=[], dim_arg_names=None, # "None" is a valid value 
        # here, distinct from the default.
        default_tag=_not_provided,

@@ -235,6 +239,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     This function internally uses :func:`extract_subst` and :func:`precompute`.
     """
+    assert isinstance(kernel, LoopKernel)
 
     # {{{ fish indexing out of var_name and into footprint_subscripts
 
@@ -327,9 +332,9 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     # precompute module, but precompute actually uses that to adjust its
     # warning message.
 
-    from loopy.transform.precompute import precompute
-    new_kernel = precompute(kernel, subst_use, sweep_inames,
-            precompute_inames=dim_arg_names,
+    from loopy.transform.precompute import precompute_for_single_kernel
+    new_kernel = precompute_for_single_kernel(kernel, callables_table,
+            subst_use, sweep_inames, precompute_inames=dim_arg_names,
             default_tag=default_tag, dtype=var_descr.dtype,
             fetch_bounding_box=fetch_bounding_box,
             temporary_name=temporary_name,
@@ -362,6 +367,29 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     else:
         return new_kernel
 
+
+def add_prefetch(program, *args, **kwargs):
+    assert isinstance(program, Program)
+
+    new_callables = {}
+    for func_id, in_knl_callable in program.callables_table.items():
+        if isinstance(in_knl_callable, CallableKernel):
+            new_subkernel = add_prefetch_for_single_kernel(
+                    in_knl_callable.subkernel, program.callables_table,
+                    *args, **kwargs)
+            in_knl_callable = in_knl_callable.copy(
+                    subkernel=new_subkernel)
+
+        elif isinstance(in_knl_callable, ScalarCallable):
+            pass
+        else:
+            raise NotImplementedError("Unknown type of callable %s." % (
+                type(in_knl_callable).__name__))
+
+        new_callables[func_id] = in_knl_callable
+
+    return program.copy(callables_table=new_callables)
+
 # }}}
 
 
@@ -384,6 +412,7 @@ def change_arg_to_image(kernel, name):
 
 # {{{ tag array axes
 
+@iterate_over_kernels_if_given_program
 def tag_array_axes(kernel, ary_names, dim_tags):
     """
     :arg dim_tags: a tuple of
@@ -422,13 +451,15 @@ def tag_array_axes(kernel, ary_names, dim_tags):
     return kernel
 
 
-tag_data_axes = MovedFunctionDeprecationWrapper(tag_array_axes)
+tag_data_axes = (
+        MovedFunctionDeprecationWrapper(tag_array_axes))
 
 # }}}
 
 
 # {{{ set_array_axis_names
 
+@iterate_over_kernels_if_given_program
 def set_array_axis_names(kernel, ary_names, dim_names):
     """
     .. versionchanged:: 2016.2
@@ -453,13 +484,15 @@ def set_array_axis_names(kernel, ary_names, dim_names):
     return kernel
 
 
-set_array_dim_names = MovedFunctionDeprecationWrapper(set_array_axis_names)
+set_array_dim_names = (MovedFunctionDeprecationWrapper(
+    set_array_axis_names))
 
 # }}}
 
 
 # {{{ remove_unused_arguments
 
+@iterate_over_kernels_if_given_program
 def remove_unused_arguments(kernel):
     new_args = []
 
@@ -501,6 +534,7 @@ def remove_unused_arguments(kernel):
 
 # {{{ alias_temporaries
 
+@iterate_over_kernels_if_given_program
 def alias_temporaries(kernel, names, base_name_prefix=None,
        synchronize_for_exclusive_use=True):
    """Sets all temporaries given by *names* to be backed by a single piece of
@@ -585,11 +619,14 @@
 
 # {{{ set argument order
 
+@iterate_over_kernels_if_given_program
 def set_argument_order(kernel, arg_names):
    """
    :arg arg_names: A list (or comma-separated string) of argument names. All
        arguments must be in this list.
    """
+    #FIXME: @inducer -- should this only affect the root kernel, or should it
+    # take a within?
if isinstance(arg_names, str): arg_names = arg_names.split(",") @@ -618,6 +655,7 @@ def set_argument_order(kernel, arg_names): # {{{ rename argument +@iterate_over_kernels_if_given_program def rename_argument(kernel, old_name, new_name, existing_ok=False): """ .. versionadded:: 2016.2 @@ -691,6 +729,7 @@ def rename_argument(kernel, old_name, new_name, existing_ok=False): # {{{ set temporary scope +@iterate_over_kernels_if_given_program def set_temporary_scope(kernel, temp_var_names, scope): """ :arg temp_var_names: a container with membership checking, @@ -732,6 +771,7 @@ def set_temporary_scope(kernel, temp_var_names, scope): # {{{ reduction_arg_to_subst_rule +@iterate_over_kernels_if_given_program def reduction_arg_to_subst_rule( kernel, inames, insn_match=None, subst_rule_name=None): if isinstance(inames, str): diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py index 59428cde258436c3e30f4f82b23d9c6b423605b8..124568f4512340a812d6fd366318cceb0fea2591 100644 --- a/loopy/transform/diff.py +++ b/loopy/transform/diff.py @@ -31,6 +31,7 @@ import loopy as lp from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext from loopy.isl_helpers import make_slab from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel # {{{ diff mapper @@ -348,6 +349,8 @@ class DifferentiationContext: arg.dtype, shape=shape, dim_tags=dim_tags, + is_input=arg.is_input, + is_output=arg.is_output )) elif var_name in self.kernel.temporary_variables: @@ -377,6 +380,8 @@ def diff_kernel(kernel, diff_outputs, by, diff_iname_prefix="diff_i", *diff_context.by_name*, or *None* if no dependency exists. """ + assert isinstance(kernel, LoopKernel) + from loopy.kernel.creation import apply_single_writer_depencency_heuristic kernel = apply_single_writer_depencency_heuristic(kernel, warn_if_used=True) diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py index f6d0e1a0932916b9fdf59e54ef10b45f180fb962..0880c22ae7d4ba5b2f579e4579de768c16046b9c 100644 --- a/loopy/transform/fusion.py +++ b/loopy/transform/fusion.py @@ -27,6 +27,10 @@ from islpy import dim_type from loopy.diagnostic import LoopyError from pymbolic import var +from loopy.kernel import LoopKernel +from loopy.program import Program +from loopy.kernel.function_interface import CallableKernel + def _apply_renames_in_exprs(kernel, var_renames): from loopy.symbolic import ( @@ -252,9 +256,6 @@ def _fuse_two_kernels(kernela, kernelb): "substitution", kernela.substitutions, kernelb.substitutions), - function_manglers=_ordered_merge_lists( - kernela.function_manglers, - kernelb.function_manglers), symbol_manglers=_ordered_merge_lists( kernela.symbol_manglers, kernelb.symbol_manglers), @@ -327,6 +328,25 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): *data_flow* was added in version 2016.2 """ + # FIXME: This should take in inputs as (prog1, knlname1) and (prog2, + # knlname2). if prog1 == prog2 then the callable names belong to the same + # namespace, otherwise the kernel names should be uniquified. + # We should also somehow be able to know that callables like "sin"/"cos" + # belong to the global namespace and need not be uniquified. 
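+    # Until that lands: if Programs were passed in, unwrap each one into its
+    # single CallableKernel and fuse the underlying LoopKernels.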
+ if all(isinstance(kernel, Program) for kernel in kernels): + new_kernels = [] + for knl in kernels: + kernel_names = [i for i, clbl in + knl.callables_table.items() if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise NotImplementedError("Kernel containing more than one" + " callable kernel, not allowed for now.") + new_kernels.append(knl[kernel_names[0]]) + + kernels = new_kernels[:] + + assert all(isinstance(knl, LoopKernel) for knl in kernels) kernels = list(kernels) if data_flow is None: @@ -405,6 +425,7 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None): # }}} - return result + from loopy.program import make_program + return make_program(result).with_entrypoints(result.name) # vim: foldmethod=marker diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index 32c56a5a3af5776d12a3a7a8b603e10c18389d59..324cec8e9a61df3e9ef51e2539ba7bdd3c9f6a67 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -29,6 +29,10 @@ from loopy.symbolic import ( SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError +from loopy.program import Program, iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel + __doc__ = """ .. currentmodule:: loopy @@ -74,6 +78,7 @@ __doc__ = """ # {{{ set loop priority +@iterate_over_kernels_if_given_program def set_loop_priority(kernel, loop_priority): from warnings import warn warn("set_loop_priority is deprecated. Use prioritize_loops instead. " @@ -88,6 +93,7 @@ def set_loop_priority(kernel, loop_priority): return kernel.copy(loop_priority=frozenset([loop_priority])) +@iterate_over_kernels_if_given_program def prioritize_loops(kernel, loop_priority): """Indicates the textual order in which loops should be entered in the kernel code. Note that this priority has an advisory role only. If the @@ -102,6 +108,8 @@ def prioritize_loops(kernel, loop_priority): :arg: an iterable of inames, or, for brevity, a comma-separated string of inames """ + + assert isinstance(kernel, LoopKernel) if isinstance(loop_priority, str): loop_priority = tuple(s.strip() for s in loop_priority.split(",") if s.strip()) @@ -330,6 +338,7 @@ def _split_iname_backend(kernel, iname_to_split, # {{{ split iname +@iterate_over_kernels_if_given_program def split_iname(kernel, split_iname, inner_length, *, outer_iname=None, inner_iname=None, @@ -356,6 +365,8 @@ def split_iname(kernel, split_iname, inner_length, :arg within: a stack match as understood by :func:`loopy.match.parse_match`. """ + assert isinstance(kernel, LoopKernel) + def make_new_loop_index(inner, outer): return inner + outer*inner_length @@ -372,6 +383,7 @@ def split_iname(kernel, split_iname, inner_length, # {{{ chunk iname +@iterate_over_kernels_if_given_program def chunk_iname(kernel, split_iname, num_chunks, outer_iname=None, inner_iname=None, outer_tag=None, inner_tag=None, @@ -506,6 +518,7 @@ class _InameJoiner(RuleAwareSubstitutionMapper): return super().map_reduction(expr, expn_state) +@iterate_over_kernels_if_given_program def join_inames(kernel, inames, new_iname=None, tag=None, within=None): """In a sense, the inverse of :func:`split_iname`. 
Takes in inames, finds their bounds (all but the first have to be bounded), and combines @@ -606,8 +619,8 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): new_insns = [ insn.copy( - within_inames=subst_within_inames(insn.within_inames)) - for insn in kernel.instructions] + within_inames=subst_within_inames(insn.within_inames)) if + within(kernel, insn) else insn for insn in kernel.instructions] kernel = (kernel .copy( @@ -632,7 +645,7 @@ def join_inames(kernel, inames, new_iname=None, tag=None, within=None): if tag is not None: kernel = tag_inames(kernel, {new_iname: tag}) - return kernel + return remove_unused_inames(kernel, inames) # }}} @@ -662,7 +675,9 @@ def untag_inames(kernel, iname_to_untag, tag_type): # {{{ tag inames -def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False): +@iterate_over_kernels_if_given_program +def tag_inames(kernel, iname_to_tag, force=False, + ignore_nonexistent=False): """Tag an iname :arg iname_to_tag: a list of tuples ``(iname, new_tag)``. *new_tag* is given @@ -832,6 +847,7 @@ class _InameDuplicator(RuleAwareIdentityMapper): return insn.copy(within_inames=new_fid) +@iterate_over_kernels_if_given_program def duplicate_inames(kernel, inames, within, new_inames=None, suffix=None, tags={}): """ @@ -992,7 +1008,7 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset( # If partitioning was empty, we have recursed successfully and yield nothing -def get_iname_duplication_options(kernel, use_boostable_into=None): +def get_iname_duplication_options(kernel, use_boostable_into=False): """List options for duplication of inames, if necessary for schedulability :returns: a generator listing all options to duplicate inames, if duplication @@ -1022,6 +1038,13 @@ def get_iname_duplication_options(kernel, use_boostable_into=None): Use :func:`has_schedulable_iname_nesting` to decide whether an iname needs to be duplicated in a given kernel. """ + if isinstance(kernel, Program): + if len([clbl for clbl in kernel.callables_table.values() if + isinstance(clbl, CallableKernel)]) == 1: + kernel = kernel[list(kernel.entrypoints)[0]] + + assert isinstance(kernel, LoopKernel) + if use_boostable_into: raise LoopyError("'use_boostable_into=True' is no longer supported.") @@ -1069,6 +1092,10 @@ def has_schedulable_iname_nesting(kernel): :returns: a :class:`bool` indicating whether this kernel needs an iname duplication in order to be schedulable. 
""" + if isinstance(kernel, Program): + if len([clbl for clbl in kernel.callables_table.values() if + isinstance(clbl, CallableKernel)]) == 1: + kernel = kernel[list(kernel.entrypoints)[0]] return not bool(next(get_iname_duplication_options(kernel), False)) # }}} @@ -1076,6 +1103,7 @@ def has_schedulable_iname_nesting(kernel): # {{{ rename_inames +@iterate_over_kernels_if_given_program def rename_iname(kernel, old_iname, new_iname, existing_ok=False, within=None): """ :arg within: a stack match as understood by @@ -1325,6 +1353,7 @@ def _split_reduction(kernel, inames, direction, within=None): rsplit.map_kernel(kernel)) +@iterate_over_kernels_if_given_program def split_reduction_inward(kernel, inames, within=None): """Takes a reduction of the form:: @@ -1344,6 +1373,7 @@ def split_reduction_inward(kernel, inames, within=None): return _split_reduction(kernel, inames, "in", within) +@iterate_over_kernels_if_given_program def split_reduction_outward(kernel, inames, within=None): """Takes a reduction of the form:: @@ -1367,6 +1397,7 @@ def split_reduction_outward(kernel, inames, within=None): # {{{ affine map inames +@iterate_over_kernels_if_given_program def affine_map_inames(kernel, old_inames, new_inames, equations): """Return a new *kernel* where the affine transform specified by *equations* has been applied to the inames. @@ -1698,6 +1729,7 @@ class _ReductionInameUniquifier(RuleAwareIdentityMapper): expr, expn_state) +@iterate_over_kernels_if_given_program def make_reduction_inames_unique(kernel, inames=None, within=None): """ :arg inames: if not *None*, only apply to these inames @@ -1744,6 +1776,7 @@ def make_reduction_inames_unique(kernel, inames=None, within=None): # {{{ add_inames_to_insn +@iterate_over_kernels_if_given_program def add_inames_to_insn(kernel, inames, insn_match): """ :arg inames: a frozenset of inames that will be added to the @@ -1782,6 +1815,7 @@ def add_inames_to_insn(kernel, inames, insn_match): # }}} +@iterate_over_kernels_if_given_program def add_inames_for_unused_hw_axes(kernel, within=None): """ Returns a kernel with inames added to each instruction diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 045850651f94ebed65afc24b0008a712b047dd20..a48e8eda7472a73116a9cfcb2c567b23191ead93 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -21,15 +21,38 @@ THE SOFTWARE. """ from loopy.diagnostic import LoopyError +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import (ScalarCallable, CallableKernel) +from loopy.program import Program, iterate_over_kernels_if_given_program # {{{ find_instructions -def find_instructions(kernel, insn_match): +def find_instructions_in_single_kernel(kernel, insn_match): + assert isinstance(kernel, LoopKernel) from loopy.match import parse_match match = parse_match(insn_match) return [insn for insn in kernel.instructions if match(kernel, insn)] + +def find_instructions(program, insn_match): + if isinstance(program, LoopKernel): + return find_instructions_in_single_kernel(program, insn_match) + + assert isinstance(program, Program) + insns = [] + for in_knl_callable in program.callables_table.values(): + if isinstance(in_knl_callable, CallableKernel): + insns += (find_instructions_in_single_kernel( + in_knl_callable.subkernel, insn_match)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable type %s." 
% ( + type(in_knl_callable))) + + return insns + # }}} @@ -54,6 +77,7 @@ def map_instructions(kernel, insn_match, f): # {{{ set_instruction_priority +@iterate_over_kernels_if_given_program def set_instruction_priority(kernel, insn_match, priority): """Set the priority of instructions matching *insn_match* to *priority*. @@ -71,6 +95,7 @@ def set_instruction_priority(kernel, insn_match, priority): # {{{ add_dependency +@iterate_over_kernels_if_given_program def add_dependency(kernel, insn_match, depends_on): """Add the instruction dependency *dependency* to the instructions matched by *insn_match*. @@ -88,7 +113,8 @@ def add_dependency(kernel, insn_match, depends_on): added_deps = frozenset([depends_on]) else: added_deps = frozenset( - dep.id for dep in find_instructions(kernel, depends_on)) + dep.id for dep in find_instructions_in_single_kernel(kernel, + depends_on)) if not added_deps: raise LoopyError("no instructions found matching '%s' " @@ -119,6 +145,7 @@ def add_dependency(kernel, insn_match, depends_on): # {{{ remove_instructions +@iterate_over_kernels_if_given_program def remove_instructions(kernel, insn_ids): """Return a new kernel with instructions in *insn_ids* removed. @@ -209,6 +236,7 @@ def replace_instruction_ids(kernel, replacements): # {{{ tag_instructions +@iterate_over_kernels_if_given_program def tag_instructions(kernel, new_tag, within=None): from loopy.match import parse_match within = parse_match(within) @@ -231,6 +259,7 @@ def tag_instructions(kernel, new_tag, within=None): # {{{ add nosync +@iterate_over_kernels_if_given_program def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, empty_ok=False): """Add a *no_sync_with* directive between *source* and *sink*. @@ -263,18 +292,21 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, This used to silently pass. This behavior can be restored using *empty_ok*. """ + assert isinstance(kernel, LoopKernel) if isinstance(source, str) and source in kernel.id_to_insn: sources = frozenset([source]) else: sources = frozenset( - source.id for source in find_instructions(kernel, source)) + source.id for source in find_instructions_in_single_kernel( + kernel, source)) if isinstance(sink, str) and sink in kernel.id_to_insn: sinks = frozenset([sink]) else: sinks = frozenset( - sink.id for sink in find_instructions(kernel, sink)) + sink.id for sink in find_instructions_in_single_kernel( + kernel, sink)) if not sources and not empty_ok: raise LoopyError("No match found for source specification '%s'." % source) @@ -327,6 +359,7 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False, # {{{ uniquify_instruction_ids +@iterate_over_kernels_if_given_program def uniquify_instruction_ids(kernel): """Converts any ids that are :class:`loopy.UniqueName` or *None* into unique strings. 
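Most of the single-kernel transformations touched in this patch are lifted to
whole translation units by the ``iterate_over_kernels_if_given_program``
decorator. Its definition is not part of this patch; the following is only a
rough sketch of the pattern, assuming a ``Program`` whose ``callables_table``
maps names to ``CallableKernel``/``ScalarCallable`` instances::

    from functools import wraps

    from loopy.kernel import LoopKernel
    from loopy.kernel.function_interface import CallableKernel


    def iterate_over_kernels_if_given_program(transform):
        # If handed a Program, apply *transform* to the subkernel of every
        # CallableKernel; a plain LoopKernel passes straight through.
        @wraps(transform)
        def wrapper(program_or_knl, *args, **kwargs):
            if isinstance(program_or_knl, LoopKernel):
                return transform(program_or_knl, *args, **kwargs)

            new_callables = {}
            for name, clbl in program_or_knl.callables_table.items():
                if isinstance(clbl, CallableKernel):
                    clbl = clbl.copy(
                            subkernel=transform(clbl.subkernel,
                                *args, **kwargs))
                new_callables[name] = clbl

            return program_or_knl.copy(callables_table=new_callables)

        return wrapper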
diff --git a/loopy/transform/make_scalar.py b/loopy/transform/make_scalar.py new file mode 100644 index 0000000000000000000000000000000000000000..b8db7f43f90a5a1203dea470c9a0ba6f8fa21cae --- /dev/null +++ b/loopy/transform/make_scalar.py @@ -0,0 +1,51 @@ +from pymbolic.primitives import Variable +from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingContext) +from loopy.kernel.data import ValueArg +from loopy.transform.iname import remove_unused_inames + + +class ScalarChanger(RuleAwareIdentityMapper): + def __init__(self, rule_mapping_context, var_name): + self.var_name = var_name + super().__init__(rule_mapping_context) + + def map_subscript(self, expr, expn_state): + if expr.aggregate.name == self.var_name: + return Variable(self.var_name) + + return super().map_subscript(expr, expn_state) + + +def make_scalar(kernel, var_name): + rule_mapping_context = SubstitutionRuleMappingContext(kernel.substitutions, + kernel.get_var_name_generator()) + + kernel = ScalarChanger(rule_mapping_context, var_name).map_kernel(kernel) + + new_args = [ValueArg(arg.name, arg.dtype, target=arg.target, + is_output=arg.is_output) if arg.name == var_name else arg for + arg in kernel.args] + new_temps = dict((tv.name, tv.copy(shape=(), dim_tags=None)) + if tv.name == var_name else (tv.name, tv) for tv in + kernel.temporary_variables.values()) + + return kernel.copy(args=new_args, temporary_variables=new_temps) + + +def remove_invariant_inames(kernel): + inames_used = set() + untagged_inames = ( + kernel.all_inames() - frozenset(kernel.iname_to_tags.keys())) + for insn in kernel.instructions: + for iname in ((insn.read_dependency_names() + | insn.write_dependency_names()) + & untagged_inames): + inames_used.add(iname) + + removable_inames = untagged_inames - inames_used + + new_insns = [insn.copy(within_inames=insn.within_inames-removable_inames) + for insn in kernel.instructions] + + return remove_unused_inames(kernel.copy(instructions=new_insns), + removable_inames) diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py new file mode 100644 index 0000000000000000000000000000000000000000..cf0730760417e9c439e05fcd9f19449cb98aba02 --- /dev/null +++ b/loopy/transform/pack_and_unpack_args.py @@ -0,0 +1,340 @@ +__copyright__ = "Copyright (C) 2018 Tianjiao Sun, Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +from loopy.diagnostic import LoopyError +from loopy.kernel.instruction import CallInstruction +from loopy.program import Program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel, ScalarCallable +from loopy.symbolic import SubArrayRef + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: pack_and_unpack_args_for_call +""" + + +def pack_and_unpack_args_for_call_for_single_kernel(kernel, + callables_table, call_name, args_to_pack=None, + args_to_unpack=None): + """ + Returns a a copy of *kernel* with instructions appended to copy the + arguments in *args* to match the alignment expected by the *call_name* in + the kernel. The arguments are copied back to *args* with the appropriate + data layout. + + :arg call_name: An instance of :class:`str` denoting the function call in + the *kernel*. + :arg args_to_unpack: A list of the arguments as instances of :class:`str` which + must be packed. If set *None*, it is interpreted that all the array + arguments would be packed. + :arg args_to_unpack: A list of the arguments as instances of :class:`str` + which must be unpacked. If set *None*, it is interpreted that + all the array arguments should be unpacked. + """ + assert isinstance(kernel, LoopKernel) + new_domains = [] + new_tmps = kernel.temporary_variables.copy() + old_insn_to_new_insns = {} + + for insn in kernel.instructions: + if not isinstance(insn, CallInstruction): + # pack and unpack call only be done for CallInstructions. + continue + if insn.expression.function.name not in callables_table: + continue + + in_knl_callable = callables_table[ + insn.expression.function.name] + + if in_knl_callable.name != call_name: + # not the function we're looking for. + continue + in_knl_callable = in_knl_callable.with_packing_for_args() + + vng = kernel.get_var_name_generator() + ing = kernel.get_instruction_id_generator() + + parameters = insn.expression.parameters + if args_to_pack is None: + args_to_pack = [par.subscript.aggregate.name for par in + parameters+insn.assignees if isinstance(par, SubArrayRef) + and (par.swept_inames)] + if args_to_unpack is None: + args_to_unpack = [par.subscript.aggregate.name for par in + parameters+insn.assignees if isinstance(par, SubArrayRef) + and (par.swept_inames)] + + # {{{ sanity checks for args + + assert isinstance(args_to_pack, list) + assert isinstance(args_to_unpack, list) + + for arg in args_to_pack: + found_sub_array_ref = False + + for par in parameters + insn.assignees: + # checking that the given args is a sub array ref + if isinstance(par, SubArrayRef) and ( + par.subscript.aggregate.name == arg): + found_sub_array_ref = True + break + if not found_sub_array_ref: + raise LoopyError("No match found for packing arg '%s' of call '%s' " + "at insn '%s'." % (arg, call_name, insn.id)) + for arg in args_to_unpack: + if arg not in args_to_pack: + raise LoopyError("Argument %s should be packed in order to be " + "unpacked." 
% arg) + + # }}} + + packing_insns = [] + unpacking_insns = [] + + # {{{ handling ilp tags + + from loopy.kernel.data import IlpBaseTag, VectorizeTag + import islpy as isl + from pymbolic import var + + dim_type = isl.dim_type.set + ilp_inames = {iname for iname in insn.within_inames + if all(isinstance(tag, (IlpBaseTag, VectorizeTag)) + for tag in kernel.iname_to_tags.get(iname, []))} + new_ilp_inames = set() + ilp_inames_map = {} + for iname in ilp_inames: + new_iname_name = vng(iname + "_ilp") + ilp_inames_map[var(iname)] = var(new_iname_name) + new_ilp_inames.add(new_iname_name) + for iname in ilp_inames: + new_domain = kernel.get_inames_domain(iname).copy() + for i in range(new_domain.n_dim()): + old_iname = new_domain.get_dim_name(dim_type, i) + if old_iname in ilp_inames: + new_domain = new_domain.set_dim_name( + dim_type, i, ilp_inames_map[var(old_iname)].name) + new_domains.append(new_domain) + + # }}} + + from pymbolic.mapper.substitutor import make_subst_func + from loopy.symbolic import SubstitutionMapper + + # dict to store the new assignees and parameters, the mapping pattern + # from arg_id to parameters is identical to InKernelCallable.arg_id_to_dtype + id_to_parameters = tuple(enumerate(parameters)) + tuple( + (-i-1, assignee) for i, assignee in enumerate(insn.assignees)) + new_id_to_parameters = {} + + for arg_id, p in id_to_parameters: + if isinstance(p, SubArrayRef) and (p.subscript.aggregate.name in + args_to_pack): + new_pack_inames = ilp_inames_map.copy() # packing-specific inames + new_unpack_inames = ilp_inames_map.copy() # unpacking-specific iname + + new_pack_inames = {iname: var(vng(iname.name + + "_pack")) for iname in p.swept_inames} + new_unpack_inames = {iname: var(vng(iname.name + + "_unpack")) for iname in p.swept_inames} + + # Updating the domains corresponding to the new inames. 
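+                # (Each swept iname gets two fresh copies, one for the
+                # packing loop nest and one for the unpacking loop nest, so
+                # the corresponding domains are duplicated and renamed to
+                # match.)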
+ for iname in p.swept_inames: + new_domain_pack = kernel.get_inames_domain(iname.name).copy() + new_domain_unpack = kernel.get_inames_domain(iname.name).copy() + for i in range(new_domain_pack.n_dim()): + old_iname = new_domain_pack.get_dim_name(dim_type, i) + if var(old_iname) in new_pack_inames: + new_domain_pack = new_domain_pack.set_dim_name( + dim_type, i, new_pack_inames[var(old_iname)].name) + new_domain_unpack = new_domain_unpack.set_dim_name( + dim_type, i, new_unpack_inames[var(old_iname)].name) + new_domains.append(new_domain_pack) + new_domains.append(new_domain_unpack) + + arg = p.subscript.aggregate.name + pack_name = vng(arg + "_pack") + + from loopy.kernel.data import (TemporaryVariable, + temp_var_scope) + + if arg in kernel.arg_dict: + arg_in_caller = kernel.arg_dict[arg] + else: + arg_in_caller = kernel.temporary_variables[arg] + + pack_tmp = TemporaryVariable( + name=pack_name, + dtype=arg_in_caller.dtype, + dim_tags=in_knl_callable.arg_id_to_descr[arg_id].dim_tags, + shape=in_knl_callable.arg_id_to_descr[arg_id].shape, + scope=temp_var_scope.PRIVATE, + ) + + new_tmps[pack_name] = pack_tmp + + from loopy import Assignment + pack_subst_mapper = SubstitutionMapper(make_subst_func( + new_pack_inames)) + unpack_subst_mapper = SubstitutionMapper(make_subst_func( + new_unpack_inames)) + + # {{{ getting the lhs for packing and rhs for unpacking + + from loopy.isl_helpers import simplify_via_aff, make_slab + + flatten_index = simplify_via_aff( + sum(dim_tag.stride*idx for dim_tag, idx in + zip(arg_in_caller.dim_tags, p.subscript.index_tuple))) + + new_indices = [] + for dim_tag in in_knl_callable.arg_id_to_descr[arg_id].dim_tags: + ind = flatten_index // dim_tag.stride + flatten_index -= (dim_tag.stride * ind) + new_indices.append(ind) + + new_indices = tuple(simplify_via_aff(i) for i in new_indices) + + pack_lhs_assignee = pack_subst_mapper( + var(pack_name).index(new_indices)) + unpack_rhs = unpack_subst_mapper( + var(pack_name).index(new_indices)) + + # }}} + + packing_insns.append(Assignment( + assignee=pack_lhs_assignee, + expression=pack_subst_mapper.map_subscript(p.subscript), + within_inames=insn.within_inames - ilp_inames | { + new_pack_inames[i].name for i in p.swept_inames} | ( + new_ilp_inames), + depends_on=insn.depends_on, + id=ing(insn.id+"_pack"), + depends_on_is_final=True + )) + + if p.subscript.aggregate.name in args_to_unpack: + unpacking_insns.append(Assignment( + expression=unpack_rhs, + assignee=unpack_subst_mapper.map_subscript(p.subscript), + within_inames=insn.within_inames - ilp_inames | { + new_unpack_inames[i].name for i in p.swept_inames} | ( + new_ilp_inames), + id=ing(insn.id+"_unpack"), + depends_on=frozenset([insn.id]), + depends_on_is_final=True + )) + + # {{{ creating the sweep inames for the new sub array refs + + updated_swept_inames = [] + + for i, _ in enumerate( + in_knl_callable.arg_id_to_descr[arg_id].shape): + updated_swept_inames.append(var(vng("i_packsweep_"+arg))) + + ctx = kernel.isl_context + space = isl.Space.create_from_names(ctx, + set=[iname.name for iname in updated_swept_inames]) + iname_set = isl.BasicSet.universe(space) + for iname, axis_length in zip(updated_swept_inames, + in_knl_callable.arg_id_to_descr[arg_id].shape): + iname_set = iname_set & make_slab(space, iname.name, 0, + axis_length) + new_domains = new_domains + [iname_set] + + # }}} + + new_id_to_parameters[arg_id] = SubArrayRef( + tuple(updated_swept_inames), + (var(pack_name).index(tuple(updated_swept_inames)))) + else: + new_id_to_parameters[arg_id] = p + 
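+    # At this point every packed argument of the call has been rewritten to
+    # a contiguous temporary: e.g. a parameter ``[j]: x[2*j]`` becomes
+    # ``[i_packsweep_x]: x_pack[i_packsweep_x]``, preceded by an assignment
+    # that fills ``x_pack`` (and, for unpacked args, followed by one that
+    # writes the result back through the original data layout).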
+ if packing_insns: + subst_mapper = SubstitutionMapper(make_subst_func(ilp_inames_map)) + new_call_insn = insn.with_transformed_expressions(subst_mapper) + new_params = tuple(subst_mapper(new_id_to_parameters[i]) for i, _ in + enumerate(parameters)) + new_assignees = tuple(subst_mapper(new_id_to_parameters[-i-1]) + for i, _ in enumerate(insn.assignees)) + new_call_insn = new_call_insn.copy( + depends_on=new_call_insn.depends_on | { + pack.id for pack in packing_insns}, + within_inames=new_call_insn.within_inames - ilp_inames | ( + new_ilp_inames), + expression=new_call_insn.expression.function(*new_params), + assignees=new_assignees) + old_insn_to_new_insns[insn.id] = (packing_insns + [new_call_insn] + + unpacking_insns) + + if old_insn_to_new_insns: + new_instructions = [] + for insn in kernel.instructions: + if insn.id in old_insn_to_new_insns: + # Replacing the current instruction with the group of + # instructions including the packing and unpacking instructions + new_instructions.extend(old_insn_to_new_insns[insn.id]) + else: + # for the instructions that depend on the call instruction that + # are to be packed and unpacked, we need to add the complete + # instruction block as a dependency for them. + new_depends_on = insn.depends_on + if insn.depends_on & set(old_insn_to_new_insns): + # need to add the unpack instructions on dependencies. + for old_insn_id in insn.depends_on & set(old_insn_to_new_insns): + new_depends_on |= frozenset(i.id for i + in old_insn_to_new_insns[old_insn_id]) + new_instructions.append(insn.copy(depends_on=new_depends_on)) + kernel = kernel.copy( + domains=kernel.domains + new_domains, + instructions=new_instructions, + temporary_variables=new_tmps + ) + + return kernel + + +def pack_and_unpack_args_for_call(program, *args, **kwargs): + assert isinstance(program, Program) + + new_callables = {} + for func_id, in_knl_callable in program.callables_table.items(): + if isinstance(in_knl_callable, CallableKernel): + new_subkernel = pack_and_unpack_args_for_call_for_single_kernel( + in_knl_callable.subkernel, program.callables_table, + *args, **kwargs) + in_knl_callable = in_knl_callable.copy( + subkernel=new_subkernel) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown type of callable %s." % ( + type(in_knl_callable).__name__)) + + new_callables[func_id] = in_knl_callable + + return program.copy(callables_table=new_callables) + +# vim: foldmethod=marker diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py index 390fafb2b71b705acaa990e45d2d4d4b9fc59cbe..455ce31d03fa30476b9154ea2773a06b3db1b17d 100644 --- a/loopy/transform/padding.py +++ b/loopy/transform/padding.py @@ -24,6 +24,11 @@ THE SOFTWARE. 
from pytools import MovedFunctionDeprecationWrapper from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext +from loopy.program import iterate_over_kernels_if_given_program, Program +from loopy.kernel import LoopKernel +from loopy.kernel.function_interface import CallableKernel +from loopy.diagnostic import LoopyError + class ArrayAxisSplitHelper(RuleAwareIdentityMapper): def __init__(self, rule_mapping_context, arg_names, handler): @@ -40,7 +45,9 @@ class ArrayAxisSplitHelper(RuleAwareIdentityMapper): # {{{ split_array_dim (deprecated since June 2016) -def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True, +@iterate_over_kernels_if_given_program +def split_array_dim(kernel, arrays_and_axes, count, + auto_split_inames=True, split_kwargs=None): """ :arg arrays_and_axes: a list of tuples *(array, axis_nr)* indicating @@ -242,7 +249,7 @@ def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True, return kernel -split_arg_axis = MovedFunctionDeprecationWrapper(split_array_dim) +split_arg_axis = (MovedFunctionDeprecationWrapper(split_array_dim)) # }}} @@ -366,7 +373,9 @@ def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"): return kernel -def split_array_axis(kernel, array_names, axis_nr, count, order="C"): +@iterate_over_kernels_if_given_program +def split_array_axis(kernel, array_names, axis_nr, count, + order="C"): """ :arg array: a list of names of temporary variables or arguments. May also be a comma-separated string of these. @@ -384,6 +393,7 @@ def split_array_axis(kernel, array_names, axis_nr, count, order="C"): ``loopy.split_array_dim`` that had the role of this function in versions prior to 2016.2. """ + assert isinstance(kernel, LoopKernel) if isinstance(array_names, str): array_names = [i.strip() for i in array_names.split(",") if i.strip()] @@ -399,6 +409,15 @@ def split_array_axis(kernel, array_names, axis_nr, count, order="C"): # {{{ find_padding_multiple def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1): + if isinstance(kernel, Program): + kernel_names = [i for i, clbl in kernel.callables_table.items() + if isinstance(clbl, CallableKernel)] + if len(kernel_names) > 1: + raise LoopyError() + return find_padding_multiple(kernel[kernel_names[0]], variable, axis, + align_bytes, allowed_waste) + assert isinstance(kernel, LoopKernel) + arg = kernel.arg_dict[variable] if arg.dim_tags is None: @@ -436,6 +455,7 @@ def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1 # {{{ add_padding +@iterate_over_kernels_if_given_program def add_padding(kernel, variable, axis, align_bytes): arg_to_idx = {arg.name: i for i, arg in enumerate(kernel.args)} arg_idx = arg_to_idx[variable] diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py index 60fef9e9852fcff2e6a3a9929d45bc59508fbcb7..52feb577a21ba473827bd70830373e91ec0dd1f0 100644 --- a/loopy/transform/parameter.py +++ b/loopy/transform/parameter.py @@ -25,6 +25,9 @@ from loopy.symbolic import (RuleAwareSubstitutionMapper, SubstitutionRuleMappingContext) import islpy as isl +from loopy.program import iterate_over_kernels_if_given_program +from loopy.kernel import LoopKernel + __doc__ = """ .. currentmodule:: loopy @@ -37,6 +40,7 @@ __doc__ = """ # {{{ assume +@iterate_over_kernels_if_given_program def assume(kernel, assumptions): """Include an assumption about :ref:`domain-parameters` in the kernel, e.g. `n mod 4 = 0`. 
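+
+    A sketch of typical use, ``n`` being a parameter of the kernel's
+    domain::
+
+        knl = lp.assume(knl, "n mod 16 = 0 and n >= 1")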
@@ -64,18 +68,8 @@ def assume(kernel, assumptions): # {{{ fix_parameter -def fix_parameters(kernel, within=None, **value_dict): - """Fix the values of the arguments to specific constants. - - *value_dict* consists of *name*/*value* pairs, where *name* will be fixed - to be *value*. *name* may refer to :ref:`domain-parameters` or - :ref:`arguments`. - """ - - if not value_dict: - return kernel - - def process_set_one_param(s, name, value): +def _fix_parameter(kernel, name, value, remove_argument, within=None): + def process_set(s): var_dict = s.get_var_dict() try: @@ -95,15 +89,10 @@ def fix_parameters(kernel, within=None, **value_dict): return s - def process_set(s): - for name, value in value_dict.items(): - s = process_set_one_param(s, name, value) - return s - new_domains = [process_set(dom) for dom in kernel.domains] from pymbolic.mapper.substitutor import make_subst_func - subst_func = make_subst_func(value_dict) + subst_func = make_subst_func({name: value}) from loopy.symbolic import SubstitutionMapper, PartialEvaluationMapper subst_map = SubstitutionMapper(subst_func) @@ -115,8 +104,7 @@ def fix_parameters(kernel, within=None, **value_dict): from loopy.kernel.array import ArrayBase new_args = [] for arg in kernel.args: - if arg.name in value_dict.keys(): - # remove from argument list + if arg.name == name and remove_argument: continue if not isinstance(arg, ArrayBase): @@ -146,6 +134,29 @@ def fix_parameters(kernel, within=None, **value_dict): )) +@iterate_over_kernels_if_given_program +def fix_parameters(kernel, **value_dict): + """Fix the values of the arguments to specific constants. + + *value_dict* consists of *name*/*value* pairs, where *name* will be fixed + to be *value*. *name* may refer to :ref:`domain-parameters` or + :ref:`arguments`. + """ + assert isinstance(kernel, LoopKernel) + + # FIXME: Parameter / argument terminology? + + # FIXME: Is _remove the right approach? (I'm not sure it is.) Because of + # the potential namespace conflict. If yes, document. If no, fix. + + remove_arg = value_dict.pop("_remove", True) + within = value_dict.pop("within", None) + + for name, value in value_dict.items(): + kernel = _fix_parameter(kernel, name, value, remove_arg, within) + + return kernel + # }}} # vim: foldmethod=marker diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index cefed807d73bd0a9064c170190a3ba19b2d5abf6..438c07339b217f21d3e60c4f2f87050ea5b2d0d7 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -27,6 +27,8 @@ from loopy.symbolic import (get_dependencies, SubstitutionRuleMappingContext) from loopy.diagnostic import LoopyError from pymbolic.mapper.substitutor import make_subst_func +from loopy.program import Program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable import numpy as np from pymbolic import var @@ -255,9 +257,9 @@ class _not_provided: # noqa: N801 pass -def precompute(kernel, subst_use, sweep_inames=[], within=None, - storage_axes=None, temporary_name=None, precompute_inames=None, - precompute_outer_inames=None, +def precompute_for_single_kernel(kernel, callables_table, subst_use, + sweep_inames=[], within=None, storage_axes=None, temporary_name=None, + precompute_inames=None, precompute_outer_inames=None, storage_axis_to_tag={}, # "None" is a valid value here, distinct from the default. @@ -352,6 +354,18 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, Trivial storage axes (i.e. axes of length 1 with respect to the sweep) are eliminated. 
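+
+    A sketch of typical use (the substitution rule and iname names are
+    illustrative)::
+
+        knl = lp.extract_subst(knl, "a_subst", "a[i]")
+        knl = lp.precompute(knl, "a_subst", sweep_inames="i",
+                default_tag="l.auto")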
""" + if isinstance(kernel, Program): + kernel_names = [i for i, clbl in + kernel.callables_table.items() if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise LoopyError() + + return kernel.with_kernel(precompute(kernel[kernel_names[0]], + subst_use, sweep_inames, within, storage_axes, temporary_name, + precompute_inames, precompute_outer_inames, storage_axis_to_tag, + default_tag, dtype, fetch_bounding_box, temporary_address_space, + compute_insn_id, kernel.callables_table, **kwargs)) # {{{ unify temporary_address_space / temporary_scope @@ -1030,15 +1044,34 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # }}} - from loopy import tag_inames + from loopy.transform.iname import tag_inames kernel = tag_inames(kernel, new_iname_to_tag) from loopy.kernel.data import AutoFitLocalIndexTag, filter_iname_tags_by_type if filter_iname_tags_by_type(new_iname_to_tag.values(), AutoFitLocalIndexTag): from loopy.kernel.tools import assign_automatic_axes - kernel = assign_automatic_axes(kernel) + kernel = assign_automatic_axes(kernel, callables_table) return kernel + +def precompute(program, *args, **kwargs): + assert isinstance(program, Program) + new_callables = {} + + for func_id, clbl in program.callables_table.items(): + if isinstance(clbl, CallableKernel): + knl = precompute_for_single_kernel(clbl.subkernel, + program.callables_table, *args, **kwargs) + clbl = clbl.copy(subkernel=knl) + elif isinstance(clbl, ScalarCallable): + pass + else: + raise NotImplementedError() + + new_callables[func_id] = clbl + + return program.copy(callables_table=new_callables) + # vim: foldmethod=marker diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 0a9cfb7bce21a64cc2858e4f3b9472e2992984b8..884e17f776ccb3d81a7c33ba195c2dcb5c7debfd 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -61,7 +61,7 @@ class LivenessAnalysis: def __init__(self, kernel): self.kernel = kernel - self.schedule = self.kernel.schedule + self.schedule = kernel.schedule @memoize_method def get_successor_relation(self): @@ -232,8 +232,9 @@ class TemporarySaver: def new_shape(self): return self.hw_dims + self.non_hw_dims - def __init__(self, kernel): + def __init__(self, kernel, callables_table): self.kernel = kernel + self.callables_table = callables_table self.var_name_gen = kernel.get_var_name_generator() self.insn_name_gen = kernel.get_instruction_id_generator() @@ -436,7 +437,8 @@ class TemporarySaver: return (), () group_sizes, local_sizes = ( - self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids)) + self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids, + self.callables_table)) if temporary.address_space == lp.AddressSpace.LOCAL: # Elide local axes in the save slot for local temporaries. @@ -623,7 +625,7 @@ class TemporarySaver: kernel = lp.add_nosync(kernel, "global", source, sink) from loopy.kernel.tools import assign_automatic_axes - return assign_automatic_axes(kernel) + return assign_automatic_axes(kernel, self.callables_table) def save(self, temporary, subkernel): self.save_or_reload_impl(temporary, subkernel, "save") @@ -717,7 +719,7 @@ class TemporarySaver: # {{{ auto save and reload across kernel calls -def save_and_reload_temporaries(kernel): +def save_and_reload_temporaries(program, entrypoint=None): """ Add instructions to save and reload temporary variables that are live across kernel calls. 
@@ -740,13 +742,28 @@ def save_and_reload_temporaries(kernel): :returns: The resulting kernel """ - liveness = LivenessAnalysis(kernel) - saver = TemporarySaver(kernel) + if entrypoint is None: + if len(program.entrypoints) != 1: + raise LoopyError("Missing argument 'entrypoint'.") + entrypoint = list(program.entrypoints)[0] + + knl = program[entrypoint] + + if not knl.schedule: + program = lp.preprocess_program(program) + from loopy.schedule import get_one_scheduled_kernel + knl = get_one_scheduled_kernel(program[entrypoint], + program.callables_table) + + assert knl.schedule is not None + + liveness = LivenessAnalysis(knl) + saver = TemporarySaver(knl, program.callables_table) from loopy.schedule.tools import ( temporaries_read_in_subkernel, temporaries_written_in_subkernel) - for sched_idx, sched_item in enumerate(kernel.schedule): + for sched_idx, sched_item in enumerate(knl.schedule): if isinstance(sched_item, CallKernel): # Any written temporary that is live-out needs to be read into @@ -757,8 +774,9 @@ def save_and_reload_temporaries(kernel): else: subkernel = sched_item.kernel_name interesting_temporaries = ( - temporaries_read_in_subkernel(kernel, subkernel) - | temporaries_written_in_subkernel(kernel, subkernel)) + temporaries_read_in_subkernel(knl, subkernel) + | temporaries_written_in_subkernel(knl, + subkernel)) for temporary in liveness[sched_idx].live_out & interesting_temporaries: logger.info("reloading {} at entry of {}" @@ -766,20 +784,20 @@ def save_and_reload_temporaries(kernel): saver.reload(temporary, sched_item.kernel_name) elif isinstance(sched_item, ReturnFromKernel): - if sched_idx == len(kernel.schedule) - 1: + if sched_idx == len(knl.schedule) - 1: # Kernel exit: nothing live interesting_temporaries = set() else: subkernel = sched_item.kernel_name interesting_temporaries = ( - temporaries_written_in_subkernel(kernel, subkernel)) + temporaries_written_in_subkernel(knl, subkernel)) for temporary in liveness[sched_idx].live_in & interesting_temporaries: logger.info("saving {} before return of {}" .format(temporary, sched_item.kernel_name)) saver.save(temporary, sched_item.kernel_name) - return saver.finish() + return program.with_kernel(saver.finish()) # }}} diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 565c69a49d07c92311f750e4a8fce0db91ff9bb2..066cf326cc4f7ea45e693ea1f48910dbe5747ad1 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -28,6 +28,8 @@ from loopy.transform.iname import remove_any_newly_unused_inames from pytools import ImmutableRecord from pymbolic import var +from loopy.program import iterate_over_kernels_if_given_program, Program +from loopy.kernel.function_interface import CallableKernel, ScalarCallable import logging logger = logging.getLogger(__name__) @@ -51,6 +53,16 @@ def extract_subst(kernel, subst_name, template, parameters=()): unifications. 
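+
+    A sketch of typical use (names are illustrative)::
+
+        knl = lp.extract_subst(knl, "e_subst", "a[i1]*b[i1]",
+                parameters="i1")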
""" + if isinstance(kernel, Program): + kernel_names = [i for i, clbl in + kernel.callables_table.items() if isinstance(clbl, + CallableKernel)] + if len(kernel_names) != 1: + raise LoopyError() + + return kernel.with_kernel(extract_subst(kernel[kernel_names[0]], + subst_name, template, parameters)) + if isinstance(template, str): from pymbolic import parse template = parse(template) @@ -188,6 +200,7 @@ def extract_subst(kernel, subst_name, template, parameters=()): instructions=new_insns, substitutions=new_substs) + # }}} @@ -275,6 +288,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper): return var(subst_name)(*index) +@iterate_over_kernels_if_given_program @remove_any_newly_unused_inames def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, force_retain_argument=False): @@ -458,6 +472,7 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None, # {{{ expand_subst +@iterate_over_kernels_if_given_program def expand_subst(kernel, within=None): """ Returns an instance of :class:`loopy.LoopKernel` with the substitutions @@ -466,6 +481,7 @@ def expand_subst(kernel, within=None): :arg within: a stack match as understood by :func:`loopy.match.parse_stack_match`. """ + if not kernel.substitutions: return kernel @@ -498,8 +514,17 @@ def find_rules_matching(kernel, pattern): return [r for r in kernel.substitutions if pattern.match(r)] -def find_one_rule_matching(kernel, pattern): - rules = find_rules_matching(kernel, pattern) +def find_one_rule_matching(program, pattern): + rules = [] + for in_knl_callable in program.callables_table.values(): + if isinstance(in_knl_callable, CallableKernel): + knl = in_knl_callable.subkernel + rules.extend(find_rules_matching(knl, pattern)) + elif isinstance(in_knl_callable, ScalarCallable): + pass + else: + raise NotImplementedError("Unknown callable types %s." % ( + type(in_knl_callable).__name__)) if len(rules) > 1: raise ValueError("more than one substitution rule matched '%s'" diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 787966efc7fd00ad282e60990846ce07004e7906..ee1ddf33d72adf405a84bf02a7d259f0eb2d66a5 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -29,6 +29,14 @@ from loopy.types import NumpyType from loopy.diagnostic import ( LoopyError, TypeInferenceFailure, DependencyTypeInferenceFailure) +from loopy.kernel.instruction import _DataObliviousInstruction + +from loopy.symbolic import ( + LinearSubscript, parse_tagged_name, RuleAwareIdentityMapper, + SubstitutionRuleExpander, ResolvedFunction, + SubstitutionRuleMappingContext, SubArrayRef) +from pymbolic.primitives import Variable, Subscript, Lookup +from loopy.program import CallablesInferenceContext, make_clbl_inf_ctx import logging logger = logging.getLogger(__name__) @@ -40,10 +48,152 @@ def _debug(kernel, s, *args): logger.debug(f"{kernel.name}: {logstr}") +def get_return_types_as_tuple(arg_id_to_dtype): + """Returns the types of arguments in a tuple format. + + :arg arg_id_to_dtype: An instance of :class:`dict` which denotes a + mapping from the arguments to their inferred types. 
+ """ + return_arg_id_to_dtype = {id: dtype for id, dtype in + arg_id_to_dtype.items() if (isinstance(id, int) and id < 0)} + return_arg_pos = sorted(return_arg_id_to_dtype.keys(), reverse=True) + + return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos) + + +# {{{ renaming helpers + +class FunctionNameChanger(RuleAwareIdentityMapper): + """ + Changes the names of scoped functions in calls of expressions according to + the mapping ``calls_to_new_functions`` + """ + + def __init__(self, rule_mapping_context, calls_to_new_names, + subst_expander): + super().__init__(rule_mapping_context) + self.calls_to_new_names = calls_to_new_names + self.subst_expander = subst_expander + + def map_call(self, expr, expn_state): + name, tag = parse_tagged_name(expr.function) + + if name not in self.rule_mapping_context.old_subst_rules: + expanded_expr = self.subst_expander(expr) + if expr in self.calls_to_new_names: + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters)) + elif expanded_expr in self.calls_to_new_names: + # FIXME: This is killing the substitution. + # Maybe using a RuleAwareIdentityMapper for TypeInferenceMapper + # would help. + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expanded_expr]), + tuple(self.rec(child, expn_state) + for child in expanded_expr.parameters)) + else: + return super().map_call( + expr, expn_state) + else: + return self.map_substitution(name, tag, expr.parameters, expn_state) + + def map_call_with_kwargs(self, expr, expn_state): + + if expr in self.calls_to_new_names: + return type(expr)( + ResolvedFunction(self.calls_to_new_names[expr]), + tuple(self.rec(child, expn_state) + for child in expr.parameters), + { + key: self.rec(val, expn_state) + for key, val in expr.kw_parameters.items()} + ) + else: + return super().map_call_with_kwargs( + expr, expn_state) + + +def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names): + """ + Returns a copy of *kernel* with the names of pymbolic calls changed + according to the mapping given by *pymbolic_calls_new_names*. + + :arg pymbolic_calls_to_new_names: A mapping from instances of + :class:`pymbolic.primitives.Call` to :class:`str`. + + **Example: ** + + - Given a *kernel* -- + + .. code:: + + ------------------------------------------------------------- + KERNEL: loopy_kernel + ------------------------------------------------------------- + ARGUMENTS: + x: type: , shape: (10), dim_tags: (N0:stride:1) + y: type: , shape: (10), dim_tags: (N0:stride:1) + ------------------------------------------------------------- + DOMAINS: + { [i] : 0 <= i <= 9 } + ------------------------------------------------------------- + INAME IMPLEMENTATION TAGS: + i: None + ------------------------------------------------------------- + INSTRUCTIONS: + for i + y[i] = ResolvedFunction('sin')(x[i]) + end i + ------------------------------------------------------------- + + - And given a *pymbolic_calls_to_new_names* -- + + .. code:: + + {Call(ResolvedFunction(Variable('sin')), (Subscript(Variable('x'), + Variable('i')),))": 'sin_1'} + + - The following *kernel* is returned -- + + .. 
code:: + + ------------------------------------------------------------- + KERNEL: loopy_kernel + ------------------------------------------------------------- + ARGUMENTS: + x: type: , shape: (10), dim_tags: (N0:stride:1) + y: type: , shape: (10), dim_tags: (N0:stride:1) + ------------------------------------------------------------- + DOMAINS: + { [i] : 0 <= i <= 9 } + ------------------------------------------------------------- + INAME IMPLEMENTATION TAGS: + i: None + ------------------------------------------------------------- + INSTRUCTIONS: + for i + y[i] = ResolvedFunction('sin_1')(x[i]) + end i + ------------------------------------------------------------- + """ + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + subst_expander = SubstitutionRuleExpander(kernel.substitutions) + name_changer = FunctionNameChanger(rule_mapping_context, + pymbolic_calls_to_new_names, subst_expander) + + return rule_mapping_context.finish_kernel( + name_changer.map_kernel(kernel)) + +# }}} + + # {{{ type inference mapper class TypeInferenceMapper(CombineMapper): - def __init__(self, kernel, new_assignments=None): + def __init__(self, kernel, clbl_inf_ctx, new_assignments=None): """ :arg new_assignments: mapping from names to either :class:`loopy.kernel.data.TemporaryVariable` @@ -52,10 +202,13 @@ class TypeInferenceMapper(CombineMapper): instances """ self.kernel = kernel + assert isinstance(clbl_inf_ctx, CallablesInferenceContext) if new_assignments is None: new_assignments = {} self.new_assignments = new_assignments self.symbols_with_unknown_types = set() + self.clbl_inf_ctx = clbl_inf_ctx + self.old_calls_to_new_calls = {} def __call__(self, expr, return_tuple=False, return_dtype_set=False): kwargs = {} @@ -88,13 +241,16 @@ class TypeInferenceMapper(CombineMapper): # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x) # are Python-equal (for many common constants such as integers). 
- def copy(self): - return type(self)(self.kernel, self.new_assignments) + def copy(self, clbl_inf_ctx=None): + if clbl_inf_ctx is None: + clbl_inf_ctx = self.clbl_inf_ctx + return type(self)(self.kernel, clbl_inf_ctx, + self.new_assignments) def with_assignments(self, names_to_vars): new_ass = self.new_assignments.copy() new_ass.update(names_to_vars) - return type(self)(self.kernel, new_ass) + return type(self)(self.kernel, self.clbl_inf_ctx, new_ass) @staticmethod def combine(dtype_sets): @@ -250,14 +406,23 @@ class TypeInferenceMapper(CombineMapper): return self.rec(expr.aggregate) def map_call(self, expr, return_tuple=False): - from pymbolic.primitives import Variable + + from pymbolic.primitives import Variable, CallWithKwargs, Call + + if isinstance(expr, CallWithKwargs): + kw_parameters = expr.kw_parameters + else: + assert isinstance(expr, Call) + kw_parameters = {} identifier = expr.function - if isinstance(identifier, Variable): - identifier = identifier.name - if identifier in ["indexof", "indexof_vec"]: - return [self.kernel.index_dtype] + if not isinstance(identifier, ResolvedFunction): + # function not resolved => exit + return [] + + if isinstance(identifier, (Variable, ResolvedFunction)): + identifier = identifier.name def none_if_empty(d): if d: @@ -266,25 +431,83 @@ class TypeInferenceMapper(CombineMapper): else: return None - arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in expr.parameters) - if None in arg_dtypes: - return [] + arg_id_to_dtype = {i: none_if_empty(self.rec(par)) for (i, par) in + tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items())} - mangle_result = self.kernel.mangle_function(identifier, arg_dtypes) - if return_tuple: - if mangle_result is not None: - return [mangle_result.result_dtypes] + # specializing the known function wrt type + in_knl_callable = self.clbl_inf_ctx[expr.function.name] + + # {{{ checking that there is no overwriting of types of in_knl_callable + + if in_knl_callable.arg_id_to_dtype is not None: + + # specializing an already specialized function. 
+            for id, dtype in arg_id_to_dtype.items():
+                if id in in_knl_callable.arg_id_to_dtype and (
+                        in_knl_callable.arg_id_to_dtype[id] !=
+                        arg_id_to_dtype[id]):
+
+                    # {{{ ignore the cases where there is a discrepancy
+                    # between np.uint and np.int
+
+                    import numpy as np
+                    if in_knl_callable.arg_id_to_dtype[id].dtype.type == (
+                            np.uint32) and (
+                                    arg_id_to_dtype[id].dtype.type == np.int32):
+                        continue
+                    if in_knl_callable.arg_id_to_dtype[id].dtype.type == (
+                            np.uint64) and (
+                                    arg_id_to_dtype[id].dtype.type ==
+                                    np.int64):
+                        continue
+
+                    if np.can_cast(arg_id_to_dtype[id].dtype.type,
+                            in_knl_callable.arg_id_to_dtype[id].dtype.type):
+                        continue
+
+                    # }}}
+
+                    raise LoopyError("Overwriting a specialized function "
+                            "is illegal--maybe start with a new instance of "
+                            "InKernelCallable?")
+
+        # }}}
+
+        in_knl_callable, self.clbl_inf_ctx = (
+                in_knl_callable.with_types(
+                    arg_id_to_dtype,
+                    self.clbl_inf_ctx))
+
+        in_knl_callable = in_knl_callable.with_target(self.kernel.target)
+
+        # store the type-specialized function for later use
+        self.clbl_inf_ctx, new_function_id = (
+                self.clbl_inf_ctx.with_callable(
+                    expr.function.function,
+                    in_knl_callable))
+
+        if isinstance(expr, Call):
+            self.old_calls_to_new_calls[expr] = new_function_id
         else:
-            if mangle_result is not None:
-                if len(mangle_result.result_dtypes) != 1 and not return_tuple:
-                    raise LoopyError("functions with more or fewer than one "
-                            "return value may only be used in direct assignments")
+            assert isinstance(expr, CallWithKwargs)
+            self.old_calls_to_new_calls[expr] = new_function_id
+
+        new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype
+
+        if new_arg_id_to_dtype is None:
+            return []
+
+        # collect result dtypes in order of the assignees
+        if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None:
+            if return_tuple:
+                return [get_return_types_as_tuple(new_arg_id_to_dtype)]
+            else:
+                return [new_arg_id_to_dtype[-1]]
 
-                return [mangle_result.result_dtypes[0]]
+        return []
 
-        raise RuntimeError("unable to resolve "
-                "function '%s' with %d given arguments"
-                % (identifier, len(arg_dtypes)))
+    map_call_with_kwargs = map_call
 
     def map_variable(self, expr):
         if expr.name in self.kernel.all_inames():
@@ -352,11 +575,20 @@ class TypeInferenceMapper(CombineMapper):
 
     def map_comparison(self, expr):
         # "bool" is unusable because OpenCL's bool has indeterminate memory
        # format.
+ self(expr.left, return_tuple=False, return_dtype_set=False) + self(expr.right, return_tuple=False, return_dtype_set=False) + return [NumpyType(np.dtype(np.int32))] + + def map_logical_not(self, expr): + return [NumpyType(np.dtype(np.int32))] + + def map_logical_and(self, expr): + for child in expr.children: + self.rec(child) + return [NumpyType(np.dtype(np.int32))] - map_logical_not = map_comparison - map_logical_and = map_comparison - map_logical_or = map_comparison + map_logical_or = map_logical_and def map_group_hw_index(self, expr, *args): return [self.kernel.index_dtype] @@ -393,20 +625,112 @@ class TypeInferenceMapper(CombineMapper): rec_results = self.rec(expr.expr) if return_tuple: - return [expr.operation.result_dtypes(self.kernel, *rec_result) + return [expr.operation.result_dtypes(*rec_result) for rec_result in rec_results] else: - return [expr.operation.result_dtypes(self.kernel, rec_result)[0] + return [expr.operation.result_dtypes(rec_result)[0] for rec_result in rec_results] + def map_sub_array_ref(self, expr): + return self.rec(expr.subscript) + +# }}} + + +# {{{ TypeReader + +class TypeReader(TypeInferenceMapper): + def __init__(self, kernel, callables, new_assignments={}): + self.kernel = kernel + self.callables = callables + self.new_assignments = new_assignments + + # {{{ disabled interface + + def copy(self, *args, **kwargs): + raise ValueError("Not allowed in TypeReader") + + # }}} + + def with_assignments(self, names_to_vars): + new_ass = self.new_assignments.copy() + new_ass.update(names_to_vars) + return type(self)(self.kernel, self.callables, new_ass) + + def map_call(self, expr, return_tuple=False): + identifier = expr.function + if isinstance(identifier, (Variable, ResolvedFunction)): + identifier = identifier.name + + # specializing the known function wrt type + if isinstance(expr.function, ResolvedFunction): + in_knl_callable = self.callables[expr.function.name] + + arg_id_to_dtype = in_knl_callable.arg_id_to_dtype + + if arg_id_to_dtype is None: + return [] + + # collecting result dtypes in order of the assignees + if -1 in arg_id_to_dtype and arg_id_to_dtype[-1] is not None: + if return_tuple: + return [get_return_types_as_tuple(arg_id_to_dtype)] + else: + return [arg_id_to_dtype[-1]] + + return [] + + def map_variable(self, expr): + if expr.name in self.kernel.all_inames(): + return [self.kernel.index_dtype] + + result = self.kernel.mangle_symbol( + self.kernel.target.get_device_ast_builder(), + expr.name) + + if result is not None: + result_dtype, _ = result + return [result_dtype] + + obj = self.new_assignments.get(expr.name) + + if obj is None: + obj = self.kernel.arg_dict.get(expr.name) + + if obj is None: + obj = self.kernel.temporary_variables.get(expr.name) + + if obj is None: + raise TypeInferenceFailure("name not known in type inference: %s" + % expr.name) + + from loopy.kernel.data import TemporaryVariable, KernelArgument + import loopy as lp + if isinstance(obj, (KernelArgument, TemporaryVariable)): + assert obj.dtype is not lp.auto + result = [obj.dtype] + if result[0] is None: + raise DependencyTypeInferenceFailure( + ", ".join(sorted(expr.name))) + else: + return result + + else: + raise RuntimeError("unexpected type inference " + "object type for '%s'" % expr.name) + + map_call_with_kwargs = map_call + # }}} # {{{ infer single variable def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): + if var_name in kernel.all_params(): - return [kernel.index_dtype], [] + return [kernel.index_dtype], [], {}, ( + 
type_inf_mapper.clbl_inf_ctx) from functools import partial debug = partial(_debug, kernel) @@ -451,11 +775,15 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander): dtype_sets.append(result) if not dtype_sets: - return None, type_inf_mapper.symbols_with_unknown_types + return ( + None, type_inf_mapper.symbols_with_unknown_types, None, + type_inf_mapper.clbl_inf_ctx) result = type_inf_mapper.combine(dtype_sets) - return result, type_inf_mapper.symbols_with_unknown_types + return (result, type_inf_mapper.symbols_with_unknown_types, + type_inf_mapper.old_calls_to_new_calls, + type_inf_mapper.clbl_inf_ctx) # }}} @@ -482,7 +810,7 @@ class _DictUnionView: # {{{ infer_unknown_types -def infer_unknown_types(kernel, expect_completion=False): +def infer_unknown_types_for_a_single_kernel(kernel, clbl_inf_ctx): """Infer types on temporaries and arguments.""" logger.debug("%s: infer types" % kernel.name) @@ -544,7 +872,8 @@ def infer_unknown_types(kernel, expect_completion=False): new_temp_vars, new_arg_dict ]) - type_inf_mapper = TypeInferenceMapper(kernel, item_lookup) + type_inf_mapper = TypeInferenceMapper(kernel, clbl_inf_ctx, + item_lookup) from loopy.symbolic import SubstitutionRuleExpander subst_expander = SubstitutionRuleExpander(kernel.substitutions) @@ -553,6 +882,8 @@ def infer_unknown_types(kernel, expect_completion=False): from loopy.kernel.data import TemporaryVariable, KernelArgument + old_calls_to_new_calls = {} + for var_chain in sccs: changed_during_last_queue_run = False queue = var_chain[:] @@ -575,10 +906,15 @@ def infer_unknown_types(kernel, expect_completion=False): item = item_lookup[name] debug("inferring type for %s %s", type(item).__name__, item.name) - - result, symbols_with_unavailable_types = ( - _infer_var_type( - kernel, item.name, type_inf_mapper, subst_expander)) + try: + (result, symbols_with_unavailable_types, + new_old_calls_to_new_calls, clbl_inf_ctx) = ( + _infer_var_type( + kernel, item.name, type_inf_mapper, subst_expander)) + except DependencyTypeInferenceFailure: + result = tuple() + type_inf_mapper = type_inf_mapper.copy( + clbl_inf_ctx=clbl_inf_ctx) failed = not result if not failed: @@ -597,6 +933,7 @@ def infer_unknown_types(kernel, expect_completion=False): new_arg_dict[name] = item.copy(dtype=new_dtype) else: raise LoopyError("unexpected item type in type inference") + old_calls_to_new_calls.update(new_old_calls_to_new_calls) else: debug(" failure") @@ -609,14 +946,10 @@ def infer_unknown_types(kernel, expect_completion=False): " (need type of '%s'--check for missing arguments)" % ", ".join(symbols_with_unavailable_types)) - if expect_completion: - raise LoopyError( - "could not determine type of '%s'%s" - % (item.name, advice)) - - else: - # We're done here. - break + debug("could not determine type of '%s'%s" + % (item.name, advice)) + # We're done here + break # remember that this item failed failed_names.add(item.name) @@ -624,7 +957,6 @@ def infer_unknown_types(kernel, expect_completion=False): if set(queue) == failed_names: # We did what we could... print(queue, failed_names, item.name) - assert not expect_completion break # can't infer type yet, put back into queue @@ -635,23 +967,134 @@ def infer_unknown_types(kernel, expect_completion=False): # }}} + # FIXME: copy the explanation from make_function_ready_for_codegen + # here. 
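+    # (In brief: an instruction whose assignees all carry known types is
+    # never visited by the per-variable inference above, so the walk below
+    # revisits its expression once, purely so that any function calls in it
+    # still get resolved and type-specialized.)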
+ + # {{{ check if insn missed during type inference + + def _instruction_missed_during_inference(insn): + for assignee in insn.assignees: + if isinstance(assignee, Lookup): + assignee = assignee.aggregate + + if isinstance(assignee, Variable): + if assignee.name in kernel.arg_dict: + if kernel.arg_dict[assignee.name].dtype is None: + return False + else: + assert assignee.name in kernel.temporary_variables + if kernel.temporary_variables[assignee.name].dtype is None: + return False + + elif isinstance(assignee, (Subscript, LinearSubscript)): + if assignee.aggregate.name in kernel.arg_dict: + if kernel.arg_dict[assignee.aggregate.name].dtype is None: + return False + else: + assert assignee.aggregate.name in kernel.temporary_variables + if kernel.temporary_variables[ + assignee.aggregate.name].dtype is None: + return False + else: + assert isinstance(assignee, SubArrayRef) + if assignee.subscript.aggregate.name in kernel.arg_dict: + if kernel.arg_dict[ + assignee.subscript.aggregate.name].dtype is None: + return False + else: + assert assignee.subscript.aggregate.name in ( + kernel.temporary_variables) + if kernel.temporary_variables[ + assignee.subscript.aggregate.name] is None: + return False + + return True + + # }}} + + for insn in kernel.instructions: + if isinstance(insn, lp.MultiAssignmentBase): + # just a dummy run over the expression, to pass over all the + # functions + if _instruction_missed_during_inference(insn): + type_inf_mapper(insn.expression, + return_tuple=len(insn.assignees) != 1, + return_dtype_set=True) + elif isinstance(insn, (_DataObliviousInstruction, + lp.CInstruction)): + pass + else: + raise NotImplementedError("Unknown instructions type %s." % ( + type(insn).__name__)) + + clbl_inf_ctx = type_inf_mapper.clbl_inf_ctx + old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls) + end_time = time.time() logger.debug("type inference took {dur:.2f} seconds".format( dur=end_time - start_time)) - return unexpanded_kernel.copy( + pre_type_specialized_knl = unexpanded_kernel.copy( temporary_variables=new_temp_vars, args=[new_arg_dict[arg.name] for arg in kernel.args], ) + type_specialized_kernel = change_names_of_pymbolic_calls( + pre_type_specialized_knl, old_calls_to_new_calls) + + return type_specialized_kernel, clbl_inf_ctx + + +def infer_unknown_types(program, expect_completion=False): + """Infer types on temporaries and arguments.""" + from loopy.kernel.data import auto + from loopy.program import resolve_callables + + program = resolve_callables(program) + + clbl_inf_ctx = make_clbl_inf_ctx(program.callables_table, + program.entrypoints) + + renamed_entrypoints = set() + + for e in program.entrypoints: + logger.debug(f"Entering entrypoint: {e}") + arg_id_to_dtype = {arg.name: arg.dtype for arg in + program[e].args if arg.dtype not in (None, auto)} + new_callable, clbl_inf_ctx = program.callables_table[e].with_types( + arg_id_to_dtype, clbl_inf_ctx) + clbl_inf_ctx, new_name = clbl_inf_ctx.with_callable(e, new_callable) + renamed_entrypoints.add(new_name.name) + + if expect_completion: + from loopy.types import LoopyType + new_knl = new_callable.subkernel + + args_not_inferred = {arg.name + for arg in new_knl.args + if not isinstance(arg.dtype, LoopyType)} + + tvs_not_inferred = {tv.name + for tv in new_knl.temporary_variables.values() + if not isinstance(tv.dtype, LoopyType)} + + vars_not_inferred = tvs_not_inferred | args_not_inferred + + if vars_not_inferred: + if expect_completion: + raise LoopyError("could not determine type of" + f" 
'{vars_not_inferred.pop()}' of kernel '{e}'.")
+
+    return clbl_inf_ctx.finish_program(program, renamed_entrypoints)
+
 # }}}
 
 
 # {{{ reduction expression helper
 
 def infer_arg_and_reduction_dtypes_for_reduction_expression(
-        kernel, expr, unknown_types_ok):
-    type_inf_mapper = TypeInferenceMapper(kernel)
+        kernel, expr, callables_table, unknown_types_ok):
+    type_inf_mapper = TypeReader(kernel, callables_table)
     import loopy as lp
 
     if expr.is_tuple_typed:
@@ -676,7 +1119,7 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression(
         raise LoopyError("failed to determine type of accumulator for "
                 "reduction '%s'" % expr)
 
-    reduction_dtypes = expr.operation.result_dtypes(kernel, *arg_dtypes)
+    reduction_dtypes = expr.operation.result_dtypes(*arg_dtypes)
 
     reduction_dtypes = tuple(
             dt.with_target(kernel.target)
             if dt is not lp.auto else dt
diff --git a/loopy/types.py b/loopy/types.py
index de6208476e270eb5aab2595f05def4f771bcf901..2457049073eab8c73202e324514526097b56c4d1 100644
--- a/loopy/types.py
+++ b/loopy/types.py
@@ -193,6 +193,45 @@ class AtomicNumpyType(NumpyType, AtomicType):
 # }}}
 
 
+# {{{ opaque type
+
+class OpaqueType(LoopyType):
+    """An opaque data type is truly opaque: no storage of this type may be
+    allocated, no temporaries of this type may be created, and so on. The
+    only permitted use is for a value to be passed in through one ValueArg
+    and out through another. It is introduced to accommodate calls to
+    external library functions.
+    """
+    def __init__(self, name):
+        assert isinstance(name, str)
+        self.name = name
+        self.target = None
+
+    def is_integral(self):
+        return False
+
+    def is_complex(self):
+        return False
+
+    def involves_complex(self):
+        return False
+
+    def update_persistent_hash(self, key_hash, key_builder):
+        key_builder.rec(key_hash, self.name)
+
+    def __hash__(self):
+        return hash(self.name)
+
+    def __eq__(self, other):
+        return (
+                type(self) == type(other)
+                and self.name == other.name)
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+# }}}
+
+
 def to_loopy_type(dtype, allow_auto=False, allow_none=False, for_atomic=False,
         target=None):
     from loopy.kernel.data import auto
diff --git a/setup.py b/setup.py
index 57f5e895d0fdd39e3be7e521c23c1ad3324da08e..4f56bc367f08308d46cdfde4e9ee0efa6f1f7ccf 100644
--- a/setup.py
+++ b/setup.py
@@ -92,6 +92,7 @@ setup(name="loopy",
           "codepy>=2017.1",
           "colorama",
           "Mako",
+          "pyrsistent",
       ],
 
       extras_require={
diff --git a/test/library_for_test.py b/test/library_for_test.py
index 2cb4067e0acd6f4a88ff166e0fd460ec925585f2..cfaacdc0ef2df0a76209398dac1cde7a40a1b336 100644
--- a/test/library_for_test.py
+++ b/test/library_for_test.py
@@ -1,23 +1,61 @@
-# This exists because function handles can't be pickled.
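+"""Callables used by the test suite. (Like the function manglers they
+replace, these are kept in a module of their own so that kernels
+referencing them stay picklable.)"""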
+import loopy as lp +import numpy as np -def no_ret_f_mangler(kernel, name, arg_dtypes): - if not isinstance(name, str): - return None +class NoRetFunction(lp.ScalarCallable): + def with_types(self, arg_id_to_dtype, callables): + if len(arg_id_to_dtype) != 0: + raise RuntimeError("'f' cannot take any inputs.") - if (name == "f" and len(arg_dtypes) == 0): - from loopy.kernel.data import CallMangleInfo - return CallMangleInfo( - target_name="f", - result_dtypes=arg_dtypes, - arg_dtypes=arg_dtypes) + return (self.copy(arg_id_to_dtype=arg_id_to_dtype, + name_in_target="f"), + callables) + def with_descrs(self, arg_id_to_descr, callables): + if len(arg_id_to_descr) != 0: + raise RuntimeError("'f' cannot take any inputs.") -def no_ret_f_preamble_gen(preamble_info): - yield ("10_define_f", - r""" - void f() - { - printf("Hi!\n"); - } - """) + return (self.copy(arg_id_to_descr=arg_id_to_descr), + callables) + + def generate_preambles(self, target): + assert isinstance(target, lp.CFamilyTarget) + yield ("10_define_f", + r""" + void f() + { + printf("Hi!\n"); + } + """) + + +class SingleArgNoRetFunction(lp.ScalarCallable): + def with_types(self, arg_id_to_dtype, callables): + input_dtype = arg_id_to_dtype.get(0) + if input_dtype is None: + return self, callables + + if input_dtype.numpy_dtype != np.float32: + raise RuntimeError("'f' only supports f32.") + + return (self.copy(arg_id_to_dtype=arg_id_to_dtype, + name_in_target="f"), + callables) + + def with_descrs(self, arg_id_to_descr, callables): + if len(arg_id_to_descr) != 0: + raise RuntimeError("'f' cannot take any inputs.") + + return (self.copy(arg_id_to_descr=arg_id_to_descr), + callables) + + def generate_preambles(self, target): + assert isinstance(target, lp.CFamilyTarget) + + yield ("10_define_f", + r""" + void f(float x) + { + printf("Hi!\n"); + } + """) diff --git a/test/test_apps.py b/test/test_apps.py index 56f4127ac6be827afda8bd41b6e87ee6d5e774dc..6e49e73fafae569411ad68fb8fefd24b5315087f 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -217,7 +217,8 @@ def test_rob_stroud_bernstein(ctx_factory): lp.GlobalArg("coeffs", None, shape=None), "..." ], - assumptions="deg>=0 and nels>=1" + assumptions="deg>=0 and nels>=1", + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.fix_parameters(knl, nqp1d=7, deg=4) @@ -225,13 +226,12 @@ def test_rob_stroud_bernstein(ctx_factory): knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp", slabs=(0, 1)) knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr")) - - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( + knl = lp.add_dtypes(knl, dict( qpts=np.float32, coeffs=np.float32, tmp=np.float32, - ))) + )) + print(lp.generate_code_v2(knl)) def test_rob_stroud_bernstein_full(ctx_factory): @@ -297,7 +297,8 @@ def test_rob_stroud_bernstein_full(ctx_factory): lp.GlobalArg("coeffs", None, shape=None), "..." 
], - assumptions="deg>=0 and nels>=1" + assumptions="deg>=0 and nels>=1", + target=lp.PyOpenCLTarget(ctx.devices[0]) ) knl = lp.fix_parameters(knl, nqp1d=7, deg=4) @@ -311,14 +312,14 @@ def test_rob_stroud_bernstein_full(ctx_factory): from pickle import dumps, loads knl = loads(dumps(knl)) - knl = lp.CompiledKernel(ctx, knl).get_highlighted_code( + knl = lp.add_dtypes(knl, dict( qpts=np.float32, tmp=np.float32, coeffs=np.float32, result=np.float32, )) - print(knl) + print(lp.generate_code_v2(knl)) def test_stencil(ctx_factory): @@ -661,9 +662,10 @@ def test_domain_tree_nesting(): TV("num_vals_offset", initializer=num_vals_offset, read_only=True, address_space=AS.PRIVATE), lp.GlobalArg("B", shape=(100, 31), dtype=np.float64), - lp.GlobalArg("out", shape=(100, 12), dtype=np.float64)]) + lp.GlobalArg("out", shape=(100, 12), dtype=np.float64)], + name="nested_domain") - parents_per_domain = knl.parents_per_domain() + parents_per_domain = knl["nested_domain"].parents_per_domain() def depth(i): if parents_per_domain[i] is None: diff --git a/test/test_c_execution.py b/test/test_c_execution.py index a204859fff57e4806ac9ebd8204acded021512ac..1c79241cfe4f78f574655c230fa1c393d2c4b51e 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -111,11 +111,12 @@ def test_c_target_strides_nonsquare(): lp.GlobalArg("a", np.float32, shape=sizes, order=order), "..." ], - target=ExecutableCTarget()) + target=ExecutableCTarget(), + name="nonsquare_strides") # test with C-order knl = __get_kernel("C") - a_lp = next(x for x in knl.args if x.name == "a") + a_lp = next(x for x in knl["nonsquare_strides"].args if x.name == "a") a_np = np.reshape(np.arange(np.product(a_lp.shape), dtype=np.float32), a_lp.shape, order="C") @@ -125,7 +126,7 @@ def test_c_target_strides_nonsquare(): # test with F-order knl = __get_kernel("F") - a_lp = next(x for x in knl.args if x.name == "a") + a_lp = next(x for x in knl["nonsquare_strides"].args if x.name == "a") a_np = np.reshape(np.arange(np.product(a_lp.shape), dtype=np.float32), a_lp.shape, order="F") diff --git a/test/test_callables.py b/test/test_callables.py new file mode 100644 index 0000000000000000000000000000000000000000..ef22b163294793d86478a2fa0e3a913cfdeb6382 --- /dev/null +++ b/test/test_callables.py @@ -0,0 +1,768 @@ +__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import numpy as np +import pyopencl as cl +import pyopencl.clrandom # noqa: F401 +import loopy as lp +import pytest +import sys + + +from pyopencl.tools import ( # noqa: F401 + pytest_generate_tests_for_pyopencl + as pytest_generate_tests) + +from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa: F401 + + +def test_register_function_lookup(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + from testlib import Log2Callable + + x = np.random.rand(10) + queue = cl.CommandQueue(ctx) + + prog = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[i] = log2(x[i]) + """) + prog = lp.register_callable(prog, "log2", Log2Callable("log2")) + + evt, (out, ) = prog(queue, x=x) + + assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 4 + + x = np.random.rand(n, n, n, n, n) + y = np.random.rand(n, n, n, n, n) + + grandchild_knl = lp.make_function( + "{[i, j]:0<= i, j< 4}", + """ + c[i, j] = 2*a[i, j] + 3*b[i, j] + """, name="linear_combo1") + + child_knl = lp.make_function( + "{[i, j]:0<=i, j < 4}", + """ + [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j]) + """, name="linear_combo2") + + parent_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}", + """ + [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m], + [j, l]: y[i, j, k, l, m]) + """, + kernel_data=[ + lp.GlobalArg( + name="x, y", + dtype=np.float64, + shape=(n, n, n, n, n)), + ...] + ) + + knl = lp.merge([grandchild_knl, child_knl, parent_knl]) + + if inline: + knl = lp.inline_callable_kernel(knl, "linear_combo2") + knl = lp.inline_callable_kernel(knl, "linear_combo1") + + evt, (out, ) = knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out)/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_slices_with_negative_step(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + n = 4 + + x = np.random.rand(n, n, n, n, n) + y = np.random.rand(n, n, n, n, n) + + child_knl = lp.make_function( + "{[i, j]:0<=i, j < 4}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """, name="linear_combo") + + parent_knl = lp.make_kernel( + "{[i, k, m]: 0<=i, k, m<4}", + """ + z[i, 3:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m], + y[i, :, k, :, m]) + """, + kernel_data=[ + lp.GlobalArg( + name="x, y, z", + dtype=np.float64, + shape=(n, n, n, n, n)), + ...] 
+ ) + + knl = lp.merge([parent_knl, child_knl]) + if inline: + knl = lp.inline_callable_kernel(knl, "linear_combo") + + evt, (out, ) = knl(queue, x=x, y=y) + + assert (np.linalg.norm(2*x+3*y-out[:, ::-1, :, :, :])/( + np.linalg.norm(2*x+3*y))) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl_with_call_with_kwargs(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 4 + + a_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) + b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32) + c_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + + callee_knl = lp.make_function( + "{[i, j]:0<=i, j < %d}" % n, + """ + h[i, j] = 2 * e[i, j] + 3*f[i, j] + 4*g[i, j] + <>f1[i, j] = 2*f[i, j] + p[i, j] = 7 * e[i, j] + 4*f1[i, j] + 2*g[i, j] + """, + [ + lp.GlobalArg("f, e, h, g"), ...], + name="linear_combo") + + caller_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<%d}" % n, + """ + <> d[i, j, k, l, m] = 2*b[i, j, k, l, m] + [j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m] = linear_combo( + f=[j, l]: a[i, j, k, l, m], + g=[j, l]: d[i, j, k, l, m], + e=[j, l]: c[i, j, k, l, m]) + """) + + knl = lp.merge([caller_knl, callee_knl]) + if inline: + knl = lp.inline_callable_kernel(knl, "linear_combo") + + evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev) + + a = a_dev.get() + b = b_dev.get() + c = c_dev.get() + + h = out1.get() # h = 2c + 3a + 8b + p = out2.get() # p = 7c + 8a + 4b + h_exact = 3*a + 8*b + 2*c + p_exact = 8*a + 4*b + 7*c + + assert np.linalg.norm(h-h_exact)/np.linalg.norm(h_exact) < 1e-7 + assert np.linalg.norm(p-p_exact)/np.linalg.norm(p_exact) < 1e-7 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_register_knl_with_hw_axes(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 4 + + x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64) + + callee_knl = lp.make_function( + "{[i, j]:0<=i, j < 4}", + """ + g[i, j] = 2*e[i, j] + 3*f[i, j] + """, name="linear_combo") + + callee_knl = lp.split_iname(callee_knl, "i", 1, inner_tag="l.0", outer_tag="g.0") + + caller_knl = lp.make_kernel( + "{[i, j, k, l, m]: 0<=i, j, k, l, m<4}", + """ + [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m], + [j, l]: y[i, j, k, l, m]) + """, name="caller") + caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1") + + knl = lp.merge([caller_knl, callee_knl]) + + knl = lp.set_options(knl, "return_dict") + + if inline: + knl = lp.inline_callable_kernel(knl, "linear_combo") + + evt, out = knl(queue, x=x_dev, y=y_dev) + + x_host = x_dev.get() + y_host = y_dev.get() + + assert np.linalg.norm(2*x_host+3*y_host-out["z"].get())/np.linalg.norm( + 2*x_host+3*y_host) < 1e-15 + + +@pytest.mark.parametrize("inline", [False, True]) +def test_shape_translation_through_sub_array_ref(ctx_factory, inline): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64) + x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64) + x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64) + + callee1 = lp.make_function( + "{[i]: 0<=i<6}", + """ + b[i] = 2*abs(a[i]) + """, name="callee_fn1") + + callee2 = lp.make_function( + "{[i, j]: 0<=i<3 and 0 <= j < 2}", + """ + b[i, j] = 3*a[i, j] + """, name="callee_fn2") + + callee3 = lp.make_function( + "{[i]: 0<=i<6}", + """ + b[i] = 5*a[i] + """, name="callee_fn3") + + knl = lp.make_kernel( + 
"{[i, j, k, l]: 0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}", + """ + [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2]) + [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k]) + [l]: y3[l, l] = callee_fn3([l]: x3[l, l]) + """) + + knl = lp.merge([knl, callee1]) + knl = lp.merge([knl, callee2]) + knl = lp.merge([knl, callee3]) + + if inline: + knl = lp.inline_callable_kernel(knl, "callee_fn1") + knl = lp.inline_callable_kernel(knl, "callee_fn2") + knl = lp.inline_callable_kernel(knl, "callee_fn3") + + knl = lp.set_options(knl, "write_cl") + knl = lp.set_options(knl, "return_dict") + evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3) + + y1 = out_dict["y1"].get() + y2 = out_dict["y2"].get() + y3 = out_dict["y3"].get() + + assert (np.linalg.norm(y1-2*x1.get())) < 1e-15 + assert (np.linalg.norm(y2-3*x2.get())) < 1e-15 + assert (np.linalg.norm(np.diag(y3-5*x3.get()))) < 1e-15 + + +def test_multi_arg_array_call(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + import pymbolic.primitives as p + n = 10 + acc_i = p.Variable("acc_i") + i = p.Variable("i") + index = p.Variable("index") + a_i = p.Subscript(p.Variable("a"), p.Variable("i")) + argmin_kernel = lp.make_function( + "{[i]: 0 <= i < n}", + [ + lp.Assignment(id="init2", assignee=index, + expression=0), + lp.Assignment(id="init1", assignee=acc_i, + expression="214748367"), + lp.Assignment(id="insn", assignee=index, + expression=p.If(p.Expression.eq(acc_i, a_i), i, index), + depends_on="update"), + lp.Assignment(id="update", assignee=acc_i, + expression=p.Variable("min")(acc_i, a_i), + depends_on="init1,init2")], + [ + lp.GlobalArg("a"), + lp.GlobalArg("acc_i, index", is_input=False, is_output=True, + shape=lp.auto), + ...], + name="custom_argmin") + + argmin_kernel = lp.fix_parameters(argmin_kernel, n=n) + + knl = lp.make_kernel( + "{[i]:0<=i 1: + exec(sys.argv[1]) + else: + from pytest import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_diff.py b/test/test_diff.py index 8af2a2b057a52ef6e122ffa65caf85d777ccbbb1..c1bfd9093a09cd9c1f265eb5895b3c677bdb37bf 100644 --- a/test/test_diff.py +++ b/test/test_diff.py @@ -58,12 +58,15 @@ def test_diff(ctx_factory): """ <> a = 1/(1+sinh(x[i] + y[j])**2) z[i] = sum(j, exp(a * x[j])) - """) + """, name="diff") knl = lp.fix_parameters(knl, n=50) from loopy.transform.diff import diff_kernel - dknl, diff_map = diff_kernel(knl, "z", "x") + #FIXME Is this the correct interface. Does it make sense to take the entire + #translation unit? 
+    dknl, diff_map = diff_kernel(knl["diff"], "z", "x")
+    dknl = knl.with_kernel(dknl)
 
     dknl = lp.remove_unused_arguments(dknl)
 
     dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a")
diff --git a/test/test_domain.py b/test/test_domain.py
index 6a0d9f255faefc1e1e3e8fbd8c8f745b058ff1b9..03f1bbc2f538b03af8e7beb6b69d4132c99448e9 100644
--- a/test/test_domain.py
+++ b/test/test_domain.py
@@ -56,20 +56,15 @@ def test_assume(ctx_factory):
     knl = lp.make_kernel(
             "{[i]: 0<=i<n}",
             "a[i] = a[i] + 1",
             [lp.GlobalArg("a", np.float32, shape="n"), "..."])
 
     knl = lp.split_iname(knl, "i", 16)
     knl = lp.prioritize_loops(knl, "i_outer,i_inner")
     knl = lp.assume(knl, "n mod 16 = 0")
     knl = lp.assume(knl, "n > 10")
 
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    for gen_knl in kernel_gen:
-        print(gen_knl)
-        compiled = lp.CompiledKernel(ctx, gen_knl)
-        print(compiled.get_code())
-        assert "if" not in compiled.get_code()
+    code = lp.generate_code_v2(knl).device_code()
+    assert "if" not in code
 
 
 def test_divisibility_assumption(ctx_factory):
@@ -85,16 +80,14 @@ def test_divisibility_assumption(ctx_factory):
             lp.GlobalArg("b", np.float32, shape=("n",)),
             lp.ValueArg("n", np.int32),
             ],
-            assumptions="n>=1 and (exists zz: n = 16*zz)")
+            assumptions="n>=1 and (exists zz: n = 16*zz)",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     ref_knl = knl
 
     knl = lp.split_iname(knl, "i", 16)
-
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    for k in lp.generate_loop_schedules(knl):
-        code = lp.generate_code(k)
-        assert "if" not in code
+    code = lp.generate_code_v2(knl).device_code()
+    assert "if" not in code
 
     lp.auto_test_vs_ref(ref_knl, ctx, knl,
             parameters={"n": 16**3})
@@ -113,16 +106,12 @@ def test_eq_constraint(ctx_factory):
         [
             lp.GlobalArg("a", np.float32, shape=(1000,)),
             lp.GlobalArg("b", np.float32, shape=(1000,))
-            ])
+            ],
+        target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     knl = lp.split_iname(knl, "i", 16, outer_tag="g.0")
     knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0")
-
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    for knl in kernel_gen:
-        print(lp.generate_code(knl))
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_dependent_loop_bounds(ctx_factory):
@@ -145,12 +134,10 @@ def test_dependent_loop_bounds(ctx_factory):
             lp.GlobalArg("a_sum", dtype, shape=lp.auto),
             lp.ValueArg("n", np.int32),
             ],
-            assumptions="n>=1 and row_len>=1")
+            assumptions="n>=1 and row_len>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
-    cknl = lp.CompiledKernel(ctx, knl)
-    print("---------------------------------------------------")
-    print(cknl.get_highlighted_code())
-    print("---------------------------------------------------")
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_dependent_loop_bounds_2(ctx_factory):
@@ -174,14 +161,13 @@ def test_dependent_loop_bounds_2(ctx_factory):
             lp.GlobalArg("ax", dtype, shape=lp.auto),
             lp.ValueArg("n", np.int32),
             ],
-            assumptions="n>=1 and row_len>=1")
+            assumptions="n>=1 and row_len>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     knl = lp.split_iname(knl, "i", 128, outer_tag="g.0",
             inner_tag="l.0")
-    cknl = lp.CompiledKernel(ctx, knl)
-    print("---------------------------------------------------")
-    print(cknl.get_highlighted_code())
-    print("---------------------------------------------------")
+
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_dependent_loop_bounds_3(ctx_factory):
@@ -206,25 +192,22 @@ def test_dependent_loop_bounds_3(ctx_factory):
             lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
             lp.GlobalArg("a", dtype, shape=("n,n"), order="C"),
             lp.ValueArg("n", np.int32),
-            ])
+            ],
+            target=lp.PyOpenCLTarget(ctx.devices[0]),
+            name="loopy_kernel")
 
-    assert
knl.parents_per_domain()[1] == 0 + assert knl["loopy_kernel"].parents_per_domain()[1] == 0 knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") - cknl = lp.CompiledKernel(ctx, knl) - print("---------------------------------------------------") - print(cknl.get_highlighted_code()) - print("---------------------------------------------------") + print(lp.generate_code_v2(knl).device_code()) knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1", inner_tag="l.1") - knl = lp.preprocess_kernel(knl, ctx.devices[0]) - with pytest.raises(RuntimeError): - list(lp.generate_loop_schedules(knl_bad)) + list(lp.generate_code_v2(knl_bad)) def test_dependent_loop_bounds_4(): @@ -280,17 +263,17 @@ def test_independent_multi_domain(ctx_factory): lp.GlobalArg("a", dtype, shape=("n"), order="C"), lp.GlobalArg("b", dtype, shape=("n"), order="C"), lp.ValueArg("n", np.int32), - ]) + ], + name="loopy_kernel") knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.0") knl = lp.split_iname(knl, "j", 16, outer_tag="g.0", inner_tag="l.0") - assert knl.parents_per_domain() == 2*[None] + assert knl["loopy_kernel"].parents_per_domain() == 2*[None] n = 50 - cknl = lp.CompiledKernel(ctx, knl) - evt, (a, b) = cknl(queue, n=n, out_host=True) + evt, (a, b) = knl(queue, n=n, out_host=True) assert a.shape == (50,) assert b.shape == (50,) @@ -394,10 +377,11 @@ def test_triangle_domain(ctx_factory): knl = lp.make_kernel( "{[i,j]: 0<=i,j 1e-15 + assert abs_err < 1e-6 + + def test_fill(ctx_factory): fortran_src = """ subroutine fill(out, a, n) @@ -58,18 +149,18 @@ def test_fill(ctx_factory): !$loopy begin ! - ! fill, = lp.parse_fortran(SOURCE) + ! fill = lp.parse_fortran(SOURCE) ! fill = lp.split_iname(fill, "i", split_amount, ! outer_tag="g.0", inner_tag="l.0") - ! RESULT = [fill] + ! RESULT = fill ! 
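+      ! (everything between "!$loopy begin" and "!$loopy end" is Python:
+      ! parse_transformed_fortran runs it with the Fortran source bound
+      ! to SOURCE and expects the transformed result in RESULT)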
!$loopy end """ - knl, = lp.parse_transformed_fortran(fortran_src, + knl = lp.parse_transformed_fortran(fortran_src, pre_transform_code="split_amount = 128") - assert "i_inner" in knl.all_inames() + assert "i_inner" in knl["fill"].all_inames() ctx = ctx_factory() @@ -90,7 +181,7 @@ def test_fill_const(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ctx = ctx_factory() @@ -113,7 +204,7 @@ def test_asterisk_in_shape(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ctx = ctx_factory() queue = cl.CommandQueue(ctx) @@ -137,7 +228,7 @@ def test_assignment_to_subst(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ref_knl = knl @@ -164,7 +255,7 @@ def test_assignment_to_subst_two_defs(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ref_knl = knl @@ -192,15 +283,15 @@ def test_assignment_to_subst_indices(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) knl = lp.fix_parameters(knl, n=5) ref_knl = knl - assert "a" in knl.temporary_variables + assert "a" in knl["fill"].temporary_variables knl = lp.assignment_to_subst(knl, "a") - assert "a" not in knl.temporary_variables + assert "a" not in knl["fill"].temporary_variables ctx = ctx_factory() lp.auto_test_vs_ref(ref_knl, ctx, knl) @@ -229,7 +320,7 @@ def test_if(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) ref_knl = knl @@ -263,7 +354,7 @@ def test_tagged(ctx_factory): end """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) assert sum(1 for insn in lp.find_instructions(knl, "tag:input")) == 2 @@ -297,34 +388,34 @@ def test_matmul(ctx_factory, buffer_inames): end subroutine """ - knl, = lp.parse_fortran(fortran_src) + prog = lp.parse_fortran(fortran_src) - assert len(knl.domains) == 1 + assert len(prog["dgemm"].domains) == 1 - ref_knl = knl + ref_prog = prog - knl = lp.split_iname(knl, "i", 16, + prog = lp.split_iname(prog, "i", 16, outer_tag="g.0", inner_tag="l.1") - knl = lp.split_iname(knl, "j", 8, + prog = lp.split_iname(prog, "j", 8, outer_tag="g.1", inner_tag="l.0") - knl = lp.split_iname(knl, "k", 32) - knl = lp.assume(knl, "n mod 32 = 0") - knl = lp.assume(knl, "m mod 32 = 0") - knl = lp.assume(knl, "ell mod 16 = 0") - - knl = lp.extract_subst(knl, "a_acc", "a[i1,i2]", parameters="i1, i2") - knl = lp.extract_subst(knl, "b_acc", "b[i1,i2]", parameters="i1, i2") - knl = lp.precompute(knl, "a_acc", "k_inner,i_inner", + prog = lp.split_iname(prog, "k", 32) + prog = lp.assume(prog, "n mod 32 = 0") + prog = lp.assume(prog, "m mod 32 = 0") + prog = lp.assume(prog, "ell mod 16 = 0") + + prog = lp.extract_subst(prog, "a_acc", "a[i1,i2]", parameters="i1, i2") + prog = lp.extract_subst(prog, "b_acc", "b[i1,i2]", parameters="i1, i2") + prog = lp.precompute(prog, "a_acc", "k_inner,i_inner", precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") - knl = lp.precompute(knl, "b_acc", "j_inner,k_inner", + prog = lp.precompute(prog, "b_acc", "j_inner,k_inner", precompute_outer_inames="i_outer, j_outer, k_outer", default_tag="l.auto") - knl = lp.buffer_array(knl, "c", buffer_inames=buffer_inames, + prog = lp.buffer_array(prog, "c", buffer_inames=buffer_inames, init_expression="0", store_expression="base+buffer") - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128)) + 
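+    # auto_test_vs_ref generates code for both programs, runs them, and
+    # compares the two results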
lp.auto_test_vs_ref(ref_prog, ctx, prog, parameters=dict(n=128, m=128, ell=128)) @pytest.mark.xfail @@ -362,7 +453,7 @@ def test_batched_sparse(): """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) knl = lp.split_iname(knl, "i", 128) knl = lp.tag_inames(knl, {"i_outer": "g.0"}) @@ -406,18 +497,19 @@ def test_fuse_kernels(ctx_factory): result(e,i,j) = prev + d(i,k)*q(e,k,j) """ - xderiv, = lp.parse_fortran( + xderiv = lp.parse_fortran( fortran_template.format(inner=xd_line, name="xderiv")) - yderiv, = lp.parse_fortran( + yderiv = lp.parse_fortran( fortran_template.format(inner=yd_line, name="yderiv")) - xyderiv, = lp.parse_fortran( + xyderiv = lp.parse_fortran( fortran_template.format( inner=(xd_line + "\n" + yd_line), name="xyderiv")) - knl = lp.fuse_kernels((xderiv, yderiv), data_flow=[("result", 0, 1)]) - knl = lp.prioritize_loops(knl, "e,i,j,k") + knl = lp.fuse_kernels((xderiv["xderiv"], yderiv["yderiv"]), + data_flow=[("result", 0, 1)]) + knl = knl.with_kernel(lp.prioritize_loops(knl["xderiv_and_yderiv"], "e,i,j,k")) - assert len(knl.temporary_variables) == 2 + assert len(knl["xderiv_and_yderiv"].temporary_variables) == 2 ctx = ctx_factory() lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4)) @@ -449,15 +541,17 @@ def test_parse_and_fuse_two_kernels(): !$loopy begin ! - ! fill, twice = lp.parse_fortran(SOURCE) + ! prg = lp.parse_fortran(SOURCE) + ! fill = prg["fill"] + ! twice = prg["twice"] ! knl = lp.fuse_kernels((fill, twice)) ! print(knl) - ! RESULT = [knl] + ! RESULT = knl ! !$loopy end """ - knl, = lp.parse_transformed_fortran(fortran_src) + lp.parse_transformed_fortran(fortran_src) def test_precompute_some_exist(ctx_factory): @@ -477,9 +571,9 @@ def test_precompute_some_exist(ctx_factory): end subroutine """ - knl, = lp.parse_fortran(fortran_src) + knl = lp.parse_fortran(fortran_src) - assert len(knl.domains) == 1 + assert len(knl["dgemm"].domains) == 1 knl = lp.split_iname(knl, "i", 8, outer_tag="g.0", inner_tag="l.1") @@ -507,6 +601,53 @@ def test_precompute_some_exist(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=128, m=128, ell=128)) +def test_fortran_subroutines(): + fortran_src = """ + subroutine twice(n, a) + implicit none + real*8 a(n) + integer i,n + + do i=1,n + a(i) = a(i) * 2 + end do + end subroutine + + subroutine twice_cross(n, a, i) + implicit none + integer i, n + real*8 a(n,n) + + call twice(n, a(1:n, i)) + call twice(n, a(i, 1:n)) + end subroutine + """ + prg = lp.parse_fortran(fortran_src).with_entrypoints("twice_cross") + print(lp.generate_code_v2(prg).device_code()) + + +def test_domain_fusion_imperfectly_nested(): + fortran_src = """ + subroutine imperfect(n, m, a, b) + implicit none + integer i, j, n, m + real a(n), b(n,n) + + do i=1, n + a(i) = i + do j=1, m + b(i,j) = i*j + end do + end do + end subroutine + """ + + prg = lp.parse_fortran(fortran_src) + # If n > 0 and m == 0, a single domain would be empty, + # leading (incorrectly) to no assignments to 'a'. 
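+    # (e.g. a fused domain "{[i,j]: 0<=i<n and 0<=j<m}" becomes empty when
+    # m == 0, so even the a(i) = i assignments would be skipped)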
+    assert len(prg["imperfect"].domains) > 1
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_loopy.py b/test/test_loopy.py
index c5295397552c0462da06ff126814f456e3bdcc6e..1e728eefb9ecc6430438992bb23f34877635a6f0 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -92,7 +92,7 @@ def test_complicated_subst(ctx_factory):
 
     print(knl)
 
-    sr_keys = list(knl.substitutions.keys())
+    sr_keys = list(knl["loopy_kernel"].substitutions.keys())
     for letter, how_many in [
             ("f", 1),
             ("g", 1),
@@ -102,8 +102,10 @@ def test_complicated_subst(ctx_factory):
         assert substs_with_letter == how_many
 
 
-def test_type_inference_no_artificial_doubles():
-    knl = lp.make_kernel(
+def test_type_inference_no_artificial_doubles(ctx_factory):
+    ctx = ctx_factory()
+
+    prog = lp.make_kernel(
             "{[i]: 0<=i<n}",
             """
             <> bb = a[i] - b[i]
@@ -115,16 +117,15 @@ def test_type_inference_no_artificial_doubles(ctx_factory):
                 lp.GlobalArg("c", np.float32, shape=("n",)),
                 lp.ValueArg("n", np.int32),
                 ],
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
-    knl = lp.preprocess_kernel(knl)
-    for k in lp.generate_loop_schedules(knl):
-        code = lp.generate_code(k)
-        assert "double" not in code
+    code = lp.generate_code_v2(prog).device_code()
+    assert "double" not in code
 
 
 def test_type_inference_with_type_dependencies():
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
             "{[i]: i=0}",
             """
             <>a = 99
@@ -136,13 +137,17 @@ def test_type_inference_with_type_dependencies():
             <>d = b + 2 + 1j
             """,
             "...")
-    knl = lp.infer_unknown_types(knl)
+    prog = lp.infer_unknown_types(prog)
 
     from loopy.types import to_loopy_type
-    assert knl.temporary_variables["a"].dtype == to_loopy_type(np.int32)
-    assert knl.temporary_variables["b"].dtype == to_loopy_type(np.float32)
-    assert knl.temporary_variables["c"].dtype == to_loopy_type(np.float32)
-    assert knl.temporary_variables["d"].dtype == to_loopy_type(np.complex128)
+    assert prog["loopy_kernel"].temporary_variables["a"].dtype == to_loopy_type(
+            np.int32)
+    assert prog["loopy_kernel"].temporary_variables["b"].dtype == to_loopy_type(
+            np.float32)
+    assert prog["loopy_kernel"].temporary_variables["c"].dtype == to_loopy_type(
+            np.float32)
+    assert prog["loopy_kernel"].temporary_variables["d"].dtype == to_loopy_type(
+            np.complex128)
 
 
 def test_sized_and_complex_literals(ctx_factory):
@@ -176,16 +181,12 @@ def test_simple_side_effect(ctx_factory):
         """
         a[i] = a[i] + 1
         """,
-        [lp.GlobalArg("a", np.float32, shape=(100,))]
+        [lp.GlobalArg("a", np.float32, shape=(100,))],
+        target=lp.PyOpenCLTarget(ctx.devices[0])
         )
 
-    knl = lp.preprocess_kernel(knl)
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    for gen_knl in kernel_gen:
-        print(gen_knl)
-        compiled = lp.CompiledKernel(ctx, gen_knl)
-        print(compiled.get_code())
+    print(knl)
+    print(lp.generate_code_v2(knl))
 
 
 def test_owed_barriers(ctx_factory):
@@ -196,17 +197,14 @@ def test_owed_barriers(ctx_factory):
         [
             " z[i] = a[i]"
             ],
-        [lp.GlobalArg("a", np.float32, shape=(100,))]
+        [lp.GlobalArg("a", np.float32, shape=(100,))],
+        target=lp.PyOpenCLTarget(ctx.devices[0])
        )
 
     knl = lp.tag_inames(knl, dict(i="l.0"))
 
-    knl = lp.preprocess_kernel(knl)
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    for gen_knl in kernel_gen:
-        compiled = lp.CompiledKernel(ctx, gen_knl)
-        print(compiled.get_code())
+    print(knl)
+    print(lp.generate_code_v2(knl))
 
 
 def test_wg_too_small(ctx_factory):
@@ -218,17 +216,14 @@ def test_wg_too_small(ctx_factory):
             " z[i] = a[i] {id=copy}"
             ],
         [lp.GlobalArg("a", np.float32, shape=(100,))],
+        target=lp.PyOpenCLTarget(ctx.devices[0]),
         local_sizes={0: 16})
 
     knl = lp.tag_inames(knl, dict(i="l.0"))
 
-    knl = lp.preprocess_kernel(knl)
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    import pytest
-    for gen_knl in kernel_gen:
-        with pytest.raises(RuntimeError):
-            lp.CompiledKernel(ctx, gen_knl).get_code()
+    print(knl)
+    with pytest.raises(RuntimeError):
+        print(lp.generate_code_v2(knl))
 
 
 def test_multi_cse(ctx_factory):
@@ -240,17 +235,14 @@ def test_multi_cse(ctx_factory):
             " z[i] = a[i] + a[i]**2"
             ],
         [lp.GlobalArg("a", np.float32, shape=(100,))],
+        target=lp.PyOpenCLTarget(ctx.devices[0]),
         local_sizes={0: 16})
 
     knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
     knl = lp.add_prefetch(knl, "a", [])
 
-    knl = lp.preprocess_kernel(knl)
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    for gen_knl in kernel_gen:
-        compiled = lp.CompiledKernel(ctx, gen_knl)
-        print(compiled.get_code())
+    print(knl)
+    print(lp.generate_code_v2(knl))
 
 
 def test_bare_data_dependency(ctx_factory):
@@ -280,7 +272,9 @@ def test_bare_data_dependency(ctx_factory):
 
 # {{{ test race detection
 
-def test_ilp_write_race_detection_global():
+def test_ilp_write_race_detection_global(ctx_factory):
+    ctx = ctx_factory()
+
     knl = lp.make_kernel(
             "[n] -> {[i,j]: 0<=i,j<n }",
             [
                 "a[i] = 5+i+j",
                 ],
             [
                 lp.GlobalArg("a", np.float32),
                 lp.ValueArg("n", np.int32, approximately=1000),
                 ],
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     knl = lp.tag_inames(knl, dict(j="ilp"))
 
     knl = lp.preprocess_kernel(knl)
 
     with lp.CacheMode(False):
         from loopy.diagnostic import WriteRaceConditionWarning
         from warnings import catch_warnings
         with catch_warnings(record=True) as warn_list:
-            list(lp.generate_loop_schedules(knl))
+            lp.generate_code_v2(knl)
 
             assert any(isinstance(w.message, WriteRaceConditionWarning)
                     for w in warn_list)
 
 
-def test_ilp_write_race_avoidance_local():
+def test_ilp_write_race_avoidance_local(ctx_factory):
+    ctx = ctx_factory()
+
     knl = lp.make_kernel(
             "{[i,j]: 0<=i<16 and 0<=j<17 }",
             [
                 "<> a[i] = 5+i+j",
                 ],
-            [])
+            [],
+            target=lp.PyOpenCLTarget(ctx.devices[0]),
+            name="loopy_kernel")
 
     knl = lp.tag_inames(knl, dict(i="l.0", j="ilp"))
 
     knl = lp.preprocess_kernel(knl)
-    for k in lp.generate_loop_schedules(knl):
-        assert k.temporary_variables["a"].shape == (16, 17)
+    assert knl["loopy_kernel"].temporary_variables["a"].shape == (16, 17)
 
 
-def test_ilp_write_race_avoidance_private():
+def test_ilp_write_race_avoidance_private(ctx_factory):
+    ctx = ctx_factory()
     knl = lp.make_kernel(
             "{[j]: 0<=j<16 }",
             [
                 "<> a = 5+j",
                 ],
-            [])
+            [],
+            target=lp.PyOpenCLTarget(ctx.devices[0]),
+            name="loopy_kernel")
 
     knl = lp.tag_inames(knl, dict(j="ilp"))
 
     knl = lp.preprocess_kernel(knl)
-    for k in lp.generate_loop_schedules(knl):
-        assert k.temporary_variables["a"].shape == (16,)
+    assert knl["loopy_kernel"].temporary_variables["a"].shape == (16,)
 
 # }}}
 
@@ -354,11 +356,12 @@ def test_write_parameter(ctx_factory):
             lp.GlobalArg("b", dtype, shape=()),
             lp.ValueArg("n", np.int32, approximately=1000),
             ],
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     import pytest
     with pytest.raises(RuntimeError):
-        lp.CompiledKernel(ctx, knl).get_code()
+        lp.generate_code_v2(knl).device_code()
 
 
 # {{{ arg guessing
 
@@ -379,10 +382,11 @@ def test_arg_shape_guessing(ctx_factory):
             lp.GlobalArg("c", shape=lp.auto),
             lp.ValueArg("n"),
             ],
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     print(knl)
-    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_arg_guessing(ctx_factory):
@@ -395,10 +399,11 @@ def test_arg_guessing(ctx_factory):
             b[i, j] = i*j
             c[i+j, j] = b[j,i]
             """,
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     print(knl)
-    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_arg_guessing_with_reduction(ctx_factory):
@@ -413,16 +418,16 @@ def test_arg_guessing_with_reduction(ctx_factory):
             b[i, j] = i*j
             c[i+j, j] = b[j,i]
             """,
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     print(knl)
-    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def
test_unknown_arg_shape(ctx_factory): ctx = ctx_factory() from loopy.target.pyopencl import PyOpenCLTarget - from loopy.compiled import CompiledKernel bsize = [256, 0] knl = lp.make_kernel( @@ -438,11 +443,11 @@ def test_unknown_arg_shape(ctx_factory): """, seq_dependencies=True, name="uniform_l", - target=PyOpenCLTarget(), + target=PyOpenCLTarget(ctx.devices[0]), assumptions="m<=%d and m>=1 and n mod %d = 0" % (bsize[0], bsize[0])) knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) - kernel_info = CompiledKernel(ctx, knl).kernel_info(frozenset()) # noqa + print(lp.generate_code_v2(knl).device_code()) # }}} @@ -459,10 +464,11 @@ def test_nonlinear_index(ctx_factory): lp.GlobalArg("a", shape="n"), lp.ValueArg("n"), ], - assumptions="n>=1") + assumptions="n>=1", + target=lp.PyOpenCLTarget(ctx.devices[0])) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_offsets_and_slicing(ctx_factory): @@ -494,9 +500,9 @@ def test_offsets_and_slicing(ctx_factory): b_full_h[b_sub] = 2*a_full_h[a_sub] - #print(cknl.get_highlighted_code({"a": a.dtype})) - knl = lp.set_options(knl, write_cl=True) + knl = lp.add_dtypes(knl, {"a": a.dtype}) + print(lp.generate_code_v2(knl)) knl(queue, a=a, b=b) import numpy.linalg as la @@ -514,18 +520,16 @@ def test_vector_ilp_with_prefetch(ctx_factory): # argument guessing. lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..." - ]) + ], + target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, inner_tag="l.0") knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"], default_tag="l.auto") - cknl = lp.CompiledKernel(ctx, knl) - cknl.kernel_info() - import re - code = cknl.get_code() + code = lp.generate_code_v2(knl).device_code() assert len(list(re.finditer("barrier", code))) == 1 @@ -546,18 +550,18 @@ def test_c_instruction(ctx_factory): lp.TemporaryVariable("x", np.float32), "...", ], - assumptions="n>=1") + assumptions="n>=1", target=lp.PyOpenCLTarget(ctx.devices[0])) knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0") print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code()) + print(lp.generate_code_v2(knl).device_code()) def test_dependent_domain_insn_iname_finding(ctx_factory): ctx = ctx_factory() - knl = lp.make_kernel([ + prog = lp.make_kernel([ "{[isrc_box]: 0<=isrc_box src_ibox = source_boxes[i] @@ -598,10 +604,11 @@ def test_inames_deps_from_write_subscript(ctx_factory): [ lp.GlobalArg("box_source_starts,box_source_counts_nonchild,a", None, shape=None), - "..."]) + "..."], + name="loopy_kernel") - print(knl) - assert "i" in knl.insn_inames("myred") + print(prog) + assert "i" in prog["loopy_kernel"].insn_inames("myred") def test_modulo_indexing(ctx_factory): @@ -615,14 +622,12 @@ def test_modulo_indexing(ctx_factory): [ lp.GlobalArg("a", None, shape="n"), "..." 
- ] + ], target=lp.PyOpenCLTarget(ctx.devices[0]) ) print(knl) - print(lp.CompiledKernel(ctx, knl).get_highlighted_code( - dict( - a=np.float32, - ))) + knl = lp.add_dtypes(knl, {"a": np.float32}) + print(lp.generate_code_v2(knl).device_code()) @pytest.mark.parametrize("vec_len", [2, 3, 4, 8, 16]) @@ -770,11 +775,7 @@ def test_multiple_writes_to_local_temporary(): temp[i, 1] = 15 """) knl = lp.tag_inames(knl, dict(i="l.0")) - - knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - code, _ = lp.generate_code(k) - print(code) + print(lp.generate_code_v2(knl).device_code()) def test_make_copy_kernel(ctx_factory): @@ -854,9 +855,7 @@ def test_variable_size_temporary(): # Make sure that code generation succeeds even if # there are variable-length arrays. - knl = lp.preprocess_kernel(knl) - for k in lp.generate_loop_schedules(knl): - lp.generate_code(k) + lp.generate_code_v2(knl).device_code() @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) @@ -980,7 +979,7 @@ def test_within_inames_and_reduction(): within_inames=frozenset(), within_inames_is_final=True) - k = lp.make_kernel("{[i,j] : 0<=i,j {[j]: 0 <= j < jmax}"], """ @@ -2274,10 +2264,11 @@ def test_barrier_insertion_near_bottom_of_loop(): end """, seq_dependencies=True) - knl = lp.tag_inames(knl, dict(i="l.0")) - knl = lp.set_temporary_scope(knl, "a", "local") - knl = lp.set_temporary_scope(knl, "b", "local") - knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + prog = lp.tag_inames(prog, dict(i="l.0")) + prog = lp.set_temporary_scope(prog, "a", "local") + prog = lp.set_temporary_scope(prog, "b", "local") + prog = lp.preprocess_kernel(prog) + knl = lp.get_one_scheduled_kernel(prog["loopy_kernel"], prog.callables_table) print(knl) @@ -2287,7 +2278,7 @@ def test_barrier_insertion_near_bottom_of_loop(): def test_barrier_in_overridden_get_grid_size_expanded_kernel(): # make simple barrier'd kernel - knl = lp.make_kernel("{[i]: 0 <= i < 10}", + prog = lp.make_kernel("{[i]: 0 <= i < 10}", """ for i a[i] = i {id=a} @@ -2302,24 +2293,26 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel(): # split into kernel w/ vesize larger than iname domain vecsize = 16 - knl = lp.split_iname(knl, "i", vecsize, inner_tag="l.0") + prog = lp.split_iname(prog, "i", vecsize, inner_tag="l.0") from testlib import GridOverride # artifically expand via overridden_get_grid_sizes_for_insn_ids + knl = prog["loopy_kernel"] knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride( knl.copy(), vecsize)) + prog = prog.with_kernel(knl) # make sure we can generate the code - lp.generate_code_v2(knl) + lp.generate_code_v2(prog) def test_multi_argument_reduction_type_inference(): - from loopy.type_inference import TypeInferenceMapper + from loopy.type_inference import TypeReader from loopy.library.reduction import SegmentedSumReductionOperation from loopy.types import to_loopy_type op = SegmentedSumReductionOperation() - knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=ja = 0 <>b_s0 = 0 """) - vng = knl.get_var_name_generator() + vng = prog["loopy_kernel"].get_var_name_generator() assert vng("a_s0") != "a_s0" assert vng("b") != "b" @@ -2481,7 +2477,7 @@ def test_fixed_parameters(ctx_factory): def test_parameter_inference(): knl = lp.make_kernel("{[i]: 0 <= i < n and i mod 2 = 0}", "") - assert knl.all_params() == {"n"} + assert knl["loopy_kernel"].all_params() == {"n"} def test_execution_backend_can_cache_dtypes(ctx_factory): @@ -2500,7 +2496,7 @@ def 
test_execution_backend_can_cache_dtypes(ctx_factory): def test_wildcard_dep_matching(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0 <= i < 10}", """ <>a = 0 {id=insn1} @@ -2513,11 +2509,15 @@ def test_wildcard_dep_matching(): all_insns = {"insn%d" % i for i in range(1, 6)} - assert knl.id_to_insn["insn1"].depends_on == set() - assert knl.id_to_insn["insn2"].depends_on == all_insns - {"insn2"} - assert knl.id_to_insn["insn3"].depends_on == all_insns - {"insn3"} - assert knl.id_to_insn["insn4"].depends_on == {"insn1", "insn2"} - assert knl.id_to_insn["insn5"].depends_on == all_insns - {"insn1", "insn5"} + assert prog["loopy_kernel"].id_to_insn["insn1"].depends_on == set() + assert (prog["loopy_kernel"].id_to_insn["insn2"].depends_on == all_insns - + {"insn2"}) + assert (prog["loopy_kernel"].id_to_insn["insn3"].depends_on == all_insns - + {"insn3"}) + assert (prog["loopy_kernel"].id_to_insn["insn4"].depends_on == {"insn1", + "insn2"}) + assert (prog["loopy_kernel"].id_to_insn["insn5"].depends_on == all_insns - + {"insn1", "insn5"}) def test_preamble_with_separate_temporaries(ctx_factory): @@ -2581,12 +2581,14 @@ def test_preamble_with_separate_temporaries(ctx_factory): def test_arg_inference_for_predicates(): - knl = lp.make_kernel("{[i]: 0 <= i < 10}", + prog = lp.make_kernel("{[i]: 0 <= i < 10}", """ if incr[i] a = a + 1 end - """) + """, name="loopy_kernel") + + knl = prog["loopy_kernel"] assert "incr" in knl.arg_dict assert knl.arg_dict["incr"].shape == (10,) @@ -2611,7 +2613,7 @@ def test_relaxed_stride_checks(ctx_factory): def test_add_prefetch_works_in_lhs_index(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{ [n,k,l,k1,l1,k2,l2]: " "start<=n {[i,k,j]: 0<=i<50 and 1<=k<98 and 0<=j<10}", [ @@ -945,10 +1004,25 @@ def test_barrier_counter_barriers(): m = 256 ell = 128 params = {"n": n, "m": m, "ell": ell} - barrier_count = sync_map["barrier_local"].eval_with_dict(params) + barrier_count = sync_map.filter_by(kind="barrier_local").eval_and_sum(params) assert barrier_count == 50*10*2 +def test_barrier_count_single(): + knl = lp.make_kernel( + "{[i]: 0<=i<128}", + """ + <> c[i] = 15*i {id=yoink} + c[i+1] = c[i] {dep=yoink} + """) + + knl = lp.tag_inames(knl, {"i": "l.0"}) + sync_map = lp.get_synchronization_map(knl) + print(sync_map) + barrier_count = sync_map.filter_by(kind="barrier_local").eval_and_sum() + assert barrier_count == 1 + + def test_all_counters_parallel_matmul(): bsize = 16 knl = lp.make_kernel( @@ -975,21 +1049,21 @@ def test_all_counters_parallel_matmul(): sync_map = lp.get_synchronization_map(knl) assert len(sync_map) == 2 - assert sync_map["kernel_launch"].eval_with_dict(params) == 1 - assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize + assert sync_map.filter_by(kind="kernel_launch").eval_and_sum(params) == 1 + assert sync_map.filter_by(kind="barrier_local").eval_and_sum(params) == 2*m/bsize op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) f32mul = op_map[ - lp.Op(np.float32, "mul", CG.SUBGROUP) + lp.Op(np.float32, "mul", CG.SUBGROUP, "matmul") ].eval_with_dict(params) f32add = op_map[ - lp.Op(np.float32, "add", CG.SUBGROUP) + lp.Op(np.float32, "add", CG.SUBGROUP, "matmul") ].eval_with_dict(params) i32ops = op_map[ - lp.Op(np.int32, "add", CG.SUBGROUP) + lp.Op(np.int32, "add", CG.SUBGROUP, "matmul") ].eval_with_dict(params) i32ops += op_map[ - lp.Op(np.dtype(np.int32), "mul", CG.SUBGROUP) + lp.Op(np.dtype(np.int32), "mul", CG.SUBGROUP, "matmul") ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups @@ 
-1002,13 +1076,15 @@ def test_all_counters_parallel_matmul(): lid_strides={0: 1, 1: Variable("ell")}, gid_strides={1: bsize}, direction="load", variable="b", - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name="matmul") ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess("global", np.float32, lid_strides={0: 1, 1: Variable("m")}, gid_strides={0: Variable("m")*bsize}, direction="load", - variable="a", count_granularity=CG.WORKITEM) + variable="a", count_granularity=CG.WORKITEM, + kernel_name="matmul") ].eval_with_dict(params) assert f32s1lb == n*m*ell/bsize @@ -1018,7 +1094,8 @@ def test_all_counters_parallel_matmul(): lid_strides={0: 1, 1: Variable("ell")}, gid_strides={0: Variable("ell")*bsize, 1: bsize}, direction="store", variable="c", - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name="matmul") ].eval_with_dict(params) assert f32coal == n*ell @@ -1037,14 +1114,16 @@ def test_all_counters_parallel_matmul(): lid_strides={1: 16}, gid_strides={}, variable="a_fetch", - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name="matmul") ].eval_with_dict(params) local_mem_l_b = local_mem_map[lp.MemAccess("local", np.dtype(np.float32), direction="load", lid_strides={0: 1}, gid_strides={}, variable="b_fetch", - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name="matmul") ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups @@ -1093,9 +1172,8 @@ def test_floor_div_coefficient_collector(): n_subgroups = n_workgroups*subgroups_per_group # count local f32 accesses - f32_local = lp.get_mem_access_map( - knl, count_redundant_work=True, subgroup_size=SGS - ).filter_by(dtype=[np.float32], mtype=["local"]).eval_and_sum(params) + m = lp.get_mem_access_map(knl, count_redundant_work=True, subgroup_size=SGS) + f32_local = m.filter_by(dtype=[np.float32], mtype=["local"]).eval_and_sum(params) # (count-per-sub-group)*n_subgroups assert f32_local == 2*(rept+1)*n_subgroups @@ -1133,7 +1211,8 @@ def test_mem_access_tagged_variables(): gid_strides={1: bsize}, direction="load", variable="b", variable_tag="mmbload", - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name="matmul") ].eval_with_dict(params) f32s1la = mem_access_map[lp.MemAccess("global", np.float32, lid_strides={1: Variable("m")}, @@ -1141,7 +1220,8 @@ def test_mem_access_tagged_variables(): direction="load", variable="a", variable_tag="mmaload", - count_granularity=CG.SUBGROUP) + count_granularity=CG.SUBGROUP, + kernel_name="matmul") ].eval_with_dict(params) assert f32s1lb == n*m*ell @@ -1154,7 +1234,8 @@ def test_mem_access_tagged_variables(): gid_strides={0: Variable("ell")*bsize, 1: bsize}, direction="store", variable="c", variable_tag="mmresult", - count_granularity=CG.WORKITEM) + count_granularity=CG.WORKITEM, + kernel_name="matmul") ].eval_with_dict(params) assert f32coal == n*ell @@ -1319,6 +1400,85 @@ def test_strided_footprint(): assert 2*num < denom +def test_stats_on_callable_kernel(): + callee = lp.make_function( + "{[i, j]: 0<=i, j< 20}", + """ + y[i] = sum(j, A[i,j]*x[j]) + """, name="matvec20x20") + + caller = lp.make_kernel( + "{:}", + """ + y[:] = matvec20x20(A[:,:], x[:]) + """, + [ + lp.GlobalArg("x,y", shape=(20,), dtype=np.float), + lp.GlobalArg("A", shape=(20, 20), dtype=np.float), + ], + name="matvec") + caller = lp.merge([caller, callee]) + + op_map = lp.get_op_map(caller, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) + f64_add = 
op_map.filter_by(name="add").eval_and_sum({})
+    assert f64_add == 400
+
+
+def test_stats_on_callable_kernel_within_loop():
+    callee = lp.make_function(
+            "{[i, j]: 0<=i, j< 20}",
+            """
+            y[i] = sum(j, A[i,j]*x[j])
+            """, name="matvec20x20")
+
+    caller = lp.make_kernel(
+            "{[i]: 0<=i< 20}",
+            """
+            y[i, :] = matvec20x20(A[:,:], x[i, :])
+            """,
+            [
+                lp.GlobalArg("x,y", shape=(20, 20), dtype=np.float),
+                lp.GlobalArg("A", shape=(20, 20), dtype=np.float),
+            ],
+            name="matmat")
+    caller = lp.merge([caller, callee])
+
+    op_map = lp.get_op_map(caller, subgroup_size=SGS, count_redundant_work=True,
+            count_within_subscripts=True)
+
+    f64_add = op_map.filter_by(name="add").eval_and_sum({})
+    assert f64_add == 8000
+
+
+def test_callable_kernel_with_substitution():
+    callee = lp.make_function(
+            "{[i, j]: 0<=i, j< n}",
+            """
+            y[i] = sum(j, A[i,j]*x[j])
+            """,
+            [lp.ValueArg("n"), ...],
+            name="matvec")
+
+    caller = lp.make_kernel(
+            "{[i]: 0<=i< 20}",
+            """
+            y[i, :] = matvec(20, A[:,:], x[i, :])
+            """,
+            [
+                lp.GlobalArg("x,y", shape=(20, 20), dtype=np.float),
+                lp.GlobalArg("A", shape=(20, 20), dtype=np.float),
+            ],
+            name="matmat")
+    caller = lp.merge([caller, callee])
+
+    op_map = lp.get_op_map(caller, subgroup_size=SGS, count_redundant_work=True,
+            count_within_subscripts=True)
+
+    f64_add = op_map.filter_by(name="add").eval_and_sum({})
+    assert f64_add == 8000
+
+
 def test_no_loop_ops():
     # See https://github.com/inducer/loopy/issues/211
diff --git a/test/test_target.py b/test/test_target.py
index e6a93299143c399aebd2f5025adb90238aea0c5a..0fb386998e775b3603a099d997ba1ff78bf85af8 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -70,9 +70,7 @@ def test_ispc_target(occa_mode=False):
     knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"],
             default_tag="l.auto")
 
-    codegen_result = lp.generate_code_v2(
-            lp.get_one_scheduled_kernel(
-                lp.preprocess_kernel(knl)))
+    codegen_result = lp.generate_code_v2(knl)
 
     print(codegen_result.device_code())
     print(codegen_result.host_code())
@@ -96,9 +94,8 @@ def test_cuda_target():
             default_tag="l.auto")
 
     print(
-            lp.generate_code(
-                lp.get_one_scheduled_kernel(
-                    lp.preprocess_kernel(knl)))[0])
+            lp.generate_code_v2(knl).device_code())
 
 
 def test_generate_c_snippet():
@@ -138,10 +135,7 @@ def test_generate_c_snippet():
     knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
 
     knl = lp.prioritize_loops(knl, "I,k_outer,k_inner")
-
-    knl = lp.preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
-    print(lp.generate_body(knl))
+    print(lp.generate_code_v2(knl))
 
 
 @pytest.mark.parametrize("target", [CTarget, OpenCLTarget])
@@ -354,8 +348,7 @@ def test_ispc_streaming_stores():
 
     knl = lp.set_argument_order(knl, vars + ["n"])
 
-    knl = lp.preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
-
     assert "streaming_store(" in lp.generate_code_v2(knl).all_code()
diff --git a/test/test_transform.py b/test/test_transform.py
index cad5d776a748fc8b42c9b1d9e950e91523d4c2cd..9ac29766bfb8f7f887455cfd1cb123af9ff4915c 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -148,7 +148,7 @@ def test_to_batched_temp(ctx_factory):
     bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x")
 
     # checking that cnst is not being batched
-    assert bknl.temporary_variables["cnst"].shape == ()
+    assert bknl["loopy_kernel"].temporary_variables["cnst"].shape == ()
 
     a = np.random.randn(5, 5)
     x = np.random.randn(7, 5)
@@ -253,18 +253,17 @@ def test_vectorize(ctx_factory):
         a[i] = temp
         """)
     knl = lp.add_and_infer_dtypes(knl,
dict(b=np.float32)) - knl = lp.set_array_dim_names(knl, "a,b", "i") + knl = lp.set_array_axis_names(knl, "a,b", "i") knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4, split_kwargs=dict(slabs=(0, 1))) - knl = lp.tag_data_axes(knl, "a,b", "c,vec") + knl = lp.tag_array_axes(knl, "a,b", "c,vec") ref_knl = knl ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"}) knl = lp.tag_inames(knl, {"i_inner": "vec"}) knl = lp.preprocess_kernel(knl) - knl = lp.get_one_scheduled_kernel(knl) code, inf = lp.generate_code(knl) lp.auto_test_vs_ref( @@ -273,19 +272,19 @@ def test_vectorize(ctx_factory): def test_extract_subst(ctx_factory): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0<=itmp[i] = 10 {id=insn1} <>tmp2[i] = 10 {id=insn2} @@ -488,30 +490,36 @@ def test_add_nosync(): <>tmp5[i] = 0 {id=insn5,groups=g1} tmp5[i] = 1 {id=insn6,conflicts=g1} - """) + """, name="nosync") - orig_knl = lp.set_temporary_scope(orig_knl, "tmp3", "local") - orig_knl = lp.set_temporary_scope(orig_knl, "tmp5", "local") + orig_prog = lp.set_temporary_scope(orig_prog, "tmp3", "local") + orig_prog = lp.set_temporary_scope(orig_prog, "tmp5", "local") # No dependency present - don't add nosync - knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2", + prog = lp.add_nosync(orig_prog, "any", "writes:tmp", "writes:tmp2", empty_ok=True) - assert frozenset() == knl.id_to_insn["insn2"].no_sync_with + assert frozenset() == ( + prog["nosync"].id_to_insn["insn2"].no_sync_with) # Dependency present - knl = lp.add_nosync(orig_knl, "local", "writes:tmp3", "reads:tmp3") - assert frozenset() == knl.id_to_insn["insn3"].no_sync_with - assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + prog = lp.add_nosync(orig_prog, "local", "writes:tmp3", "reads:tmp3") + assert frozenset() == ( + prog["nosync"].id_to_insn["insn3"].no_sync_with) + assert frozenset([("insn3", "local")]) == ( + prog["nosync"].id_to_insn["insn4"].no_sync_with) # Bidirectional - knl = lp.add_nosync( - orig_knl, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) - assert frozenset([("insn4", "local")]) == knl.id_to_insn["insn3"].no_sync_with - assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + prog = lp.add_nosync( + orig_prog, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) + assert frozenset([("insn4", "local")]) == ( + prog["nosync"].id_to_insn["insn3"].no_sync_with) + assert frozenset([("insn3", "local")]) == ( + prog["nosync"].id_to_insn["insn4"].no_sync_with) # Groups - knl = lp.add_nosync(orig_knl, "local", "insn5", "insn6") - assert frozenset([("insn5", "local")]) == knl.id_to_insn["insn6"].no_sync_with + prog = lp.add_nosync(orig_prog, "local", "insn5", "insn6") + assert frozenset([("insn5", "local")]) == ( + prog["nosync"].id_to_insn["insn6"].no_sync_with) def test_uniquify_instruction_ids(): @@ -520,28 +528,30 @@ def test_uniquify_instruction_ids(): i3 = lp.Assignment("b", 1, id=lp.UniqueName("b")) i4 = lp.Assignment("b", 1, id=lp.UniqueName("b")) - knl = lp.make_kernel("{[i]: i = 1}", []).copy(instructions=[i1, i2, i3, i4]) + prog = lp.make_kernel("{[i]: i = 1}", [], name="lpy_knl") + new_root_kernel = prog["lpy_knl"].copy(instructions=[i1, i2, i3, i4]) + prog = prog.with_kernel(new_root_kernel) from loopy.transform.instruction import uniquify_instruction_ids - knl = uniquify_instruction_ids(knl) + prog = uniquify_instruction_ids(prog) - insn_ids = {insn.id for insn in knl.instructions} + insn_ids = {insn.id for insn in prog["lpy_knl"].instructions} assert len(insn_ids) == 4 
assert all(isinstance(id, str) for id in insn_ids) def test_split_iname_only_if_in_within(): - knl = lp.make_kernel( + prog = lp.make_kernel( "{[i]: 0<=i<10}", """ c[i] = 3*d[i] {id=to_split} a[i] = 2*b[i] {id=not_to_split} - """) + """, name="splitter") - knl = lp.split_iname(knl, "i", 4, within="id:to_split") + prog = lp.split_iname(prog, "i", 4, within="id:to_split") - for insn in knl.instructions: + for insn in prog["splitter"].instructions: if insn.id == "to_split": assert insn.within_inames == frozenset({"i_outer", "i_inner"}) if insn.id == "not_to_split": @@ -552,7 +562,7 @@ def test_nested_substs_in_insns(ctx_factory): ctx = ctx_factory() import loopy as lp - ref_knl = lp.make_kernel( + ref_prg = lp.make_kernel( "{[i]: 0<=i<10}", """ a(x) := 2 * x @@ -562,10 +572,12 @@ def test_nested_substs_in_insns(ctx_factory): """ ) - knl = lp.expand_subst(ref_knl) - assert not knl.substitutions + prg = lp.expand_subst(ref_prg) + assert not any( + cknl.subkernel.substitutions + for cknl in prg.callables_table.values()) - lp.auto_test_vs_ref(ref_knl, ctx, knl) + lp.auto_test_vs_ref(ref_prg, ctx, prg) def test_extract_subst_with_iname_deps_in_templ(ctx_factory): @@ -658,12 +670,12 @@ def test_add_inames_for_unused_hw_axes(ctx_factory): knl = lp.add_inames_for_unused_hw_axes(knl) - assert knl.id_to_insn["init_alpha"].within_inames == frozenset(["i_inner", - "i_outer", "j_outer", "j_inner"]) - assert knl.id_to_insn["a_fetch_rule"].within_inames == frozenset(["i_inner", - "i_outer", "j_outer", "j_inner"]) - assert knl.id_to_insn["b_fetch_rule"].within_inames == frozenset(["i_inner", - "i_outer", "j_outer", "j_inner"]) + assert (knl["rank_one"].id_to_insn["init_alpha"].within_inames + == frozenset(["i_inner", "i_outer", "j_outer", "j_inner"])) + assert (knl["rank_one"].id_to_insn["a_fetch_rule"].within_inames + == frozenset(["i_inner", "i_outer", "j_outer", "j_inner"])) + assert (knl["rank_one"].id_to_insn["b_fetch_rule"].within_inames + == frozenset(["i_inner", "i_outer", "j_outer", "j_inner"])) lp.auto_test_vs_ref(ref_knl, ctx, knl, op_count=[np.dtype(dtype).itemsize*n**2/1e9], op_label=["GBytes"], @@ -723,12 +735,13 @@ def test_rename_argument_with_assumptions(): knl = lp.assume(knl, "n_old=10") knl = lp.rename_argument(knl, "n_old", "n_new") + assumptions = knl["loopy_kernel"].assumptions - assert "n_old" not in knl.assumptions.get_var_dict() - assert "n_new" in knl.assumptions.get_var_dict() + assert "n_old" not in assumptions.get_var_dict() + assert "n_new" in assumptions.get_var_dict() assert ( - (knl.assumptions & isl.BasicSet("[n_new]->{: n_new=10}")) - == knl.assumptions) + (assumptions & isl.BasicSet("[n_new]->{: n_new=10}")) + == assumptions) def test_tag_iname_with_match_pattern(): @@ -740,6 +753,7 @@ def test_tag_iname_with_match_pattern(): """) knl = lp.tag_inames(knl, "i*:unr") + knl = knl["loopy_kernel"] i0_tag, = knl.inames["i0"].tags i1_tag, = knl.inames["i1"].tags @@ -765,6 +779,7 @@ def test_custom_iname_tag(): """) knl = lp.tag_inames(knl, {"ifuzz0": ElementLoopTag(), "ifuzz1": DOFLoopTag()}) + knl = knl["loopy_kernel"] ifuzz0_tag, = knl.inames["ifuzz0"].tags ifuzz1_tag, = knl.inames["ifuzz1"].tags diff --git a/test/testlib.py b/test/testlib.py index 35d51f72d2d7cf08dc5b92c8377c9c1578509e6d..7009e8f5aa2caba96d83ba9bd5f8f700a75b7e4a 100644 --- a/test/testlib.py +++ b/test/testlib.py @@ -1,4 +1,5 @@ import loopy as lp +import numpy as np # {{{ test_barrier_in_overridden_get_grid_size_expanded_kernel @@ -8,8 +9,9 @@ class GridOverride: self.clean = clean self.vecsize = vecsize - 
def __call__(self, insn_ids, ignore_auto=True):
-        gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, ignore_auto)
+    def __call__(self, insn_ids, callables_table, ignore_auto=True):
+        gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids,
+                callables_table, ignore_auto)
         return gsize, (self.vecsize,)
 
 # }}}
@@ -131,4 +133,42 @@ class SeparateTemporariesPreambleTestPreambleGenerator(
 
 # }}}
 
+
+# {{{ test_register_function_lookup
+
+class Log2Callable(lp.ScalarCallable):
+
+    def with_types(self, arg_id_to_dtype, callables_table):
+
+        if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None:
+            # the types provided aren't mature enough to specialize the
+            # callable
+            return (
+                    self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                    callables_table)
+
+        dtype = arg_id_to_dtype[0].numpy_dtype
+
+        if dtype.kind in ("u", "i"):
+            # integers (signed and unsigned) are cast to float32
+            dtype = np.float32
+
+        if dtype.type == np.float32:
+            name_in_target = "log2f"
+        elif dtype.type == np.float64:
+            name_in_target = "log2"
+        else:
+            raise TypeError(f"log2: unexpected type {dtype}")
+
+        from loopy.types import NumpyType
+        return (
+                self.copy(name_in_target=name_in_target,
+                    arg_id_to_dtype={
+                        0: NumpyType(dtype), -1: NumpyType(dtype)}),
+                callables_table)
+
+# }}}
+
 # vim: foldmethod=marker
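+
+# A minimal usage sketch for Log2Callable (illustration only; it mirrors
+# test_register_function_lookup in test_callables.py):
+#
+#     prog = lp.make_kernel("{[i]: 0<=i<10}", "y[i] = log2(x[i])")
+#     prog = lp.register_callable(prog, "log2", Log2Callable("log2"))
+#
+# During type inference, with_types() above receives the dtype of x[i] and
+# binds name_in_target to "log2f" for float32 or "log2" for float64; that
+# name is what appears in the generated device code.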