From 748f7bffc0fd3162a1bad718cd0d76eeb7bf6915 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Tue, 31 Jul 2018 21:09:08 -0500
Subject: [PATCH 01/80] now counting ops with count-granularity=subgroup

---
 loopy/statistics.py     | 131 ++++++++++++++++--
 test/test_statistics.py | 293 ++++++++++++++++++++++------------------
 2 files changed, 281 insertions(+), 143 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index cee28b24..2df3093d 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -715,7 +715,8 @@ class ExpressionOpCounter(CounterBase):
         return ToCountMap(
                     {Op(dtype=self.type_inf(expr),
                         name='func:'+str(expr.function),
-                        count_granularity=CountGranularity.WORKITEM): 1}
+                        #count_granularity=CountGranularity.WORKITEM): 1}
+                        count_granularity=CountGranularity.SUBGROUP): 1}
                     ) + self.rec(expr.parameters)
 
     def map_subscript(self, expr):
@@ -726,7 +727,8 @@ class ExpressionOpCounter(CounterBase):
         return ToCountMap(
                     {Op(dtype=self.type_inf(expr),
                         name='add',
-                        count_granularity=CountGranularity.WORKITEM):
+                        #count_granularity=CountGranularity.WORKITEM):
+                        count_granularity=CountGranularity.SUBGROUP):
                      len(expr.children)-1}
                     ) + sum(self.rec(child) for child in expr.children)
 
@@ -735,18 +737,21 @@ class ExpressionOpCounter(CounterBase):
         assert expr.children
         return sum(ToCountMap({Op(dtype=self.type_inf(expr),
                                   name='mul',
-                                  count_granularity=CountGranularity.WORKITEM): 1})
+                                  #count_granularity=CountGranularity.WORKITEM): 1})
+                                  count_granularity=CountGranularity.SUBGROUP): 1})
                    + self.rec(child)
                    for child in expr.children
                    if not is_zero(child + 1)) + \
                    ToCountMap({Op(dtype=self.type_inf(expr),
                                   name='mul',
-                                  count_granularity=CountGranularity.WORKITEM): -1})
+                                  #count_granularity=CountGranularity.WORKITEM): -1})
+                                  count_granularity=CountGranularity.SUBGROUP): -1})
 
     def map_quotient(self, expr, *args):
         return ToCountMap({Op(dtype=self.type_inf(expr),
                               name='div',
-                              count_granularity=CountGranularity.WORKITEM): 1}) \
+                              # counted per sub-group, not per work-item
+                              count_granularity=CountGranularity.SUBGROUP): 1}) \
                                 + self.rec(expr.numerator) \
                                 + self.rec(expr.denominator)
 
@@ -756,14 +761,16 @@ class ExpressionOpCounter(CounterBase):
     def map_power(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
                               name='pow',
-                              count_granularity=CountGranularity.WORKITEM): 1}) \
+                              # counted per sub-group, not per work-item
+                              count_granularity=CountGranularity.SUBGROUP): 1}) \
                                 + self.rec(expr.base) \
                                 + self.rec(expr.exponent)
 
     def map_left_shift(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
                               name='shift',
-                              count_granularity=CountGranularity.WORKITEM): 1}) \
+                              # counted per sub-group, not per work-item
+                              count_granularity=CountGranularity.SUBGROUP): 1}) \
                                 + self.rec(expr.shiftee) \
                                 + self.rec(expr.shift)
 
@@ -772,13 +779,15 @@ class ExpressionOpCounter(CounterBase):
     def map_bitwise_not(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
                               name='bw',
-                              count_granularity=CountGranularity.WORKITEM): 1}) \
+                              # counted per sub-group, not per work-item
+                              count_granularity=CountGranularity.SUBGROUP): 1}) \
                                 + self.rec(expr.child)
 
     def map_bitwise_or(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
                               name='bw',
-                              count_granularity=CountGranularity.WORKITEM):
+                              # counted per sub-group, not per work-item
+                              count_granularity=CountGranularity.SUBGROUP):
                            len(expr.children)-1}) \
                                 + sum(self.rec(child) for child in expr.children)
 
@@ -802,7 +811,8 @@ class ExpressionOpCounter(CounterBase):
     def map_min(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
                               name='maxmin',
-                              count_granularity=CountGranularity.WORKITEM):
+                              # counted per sub-group, not per work-item
+                              count_granularity=CountGranularity.SUBGROUP):
                            len(expr.children)-1}) \
                + sum(self.rec(child) for child in expr.children)
 
@@ -1329,14 +1339,109 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False,
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
 
+    if not isinstance(subgroup_size, int):
+        # try to find subgroup_size
+        subgroup_size_guess = _find_subgroup_size_for_knl(knl)
+
+        if subgroup_size is None:
+            if subgroup_size_guess is None:
+                # 'guess' was not passed and either no target device found
+                # or get_simd_group_size returned None
+                raise ValueError("No sub-group size passed, no target device found. "
+                                 "Either (1) pass integer value for subgroup_size, "
+                                 "(2) ensure that kernel.target is PyOpenClTarget "
+                                 "and kernel.target.device is set, or (3) pass "
+                                 "subgroup_size='guess' and hope for the best.")
+            else:
+                subgroup_size = subgroup_size_guess
+
+        elif subgroup_size == 'guess':
+            if subgroup_size_guess is None:
+                # unable to get subgroup_size from device, so guess
+                subgroup_size = 32
+                warn_with_kernel(knl, "get_op_map_guessing_subgroup_size",
+                                 "get_op_map: 'guess' sub-group size "
+                                 "passed, no target device found, wildly guessing "
+                                 "that sub-group size is %d." % (subgroup_size))
+            else:
+                subgroup_size = subgroup_size_guess
+        else:
+            raise ValueError("Invalid value for subgroup_size: %s. subgroup_size "
+                             "must be integer, 'guess', or, if you're feeling "
+                             "lucky, None." % (subgroup_size))
+
+    # ------------------------------
+    #class CacheHolder(object):
+    #    pass
+
+    #cache_holder = CacheHolder()
+    #from pytools import memoize_in
+
+    #@memoize_in(cache_holder, "insn_count")
+    def get_insn_count(knl, insn, count_granularity=CountGranularity.WORKITEM):
+        """Return the run count of *insn* at the given *count_granularity*."""
+        if count_granularity is None:
+            warn_with_kernel(knl, "get_insn_count_assumes_granularity",
+                             "get_insn_count: No count granularity passed for "
+                             "Op, assuming %s granularity."
+                             % (CountGranularity.WORKITEM))
+            count_granularity = CountGranularity.WORKITEM
+
+        if count_granularity == CountGranularity.WORKITEM:
+            return count_insn_runs(
+                knl, insn, count_redundant_work=count_redundant_work,
+                disregard_local_axes=False)
+
+        ct_disregard_local = count_insn_runs(
+                knl, insn, disregard_local_axes=True,
+                count_redundant_work=count_redundant_work)
+
+        if count_granularity == CountGranularity.WORKGROUP:
+            return ct_disregard_local
+        elif count_granularity == CountGranularity.SUBGROUP:
+            # work-group size = product of the local-size upper bounds
+            from loopy.symbolic import aff_to_expr
+            _, local_size = knl.get_grid_size_upper_bounds()
+            workgroup_size = 1
+            if local_size:
+                for size in local_size:
+                    s = aff_to_expr(size)
+                    if not isinstance(s, int):
+                        raise LoopyError("Cannot count insn with %s granularity, "
+                                         "work-group size is not integer: %s"
+                                         % (CountGranularity.SUBGROUP, local_size))
+                    workgroup_size *= s
+
+            warn_with_kernel(knl, "insn_count_subgroups_upper_bound",
+                    "get_insn_count: when counting instruction %s with "
+                    "count_granularity=%s, using upper bound for work-group size "
+                    "(%d work-items) to compute sub-groups per work-group. When "
+                    "multiple device programs present, actual sub-group count may be"
+                    " lower." % (insn, CountGranularity.SUBGROUP, workgroup_size))
+
+            from pytools import div_ceil
+            return ct_disregard_local*div_ceil(workgroup_size, subgroup_size)
+        else:
+            # this should not happen since this is enforced in Op
+            raise ValueError("get_insn_count: count_granularity '%s' is"
+                    " not allowed. count_granularity options: %s"
+                    % (count_granularity, CountGranularity.ALL+[None]))
+    # ------------------------------
+
     op_map = ToCountMap()
     op_counter = ExpressionOpCounter(knl)
     for insn in knl.instructions:
         if isinstance(insn, (CallInstruction, CInstruction, Assignment)):
             ops = op_counter(insn.assignee) + op_counter(insn.expression)
-            op_map = op_map + ops*count_insn_runs(
-                    knl, insn,
-                    count_redundant_work=count_redundant_work)
+            #op_map = op_map + ops*count_insn_runs(
+            #        knl, insn,
+            #        count_redundant_work=count_redundant_work)
+            for key, val in six.iteritems(ops):
+                op_map = (
+                        op_map
+                        + ToCountMap({key: val})
+                        * get_insn_count(knl, insn, key.count_granularity))
+
         elif isinstance(insn, (NoOpInstruction, BarrierInstruction)):
             pass
         else:
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 79c5ec7d..b5b55347 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -39,6 +39,9 @@ from pymbolic.primitives import Variable
 from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa
 
 
+SGS = 32  # Subgroup size
+
+
 def test_op_counter_basic():
 
     knl = lp.make_kernel(
@@ -54,21 +57,26 @@ def test_op_counter_basic():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    op_map = lp.get_op_map(knl, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    n_workgroups = 1
+    group_size = 1
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
     n = 512
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params)
-    f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params)
-    f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(params)
-    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.WORKITEM)
+    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
+    f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params)
+    f64mul = op_map[lp.Op(np.dtype(np.float64), 'mul', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    assert f32add == f32mul == f32div == n*m*ell
-    assert f64mul == n*m
-    assert i32add == n*m*2
+    # (count-per-sub-group)*n_subgroups
+    assert f32add == f32mul == f32div == n*m*ell*n_subgroups
+    assert f64mul == n*m*n_subgroups
+    assert i32add == n*m*2*n_subgroups
 
 
 def test_op_counter_reduction():
@@ -81,15 +89,20 @@ def test_op_counter_reduction():
             name="matmul_serial", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
-    op_map = lp.get_op_map(knl, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    n_workgroups = 1
+    group_size = 1
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
     n = 512
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params)
-    f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.WORKITEM)
+    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
+    f32mul = op_map[lp.Op(np.dtype(np.float32), 'mul', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    assert f32add == f32mul == n*m*ell
+    # (count-per-sub-group)*n_subgroups
+    assert f32add == f32mul == n*m*ell*n_subgroups
 
     op_map_dtype = op_map.group_by('dtype')
     f32 = op_map_dtype[lp.Op(dtype=np.float32)].eval_with_dict(params)
@@ -111,21 +124,26 @@ def test_op_counter_logic():
             name="logic", assumptions="n,m,ell >= 1")
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
-    op_map = lp.get_op_map(knl, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    n_workgroups = 1
+    group_size = 1
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
     n = 512
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params)
-    f64add = op_map[lp.Op(np.float64, 'add', CG.WORKITEM)].eval_with_dict(params)
-    f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.WORKITEM)
+    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
+    f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(params)
+    f64div = op_map[lp.Op(np.dtype(np.float64), 'div', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    assert f32mul == n*m
-    assert f64div == 2*n*m  # TODO why?
-    assert f64add == n*m
-    assert i32add == n*m
+    # (count-per-sub-group)*n_subgroups
+    assert f32mul == n*m*n_subgroups
+    assert f64div == 2*n*m*n_subgroups  # TODO why?
+    assert f64add == n*m*n_subgroups
+    assert i32add == n*m*n_subgroups
 
 
 def test_op_counter_specialops():
@@ -143,27 +161,32 @@ def test_op_counter_specialops():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    op_map = lp.get_op_map(knl, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    n_workgroups = 1
+    group_size = 1
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
     n = 512
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(params)
-    f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(params)
-    f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(params)
-    f64pow = op_map[lp.Op(np.float64, 'pow', CG.WORKITEM)].eval_with_dict(params)
-    f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.WORKITEM)
+    f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(params)
+    f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(params)
+    f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(params)
+    f64pow = op_map[lp.Op(np.float64, 'pow', CG.SUBGROUP)].eval_with_dict(params)
+    f64add = op_map[lp.Op(np.dtype(np.float64), 'add', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.WORKITEM)
+    i32add = op_map[lp.Op(np.dtype(np.int32), 'add', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.WORKITEM)
+    f64rsq = op_map[lp.Op(np.dtype(np.float64), 'func:rsqrt', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.WORKITEM)
+    f64sin = op_map[lp.Op(np.dtype(np.float64), 'func:sin', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    assert f32div == 2*n*m*ell
-    assert f32mul == f32add == n*m*ell
-    assert f64add == 3*n*m
-    assert f64pow == i32add == f64rsq == f64sin == n*m
+    # (count-per-sub-group)*n_subgroups
+    assert f32div == 2*n*m*ell*n_subgroups
+    assert f32mul == f32add == n*m*ell*n_subgroups
+    assert f64add == 3*n*m*n_subgroups
+    assert f64pow == i32add == f64rsq == f64sin == n*m*n_subgroups
 
 
 def test_op_counter_bitwise():
@@ -183,26 +206,31 @@ def test_op_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int64, h=np.int64))
 
-    op_map = lp.get_op_map(knl, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    n_workgroups = 1
+    group_size = 1
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
     n = 512
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
-    i32add = op_map[lp.Op(np.int32, 'add', CG.WORKITEM)].eval_with_dict(params)
-    i32bw = op_map[lp.Op(np.int32, 'bw', CG.WORKITEM)].eval_with_dict(params)
-    i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', CG.WORKITEM)
+    i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(params)
+    i32bw = op_map[lp.Op(np.int32, 'bw', CG.SUBGROUP)].eval_with_dict(params)
+    i64bw = op_map[lp.Op(np.dtype(np.int64), 'bw', CG.SUBGROUP)
                    ].eval_with_dict(params)
-    i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', CG.WORKITEM)
+    i64mul = op_map[lp.Op(np.dtype(np.int64), 'mul', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    i64add = op_map[lp.Op(np.dtype(np.int64), 'add', CG.WORKITEM)
+    i64add = op_map[lp.Op(np.dtype(np.int64), 'add', CG.SUBGROUP)
                     ].eval_with_dict(params)
-    i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.WORKITEM)
+    i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP)
                       ].eval_with_dict(params)
-    assert i32add == n*m+n*m*ell
-    assert i32bw == 2*n*m*ell
-    assert i64bw == 2*n*m
-    assert i64add == i64mul == n*m
-    assert i64shift == 2*n*m
+    # (count-per-sub-group)*n_subgroups
+    assert i32add == (n*m+n*m*ell)*n_subgroups
+    assert i32bw == 2*n*m*ell*n_subgroups
+    assert i64bw == 2*n*m*n_subgroups
+    assert i64add == i64mul == n*m*n_subgroups
+    assert i64shift == 2*n*m*n_subgroups
 
 
 def test_op_counter_triangular_domain():
@@ -228,15 +256,21 @@ def test_op_counter_triangular_domain():
 
     op_map = lp.get_op_map(
                     knl,
+                    subgroup_size=SGS,
                     count_redundant_work=True
-                    )[lp.Op(np.float64, 'mul', CG.WORKITEM)]
+                    )[lp.Op(np.float64, 'mul', CG.SUBGROUP)]
     value_dict = dict(m=13, n=200)
     flops = op_map.eval_with_dict(value_dict)
 
+    n_workgroups = 1
+    group_size = 1
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
+
     if expect_fallback:
-        assert flops == 144
+        assert flops == 144*n_subgroups
     else:
-        assert flops == 78
+        assert flops == 78*n_subgroups
 
 
 def test_mem_access_counter_basic():
@@ -254,10 +288,8 @@ def test_mem_access_counter_basic():
     knl = lp.add_and_infer_dtypes(knl,
                     dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
 
-    subgroup_size = 32
-
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                    subgroup_size=subgroup_size)
+                                    subgroup_size=SGS)
 
     n = 512
     m = 256
@@ -266,7 +298,8 @@ def test_mem_access_counter_basic():
 
     n_workgroups = 1
     group_size = 1
-    subgroups_per_group = div_ceil(group_size, subgroup_size)
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
 
     f32l = mem_map[lp.MemAccess('global', np.float32,
                         lid_strides={}, gid_strides={},
@@ -289,9 +322,9 @@ def test_mem_access_counter_basic():
                         count_granularity=CG.SUBGROUP)
                     ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f32l == (3*n*m*ell)*n_workgroups*subgroups_per_group
-    assert f64l == (2*n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f32l == (3*n*m*ell)*n_subgroups
+    assert f64l == (2*n*m)*n_subgroups
 
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                         lid_strides={}, gid_strides={},
@@ -304,9 +337,9 @@ def test_mem_access_counter_basic():
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f32s == (n*m*ell)*n_workgroups*subgroups_per_group
-    assert f64s == (n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f32s == (n*m*ell)*n_subgroups
+    assert f64s == (n*m)*n_subgroups
 
 
 def test_mem_access_counter_reduction():
@@ -320,10 +353,8 @@ def test_mem_access_counter_reduction():
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
 
-    subgroup_size = 32
-
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                    subgroup_size=subgroup_size)
+                                    subgroup_size=SGS)
     n = 512
     m = 256
     ell = 128
@@ -331,7 +362,8 @@ def test_mem_access_counter_reduction():
 
     n_workgroups = 1
     group_size = 1
-    subgroups_per_group = div_ceil(group_size, subgroup_size)
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
 
     f32l = mem_map[lp.MemAccess('global', np.float32,
                         lid_strides={}, gid_strides={},
@@ -344,8 +376,8 @@ def test_mem_access_counter_reduction():
                         count_granularity=CG.SUBGROUP)
                     ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f32l == (2*n*m*ell)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f32l == (2*n*m*ell)*n_subgroups
 
     f32s = mem_map[lp.MemAccess('global', np.dtype(np.float32),
                         lid_strides={}, gid_strides={},
@@ -353,8 +385,8 @@ def test_mem_access_counter_reduction():
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f32s == (n*ell)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f32s == (n*ell)*n_subgroups
 
     ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load']
                                  ).to_bytes().eval_and_sum(params)
@@ -379,10 +411,8 @@ def test_mem_access_counter_logic():
 
     knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64))
 
-    subgroup_size = 32
-
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                    subgroup_size=subgroup_size)
+                                    subgroup_size=SGS)
     n = 512
     m = 256
     ell = 128
@@ -390,7 +420,8 @@ def test_mem_access_counter_logic():
 
     n_workgroups = 1
     group_size = 1
-    subgroups_per_group = div_ceil(group_size, subgroup_size)
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
 
     reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
 
@@ -404,10 +435,10 @@ def test_mem_access_counter_logic():
                                        direction='store')
                           ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f32_g_l == (2*n*m)*n_workgroups*subgroups_per_group
-    assert f64_g_l == (n*m)*n_workgroups*subgroups_per_group
-    assert f64_g_s == (n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f32_g_l == (2*n*m)*n_subgroups
+    assert f64_g_l == (n*m)*n_subgroups
+    assert f64_g_s == (n*m)*n_subgroups
 
 
 def test_mem_access_counter_specialops():
@@ -425,10 +456,8 @@ def test_mem_access_counter_specialops():
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32,
                                             g=np.float64, h=np.float64))
 
-    subgroup_size = 32
-
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                    subgroup_size=subgroup_size)
+                                    subgroup_size=SGS)
     n = 512
     m = 256
     ell = 128
@@ -436,7 +465,8 @@ def test_mem_access_counter_specialops():
 
     n_workgroups = 1
     group_size = 1
-    subgroups_per_group = div_ceil(group_size, subgroup_size)
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
 
     f32 = mem_map[lp.MemAccess('global', np.float32,
                         lid_strides={}, gid_strides={},
@@ -459,9 +489,9 @@ def test_mem_access_counter_specialops():
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f32 == (2*n*m*ell)*n_workgroups*subgroups_per_group
-    assert f64 == (2*n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f32 == (2*n*m*ell)*n_subgroups
+    assert f64 == (2*n*m)*n_subgroups
 
     f32 = mem_map[lp.MemAccess('global', np.float32,
                         lid_strides={}, gid_strides={},
@@ -474,16 +504,16 @@ def test_mem_access_counter_specialops():
                         count_granularity=CG.SUBGROUP)
                   ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f32 == (n*m*ell)*n_workgroups*subgroups_per_group
-    assert f64 == (n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f32 == (n*m*ell)*n_subgroups
+    assert f64 == (n*m)*n_subgroups
 
     filtered_map = mem_map.filter_by(direction=['load'], variable=['a', 'g'],
                          count_granularity=CG.SUBGROUP)
     tot = filtered_map.eval_and_sum(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert tot == (n*m*ell + n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert tot == (n*m*ell + n*m)*n_subgroups
 
 
 def test_mem_access_counter_bitwise():
@@ -503,10 +533,8 @@ def test_mem_access_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int32, h=np.int32))
 
-    subgroup_size = 32
-
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                    subgroup_size=subgroup_size)
+                                    subgroup_size=SGS)
     n = 512
     m = 256
     ell = 128
@@ -514,7 +542,8 @@ def test_mem_access_counter_bitwise():
 
     n_workgroups = 1
     group_size = 1
-    subgroups_per_group = div_ceil(group_size, subgroup_size)
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
 
     i32 = mem_map[lp.MemAccess('global', np.int32,
                         lid_strides={}, gid_strides={},
@@ -537,8 +566,8 @@ def test_mem_access_counter_bitwise():
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert i32 == (4*n*m+2*n*m*ell)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert i32 == (4*n*m+2*n*m*ell)*n_subgroups
 
     i32 = mem_map[lp.MemAccess('global', np.int32,
                         lid_strides={}, gid_strides={},
@@ -551,8 +580,8 @@ def test_mem_access_counter_bitwise():
                         count_granularity=CG.SUBGROUP)
                    ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert i32 == (n*m+n*m*ell)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert i32 == (n*m+n*m*ell)*n_subgroups
 
 
 def test_mem_access_counter_mixed():
@@ -571,7 +600,6 @@ def test_mem_access_counter_mixed():
                 x=np.float32))
 
     group_size_0 = 65
-    subgroup_size = 32
 
     knl = lp.split_iname(knl, "j", group_size_0)
     knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"})
@@ -583,10 +611,11 @@ def test_mem_access_counter_mixed():
 
     n_workgroups = div_ceil(ell, group_size_0)
     group_size = group_size_0
-    subgroups_per_group = div_ceil(group_size, subgroup_size)
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
 
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                    subgroup_size=subgroup_size)
+                                    subgroup_size=SGS)
     f64uniform = mem_map[lp.MemAccess('global', np.float64,
                                 lid_strides={}, gid_strides={},
                                 direction='load', variable='g',
@@ -617,9 +646,9 @@ def test_mem_access_counter_mixed():
                                 count_granularity=CG.WORKITEM)
                             ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f64uniform == (2*n*m)*n_workgroups*subgroups_per_group
-    assert f32uniform == (m*n)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f64uniform == (2*n*m)*n_subgroups
+    assert f32uniform == (m*n)*n_subgroups
 
     expect_fallback = False
     import islpy as isl
@@ -651,8 +680,8 @@ def test_mem_access_counter_mixed():
                                 count_granularity=CG.WORKITEM)
                            ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f64uniform == m*n*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f64uniform == m*n*n_subgroups
 
     if expect_fallback:
         if ell < group_size_0:
@@ -681,7 +710,7 @@ def test_mem_access_counter_nonconsec():
     knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"})
 
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                    subgroup_size=32)  # noqa
+                                    subgroup_size=SGS)  # noqa
     n = 512
     m = 256
     ell = 128
@@ -939,30 +968,35 @@ def test_all_counters_parallel_matmul():
     m = 256
     ell = 128
     params = {'n': n, 'm': m, 'ell': ell}
+    group_size = bsize*bsize
+    n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize)
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
 
     sync_map = lp.get_synchronization_map(knl)
     assert len(sync_map) == 2
     assert sync_map["kernel_launch"].eval_with_dict(params) == 1
     assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize
 
-    op_map = lp.get_op_map(knl, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
     f32mul = op_map[
-                        lp.Op(np.float32, 'mul', CG.WORKITEM)
+                        lp.Op(np.float32, 'mul', CG.SUBGROUP)
                         ].eval_with_dict(params)
     f32add = op_map[
-                        lp.Op(np.float32, 'add', CG.WORKITEM)
+                        lp.Op(np.float32, 'add', CG.SUBGROUP)
                         ].eval_with_dict(params)
     i32ops = op_map[
-                        lp.Op(np.int32, 'add', CG.WORKITEM)
+                        lp.Op(np.int32, 'add', CG.SUBGROUP)
                         ].eval_with_dict(params)
     i32ops += op_map[
-                        lp.Op(np.dtype(np.int32), 'mul', CG.WORKITEM)
+                        lp.Op(np.dtype(np.int32), 'mul', CG.SUBGROUP)
                         ].eval_with_dict(params)
 
-    assert f32mul+f32add == n*m*ell*2
+    # (count-per-sub-group)*n_subgroups
+    assert f32mul+f32add == m*2*n_subgroups
 
     mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                           subgroup_size=32)
+                                           subgroup_size=SGS)
 
     f32s1lb = mem_access_map[lp.MemAccess('global', np.float32,
                              lid_strides={0: 1, 1: Variable('ell')},
@@ -991,7 +1025,7 @@ def test_all_counters_parallel_matmul():
 
     local_mem_map = lp.get_mem_access_map(knl,
                         count_redundant_work=True,
-                        subgroup_size=32).filter_by(mtype=['local'])
+                        subgroup_size=SGS).filter_by(mtype=['local'])
 
     local_mem_l = local_mem_map.filter_by(direction=['load']
                                           ).eval_and_sum(params)
@@ -1067,8 +1101,6 @@ def test_summations_and_filters():
     knl = lp.add_and_infer_dtypes(knl,
                     dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64))
 
-    subgroup_size = 32
-
     n = 512
     m = 256
     ell = 128
@@ -1076,24 +1108,25 @@ def test_summations_and_filters():
 
     n_workgroups = 1
     group_size = 1
-    subgroups_per_group = div_ceil(group_size, subgroup_size)
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
 
     mem_map = lp.get_mem_access_map(knl, count_redundant_work=True,
-                                    subgroup_size=subgroup_size)
+                                    subgroup_size=SGS)
 
     loads_a = mem_map.filter_by(direction=['load'], variable=['a'],
                                 count_granularity=[CG.SUBGROUP]
                                 ).eval_and_sum(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert loads_a == (2*n*m*ell)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert loads_a == (2*n*m*ell)*n_subgroups
 
     global_stores = mem_map.filter_by(mtype=['global'], direction=['store'],
                                       count_granularity=[CG.SUBGROUP]
                                       ).eval_and_sum(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert global_stores == (n*m*ell + n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert global_stores == (n*m*ell + n*m)*n_subgroups
 
     ld_bytes = mem_map.filter_by(mtype=['global'], direction=['load'],
                                  count_granularity=[CG.SUBGROUP]
@@ -1102,9 +1135,9 @@ def test_summations_and_filters():
                                  count_granularity=[CG.SUBGROUP]
                                  ).to_bytes().eval_and_sum(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_workgroups*subgroups_per_group
-    assert st_bytes == (4*n*m*ell + 8*n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert ld_bytes == (4*n*m*ell*3 + 8*n*m*2)*n_subgroups
+    assert st_bytes == (4*n*m*ell + 8*n*m)*n_subgroups
 
     # ignore stride and variable names in this map
     reduced_map = mem_map.group_by('mtype', 'dtype', 'direction')
@@ -1113,11 +1146,11 @@ def test_summations_and_filters():
     f64lall = reduced_map[lp.MemAccess('global', np.float64, direction='load')
                           ].eval_with_dict(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f32lall == (3*n*m*ell)*n_workgroups*subgroups_per_group
-    assert f64lall == (2*n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f32lall == (3*n*m*ell)*n_subgroups
+    assert f64lall == (2*n*m)*n_subgroups
 
-    op_map = lp.get_op_map(knl, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
     #for k, v in op_map.items():
     #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)
 
@@ -1149,8 +1182,8 @@ def test_summations_and_filters():
                key.direction == 'load'
     f64l = mem_map.filter_by_func(func_filter).eval_and_sum(params)
 
-    # uniform: (count-per-sub-group)*n_workgroups*subgroups_per_group
-    assert f64l == (2*n*m)*n_workgroups*subgroups_per_group
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f64l == (2*n*m)*n_subgroups
 
 
 def test_strided_footprint():
-- 
GitLab


From e382422b64be0a67c5f892f5bfb22cae9aa5c846 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Wed, 1 Aug 2018 14:29:19 -0500
Subject: [PATCH 02/80] now counting local access with
 count_granularity=subgroup

---
 loopy/statistics.py     |  6 ++++--
 test/test_statistics.py | 13 ++++++++-----
 2 files changed, 12 insertions(+), 7 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 2df3093d..f8999367 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -928,7 +928,8 @@ class LocalMemAccessCounter(MemAccessCounter):
                     sub_map[MemAccess(
                                 mtype='local',
                                 dtype=dtype,
-                                count_granularity=CountGranularity.WORKITEM)
+                                #count_granularity=CountGranularity.WORKITEM)
+                                count_granularity=CountGranularity.SUBGROUP)
                             ] = 1
                     return sub_map
 
@@ -948,7 +949,8 @@ class LocalMemAccessCounter(MemAccessCounter):
                         lid_strides=dict(sorted(six.iteritems(lid_strides))),
                         gid_strides=dict(sorted(six.iteritems(gid_strides))),
                         variable=name,
-                        count_granularity=CountGranularity.WORKITEM)] = 1
+                        #count_granularity=CountGranularity.WORKITEM)] = 1
+                        count_granularity=CountGranularity.SUBGROUP)] = 1
 
         return sub_map
 
diff --git a/test/test_statistics.py b/test/test_statistics.py
index b5b55347..3f236652 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -1029,29 +1029,32 @@ def test_all_counters_parallel_matmul():
 
     local_mem_l = local_mem_map.filter_by(direction=['load']
                                           ).eval_and_sum(params)
-    assert local_mem_l == n*m*ell*2
+    # (count-per-sub-group)*n_subgroups
+    assert local_mem_l == m*2*n_subgroups
 
     local_mem_l_a = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                                direction='load',
                                                lid_strides={1: 16},
                                                gid_strides={},
                                                variable='a_fetch',
-                                               count_granularity=CG.WORKITEM)
+                                               count_granularity=CG.SUBGROUP)
                                   ].eval_with_dict(params)
     local_mem_l_b = local_mem_map[lp.MemAccess('local', np.dtype(np.float32),
                                                direction='load',
                                                lid_strides={0: 1},
                                                gid_strides={},
                                                variable='b_fetch',
-                                               count_granularity=CG.WORKITEM)
+                                               count_granularity=CG.SUBGROUP)
                                   ].eval_with_dict(params)
 
-    assert local_mem_l_a == local_mem_l_b == n*m*ell
+    # (count-per-sub-group)*n_subgroups
+    assert local_mem_l_a == local_mem_l_b == m*n_subgroups
 
     local_mem_s = local_mem_map.filter_by(direction=['store']
                                           ).eval_and_sum(params)
 
-    assert local_mem_s == n*m*ell*2/bsize
+    # (count-per-sub-group)*n_subgroups
+    assert local_mem_s == m*2/bsize*n_subgroups
 
 
 def test_gather_access_footprint():
-- 
GitLab


From 906090e11804a78c0c06455f5ea29b7e61657868 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Sun, 12 Aug 2018 16:38:04 +0530
Subject: [PATCH 03/80] introduce Program and callable interfaces across loopy
 (those were a lot of changes :o)

---
 doc/index.rst                             |   1 +
 examples/python/global_barrier_removal.py |   2 +-
 examples/python/hello-loopy.py            |   3 +-
 examples/python/ispc-stream-harness.py    |   2 -
 examples/python/sparse.py                 |   4 +-
 loopy/__init__.py                         |  36 +-
 loopy/auto_test.py                        | 289 ++++++--------
 loopy/check.py                            | 137 ++++++-
 loopy/cli.py                              |   2 +-
 loopy/codegen/__init__.py                 |  90 ++++-
 loopy/codegen/control.py                  |   3 +-
 loopy/codegen/loop.py                     |   2 +-
 loopy/codegen/result.py                   |   2 +-
 loopy/isl_helpers.py                      |   2 +-
 loopy/kernel/__init__.py                  | 132 ++++---
 loopy/kernel/creation.py                  |  35 +-
 loopy/kernel/data.py                      |   6 +-
 loopy/kernel/instruction.py               |  34 +-
 loopy/kernel/tools.py                     |  35 +-
 loopy/library/function.py                 |  54 +--
 loopy/library/random123.py                | 108 ++---
 loopy/library/reduction.py                | 256 ++++++------
 loopy/loop.py                             |   2 +
 loopy/preprocess.py                       | 320 +++++++++++++--
 loopy/schedule/__init__.py                |  21 +-
 loopy/statistics.py                       | 462 ++++++++++++++--------
 loopy/symbolic.py                         | 105 ++++-
 loopy/target/__init__.py                  |   9 +-
 loopy/target/c/__init__.py                | 245 ++++++------
 loopy/target/c/c_execution.py             |  39 +-
 loopy/target/c/codegen/expression.py      |  92 ++---
 loopy/target/cuda.py                      |  98 +++--
 loopy/target/execution.py                 | 116 +++---
 loopy/target/ispc.py                      |   5 +-
 loopy/target/opencl.py                    | 209 ++++++----
 loopy/target/pyopencl.py                  | 129 ++++--
 loopy/target/pyopencl_execution.py        |  61 +--
 loopy/target/python.py                    |  57 ++-
 loopy/tools.py                            |   3 +-
 loopy/transform/add_barrier.py            |  12 +-
 loopy/transform/arithmetic.py             |   6 +
 loopy/transform/batch.py                  |   8 +-
 loopy/transform/buffer.py                 |  43 +-
 loopy/transform/data.py                   |  54 ++-
 loopy/transform/diff.py                   |   3 +
 loopy/transform/fusion.py                 |  56 ++-
 loopy/transform/iname.py                  |  60 ++-
 loopy/transform/instruction.py            |  37 +-
 loopy/transform/padding.py                |  15 +-
 loopy/transform/parameter.py              |   6 +
 loopy/transform/precompute.py             |  38 +-
 loopy/transform/save.py                   |  27 +-
 loopy/transform/subst.py                  |  20 +-
 loopy/type_inference.py                   | 354 +++++++++++++++--
 test/test_apps.py                         |  19 +-
 test/test_c_execution.py                  |   1 +
 test/test_diff.py                         |   3 +-
 test/test_domain.py                       |  74 ++--
 test/test_fortran.py                      |  12 +-
 test/test_loopy.py                        | 393 +++++++++---------
 test/test_numa_diff.py                    |   4 +-
 test/test_reduction.py                    |  46 ++-
 test/test_target.py                       |  14 +-
 test/test_transform.py                    | 116 +++---
 test/testlib.py                           |  50 ++-
 65 files changed, 3071 insertions(+), 1608 deletions(-)

diff --git a/doc/index.rst b/doc/index.rst
index d862a8ac..0644b34c 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -68,6 +68,7 @@ Please check :ref:`installation` to get started.
     ref_creation
     ref_kernel
     ref_transform
+    ref_call
     ref_other
     misc
 
diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py
index 7ab049cd..cc4926fe 100644
--- a/examples/python/global_barrier_removal.py
+++ b/examples/python/global_barrier_removal.py
@@ -24,7 +24,7 @@ from loopy.preprocess import preprocess_kernel
 knl = preprocess_kernel(knl)
 
 from loopy.schedule import get_one_scheduled_kernel
-knl = get_one_scheduled_kernel(knl)
+knl = get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info)
 
 # map schedule onto host or device
 print(knl)
diff --git a/examples/python/hello-loopy.py b/examples/python/hello-loopy.py
index 9098c544..764cea0e 100644
--- a/examples/python/hello-loopy.py
+++ b/examples/python/hello-loopy.py
@@ -16,7 +16,8 @@ a = cl.array.arange(queue, n, dtype=np.float32)
 # ------
 knl = lp.make_kernel(
         "{ [i]: 0<=i<n }",
-        "out[i] = 2*a[i]")
+        "out[i] = 2*a[i]",
+        target=lp.PyOpenCLTarget(ctx.devices[0]))
 
 # transform
 # ---------
diff --git a/examples/python/ispc-stream-harness.py b/examples/python/ispc-stream-harness.py
index fa581d42..90f31f09 100644
--- a/examples/python/ispc-stream-harness.py
+++ b/examples/python/ispc-stream-harness.py
@@ -29,8 +29,6 @@ def transform(knl, vars, stream_dtype):
 
 
 def gen_code(knl):
-    knl = lp.preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
     codegen_result = lp.generate_code_v2(knl)
 
     return codegen_result.device_code() + "\n" + codegen_result.host_code()
diff --git a/examples/python/sparse.py b/examples/python/sparse.py
index a13ba34a..fbeb538f 100644
--- a/examples/python/sparse.py
+++ b/examples/python/sparse.py
@@ -14,6 +14,6 @@ k = lp.make_kernel([
     """)
 
 k = lp.add_and_infer_dtypes(k, {
-    "values,x": np.float64, "rowstarts,colindices": k.index_dtype
+    "values,x": np.float64, "rowstarts,colindices": k.root_kernel.index_dtype
     })
-print(lp.generate_code(k)[0])
+print(lp.generate_code_v2(k).device_code())
diff --git a/loopy/__init__.py b/loopy/__init__.py
index f50ce237..8ebd4d0e 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -29,13 +29,10 @@ from six.moves import range, zip
 from loopy.symbolic import (
         TaggedVariable, Reduction, LinearSubscript, TypeCast)
 from loopy.diagnostic import LoopyError, LoopyWarning
-
+from loopy.program import iterate_over_kernels_if_given_program
 
 # {{{ imported user interface
 
-from loopy.library.function import (
-        default_function_mangler, single_arg_function_mangler)
-
 from loopy.kernel.instruction import (
         MemoryOrdering, memory_ordering,
         MemoryScope, memory_scope,
@@ -51,6 +48,10 @@ from loopy.kernel.data import (
         TemporaryVariable,
         SubstitutionRule,
         CallMangleInfo)
+from loopy.kernel.function_interface import (
+        CallableKernel, ScalarCallable)
+from loopy.program import (
+        Program, make_program_from_kernel)
 
 from loopy.kernel import LoopKernel, KernelState, kernel_state
 from loopy.kernel.tools import (
@@ -63,7 +64,7 @@ from loopy.kernel.tools import (
         get_subkernels,
         get_subkernel_to_insn_id_map)
 from loopy.types import to_loopy_type
-from loopy.kernel.creation import make_kernel, UniqueName
+from loopy.kernel.creation import make_kernel, UniqueName, make_kernel_function
 from loopy.library.reduction import register_reduction_parser
 
 # {{{ import transforms
@@ -119,10 +120,14 @@ from loopy.transform.batch import to_batched
 from loopy.transform.parameter import assume, fix_parameters
 from loopy.transform.save import save_and_reload_temporaries
 from loopy.transform.add_barrier import add_barrier
+from loopy.transform.callable import (
+        register_function_id_to_in_knl_callable_mapper)
+
 # }}}
 
 from loopy.type_inference import infer_unknown_types
-from loopy.preprocess import preprocess_kernel, realize_reduction
+from loopy.preprocess import (preprocess_kernel, realize_reduction,
+        preprocess_program)
 from loopy.schedule import generate_loop_schedules, get_one_scheduled_kernel
 from loopy.statistics import (ToCountMap, CountGranularity, stringify_stats_mapping,
         Op, MemAccess, get_op_poly, get_op_map, get_lmem_access_poly,
@@ -168,6 +173,10 @@ __all__ = [
         "CallInstruction", "CInstruction", "NoOpInstruction",
         "BarrierInstruction",
 
+        "ScalarCallable", "CallableKernel",
+
+        "Program", "make_program_from_kernel",
+
         "KernelArgument",
         "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg",
         "AddressSpace", "temp_var_scope",   # temp_var_scope is deprecated
@@ -175,9 +184,7 @@ __all__ = [
         "SubstitutionRule",
         "CallMangleInfo",
 
-        "default_function_mangler", "single_arg_function_mangler",
-
-        "make_kernel", "UniqueName",
+        "make_kernel", "UniqueName", "make_kernel_function",
 
         "register_reduction_parser",
 
@@ -230,6 +237,8 @@ __all__ = [
 
         "add_barrier",
 
+        "register_function_id_to_in_knl_callable_mapper",
+
         # }}}
 
         "get_dot_dependency_graph",
@@ -245,7 +254,7 @@ __all__ = [
 
         "infer_unknown_types",
 
-        "preprocess_kernel", "realize_reduction",
+        "preprocess_kernel", "realize_reduction", "preprocess_program",
         "generate_loop_schedules", "get_one_scheduled_kernel",
         "GeneratedProgram", "CodeGenerationResult",
         "PreambleInfo",
@@ -293,6 +302,7 @@ __all__ = [
 
 # {{{ set_options
 
+@iterate_over_kernels_if_given_program
 def set_options(kernel, *args, **kwargs):
     """Return a new kernel with the options given as keyword arguments, or from
     a string representation passed in as the first (and only) positional
@@ -300,6 +310,7 @@ def set_options(kernel, *args, **kwargs):
 
     See also :class:`Options`.
     """
+    assert isinstance(kernel, LoopKernel)
 
     if args and kwargs:
         raise TypeError("cannot pass both positional and keyword arguments")
@@ -331,6 +342,7 @@ def set_options(kernel, *args, **kwargs):
 
 # {{{ library registration
 
+@iterate_over_kernels_if_given_program
 def register_preamble_generators(kernel, preamble_generators):
     """
     :arg manglers: list of functions of signature ``(preamble_info)``
@@ -355,6 +367,7 @@ def register_preamble_generators(kernel, preamble_generators):
     return kernel.copy(preamble_generators=new_pgens)
 
 
+@iterate_over_kernels_if_given_program
 def register_symbol_manglers(kernel, manglers):
     from loopy.tools import unpickles_equally
 
@@ -372,6 +385,7 @@ def register_symbol_manglers(kernel, manglers):
     return kernel.copy(symbol_manglers=new_manglers)
 
 
+@iterate_over_kernels_if_given_program
 def register_function_manglers(kernel, manglers):
     """
     :arg manglers: list of functions of signature ``(kernel, name, arg_dtypes)``
@@ -437,7 +451,7 @@ class CacheMode(object):
 # {{{ make copy kernel
 
 def make_copy_kernel(new_dim_tags, old_dim_tags=None):
-    """Returns a :class:`LoopKernel` that changes the data layout
+    """Returns a :class:`loopy.Program` that changes the data layout
     of a variable (called "input") to the new layout specified by
     *new_dim_tags* from the one specified by *old_dim_tags*.
     *old_dim_tags* defaults to an all-C layout of the same rank
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index 015c82dd..5ce80ed8 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -29,6 +29,7 @@ from warnings import warn
 import numpy as np
 
 import loopy as lp
+
 from loopy.diagnostic import LoopyError, AutomaticTestFailure
 
 
@@ -75,7 +76,7 @@ class TestArgInfo(Record):
 
 # {{{ "reference" arguments
 
-def make_ref_args(kernel, impl_arg_info, queue, parameters):
+def make_ref_args(program, impl_arg_info, queue, parameters):
     import pyopencl as cl
     import pyopencl.array as cl_array
 
@@ -88,7 +89,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters):
     ref_arg_data = []
 
     for arg in impl_arg_info:
-        kernel_arg = kernel.impl_arg_to_arg.get(arg.name)
+        kernel_arg = program.impl_arg_to_arg.get(arg.name)
 
         if arg.arg_class is ValueArg:
             if arg.offset_for_name:
@@ -117,7 +118,7 @@ def make_ref_args(kernel, impl_arg_info, queue, parameters):
             shape = evaluate_shape(arg.unvec_shape, parameters)
             dtype = kernel_arg.dtype
 
-            is_output = arg.base_name in kernel.get_written_variables()
+            is_output = arg.base_name in program.root_kernel.get_written_variables()
 
             if arg.arg_class is ImageArg:
                 storage_array = ary = cl_array.empty(
@@ -366,7 +367,7 @@ def _enumerate_cl_devices_for_ref_test(blacklist_ref_vendors):
 # {{{ main automatic testing entrypoint
 
 def auto_test_vs_ref(
-        ref_knl, ctx, test_knl=None, op_count=[], op_label=[], parameters={},
+        ref_prog, ctx, test_prog=None, op_count=[], op_label=[], parameters={},
         print_ref_code=False, print_code=True, warmup_rounds=2,
         dump_binary=False,
         fills_entire_output=None, do_check=True, check_result=None,
@@ -383,24 +384,26 @@ def auto_test_vs_ref(
 
     import pyopencl as cl
 
-    if test_knl is None:
-        test_knl = ref_knl
+    if test_prog is None:
+        test_prog = ref_prog
         do_check = False
 
-    if len(ref_knl.args) != len(test_knl.args):
-        raise LoopyError("ref_knl and test_knl do not have the same number "
+    ref_prog = lp.preprocess_kernel(ref_prog)
+    test_prog = lp.preprocess_kernel(test_prog)
+
+    if len(ref_prog.args) != len(test_prog.args):
+        raise LoopyError("ref_prog and test_prog do not have the same number "
                 "of arguments")
 
-    for i, (ref_arg, test_arg) in enumerate(zip(ref_knl.args, test_knl.args)):
+    for i, (ref_arg, test_arg) in enumerate(zip(ref_prog.args, test_prog.args)):
         if ref_arg.name != test_arg.name:
-            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
-                    "%d (1-based)" % (i+1))
+            raise LoopyError("ref_prog and test_prog argument lists disagree at "
+                    "index %d (1-based)" % (i+1))
 
         if ref_arg.dtype != test_arg.dtype:
-            raise LoopyError("ref_knl and test_knl argument lists disagree at index "
-                    "%d (1-based)" % (i+1))
+            raise LoopyError("ref_prog and test_prog argument lists disagree at "
+                    "index %d (1-based)" % (i+1))
 
-    from loopy.compiled import CompiledKernel
     from loopy.target.execution import get_highlighted_code
 
     if isinstance(op_count, (int, float)):
@@ -421,7 +424,7 @@ def auto_test_vs_ref(
     # {{{ compile and run reference code
 
     from loopy.type_inference import infer_unknown_types
-    ref_knl = infer_unknown_types(ref_knl, expect_completion=True)
+    ref_prog = infer_unknown_types(ref_prog, expect_completion=True)
 
     found_ref_device = False
 
@@ -431,30 +434,25 @@ def auto_test_vs_ref(
         ref_ctx = cl.Context([dev])
         ref_queue = cl.CommandQueue(ref_ctx,
                 properties=cl.command_queue_properties.PROFILING_ENABLE)
+        ref_codegen_result = lp.generate_code_v2(ref_prog)
 
-        pp_ref_knl = lp.preprocess_kernel(ref_knl)
-
-        for knl in lp.generate_loop_schedules(pp_ref_knl):
-            ref_sched_kernel = knl
-            break
+        ref_implemented_data_info = ref_codegen_result.implemented_data_info
 
         logger.info("%s (ref): trying %s for the reference calculation" % (
-            ref_knl.name, dev))
+            ref_prog.name, dev))
 
-        ref_compiled = CompiledKernel(ref_ctx, ref_sched_kernel)
         if not quiet and print_ref_code:
             print(75*"-")
             print("Reference Code:")
             print(75*"-")
-            print(get_highlighted_code(ref_compiled.get_code()))
+            print(get_highlighted_code(
+                ref_codegen_result.device_code()))
             print(75*"-")
 
-        ref_kernel_info = ref_compiled.kernel_info(frozenset())
-
         try:
             ref_args, ref_arg_data = \
-                    make_ref_args(ref_sched_kernel,
-                            ref_kernel_info.implemented_data_info,
+                    make_ref_args(ref_prog,
+                            ref_implemented_data_info,
                             ref_queue, parameters)
             ref_args["out_host"] = False
         except cl.RuntimeError as e:
@@ -479,13 +477,13 @@ def auto_test_vs_ref(
         ref_queue.finish()
 
         logger.info("%s (ref): using %s for the reference calculation" % (
-            ref_knl.name, dev))
-        logger.info("%s (ref): run" % ref_knl.name)
+            ref_prog.name, dev))
+        logger.info("%s (ref): run" % ref_prog.name)
 
         ref_start = time()
 
         if not AUTO_TEST_SKIP_RUN:
-            ref_evt, _ = ref_compiled(ref_queue, **ref_args)
+            ref_evt, _ = ref_prog(ref_queue, **ref_args)
         else:
             ref_evt = cl.enqueue_marker(ref_queue)
 
@@ -493,7 +491,7 @@ def auto_test_vs_ref(
         ref_stop = time()
         ref_elapsed_wall = ref_stop-ref_start
 
-        logger.info("%s (ref): run done" % ref_knl.name)
+        logger.info("%s (ref): run done" % ref_prog.name)
 
         ref_evt.wait()
         ref_elapsed_event = 1e-9*(ref_evt.profile.END-ref_evt.profile.START)
@@ -514,161 +512,136 @@ def auto_test_vs_ref(
     queue = cl.CommandQueue(ctx,
             properties=cl.command_queue_properties.PROFILING_ENABLE)
 
-    args = None
-    from loopy.kernel import KernelState
-    from loopy.target.pyopencl import PyOpenCLTarget
-    if test_knl.state not in [
-            KernelState.PREPROCESSED,
-            KernelState.SCHEDULED]:
-        if isinstance(test_knl.target, PyOpenCLTarget):
-            test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0]))
+    from loopy.type_inference import infer_unknown_types
 
-        test_knl = lp.preprocess_kernel(test_knl)
+    test_prog = infer_unknown_types(test_prog, expect_completion=True)
+    test_prog_codegen_result = lp.generate_code_v2(test_prog)
+
+    args = make_args(test_prog,
+            test_prog_codegen_result.implemented_data_info,
+            queue, ref_arg_data, parameters)
+    args["out_host"] = False
+
+    if not quiet:
+        print(75*"-")
+        print("Kernel:")
+        print(75*"-")
+        if print_code:
+            print(get_highlighted_code(
+                test_prog_codegen_result.device_code()))
+            print(75*"-")
+        if dump_binary:
+            print(type(test_prog_codegen_result.cl_program))
+            print(test_prog_codegen_result.cl_program.binaries[0])
+            print(75*"-")
 
-    if not test_knl.schedule:
-        test_kernels = lp.generate_loop_schedules(test_knl)
-    else:
-        test_kernels = [test_knl]
+    logger.info("%s: run warmup" % (test_prog.name))
 
-    test_kernel_count = 0
+    for i in range(warmup_rounds):
+        if not AUTO_TEST_SKIP_RUN:
+            test_prog(queue, **args)
 
-    from loopy.type_inference import infer_unknown_types
-    for i, kernel in enumerate(test_kernels):
-        test_kernel_count += 1
-        if test_kernel_count > max_test_kernel_count:
-            break
+        if need_check and not AUTO_TEST_SKIP_RUN:
+            for arg_desc in ref_arg_data:
+                if arg_desc is None:
+                    continue
+                if not arg_desc.needs_checking:
+                    continue
 
-        kernel = infer_unknown_types(kernel, expect_completion=True)
+                from pyopencl.compyte.array import as_strided
+                ref_ary = as_strided(
+                        arg_desc.ref_storage_array.get(),
+                        shape=arg_desc.ref_shape,
+                        strides=arg_desc.ref_numpy_strides).flatten()
+                test_ary = as_strided(
+                        arg_desc.test_storage_array.get(),
+                        shape=arg_desc.test_shape,
+                        strides=arg_desc.test_numpy_strides).flatten()
+                common_len = min(len(ref_ary), len(test_ary))
+                ref_ary = ref_ary[:common_len]
+                test_ary = test_ary[:common_len]
 
-        compiled = CompiledKernel(ctx, kernel)
+                error_is_small, error = check_result(test_ary, ref_ary)
+                if not error_is_small:
+                    raise AutomaticTestFailure(error)
 
-        if args is None:
-            kernel_info = compiled.kernel_info(frozenset())
+                need_check = False
 
-            args = make_args(kernel,
-                    kernel_info.implemented_data_info,
-                    queue, ref_arg_data, parameters)
-        args["out_host"] = False
+    events = []
+    queue.finish()
 
-        if not quiet:
-            print(75*"-")
-            print("Kernel #%d:" % i)
-            print(75*"-")
-            if print_code:
-                print(compiled.get_highlighted_code())
-                print(75*"-")
-            if dump_binary:
-                print(type(compiled.cl_program))
-                print(compiled.cl_program.binaries[0])
-                print(75*"-")
+    logger.info("%s: warmup done" % (test_prog.name))
 
-        logger.info("%s: run warmup" % (knl.name))
+    logger.info("%s: timing run" % (test_prog.name))
 
-        for i in range(warmup_rounds):
-            if not AUTO_TEST_SKIP_RUN:
-                compiled(queue, **args)
-
-            if need_check and not AUTO_TEST_SKIP_RUN:
-                for arg_desc in ref_arg_data:
-                    if arg_desc is None:
-                        continue
-                    if not arg_desc.needs_checking:
-                        continue
-
-                    from pyopencl.compyte.array import as_strided
-                    ref_ary = as_strided(
-                            arg_desc.ref_storage_array.get(),
-                            shape=arg_desc.ref_shape,
-                            strides=arg_desc.ref_numpy_strides).flatten()
-                    test_ary = as_strided(
-                            arg_desc.test_storage_array.get(),
-                            shape=arg_desc.test_shape,
-                            strides=arg_desc.test_numpy_strides).flatten()
-                    common_len = min(len(ref_ary), len(test_ary))
-                    ref_ary = ref_ary[:common_len]
-                    test_ary = test_ary[:common_len]
-
-                    error_is_small, error = check_result(test_ary, ref_ary)
-                    if not error_is_small:
-                        raise AutomaticTestFailure(error)
-
-                    need_check = False
-
-        events = []
-        queue.finish()
+    timing_rounds = warmup_rounds
 
-        logger.info("%s: warmup done" % (knl.name))
+    while True:
+        from time import time
+        start_time = time()
 
-        logger.info("%s: timing run" % (knl.name))
+        evt_start = cl.enqueue_marker(queue)
 
-        timing_rounds = warmup_rounds
+        for i in range(timing_rounds):
+            if not AUTO_TEST_SKIP_RUN:
+                evt, _ = test_prog(queue, **args)
+                events.append(evt)
+            else:
+                events.append(cl.enqueue_marker(queue))
 
-        while True:
-            from time import time
-            start_time = time()
+        evt_end = cl.enqueue_marker(queue)
 
-            evt_start = cl.enqueue_marker(queue)
+        queue.finish()
+        stop_time = time()
 
-            for i in range(timing_rounds):
-                if not AUTO_TEST_SKIP_RUN:
-                    evt, _ = compiled(queue, **args)
-                    events.append(evt)
-                else:
-                    events.append(cl.enqueue_marker(queue))
+        for evt in events:
+            evt.wait()
+        evt_start.wait()
+        evt_end.wait()
 
-            evt_end = cl.enqueue_marker(queue)
+        elapsed_event = (1e-9*events[-1].profile.END
+                - 1e-9*events[0].profile.START) \
+                / timing_rounds
+        try:
+            elapsed_event_marker = ((1e-9*evt_end.profile.START
+                        - 1e-9*evt_start.profile.START)
+                    / timing_rounds)
+        except cl.RuntimeError:
+            elapsed_event_marker = None
 
-            queue.finish()
-            stop_time = time()
+        elapsed_wall = (stop_time-start_time)/timing_rounds
 
-            for evt in events:
-                evt.wait()
-            evt_start.wait()
-            evt_end.wait()
+        if elapsed_wall * timing_rounds < 0.3:
+            timing_rounds *= 4
+        else:
+            break
 
-            elapsed_event = (1e-9*events[-1].profile.END
-                    - 1e-9*events[0].profile.START) \
-                    / timing_rounds
-            try:
-                elapsed_event_marker = ((1e-9*evt_end.profile.START
-                            - 1e-9*evt_start.profile.START)
-                        / timing_rounds)
-            except cl.RuntimeError:
-                elapsed_event_marker = None
+    logger.info("%s: timing run done" % (test_prog.name))
 
-            elapsed_wall = (stop_time-start_time)/timing_rounds
+    rates = ""
+    for cnt, lbl in zip(op_count, op_label):
+        rates += " %g %s/s" % (cnt/elapsed_wall, lbl)
 
-            if elapsed_wall * timing_rounds < 0.3:
-                timing_rounds *= 4
+    if not quiet:
+        def format_float_or_none(v):
+            if v is None:
+                return "<unavailable>"
             else:
-                break
+                return "%g" % v
 
-        logger.info("%s: timing run done" % (knl.name))
+        print("elapsed: %s s event, %s s marker-event %s s wall "
+                "(%d rounds)%s" % (
+                    format_float_or_none(elapsed_event),
+                    format_float_or_none(elapsed_event_marker),
+                    format_float_or_none(elapsed_wall), timing_rounds, rates))
 
-        rates = ""
+    if do_check:
+        ref_rates = ""
         for cnt, lbl in zip(op_count, op_label):
-            rates += " %g %s/s" % (cnt/elapsed_wall, lbl)
-
+            ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl)
         if not quiet:
-            def format_float_or_none(v):
-                if v is None:
-                    return "<unavailable>"
-                else:
-                    return "%g" % v
-
-            print("elapsed: %s s event, %s s marker-event %s s wall "
-                    "(%d rounds)%s" % (
-                        format_float_or_none(elapsed_event),
-                        format_float_or_none(elapsed_event_marker),
-                        format_float_or_none(elapsed_wall), timing_rounds, rates))
-
-        if do_check:
-            ref_rates = ""
-            for cnt, lbl in zip(op_count, op_label):
-                ref_rates += " %g %s/s" % (cnt/ref_elapsed_event, lbl)
-            if not quiet:
-                print("ref: elapsed: %g s event, %g s wall%s" % (
-                        ref_elapsed_event, ref_elapsed_wall, ref_rates))
+            print("ref: elapsed: %g s event, %g s wall%s" % (
+                    ref_elapsed_event, ref_elapsed_wall, ref_rates))
 
     # }}}
 
diff --git a/loopy/check.py b/loopy/check.py
index c31304d8..ae5599bc 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -27,9 +27,13 @@ from six.moves import range
 
 from islpy import dim_type
 import islpy as isl
-from loopy.symbolic import WalkMapper
+from loopy.symbolic import WalkMapper, CombineMapper, ResolvedFunction
 from loopy.diagnostic import LoopyError, WriteRaceConditionWarning, warn_with_kernel
 
+from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction,
+        _DataObliviousInstruction)
+from functools import reduce
+
 import logging
 logger = logging.getLogger(__name__)
 
@@ -56,6 +60,73 @@ def check_identifiers_in_subst_rules(knl):
                     % (knl.name, rule.name,
                        ", ".join(deps-rule_allowed_identifiers)))
 
+
+class UnscopedCallCollector(CombineMapper):
+    """
+    Collects all the unscoped calls within a kernel.
+
+    :returns:
+        A :class:`frozenset` of function names that are not scoped in
+        the kernel.
+
+    .. note::
+        Instances of :class:`loopy.library.reduction.ArgExtOp` are ignored, as
+        they are never scoped in the pipeline.
+    """
+
+    def combine(self, values):
+        import operator
+        return reduce(operator.or_, values, frozenset())
+
+    def map_call(self, expr):
+        from pymbolic.primitives import CallWithKwargs
+        return self.rec(CallWithKwargs(
+            function=expr.function, parameters=expr.parameters,
+            kw_parameters={}))
+
+    def map_call_with_kwargs(self, expr):
+        from loopy.library.reduction import ArgExtOp
+        if not isinstance(expr.function, (ResolvedFunction, ArgExtOp)):
+            return (frozenset([expr.function.name]) |
+                    self.combine((self.rec(child) for child in expr.parameters
+                        + tuple(expr.kw_parameters.values()))))
+        else:
+            return self.combine((self.rec(child) for child in
+                expr.parameters+tuple(expr.kw_parameters.values())))
+
+    def map_constant(self, expr):
+        return frozenset()
+
+    map_variable = map_constant
+    map_function_symbol = map_constant
+    map_tagged_variable = map_constant
+    map_type_cast = map_constant
+
+
+def check_functions_are_scoped(kernel):
+    """Check that all calls in the expressions of assignment instructions
+    have been scoped; otherwise raise a :class:`LoopyError` naming one
+    unscoped function. Refer to :class:`loopy.symbolic.ResolvedFunction` for
+    a detailed explanation of a scoped function.
+    """
+
+    from loopy.symbolic import SubstitutionRuleExpander
+    subst_expander = SubstitutionRuleExpander(kernel.substitutions)
+
+    for insn in kernel.instructions:
+        if isinstance(insn, MultiAssignmentBase):
+            unscoped_calls = UnscopedCallCollector()(subst_expander(
+                insn.expression))
+            if unscoped_calls:
+                raise LoopyError("Unknown function '%s' obtained -- register a "
+                        "function or a kernel corresponding to it." %
+                        set(unscoped_calls).pop())
+        elif isinstance(insn, (CInstruction, _DataObliviousInstruction)):
+            pass
+        else:
+            raise NotImplementedError(
+                    "Unknown type of instruction %s" % type(insn).__name__)
+
 # }}}
 
 
@@ -114,6 +185,18 @@ def check_loop_priority_inames_known(kernel):
                 raise LoopyError("unknown iname '%s' in loop priorities" % iname)
 
 
+def _get_all_unique_iname_tags(kernel):
+    """Returns a set of all the iname tags used in *kernel* that
+    inherit from :class:`loopy.kernel.data.UniqueTag`.
+    """
+    from loopy.kernel.data import UniqueTag
+    iname_tags = [kernel.iname_to_tag.get(iname) for iname in
+        kernel.all_inames()]
+    return set(
+            tag for tag in iname_tags if
+            isinstance(tag, UniqueTag))
+
+
 def check_multiple_tags_allowed(kernel):
     from loopy.kernel.data import (GroupIndexTag, LocalIndexTag, VectorizeTag,
                 UnrollTag, ForceSequentialTag, IlpBaseTag, filter_iname_tags_by_type)
@@ -128,8 +211,10 @@ def check_multiple_tags_allowed(kernel):
                                  "tags: {1}".format(iname, tags))
 
 
-def check_for_double_use_of_hw_axes(kernel):
+def check_for_double_use_of_hw_axes(kernel, program_callables_info):
     from loopy.kernel.data import UniqueTag
+    from loopy.kernel.instruction import CallInstruction
+    from loopy.kernel.function_interface import CallableKernel
 
     for insn in kernel.instructions:
         insn_tag_keys = set()
@@ -142,6 +227,21 @@ def check_for_double_use_of_hw_axes(kernel):
 
                 insn_tag_keys.add(key)
 
+        # check usage of iname tags in the callee kernel
+        if isinstance(insn, CallInstruction):
+            in_knl_callable = program_callables_info[
+                    insn.expression.function.name]
+            if isinstance(in_knl_callable, CallableKernel):
+                # check for collision in iname_tag keys in the instruction
+                # due to the callee kernel
+                common_iname_tags = [tag for tag in
+                        _get_all_unique_iname_tags(in_knl_callable.subkernel)
+                        if tag.key in insn_tag_keys]
+                if common_iname_tags:
+                    raise LoopyError("instruction '%s' has multiple "
+                            "inames tagged '%s'" % (insn.id,
+                                common_iname_tags.pop()))
+
 
 def check_for_inactive_iname_access(kernel):
     for insn in kernel.instructions:
@@ -387,11 +487,12 @@ def check_write_destinations(kernel):
 # {{{ check_has_schedulable_iname_nesting
 
 def check_has_schedulable_iname_nesting(kernel):
-    from loopy.transform.iname import (has_schedulable_iname_nesting,
-                                       get_iname_duplication_options)
-    if not has_schedulable_iname_nesting(kernel):
+    from loopy.transform.iname import (
+            has_schedulable_iname_nesting_for_single_kernel,
+            get_iname_duplication_options_for_single_kernel)
+    if not has_schedulable_iname_nesting_for_single_kernel(kernel):
         import itertools as it
-        opt = get_iname_duplication_options(kernel)
+        opt = get_iname_duplication_options_for_single_kernel(kernel)
         opt_str = "\n".join("* Duplicate %s within instructions %s" % (i, w)
                             for i, w in it.islice(opt, 3))
         raise LoopyError("Kernel does not have a schedulable iname nesting. "
@@ -616,13 +717,13 @@ def check_variable_access_ordered(kernel):
 # }}}
 
 
-def pre_schedule_checks(kernel):
+def pre_schedule_checks(kernel, program_callables_info):
     try:
         logger.debug("%s: pre-schedule check: start" % kernel.name)
 
         check_for_duplicate_insn_ids(kernel)
         check_for_orphaned_user_hardware_axes(kernel)
-        check_for_double_use_of_hw_axes(kernel)
+        check_for_double_use_of_hw_axes(kernel, program_callables_info)
         check_insn_attributes(kernel)
         check_loop_priority_inames_known(kernel)
         check_multiple_tags_allowed(kernel)
@@ -650,7 +751,8 @@ def pre_schedule_checks(kernel):
 
 # {{{ check for unused hw axes
 
-def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
+def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info,
+        sched_index=None):
     from loopy.schedule import (CallKernel, RunInstruction,
             Barrier, EnterLoop, LeaveLoop, ReturnFromKernel,
             get_insn_ids_for_block_at, gather_schedule_block)
@@ -665,7 +767,8 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
         assert isinstance(kernel.schedule[sched_index], CallKernel)
         _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
         group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs(
-                get_insn_ids_for_block_at(kernel.schedule, sched_index))
+                get_insn_ids_for_block_at(kernel.schedule, sched_index),
+                program_callables_info)
 
         group_axes = set(ax for ax, length in enumerate(group_size))
         local_axes = set(ax for ax, length in enumerate(local_size))
@@ -682,7 +785,8 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
     while i < loop_end_i:
         sched_item = kernel.schedule[i]
         if isinstance(sched_item, CallKernel):
-            i = _check_for_unused_hw_axes_in_kernel_chunk(kernel, i)
+            i = _check_for_unused_hw_axes_in_kernel_chunk(kernel,
+                    program_callables_info, i)
 
         elif isinstance(sched_item, RunInstruction):
             insn = kernel.id_to_insn[sched_item.insn_id]
@@ -733,9 +837,10 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, sched_index=None):
     return past_end_i
 
 
-def check_for_unused_hw_axes_in_insns(kernel):
+def check_for_unused_hw_axes_in_insns(kernel, program_callables_info):
     if kernel.schedule:
-        _check_for_unused_hw_axes_in_kernel_chunk(kernel)
+        _check_for_unused_hw_axes_in_kernel_chunk(kernel,
+                program_callables_info)
 
 # }}}
 
@@ -889,15 +994,15 @@ def check_that_shapes_and_strides_are_arguments(kernel):
 # }}}
 
 
-def pre_codegen_checks(kernel):
+def pre_codegen_checks(kernel, program_callables_info):
     try:
         logger.debug("pre-codegen check %s: start" % kernel.name)
 
-        check_for_unused_hw_axes_in_insns(kernel)
+        check_for_unused_hw_axes_in_insns(kernel, program_callables_info)
         check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel)
         check_that_temporaries_are_defined_in_subkernels_where_used(kernel)
         check_that_all_insns_are_scheduled(kernel)
-        kernel.target.pre_codegen_check(kernel)
+        kernel.target.pre_codegen_check(kernel, program_callables_info)
         check_that_shapes_and_strides_are_arguments(kernel)
 
         logger.debug("pre-codegen check %s: done" % kernel.name)
diff --git a/loopy/cli.py b/loopy/cli.py
index a92922b1..060340d5 100644
--- a/loopy/cli.py
+++ b/loopy/cli.py
@@ -205,7 +205,7 @@ def main():
         new_kernels = []
         for kernel in kernels:
             new_args = [
-                    lp.ArrayArg("occa_info", np.int32, shape=None)
+                    lp.GlobalArg("occa_info", np.int32, shape=None)
                     ] + kernel.args
             new_kernels.append(kernel.copy(args=new_args))
 
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 11f874e1..3e675db7 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -32,6 +32,10 @@ from pytools.persistent_dict import WriteOncePersistentDict
 from loopy.tools import LoopyKeyBuilder
 from loopy.version import DATA_MODEL_VERSION
 
+from loopy.kernel.function_interface import CallableKernel
+from cgen import Collection
+
+
 import logging
 logger = logging.getLogger(__name__)
 
@@ -146,6 +150,7 @@ class SeenFunction(ImmutableRecord):
 class CodeGenerationState(object):
     """
     .. attribute:: kernel
+    .. attribute:: target
     .. attribute:: implemented_data_info
 
         a list of :class:`ImplementedDataInfo` objects.
@@ -187,17 +192,21 @@ class CodeGenerationState(object):
         generated.
 
     .. attribute:: schedule_index_end
+
+    .. attribute:: program_callables_info
     """
 
-    def __init__(self, kernel,
+    def __init__(self, kernel, target,
             implemented_data_info, implemented_domain, implemented_predicates,
             seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map,
             allow_complex,
+            program_callables_info,
             vectorization_info=None, var_name_generator=None,
             is_generating_device_code=None,
             gen_program_name=None,
             schedule_index_end=None):
         self.kernel = kernel
+        self.target = target
         self.implemented_data_info = implemented_data_info
         self.implemented_domain = implemented_domain
         self.implemented_predicates = implemented_predicates
@@ -206,6 +215,7 @@ class CodeGenerationState(object):
         self.seen_atomic_dtypes = seen_atomic_dtypes
         self.var_subst_map = var_subst_map.copy()
         self.allow_complex = allow_complex
+        self.program_callables_info = program_callables_info
         self.vectorization_info = vectorization_info
         self.var_name_generator = var_name_generator
         self.is_generating_device_code = is_generating_device_code
@@ -214,7 +224,7 @@ class CodeGenerationState(object):
 
     # {{{ copy helpers
 
-    def copy(self, kernel=None, implemented_data_info=None,
+    def copy(self, kernel=None, target=None, implemented_data_info=None,
             implemented_domain=None, implemented_predicates=frozenset(),
             var_subst_map=None, vectorization_info=None,
             is_generating_device_code=None,
@@ -224,6 +234,9 @@ class CodeGenerationState(object):
         if kernel is None:
             kernel = self.kernel
 
+        if target is None:
+            target = self.target
+
         if implemented_data_info is None:
             implemented_data_info = self.implemented_data_info
 
@@ -244,6 +257,7 @@ class CodeGenerationState(object):
 
         return CodeGenerationState(
                 kernel=kernel,
+                target=target,
                 implemented_data_info=implemented_data_info,
                 implemented_domain=implemented_domain or self.implemented_domain,
                 implemented_predicates=(
@@ -253,6 +267,7 @@ class CodeGenerationState(object):
                 seen_atomic_dtypes=self.seen_atomic_dtypes,
                 var_subst_map=var_subst_map or self.var_subst_map,
                 allow_complex=self.allow_complex,
+                program_callables_info=self.program_callables_info,
                 vectorization_info=vectorization_info,
                 var_name_generator=self.var_name_generator,
                 is_generating_device_code=is_generating_device_code,
@@ -374,19 +389,15 @@ class PreambleInfo(ImmutableRecord):
 
 # {{{ main code generation entrypoint
 
-def generate_code_v2(kernel):
+def generate_code_for_a_single_kernel(kernel, program_callables_info, target):
     """
     :returns: a :class:`CodeGenerationResult`
     """
 
     from loopy.kernel import KernelState
-    if kernel.state == KernelState.INITIAL:
-        from loopy.preprocess import preprocess_kernel
-        kernel = preprocess_kernel(kernel)
-
     if kernel.schedule is None:
         from loopy.schedule import get_one_scheduled_kernel
-        kernel = get_one_scheduled_kernel(kernel)
+        kernel = get_one_scheduled_kernel(kernel, program_callables_info)
 
     if kernel.state != KernelState.SCHEDULED:
         raise LoopyError("cannot generate code for a kernel that has not been "
@@ -407,11 +418,8 @@ def generate_code_v2(kernel):
 
     # }}}
 
-    from loopy.type_inference import infer_unknown_types
-    kernel = infer_unknown_types(kernel, expect_completion=True)
-
     from loopy.check import pre_codegen_checks
-    pre_codegen_checks(kernel)
+    pre_codegen_checks(kernel, program_callables_info)
 
     logger.info("%s: generate code: start" % kernel.name)
 
@@ -469,10 +477,12 @@ def generate_code_v2(kernel):
             gen_program_name=(
                 kernel.target.host_program_name_prefix
                 + kernel.name
-                + kernel.target.host_program_name_suffix),
-            schedule_index_end=len(kernel.schedule))
+                + target.host_program_name_suffix),
+            schedule_index_end=len(kernel.schedule),
+            program_callables_info=program_callables_info)
 
     from loopy.codegen.result import generate_host_or_device_program
+
     codegen_result = generate_host_or_device_program(
             codegen_state,
             schedule_index=0)
@@ -502,7 +512,7 @@ def generate_code_v2(kernel):
             )
 
     preamble_generators = (kernel.preamble_generators
-            + kernel.target.get_device_ast_builder().preamble_generators())
+            + target.get_device_ast_builder().preamble_generators())
     for prea_gen in preamble_generators:
         preambles.extend(prea_gen(preamble_info))
 
@@ -524,6 +534,56 @@ def generate_code_v2(kernel):
     return codegen_result
 
 
+def generate_code_v2(program):
+    from loopy.kernel import LoopKernel
+    from loopy.program import make_program_from_kernel
+
+    if isinstance(program, LoopKernel):
+        program = make_program_from_kernel(program)
+
+    from loopy.kernel import KernelState
+    if program.root_kernel.state == KernelState.INITIAL:
+        from loopy.preprocess import preprocess_program
+        program = preprocess_program(program)
+
+    from loopy.type_inference import infer_unknown_types
+    program = infer_unknown_types(program, expect_completion=True)
+
+    codegen_results = {}
+
+    for func_id, in_knl_callable in program.program_callables_info.items():
+        if isinstance(in_knl_callable, CallableKernel):
+            codegen_results[func_id] = (
+                    generate_code_for_a_single_kernel(in_knl_callable.subkernel,
+                        program.program_callables_info, program.target))
+
+    device_preambles = set()
+    for cgr in codegen_results.values():
+        device_preambles.update(cgr.device_preambles)
+
+    for in_knl_callable in program.program_callables_info.values():
+        for preamble in in_knl_callable.generate_preambles(program.target):
+            device_preambles.update([preamble])
+
+    collective_device_program = codegen_results[program.name].device_programs[0]
+    for func_id, callee_cgr in codegen_results.items():
+        if func_id != program.name:
+            assert len(callee_cgr.device_programs) == 1
+            callee_prog_ast = callee_cgr.device_programs[0].ast
+            collective_device_program = collective_device_program.copy(
+                    ast=Collection([callee_prog_ast, collective_device_program.ast]))
+
+            device_preambles.update([('98_%s' % func_id,
+                str(callee_prog_ast.fdecl)), ])
+
+    collective_device_programs = [collective_device_program] + (
+            codegen_results[program.name].device_programs[1:])
+
+    return codegen_results[program.name].copy(
+            device_programs=collective_device_programs,
+            device_preambles=device_preambles)
+
+
 def generate_code(kernel, device=None):
     if device is not None:
         from warnings import warn
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index 45e2a18c..90bdbda3 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -115,7 +115,8 @@ def generate_code_for_sched_index(codegen_state, sched_index):
                 new_codegen_state, sched_index)
 
         glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
-                get_insn_ids_for_block_at(kernel.schedule, sched_index))
+                get_insn_ids_for_block_at(kernel.schedule, sched_index),
+                codegen_state.program_callables_info)
 
         return merge_codegen_results(codegen_state, [
             codegen_result,
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index ebddf315..39cf20c7 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -248,7 +248,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
         return next_func(codegen_state)
 
     global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
-            insn_ids_for_block)
+            insn_ids_for_block, codegen_state.program_callables_info)
 
     hw_inames_left = hw_inames_left[:]
     iname = hw_inames_left.pop()
diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index 4318ad71..00f19d99 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -133,7 +133,7 @@ class CodeGenerationResult(ImmutableRecord):
         preamble_codes = process_preambles(
                 getattr(self, "host_preambles", [])
                 +
-                getattr(self, "device_preambles", [])
+                list(getattr(self, "device_preambles", []))
                 )
 
         return (
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index 5a747d07..ef07b7e2 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -27,7 +27,7 @@ THE SOFTWARE.
 
 from six.moves import range, zip
 
-from loopy.diagnostic import StaticValueFindingError
+from loopy.diagnostic import StaticValueFindingError, LoopyError
 
 import islpy as isl
 from islpy import dim_type
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 6b003380..d2723c57 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -37,10 +37,6 @@ import re
 
 from pytools import UniqueNameGenerator, generate_unique_names
 
-from loopy.library.function import (
-        default_function_mangler,
-        single_arg_function_mangler)
-
 from loopy.diagnostic import CannotBranchDomainTree, LoopyError
 from loopy.tools import natsorted
 from loopy.diagnostic import StaticValueFindingError
@@ -224,6 +220,12 @@ class LoopKernel(ImmutableRecordWithoutPickling):
     .. attribute:: target
 
         A subclass of :class:`loopy.TargetBase`.
+
+    .. attribute:: is_called_from_host
+
+        An instance of :class:`bool`. Set to *False* for a kernel that is
+        called from another top-level kernel. Default value is
+        *True*.
     """
 
     # {{{ constructor
@@ -252,6 +254,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             state=KernelState.INITIAL,
             target=None,
 
+            is_called_from_host=True,
+
             overridden_get_grid_sizes_for_insn_ids=None,
             _cached_written_variables=None):
         """
@@ -277,15 +281,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         if substitutions is None:
             substitutions = {}
         if function_manglers is None:
-            function_manglers = [
-                default_function_mangler,
-                single_arg_function_mangler,
-                ]
-        if symbol_manglers is None:
-            function_manglers = [
-                default_function_mangler,
-                single_arg_function_mangler,
-                ]
+            function_manglers = []
         if iname_slab_increments is None:
             iname_slab_increments = {}
 
@@ -372,6 +368,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 options=options,
                 state=state,
                 target=target,
+                is_called_from_host=is_called_from_host,
                 overridden_get_grid_sizes_for_insn_ids=(
                     overridden_get_grid_sizes_for_insn_ids),
                 _cached_written_variables=_cached_written_variables)
@@ -380,7 +377,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
     # }}}
 
-    # {{{ function mangling
+    # {{{ function mangling/scoping
 
     def mangle_function(self, identifier, arg_dtypes, ast_builder=None):
         if ast_builder is None:
@@ -1039,21 +1036,25 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 self.get_iname_bounds(iname, constants_only=True).size,
                 constants_only=True)))
 
-    @memoize_method
-    def get_grid_sizes_for_insn_ids(self, insn_ids, ignore_auto=False):
+    def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids,
+            program_callables_info, ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
         could accommodate execution of all instructions whose IDs are given
         in *insn_ids*.
 
         :arg insn_ids: a :class:`frozenset` of instruction IDs
 
-        *global_size* and *local_size* are :class:`islpy.PwAff` objects.
+        *global_size* and *local_size* are instances of :class:`dict`
+        mapping each ``axis`` to an :class:`islpy.PwAff` object.
         """
 
-        if self.overridden_get_grid_sizes_for_insn_ids:
-            return self.overridden_get_grid_sizes_for_insn_ids(
-                    insn_ids,
-                    ignore_auto=ignore_auto)
+        # {{{ collecting the callee kernels in insn_ids
+
+        from loopy.kernel.tools import get_direct_callee_kernels
+        callee_kernels = get_direct_callee_kernels(self,
+                program_callables_info, insn_ids)
+
+        # }}}
 
         all_inames_by_insns = set()
         for insn_id in insn_ids:
@@ -1068,6 +1069,15 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         global_sizes = {}
         local_sizes = {}
 
+        # updating the grid sizes from the callee_kernels.
+        for callee_kernel in callee_kernels:
+            gsize, lsize = callee_kernel.get_grid_sizes_for_insn_ids_as_dicts(
+                    frozenset(insn.id for insn in callee_kernel.instructions),
+                    program_callables_info, ignore_auto)
+
+            global_sizes.update(gsize)
+            local_sizes.update(lsize)
+
         from loopy.kernel.data import (
                 GroupIndexTag, LocalIndexTag,
                 AutoLocalIndexTagBase)
@@ -1108,6 +1118,31 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
             tgt_dict[tag.axis] = size
 
+        return global_sizes, local_sizes
+
+    def get_grid_sizes_for_insn_ids(self, insn_ids, program_callables_info,
+            ignore_auto=False):
+        """Return a tuple (global_size, local_size) containing a grid that
+        could accommodate execution of all instructions whose IDs are given
+        in *insn_ids*.
+
+        :arg insn_ids: a :class:`frozenset` of instruction IDs
+
+        *global_size* and *local_size* are :class:`islpy.PwAff` objects.
+        """
+
+        if self.overridden_get_grid_sizes_for_insn_ids:
+            return self.overridden_get_grid_sizes_for_insn_ids(
+                    insn_ids,
+                    program_callables_info=program_callables_info,
+                    ignore_auto=ignore_auto)
+
+        assert self.is_called_from_host, ("Callee kernels do not have sufficient "
+                "information to compute grid sizes.")
+
+        global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts(
+                insn_ids, program_callables_info, ignore_auto=ignore_auto)
+
         def to_dim_tuple(size_dict, which, forced_sizes={}):
             forced_sizes = forced_sizes.copy()
 
@@ -1137,7 +1172,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         return (to_dim_tuple(global_sizes, "global"),
                 to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes))
 
-    def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids, ignore_auto=False):
+    def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids,
+            program_callables_info, ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
         could accommodate execution of all instructions whose IDs are given
         in *insn_ids*.
@@ -1148,7 +1184,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         """
 
         grid_size, group_size = self.get_grid_sizes_for_insn_ids(
-                insn_ids, ignore_auto)
+                insn_ids, program_callables_info, ignore_auto)
 
         def tup_to_exprs(tup):
             from loopy.symbolic import pw_aff_to_expr
@@ -1156,7 +1192,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         return tup_to_exprs(grid_size), tup_to_exprs(group_size)
 
-    def get_grid_size_upper_bounds(self, ignore_auto=False):
+    def get_grid_size_upper_bounds(self, program_callables_info, ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
         could accommodate execution of *all* instructions in the kernel.
 
@@ -1164,9 +1200,11 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         """
         return self.get_grid_sizes_for_insn_ids(
                 frozenset(insn.id for insn in self.instructions),
+                program_callables_info,
                 ignore_auto=ignore_auto)
 
-    def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False):
+    def get_grid_size_upper_bounds_as_exprs(self, program_callables_info,
+            ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
         could accommodate execution of *all* instructions in the kernel.
 
@@ -1175,6 +1213,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         return self.get_grid_sizes_for_insn_ids_as_exprs(
                 frozenset(insn.id for insn in self.instructions),
+                program_callables_info,
                 ignore_auto=ignore_auto)
 
     # }}}
@@ -1365,47 +1404,13 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
     # }}}
 
-    # {{{ implementation arguments
-
-    @property
-    @memoize_method
-    def impl_arg_to_arg(self):
-        from loopy.kernel.array import ArrayBase
-
-        result = {}
-
-        for arg in self.args:
-            if not isinstance(arg, ArrayBase):
-                result[arg.name] = arg
-                continue
-
-            if arg.shape is None or arg.dim_tags is None:
-                result[arg.name] = arg
-                continue
-
-            subscripts_and_names = arg.subscripts_and_names()
-            if subscripts_and_names is None:
-                result[arg.name] = arg
-                continue
-
-            for index, sub_arg_name in subscripts_and_names:
-                result[sub_arg_name] = arg
-
-        return result
-
-    # }}}
-
     # {{{ direct execution
 
     def __call__(self, *args, **kwargs):
-        key = self.target.get_kernel_executor_cache_key(*args, **kwargs)
-        try:
-            kex = self._kernel_executor_cache[key]
-        except KeyError:
-            kex = self.target.get_kernel_executor(self, *args, **kwargs)
-            self._kernel_executor_cache[key] = kex
-
-        return kex(*args, **kwargs)
+        # FIXME: scream and then convert to a program
+        from loopy.program import make_program_from_kernel
+        program = make_program_from_kernel(self)
+        return program(*args, **kwargs)
 
     # }}}
 
@@ -1489,6 +1494,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             "silenced_warnings",
             "options",
             "state",
+            "is_called_from_host",
             "target",
             )
 
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index c42db348..bac4afc8 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -24,16 +24,19 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-
 import numpy as np
 
 from pymbolic.mapper import CSECachingMapperMixin
+from pymbolic.primitives import Slice, Variable, Subscript
 from loopy.tools import intern_frozenset_of_ids
-from loopy.symbolic import IdentityMapper, WalkMapper
+from loopy.symbolic import (
+        IdentityMapper, WalkMapper, SubArrayRef)
 from loopy.kernel.data import (
         InstructionBase,
         MultiAssignmentBase, Assignment,
-        SubstitutionRule)
+        SubstitutionRule, AddressSpace)
+from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction,
+        CallInstruction)
 from loopy.diagnostic import LoopyError, warn_with_kernel
 import islpy as isl
 from islpy import dim_type
@@ -504,9 +507,11 @@ def parse_insn(groups, insn_options):
             assignee_names.append(inner_lhs_i.name)
         elif isinstance(inner_lhs_i, (Subscript, LinearSubscript)):
             assignee_names.append(inner_lhs_i.aggregate.name)
+        elif isinstance(inner_lhs_i, SubArrayRef):
+            assignee_names.append(inner_lhs_i.subscript.aggregate.name)
         else:
             raise LoopyError("left hand side of assignment '%s' must "
-                    "be variable or subscript" % (lhs_i,))
+                    "be variable, subscript or a SubArrayRef" % (lhs_i,))
 
         new_lhs.append(lhs_i)
 
@@ -1139,7 +1144,7 @@ class ArgumentGuesser:
     def make_new_arg(self, arg_name):
         arg_name = arg_name.strip()
 
-        from loopy.kernel.data import ValueArg, ArrayArg, AddressSpace
+        from loopy.kernel.data import ValueArg, ArrayArg
         import loopy as lp
 
         if arg_name in self.all_params:
@@ -1664,7 +1669,7 @@ def _is_wildcard(s):
 
 
 def _resolve_dependencies(what, knl, insn, deps):
-    from loopy import find_instructions
+    from loopy.transform.instruction import find_instructions_in_single_kernel
     from loopy.match import MatchExpressionBase
 
     new_deps = []
@@ -1673,7 +1678,7 @@ def _resolve_dependencies(what, knl, insn, deps):
         found_any = False
 
         if isinstance(dep, MatchExpressionBase):
-            for new_dep in find_instructions(knl, dep):
+            for new_dep in find_instructions_in_single_kernel(knl, dep):
                 if new_dep.id != insn.id:
                     new_deps.append(new_dep.id)
                     found_any = True
@@ -1954,6 +1959,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     target = kwargs.pop("target", None)
     seq_dependencies = kwargs.pop("seq_dependencies", False)
     fixed_parameters = kwargs.pop("fixed_parameters", {})
+    make_program = kwargs.pop("make_program", True)
 
     if defines:
         from warnings import warn
@@ -2165,15 +2171,24 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     check_for_duplicate_names(knl)
     check_written_variable_names(knl)
 
+    from loopy.kernel.tools import infer_arg_is_output_only
+    knl = infer_arg_is_output_only(knl)
+
     from loopy.preprocess import prepare_for_caching
     knl = prepare_for_caching(knl)
 
     creation_plog.done()
 
-    from loopy.kernel.tools import infer_arg_is_output_only
-    knl = infer_arg_is_output_only(knl)
+    if make_program:
+        from loopy.program import make_program_from_kernel
+        return make_program_from_kernel(knl)
+    else:
+        return knl
+
 
-    return knl
+def make_kernel_function(*args, **kwargs):
+    kwargs['make_program'] = False
+    return make_kernel(*args, **kwargs)
 
 # }}}
 
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 3e776bd0..9ba28896 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -337,6 +337,7 @@ class KernelArgument(ImmutableRecord):
             dtype = None
 
         kwargs["dtype"] = dtype
+        kwargs["is_output_only"] = kwargs.pop("is_output_only", None)
 
         ImmutableRecord.__init__(self, **kwargs)
 
@@ -362,7 +363,7 @@ class ArrayArg(ArrayBase, KernelArgument):
     def __init__(self, *args, **kwargs):
         if "address_space" not in kwargs:
             raise TypeError("'address_space' must be specified")
-        kwargs["is_output_only"] = kwargs.pop("is_output_only", False)
+        kwargs["is_output_only"] = kwargs.pop("is_output_only", None)
 
         super(ArrayArg, self).__init__(*args, **kwargs)
 
@@ -402,6 +403,9 @@ class ConstantArg(ArrayBase, KernelArgument):
     min_target_axes = 0
     max_target_axes = 1
 
+    # Constant Arg cannot be an output
+    is_output_only = False
+
     def get_arg_decl(self, ast_builder, name_suffix, shape, dtype, is_written):
         return ast_builder.get_constant_arg_decl(self.name + name_suffix, shape,
                 dtype, is_written)
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index e9c7bde9..0f548bba 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -487,7 +487,7 @@ class InstructionBase(ImmutableRecord):
 
 def _get_assignee_var_name(expr):
     from pymbolic.primitives import Variable, Subscript, Lookup
-    from loopy.symbolic import LinearSubscript
+    from loopy.symbolic import LinearSubscript, SubArrayRef
 
     if isinstance(expr, Lookup):
         expr = expr.aggregate
@@ -506,13 +506,20 @@ def _get_assignee_var_name(expr):
         assert isinstance(agg, Variable)
 
         return agg.name
+
+    elif isinstance(expr, SubArrayRef):
+        agg = expr.subscript.aggregate
+        assert isinstance(agg, Variable)
+
+        return agg.name
+
     else:
         raise RuntimeError("invalid lvalue '%s'" % expr)
 
 
 def _get_assignee_subscript_deps(expr):
     from pymbolic.primitives import Variable, Subscript, Lookup
-    from loopy.symbolic import LinearSubscript, get_dependencies
+    from loopy.symbolic import LinearSubscript, get_dependencies, SubArrayRef
 
     if isinstance(expr, Lookup):
         expr = expr.aggregate
@@ -523,6 +530,8 @@ def _get_assignee_subscript_deps(expr):
         return get_dependencies(expr.index)
     elif isinstance(expr, LinearSubscript):
         return get_dependencies(expr.index)
+    elif isinstance(expr, SubArrayRef):
+        return get_dependencies(expr.get_begin_subscript().index)
     else:
         raise RuntimeError("invalid lvalue '%s'" % expr)
 
@@ -942,12 +951,12 @@ class Assignment(MultiAssignmentBase):
     def assignee_subscript_deps(self):
         return (_get_assignee_subscript_deps(self.assignee),)
 
-    def with_transformed_expressions(self, f, *args):
+    def with_transformed_expressions(self, f, *args, **kwargs):
         return self.copy(
-                assignee=f(self.assignee, *args),
-                expression=f(self.expression, *args),
+                assignee=f(self.assignee, *args, **kwargs),
+                expression=f(self.expression, *args, **kwargs),
                 predicates=frozenset(
-                    f(pred, *args) for pred in self.predicates))
+                    f(pred, *args, **kwargs) for pred in self.predicates))
 
     # }}}
 
@@ -1052,9 +1061,10 @@ class CallInstruction(MultiAssignmentBase):
                 forced_iname_deps=forced_iname_deps,
                 forced_iname_deps_is_final=forced_iname_deps_is_final)
 
-        from pymbolic.primitives import Call
+        from pymbolic.primitives import Call, CallWithKwargs
         from loopy.symbolic import Reduction
-        if not isinstance(expression, (Call, Reduction)) and expression is not None:
+        if not isinstance(expression, (Call, CallWithKwargs, Reduction)) and (
+                expression is not None):
             raise LoopyError("'expression' argument to CallInstruction "
                     "must be a function call")
 
@@ -1094,12 +1104,12 @@ class CallInstruction(MultiAssignmentBase):
                 _get_assignee_subscript_deps(a)
                 for a in self.assignees)
 
-    def with_transformed_expressions(self, f, *args):
+    def with_transformed_expressions(self, f, *args, **kwargs):
         return self.copy(
-                assignees=f(self.assignees, *args),
-                expression=f(self.expression, *args),
+                assignees=f(self.assignees, *args, **kwargs),
+                expression=f(self.expression, *args, **kwargs),
                 predicates=frozenset(
-                    f(pred, *args) for pred in self.predicates))
+                    f(pred, *args, **kwargs) for pred in self.predicates))
 
     # }}}
 
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 95c3c336..3c0c2443 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -36,6 +36,7 @@ from islpy import dim_type
 from loopy.diagnostic import LoopyError, warn_with_kernel
 from pytools import memoize_on_first_arg
 from loopy.tools import natsorted
+from loopy.program import Program
 
 import logging
 logger = logging.getLogger(__name__)
@@ -43,19 +44,25 @@ logger = logging.getLogger(__name__)
 
 # {{{ add and infer argument dtypes
 
-def add_dtypes(knl, dtype_dict):
+def add_dtypes(program, dtype_dict):
     """Specify remaining unspecified argument/temporary variable types.
 
     :arg dtype_dict: a mapping from variable names to :class:`numpy.dtype`
         instances
     """
-    dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes(knl, dtype_dict)
+    root_kernel = program.root_kernel
+    dtype_dict_remainder, new_args, new_temp_vars = _add_dtypes(
+            root_kernel, dtype_dict)
 
     if dtype_dict_remainder:
         raise RuntimeError("unused argument dtypes: %s"
                 % ", ".join(dtype_dict_remainder))
+    root_kernel
 
-    return knl.copy(args=new_args, temporary_variables=new_temp_vars)
+    root_kernel_with_added_dtypes = (
+            root_kernel.copy(args=new_args, temporary_variables=new_temp_vars))
+
+    return program.with_root_kernel(root_kernel_with_added_dtypes)
 
 
 def _add_dtypes_overdetermined(knl, dtype_dict):
@@ -107,7 +114,8 @@ def get_arguments_with_incomplete_dtype(knl):
             if arg.dtype is None]
 
 
-def add_and_infer_dtypes(knl, dtype_dict, expect_completion=False):
+def add_and_infer_dtypes(prog, dtype_dict, expect_completion=False):
+    assert isinstance(prog, Program)
     processed_dtype_dict = {}
 
     for k, v in six.iteritems(dtype_dict):
@@ -116,10 +124,10 @@ def add_and_infer_dtypes(knl, dtype_dict, expect_completion=False):
             if subkey:
                 processed_dtype_dict[subkey] = v
 
-    knl = add_dtypes(knl, processed_dtype_dict)
+    prog = add_dtypes(prog, processed_dtype_dict)
 
     from loopy.type_inference import infer_unknown_types
-    return infer_unknown_types(knl, expect_completion=expect_completion)
+    return infer_unknown_types(prog, expect_completion=expect_completion)
 
 
 def _add_and_infer_dtypes_overdetermined(knl, dtype_dict):
@@ -747,7 +755,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn):
 # }}}
 
 
-def assign_automatic_axes(kernel, axis=0, local_size=None):
+def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=None):
     logger.debug("%s: assign automatic axes" % kernel.name)
     # TODO: do the tag removal rigorously, might be easier after switching
     # to set() from tuple()
@@ -761,7 +769,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
 
     if local_size is None:
         _, local_size = kernel.get_grid_size_upper_bounds_as_exprs(
-                ignore_auto=True)
+                program_callables_info, ignore_auto=True)
 
     # {{{ axis assignment helper function
 
@@ -789,6 +797,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
 
             return assign_automatic_axes(
                     kernel.copy(iname_to_tags=new_iname_to_tags),
+                    program_callables_info,
                     axis=recursion_axis)
 
         if axis is None:
@@ -828,7 +837,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
         else:
             new_tag = LocalIndexTag(axis)
             if desired_length > local_size[axis]:
-                from loopy import split_iname, untag_inames
+                from loopy import untag_inames
+                from loopy.transform.iname import split_iname
 
                 # Don't be tempted to switch the outer tag to unroll--this may
                 # generate tons of code on some examples.
@@ -839,6 +849,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
                             iname, inner_length=local_size[axis],
                             outer_tag=None, inner_tag=new_tag,
                             do_tagged_check=False),
+                        program_callables_info=program_callables_info,
                         axis=recursion_axis, local_size=local_size)
 
         if not kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase):
@@ -860,7 +871,7 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
             del new_iname_to_tags[iname]
 
         return assign_automatic_axes(kernel.copy(iname_to_tags=new_iname_to_tags),
-                axis=recursion_axis, local_size=local_size)
+                program_callables_info, axis=recursion_axis, local_size=local_size)
 
     # }}}
 
@@ -928,7 +939,8 @@ def assign_automatic_axes(kernel, axis=0, local_size=None):
     if axis >= len(local_size):
         return kernel
     else:
-        return assign_automatic_axes(kernel, axis=axis+1,
+        return assign_automatic_axes(kernel,
+                program_callables_info=program_callables_info, axis=axis+1,
                 local_size=local_size)
 
 # }}}
@@ -1866,6 +1878,7 @@ def infer_arg_is_output_only(kernel):
     """
     from loopy.kernel.data import ArrayArg, ValueArg, ConstantArg, ImageArg
     new_args = []
+
     for arg in kernel.args:
         if isinstance(arg, (ArrayArg, ImageArg, ValueArg)):
             if arg.is_output_only is not None:
diff --git a/loopy/library/function.py b/loopy/library/function.py
index 9d557ac9..8338875d 100644
--- a/loopy/library/function.py
+++ b/loopy/library/function.py
@@ -22,38 +22,48 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+from loopy.kernel.function_interface import ScalarCallable
 
-def default_function_mangler(kernel, name, arg_dtypes):
-    from loopy.library.reduction import reduction_function_mangler
 
-    manglers = [reduction_function_mangler, tuple_function_mangler]
-    for mangler in manglers:
-        result = mangler(kernel, name, arg_dtypes)
-        if result is not None:
-            return result
+class MakeTupleCallable(ScalarCallable):
+    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+        new_arg_id_to_dtype = arg_id_to_dtype.copy()
+        for i in range(len(arg_id_to_dtype)):
+            if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None:
+                new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i]
 
-    return None
+        return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
+            name_in_target="loopy_make_tuple"), program_callables_info)
 
+    def with_descrs(self, arg_id_to_descr, program_callables_info):
+        from loopy.kernel.function_interface import ValueArgDescriptor
+        new_arg_id_to_descr = dict(((id, ValueArgDescriptor()),
+            (-id-1, ValueArgDescriptor())) for id in arg_id_to_descr.keys())
 
-def single_arg_function_mangler(kernel, name, arg_dtypes):
-    if len(arg_dtypes) == 1:
-        dtype, = arg_dtypes
+        return (
+                self.copy(arg_id_to_descr=new_arg_id_to_descr),
+                program_callables_info)
 
-        from loopy.kernel.data import CallMangleInfo
-        return CallMangleInfo(name, (dtype,), (dtype,))
 
-    return None
+class IndexOfCallable(ScalarCallable):
+    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+        new_arg_id_to_dtype = dict((i, dtype) for i, dtype in
+                arg_id_to_dtype.items() if dtype is not None)
+        new_arg_id_to_dtype[-1] = kernel.index_dtype
 
+        return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype),
+                program_callables_info)
 
-def tuple_function_mangler(kernel, name, arg_dtypes):
-    if name == "make_tuple":
-        from loopy.kernel.data import CallMangleInfo
-        return CallMangleInfo(
-                target_name="loopy_make_tuple",
-                result_dtypes=arg_dtypes,
-                arg_dtypes=arg_dtypes)
 
-    return None
+def loopy_specific_callable_scopers(target, identifier):
+    if identifier == "make_tuple":
+        return MakeTupleCallable(name="make_tuple")
+
+    if identifier in ["indexof", "indexof_vec"]:
+        return IndexOfCallable(name=identifier)
+
+    from loopy.library.reduction import reduction_scoper
+    return reduction_scoper(target, identifier)
 
 
 # vim: foldmethod=marker
diff --git a/loopy/library/random123.py b/loopy/library/random123.py
index b8633114..59ca72df 100644
--- a/loopy/library/random123.py
+++ b/loopy/library/random123.py
@@ -27,6 +27,7 @@ THE SOFTWARE.
 
 from pytools import ImmutableRecord
 from mako.template import Template
+from loopy.kernel.function_interface import ScalarCallable
 import numpy as np
 
 
@@ -163,60 +164,77 @@ double${ width } ${ name }_f64(
 # }}}
 
 
-def random123_preamble_generator(preamble_info):
-    for f in preamble_info.seen_functions:
-        try:
-            rng_variant = FUNC_NAMES_TO_RNG[f.name]
-        except KeyError:
-            continue
+class Random123Callable(ScalarCallable):
+    """
+    Records information about the random123 functions.
+    """
+
+    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+
+        if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
+                arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
+            # the types provided aren't mature enough to specialize the
+            # callable
+            return (self.copy(),
+                    program_callables_info)
+
+        name = self.name
+        target = kernel.target
+
+        rng_variant = FUNC_NAMES_TO_RNG[name]
+
+        from loopy.types import NumpyType
+        base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits]
+        ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width)
+        key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width)
+
+        fn = rng_variant.full_name
+        if name == fn:
+            new_arg_id_to_dtype = {-1: ctr_dtype, -2: ctr_dtype, 0: ctr_dtype, 1:
+                    key_dtype}
+            return (
+                    self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
+                        name_in_target=fn+"_gen"),
+                    program_callables_info)
+
+        elif name == fn + "_f32":
+            new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32),
+                rng_variant.width),
+                    -2: ctr_dtype, 0: ctr_dtype, 1:
+                    key_dtype}
+            return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
+                    name_in_target=name), program_callables_info
+
+        elif name == fn + "_f64":
+            new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64),
+                rng_variant.width),
+                    -2: ctr_dtype, 0: ctr_dtype, 1:
+                    key_dtype}
+            return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
+                    name_in_target=name), program_callables_info
+
+        return (self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                program_callables_info)
+
+    def generate_preambles(self, target):
+        rng_variant = FUNC_NAMES_TO_RNG[self.name]
 
         from loopy.target.pyopencl import PyOpenCLTarget
         yield ("90-random123-"+rng_variant.full_name,
                 PREAMBLE_TEMPLATE.render(
                     is_pyopencl_target=isinstance(
-                        preamble_info.kernel.target,
+                        target,
                         PyOpenCLTarget),
                     rng_variant=rng_variant,
                     ))
 
+        return
 
-def random123_function_mangler(kernel, name, arg_dtypes):
-    try:
-        rng_variant = FUNC_NAMES_TO_RNG[name]
-    except KeyError:
-        return None
-
-    from loopy.types import NumpyType
-    target = kernel.target
-    base_dtype = {32: np.uint32, 64: np.uint64}[rng_variant.bits]
-    ctr_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.width)
-    key_dtype = target.vector_dtype(NumpyType(base_dtype), rng_variant.key_width)
-
-    from loopy.kernel.data import CallMangleInfo
-    fn = rng_variant.full_name
-    if name == fn:
-        return CallMangleInfo(
-                target_name=fn+"_gen",
-                result_dtypes=(ctr_dtype, ctr_dtype),
-                arg_dtypes=(ctr_dtype, key_dtype))
-
-    elif name == fn + "_f32":
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=(
-                    target.vector_dtype(NumpyType(np.float32), rng_variant.width),
-                    ctr_dtype),
-                arg_dtypes=(ctr_dtype, key_dtype))
-
-    elif name == fn + "_f64":
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=(
-                    target.vector_dtype(NumpyType(np.float64), rng_variant.width),
-                    ctr_dtype),
-                arg_dtypes=(ctr_dtype, key_dtype))
-
-    else:
-        return None
+
+def random123_function_scoper(target, identifier):
+    if identifier in FUNC_NAMES_TO_RNG:
+        return Random123Callable(name=identifier)
+
+    return None
 
 # vim: foldmethod=marker
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index 8ed5cbe5..6ec8e4b2 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -1,4 +1,4 @@
-from __future__ import division
+from __future__ import division, absolute_import
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -24,11 +24,14 @@ THE SOFTWARE.
 
 
 from pymbolic import var
+from loopy.symbolic import ResolvedFunction
+from loopy.kernel.function_interface import ScalarCallable
 import numpy as np
 
 from loopy.symbolic import FunctionIdentifier
 from loopy.diagnostic import LoopyError
 from loopy.types import NumpyType
+from loopy.kernel import LoopKernel
 
 
 class ReductionOperation(object):
@@ -81,6 +84,9 @@ class ReductionOperation(object):
         raise LoopyError("unable to parse reduction type: '%s'"
                 % op_type)
 
+    def get_scalar_callables(self):
+        return frozenset()
+
 
 class ScalarReductionOperation(ReductionOperation):
     def __init__(self, forced_result_type=None):
@@ -180,7 +186,10 @@ class MaxReductionOperation(ScalarReductionOperation):
         return get_ge_neutral(dtype)
 
     def __call__(self, dtype, operand1, operand2):
-        return var("max")(operand1, operand2)
+        return ResolvedFunction("max")(operand1, operand2)
+
+    def get_scalar_callables(self):
+        return frozenset(["max"])
 
 
 class MinReductionOperation(ScalarReductionOperation):
@@ -188,7 +197,10 @@ class MinReductionOperation(ScalarReductionOperation):
         return get_le_neutral(dtype)
 
     def __call__(self, dtype, operand1, operand2):
-        return var("min")(operand1, operand2)
+        return ResolvedFunction("min")(operand1, operand2)
+
+    def get_scalar_callables(self):
+        return frozenset(["min"])
 
 
 # {{{ base class for symbolic reduction ops
@@ -212,6 +224,11 @@ class ReductionOpFunction(FunctionIdentifier):
 
         return type(self)(reduction_op)
 
+    hash_fields = (
+            "reduction_op",)
+
+    update_persistent_hash = LoopKernel.update_persistent_hash
+
 
 # }}}
 
@@ -237,7 +254,7 @@ class _SegmentedScalarReductionOperation(ReductionOperation):
 
     def neutral_element(self, scalar_dtype, segment_flag_dtype):
         scalar_neutral_element = self.inner_reduction.neutral_element(scalar_dtype)
-        return var("make_tuple")(scalar_neutral_element,
+        return ResolvedFunction("make_tuple")(scalar_neutral_element,
                 segment_flag_dtype.numpy_dtype.type(0))
 
     def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype):
@@ -254,7 +271,10 @@ class _SegmentedScalarReductionOperation(ReductionOperation):
         return type(self) == type(other)
 
     def __call__(self, dtypes, operand1, operand2):
-        return SegmentedOp(self)(*(operand1 + operand2))
+        return ResolvedFunction(SegmentedOp(self))(*(operand1 + operand2))
+
+    def get_scalar_callables(self):
+        return frozenset(["make_tuple", SegmentedOp(self)])
 
 
 class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation):
@@ -262,34 +282,24 @@ class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation):
     which = "sum"
     op = "((%s) + (%s))"
 
+    hash_fields = (
+            "which",
+            "op",)
+
+    update_persistent_hash = LoopKernel.update_persistent_hash
+
 
 class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation):
     base_reduction_class = ProductReductionOperation
     op = "((%s) * (%s))"
     which = "product"
 
+    hash_fields = (
+            "which",
+            "op",
+            "base_reduction_class",)
 
-def get_segmented_function_preamble(kernel, func_id, arg_dtypes):
-    op = func_id.reduction_op
-    scalar_dtype = arg_dtypes[0]
-    segment_flag_dtype = arg_dtypes[1]
-    prefix = op.prefix(scalar_dtype, segment_flag_dtype)
-
-    return (prefix, """
-    inline %(scalar_t)s %(prefix)s_op(
-        %(scalar_t)s op1, %(segment_flag_t)s segment_flag1,
-        %(scalar_t)s op2, %(segment_flag_t)s segment_flag2,
-        %(segment_flag_t)s *segment_flag_out)
-    {
-        *segment_flag_out = segment_flag1 | segment_flag2;
-        return segment_flag2 ? op2 : %(combined)s;
-    }
-    """ % dict(
-            scalar_t=kernel.target.dtype_to_typename(scalar_dtype),
-            prefix=prefix,
-            segment_flag_t=kernel.target.dtype_to_typename(segment_flag_dtype),
-            combined=op.op % ("op1", "op2"),
-            ))
+    update_persistent_hash = LoopKernel.update_persistent_hash
 
 # }}}
 
@@ -313,7 +323,7 @@ class _ArgExtremumReductionOperation(ReductionOperation):
         scalar_neutral_func = (
                 get_ge_neutral if self.neutral_sign < 0 else get_le_neutral)
         scalar_neutral_element = scalar_neutral_func(scalar_dtype)
-        return var("make_tuple")(scalar_neutral_element,
+        return ResolvedFunction("make_tuple")(scalar_neutral_element,
                 index_dtype.numpy_dtype.type(-1))
 
     def __str__(self):
@@ -330,7 +340,10 @@ class _ArgExtremumReductionOperation(ReductionOperation):
         return 2
 
     def __call__(self, dtypes, operand1, operand2):
-        return ArgExtOp(self)(*(operand1 + operand2))
+        return ResolvedFunction(ArgExtOp(self))(*(operand1 + operand2))
+
+    def get_scalar_callables(self):
+        return frozenset([self.which, "make_tuple", ArgExtOp(self)])
 
 
 class ArgMaxReductionOperation(_ArgExtremumReductionOperation):
@@ -338,43 +351,23 @@ class ArgMaxReductionOperation(_ArgExtremumReductionOperation):
     update_comparison = ">="
     neutral_sign = -1
 
+    hash_fields = ("which",
+            "update_comparison",
+            "neutral_sign",)
+
+    update_persistent_hash = LoopKernel.update_persistent_hash
+
 
 class ArgMinReductionOperation(_ArgExtremumReductionOperation):
     which = "min"
     update_comparison = "<="
     neutral_sign = +1
 
+    hash_fields = ("which",
+            "update_comparison",
+            "neutral_sign",)
 
-def get_argext_preamble(kernel, func_id, arg_dtypes):
-    op = func_id.reduction_op
-    scalar_dtype = arg_dtypes[0]
-    index_dtype = arg_dtypes[1]
-
-    prefix = op.prefix(scalar_dtype, index_dtype)
-
-    return (prefix, """
-    inline %(scalar_t)s %(prefix)s_op(
-        %(scalar_t)s op1, %(index_t)s index1,
-        %(scalar_t)s op2, %(index_t)s index2,
-        %(index_t)s *index_out)
-    {
-        if (op2 %(comp)s op1)
-        {
-            *index_out = index2;
-            return op2;
-        }
-        else
-        {
-            *index_out = index1;
-            return op1;
-        }
-    }
-    """ % dict(
-            scalar_t=kernel.target.dtype_to_typename(scalar_dtype),
-            prefix=prefix,
-            index_t=kernel.target.dtype_to_typename(index_dtype),
-            comp=op.update_comparison,
-            ))
+    update_persistent_hash = LoopKernel.update_persistent_hash
 
 # }}}
 
@@ -429,70 +422,93 @@ def parse_reduction_op(name):
 # }}}
 
 
-def reduction_function_mangler(kernel, func_id, arg_dtypes):
-    if isinstance(func_id, ArgExtOp):
-        from loopy.target.opencl import CTarget
-        if not isinstance(kernel.target, CTarget):
-            raise LoopyError("%s: only C-like targets supported for now" % func_id)
-
-        op = func_id.reduction_op
-        scalar_dtype = arg_dtypes[0]
-        index_dtype = arg_dtypes[1]
-
-        from loopy.kernel.data import CallMangleInfo
-        return CallMangleInfo(
-                target_name="%s_op" % op.prefix(
-                    scalar_dtype, index_dtype),
-                result_dtypes=op.result_dtypes(
-                    kernel, scalar_dtype, index_dtype),
-                arg_dtypes=(
-                    scalar_dtype,
-                    index_dtype,
-                    scalar_dtype,
-                    index_dtype),
-                )
-
-    elif isinstance(func_id, SegmentedOp):
-        from loopy.target.opencl import CTarget
-        if not isinstance(kernel.target, CTarget):
-            raise LoopyError("%s: only C-like targets supported for now" % func_id)
-
-        op = func_id.reduction_op
-        scalar_dtype = arg_dtypes[0]
-        segment_flag_dtype = arg_dtypes[1]
-
-        from loopy.kernel.data import CallMangleInfo
-        return CallMangleInfo(
-                target_name="%s_op" % op.prefix(
-                    scalar_dtype, segment_flag_dtype),
-                result_dtypes=op.result_dtypes(
-                    kernel, scalar_dtype, segment_flag_dtype),
-                arg_dtypes=(
-                    scalar_dtype,
-                    segment_flag_dtype,
-                    scalar_dtype,
-                    segment_flag_dtype),
-                )
+# {{{ reduction specific callables
+
+class ReductionCallable(ScalarCallable):
+    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+        scalar_dtype = arg_id_to_dtype[0]
+        index_dtype = arg_id_to_dtype[1]
+        result_dtypes = self.name.reduction_op.result_dtypes(kernel, scalar_dtype,
+                index_dtype)
+        new_arg_id_to_dtype = arg_id_to_dtype.copy()
+        new_arg_id_to_dtype[-1] = result_dtypes[0]
+        new_arg_id_to_dtype[-2] = result_dtypes[1]
+        name_in_target = self.name.reduction_op.prefix(scalar_dtype,
+                index_dtype) + "_op"
+
+        return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
+                name_in_target=name_in_target), program_callables_info
+
+    def with_descr(self, arg_id_to_descr, program_callables_info):
+        """Mark result -1 as a value arg; return (copy, program_callables_info)."""
+        from loopy.kernel.function_interface import ValueArgDescriptor
+        new_arg_id_to_descr = arg_id_to_descr.copy()
+        new_arg_id_to_descr[-1] = ValueArgDescriptor()
+        return (self.copy(arg_id_to_descr=new_arg_id_to_descr),
+                program_callables_info)
+
+    def generate_preambles(self, target):
+        if isinstance(self.name, ArgExtOp):
+            op = self.name.reduction_op
+            scalar_dtype = self.arg_id_to_dtype[-1]
+            index_dtype = self.arg_id_to_dtype[-2]
+
+            prefix = op.prefix(scalar_dtype, index_dtype)
+
+            yield (prefix, """
+            inline %(scalar_t)s %(prefix)s_op(
+                %(scalar_t)s op1, %(index_t)s index1,
+                %(scalar_t)s op2, %(index_t)s index2,
+                %(index_t)s *index_out)
+            {
+                if (op2 %(comp)s op1)
+                {
+                    *index_out = index2;
+                    return op2;
+                }
+                else
+                {
+                    *index_out = index1;
+                    return op1;
+                }
+            }
+            """ % dict(
+                    scalar_t=target.dtype_to_typename(scalar_dtype),
+                    prefix=prefix,
+                    index_t=target.dtype_to_typename(index_dtype),
+                    comp=op.update_comparison,
+                    ))
+        elif isinstance(self.name, SegmentedOp):
+            op = self.name.reduction_op
+            scalar_dtype = self.arg_id_to_dtype[-1]
+            segment_flag_dtype = self.arg_id_to_dtype[-2]
+            prefix = op.prefix(scalar_dtype, segment_flag_dtype)
+
+            yield (prefix, """
+            inline %(scalar_t)s %(prefix)s_op(
+                %(scalar_t)s op1, %(segment_flag_t)s segment_flag1,
+                %(scalar_t)s op2, %(segment_flag_t)s segment_flag2,
+                %(segment_flag_t)s *segment_flag_out)
+            {
+                *segment_flag_out = segment_flag1 | segment_flag2;
+                return segment_flag2 ? op2 : %(combined)s;
+            }
+            """ % dict(
+                    scalar_t=target.dtype_to_typename(scalar_dtype),
+                    prefix=prefix,
+                    segment_flag_t=target.dtype_to_typename(segment_flag_dtype),
+                    combined=op.op % ("op1", "op2"),
+                    ))
+
+        return
+
+
+def reduction_scoper(target, identifier):
+    if isinstance(identifier, (ArgExtOp, SegmentedOp)):
+        return ReductionCallable(name=identifier)
 
     return None
 
-
-def reduction_preamble_generator(preamble_info):
-    from loopy.target.opencl import OpenCLTarget
-
-    for func in preamble_info.seen_functions:
-        if isinstance(func.name, ArgExtOp):
-            if not isinstance(preamble_info.kernel.target, OpenCLTarget):
-                raise LoopyError("only OpenCL supported for now")
-
-            yield get_argext_preamble(preamble_info.kernel, func.name,
-                    func.arg_dtypes)
-
-        elif isinstance(func.name, SegmentedOp):
-            if not isinstance(preamble_info.kernel.target, OpenCLTarget):
-                raise LoopyError("only OpenCL supported for now")
-
-            yield get_segmented_function_preamble(preamble_info.kernel, func.name,
-                    func.arg_dtypes)
+# }}}
 
 # vim: fdm=marker
diff --git a/loopy/loop.py b/loopy/loop.py
index 45924638..66d41398 100644
--- a/loopy/loop.py
+++ b/loopy/loop.py
@@ -25,6 +25,7 @@ THE SOFTWARE.
 
 import islpy as isl
 import six
+from loopy.program import iterate_over_kernels_if_given_program
 
 
 def potential_loop_nest_map(kernel):
@@ -55,6 +56,7 @@ def potential_loop_nest_map(kernel):
     return result
 
 
+@iterate_over_kernels_if_given_program
 def fuse_loop_domains(kernel):
     from loopy.kernel.tools import is_domain_dependent_on_inames
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index fc950c78..3657967a 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -27,7 +27,6 @@ import six
 from loopy.diagnostic import (
         LoopyError, WriteRaceConditionWarning, warn_with_kernel,
         LoopyAdvisory)
-
 import islpy as isl
 
 from pytools.persistent_dict import WriteOncePersistentDict
@@ -37,13 +36,19 @@ from loopy.version import DATA_MODEL_VERSION
 from loopy.kernel.data import make_assignment, filter_iname_tags_by_type
 # for the benefit of loopy.statistics, for now
 from loopy.type_inference import infer_unknown_types
+from loopy.symbolic import RuleAwareIdentityMapper
 
+from loopy.kernel.instruction import (MultiAssignmentBase, CInstruction,
+        CallInstruction,  _DataObliviousInstruction)
+from loopy.program import Program, iterate_over_kernels_if_given_program
+from loopy.kernel.function_interface import CallableKernel, ScalarCallable
 import logging
 logger = logging.getLogger(__name__)
 
 
 # {{{ prepare for caching
 
+@iterate_over_kernels_if_given_program
 def prepare_for_caching(kernel):
     import loopy as lp
     new_args = []
@@ -885,9 +890,9 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain):
 # }}}
 
 
-def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
-                      automagic_scans_ok=False, force_scan=False,
-                      force_outer_iname_for_scan=None):
+def realize_reduction_for_single_kernel(kernel, program_callables_info,
+        insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False,
+        force_scan=False, force_outer_iname_for_scan=None):
     """Rewrites reductions into their imperative form. With *insn_id_filter*
     specified, operate only on the instruction with an instruction id matching
     *insn_id_filter*.
@@ -1007,7 +1012,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
     # {{{ sequential
 
-    def map_reduction_seq(expr, rec, nresults, arg_dtypes,
+    def map_reduction_seq(expr, rec, program_callables_info, nresults, arg_dtypes,
             reduction_dtypes):
         outer_insn_inames = temp_kernel.insn_inames(insn)
 
@@ -1125,7 +1130,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                 v[iname].lt_set(v[0] + ubound)).get_basic_sets()
         return bs
 
-    def map_reduction_local(expr, rec, nresults, arg_dtypes,
+    def map_reduction_local(expr, rec, program_callables_info, nresults, arg_dtypes,
             reduction_dtypes):
         red_iname, = expr.inames
 
@@ -1365,7 +1370,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
     # {{{ sequential scan
 
-    def map_scan_seq(expr, rec, nresults, arg_dtypes,
+    def map_scan_seq(expr, rec, program_callables_info, nresults, arg_dtypes,
             reduction_dtypes, sweep_iname, scan_iname, sweep_min_value,
             scan_min_value, stride):
         outer_insn_inames = temp_kernel.insn_inames(insn)
@@ -1454,17 +1459,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
     # {{{ local-parallel scan
 
-    def map_scan_local(expr, rec, nresults, arg_dtypes,
-            reduction_dtypes, sweep_iname, scan_iname,
-            sweep_min_value, scan_min_value, stride):
+    def map_scan_local(expr, rec, program_callables_info, nresults, arg_dtypes,
+            reduction_dtypes, sweep_iname, scan_iname, sweep_min_value,
+            scan_min_value, stride):
 
         scan_size = _get_int_iname_size(sweep_iname)
 
         assert scan_size > 0
 
         if scan_size == 1:
-            return map_reduction_seq(
-                    expr, rec, nresults, arg_dtypes, reduction_dtypes)
+            return map_reduction_seq(expr, rec, program_callables_info,
+                    nresults, arg_dtypes, reduction_dtypes)
 
         outer_insn_inames = temp_kernel.insn_inames(insn)
 
@@ -1663,15 +1668,15 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
     # {{{ seq/par dispatch
 
-    def map_reduction(expr, rec, nresults=1):
+    def map_reduction(expr, rec, program_callables_info, nresults=1):
         # Only expand one level of reduction at a time, going from outermost to
         # innermost. Otherwise we get the (iname + insn) dependencies wrong.
 
         from loopy.type_inference import (
                 infer_arg_and_reduction_dtypes_for_reduction_expression)
-        arg_dtypes, reduction_dtypes = (
+        arg_dtypes, reduction_dtypes, program_callables_info = (
                 infer_arg_and_reduction_dtypes_for_reduction_expression(
-                        temp_kernel, expr, unknown_types_ok))
+                    temp_kernel, expr, program_callables_info, unknown_types_ok))
 
         outer_insn_inames = temp_kernel.insn_inames(insn)
         bad_inames = frozenset(expr.inames) & outer_insn_inames
@@ -1780,15 +1785,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
                             for tag in temp_kernel.iname_tags(sweep_iname))))
                 elif parallel:
                     return map_scan_local(
-                            expr, rec, nresults, arg_dtypes, reduction_dtypes,
+                            expr, rec, program_callables_info, nresults,
+                            arg_dtypes, reduction_dtypes,
                             sweep_iname, scan_param.scan_iname,
                             scan_param.sweep_lower_bound,
                             scan_param.scan_lower_bound,
                             scan_param.stride)
                 elif sequential:
                     return map_scan_seq(
-                            expr, rec, nresults, arg_dtypes, reduction_dtypes,
-                            sweep_iname, scan_param.scan_iname,
+                            expr, rec, program_callables_info, nresults,
+                            arg_dtypes, reduction_dtypes, sweep_iname,
+                            scan_param.scan_iname,
                             scan_param.sweep_lower_bound,
                             scan_param.scan_lower_bound,
                             scan_param.stride)
@@ -1807,12 +1814,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
         if n_sequential:
             assert n_local_par == 0
-            return map_reduction_seq(
-                    expr, rec, nresults, arg_dtypes, reduction_dtypes)
+            return map_reduction_seq(expr, rec, program_callables_info,
+                    nresults, arg_dtypes, reduction_dtypes)
         else:
             assert n_local_par > 0
             return map_reduction_local(
-                    expr, rec, nresults, arg_dtypes, reduction_dtypes)
+                    expr, rec, program_callables_info, nresults, arg_dtypes,
+                    reduction_dtypes)
 
     # }}}
 
@@ -1845,9 +1853,13 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
         # Run reduction expansion.
         from loopy.symbolic import Reduction
         if isinstance(insn.expression, Reduction) and nresults > 1:
-            new_expressions = cb_mapper(insn.expression, nresults=nresults)
+            new_expressions = cb_mapper(insn.expression,
+                    program_callables_info=program_callables_info,
+                    nresults=nresults)
         else:
-            new_expressions = (cb_mapper(insn.expression),)
+            new_expressions = (
+                    cb_mapper(insn.expression,
+                        program_callables_info=program_callables_info),)
 
         if generated_insns:
             # An expansion happened, so insert the generated stuff plus
@@ -1935,6 +1947,31 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
 
     return kernel
 
+
+def realize_reduction(program, *args, **kwargs):
+    """Apply :func:`realize_reduction_for_single_kernel` to every
+    :class:`CallableKernel` in *program* and return the updated program.
+
+    *args* and *kwargs* are forwarded unchanged for each subkernel.
+    """
+    assert isinstance(program, Program)
+    callables_info = program.program_callables_info
+
+    updated_callables = {}
+    for name, clbl in callables_info.items():
+        if isinstance(clbl, CallableKernel):
+            clbl = clbl.copy(
+                    subkernel=realize_reduction_for_single_kernel(
+                        clbl.subkernel, callables_info, *args, **kwargs))
+        elif not isinstance(clbl, ScalarCallable):
+            # scalar callables are kept untouched, anything else is unsupported
+            raise NotImplementedError("Unknown type of callable %s." % (
+                type(clbl).__name__))
+        updated_callables[name] = clbl
+
+    return program.copy(program_callables_info=callables_info.copy(
+        resolved_functions=updated_callables))
+
 # }}}
 
 
@@ -2108,17 +2145,159 @@ def check_atomic_loads(kernel):
 # }}}
 
 
+# {{{ arg_descr_inference
+
+class ArgDescrInferenceMapper(RuleAwareIdentityMapper):
+    """
+    Rewrites calls to :class:`loopy.symbolic.ResolvedFunction` so that they
+    refer to callables specialized, via ``with_descrs``, for the descriptors
+    of the call's arguments and (for call instructions) assignees. The
+    updated callables are collected in :attr:`program_callables_info`.
+    """
+
+    def __init__(self, rule_mapping_context, caller_kernel,
+            program_callables_info):
+        super(ArgDescrInferenceMapper, self).__init__(
+                rule_mapping_context)
+        self.caller_kernel = caller_kernel
+        self.program_callables_info = program_callables_info
+
+    def map_call(self, expr, expn_state, **kwargs):
+        from pymbolic.primitives import Call, CallWithKwargs
+        from loopy.kernel.function_interface import ValueArgDescriptor
+        from loopy.symbolic import ResolvedFunction, SubArrayRef
+
+        if not isinstance(expr.function, ResolvedFunction):
+            # ignore if the call is not to a ResolvedFunction
+            return super(ArgDescrInferenceMapper, self).map_call(expr, expn_state)
+
+        if isinstance(expr, Call):
+            kw_parameters = {}
+        else:
+            assert isinstance(expr, CallWithKwargs)
+            kw_parameters = expr.kw_parameters
+
+        # descriptors for the args and kwargs of the Call
+        arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.caller_kernel))
+                if isinstance(par, SubArrayRef) else (i, ValueArgDescriptor())
+                for i, par in tuple(enumerate(expr.parameters)) +
+                tuple(kw_parameters.items()))
+
+        assignee_id_to_descr = {}
+
+        if 'assignees' in kwargs:
+            # If supplied with assignees then this is a CallInstruction
+            assignees = kwargs['assignees']
+            assert isinstance(assignees, tuple)
+            for i, par in enumerate(assignees):
+                if isinstance(par, SubArrayRef):
+                    assignee_id_to_descr[-i-1] = (
+                            par.get_array_arg_descriptor(self.caller_kernel))
+                else:
+                    assignee_id_to_descr[-i-1] = ValueArgDescriptor()
+
+        # gathering all the descriptors
+        combined_arg_id_to_descr = arg_id_to_descr.copy()
+        combined_arg_id_to_descr.update(assignee_id_to_descr)
+
+        # specializing the function according to the parameter description
+        in_knl_callable = self.program_callables_info[expr.function.name]
+        new_in_knl_callable, self.program_callables_info = (
+                in_knl_callable.with_descrs(
+                    combined_arg_id_to_descr, self.program_callables_info))
+        self.program_callables_info, new_func_id = (
+                self.program_callables_info.with_callable(
+                    expr.function.function,
+                    new_in_knl_callable))
+
+        if isinstance(expr, Call):
+            return Call(
+                    ResolvedFunction(new_func_id),
+                    tuple(self.rec(child, expn_state)
+                    for child in expr.parameters))
+        else:
+            assert isinstance(expr, CallWithKwargs)
+            return CallWithKwargs(
+                    ResolvedFunction(new_func_id),
+                    tuple(self.rec(child, expn_state)
+                        for child in expr.parameters),
+                    dict(
+                        (key, self.rec(val, expn_state))
+                        for key, val in six.iteritems(kw_parameters))
+                    )
+
+    map_call_with_kwargs = map_call
+
+    def map_kernel(self, kernel):
+        """Return a copy of *kernel* with instructions run through this mapper."""
+        new_insns = []
+
+        for insn in kernel.instructions:
+            if isinstance(insn, CallInstruction):
+                # In call instructions the assignees play an important role in
+                # determining the arg_id_to_descr
+                new_insns.append(insn.with_transformed_expressions(
+                    self, kernel, insn, assignees=insn.assignees))
+            elif isinstance(insn, MultiAssignmentBase):
+                new_insns.append(insn.with_transformed_expressions(
+                    self, kernel, insn))
+            elif isinstance(insn, (_DataObliviousInstruction, CInstruction)):
+                new_insns.append(insn)
+            else:
+                raise NotImplementedError("arg_descr_inference for %s instruction" %
+                        type(insn))
+
+        return kernel.copy(instructions=new_insns)
+
+
+def traverse_to_infer_arg_descr(kernel, program_callables_info):
+    """
+    Runs :class:`ArgDescrInferenceMapper` over *kernel* and returns a tuple of
+    the rewritten kernel and the mapper's updated *program_callables_info*.
+    See :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`.
+    """
+    # FIXME: update this docs, once the design is finalized
+    from loopy.symbolic import SubstitutionRuleMappingContext
+
+    rmc = SubstitutionRuleMappingContext(
+            kernel.substitutions,
+            kernel.get_var_name_generator())
+    mapper = ArgDescrInferenceMapper(
+            rmc,
+            kernel,
+            program_callables_info)
+
+    mapped_kernel = mapper.map_kernel(kernel)
+    new_kernel = rmc.finish_kernel(mapped_kernel)
+    return new_kernel, mapper.program_callables_info
+
+
+def infer_arg_descr(program):
+    """Return a copy of *program* after running argument-descriptor inference,
+    starting from its root kernel, with the callables in edit mode meanwhile.
+    """
+    root_callable = program.program_callables_info[program.name]
+    callables = program.program_callables_info.with_edit_callables_mode()
+
+    # infer descriptors for the calls made from the root kernel
+    inferred_root, callables = traverse_to_infer_arg_descr(
+            program.root_kernel, callables)
+    # re-register the root kernel under the program's name
+    callables, _ = callables.with_callable(
+            program.name, root_callable.copy(subkernel=inferred_root))
+
+    return program.copy(
+            program_callables_info=callables.with_exit_edit_callables_mode())
+
+# }}}
+
+
 preprocess_cache = WriteOncePersistentDict(
         "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION,
         key_builder=LoopyKeyBuilder())
 
 
-def preprocess_kernel(kernel, device=None):
-    if device is not None:
-        from warnings import warn
-        warn("passing 'device' to preprocess_kernel() is deprecated",
-                DeprecationWarning, stacklevel=2)
-
+def preprocess_single_kernel(kernel, program_callables_info, device=None):
     from loopy.kernel import KernelState
     if kernel.state >= KernelState.PREPROCESSED:
         return kernel
@@ -2161,8 +2340,6 @@ def preprocess_kernel(kernel, device=None):
     # Type inference and reduction iname uniqueness don't handle substitutions.
     # Get them out of the way.
 
-    kernel = infer_unknown_types(kernel, expect_completion=False)
-
     check_for_writes_to_predicates(kernel)
     check_reduction_iname_uniqueness(kernel)
 
@@ -2177,8 +2354,8 @@ def preprocess_kernel(kernel, device=None):
     # - realize_reduction must happen after default dependencies are added
     #   because it manipulates the depends_on field, which could prevent
     #   defaults from being applied.
-
-    kernel = realize_reduction(kernel, unknown_types_ok=False)
+    kernel = realize_reduction_for_single_kernel(kernel,
+            program_callables_info, unknown_types_ok=False)
 
     # Ordering restriction:
     # add_axes_to_temporaries_for_ilp because reduction accumulators
@@ -2222,4 +2399,81 @@ def preprocess_kernel(kernel, device=None):
 
     return kernel
 
+
+def preprocess_kernel(kernel, device=None):
+    # FIXME: error message? For now this only forwards to preprocess_program().
+    return preprocess_program(kernel, device)
+
+
+def preprocess_program(program, device=None):
+    """Return a copy of *program* with every callable kernel preprocessed."""
+    if device is not None:
+        from warnings import warn
+        warn("passing 'device' to preprocess_kernel() is deprecated",
+                DeprecationWarning, stacklevel=2)
+
+    program = infer_unknown_types(program, expect_completion=False)
+
+    # {{{ preprocess the callable kernels
+
+    # Callable editing restrictions:
+    #
+    # - cannot edit program_callables_info in :meth:`preprocess_single_kernel`
+    #   as we are iterating over it.
+    #
+    # Refer: https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects
+
+    new_resolved_functions = {}
+    for func_id, in_knl_callable in program.program_callables_info.items():
+        if isinstance(in_knl_callable, CallableKernel):
+            new_subkernel = preprocess_single_kernel(
+                    in_knl_callable.subkernel, program.program_callables_info,
+                    device)
+            in_knl_callable = in_knl_callable.copy(
+                    subkernel=new_subkernel)
+        elif isinstance(in_knl_callable, ScalarCallable):
+            pass
+        else:
+            raise NotImplementedError("Unknown type of callable %s." % (
+                type(in_knl_callable).__name__))
+
+        new_resolved_functions[func_id] = in_knl_callable
+
+    new_program_callables_info = program.program_callables_info.copy(
+            resolved_functions=new_resolved_functions)
+    program = program.copy(program_callables_info=new_program_callables_info)
+
+    # }}}
+
+    # infer arg descrs of the callables
+    program = infer_arg_descr(program)
+
+    # {{{ hw axes inference
+
+    # FIXME: think of wrapping this in a function?
+    # NOTE(review): kernels unpack this as (gsize, lsize) -- confirm the order
+    local_size, global_size = program.get_grid_size_upper_bounds()
+
+    resolved_function_with_hw_axes_sizes_set = {}
+
+    for func_id, in_knl_callable in (
+            program.program_callables_info.items()):
+        if func_id == program.name:
+            resolved_function_with_hw_axes_sizes_set[func_id] = (
+                    in_knl_callable)
+        else:
+            resolved_function_with_hw_axes_sizes_set[func_id] = (
+                    in_knl_callable.with_hw_axes_sizes(local_size, global_size))
+
+    new_program_callables_info = (
+            program.program_callables_info.copy(
+                resolved_functions=resolved_function_with_hw_axes_sizes_set))
+
+    program = program.copy(program_callables_info=new_program_callables_info)
+
+    # }}}
+
+    return program
+
+
 # vim: foldmethod=marker
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 652f8b89..201bcc25 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -1832,7 +1832,7 @@ class MinRecursionLimitForScheduling(MinRecursionLimit):
 
 # {{{ main scheduling entrypoint
 
-def generate_loop_schedules(kernel, debug_args={}):
+def generate_loop_schedules(kernel, program_callables_info, debug_args={}):
     """
     .. warning::
 
@@ -1845,18 +1845,19 @@ def generate_loop_schedules(kernel, debug_args={}):
     """
 
     with MinRecursionLimitForScheduling(kernel):
-        for sched in generate_loop_schedules_inner(kernel, debug_args=debug_args):
+        for sched in generate_loop_schedules_inner(kernel,
+                program_callables_info, debug_args=debug_args):
             yield sched
 
 
-def generate_loop_schedules_inner(kernel, debug_args={}):
+def generate_loop_schedules_inner(kernel, program_callables_info, debug_args={}):
     from loopy.kernel import KernelState
     if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED):
         raise LoopyError("cannot schedule a kernel that has not been "
                 "preprocessed")
 
     from loopy.check import pre_schedule_checks
-    pre_schedule_checks(kernel)
+    pre_schedule_checks(kernel, program_callables_info)
 
     schedule_count = 0
 
@@ -1969,7 +1970,8 @@ def generate_loop_schedules_inner(kernel, debug_args={}):
             gen_sched = convert_barrier_instructions_to_barriers(
                     kernel, gen_sched)
 
-            gsize, lsize = kernel.get_grid_size_upper_bounds()
+            gsize, lsize = (
+                    kernel.get_grid_size_upper_bounds(program_callables_info))
 
             if (gsize or lsize):
                 if not kernel.options.disable_global_barriers:
@@ -2026,7 +2028,7 @@ schedule_cache = WriteOncePersistentDict(
         key_builder=LoopyKeyBuilder())
 
 
-def _get_one_scheduled_kernel_inner(kernel):
+def _get_one_scheduled_kernel_inner(kernel, program_callables_info):
     # This helper function exists to ensure that the generator chain is fully
     # out of scope after the function returns. This allows it to be
     # garbage-collected in the exit handler of the
@@ -2036,10 +2038,10 @@ def _get_one_scheduled_kernel_inner(kernel):
     #
     # See https://gitlab.tiker.net/inducer/sumpy/issues/31 for context.
 
-    return next(iter(generate_loop_schedules(kernel)))
+    return next(iter(generate_loop_schedules(kernel, program_callables_info)))
 
 
-def get_one_scheduled_kernel(kernel):
+def get_one_scheduled_kernel(kernel, program_callables_info):
     from loopy import CACHING_ENABLED
 
     sched_cache_key = kernel
@@ -2057,7 +2059,8 @@ def get_one_scheduled_kernel(kernel):
     if not from_cache:
         with ProcessLogger(logger, "%s: schedule" % kernel.name):
             with MinRecursionLimitForScheduling(kernel):
-                result = _get_one_scheduled_kernel_inner(kernel)
+                result = _get_one_scheduled_kernel_inner(kernel,
+                        program_callables_info)
 
     if CACHING_ENABLED and not from_cache:
         schedule_cache.store_if_not_present(sched_cache_key, result)
diff --git a/loopy/statistics.py b/loopy/statistics.py
index cee28b24..08b7f89e 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -33,6 +33,7 @@ from loopy.kernel.data import (
         MultiAssignmentBase, TemporaryVariable, AddressSpace)
 from loopy.diagnostic import warn_with_kernel, LoopyError
 from pytools import Record
+from loopy.kernel.function_interface import ScalarCallable, CallableKernel
 
 
 __doc__ = """
@@ -59,6 +60,14 @@ __doc__ = """
 """
 
 
+# FIXME: this is broken for the callable kernel design.
+# Qns:
+# - The variable name, what if multiple kernels use the same name?
+# - We should also add the cumulative effect on the arguments of callee kernels
+# into the caller kernel.
+# FIXME: raise an error unless there is exactly one callable kernel, i.e.
+# disable this for programs with multiple callable kernels.
+
 # {{{ GuardedPwQPolynomial
 
 class GuardedPwQPolynomial(object):
@@ -639,10 +648,11 @@ class MemAccess(Record):
 # {{{ counter base
 
 class CounterBase(CombineMapper):
-    def __init__(self, knl):
+    def __init__(self, knl, program_callables_info):
         self.knl = knl
+        self.program_callables_info = program_callables_info
         from loopy.type_inference import TypeInferenceMapper
-        self.type_inf = TypeInferenceMapper(knl)
+        self.type_inf = TypeInferenceMapper(knl, program_callables_info)
 
     def combine(self, values):
         return sum(values)
@@ -697,10 +707,11 @@ class CounterBase(CombineMapper):
 # {{{ ExpressionOpCounter
 
 class ExpressionOpCounter(CounterBase):
-    def __init__(self, knl):
+    def __init__(self, knl, program_callables_info):
         self.knl = knl
+        self.program_callables_info = program_callables_info
         from loopy.type_inference import TypeInferenceMapper
-        self.type_inf = TypeInferenceMapper(knl)
+        self.type_inf = TypeInferenceMapper(knl, program_callables_info)
 
     def combine(self, values):
         return sum(values)
@@ -712,9 +723,16 @@ class ExpressionOpCounter(CounterBase):
     map_variable = map_constant
 
     def map_call(self, expr):
+        from loopy.symbolic import ResolvedFunction
+        if isinstance(expr.function, ResolvedFunction):
+            function_identifier = self.program_callables_info[
+                    expr.function.name].name
+        else:
+            function_identifier = expr.function.name
+
         return ToCountMap(
                     {Op(dtype=self.type_inf(expr),
-                        name='func:'+str(expr.function),
+                        name='func:'+function_identifier,
                         count_granularity=CountGranularity.WORKITEM): 1}
                     ) + self.rec(expr.parameters)
 
@@ -1090,6 +1108,16 @@ def add_assumptions_guard(kernel, pwqpolynomial):
 
 
 def count(kernel, set, space=None):
+    from loopy.program import Program
+    if isinstance(kernel, Program):
+        if len([in_knl_callable for in_knl_callable in
+            kernel.program_callables_info.values() if isinstance(in_knl_callable,
+                CallableKernel)]) != 1:
+            raise NotImplementedError("Currently only supported for program with "
+                "only one CallableKernel.")
+
+        kernel = kernel.root_kernel
+
     try:
         if space is not None:
             set = set.align_params(space)
@@ -1188,9 +1216,10 @@ def count(kernel, set, space=None):
     return add_assumptions_guard(kernel, count)
 
 
-def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None):
+def get_unused_hw_axes_factor(knl, program_callables_info, insn,
+        disregard_local_axes, space=None):
     # FIXME: Multi-kernel support
-    gsize, lsize = knl.get_grid_size_upper_bounds()
+    gsize, lsize = knl.get_grid_size_upper_bounds(program_callables_info)
 
     g_used = set()
     l_used = set()
@@ -1228,7 +1257,8 @@ def get_unused_hw_axes_factor(knl, insn, disregard_local_axes, space=None):
     return add_assumptions_guard(knl, result)
 
 
-def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False):
+def count_insn_runs(knl, program_callables_info, insn, count_redundant_work,
+        disregard_local_axes=False):
 
     insn_inames = knl.insn_inames(insn)
 
@@ -1248,9 +1278,8 @@ def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False)
     c = count(knl, domain, space=space)
 
     if count_redundant_work:
-        unused_fac = get_unused_hw_axes_factor(knl, insn,
-                        disregard_local_axes=disregard_local_axes,
-                        space=space)
+        unused_fac = get_unused_hw_axes_factor(knl, program_callables_info,
+                insn, disregard_local_axes=disregard_local_axes, space=space)
         return c * unused_fac
     else:
         return c
@@ -1260,7 +1289,50 @@ def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False)
 
 # {{{ get_op_map
 
-def get_op_map(knl, numpy_types=True, count_redundant_work=False,
+
+def get_op_map_for_single_kernel(knl, program_callables_info,
+        numpy_types=True, count_redundant_work=False,
+               subgroup_size=None):
+    """Return a ToCountMap of the ops in the instructions of kernel *knl*."""
+    if not knl.options.ignore_boostable_into:
+        raise LoopyError("Kernel '%s': Using operation counting requires the option "
+                "ignore_boostable_into to be set." % knl.name)
+
+    from loopy.kernel.instruction import (
+            CallInstruction, CInstruction, Assignment,
+            NoOpInstruction, BarrierInstruction)
+
+    op_map = ToCountMap()
+    op_counter = ExpressionOpCounter(knl,
+            program_callables_info=program_callables_info)
+    for insn in knl.instructions:
+        if isinstance(insn, (CallInstruction, CInstruction, Assignment)):
+            ops = op_counter(insn.assignee) + op_counter(insn.expression)
+            op_map = op_map + ops*count_insn_runs(
+                    knl, program_callables_info, insn,
+                    count_redundant_work=count_redundant_work)
+        elif isinstance(insn, (NoOpInstruction, BarrierInstruction)):
+            pass
+        else:
+            raise NotImplementedError("unexpected instruction item type: '%s'"
+                    % type(insn).__name__)
+    # With numpy_types, re-key every Op by op.dtype.numpy_dtype; counts are kept.
+    if numpy_types:
+        return ToCountMap(
+                    init_dict=dict(
+                        (Op(
+                            dtype=op.dtype.numpy_dtype,
+                            name=op.name,
+                            count_granularity=op.count_granularity),
+                        ct)
+                        for op, ct in six.iteritems(op_map.count_map)),
+                    val_type=op_map.val_type
+                    )
+    else:
+        return op_map
+
+
+def get_op_map(program, numpy_types=True, count_redundant_work=False,
                subgroup_size=None):
 
     """Count the number of operations in a loopy kernel.
@@ -1318,44 +1390,31 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False,
 
     """
 
-    if not knl.options.ignore_boostable_into:
-        raise LoopyError("Kernel '%s': Using operation counting requires the option "
-                "ignore_boostable_into to be set." % knl.name)
-
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    from loopy.kernel.instruction import (
-            CallInstruction, CInstruction, Assignment,
-            NoOpInstruction, BarrierInstruction)
-    knl = infer_unknown_types(knl, expect_completion=True)
-    knl = preprocess_kernel(knl)
+    from loopy.preprocess import preprocess_program, infer_unknown_types
+    program = infer_unknown_types(program, expect_completion=True)
+    program = preprocess_program(program)
 
     op_map = ToCountMap()
-    op_counter = ExpressionOpCounter(knl)
-    for insn in knl.instructions:
-        if isinstance(insn, (CallInstruction, CInstruction, Assignment)):
-            ops = op_counter(insn.assignee) + op_counter(insn.expression)
-            op_map = op_map + ops*count_insn_runs(
-                    knl, insn,
-                    count_redundant_work=count_redundant_work)
-        elif isinstance(insn, (NoOpInstruction, BarrierInstruction)):
+
+    for func_id, in_knl_callable in program.program_callables_info.items():
+        if isinstance(in_knl_callable, CallableKernel):
+            num_times_called = (
+                    program.program_callables_info.num_times_callables_called[
+                        func_id])
+            knl = in_knl_callable.subkernel
+            knl_op_map = get_op_map_for_single_kernel(knl,
+                        program.program_callables_info, numpy_types,
+                        count_redundant_work, subgroup_size)
+
+            for i in range(num_times_called):
+                op_map += knl_op_map
+        elif isinstance(in_knl_callable, ScalarCallable):
             pass
         else:
-            raise NotImplementedError("unexpected instruction item type: '%s'"
-                    % type(insn).__name__)
+            raise NotImplementedError("Unknown callabke types %s." % (
+                type(in_knl_callable).__name__))
 
-    if numpy_types:
-        return ToCountMap(
-                    init_dict=dict(
-                        (Op(
-                            dtype=op.dtype.numpy_dtype,
-                            name=op.name,
-                            count_granularity=op.count_granularity),
-                        ct)
-                        for op, ct in six.iteritems(op_map.count_map)),
-                    val_type=op_map.val_type
-                    )
-    else:
-        return op_map
+    return op_map
 
 # }}}
 
@@ -1376,93 +1435,9 @@ def _find_subgroup_size_for_knl(knl):
 
 # {{{ get_mem_access_map
 
-def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
-                       subgroup_size=None):
-    """Count the number of memory accesses in a loopy kernel.
-
-    :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be
-        counted.
-
-    :arg numpy_types: A :class:`bool` specifying whether the types in the
-        returned mapping should be numpy types instead of
-        :class:`loopy.LoopyType`.
 
-    :arg count_redundant_work: Based on usage of hardware axes or other
-        specifics, a kernel may perform work redundantly. This :class:`bool`
-        flag indicates whether this work should be included in the count.
-        (Likely desirable for performance modeling, but undesirable for
-        code optimization.)
-
-    :arg subgroup_size: An :class:`int`, :class:`str` ``'guess'``, or
-        *None* that specifies the sub-group size. An OpenCL sub-group is an
-        implementation-dependent grouping of work-items within a work-group,
-        analagous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when
-        counting a :class:`MemAccess` whose count_granularity specifies that it
-        should only be counted once per sub-group. If set to *None* an attempt
-        to find the sub-group size using the device will be made, if this fails
-        an error will be raised. If a :class:`str` ``'guess'`` is passed as
-        the subgroup_size, get_mem_access_map will attempt to find the
-        sub-group size using the device and, if unsuccessful, will make a wild
-        guess.
-
-    :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:**
-        :class:`islpy.PwQPolynomial` **}**.
-
-        - The :class:`MemAccess` specifies the characteristics of the memory
-          access.
-
-        - The :class:`islpy.PwQPolynomial` holds the number of memory accesses
-          with the characteristics specified in the key (in terms of the
-          :class:`loopy.LoopKernel` *inames*).
-
-    Example usage::
-
-        # (first create loopy kernel and specify array data types)
-
-        params = {'n': 512, 'm': 256, 'l': 128}
-        mem_map = get_mem_access_map(knl)
-
-        f32_s1_g_ld_a = mem_map[MemAccess(
-                                    mtype='global',
-                                    dtype=np.float32,
-                                    lid_strides={0: 1},
-                                    gid_strides={0: 256},
-                                    direction='load',
-                                    variable='a',
-                                    count_granularity=CountGranularity.WORKITEM)
-                               ].eval_with_dict(params)
-        f32_s1_g_st_a = mem_map[MemAccess(
-                                    mtype='global',
-                                    dtype=np.float32,
-                                    lid_strides={0: 1},
-                                    gid_strides={0: 256},
-                                    direction='store',
-                                    variable='a',
-                                    count_granularity=CountGranularity.WORKITEM)
-                               ].eval_with_dict(params)
-        f32_s1_l_ld_x = mem_map[MemAccess(
-                                    mtype='local',
-                                    dtype=np.float32,
-                                    lid_strides={0: 1},
-                                    gid_strides={0: 256},
-                                    direction='load',
-                                    variable='x',
-                                    count_granularity=CountGranularity.WORKITEM)
-                               ].eval_with_dict(params)
-        f32_s1_l_st_x = mem_map[MemAccess(
-                                    mtype='local',
-                                    dtype=np.float32,
-                                    lid_strides={0: 1},
-                                    gid_strides={0: 256},
-                                    direction='store',
-                                    variable='x',
-                                    count_granularity=CountGranularity.WORKITEM)
-                               ].eval_with_dict(params)
-
-        # (now use these counts to, e.g., predict performance)
-
-    """
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
+def get_access_map_for_single_kernel(knl, program_callables_info,
+        numpy_types=True, count_redundant_work=False, subgroup_size=None):
 
     if not knl.options.ignore_boostable_into:
         raise LoopyError("Kernel '%s': Using operation counting requires the option "
@@ -1518,11 +1493,12 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
 
         if count_granularity == CountGranularity.WORKITEM:
             return count_insn_runs(
-                knl, insn, count_redundant_work=count_redundant_work,
+                knl, program_callables_info, insn,
+                count_redundant_work=count_redundant_work,
                 disregard_local_axes=False)
 
         ct_disregard_local = count_insn_runs(
-                knl, insn, disregard_local_axes=True,
+                knl, program_callables_info, insn, disregard_local_axes=True,
                 count_redundant_work=count_redundant_work)
 
         if count_granularity == CountGranularity.WORKGROUP:
@@ -1530,7 +1506,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
         elif count_granularity == CountGranularity.SUBGROUP:
             # get the group size
             from loopy.symbolic import aff_to_expr
-            _, local_size = knl.get_grid_size_upper_bounds()
+            _, local_size = knl.get_grid_size_upper_bounds(program_callables_info)
             workgroup_size = 1
             if local_size:
                 for size in local_size:
@@ -1556,12 +1532,9 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
                     "not allowed. count_granularity options: %s"
                     % (count_granularity, CountGranularity.ALL+[None]))
 
-    knl = infer_unknown_types(knl, expect_completion=True)
-    knl = preprocess_kernel(knl)
-
     access_map = ToCountMap()
-    access_counter_g = GlobalMemAccessCounter(knl)
-    access_counter_l = LocalMemAccessCounter(knl)
+    access_counter_g = GlobalMemAccessCounter(knl, program_callables_info)
+    access_counter_l = LocalMemAccessCounter(knl, program_callables_info)
 
     from loopy.kernel.instruction import (
             CallInstruction, CInstruction, Assignment,
@@ -1617,12 +1590,129 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
     else:
         return access_map
 
+
+def get_mem_access_map(program, numpy_types=True, count_redundant_work=False,
+                       subgroup_size=None):
+    """Count the number of memory accesses in a loopy program.
+
+    :arg program: A :class:`loopy.program.Program` whose memory accesses are
+        to be counted.
+
+    :arg numpy_types: A :class:`bool` specifying whether the types in the
+        returned mapping should be numpy types instead of
+        :class:`loopy.LoopyType`.
+
+    :arg count_redundant_work: Based on usage of hardware axes or other
+        specifics, a kernel may perform work redundantly. This :class:`bool`
+        flag indicates whether this work should be included in the count.
+        (Likely desirable for performance modeling, but undesirable for
+        code optimization.)
+
+    :arg subgroup_size: An :class:`int`, :class:`str` ``'guess'``, or
+        *None* that specifies the sub-group size. An OpenCL sub-group is an
+        implementation-dependent grouping of work-items within a work-group,
+        analogous to an NVIDIA CUDA warp. subgroup_size is used, e.g., when
+        counting a :class:`MemAccess` whose count_granularity specifies that it
+        should only be counted once per sub-group. If set to *None* an attempt
+        to find the sub-group size using the device will be made, if this fails
+        an error will be raised. If a :class:`str` ``'guess'`` is passed as
+        the subgroup_size, get_mem_access_map will attempt to find the
+        sub-group size using the device and, if unsuccessful, will make a wild
+        guess.
+
+    :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:**
+        :class:`islpy.PwQPolynomial` **}**.
+
+        - The :class:`MemAccess` specifies the characteristics of the memory
+          access.
+
+        - The :class:`islpy.PwQPolynomial` holds the number of memory accesses
+          with the characteristics specified in the key (in terms of the
+          :class:`loopy.LoopKernel` *inames*).
+
+    Example usage::
+
+        # (first create loopy kernel and specify array data types)
+
+        params = {'n': 512, 'm': 256, 'l': 128}
+        mem_map = get_mem_access_map(knl)
+
+        f32_s1_g_ld_a = mem_map[MemAccess(
+                                    mtype='global',
+                                    dtype=np.float32,
+                                    lid_strides={0: 1},
+                                    gid_strides={0: 256},
+                                    direction='load',
+                                    variable='a',
+                                    count_granularity=CountGranularity.WORKITEM)
+                               ].eval_with_dict(params)
+        f32_s1_g_st_a = mem_map[MemAccess(
+                                    mtype='global',
+                                    dtype=np.float32,
+                                    lid_strides={0: 1},
+                                    gid_strides={0: 256},
+                                    direction='store',
+                                    variable='a',
+                                    count_granularity=CountGranularity.WORKITEM)
+                               ].eval_with_dict(params)
+        f32_s1_l_ld_x = mem_map[MemAccess(
+                                    mtype='local',
+                                    dtype=np.float32,
+                                    lid_strides={0: 1},
+                                    gid_strides={0: 256},
+                                    direction='load',
+                                    variable='x',
+                                    count_granularity=CountGranularity.WORKITEM)
+                               ].eval_with_dict(params)
+        f32_s1_l_st_x = mem_map[MemAccess(
+                                    mtype='local',
+                                    dtype=np.float32,
+                                    lid_strides={0: 1},
+                                    gid_strides={0: 256},
+                                    direction='store',
+                                    variable='x',
+                                    count_granularity=CountGranularity.WORKITEM)
+                               ].eval_with_dict(params)
+
+        # (now use these counts to, e.g., predict performance)
+
+    """
+    from loopy.preprocess import preprocess_program, infer_unknown_types
+
+    program = infer_unknown_types(program, expect_completion=True)
+    program = preprocess_program(program)
+
+    access_map = ToCountMap()
+
+    for func_id, in_knl_callable in program.program_callables_info.items():
+        if isinstance(in_knl_callable, CallableKernel):
+            num_times_called = (
+                    program.program_callables_info.num_times_callables_called[
+                        func_id])
+            knl = in_knl_callable.subkernel
+            knl_access_map = get_access_map_for_single_kernel(knl,
+                        program.program_callables_info, numpy_types,
+                        count_redundant_work, subgroup_size)
+
+            # FIXME: add the map once per call; didn't see an easy way to multiply
+            for i in range(num_times_called):
+                access_map += knl_access_map
+        elif isinstance(in_knl_callable, ScalarCallable):
+            pass
+        else:
+            raise NotImplementedError("Unknown callable type %s." % (
+                type(in_knl_callable).__name__))
+
+    return access_map
+
+
 # }}}
 
 
 # {{{ get_synchronization_map
 
-def get_synchronization_map(knl, subgroup_size=None):
+def get_synchronization_map_for_single_kernel(knl, program_callables_info,
+        subgroup_size=None):
 
     """Count the number of synchronization events each work-item encounters in
     a loopy kernel.
@@ -1664,13 +1754,10 @@ def get_synchronization_map(knl, subgroup_size=None):
         raise LoopyError("Kernel '%s': Using operation counting requires the option "
                 "ignore_boostable_into to be set." % knl.name)
 
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
     from loopy.schedule import (EnterLoop, LeaveLoop, Barrier,
             CallKernel, ReturnFromKernel, RunInstruction)
     from operator import mul
-    knl = infer_unknown_types(knl, expect_completion=True)
-    knl = preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
+    knl = lp.get_one_scheduled_kernel(knl, program_callables_info)
     iname_list = []
 
     result = ToCountMap()
@@ -1713,12 +1800,42 @@ def get_synchronization_map(knl, subgroup_size=None):
 
     return result
 
+
+def get_synchronization_map(program, subgroup_size=None):
+    """Sum per-kernel synchronization maps over *program*'s callable kernels."""
+    from loopy.preprocess import preprocess_program, infer_unknown_types
+
+    program = infer_unknown_types(program, expect_completion=True)
+    program = preprocess_program(program)
+
+    sync_map = ToCountMap()
+
+    for func_id, in_knl_callable in program.program_callables_info.items():
+        if isinstance(in_knl_callable, CallableKernel):
+            num_times_called = (
+                    program.program_callables_info.num_times_callables_called[
+                        func_id])
+            knl = in_knl_callable.subkernel
+            knl_sync_map = get_synchronization_map_for_single_kernel(knl,
+                    program.program_callables_info, subgroup_size)
+
+            # FIXME: add the map once per call; didn't see an easy way to multiply
+            for i in range(num_times_called):
+                sync_map += knl_sync_map
+        elif isinstance(in_knl_callable, ScalarCallable):
+            pass
+        else:
+            raise NotImplementedError("Unknown callable type %s." % (
+                type(in_knl_callable).__name__))
+
+    return sync_map
+
 # }}}
 
 
 # {{{ gather_access_footprints
 
-def gather_access_footprints(kernel, ignore_uncountable=False):
+def gather_access_footprints_for_single_kernel(kernel, ignore_uncountable=False):
     """Return a dictionary mapping ``(var_name, direction)`` to
     :class:`islpy.Set` instances capturing which indices of each the array
     *var_name* are read/written (where *direction* is either ``read`` or
@@ -1729,13 +1846,6 @@ def gather_access_footprints(kernel, ignore_uncountable=False):
         nonlinear indices)
     """
 
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    kernel = infer_unknown_types(kernel, expect_completion=True)
-
-    from loopy.kernel import KernelState
-    if kernel.state < KernelState.PREPROCESSED:
-        kernel = preprocess_kernel(kernel)
-
     write_footprints = []
     read_footprints = []
 
@@ -1758,6 +1868,46 @@ def gather_access_footprints(kernel, ignore_uncountable=False):
             write_footprints.append(afg(insn.assignees))
         read_footprints.append(afg(insn.expression))
 
+    return write_footprints, read_footprints
+
+
+def gather_access_footprints(program, ignore_uncountable=False):
+    # FIXME: works only for programs with exactly one callable kernel so far.
+    if len([in_knl_callable for in_knl_callable in
+        program.program_callables_info.values() if isinstance(in_knl_callable,
+            CallableKernel)]) != 1:
+        raise NotImplementedError("Currently only supported for program with "
+            "only one CallableKernel.")
+
+    from loopy.preprocess import preprocess_program, infer_unknown_types
+
+    program = infer_unknown_types(program, expect_completion=True)
+    program = preprocess_program(program)
+
+    write_footprints = []
+    read_footprints = []
+
+    for func_id, in_knl_callable in program.program_callables_info.items():
+        if isinstance(in_knl_callable, CallableKernel):
+            num_times_called = (
+                    program.program_callables_info.num_times_callables_called[
+                        func_id])
+            knl = in_knl_callable.subkernel
+            knl_write_footprints, knl_read_footprints = (
+                    gather_access_footprints_for_single_kernel(knl,
+                        ignore_uncountable))
+
+            # FIXME: didn't see any easy way to multiply
+            for i in range(num_times_called):
+                write_footprints.extend(knl_write_footprints)
+                read_footprints.extend(knl_read_footprints)
+
+        elif isinstance(in_knl_callable, ScalarCallable):
+            pass
+        else:
+            raise NotImplementedError("Unknown callabke types %s." % (
+                type(in_knl_callable).__name__))
+
     write_footprints = AccessFootprintGatherer.combine(write_footprints)
     read_footprints = AccessFootprintGatherer.combine(read_footprints)
 
@@ -1772,7 +1922,7 @@ def gather_access_footprints(kernel, ignore_uncountable=False):
     return result
 
 
-def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
+def gather_access_footprint_bytes(program, ignore_uncountable=False):
     """Return a dictionary mapping ``(var_name, direction)`` to
     :class:`islpy.PwQPolynomial` instances capturing the number of bytes  are
     read/written (where *direction* is either ``read`` or ``write`` on array
@@ -1783,12 +1933,12 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False):
         nonlinear indices)
     """
 
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    kernel = infer_unknown_types(kernel, expect_completion=True)
+    from loopy.preprocess import preprocess_program, infer_unknown_types
+    kernel = infer_unknown_types(program, expect_completion=True)
 
     from loopy.kernel import KernelState
     if kernel.state < KernelState.PREPROCESSED:
-        kernel = preprocess_kernel(kernel)
+        kernel = preprocess_program(kernel)
 
     result = {}
     fp = gather_access_footprints(kernel,
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 8927cd6f..7a268d06 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -56,7 +56,7 @@ from pymbolic.mapper.constant_folder import \
         ConstantFoldingMapper as ConstantFoldingMapperBase
 
 from pymbolic.parser import Parser as ParserBase
-
+from loopy.diagnostic import LoopyError
 from loopy.diagnostic import ExpressionToAffineConversionError
 
 import islpy as isl
@@ -69,22 +69,23 @@ import numpy as np
 # {{{ mappers with support for loopy-specific primitives
 
 class IdentityMapperMixin(object):
-    def map_literal(self, expr, *args):
+    def map_literal(self, expr, *args, **kwargs):
         return expr
 
-    def map_array_literal(self, expr, *args):
-        return type(expr)(tuple(self.rec(ch, *args) for ch in expr.children))
+    def map_array_literal(self, expr, *args, **kwargs):
+        return type(expr)(tuple(self.rec(ch, *args, **kwargs) for ch in
+            expr.children))
 
-    def map_group_hw_index(self, expr, *args):
+    def map_group_hw_index(self, expr, *args, **kwargs):
         return expr
 
-    def map_local_hw_index(self, expr, *args):
+    def map_local_hw_index(self, expr, *args, **kwargs):
         return expr
 
-    def map_loopy_function_identifier(self, expr, *args):
+    def map_loopy_function_identifier(self, expr, *args, **kwargs):
         return expr
 
-    def map_reduction(self, expr, *args):
+    def map_reduction(self, expr, *args, **kwargs):
         mapped_inames = [self.rec(p.Variable(iname), *args) for iname in expr.inames]
 
         new_inames = []
@@ -98,15 +99,18 @@ class IdentityMapperMixin(object):
 
         return Reduction(
                 expr.operation, tuple(new_inames),
-                self.rec(expr.expr, *args),
+                self.rec(expr.expr, *args, **kwargs),
                 allow_simultaneous=expr.allow_simultaneous)
 
-    def map_tagged_variable(self, expr, *args):
+    def map_tagged_variable(self, expr, *args, **kwargs):
         # leaf, doesn't change
         return expr
 
-    def map_type_annotation(self, expr, *args):
-        return type(expr)(expr.type, self.rec(expr.child))
+    def map_type_annotation(self, expr, *args, **kwargs):
+        return type(expr)(expr.type, self.rec(expr.child, *args, **kwargs))
+
+    def map_resolved_function(self, expr, *args, **kwargs):
+        return ResolvedFunction(expr.function)
 
     map_type_cast = map_type_annotation
 
@@ -165,9 +169,16 @@ class WalkMapper(WalkMapperBase):
 
     map_rule_argument = map_group_hw_index
 
+    def map_resolved_function(self, expr, *args):
+        if not self.visit(expr):
+            return
+
+        self.rec(expr.function, *args)
+
 
 class CallbackMapper(CallbackMapperBase, IdentityMapper):
     map_reduction = CallbackMapperBase.map_constant
+    map_resolved_function = CallbackMapperBase.map_constant
 
 
 class CombineMapper(CombineMapperBase):
@@ -232,13 +243,16 @@ class StringifyMapper(StringifyMapperBase):
         from pymbolic.mapper.stringifier import PREC_NONE
         return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE))
 
+    def map_resolved_function(self, expr, prec):
+        return "ResolvedFunction('%s')" % expr.name
+
 
 class UnidirectionalUnifier(UnidirectionalUnifierBase):
     def map_reduction(self, expr, other, unis):
         if not isinstance(other, type(expr)):
             return self.treat_mismatch(expr, other, unis)
         if (expr.inames != other.inames
-                or type(expr.operation) != type(other.operation)  # noqa
+                or type(expr.function) != type(other.function)  # noqa
                 ):
             return []
 
@@ -289,6 +303,9 @@ class DependencyMapper(DependencyMapperBase):
     def map_type_cast(self, expr):
         return self.rec(expr.child)
 
+    def map_resolved_function(self, expr):
+        return self.rec(expr.function)
+
 
 class SubstitutionRuleExpander(IdentityMapper):
     def __init__(self, rules):
@@ -638,6 +655,51 @@ class RuleArgument(p.Expression):
 
     mapper_method = intern("map_rule_argument")
 
+
+class ResolvedFunction(p.Expression):
+    """
+    A function invocation whose definition is known in a :mod:`loopy` kernel.
+    Each instance of :class:`loopy.symbolic.ResolvedFunction` in an expression
+    points to an instance of
+    :class:`loopy.kernel.function_interface.InKernelCallable` through the
+    mapping :attr:`loopy.kernel.LoopKernel.scoped_functions`. Refer to
+    :ref:`ref_scoped_function` for a more detailed explanation of scoped
+    functions.
+
+    .. attribute:: function
+
+        An instance of :class:`pymbolic.primitives.Variable`,
+        :class:`loopy.library.reduction.ArgExtOp` or
+        :class:`loopy.library.reduction.SegmentedOp`.
+    """
+    init_arg_names = ("function", )
+
+    def __init__(self, function):
+        if isinstance(function, str):
+            function = p.Variable(function)
+        from loopy.library.reduction import ArgExtOp, SegmentedOp
+        assert isinstance(function, (p.Variable, ArgExtOp, SegmentedOp))
+        self.function = function
+
+    @property
+    def name(self):
+        from loopy.library.reduction import ArgExtOp, SegmentedOp
+        if isinstance(self.function, p.Variable):
+            return self.function.name
+        elif isinstance(self.function, (ArgExtOp, SegmentedOp)):
+            return self.function
+        else:
+            raise LoopyError("Unexpected function type %s in ResolvedFunction." %
+                    type(self.function))
+
+    def __getinitargs__(self):
+        return (self.function, )
+
+    def stringifier(self):
+        return StringifyMapper
+
+    mapper_method = intern("map_resolved_function")
+
 # }}}
 
 
@@ -650,9 +712,12 @@ def get_dependencies(expr):
 # {{{ rule-aware mappers
 
 def parse_tagged_name(expr):
+    from loopy.library.reduction import ArgExtOp, SegmentedOp
     if isinstance(expr, TaggedVariable):
         return expr.name, expr.tag
-    elif isinstance(expr, p.Variable):
+    elif isinstance(expr, ResolvedFunction):
+        return parse_tagged_name(expr.function)
+    elif isinstance(expr, (p.Variable, ArgExtOp, SegmentedOp)):
         return expr.name, None
     else:
         raise RuntimeError("subst rule name not understood: %s" % expr)
@@ -850,12 +915,14 @@ class RuleAwareIdentityMapper(IdentityMapper):
     def __init__(self, rule_mapping_context):
         self.rule_mapping_context = rule_mapping_context
 
-    def map_variable(self, expr, expn_state):
+    def map_variable(self, expr, expn_state, *args, **kwargs):
         name, tag = parse_tagged_name(expr)
         if name not in self.rule_mapping_context.old_subst_rules:
-            return IdentityMapper.map_variable(self, expr, expn_state)
+            return IdentityMapper.map_variable(self, expr, expn_state, *args,
+                    **kwargs)
         else:
-            return self.map_substitution(name, tag, (), expn_state)
+            return self.map_substitution(name, tag, (), expn_state, *args,
+                    **kwargs)
 
     def map_call(self, expr, expn_state):
         if not isinstance(expr.function, p.Variable):
@@ -910,7 +977,7 @@ class RuleAwareIdentityMapper(IdentityMapper):
         else:
             return sym
 
-    def __call__(self, expr, kernel, insn):
+    def __call__(self, expr, kernel, insn, *args, **kwargs):
         from loopy.kernel.data import InstructionBase
         assert insn is None or isinstance(insn, InstructionBase)
 
@@ -919,7 +986,7 @@ class RuleAwareIdentityMapper(IdentityMapper):
                     kernel=kernel,
                     instruction=insn,
                     stack=(),
-                    arg_context={}))
+                    arg_context={}), *args, **kwargs)
 
     def map_instruction(self, kernel, insn):
         return insn
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index a81354e2..e3b4853c 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -80,7 +80,7 @@ class TargetBase(object):
     def preprocess(self, kernel):
         return kernel
 
-    def pre_codegen_check(self, kernel):
+    def pre_codegen_check(self, kernel, program_callables_info):
         pass
 
     # }}}
@@ -150,7 +150,12 @@ class ASTBuilderBase(object):
 
     # {{{ library
 
-    def function_manglers(self):
+    def function_scopers(self):
+        """
+        Returns a list of functions with the signature
+        ``(target, identifier)``, each returning an instance of
+        :class:`InKernelCallable` if a match is found, or *None* otherwise.
+        """
         return []
 
     def symbol_manglers(self):
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 83efecf0..1579bb31 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -27,7 +27,6 @@ THE SOFTWARE.
 import six
 
 import numpy as np  # noqa
-from loopy.kernel.data import CallMangleInfo
 from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder
 from loopy.diagnostic import LoopyError, LoopyTypeError
 from cgen import Pointer, NestedDeclarator, Block
@@ -35,6 +34,7 @@ from cgen.mapper import IdentityMapper as CASTIdentityMapperBase
 from pymbolic.mapper.stringifier import PREC_NONE
 from loopy.symbolic import IdentityMapper
 from loopy.types import NumpyType
+from loopy.kernel.function_interface import ScalarCallable
 import pymbolic.primitives as p
 
 from pytools import memoize_method
@@ -354,71 +354,116 @@ def c_symbol_mangler(kernel, name):
 # }}}
 
 
-# {{{ function mangler
+# {{{ function scoping
 
-def c_math_mangler(target, name, arg_dtypes, modify_name=True):
-    # Function mangler for math functions defined in C standard
-    # Convert abs, min, max to fabs, fmin, fmax.
-    # If modify_name is set to True, function names are modified according to
-    # floating point types of the arguments (e.g. cos(double), cosf(float))
-    # This should be set to True for C and Cuda, False for OpenCL
-    if not isinstance(name, str):
-        return None
+class CMathCallable(ScalarCallable):
+    """
+    An umbrella callable for all the math functions which can be seen in a
+    C-Target.
+    """
 
-    if name in ["abs", "min", "max"]:
-        name = "f" + name
+    def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info):
+        name = self.name
 
-    # unitary functions
-    if (name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh",
-                 "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"]
-            and len(arg_dtypes) == 1
-            and arg_dtypes[0].numpy_dtype.kind == "f"):
+        if name in ["abs", "min", "max"]:
+            name = "f" + name
 
-        dtype = arg_dtypes[0].numpy_dtype
+        # unary functions
+        if name in ["fabs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh",
+                    "tan", "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor"]:
 
-        if modify_name:
-            if dtype == np.float64:
-                pass  # fabs
-            elif dtype == np.float32:
-                name = name + "f"  # fabsf
-            elif dtype == np.float128:
-                name = name + "l"  # fabsl
-            else:
-                raise LoopyTypeError("%s does not support type %s" % (name, dtype))
+            for id in arg_id_to_dtype:
+                if not -1 <= id <= 0:
+                    raise LoopyError("%s can take only one argument." % name)
 
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=arg_dtypes,
-                arg_dtypes=arg_dtypes)
+            if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None:
+                # the types provided aren't mature enough to specialize the
+                # callable
+                return (
+                        self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                        program_callables_info)
 
-    # binary functions
-    if (name in ["fmax", "fmin"]
-            and len(arg_dtypes) == 2):
+            dtype = arg_id_to_dtype[0]
+            dtype = dtype.numpy_dtype
 
-        dtype = np.find_common_type(
-            [], [dtype.numpy_dtype for dtype in arg_dtypes])
-
-        if dtype.kind == "c":
-            raise LoopyTypeError("%s does not support complex numbers")
+            if dtype.kind in ('u', 'i'):
+                # signed and unsigned integers are cast to float32
+                dtype = np.float32
+            elif dtype.kind == 'c':
+                raise LoopyTypeError("%s does not support type %s" % (name, dtype))
 
-        elif dtype.kind == "f":
-            if modify_name:
+            from loopy.target.opencl import OpenCLTarget
+            if not isinstance(caller_kernel.target, OpenCLTarget):
+                # for CUDA, C Targets the name must be modified
                 if dtype == np.float64:
-                    pass  # fmin
+                    pass  # fabs
                 elif dtype == np.float32:
-                    name = name + "f"  # fminf
+                    name = name + "f"  # fabsf
                 elif dtype == np.float128:
-                    name = name + "l"  # fminl
+                    name = name + "l"  # fabsl
                 else:
-                    raise LoopyTypeError("%s does not support type %s"
-                                         % (name, dtype))
+                    raise LoopyTypeError("%s does not support type %s" % (name,
+                        dtype))
+
+            return (
+                    self.copy(name_in_target=name,
+                        arg_id_to_dtype={0: NumpyType(dtype), -1:
+                            NumpyType(dtype)}),
+                    program_callables_info)
+
+        # binary functions
+        if name in ["fmax", "fmin"]:
+
+            for id in arg_id_to_dtype:
+                if not -1 <= id <= 1:
+                    raise LoopyError("%s can take only two arguments." % name)
+
+            if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
+                    arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
+                # the types provided aren't mature enough to specialize the
+                # callable
+                return (
+                        self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                        program_callables_info)
+
+            dtype = np.find_common_type(
+                [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items()
+                     if id >= 0])
+
+            if dtype.kind == "c":
+                raise LoopyTypeError("%s does not support complex numbers")
+
+            elif dtype.kind == "f":
+                from loopy.target.opencl import OpenCLTarget
+                if not isinstance(caller_kernel.target, OpenCLTarget):
+                    if dtype == np.float64:
+                        pass  # fmin
+                    elif dtype == np.float32:
+                        name = name + "f"  # fminf
+                    elif dtype == np.float128:
+                        name = name + "l"  # fminl
+                    else:
+                        raise LoopyTypeError("%s does not support type %s"
+                                             % (name, dtype))
+            dtype = NumpyType(dtype)
+            return (
+                    self.copy(name_in_target=name,
+                        arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}),
+                    program_callables_info)
+
+        return (
+                self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                program_callables_info)
 
-            result_dtype = NumpyType(dtype)
-            return CallMangleInfo(
-                    target_name=name,
-                    result_dtypes=(result_dtype,),
-                    arg_dtypes=2*(result_dtype,))
 
+def scope_c_math_functions(target, identifier):
+    """
+    Returns an instance of :class:`InKernelCallable` if the function
+    represented by :arg:`identifier` is known in C, otherwise returns *None*.
+    """
+    if identifier in ["abs", "acos", "asin", "atan", "cos", "cosh", "sin", "sinh",
+            "tanh", "exp", "log", "log10", "sqrt", "ceil", "floor", "max", "min"]:
+        return CMathCallable(name=identifier)
     return None
 
 # }}}
@@ -427,12 +472,6 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True):
 class CASTBuilder(ASTBuilderBase):
     # {{{ library
 
-    def function_manglers(self):
-        return (
-                super(CASTBuilder, self).function_manglers() + [
-                    c_math_mangler
-                    ])
-
     def symbol_manglers(self):
         return (
                 super(CASTBuilder, self).symbol_manglers() + [
@@ -445,6 +484,11 @@ class CASTBuilder(ASTBuilderBase):
                     _preamble_generator,
                     ])
 
+    def function_scopers(self):
+        return (
+                super(CASTBuilder, self).function_scopers() + [
+                    scope_c_math_functions])
+
     # }}}
 
     # {{{ code generation
@@ -846,82 +890,31 @@ class CASTBuilder(ASTBuilderBase):
         return block_if_necessary(assignments)
 
     def emit_multiple_assignment(self, codegen_state, insn):
-        ecm = codegen_state.expression_to_code_mapper
 
-        from pymbolic.primitives import Variable
-        from pymbolic.mapper.stringifier import PREC_NONE
-
-        func_id = insn.expression.function
-        parameters = insn.expression.parameters
-
-        if isinstance(func_id, Variable):
-            func_id = func_id.name
-
-        assignee_var_descriptors = [
-                codegen_state.kernel.get_var_descriptor(a)
-                for a in insn.assignee_var_names()]
-
-        par_dtypes = tuple(ecm.infer_type(par) for par in parameters)
-
-        mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes)
-        if mangle_result is None:
-            raise RuntimeError("function '%s' unknown--"
-                    "maybe you need to register a function mangler?"
-                    % func_id)
-
-        assert mangle_result.arg_dtypes is not None
+        ecm = codegen_state.expression_to_code_mapper
+        func_id = insn.expression.function.name
+        in_knl_callable = codegen_state.program_callables_info[func_id]
 
-        if mangle_result.target_name == "loopy_make_tuple":
-            # This shorcut avoids actually having to emit a 'make_tuple' function.
+        if isinstance(in_knl_callable, ScalarCallable) and (
+                in_knl_callable.name_in_target == 'loopy_make_tuple'):
             return self.emit_tuple_assignment(codegen_state, insn)
 
-        from loopy.expression import dtype_to_type_context
-        c_parameters = [
-                ecm(par, PREC_NONE,
-                    dtype_to_type_context(self.target, tgt_dtype),
-                    tgt_dtype).expr
-                for par, par_dtype, tgt_dtype in zip(
-                    parameters, par_dtypes, mangle_result.arg_dtypes)]
-
-        from loopy.codegen import SeenFunction
-        codegen_state.seen_functions.add(
-                SeenFunction(func_id,
-                    mangle_result.target_name,
-                    mangle_result.arg_dtypes))
-
-        from pymbolic import var
-        for i, (a, tgt_dtype) in enumerate(
-                zip(insn.assignees[1:], mangle_result.result_dtypes[1:])):
-            if tgt_dtype != ecm.infer_type(a):
-                raise LoopyError("type mismatch in %d'th (1-based) left-hand "
-                        "side of instruction '%s'" % (i+1, insn.id))
-            c_parameters.append(
-                        # TODO Yuck: The "where-at function": &(...)
-                        var("&")(
-                            ecm(a, PREC_NONE,
-                                dtype_to_type_context(self.target, tgt_dtype),
-                                tgt_dtype).expr))
+        in_knl_callable_as_call, is_returned = in_knl_callable.emit_call_insn(
+                insn=insn,
+                target=self.target,
+                expression_to_code_mapper=ecm)
 
-        from pymbolic import var
-        result = var(mangle_result.target_name)(*c_parameters)
-
-        # In case of no assignees, we are done
-        if len(mangle_result.result_dtypes) == 0:
+        if is_returned:
+            from cgen import Assign
+            lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None)
+            return Assign(lhs_code,
+                    CExpression(self.get_c_expression_to_code_mapper(),
+                    in_knl_callable_as_call))
+        else:
             from cgen import ExpressionStatement
             return ExpressionStatement(
-                    CExpression(self.get_c_expression_to_code_mapper(), result))
-
-        result = ecm.wrap_in_typecast(
-                mangle_result.result_dtypes[0],
-                assignee_var_descriptors[0].dtype,
-                result)
-
-        lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None)
-
-        from cgen import Assign
-        return Assign(
-                lhs_code,
-                CExpression(self.get_c_expression_to_code_mapper(), result))
+                    CExpression(self.get_c_expression_to_code_mapper(),
+                    in_knl_callable_as_call))
 
     def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
             lbound, ubound, inner):
diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py
index 6b80bae2..b3c304d5 100644
--- a/loopy/target/c/c_execution.py
+++ b/loopy/target/c/c_execution.py
@@ -157,7 +157,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
     # {{{
 
     def generate_output_handler(
-            self, gen, options, kernel, implemented_data_info):
+            self, gen, options, program, implemented_data_info):
 
         from loopy.kernel.data import KernelArgument
 
@@ -166,12 +166,13 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
                     % ", ".join("\"%s\": %s" % (arg.name, arg.name)
                         for arg in implemented_data_info
                         if issubclass(arg.arg_class, KernelArgument)
-                        if arg.base_name in kernel.get_written_variables()))
+                        if arg.base_name in
+                        program.root_kernel.get_written_variables()))
         else:
             out_args = [arg
                     for arg in implemented_data_info
                         if issubclass(arg.arg_class, KernelArgument)
-                    if arg.base_name in kernel.get_written_variables()]
+                    if arg.base_name in program.root_kernel.get_written_variables()]
             if out_args:
                 gen("return None, (%s,)"
                         % ", ".join(arg.name for arg in out_args))
@@ -373,7 +374,7 @@ class CKernelExecutor(KernelExecutorBase):
     .. automethod:: __call__
     """
 
-    def __init__(self, kernel, compiler=None):
+    def __init__(self, program, compiler=None):
         """
         :arg kernel: may be a loopy.LoopKernel, a generator returning kernels
             (a warning will be issued if more than one is returned). If the
@@ -382,35 +383,35 @@ class CKernelExecutor(KernelExecutorBase):
         """
 
         self.compiler = compiler if compiler else CCompiler()
-        super(CKernelExecutor, self).__init__(kernel)
+        super(CKernelExecutor, self).__init__(program)
 
     def get_invoker_uncached(self, kernel, codegen_result):
         generator = CExecutionWrapperGenerator()
         return generator(kernel, codegen_result)
 
     @memoize_method
-    def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None):
-        kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set)
+    def program_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None):
+        program = self.get_typed_and_scheduled_program(arg_to_dtype_set)
 
         from loopy.codegen import generate_code_v2
-        codegen_result = generate_code_v2(kernel)
+        codegen_result = generate_code_v2(program)
 
         dev_code = codegen_result.device_code()
         host_code = codegen_result.host_code()
         all_code = '\n'.join([dev_code, '', host_code])
 
-        if self.kernel.options.write_cl:
+        if self.program.root_kernel.options.write_cl:
             output = all_code
-            if self.kernel.options.highlight_cl:
+            if self.program.root_kernel.options.highlight_cl:
                 output = get_highlighted_code(code=output)
 
-            if self.kernel.options.write_cl is True:
+            if self.program.root_kernel.options.write_cl is True:
                 print(output)
             else:
-                with open(self.kernel.options.write_cl, "w") as outf:
+                with open(self.program.root_kernel.options.write_cl, "w") as outf:
                     outf.write(output)
 
-        if self.kernel.options.edit_cl:
+        if self.program.root_kernel.options.edit_cl:
             from pytools import invoke_editor
             dev_code = invoke_editor(dev_code, "code.c")
             # update code from editor
@@ -419,14 +420,14 @@ class CKernelExecutor(KernelExecutorBase):
         c_kernels = []
         for dp in codegen_result.device_programs:
             c_kernels.append(CompiledCKernel(dp,
-                codegen_result.implemented_data_info, all_code, self.kernel.target,
+                codegen_result.implemented_data_info, all_code, self.program.target,
                 self.compiler))
 
         return _KernelInfo(
-                kernel=kernel,
+                program=program,
                 c_kernels=c_kernels,
                 implemented_data_info=codegen_result.implemented_data_info,
-                invoker=self.get_invoker(kernel, codegen_result))
+                invoker=self.get_invoker(program, codegen_result))
 
     # }}}
 
@@ -443,7 +444,7 @@ class CKernelExecutor(KernelExecutorBase):
 
         kwargs = self.packing_controller.unpack(kwargs)
 
-        kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs))
+        program_info = self.program_info(self.arg_to_dtype_set(kwargs))
 
-        return kernel_info.invoker(
-                kernel_info.c_kernels, *args, **kwargs)
+        return program_info.invoker(
+                program_info.c_kernels, *args, **kwargs)
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index dd2104d0..65a8c202 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -41,7 +41,7 @@ from pymbolic import var
 from loopy.expression import dtype_to_type_context
 from loopy.type_inference import TypeInferenceMapper
 
-from loopy.diagnostic import LoopyError, LoopyWarning
+from loopy.diagnostic import LoopyError
 from loopy.tools import is_integer
 from loopy.types import LoopyType
 
@@ -54,7 +54,8 @@ class ExpressionToCExpressionMapper(IdentityMapper):
         self.codegen_state = codegen_state
 
         if type_inf_mapper is None:
-            type_inf_mapper = TypeInferenceMapper(self.kernel)
+            type_inf_mapper = TypeInferenceMapper(self.kernel,
+                    self.codegen_state.program_callables_info)
         self.type_inf_mapper = type_inf_mapper
 
         self.allow_complex = codegen_state.allow_complex
@@ -383,19 +384,19 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                         "for constant '%s'" % expr)
 
     def map_call(self, expr, type_context):
-        from pymbolic.primitives import Variable, Subscript
-
-        identifier = expr.function
+        from pymbolic.primitives import Subscript
 
         # {{{ implement indexof, indexof_vec
 
-        if identifier.name in ["indexof", "indexof_vec"]:
+        identifier_name = (
+                self.codegen_state.program_callables_info[expr.function.name].name)
+        if identifier_name in ["indexof", "indexof_vec"]:
             if len(expr.parameters) != 1:
-                raise LoopyError("%s takes exactly one argument" % identifier.name)
+                raise LoopyError("%s takes exactly one argument" % identifier_name)
             arg, = expr.parameters
             if not isinstance(arg, Subscript):
                 raise LoopyError(
-                        "argument to %s must be a subscript" % identifier.name)
+                        "argument to %s must be a subscript" % identifier_name)
 
             ary = self.find_array(arg)
 
@@ -407,11 +408,11 @@ class ExpressionToCExpressionMapper(IdentityMapper):
 
             from loopy.kernel.data import ImageArg
             if isinstance(ary, ImageArg):
-                raise LoopyError("%s does not support images" % identifier.name)
+                raise LoopyError("%s does not support images" % identifier_name)
 
-            if identifier.name == "indexof":
+            if identifier_name == "indexof":
                 return access_info.subscripts[0]
-            elif identifier.name == "indexof_vec":
+            elif identifier_name == "indexof_vec":
                 from loopy.kernel.array import VectorArrayDimTag
                 ivec = None
                 for iaxis, dim_tag in enumerate(ary.dim_tags):
@@ -430,56 +431,25 @@ class ExpressionToCExpressionMapper(IdentityMapper):
 
         # }}}
 
-        if isinstance(identifier, Variable):
-            identifier = identifier.name
-
-        par_dtypes = tuple(self.infer_type(par) for par in expr.parameters)
-
-        processed_parameters = None
-
-        mangle_result = self.kernel.mangle_function(
-                identifier, par_dtypes,
-                ast_builder=self.codegen_state.ast_builder)
-
-        if mangle_result is None:
-            raise RuntimeError("function '%s' unknown--"
-                    "maybe you need to register a function mangler?"
-                    % identifier)
-
-        if len(mangle_result.result_dtypes) != 1:
-            raise LoopyError("functions with more or fewer than one return value "
-                    "may not be used in an expression")
-
-        if mangle_result.arg_dtypes is not None:
-            processed_parameters = tuple(
-                    self.rec(par,
-                        dtype_to_type_context(self.kernel.target, tgt_dtype),
-                        tgt_dtype)
-                    for par, par_dtype, tgt_dtype in zip(
-                        expr.parameters, par_dtypes, mangle_result.arg_dtypes))
-
-        else:
-            # /!\ FIXME For some functions (e.g. 'sin'), it makes sense to
-            # propagate the type context here. But for many others, it does
-            # not. Using the inferred type as a stopgap for now.
-            processed_parameters = tuple(
-                    self.rec(par,
-                        type_context=dtype_to_type_context(
-                            self.kernel.target, par_dtype))
-                    for par, par_dtype in zip(expr.parameters, par_dtypes))
-
-            from warnings import warn
-            warn("Calling function '%s' with unknown C signature--"
-                    "return CallMangleInfo.arg_dtypes"
-                    % identifier, LoopyWarning)
-
-        from loopy.codegen import SeenFunction
-        self.codegen_state.seen_functions.add(
-                SeenFunction(identifier,
-                    mangle_result.target_name,
-                    mangle_result.arg_dtypes or par_dtypes))
-
-        return var(mangle_result.target_name)(*processed_parameters)
+        from loopy.kernel.function_interface import ManglerCallable
+        if isinstance(self.codegen_state.program_callables_info[expr.function.name],
+                ManglerCallable):
+            from loopy.codegen import SeenFunction
+            in_knl_callable = (
+                    self.codegen_state.program_callables_info[
+                        expr.function.name])
+            mangle_result = in_knl_callable.mangle_result(self.kernel)
+            self.codegen_state.seen_functions.add(
+                    SeenFunction(identifier_name,
+                        mangle_result.target_name,
+                        mangle_result.arg_dtypes))
+
+        return (
+                self.codegen_state.program_callables_info[
+                    expr.function.name].emit_call(
+                        expression_to_code_mapper=self,
+                    expression=expr,
+                    target=self.kernel.target))
 
     # {{{ deal with complex-valued variables
 
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 673d3b28..89cbfd03 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -34,6 +34,7 @@ from loopy.diagnostic import LoopyError
 from loopy.types import NumpyType
 from loopy.kernel.data import AddressSpace
 from pymbolic import var
+from loopy.kernel.function_interface import ScalarCallable
 
 
 # {{{ vector types
@@ -111,29 +112,82 @@ def _register_vector_types(dtype_registry):
 # }}}
 
 
-# {{{ function mangler
+# {{{ function scoper
 
-def cuda_function_mangler(kernel, name, arg_dtypes):
-    if not isinstance(name, str):
-        return None
+_CUDA_SPECIFIC_FUNCTIONS = {
+        "rsqrt": 1,
+        "atan2": 2,
+        }
 
-    if name in ["max", "min"] and len(arg_dtypes) == 2:
-        dtype = np.find_common_type([], arg_dtypes)
 
-        if dtype.kind == "c":
-            raise RuntimeError("min/max do not support complex numbers")
+class CudaCallable(ScalarCallable):
 
-        if dtype.kind == "f":
-            name = "f" + name
+    def cuda_with_types(self, arg_id_to_dtype, caller_kernel,
+            program_callables_info):
 
-        return dtype, name
+        name = self.name
 
-    if name in "atan2" and len(arg_dtypes) == 2:
-        return arg_dtypes[0], name
+        if name == "dot":
+            for id in arg_id_to_dtype:
+                if not -1 <= id <= 1:
+                    raise LoopyError("%s can take only 2 arguments." % name)
 
-    if name == "dot":
-        scalar_dtype, offset, field_name = arg_dtypes[0].fields["x"]
-        return scalar_dtype, name
+            if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
+                    arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
+                # the types provided aren't mature enough to specialize the
+                # callable
+                return (
+                        self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                        program_callables_info)
+
+            dtype = arg_id_to_dtype[0]
+            scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["x"]
+            return (
+                    self.copy(name_in_target=name, arg_id_to_dtype={-1:
+                        NumpyType(scalar_dtype),
+                        0: dtype, 1: dtype}),
+                    program_callables_info)
+
+        if name in _CUDA_SPECIFIC_FUNCTIONS:
+            num_args = _CUDA_SPECIFIC_FUNCTIONS[name]
+            for id in arg_id_to_dtype:
+                if not -1 <= id < num_args:
+                    raise LoopyError("%s can take only %d arguments." % (name,
+                            num_args))
+
+            for i in range(num_args):
+                if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None:
+                    # the types provided aren't mature enough to specialize the
+                    # callable
+                    return (
+                            self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                            program_callables_info)
+
+            dtype = np.find_common_type(
+                    [], [dtype.numpy_dtype for id, dtype in
+                        arg_id_to_dtype.items() if id >= 0])
+
+            if dtype.kind == "c":
+                raise LoopyError("%s does not support complex numbers"
+                        % name)
+
+            updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1,
+                num_args))
+
+            return (
+                    self.copy(name_in_target=name,
+                        arg_id_to_dtype=updated_arg_id_to_dtype),
+                    program_callables_info)
+
+        return (
+                self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                program_callables_info)
+
+
+def scope_cuda_functions(target, identifier):
+    """Return a CudaCallable if *identifier* names a CUDA function, else None."""
+    if identifier in set(_CUDA_SPECIFIC_FUNCTIONS).union(["dot"]):
+        return CudaCallable(name=identifier)
 
     return None
 
@@ -217,13 +271,12 @@ class CudaTarget(CTarget):
 # {{{ ast builder
 
 class CUDACASTBuilder(CASTBuilder):
+
     # {{{ library
 
-    def function_manglers(self):
-        return (
-                super(CUDACASTBuilder, self).function_manglers() + [
-                    cuda_function_mangler
-                    ])
+    def function_scopers(self):
+        return [scope_cuda_functions] + (
+                super(CUDACASTBuilder, self).function_scopers())
 
     # }}}
 
@@ -249,7 +302,8 @@ class CUDACASTBuilder(CASTBuilder):
         _, local_grid_size = \
                 codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
                         get_insn_ids_for_block_at(
-                            codegen_state.kernel.schedule, schedule_index))
+                            codegen_state.kernel.schedule, schedule_index),
+                        codegen_state.program_callables_info)
 
         from loopy.symbolic import get_dependencies
         if not get_dependencies(local_grid_size):
diff --git a/loopy/target/execution.py b/loopy/target/execution.py
index 3cdf2057..43963ddb 100644
--- a/loopy/target/execution.py
+++ b/loopy/target/execution.py
@@ -61,12 +61,12 @@ class SeparateArrayPackingController(object):
     It also repacks outgoing arrays of this type back into an object array.
     """
 
-    def __init__(self, kernel):
+    def __init__(self, program):
         # map from arg name
         self.packing_info = {}
 
         from loopy.kernel.array import ArrayBase
-        for arg in kernel.args:
+        for arg in program.args:
             if not isinstance(arg, ArrayBase):
                 continue
 
@@ -82,7 +82,8 @@ class SeparateArrayPackingController(object):
                     name=arg.name,
                     sep_shape=arg.sep_shape(),
                     subscripts_and_names=subscripts_and_names,
-                    is_written=arg.name in kernel.get_written_variables())
+                    is_written=arg.name in
+                    program.root_kernel.get_written_variables())
 
     def unpack(self, kernel_kwargs):
         if not self.packing_info:
@@ -143,7 +144,7 @@ class ExecutionWrapperGeneratorBase(object):
     # {{{ integer arg finding from shapes
 
     def generate_integer_arg_finding_from_shapes(
-            self, gen, kernel, implemented_data_info):
+            self, gen, program, implemented_data_info):
         # a mapping from integer argument names to a list of tuples
         # (arg_name, expression), where expression is a
         # unary function of kernel.arg_dict[arg_name]
@@ -168,7 +169,8 @@ class ExecutionWrapperGeneratorBase(object):
                     if len(deps) == 1:
                         integer_arg_var, = deps
 
-                        if kernel.arg_dict[integer_arg_var.name].dtype.is_integral():
+                        if program.arg_dict[
+                                integer_arg_var.name].dtype.is_integral():
                             from pymbolic.algorithm import solve_affine_equations_for
                             try:
                                 # friggin' overkill :)
@@ -214,9 +216,9 @@ class ExecutionWrapperGeneratorBase(object):
 
     # {{{ integer arg finding from offsets
 
-    def generate_integer_arg_finding_from_offsets(self, gen, kernel,
+    def generate_integer_arg_finding_from_offsets(self, gen, program,
                                                   implemented_data_info):
-        options = kernel.options
+        options = program.root_kernel.options
 
         gen("# {{{ find integer arguments from offsets")
         gen("")
@@ -239,7 +241,7 @@ class ExecutionWrapperGeneratorBase(object):
                         else:
                             gen("_lpy_offset = %s.offset" % impl_array_name)
 
-                        base_arg = kernel.impl_arg_to_arg[impl_array_name]
+                        base_arg = program.impl_arg_to_arg[impl_array_name]
 
                         if not options.skip_arg_checks:
                             gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)"
@@ -264,8 +266,8 @@ class ExecutionWrapperGeneratorBase(object):
     # {{{ integer arg finding from strides
 
     def generate_integer_arg_finding_from_strides(
-            self, gen, kernel, implemented_data_info):
-        options = kernel.options
+            self, gen, program, implemented_data_info):
+        options = program.root_kernel.options
 
         gen("# {{{ find integer arguments from strides")
         gen("")
@@ -284,7 +286,7 @@ class ExecutionWrapperGeneratorBase(object):
                                     "passed array\")"
                                     % (arg.name, impl_array_name))
 
-                        base_arg = kernel.impl_arg_to_arg[impl_array_name]
+                        base_arg = program.impl_arg_to_arg[impl_array_name]
 
                         if not options.skip_arg_checks:
                             gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)"
@@ -307,8 +309,8 @@ class ExecutionWrapperGeneratorBase(object):
     # {{{ check that value args are present
 
     def generate_value_arg_check(
-            self, gen, kernel, implemented_data_info):
-        if kernel.options.skip_arg_checks:
+            self, gen, program, implemented_data_info):
+        if program.root_kernel.options.skip_arg_checks:
             return
 
         from loopy.kernel.data import ValueArg
@@ -361,7 +363,7 @@ class ExecutionWrapperGeneratorBase(object):
     # {{{ arg setup
 
     def generate_arg_setup(
-            self, gen, kernel, implemented_data_info, options):
+            self, gen, program, implemented_data_info, options):
         import loopy as lp
 
         from loopy.kernel.data import KernelArgument
@@ -384,8 +386,8 @@ class ExecutionWrapperGeneratorBase(object):
         expect_no_more_arguments = False
 
         for arg_idx, arg in enumerate(implemented_data_info):
-            is_written = arg.base_name in kernel.get_written_variables()
-            kernel_arg = kernel.impl_arg_to_arg.get(arg.name)
+            is_written = arg.base_name in program.root_kernel.get_written_variables()
+            program_arg = program.impl_arg_to_arg.get(arg.name)
 
             if not issubclass(arg.arg_class, KernelArgument):
                 expect_no_more_arguments = True
@@ -447,7 +449,7 @@ class ExecutionWrapperGeneratorBase(object):
                 gen("if %s is None:" % arg.name)
                 with Indentation(gen):
                     self.handle_alloc(
-                        gen, arg, kernel_arg, strify, options.skip_arg_checks)
+                        gen, arg, program_arg, strify, options.skip_arg_checks)
                     gen("_lpy_made_by_loopy = True")
                     gen("")
 
@@ -465,7 +467,7 @@ class ExecutionWrapperGeneratorBase(object):
                 with Indentation(gen):
                     gen("if %s.dtype != %s:"
                             % (arg.name, self.python_dtype_str(
-                                kernel_arg.dtype.numpy_dtype)))
+                                program_arg.dtype.numpy_dtype)))
                     with Indentation(gen):
                         gen("raise TypeError(\"dtype mismatch on argument '%s' "
                                 "(got: %%s, expected: %s)\" %% %s.dtype)"
@@ -493,10 +495,10 @@ class ExecutionWrapperGeneratorBase(object):
                             "%% (%s.shape, %s))"
                             % (arg.name, arg.name, strify_tuple(arg.unvec_shape)))
 
-                    if kernel_arg.shape is None:
+                    if program_arg.shape is None:
                         pass
 
-                    elif any(shape_axis is None for shape_axis in kernel_arg.shape):
+                    elif any(shape_axis is None for shape_axis in program_arg.shape):
                         gen("if len(%s.shape) != %s:"
                                 % (arg.name, len(arg.unvec_shape)))
                         with Indentation(gen):
@@ -519,8 +521,8 @@ class ExecutionWrapperGeneratorBase(object):
 
                     # }}}
 
-                    if arg.unvec_strides and kernel_arg.dim_tags:
-                        itemsize = kernel_arg.dtype.numpy_dtype.itemsize
+                    if arg.unvec_strides and program_arg.dim_tags:
+                        itemsize = program_arg.dtype.numpy_dtype.itemsize
                         sym_strides = tuple(
                                 itemsize*s_i for s_i in arg.unvec_strides)
 
@@ -558,7 +560,7 @@ class ExecutionWrapperGeneratorBase(object):
                         with Indentation(gen):
                             gen("raise ValueError(\"Argument '%s' does not "
                                     "allow arrays with offsets. Try passing "
-                                    "default_offset=loopy.auto to make_kernel()."
+                                    "default_offset=loopy.auto to make_program()."
                                     "\")" % arg.name)
                             gen("")
 
@@ -617,7 +619,7 @@ class ExecutionWrapperGeneratorBase(object):
     def generate_host_code(self, gen, codegen_result):
         raise NotImplementedError
 
-    def __call__(self, kernel, codegen_result):
+    def __call__(self, program, codegen_result):
         """
         Generates the wrapping python invoker for this execution target
 
@@ -629,12 +631,12 @@ class ExecutionWrapperGeneratorBase(object):
             kernel
         """
 
-        options = kernel.options
+        options = program.root_kernel.options
         implemented_data_info = codegen_result.implemented_data_info
 
         from loopy.kernel.data import KernelArgument
         gen = PythonFunctionGenerator(
-                "invoke_%s_loopy_kernel" % kernel.name,
+                "invoke_%s_loopy_kernel" % program.name,
                 self.system_args + [
                     "%s=None" % idi.name
                     for idi in implemented_data_info
@@ -651,21 +653,21 @@ class ExecutionWrapperGeneratorBase(object):
         self.initialize_system_args(gen)
 
         self.generate_integer_arg_finding_from_shapes(
-            gen, kernel, implemented_data_info)
+            gen, program, implemented_data_info)
         self.generate_integer_arg_finding_from_offsets(
-            gen, kernel, implemented_data_info)
+            gen, program, implemented_data_info)
         self.generate_integer_arg_finding_from_strides(
-            gen, kernel, implemented_data_info)
+            gen, program, implemented_data_info)
         self.generate_value_arg_check(
-            gen, kernel, implemented_data_info)
+            gen, program, implemented_data_info)
 
         args = self.generate_arg_setup(
-            gen, kernel, implemented_data_info, options)
+            gen, program, implemented_data_info, options)
 
         self.generate_invocation(gen, codegen_result.host_program.name, args,
-                kernel, implemented_data_info)
+                program, implemented_data_info)
 
-        self.generate_output_handler(gen, options, kernel, implemented_data_info)
+        self.generate_output_handler(gen, options, program, implemented_data_info)
 
         if options.write_wrapper:
             output = gen.get()
@@ -713,32 +715,32 @@ class KernelExecutorBase(object):
     .. automethod:: __call__
     """
 
-    def __init__(self, kernel):
+    def __init__(self, program):
         """
         :arg kernel: a loopy.LoopKernel
         """
 
-        self.kernel = kernel
+        self.program = program
 
-        self.packing_controller = SeparateArrayPackingController(kernel)
+        self.packing_controller = SeparateArrayPackingController(program)
 
-        self.output_names = tuple(arg.name for arg in self.kernel.args
-                if arg.name in self.kernel.get_written_variables())
+        self.output_names = tuple(arg.name for arg in self.program.args
+                if arg.is_output_only)
 
         self.has_runtime_typed_args = any(
                 arg.dtype is None
-                for arg in kernel.args)
+                for arg in program.args)
 
-    def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set):
+    def get_typed_and_scheduled_program_uncached(self, arg_to_dtype_set):
         from loopy.kernel.tools import add_dtypes
 
-        kernel = self.kernel
+        program = self.program
 
         if arg_to_dtype_set:
             var_to_dtype = {}
             for var, dtype in arg_to_dtype_set:
                 try:
-                    dest_name = kernel.impl_arg_to_arg[var].name
+                    dest_name = program.impl_arg_to_arg[var].name
                 except KeyError:
                     dest_name = var
 
@@ -749,28 +751,30 @@ class KernelExecutorBase(object):
                             "no known variable/argument with that name"
                             % var)
 
-            kernel = add_dtypes(kernel, var_to_dtype)
+            program = add_dtypes(program, var_to_dtype)
 
-            from loopy.type_inference import infer_unknown_types
-            kernel = infer_unknown_types(kernel, expect_completion=True)
+        from loopy.type_inference import infer_unknown_types
+        program = infer_unknown_types(program, expect_completion=True)
 
-        if kernel.schedule is None:
-            from loopy.preprocess import preprocess_kernel
-            kernel = preprocess_kernel(kernel)
+        if program.root_kernel.schedule is None:
+            from loopy.preprocess import preprocess_program
+            program = preprocess_program(program)
 
             from loopy.schedule import get_one_scheduled_kernel
-            kernel = get_one_scheduled_kernel(kernel)
+            program = program.with_root_kernel(
+                    get_one_scheduled_kernel(program.root_kernel,
+                        program.program_callables_info))
 
-        return kernel
+        return program
 
-    def get_typed_and_scheduled_kernel(self, arg_to_dtype_set):
+    def get_typed_and_scheduled_program(self, arg_to_dtype_set):
         from loopy import CACHING_ENABLED
 
         from loopy.preprocess import prepare_for_caching
         # prepare_for_caching() gets run by preprocess, but the kernel at this
         # stage is not guaranteed to be preprocessed.
-        cacheable_kernel = prepare_for_caching(self.kernel)
-        cache_key = (type(self).__name__, cacheable_kernel, arg_to_dtype_set)
+        cacheable_program = prepare_for_caching(self.program)
+        cache_key = (type(self).__name__, cacheable_program, arg_to_dtype_set)
 
         if CACHING_ENABLED:
             try:
@@ -778,9 +782,9 @@ class KernelExecutorBase(object):
             except KeyError:
                 pass
 
-        logger.debug("%s: typed-and-scheduled cache miss" % self.kernel.name)
+        logger.debug("%s: typed-and-scheduled cache miss" % self.program.name)
 
-        kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set)
+        kernel = self.get_typed_and_scheduled_program_uncached(arg_to_dtype_set)
 
         if CACHING_ENABLED:
             typed_and_scheduled_cache.store_if_not_present(cache_key, kernel)
@@ -791,7 +795,7 @@ class KernelExecutorBase(object):
         if not self.has_runtime_typed_args:
             return None
 
-        impl_arg_to_arg = self.kernel.impl_arg_to_arg
+        impl_arg_to_arg = self.program.impl_arg_to_arg
         arg_to_dtype = {}
         for arg_name, val in six.iteritems(kwargs):
             arg = impl_arg_to_arg.get(arg_name, None)
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 0464270a..53963183 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -172,8 +172,9 @@ class ISPCTarget(CTarget):
     host_program_name_suffix = ""
     device_program_name_suffix = "_inner"
 
-    def pre_codegen_check(self, kernel):
-        gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs()
+    def pre_codegen_check(self, kernel, program_callables_info):
+        gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs(
+                program_callables_info)
         if len(lsize) > 1:
             for i, ls_i in enumerate(lsize[1:]):
                 if ls_i != 1:
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 432c95ef..44f782a7 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -31,11 +31,11 @@ from loopy.target.c.codegen.expression import ExpressionToCExpressionMapper
 from pytools import memoize_method
 from loopy.diagnostic import LoopyError
 from loopy.types import NumpyType
-from loopy.target.c import DTypeRegistryWrapper, c_math_mangler
-from loopy.kernel.data import AddressSpace, CallMangleInfo
+from loopy.target.c import DTypeRegistryWrapper
+from loopy.kernel.data import AddressSpace
+from loopy.kernel.function_interface import ScalarCallable
 from pymbolic import var
 
-from functools import partial
 
 # {{{ dtype registry wrappers
 
@@ -166,59 +166,135 @@ VECTOR_LITERAL_FUNCS = dict(
         )
 
 
-def opencl_function_mangler(kernel, name, arg_dtypes):
-    if not isinstance(name, str):
-        return None
+class OpenCLCallable(ScalarCallable):
+    """
+    Records information about OpenCL functions which are not covered by
+    :class:`loopy.target.c.CMathCallable`.
+    """
+
+    def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info):
+        """Return ``(callable specialized to dtypes, program_callables_info)``."""
+        name = self.name
+        if name in ["max", "min"]:
+            for id in arg_id_to_dtype:
+                if not -1 <= id <= 1:
+                    raise LoopyError("%s can take only 2 arguments." % name)
+            if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype:
+                return (
+                        self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                        program_callables_info)
+
+            dtype = np.find_common_type(
+                    [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items()
+                        if (id >= 0 and dtype is not None)])
+
+            if dtype.kind in ['u', 'i', 'f']:
+                if dtype.kind == 'f':
+                    name = 'f'+name
+                dtype = NumpyType(dtype)
+                return (
+                        self.copy(name_in_target=name,
+                            arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}),
+                        program_callables_info)
+            else:
+                # Unsupported type.
+                raise LoopyError("%s function not supported for the types %s" %
+                        (name, dtype))
+
+        if name == "dot":
+            for id in arg_id_to_dtype:
+                if not -1 <= id <= 1:
+                    raise LoopyError("%s can take only 2 arguments." % name)
+
+            if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
+                    arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
+                # the types provided aren't mature enough to specialize the
+                # callable
+                return (
+                        self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                        program_callables_info)
+
+            dtype = arg_id_to_dtype[0]
+            scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"]
+            return (
+                    self.copy(name_in_target=name, arg_id_to_dtype={-1:
+                        NumpyType(scalar_dtype), 0: dtype, 1: dtype}),
+                    program_callables_info)
+
+        if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS:
+            num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name]
+            for id in arg_id_to_dtype:
+                if not -1 <= id < num_args:
+                    raise LoopyError("%s can take only %d arguments." % (name,
+                            num_args))
+
+            for i in range(num_args):
+                if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None:
+                    # the types provided aren't mature enough to specialize the
+                    # callable
+                    return (
+                            self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                            program_callables_info)
+
+            dtype = np.find_common_type(
+                    [], [dtype.numpy_dtype for id, dtype in
+                        arg_id_to_dtype.items() if id >= 0])
+
+            if dtype.kind == "c":
+                raise LoopyError("%s does not support complex numbers"
+                        % name)
+
+            updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in range(-1,
+                num_args))
+
+            return (
+                    self.copy(name_in_target=name,
+                        arg_id_to_dtype=updated_arg_id_to_dtype),
+                    program_callables_info)
+
+        if name in VECTOR_LITERAL_FUNCS:
+            base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name]
+            # *count* is both the arity and the vector width of the literal
+            for id in arg_id_to_dtype:
+                if not -1 <= id < count:
+                    raise LoopyError("%s can take only %d arguments." % (name,
+                            count))
+
+            for i in range(count):
+                if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None:
+                    # the types provided aren't mature enough to specialize the
+                    # callable
+                    return (
+                            self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                            program_callables_info)
+
+            updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in
+                    range(count))
+            updated_arg_id_to_dtype[-1] = OpenCLTarget().vector_dtype(
+                        NumpyType(dtype), count)
+
+            return (
+                    self.copy(name_in_target="(%s%d) " % (base_tp_name, count),
+                        arg_id_to_dtype=updated_arg_id_to_dtype),
+                    program_callables_info)
+
+        # does not satisfy any of the conditions needed for specialization.
+        # hence just returning a copy of the callable.
+        return (
+                self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                program_callables_info)
 
-    # OpenCL has min(), max() for integer types
-    if name in ["max", "min"] and len(arg_dtypes) == 2:
-        dtype = np.find_common_type(
-                [], [dtype.numpy_dtype for dtype in arg_dtypes])
-
-        if dtype.kind == "i":
-            result_dtype = NumpyType(dtype)
-            return CallMangleInfo(
-                    target_name=name,
-                    result_dtypes=(result_dtype,),
-                    arg_dtypes=2*(result_dtype,))
-
-    if name == "dot":
-        scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"]
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=(NumpyType(scalar_dtype),),
-                arg_dtypes=(arg_dtypes[0],)*2)
-
-    if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS:
-        num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name]
-        if len(arg_dtypes) != num_args:
-            raise LoopyError("%s takes %d arguments (%d received)"
-                    % (name, num_args, len(arg_dtypes)))
-
-        dtype = np.find_common_type(
-                [], [dtype.numpy_dtype for dtype in arg_dtypes])
-
-        if dtype.kind == "c":
-            raise LoopyError("%s does not support complex numbers"
-                    % name)
-
-        result_dtype = NumpyType(dtype)
-        return CallMangleInfo(
-                target_name=name,
-                result_dtypes=(result_dtype,),
-                arg_dtypes=(result_dtype,)*num_args)
-
-    if name in VECTOR_LITERAL_FUNCS:
-        base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name]
-
-        if count != len(arg_dtypes):
-            return None
-
-        return CallMangleInfo(
-                target_name="(%s%d) " % (base_tp_name, count),
-                result_dtypes=(kernel.target.vector_dtype(
-                    NumpyType(dtype), count),),
-                arg_dtypes=(NumpyType(dtype),)*count)
+
+def scope_opencl_functions(target, identifier):
+    """
+    Return an :class:`OpenCLCallable` for *identifier* if it names a function
+    known to OpenCL, otherwise *None*.
+    """
+    known_ids = set(_CL_SIMPLE_MULTI_ARG_FUNCTIONS).union(
+            VECTOR_LITERAL_FUNCS, ["max", "min", "dot"])
+
+    if identifier in known_ids:
+        return OpenCLCallable(name=identifier)
 
     return None
 
@@ -280,6 +356,7 @@ def opencl_preamble_generator(preamble_info):
 
     from loopy.tools import remove_common_indentation
     kernel = preamble_info.kernel
+
     yield ("00_declare_gid_lid",
             remove_common_indentation("""
                 #define lid(N) ((%(idx_ctype)s) get_local_id(N))
@@ -365,13 +442,10 @@ class OpenCLTarget(CTarget):
 class OpenCLCASTBuilder(CASTBuilder):
     # {{{ library
 
-    def function_manglers(self):
+    def function_scopers(self):
         return (
-                [
-                    opencl_function_mangler,
-                    partial(c_math_mangler, modify_name=False)
-                ] +
-                super(OpenCLCASTBuilder, self).function_manglers())
+                [scope_opencl_functions] + super(
+                    OpenCLCASTBuilder, self).function_scopers())
 
     def symbol_manglers(self):
         return (
@@ -380,13 +454,10 @@ class OpenCLCASTBuilder(CASTBuilder):
                     ])
 
     def preamble_generators(self):
-        from loopy.library.reduction import reduction_preamble_generator
 
         return (
                 super(OpenCLCASTBuilder, self).preamble_generators() + [
-                    opencl_preamble_generator,
-                    reduction_preamble_generator,
-                    ])
+                    opencl_preamble_generator])
 
     # }}}
 
@@ -399,6 +470,11 @@ class OpenCLCASTBuilder(CASTBuilder):
 
         from loopy.target.c import FunctionDeclarationWrapper
         assert isinstance(fdecl, FunctionDeclarationWrapper)
+        if not codegen_state.kernel.is_called_from_host:
+            # auxiliary kernels need not mention OpenCL-specific qualifiers
+            # in a function's signature
+            return fdecl
+
         fdecl = fdecl.subdecl
 
         from cgen.opencl import CLKernel, CLRequiredWorkGroupSize
@@ -407,7 +483,8 @@ class OpenCLCASTBuilder(CASTBuilder):
         from loopy.schedule import get_insn_ids_for_block_at
         _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
                 get_insn_ids_for_block_at(
-                    codegen_state.kernel.schedule, schedule_index))
+                    codegen_state.kernel.schedule, schedule_index),
+                codegen_state.program_callables_info)
 
         from loopy.symbolic import get_dependencies
         if not get_dependencies(local_sizes):
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 73e8e009..03ba2693 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -31,12 +31,12 @@ from six.moves import range
 
 import numpy as np
 
-from loopy.kernel.data import CallMangleInfo
 from loopy.target.opencl import OpenCLTarget, OpenCLCASTBuilder
 from loopy.target.python import PythonASTBuilderBase
 from loopy.types import NumpyType
-from loopy.diagnostic import LoopyError, warn_with_kernel
+from loopy.diagnostic import LoopyError, warn_with_kernel, LoopyTypeError
 from warnings import warn
+from loopy.kernel.function_interface import ScalarCallable
 
 import logging
 logger = logging.getLogger(__name__)
@@ -134,7 +134,7 @@ def adjust_local_temp_var_storage(kernel, device):
 
 # {{{ check sizes against device properties
 
-def check_sizes(kernel, device):
+def check_sizes(kernel, program_callables_info, device):
     import loopy as lp
 
     from loopy.diagnostic import LoopyAdvisory, LoopyError
@@ -151,7 +151,8 @@ def check_sizes(kernel, device):
         if isinstance(arg, lp.ValueArg) and arg.approximately is not None:
             parameters[arg.name] = arg.approximately
 
-    glens, llens = kernel.get_grid_size_upper_bounds_as_exprs()
+    glens, llens = (
+            kernel.get_grid_size_upper_bounds_as_exprs(program_callables_info))
 
     if (max(len(glens), len(llens))
             > device.max_work_item_dimensions):
@@ -199,37 +200,89 @@ def check_sizes(kernel, device):
 # }}}
 
 
-def pyopencl_function_mangler(target, name, arg_dtypes):
-    if len(arg_dtypes) == 1 and isinstance(name, str):
-        arg_dtype, = arg_dtypes
+# {{{ pyopencl function scopers
 
-        if arg_dtype.is_complex():
-            if arg_dtype.numpy_dtype == np.complex64:
-                tpname = "cfloat"
-            elif arg_dtype.numpy_dtype == np.complex128:
-                tpname = "cdouble"
+class PyOpenCLCallable(ScalarCallable):
+    """
+    Records information about the callables which are not covered by
+    :class:`loopy.target.opencl.OpenCLCallable`
+    """
+    def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info):
+
+        name = self.name
+
+        for id in arg_id_to_dtype:
+            # all of the functions handled below take a single argument.
+            if not -1 <= id <= 0:
+                raise LoopyError("%s can only take one argument." % name)
+
+        if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None:
+            # the types provided aren't mature enough to specialize the
+            # callable
+            return (
+                    self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                    program_callables_info)
+
+        dtype = arg_id_to_dtype[0]
+
+        if name in ["real", "imag", "abs"]:
+            if dtype.is_complex():
+                if dtype.numpy_dtype == np.complex64:
+                    tpname = "cfloat"
+                elif dtype.numpy_dtype == np.complex128:
+                    tpname = "cdouble"
+                else:
+                    raise LoopyTypeError("unexpected complex type '%s'" % dtype)
+
+                return (
+                        self.copy(name_in_target="%s_%s" % (tpname, name),
+                            arg_id_to_dtype={0: dtype, -1: NumpyType(
+                                np.dtype(dtype.numpy_dtype.type(0).real))}),
+                        program_callables_info)
+
+        if name in ["sqrt", "exp", "log",
+                "sin", "cos", "tan",
+                "sinh", "cosh", "tanh",
+                "conj", "abs"]:
+            if dtype.is_complex():
+                # function parameters are complex.
+                if dtype.numpy_dtype == np.complex64:
+                    tpname = "cfloat"
+                elif dtype.numpy_dtype == np.complex128:
+                    tpname = "cdouble"
+                else:
+                    raise LoopyTypeError("unexpected complex type '%s'" % dtype)
+
+                return (
+                        self.copy(name_in_target="%s_%s" % (tpname, name),
+                            arg_id_to_dtype={0: dtype, -1: dtype}),
+                        program_callables_info)
             else:
-                raise RuntimeError("unexpected complex type '%s'" % arg_dtype)
-
-            if name in ["sqrt", "exp", "log",
-                    "sin", "cos", "tan",
-                    "sinh", "cosh", "tanh",
-                    "conj"]:
-                return CallMangleInfo(
-                        target_name="%s_%s" % (tpname, name),
-                        result_dtypes=(arg_dtype,),
-                        arg_dtypes=(arg_dtype,))
-
-            if name in ["real", "imag", "abs"]:
-                return CallMangleInfo(
-                        target_name="%s_%s" % (tpname, name),
-                        result_dtypes=(NumpyType(
-                            np.dtype(arg_dtype.numpy_dtype.type(0).real)),
-                            ),
-                        arg_dtypes=(arg_dtype,))
+                # non-complex parameters; integer dtypes are promoted to float32.
+                numpy_dtype = dtype.numpy_dtype
+                if numpy_dtype.kind in ('u', 'i'):
+                    dtype = dtype.copy(numpy_dtype=np.float32)
+                if name == 'abs':
+                    name = 'fabs'
+                return (
+                        self.copy(name_in_target=name,
+                            arg_id_to_dtype={0: dtype, -1: dtype}),
+                        program_callables_info)
+
+        return (
+                self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                program_callables_info)
+
+
+def pyopencl_function_scoper(target, identifier):
+    if identifier in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh",
+            "tanh", "conj", "real", "imag", "abs"]:
+        return PyOpenCLCallable(name=identifier)
 
     return None
 
+# }}}
+
 
 # {{{ preamble generator
 
@@ -344,8 +397,8 @@ class PyOpenCLTarget(OpenCLTarget):
             kernel = adjust_local_temp_var_storage(kernel, self.device)
         return kernel
 
-    def pre_codegen_check(self, kernel):
-        check_sizes(kernel, self.device)
+    def pre_codegen_check(self, kernel, program_callables_info):
+        check_sizes(kernel, program_callables_info, self.device)
 
     def get_host_ast_builder(self):
         return PyOpenCLPythonASTBuilder(self)
@@ -739,19 +792,15 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder):
 
     # {{{ library
 
-    def function_manglers(self):
-        from loopy.library.random123 import random123_function_mangler
+    def function_scopers(self):
+        from loopy.library.random123 import random123_function_scoper
         return (
-                super(PyOpenCLCASTBuilder, self).function_manglers() + [
-                    pyopencl_function_mangler,
-                    random123_function_mangler
-                    ])
+                [pyopencl_function_scoper, random123_function_scoper] + super(
+                    PyOpenCLCASTBuilder, self).function_scopers())
 
     def preamble_generators(self):
-        from loopy.library.random123 import random123_preamble_generator
         return ([
             pyopencl_preamble_generator,
-            random123_preamble_generator,
             ] + super(PyOpenCLCASTBuilder, self).preamble_generators())
 
     # }}}
diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py
index 27be6198..380ab1d9 100644
--- a/loopy/target/pyopencl_execution.py
+++ b/loopy/target/pyopencl_execution.py
@@ -151,9 +151,9 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
 
     # {{{ generate invocation
 
-    def generate_invocation(self, gen, kernel_name, args,
-            kernel, implemented_data_info):
-        if kernel.options.cl_exec_manage_array_events:
+    def generate_invocation(self, gen, program_name, args,
+            program, implemented_data_info):
+        if program.root_kernel.options.cl_exec_manage_array_events:
             gen("""
                 if wait_for is None:
                     wait_for = []
@@ -169,20 +169,21 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
 
             gen("")
 
-        gen("_lpy_evt = {kernel_name}({args})"
+        gen("_lpy_evt = {program_name}({args})"
         .format(
-            kernel_name=kernel_name,
+            program_name=program_name,
             args=", ".join(
                 ["_lpy_cl_kernels", "queue"]
                 + args
                 + ["wait_for=wait_for"])))
 
-        if kernel.options.cl_exec_manage_array_events:
+        if program.root_kernel.options.cl_exec_manage_array_events:
             gen("")
             from loopy.kernel.data import ArrayArg
             for arg in implemented_data_info:
                 if (issubclass(arg.arg_class, ArrayArg)
-                        and arg.base_name in kernel.get_written_variables()):
+                        and arg.base_name in (
+                            program.root_kernel.get_written_variables())):
                     gen("{arg_name}.add_event(_lpy_evt)".format(arg_name=arg.name))
 
     # }}}
@@ -190,7 +191,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
     # {{{
 
     def generate_output_handler(
-            self, gen, options, kernel, implemented_data_info):
+            self, gen, options, program, implemented_data_info):
 
         from loopy.kernel.data import KernelArgument
 
@@ -207,7 +208,8 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
                     if not issubclass(arg.arg_class, KernelArgument):
                         continue
 
-                    is_written = arg.base_name in kernel.get_written_variables()
+                    is_written = arg.base_name in (
+                            program.root_kernel.get_written_variables())
                     if is_written:
                         gen("%s = %s.get(queue=queue)" % (arg.name, arg.name))
 
@@ -218,12 +220,13 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase):
                     % ", ".join("\"%s\": %s" % (arg.name, arg.name)
                         for arg in implemented_data_info
                         if issubclass(arg.arg_class, KernelArgument)
-                        if arg.base_name in kernel.get_written_variables()))
+                        if arg.base_name in
+                        program.root_kernel.get_written_variables()))
         else:
             out_args = [arg
                     for arg in implemented_data_info
                         if issubclass(arg.arg_class, KernelArgument)
-                    if arg.base_name in kernel.get_written_variables()]
+                    if arg.base_name in program.root_kernel.get_written_variables()]
             if out_args:
                 gen("return _lpy_evt, (%s,)"
                         % ", ".join(arg.name for arg in out_args))
@@ -252,7 +255,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
     .. automethod:: __call__
     """
 
-    def __init__(self, context, kernel):
+    def __init__(self, context, program):
         """
         :arg context: a :class:`pyopencl.Context`
         :arg kernel: may be a loopy.LoopKernel, a generator returning kernels
@@ -261,40 +264,40 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
             specific arguments.
         """
 
-        super(PyOpenCLKernelExecutor, self).__init__(kernel)
+        super(PyOpenCLKernelExecutor, self).__init__(program)
 
         self.context = context
 
         from loopy.target.pyopencl import PyOpenCLTarget
-        if isinstance(kernel.target, PyOpenCLTarget):
-            self.kernel = kernel.copy(target=PyOpenCLTarget(context.devices[0]))
+        if isinstance(program.target, PyOpenCLTarget):
+            self.program = program.copy(target=PyOpenCLTarget(context.devices[0]))
 
     def get_invoker_uncached(self, kernel, codegen_result):
         generator = PyOpenCLExecutionWrapperGenerator()
         return generator(kernel, codegen_result)
 
     @memoize_method
-    def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None):
-        kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set)
+    def program_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None):
+        program = self.get_typed_and_scheduled_program(arg_to_dtype_set)
 
         from loopy.codegen import generate_code_v2
         from loopy.target.execution import get_highlighted_code
-        codegen_result = generate_code_v2(kernel)
+        codegen_result = generate_code_v2(program)
 
         dev_code = codegen_result.device_code()
 
-        if self.kernel.options.write_cl:
+        if self.program.root_kernel.options.write_cl:
             output = dev_code
-            if self.kernel.options.highlight_cl:
+            if self.program.root_kernel.options.highlight_cl:
                 output = get_highlighted_code(output)
 
-            if self.kernel.options.write_cl is True:
+            if self.program.root_kernel.options.write_cl is True:
                 print(output)
             else:
-                with open(self.kernel.options.write_cl, "w") as outf:
+                with open(self.program.root_kernel.options.write_cl, "w") as outf:
                     outf.write(output)
 
-        if self.kernel.options.edit_cl:
+        if self.program.root_kernel.options.edit_cl:
             from pytools import invoke_editor
             dev_code = invoke_editor(dev_code, "code.cl")
 
@@ -302,17 +305,17 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
 
         cl_program = (
                 cl.Program(self.context, dev_code)
-                .build(options=kernel.options.cl_build_options))
+                .build(options=program.root_kernel.options.cl_build_options))
 
         cl_kernels = _Kernels()
         for dp in codegen_result.device_programs:
             setattr(cl_kernels, dp.name, getattr(cl_program, dp.name))
 
         return _KernelInfo(
-                kernel=kernel,
+                program=program,
                 cl_kernels=cl_kernels,
                 implemented_data_info=codegen_result.implemented_data_info,
-                invoker=self.get_invoker(kernel, codegen_result))
+                invoker=self.get_invoker(program, codegen_result))
 
     def __call__(self, queue, **kwargs):
         """
@@ -347,10 +350,10 @@ class PyOpenCLKernelExecutor(KernelExecutorBase):
 
         kwargs = self.packing_controller.unpack(kwargs)
 
-        kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs))
+        program_info = self.program_info(self.arg_to_dtype_set(kwargs))
 
-        return kernel_info.invoker(
-                kernel_info.cl_kernels, queue, allocator, wait_for,
+        return program_info.invoker(
+                program_info.cl_kernels, queue, allocator, wait_for,
                 out_host, **kwargs)
 
 # }}}
diff --git a/loopy/target/python.py b/loopy/target/python.py
index ce04986d..cd6e6116 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -44,7 +44,8 @@ class ExpressionToPythonMapper(StringifyMapper):
         self.codegen_state = codegen_state
 
         if type_inf_mapper is None:
-            type_inf_mapper = TypeInferenceMapper(self.kernel)
+            type_inf_mapper = TypeInferenceMapper(self.kernel,
+                    self.codegen_state.program_callables_info)
         self.type_inf_mapper = type_inf_mapper
 
     def handle_unsupported_expression(self, victim, enclosing_prec):
@@ -82,47 +83,37 @@ class ExpressionToPythonMapper(StringifyMapper):
                 expr, enclosing_prec)
 
     def map_call(self, expr, enclosing_prec):
-        from pymbolic.primitives import Variable
         from pymbolic.mapper.stringifier import PREC_NONE
 
-        identifier = expr.function
+        identifier_name = self.codegen_state.program_callables_info[
+                expr.function.name].name
 
-        if identifier.name in ["indexof", "indexof_vec"]:
+        if identifier_name in ["indexof", "indexof_vec"]:
             raise LoopyError(
                     "indexof, indexof_vec not yet supported in Python")
 
-        if isinstance(identifier, Variable):
-            identifier = identifier.name
-
-        par_dtypes = tuple(self.type_inf_mapper(par) for par in expr.parameters)
+        from loopy.kernel.function_interface import ManglerCallable
+        in_knl_callable = self.codegen_state.program_callables_info[
+                expr.function.name]
+        if isinstance(in_knl_callable, ManglerCallable):
+            from loopy.codegen import SeenFunction
+            mangle_result = in_knl_callable.mangle_result(self.kernel)
+            self.codegen_state.seen_functions.add(
+                    SeenFunction(identifier_name,
+                        mangle_result.target_name,
+                        mangle_result.arg_dtypes))
 
         str_parameters = None
+        number_of_assignees = len([key for key in
+            in_knl_callable.arg_id_to_dtype.keys() if key < 0])
 
-        mangle_result = self.kernel.mangle_function(
-                identifier, par_dtypes,
-                ast_builder=self.codegen_state.ast_builder)
-
-        if mangle_result is None:
-            raise RuntimeError("function '%s' unknown--"
-                    "maybe you need to register a function mangler?"
-                    % identifier)
-
-        if len(mangle_result.result_dtypes) != 1:
+        if number_of_assignees != 1:
             raise LoopyError("functions with more or fewer than one return value "
                     "may not be used in an expression")
 
-        str_parameters = [
-                self.rec(par, PREC_NONE)
-                for par, par_dtype, tgt_dtype in zip(
-                    expr.parameters, par_dtypes, mangle_result.arg_dtypes)]
+        str_parameters = [self.rec(par, PREC_NONE) for par in expr.parameters]
 
-        from loopy.codegen import SeenFunction
-        self.codegen_state.seen_functions.add(
-                SeenFunction(identifier,
-                    mangle_result.target_name,
-                    mangle_result.arg_dtypes or par_dtypes))
-
-        return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters))
+        return "%s(%s)" % (in_knl_callable.name_in_target, ", ".join(str_parameters))
 
     def map_group_hw_index(self, expr, enclosing_prec):
         raise LoopyError("plain Python does not have group hw axes")
@@ -189,11 +180,11 @@ class PythonASTBuilderBase(ASTBuilderBase):
 
     # {{{ code generation guts
 
-    def function_manglers(self):
+    def function_scopers(self):
+        from loopy.target.c import scope_c_math_functions
         return (
-                super(PythonASTBuilderBase, self).function_manglers() + [
-                    _numpy_single_arg_function_mangler,
-                    ])
+                super(PythonASTBuilderBase, self).function_scopers() +
+                [scope_c_math_functions])
 
     def preamble_generators(self):
         return (
diff --git a/loopy/tools.py b/loopy/tools.py
index 8c5d3639..b243a794 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -73,7 +73,8 @@ class LoopyKeyBuilder(KeyBuilderBase):
 
     def update_for_dict(self, key_hash, key):
         # Order matters for the hash--insert in sorted order.
-        for dict_key in sorted(six.iterkeys(key)):
+        for dict_key in sorted(six.iterkeys(key), key=lambda obj:
+                type(obj).__name__ + str(obj)):
             self.rec(key_hash, (dict_key, key[dict_key]))
 
     update_for_defaultdict = update_for_dict
diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py
index cfbbd56e..38bb2185 100644
--- a/loopy/transform/add_barrier.py
+++ b/loopy/transform/add_barrier.py
@@ -26,6 +26,8 @@ THE SOFTWARE.
 from loopy.kernel.instruction import BarrierInstruction
 from loopy.match import parse_match
 from loopy.transform.instruction import add_dependency
+from loopy.program import iterate_over_kernels_if_given_program
+from loopy.kernel import LoopKernel
 
 __doc__ = """
 .. currentmodule:: loopy
@@ -36,8 +38,10 @@ __doc__ = """
 
 # {{{ add_barrier
 
-def add_barrier(knl, insn_before="", insn_after="", id_based_on=None,
-                tags=None, synchronization_kind="global", mem_kind=None):
+@iterate_over_kernels_if_given_program
+def add_barrier(knl, insn_before="", insn_after="",
+        id_based_on=None, tags=None, synchronization_kind="global",
+        mem_kind=None):
     """Takes in a kernel that needs to be added a barrier and returns a kernel
     which has a barrier inserted into it. It takes input of 2 instructions and
     then adds a barrier in between those 2 instructions. The expressions can
@@ -55,6 +59,8 @@ def add_barrier(knl, insn_before="", insn_after="", id_based_on=None,
     for "global" bariers.  If not supplied, defaults to :arg:`synchronization_kind`
     """
 
+    assert isinstance(knl, LoopKernel)
+
     if mem_kind is None:
         mem_kind = synchronization_kind
 
@@ -76,7 +82,7 @@ def add_barrier(knl, insn_before="", insn_after="", id_based_on=None,
                                         mem_kind=mem_kind)
 
     new_knl = knl.copy(instructions=knl.instructions + [barrier_to_add])
-    new_knl = add_dependency(kernel=new_knl,
+    new_knl = add_dependency(new_knl,
                              insn_match=insn_after,
                              depends_on="id:"+id)
 
diff --git a/loopy/transform/arithmetic.py b/loopy/transform/arithmetic.py
index b7f47c38..3df86e7a 100644
--- a/loopy/transform/arithmetic.py
+++ b/loopy/transform/arithmetic.py
@@ -27,9 +27,13 @@ import six
 
 from loopy.diagnostic import LoopyError
 
+from loopy.program import iterate_over_kernels_if_given_program
+from loopy.kernel import LoopKernel
+
 
 # {{{ fold constants
 
+@iterate_over_kernels_if_given_program
 def fold_constants(kernel):
     from loopy.symbolic import ConstantFoldingMapper
     cfm = ConstantFoldingMapper()
@@ -53,7 +57,9 @@ def fold_constants(kernel):
 # {{{ collect_common_factors_on_increment
 
 # thus far undocumented
+@iterate_over_kernels_if_given_program
 def collect_common_factors_on_increment(kernel, var_name, vary_by_axes=()):
+    assert isinstance(kernel, LoopKernel)
     # FIXME: Does not understand subst rules for now
     if kernel.substitutions:
         from loopy.transform.subst import expand_subst
diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py
index f0b9814c..97054700 100644
--- a/loopy/transform/batch.py
+++ b/loopy/transform/batch.py
@@ -29,6 +29,9 @@ from loopy.symbolic import (RuleAwareIdentityMapper, SubstitutionRuleMappingCont
 from loopy.kernel.data import ValueArg, ArrayArg
 import islpy as isl
 
+from loopy.program import iterate_over_kernels_if_given_program
+
+
 __doc__ = """
 .. currentmodule:: loopy
 
@@ -102,8 +105,9 @@ def _add_unique_dim_name(name, dim_names):
     return (ng(name),) + tuple(dim_names)
 
 
-def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch",
-        sequential=False):
+@iterate_over_kernels_if_given_program
+def to_batched(knl, nbatches, batch_varying_args,
+        batch_iname_prefix="ibatch", sequential=False):
     """Takes in a kernel that carries out an operation and returns a kernel
     that carries out a batch of these operations.
     .. note::
diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py
index 801da4c1..57c4397f 100644
--- a/loopy/transform/buffer.py
+++ b/loopy/transform/buffer.py
@@ -33,6 +33,9 @@ from pytools.persistent_dict import WriteOncePersistentDict
 from loopy.tools import LoopyKeyBuilder, PymbolicExpressionHashWrapper
 from loopy.version import DATA_MODEL_VERSION
 from loopy.diagnostic import LoopyError
+from loopy.program import Program
+from loopy.kernel import LoopKernel
+from loopy.kernel.function_interface import ScalarCallable, CallableKernel
 
 from pymbolic import var
 
@@ -130,10 +133,10 @@ buffer_array_cache = WriteOncePersistentDict(
 
 
 # Adding an argument? also add something to the cache_key below.
-def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
-        store_expression=None, within=None, default_tag="l.auto",
-        temporary_scope=None, temporary_is_local=None,
-        fetch_bounding_box=False):
+def buffer_array_for_single_kernel(kernel, program_callables_info, var_name,
+        buffer_inames, init_expression=None, store_expression=None,
+        within=None, default_tag="l.auto", temporary_scope=None,
+        temporary_is_local=None, fetch_bounding_box=False):
     """Replace accesses to *var_name* with ones to a temporary, which is
     created and acts as a buffer. To perform this transformation, the access
     footprint to *var_name* is determined and a temporary of a suitable
@@ -169,6 +172,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
         fetched.
     """
 
+    assert isinstance(kernel, LoopKernel)
+
     # {{{ unify temporary_scope / temporary_is_local
 
     from loopy.kernel.data import AddressSpace
@@ -240,7 +245,8 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
 
     from loopy.preprocess import prepare_for_caching
     key_kernel = prepare_for_caching(kernel)
-    cache_key = (key_kernel, var_name, tuple(buffer_inames),
+    cache_key = (key_kernel, var_name,
+            tuple(buffer_inames),
             PymbolicExpressionHashWrapper(init_expression),
             PymbolicExpressionHashWrapper(store_expression), within,
             default_tag, temporary_scope, fetch_bounding_box)
@@ -528,7 +534,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
     kernel = tag_inames(kernel, new_iname_to_tag)
 
     from loopy.kernel.tools import assign_automatic_axes
-    kernel = assign_automatic_axes(kernel)
+    kernel = assign_automatic_axes(kernel, program_callables_info)
 
     if CACHING_ENABLED:
         from loopy.preprocess import prepare_for_caching
@@ -537,4 +543,29 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
 
     return kernel
 
+
+def buffer_array(program, *args, **kwargs):
+    assert isinstance(program, Program)
+
+    new_resolved_functions = {}
+    for func_id, in_knl_callable in program.program_callables_info.items():
+        if isinstance(in_knl_callable, CallableKernel):
+            new_subkernel = buffer_array_for_single_kernel(
+                    in_knl_callable.subkernel, program.program_callables_info,
+                    *args, **kwargs)
+            in_knl_callable = in_knl_callable.copy(
+                    subkernel=new_subkernel)
+
+        elif isinstance(in_knl_callable, ScalarCallable):
+            pass
+        else:
+            raise NotImplementedError("Unknown type of callable %s." % (
+                type(in_knl_callable).__name__))
+
+        new_resolved_functions[func_id] = in_knl_callable
+
+    new_program_callables_info = program.program_callables_info.copy(
+            resolved_functions=new_resolved_functions)
+    return program.copy(program_callables_info=new_program_callables_info)
+
 # vim: foldmethod=marker
diff --git a/loopy/transform/data.py b/loopy/transform/data.py
index 5b1ee6cc..5f4f2f2a 100644
--- a/loopy/transform/data.py
+++ b/loopy/transform/data.py
@@ -30,6 +30,9 @@ from islpy import dim_type
 from loopy.kernel.data import ImageArg
 
 from pytools import MovedFunctionDeprecationWrapper
+from loopy.program import Program, iterate_over_kernels_if_given_program
+from loopy.kernel import LoopKernel
+from loopy.kernel.function_interface import CallableKernel, ScalarCallable
 
 
 # {{{ convenience: add_prefetch
@@ -140,7 +143,8 @@ class _not_provided:  # noqa: N801
     pass
 
 
-def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
+def add_prefetch_for_single_kernel(kernel, program_callables_info, var_name,
+        sweep_inames=[], dim_arg_names=None,
 
         # "None" is a valid value here, distinct from the default.
         default_tag=_not_provided,
@@ -239,6 +243,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
 
     This function internally uses :func:`extract_subst` and :func:`precompute`.
     """
+    assert isinstance(kernel, LoopKernel)
 
     # {{{ fish indexing out of var_name and into footprint_subscripts
 
@@ -328,9 +333,9 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     # precompute module, but precompute acutally uses that to adjust its
     # warning message.
 
-    from loopy.transform.precompute import precompute
-    new_kernel = precompute(kernel, subst_use, sweep_inames,
-            precompute_inames=dim_arg_names,
+    from loopy.transform.precompute import precompute_for_single_kernel
+    new_kernel = precompute_for_single_kernel(kernel, program_callables_info,
+            subst_use, sweep_inames, precompute_inames=dim_arg_names,
             default_tag=default_tag, dtype=arg.dtype,
             fetch_bounding_box=fetch_bounding_box,
             temporary_name=temporary_name,
@@ -363,6 +368,31 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None,
     else:
         return new_kernel
 
+
+def add_prefetch(program, *args, **kwargs):
+    assert isinstance(program, Program)
+
+    new_resolved_functions = {}
+    for func_id, in_knl_callable in program.program_callables_info.items():
+        if isinstance(in_knl_callable, CallableKernel):
+            new_subkernel = add_prefetch_for_single_kernel(
+                    in_knl_callable.subkernel, program.program_callables_info,
+                    *args, **kwargs)
+            in_knl_callable = in_knl_callable.copy(
+                    subkernel=new_subkernel)
+
+        elif isinstance(in_knl_callable, ScalarCallable):
+            pass
+        else:
+            raise NotImplementedError("Unknown type of callable %s." % (
+                type(in_knl_callable).__name__))
+
+        new_resolved_functions[func_id] = in_knl_callable
+
+    new_program_callables_info = program.program_callables_info.copy(
+            resolved_functions=new_resolved_functions)
+    return program.copy(program_callables_info=new_program_callables_info)
+
 # }}}
 
 
@@ -385,6 +415,7 @@ def change_arg_to_image(knl, name):
 
 # {{{ tag array axes
 
+@iterate_over_kernels_if_given_program
 def tag_array_axes(knl, ary_names, dim_tags):
     """
     .. versionchanged:: 2016.2
@@ -414,13 +445,15 @@ def tag_array_axes(knl, ary_names, dim_tags):
     return knl
 
 
-tag_data_axes = MovedFunctionDeprecationWrapper(tag_array_axes)
+tag_data_axes = (
+        MovedFunctionDeprecationWrapper(tag_array_axes))
 
 # }}}
 
 
 # {{{ set_array_axis_names
 
+@iterate_over_kernels_if_given_program
 def set_array_axis_names(kernel, ary_names, dim_names):
     """
     .. versionchanged:: 2016.2
@@ -445,13 +478,15 @@ def set_array_axis_names(kernel, ary_names, dim_names):
     return kernel
 
 
-set_array_dim_names = MovedFunctionDeprecationWrapper(set_array_axis_names)
+set_array_dim_names = (MovedFunctionDeprecationWrapper(
+    set_array_axis_names))
 
 # }}}
 
 
 # {{{ remove_unused_arguments
 
+@iterate_over_kernels_if_given_program
 def remove_unused_arguments(knl):
     new_args = []
 
@@ -493,6 +528,7 @@ def remove_unused_arguments(knl):
 
 # {{{ alias_temporaries
 
+@iterate_over_kernels_if_given_program
 def alias_temporaries(knl, names, base_name_prefix=None,
         synchronize_for_exclusive_use=True):
     """Sets all temporaries given by *names* to be backed by a single piece of
@@ -577,11 +613,14 @@ def alias_temporaries(knl, names, base_name_prefix=None,
 
 # {{{ set argument order
 
+@iterate_over_kernels_if_given_program
 def set_argument_order(kernel, arg_names):
     """
     :arg arg_names: A list (or comma-separated string) or argument
         names. All arguments must be in this list.
     """
+    # FIXME: @inducer -- should this only affect the root kernel, or should it
+    # take a within?
 
     if isinstance(arg_names, str):
         arg_names = arg_names.split(",")
@@ -610,6 +649,7 @@ def set_argument_order(kernel, arg_names):
 
 # {{{ rename argument
 
+@iterate_over_kernels_if_given_program
 def rename_argument(kernel, old_name, new_name, existing_ok=False):
     """
     .. versionadded:: 2016.2
@@ -655,6 +695,7 @@ def rename_argument(kernel, old_name, new_name, existing_ok=False):
 
 # {{{ set temporary scope
 
+@iterate_over_kernels_if_given_program
 def set_temporary_scope(kernel, temp_var_names, scope):
     """
     :arg temp_var_names: a container with membership checking,
@@ -696,6 +737,7 @@ def set_temporary_scope(kernel, temp_var_names, scope):
 
 # {{{ reduction_arg_to_subst_rule
 
+@iterate_over_kernels_if_given_program
 def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=None):
     if isinstance(inames, str):
         inames = [s.strip() for s in inames.split(",")]
diff --git a/loopy/transform/diff.py b/loopy/transform/diff.py
index d4dcb370..54d06605 100644
--- a/loopy/transform/diff.py
+++ b/loopy/transform/diff.py
@@ -33,6 +33,7 @@ import loopy as lp
 from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext
 from loopy.isl_helpers import make_slab
 from loopy.diagnostic import LoopyError
+from loopy.kernel import LoopKernel
 
 
 # {{{ diff mapper
@@ -370,6 +371,8 @@ def diff_kernel(knl, diff_outputs, by, diff_iname_prefix="diff_i",
         *diff_context.by_name*, or *None* if no dependency exists.
     """
 
+    assert isinstance(knl, LoopKernel)
+
     from loopy.kernel.creation import apply_single_writer_depencency_heuristic
     knl = apply_single_writer_depencency_heuristic(knl, warn_if_used=True)
 
diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py
index 49e30a75..d43ce025 100644
--- a/loopy/transform/fusion.py
+++ b/loopy/transform/fusion.py
@@ -31,6 +31,10 @@ from islpy import dim_type
 from loopy.diagnostic import LoopyError
 from pymbolic import var
 
+from loopy.kernel import LoopKernel
+from loopy.kernel.function_interface import CallableKernel
+from loopy.program import rename_resolved_functions_in_a_single_kernel
+
 
 def _apply_renames_in_exprs(kernel, var_renames):
     from loopy.symbolic import (
@@ -287,7 +291,7 @@ def _fuse_two_kernels(knla, knlb):
 # }}}
 
 
-def fuse_kernels(kernels, suffixes=None, data_flow=None):
+def fuse_loop_kernels(kernels, suffixes=None, data_flow=None):
     """Return a kernel that performs all the operations in all entries
     of *kernels*.
 
@@ -331,6 +335,8 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None):
 
         *data_flow* was added in version 2016.2
     """
+
+    assert all(isinstance(knl, LoopKernel) for knl in kernels)
     kernels = list(kernels)
 
     if data_flow is None:
@@ -411,4 +417,52 @@ def fuse_kernels(kernels, suffixes=None, data_flow=None):
 
     return result
 
+
+def fuse_kernels(programs, suffixes=None, data_flow=None):
+    main_prog_callables_info = (
+            programs[0].program_callables_info.with_edit_callables_mode())
+    old_root_kernel_callable = (
+            programs[0].program_callables_info[programs[0].name])
+    kernels = [programs[0].root_kernel]
+
+    # removing the callable collisions that may be present
+    for prog in programs[1:]:
+        root_kernel = prog.root_kernel
+        renames_needed = {}
+        for old_func_id, in_knl_callable in prog.program_callables_info.items():
+            if isinstance(in_knl_callable, CallableKernel):
+                if in_knl_callable.name != prog.name:
+                    raise LoopyError("fuse_kernels cannot fuse programs with "
+                            "multiple callable kernels.")
+                continue
+            num_times_called = (
+                    prog.program_callables_info.num_times_callables_called[
+                        old_func_id])
+            for i in range(num_times_called):
+                main_prog_callables_info, new_func_id = (
+                        main_prog_callables_info.with_callables(var(old_func_id),
+                            in_knl_callable, True))
+
+            if old_func_id != new_func_id:
+                renames_needed[old_func_id] = new_func_id
+
+        if renames_needed:
+            root_kernel = rename_resolved_functions_in_a_single_kernel(
+                    root_kernel, renames_needed)
+
+        kernels.append(root_kernel)
+
+    new_root_kernel = fuse_loop_kernels(kernels, suffixes, data_flow)
+    new_root_kernel_callable = old_root_kernel_callable.copy(
+            subkernel=new_root_kernel.copy(name=programs[0].name))
+
+    main_prog_callables_info, _ = main_prog_callables_info.with_callable(
+            var(programs[0].name), new_root_kernel_callable)
+
+    main_prog_callables_info = (
+            main_prog_callables_info.with_exit_edit_callables_mode())
+
+    return programs[0].copy(
+            program_callables_info=main_prog_callables_info)
+
 # vim: foldmethod=marker
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index 2b618a46..93f6c53e 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -34,6 +34,10 @@ from loopy.symbolic import (
         SubstitutionRuleMappingContext)
 from loopy.diagnostic import LoopyError
 
+from loopy.program import iterate_over_kernels_if_given_program
+from loopy.kernel import LoopKernel
+from loopy.kernel.function_interface import CallableKernel, ScalarCallable
+
 
 __doc__ = """
 .. currentmodule:: loopy
@@ -93,6 +97,7 @@ def set_loop_priority(kernel, loop_priority):
     return kernel.copy(loop_priority=frozenset([loop_priority]))
 
 
+@iterate_over_kernels_if_given_program
 def prioritize_loops(kernel, loop_priority):
     """Indicates the textual order in which loops should be entered in the
     kernel code. Note that this priority has an advisory role only. If the
@@ -107,6 +112,8 @@ def prioritize_loops(kernel, loop_priority):
     :arg: an iterable of inames, or, for brevity, a comma-separated string of
         inames
     """
+
+    assert isinstance(kernel, LoopKernel)
     if isinstance(loop_priority, str):
         loop_priority = tuple(s.strip()
                               for s in loop_priority.split(",") if s.strip())
@@ -299,13 +306,15 @@ def _split_iname_backend(kernel, split_iname,
         kernel = tag_inames(kernel,
                 {outer_iname: existing_tag, inner_iname: existing_tag})
 
-    return tag_inames(kernel, {outer_iname: outer_tag, inner_iname: inner_tag})
+    return tag_inames(kernel, {outer_iname: outer_tag,
+        inner_iname: inner_tag})
 
 # }}}
 
 
 # {{{ split iname
 
+@iterate_over_kernels_if_given_program
 def split_iname(kernel, split_iname, inner_length,
         outer_iname=None, inner_iname=None,
         outer_tag=None, inner_tag=None,
@@ -331,6 +340,8 @@ def split_iname(kernel, split_iname, inner_length,
     :arg within: a stack match as understood by
         :func:`loopy.match.parse_stack_match`.
     """
+    assert isinstance(kernel, LoopKernel)
+
     def make_new_loop_index(inner, outer):
         return inner + outer*inner_length
 
@@ -347,6 +358,7 @@ def split_iname(kernel, split_iname, inner_length,
 
 # {{{ chunk iname
 
+@iterate_over_kernels_if_given_program
 def chunk_iname(kernel, split_iname, num_chunks,
         outer_iname=None, inner_iname=None,
         outer_tag=None, inner_tag=None,
@@ -481,6 +493,7 @@ class _InameJoiner(RuleAwareSubstitutionMapper):
             return super(_InameJoiner, self).map_reduction(expr, expn_state)
 
 
+@iterate_over_kernels_if_given_program
 def join_inames(kernel, inames, new_iname=None, tag=None, within=None):
     """
     :arg inames: fastest varying last
@@ -625,7 +638,9 @@ def untag_inames(kernel, iname_to_untag, tag_type):
 
 # {{{ tag inames
 
-def tag_inames(kernel, iname_to_tag, force=False, ignore_nonexistent=False):
+@iterate_over_kernels_if_given_program
+def tag_inames(kernel, iname_to_tag, force=False,
+        ignore_nonexistent=False):
     """Tag an iname
 
     :arg iname_to_tag: a list of tuples ``(iname, new_tag)``. *new_tag* is given
@@ -804,7 +819,9 @@ class _InameDuplicator(RuleAwareIdentityMapper):
         return insn.copy(within_inames=new_fid)
 
 
-def duplicate_inames(knl, inames, within, new_inames=None, suffix=None,
+@iterate_over_kernels_if_given_program
+def duplicate_inames(knl, inames, within, new_inames=None,
+        suffix=None,
         tags={}):
     """
     :arg within: a stack match as understood by
@@ -966,7 +983,7 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset(
     # If partitioning was empty, we have recursed successfully and yield nothing
 
 
-def get_iname_duplication_options(knl, use_boostable_into=False):
+def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=False):
     """List options for duplication of inames, if necessary for schedulability
 
     :returns: a generator listing all options to duplicate inames, if duplication
@@ -1032,7 +1049,7 @@ def get_iname_duplication_options(knl, use_boostable_into=False):
         # If we find a duplication option and to not use boostable_into
         # information, we restart this generator with use_boostable_into=True
         if not use_boostable_into and not knl.options.ignore_boostable_into:
-            for option in get_iname_duplication_options(knl, True):
+            for option in get_iname_duplication_options_for_single_kernel(knl, True):
                 yield option
 
             # Emit a warning that we needed boostable_into
@@ -1060,18 +1077,42 @@ def get_iname_duplication_options(knl, use_boostable_into=False):
             yield iname, within
 
 
-def has_schedulable_iname_nesting(knl):
+def get_iname_duplication_options(program, use_boostable_into=False):
+    for in_knl_callable in program.program_callables_info.values():
+        if isinstance(in_knl_callable, CallableKernel):
+            for option in get_iname_duplication_options_for_single_kernel(
+                    in_knl_callable.subkernel, use_boostable_into):
+                yield option
+        elif isinstance(in_knl_callable, ScalarCallable):
+            pass
+        else:
+            raise NotImplementedError("Unknown type of in kernel callable %s."
+                    % (type(in_knl_callable)))
+
+    return
+
+
+def has_schedulable_iname_nesting_for_single_kernel(knl):
     """
     :returns: a :class:`bool` indicating whether this kernel needs
         an iname duplication in order to be schedulable.
     """
-    return not bool(next(get_iname_duplication_options(knl), False))
+    return not bool(next(get_iname_duplication_options_for_single_kernel(knl),
+        False))
+
+
+def has_schedulable_iname_nesting(program):
+    return all(has_schedulable_iname_nesting_for_single_kernel(
+        in_knl_callable.subkernel) for in_knl_callable in
+        program.program_callables_info.values() if isinstance(in_knl_callable,
+            CallableKernel))
 
 # }}}
 
 
 # {{{ rename_inames
 
+@iterate_over_kernels_if_given_program
 def rename_iname(knl, old_iname, new_iname, existing_ok=False, within=None):
     """
     :arg within: a stack match as understood by
@@ -1278,6 +1319,7 @@ def _split_reduction(kernel, inames, direction, within=None):
             rsplit.map_kernel(kernel))
 
 
+@iterate_over_kernels_if_given_program
 def split_reduction_inward(kernel, inames, within=None):
     """Takes a reduction of the form::
 
@@ -1297,6 +1339,7 @@ def split_reduction_inward(kernel, inames, within=None):
     return _split_reduction(kernel, inames, "in", within)
 
 
+@iterate_over_kernels_if_given_program
 def split_reduction_outward(kernel, inames, within=None):
     """Takes a reduction of the form::
 
@@ -1320,6 +1363,7 @@ def split_reduction_outward(kernel, inames, within=None):
 
 # {{{ affine map inames
 
+@iterate_over_kernels_if_given_program
 def affine_map_inames(kernel, old_inames, new_inames, equations):
     """Return a new *kernel* where the affine transform
     specified by *equations* has been applied to the inames.
@@ -1651,6 +1695,7 @@ class _ReductionInameUniquifier(RuleAwareIdentityMapper):
                     expr, expn_state)
 
 
+@iterate_over_kernels_if_given_program
 def make_reduction_inames_unique(kernel, inames=None, within=None):
     """
     :arg inames: if not *None*, only apply to these inames
@@ -1697,6 +1742,7 @@ def make_reduction_inames_unique(kernel, inames=None, within=None):
 
 # {{{ add_inames_to_insn
 
+@iterate_over_kernels_if_given_program
 def add_inames_to_insn(knl, inames, insn_match):
     """
     :arg inames: a frozenset of inames that will be added to the
diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index e6ecb409..93cf932b 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -25,15 +25,35 @@ THE SOFTWARE.
 import six  # noqa
 
 from loopy.diagnostic import LoopyError
+from loopy.kernel import LoopKernel
+from loopy.kernel.function_interface import (ScalarCallable, CallableKernel)
+from loopy.program import Program, iterate_over_kernels_if_given_program
 
 
 # {{{ find_instructions
 
-def find_instructions(kernel, insn_match):
+def find_instructions_in_single_kernel(kernel, insn_match):
+    assert isinstance(kernel, LoopKernel)
     from loopy.match import parse_match
     match = parse_match(insn_match)
     return [insn for insn in kernel.instructions if match(kernel, insn)]
 
+
+def find_instructions(program, insn_match):
+    assert isinstance(program, Program)
+    insns = []
+    for in_knl_callable in program.program_callables_info.values():
+        if isinstance(in_knl_callable, CallableKernel):
+            insns += (find_instructions_in_single_kernel(
+                in_knl_callable.subkernel, insn_match))
+        elif isinstance(in_knl_callable, ScalarCallable):
+            pass
+        else:
+            raise NotImplementedError("Unknown callable type %s." % (
+                type(in_knl_callable)))
+
+    return insns
+
 # }}}
 
 
@@ -58,6 +78,7 @@ def map_instructions(kernel, insn_match, f):
 
 # {{{ set_instruction_priority
 
+@iterate_over_kernels_if_given_program
 def set_instruction_priority(kernel, insn_match, priority):
     """Set the priority of instructions matching *insn_match* to *priority*.
 
@@ -75,6 +96,7 @@ def set_instruction_priority(kernel, insn_match, priority):
 
 # {{{ add_dependency
 
+@iterate_over_kernels_if_given_program
 def add_dependency(kernel, insn_match, depends_on):
     """Add the instruction dependency *dependency* to the instructions matched
     by *insn_match*.
@@ -92,7 +114,8 @@ def add_dependency(kernel, insn_match, depends_on):
         added_deps = frozenset([depends_on])
     else:
         added_deps = frozenset(
-                dep.id for dep in find_instructions(kernel, depends_on))
+                dep.id for dep in find_instructions_in_single_kernel(kernel,
+                    depends_on))
 
     if not added_deps:
         raise LoopyError("no instructions found matching '%s' "
@@ -209,6 +232,7 @@ def replace_instruction_ids(kernel, replacements):
 
 # {{{ tag_instructions
 
+@iterate_over_kernels_if_given_program
 def tag_instructions(kernel, new_tag, within=None):
     from loopy.match import parse_match
     within = parse_match(within)
@@ -228,6 +252,7 @@ def tag_instructions(kernel, new_tag, within=None):
 
 # {{{ add nosync
 
+@iterate_over_kernels_if_given_program
 def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False,
         empty_ok=False):
     """Add a *no_sync_with* directive between *source* and *sink*.
@@ -260,18 +285,21 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False,
         This used to silently pass. This behavior can be restored using
         *empty_ok*.
     """
+    assert isinstance(kernel, LoopKernel)
 
     if isinstance(source, str) and source in kernel.id_to_insn:
         sources = frozenset([source])
     else:
         sources = frozenset(
-                source.id for source in find_instructions(kernel, source))
+                source.id for source in find_instructions_in_single_kernel(
+                    kernel, source))
 
     if isinstance(sink, str) and sink in kernel.id_to_insn:
         sinks = frozenset([sink])
     else:
         sinks = frozenset(
-                sink.id for sink in find_instructions(kernel, sink))
+                sink.id for sink in find_instructions_in_single_kernel(
+                    kernel, sink))
 
     if not sources and not empty_ok:
         raise LoopyError("No match found for source specification '%s'." % source)
@@ -324,6 +352,7 @@ def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False,
 
 # {{{ uniquify_instruction_ids
 
+@iterate_over_kernels_if_given_program
 def uniquify_instruction_ids(kernel):
     """Converts any ids that are :class:`loopy.UniqueName` or *None* into unique
     strings.
diff --git a/loopy/transform/padding.py b/loopy/transform/padding.py
index d695e359..3e5e4a43 100644
--- a/loopy/transform/padding.py
+++ b/loopy/transform/padding.py
@@ -28,6 +28,9 @@ THE SOFTWARE.
 from pytools import MovedFunctionDeprecationWrapper
 from loopy.symbolic import RuleAwareIdentityMapper, SubstitutionRuleMappingContext
 
+from loopy.program import iterate_over_kernels_if_given_program
+from loopy.kernel import LoopKernel
+
 
 class ArrayAxisSplitHelper(RuleAwareIdentityMapper):
     def __init__(self, rule_mapping_context, arg_names, handler):
@@ -44,7 +47,9 @@ class ArrayAxisSplitHelper(RuleAwareIdentityMapper):
 
 # {{{ split_array_dim (deprecated since June 2016)
 
-def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True,
+@iterate_over_kernels_if_given_program
+def split_array_dim(kernel, arrays_and_axes, count,
+        auto_split_inames=True,
         split_kwargs=None):
     """
     :arg arrays_and_axes: a list of tuples *(array, axis_nr)* indicating
@@ -237,7 +242,7 @@ def split_array_dim(kernel, arrays_and_axes, count, auto_split_inames=True,
     kernel = rule_mapping_context.finish_kernel(aash.map_kernel(kernel))
 
     if auto_split_inames:
-        from loopy import split_iname
+        from loopy.transform.iname import split_iname
         for iname, (outer_iname, inner_iname) in six.iteritems(split_vars):
             kernel = split_iname(kernel, iname, count,
                     outer_iname=outer_iname, inner_iname=inner_iname,
@@ -370,7 +375,9 @@ def _split_array_axis_inner(kernel, array_name, axis_nr, count, order="C"):
     return kernel
 
 
-def split_array_axis(kernel, array_names, axis_nr, count, order="C"):
+@iterate_over_kernels_if_given_program
+def split_array_axis(kernel, array_names, axis_nr, count,
+        order="C"):
     """
     :arg array: a list of names of temporary variables or arguments. May
         also be a comma-separated string of these.
@@ -387,6 +394,7 @@ def split_array_axis(kernel, array_names, axis_nr, count, order="C"):
         There was a more complicated, dumber function called :func:`split_array_dim`
         that had the role of this function in versions prior to 2016.2.
     """
+    assert isinstance(kernel, LoopKernel)
 
     if isinstance(array_names, str):
         array_names = [i.strip() for i in array_names.split(",") if i.strip()]
@@ -439,6 +447,7 @@ def find_padding_multiple(kernel, variable, axis, align_bytes, allowed_waste=0.1
 
 # {{{ add_padding
 
+@iterate_over_kernels_if_given_program
 def add_padding(kernel, variable, axis, align_bytes):
     arg_to_idx = dict((arg.name, i) for i, arg in enumerate(kernel.args))
     arg_idx = arg_to_idx[variable]
diff --git a/loopy/transform/parameter.py b/loopy/transform/parameter.py
index fc5dad91..b7d017ec 100644
--- a/loopy/transform/parameter.py
+++ b/loopy/transform/parameter.py
@@ -28,6 +28,9 @@ from loopy.symbolic import (RuleAwareSubstitutionMapper,
         SubstitutionRuleMappingContext)
 import islpy as isl
 
+from loopy.program import iterate_over_kernels_if_given_program
+from loopy.kernel import LoopKernel
+
 __doc__ = """
 
 .. currentmodule:: loopy
@@ -40,6 +43,7 @@ __doc__ = """
 
 # {{{ assume
 
+@iterate_over_kernels_if_given_program
 def assume(kernel, assumptions):
     """Include an assumption about :ref:`domain-parameters` in the kernel, e.g.
     `n mod 4 = 0`.
@@ -134,6 +138,7 @@ def _fix_parameter(kernel, name, value):
                 ))
 
 
+@iterate_over_kernels_if_given_program
 def fix_parameters(kernel, **value_dict):
     """Fix the values of the arguments to specific constants.
 
@@ -141,6 +146,7 @@ def fix_parameters(kernel, **value_dict):
     to be *value*. *name* may refer to :ref:`domain-parameters` or
     :ref:`arguments`.
     """
+    assert isinstance(kernel, LoopKernel)
 
     for name, value in six.iteritems(value_dict):
         kernel = _fix_parameter(kernel, name, value)
diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index 52d56897..66c7114a 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -38,6 +38,9 @@ from pymbolic import var
 from loopy.transform.array_buffer_map import (ArrayToBufferMap, NoOpArrayToBufferMap,
         AccessDescriptor)
 
+from loopy.program import Program
+from loopy.kernel.function_interface import CallableKernel, ScalarCallable
+
 
 class RuleAccessDescriptor(AccessDescriptor):
     __slots__ = ["args", "expansion_stack"]
@@ -258,9 +261,9 @@ class _not_provided(object):  # noqa: N801
     pass
 
 
-def precompute(kernel, subst_use, sweep_inames=[], within=None,
-        storage_axes=None, temporary_name=None, precompute_inames=None,
-        precompute_outer_inames=None,
+def precompute_for_single_kernel(kernel, program_callables_info, subst_use,
+        sweep_inames=[], within=None, storage_axes=None, temporary_name=None,
+        precompute_inames=None, precompute_outer_inames=None,
         storage_axis_to_tag={},
 
         # "None" is a valid value here, distinct from the default.
@@ -1037,15 +1040,40 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None,
 
     # }}}
 
-    from loopy import tag_inames
+    from loopy.transform.iname import tag_inames
     kernel = tag_inames(kernel, new_iname_to_tag)
 
     from loopy.kernel.data import AutoFitLocalIndexTag, filter_iname_tags_by_type
 
     if filter_iname_tags_by_type(new_iname_to_tag.values(), AutoFitLocalIndexTag):
         from loopy.kernel.tools import assign_automatic_axes
-        kernel = assign_automatic_axes(kernel)
+        kernel = assign_automatic_axes(kernel, program_callables_info)
 
     return kernel
 
+
+def precompute(program, *args, **kwargs):
+    assert isinstance(program, Program)
+
+    new_resolved_functions = {}
+    for func_id, in_knl_callable in program.program_callables_info.items():
+        if isinstance(in_knl_callable, CallableKernel):
+            new_subkernel = precompute_for_single_kernel(
+                    in_knl_callable.subkernel, program.program_callables_info,
+                    *args, **kwargs)
+            in_knl_callable = in_knl_callable.copy(
+                    subkernel=new_subkernel)
+
+        elif isinstance(in_knl_callable, ScalarCallable):
+            pass
+        else:
+            raise NotImplementedError("Unknown type of callable %s." % (
+                type(in_knl_callable).__name__))
+
+        new_resolved_functions[func_id] = in_knl_callable
+
+    new_program_callables_info = program.program_callables_info.copy(
+            resolved_functions=new_resolved_functions)
+    return program.copy(program_callables_info=new_program_callables_info)
+
 # vim: foldmethod=marker
diff --git a/loopy/transform/save.py b/loopy/transform/save.py
index cca62bc5..4b957b03 100644
--- a/loopy/transform/save.py
+++ b/loopy/transform/save.py
@@ -64,7 +64,7 @@ class LivenessAnalysis(object):
 
     def __init__(self, kernel):
         self.kernel = kernel
-        self.schedule = self.kernel.schedule
+        self.schedule = kernel.schedule
 
     @memoize_method
     def get_successor_relation(self):
@@ -235,8 +235,9 @@ class TemporarySaver(object):
         def new_shape(self):
             return self.hw_dims + self.non_hw_dims
 
-    def __init__(self, kernel):
+    def __init__(self, kernel, program_callables_info):
         self.kernel = kernel
+        self.program_callables_info = program_callables_info
         self.var_name_gen = kernel.get_var_name_generator()
         self.insn_name_gen = kernel.get_instruction_id_generator()
 
@@ -439,7 +440,8 @@ class TemporarySaver(object):
             return (), ()
 
         group_sizes, local_sizes = (
-            self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids))
+            self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids,
+                self.program_callables_info))
 
         if temporary.address_space == lp.AddressSpace.LOCAL:
             # Elide local axes in the save slot for local temporaries.
@@ -628,7 +630,7 @@ class TemporarySaver(object):
                     kernel = lp.add_nosync(kernel, "global", source, sink)
 
         from loopy.kernel.tools import assign_automatic_axes
-        return assign_automatic_axes(kernel)
+        return assign_automatic_axes(kernel, self.program_callables_info)
 
     def save(self, temporary, subkernel):
         self.save_or_reload_impl(temporary, subkernel, "save")
@@ -722,7 +724,7 @@ class TemporarySaver(object):
 
 # {{{ auto save and reload across kernel calls
 
-def save_and_reload_temporaries(knl):
+def save_and_reload_temporaries(program):
     """
     Add instructions to save and reload temporary variables that are live
     across kernel calls.
@@ -745,8 +747,19 @@ def save_and_reload_temporaries(knl):
 
     :returns: The resulting kernel
     """
+
+    knl = program.root_kernel
+
+    if not knl.schedule:
+        program = lp.preprocess_program(program)
+        from loopy.schedule import get_one_scheduled_kernel
+        knl = get_one_scheduled_kernel(program.root_kernel,
+                program.program_callables_info)
+
+    assert knl.schedule is not None
+
     liveness = LivenessAnalysis(knl)
-    saver = TemporarySaver(knl)
+    saver = TemporarySaver(knl, program.program_callables_info)
 
     from loopy.schedule.tools import (
         temporaries_read_in_subkernel, temporaries_written_in_subkernel)
@@ -784,7 +797,7 @@ def save_and_reload_temporaries(knl):
                         .format(temporary, sched_item.kernel_name))
                 saver.save(temporary, sched_item.kernel_name)
 
-    return saver.finish()
+    return program.with_root_kernel(saver.finish())
 
 # }}}
 
diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py
index a681afe0..afe3fec5 100644
--- a/loopy/transform/subst.py
+++ b/loopy/transform/subst.py
@@ -33,6 +33,9 @@ from pymbolic.mapper.substitutor import make_subst_func
 from pytools import ImmutableRecord
 from pymbolic import var
 
+from loopy.program import iterate_over_kernels_if_given_program
+from loopy.kernel import LoopKernel
+from loopy.kernel.function_interface import CallableKernel, ScalarCallable
 
 import logging
 logger = logging.getLogger(__name__)
@@ -44,6 +47,7 @@ class ExprDescriptor(ImmutableRecord):
 
 # {{{ extract_subst
 
+@iterate_over_kernels_if_given_program
 def extract_subst(kernel, subst_name, template, parameters=()):
     """
     :arg subst_name: The name of the substitution rule to be created.
@@ -285,6 +289,7 @@ class AssignmentToSubstChanger(RuleAwareIdentityMapper):
             return var(subst_name)(*index)
 
 
+@iterate_over_kernels_if_given_program
 def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None,
         force_retain_argument=False):
     """Extract an assignment (to a temporary variable or an argument)
@@ -468,7 +473,9 @@ def assignment_to_subst(kernel, lhs_name, extra_arguments=(), within=None,
 
 # {{{ expand_subst
 
+@iterate_over_kernels_if_given_program
 def expand_subst(kernel, within=None):
+    assert isinstance(kernel, LoopKernel)
     if not kernel.substitutions:
         return kernel
 
@@ -501,8 +508,17 @@ def find_rules_matching(knl, pattern):
     return [r for r in knl.substitutions if pattern.match(r)]
 
 
-def find_one_rule_matching(knl, pattern):
-    rules = find_rules_matching(knl, pattern)
+def find_one_rule_matching(program, pattern):
+    rules = []
+    for in_knl_callable in program.program_callables_info.values():
+        if isinstance(in_knl_callable, CallableKernel):
+            knl = in_knl_callable.subkernel
+            rules.extend(find_rules_matching(knl, pattern))
+        elif isinstance(in_knl_callable, ScalarCallable):
+            pass
+        else:
+            raise NotImplementedError("Unknown callable types %s." % (
+                type(in_knl_callable).__name__))
 
     if len(rules) > 1:
         raise ValueError("more than one substitution rule matched '%s'"
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 010a0658..0e8fa305 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -33,6 +33,11 @@ from loopy.types import NumpyType
 from loopy.diagnostic import (
         LoopyError,
         TypeInferenceFailure, DependencyTypeInferenceFailure)
+from loopy.kernel.instruction import _DataObliviousInstruction
+
+from loopy.program import ProgramCallablesInfo
+from loopy.symbolic import SubArrayRef, LinearSubscript
+from pymbolic.primitives import Variable, Subscript, Lookup
 
 import logging
 logger = logging.getLogger(__name__)
@@ -44,10 +49,23 @@ def _debug(kernel, s, *args):
         logger.debug("%s: %s" % (kernel.name, logstr))
 
 
+def get_return_types_as_tuple(arg_id_to_dtype):
+    """Returns the types of arguments in a tuple format.
+
+    :param arg_id_to_dtype: An instance of :class:`dict` which denotes a
+                            mapping from the arguments to their inferred types.
+    """
+    return_arg_id_to_dtype = dict((id, dtype) for id, dtype in
+            arg_id_to_dtype.items() if (isinstance(id, int) and id < 0))
+    return_arg_pos = sorted(return_arg_id_to_dtype.keys(), reverse=True)
+
+    return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos)
+
+
 # {{{ type inference mapper
 
 class TypeInferenceMapper(CombineMapper):
-    def __init__(self, kernel, new_assignments=None):
+    def __init__(self, kernel, program_callables_info, new_assignments=None):
         """
         :arg new_assignments: mapping from names to either
             :class:`loopy.kernel.data.TemporaryVariable`
@@ -56,10 +74,13 @@ class TypeInferenceMapper(CombineMapper):
             instances
         """
         self.kernel = kernel
+        assert isinstance(program_callables_info, ProgramCallablesInfo)
         if new_assignments is None:
             new_assignments = {}
         self.new_assignments = new_assignments
         self.symbols_with_unknown_types = set()
+        self.program_callables_info = program_callables_info
+        self.old_calls_to_new_calls = {}
 
     def __call__(self, expr, return_tuple=False, return_dtype_set=False):
         kwargs = {}
@@ -92,13 +113,16 @@ class TypeInferenceMapper(CombineMapper):
     # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x)
     # are Python-equal (for many common constants such as integers).
 
-    def copy(self):
-        return type(self)(self.kernel, self.new_assignments)
+    def copy(self, program_callables_info=None):
+        if program_callables_info is None:
+            program_callables_info = self.program_callables_info
+        return type(self)(self.kernel, program_callables_info,
+                self.new_assignments)
 
     def with_assignments(self, names_to_vars):
         new_ass = self.new_assignments.copy()
         new_ass.update(names_to_vars)
-        return type(self)(self.kernel, new_ass)
+        return type(self)(self.kernel, self.program_callables_info, new_ass)
 
     @staticmethod
     def combine(dtype_sets):
@@ -250,15 +274,20 @@ class TypeInferenceMapper(CombineMapper):
         return self.rec(expr.aggregate)
 
     def map_call(self, expr, return_tuple=False):
-        from pymbolic.primitives import Variable
+
+        from pymbolic.primitives import Variable, CallWithKwargs, Call
+        from loopy.symbolic import ResolvedFunction
+
+        if isinstance(expr, CallWithKwargs):
+            kw_parameters = expr.kw_parameters
+        else:
+            assert isinstance(expr, Call)
+            kw_parameters = {}
 
         identifier = expr.function
-        if isinstance(identifier, Variable):
+        if isinstance(identifier, (Variable, ResolvedFunction)):
             identifier = identifier.name
 
-        if identifier in ["indexof", "indexof_vec"]:
-            return [self.kernel.index_dtype]
-
         def none_if_empty(d):
             if d:
                 d, = d
@@ -266,25 +295,145 @@ class TypeInferenceMapper(CombineMapper):
             else:
                 return None
 
-        arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in expr.parameters)
-        if None in arg_dtypes:
-            return []
+        arg_id_to_dtype = dict((i, none_if_empty(self.rec(par))) for (i, par) in
+                tuple(enumerate(expr.parameters)) + tuple(kw_parameters.items()))
+
+        # specializing the known function wrt type
+        if isinstance(expr.function, ResolvedFunction):
+            in_knl_callable = self.program_callables_info[expr.function.name]
+
+            # {{{ checking that there is no overwriting of types of in_knl_callable
+
+            if in_knl_callable.arg_id_to_dtype is not None:
+
+                # specializing an already specialized function.
+                for id, dtype in arg_id_to_dtype.items():
+                    if id in in_knl_callable.arg_id_to_dtype and (
+                            in_knl_callable.arg_id_to_dtype[id] !=
+                            arg_id_to_dtype[id]):
+
+                        # {{{ ignoring the cases when there is a discrepancy
+                        # between np.uint and np.int
+
+                        import numpy as np
+                        if in_knl_callable.arg_id_to_dtype[id].dtype.type == (
+                                np.uint32) and (
+                                        arg_id_to_dtype[id].dtype.type == np.int32):
+                            continue
+                        if in_knl_callable.arg_id_to_dtype[id].dtype.type == (
+                                np.uint64) and (
+                                        arg_id_to_dtype[id].dtype.type ==
+                                        np.int64):
+                            continue
+
+                        # }}}
+
+                        raise LoopyError("Overwriting a specialized function "
+                                "is illegal--maybe start with new instance of "
+                                "InKernelCallable?")
+
+            # }}}
+
+            in_knl_callable, self.program_callables_info = (
+                    in_knl_callable.with_types(
+                        arg_id_to_dtype, self.kernel,
+                        self.program_callables_info))
+
+            in_knl_callable = in_knl_callable.with_target(self.kernel.target)
+
+            # storing the type specialized function so that it can be used for
+            # later use
+            self.program_callables_info, new_function_id = (
+                    self.program_callables_info.with_callable(
+                        expr.function.function,
+                        in_knl_callable))
+
+            if isinstance(expr, Call):
+                self.old_calls_to_new_calls[expr] = new_function_id
+            else:
+                assert isinstance(expr, CallWithKwargs)
+                self.old_calls_to_new_calls[expr] = new_function_id
+
+            new_arg_id_to_dtype = in_knl_callable.arg_id_to_dtype
+
+            if new_arg_id_to_dtype is None:
+                return []
+
+            # collecting result dtypes in order of the assignees
+            if -1 in new_arg_id_to_dtype and new_arg_id_to_dtype[-1] is not None:
+                if return_tuple:
+                    return [get_return_types_as_tuple(new_arg_id_to_dtype)]
+                else:
+                    return [new_arg_id_to_dtype[-1]]
+
+        elif isinstance(expr.function, Variable):
+            # Since, the function is not "scoped", attempt to infer using
+            # kernel.function_manglers
+
+            # {{{ trying to infer using function manglers
+
+            arg_dtypes = tuple(none_if_empty(self.rec(par)) for par in
+                    expr.parameters)
+
+            # finding the function_mangler which would be associated with the
+            # realized function.
+
+            mangle_result = None
+            for function_mangler in self.kernel.function_manglers:
+                mangle_result = function_mangler(self.kernel, identifier,
+                        arg_dtypes)
+                if mangle_result:
+                    # found a match.
+                    break
 
-        mangle_result = self.kernel.mangle_function(identifier, arg_dtypes)
-        if return_tuple:
-            if mangle_result is not None:
-                return [mangle_result.result_dtypes]
-        else:
             if mangle_result is not None:
-                if len(mangle_result.result_dtypes) != 1 and not return_tuple:
-                    raise LoopyError("functions with more or fewer than one "
-                            "return value may only be used in direct assignments")
+                from loopy.kernel.function_interface import (ManglerCallable,
+                        ValueArgDescriptor)
+
+                # creating arg_id_to_dtype, arg_id_to_descr from arg_dtypes
+                arg_id_to_dtype = dict((i, dt.with_target(self.kernel.target))
+                        for i, dt in enumerate(mangle_result.arg_dtypes))
+                arg_id_to_dtype.update(dict((-i-1,
+                    dtype.with_target(self.kernel.target)) for i, dtype in enumerate(
+                        mangle_result.result_dtypes)))
+                arg_descrs = tuple((i, ValueArgDescriptor()) for i, _ in
+                        enumerate(mangle_result.arg_dtypes))
+                res_descrs = tuple((-i-1, ValueArgDescriptor()) for i, _ in
+                        enumerate(mangle_result.result_dtypes))
+                arg_id_to_descr = dict(arg_descrs+res_descrs)
+
+                # creating the ManglerCallable object corresponding to the
+                # function.
+                in_knl_callable = ManglerCallable(
+                        identifier, function_mangler, arg_id_to_dtype,
+                        arg_id_to_descr, mangle_result.target_name)
+                self.program_callables_info, new_function_id = (
+                        self.program_callables_info.with_callable(
+                            expr.function, in_knl_callable, True))
+
+                if isinstance(expr, Call):
+                    self.old_calls_to_new_calls[expr] = new_function_id
+                else:
+                    assert isinstance(expr, CallWithKwargs)
+                    self.old_calls_to_new_calls[expr] = new_function_id
+
+            # Returning the type.
+            if return_tuple:
+                if mangle_result is not None:
+                    return [mangle_result.result_dtypes]
+            else:
+                if mangle_result is not None:
+                    if len(mangle_result.result_dtypes) != 1 and not return_tuple:
+                        raise LoopyError("functions with more or fewer than one "
+                                "return value may only be used in direct "
+                                "assignments")
 
-                return [mangle_result.result_dtypes[0]]
+                    return [mangle_result.result_dtypes[0]]
+            # }}}
 
-        raise RuntimeError("unable to resolve "
-                "function '%s' with %d given arguments"
-                % (identifier, len(arg_dtypes)))
+        return []
+
+    map_call_with_kwargs = map_call
 
     def map_variable(self, expr):
         if expr.name in self.kernel.all_inames():
@@ -399,14 +548,20 @@ class TypeInferenceMapper(CombineMapper):
             return [expr.operation.result_dtypes(self.kernel, rec_result)[0]
                     for rec_result in rec_results]
 
+    def map_sub_array_ref(self, expr):
+        return self.rec(expr.get_begin_subscript())
+
+
 # }}}
 
 
 # {{{ infer single variable
 
 def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
+
     if var_name in kernel.all_params():
-        return [kernel.index_dtype], []
+        return [kernel.index_dtype], [], {}, (
+                type_inf_mapper.program_callables_info)
 
     from functools import partial
     debug = partial(_debug, kernel)
@@ -451,11 +606,15 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
         dtype_sets.append(result)
 
     if not dtype_sets:
-        return None, type_inf_mapper.symbols_with_unknown_types
+        return (
+                None, type_inf_mapper.symbols_with_unknown_types, None,
+                type_inf_mapper.program_callables_info)
 
     result = type_inf_mapper.combine(dtype_sets)
 
-    return result, type_inf_mapper.symbols_with_unknown_types
+    return (result, type_inf_mapper.symbols_with_unknown_types,
+            type_inf_mapper.old_calls_to_new_calls,
+            type_inf_mapper.program_callables_info)
 
 # }}}
 
@@ -482,7 +641,8 @@ class _DictUnionView:
 
 # {{{ infer_unknown_types
 
-def infer_unknown_types(kernel, expect_completion=False):
+def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
+        expect_completion=False):
     """Infer types on temporaries and arguments."""
 
     logger.debug("%s: infer types" % kernel.name)
@@ -544,7 +704,8 @@ def infer_unknown_types(kernel, expect_completion=False):
             new_temp_vars,
             new_arg_dict
             ])
-    type_inf_mapper = TypeInferenceMapper(kernel, item_lookup)
+    type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info,
+            item_lookup)
 
     from loopy.symbolic import SubstitutionRuleExpander
     subst_expander = SubstitutionRuleExpander(kernel.substitutions)
@@ -553,6 +714,8 @@ def infer_unknown_types(kernel, expect_completion=False):
 
     from loopy.kernel.data import TemporaryVariable, KernelArgument
 
+    old_calls_to_new_calls = {}
+
     for var_chain in sccs:
         changed_during_last_queue_run = False
         queue = var_chain[:]
@@ -576,9 +739,12 @@ def infer_unknown_types(kernel, expect_completion=False):
 
             debug("inferring type for %s %s", type(item).__name__, item.name)
 
-            result, symbols_with_unavailable_types = (
+            (result, symbols_with_unavailable_types,
+                    new_old_calls_to_new_calls, program_callables_info) = (
                     _infer_var_type(
                             kernel, item.name, type_inf_mapper, subst_expander))
+            type_inf_mapper = type_inf_mapper.copy(
+                    program_callables_info=program_callables_info)
 
             failed = not result
             if not failed:
@@ -597,6 +763,7 @@ def infer_unknown_types(kernel, expect_completion=False):
                         new_arg_dict[name] = item.copy(dtype=new_dtype)
                     else:
                         raise LoopyError("unexpected item type in type inference")
+                old_calls_to_new_calls.update(new_old_calls_to_new_calls)
             else:
                 debug("     failure")
 
@@ -635,23 +802,141 @@ def infer_unknown_types(kernel, expect_completion=False):
 
     # }}}
 
+    # FIXME: copy the explanation from make_function_ready_for_codegen
+    # here.
+
+    # {{{ check if insn missed during type inference
+
+    def _instruction_missed_during_inference(insn):
+        for assignee in insn.assignees:
+            if isinstance(assignee, Lookup):
+                assignee = assignee.aggregate
+
+            if isinstance(assignee, Variable):
+                if assignee.name in kernel.arg_dict:
+                    if kernel.arg_dict[assignee.name].dtype is None:
+                        return False
+                else:
+                    assert assignee.name in kernel.temporary_variables
+                    if kernel.temporary_variables[assignee.name].dtype is None:
+                        return False
+
+            elif isinstance(assignee, (Subscript, LinearSubscript)):
+                if assignee.aggregate.name in kernel.arg_dict:
+                    if kernel.arg_dict[assignee.aggregate.name].dtype is None:
+                        return False
+                else:
+                    assert assignee.aggregate.name in kernel.temporary_variables
+                    if kernel.temporary_variables[
+                            assignee.aggregate.name].dtype is None:
+                        return False
+            else:
+                assert isinstance(assignee, SubArrayRef)
+                if assignee.subscript.aggregate.name in kernel.arg_dict:
+                    if kernel.arg_dict[
+                            assignee.subscript.aggregate.name].dtype is None:
+                        return False
+                else:
+                    assert assignee.subscript.aggregate.name in (
+                            kernel.temporary_variables)
+                    if kernel.temporary_variables[
+                            assignee.subscript.aggregate.name].dtype is None:
+                        return False
+
+        return True
+
+    # }}}
+
+    for insn in kernel.instructions:
+        if isinstance(insn, lp.MultiAssignmentBase):
+            # just a dummy run over the expression, to pass over all the
+            # functions
+            # FIXME: need a check over here which checks the instruction for
+            # unseen cases
+            if _instruction_missed_during_inference(insn):
+                type_inf_mapper(insn.expression, return_tuple=isinstance(insn,
+                    lp.CallInstruction), return_dtype_set=True)
+        elif isinstance(insn, (_DataObliviousInstruction,
+                lp.CInstruction)):
+            pass
+        else:
+            raise NotImplementedError("Unknown instructions type %s." % (
+                type(insn).__name__))
+
+    program_callables_info = type_inf_mapper.program_callables_info
+    old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls)
+
     end_time = time.time()
     logger.debug("type inference took {dur:.2f} seconds".format(
             dur=end_time - start_time))
 
-    return unexpanded_kernel.copy(
+    pre_type_specialized_knl = unexpanded_kernel.copy(
             temporary_variables=new_temp_vars,
             args=[new_arg_dict[arg.name] for arg in kernel.args],
             )
 
+    # this has to be substitution
+    from loopy.kernel.function_interface import (
+            change_names_of_pymbolic_calls)
+    type_specialized_kernel = change_names_of_pymbolic_calls(
+            pre_type_specialized_knl, old_calls_to_new_calls)
+
+    # the check is unnecessary as we would first get TypeInferenceFailure before
+    # encountering this. Move this at the start once ManglerCallable is
+    # deprecated.
+    if expect_completion:
+        # if completion is expected, then it is important that all the
+        # callables are scoped.
+        from loopy.check import check_functions_are_scoped
+        check_functions_are_scoped(type_specialized_kernel)
+
+    return type_specialized_kernel, program_callables_info
+
+
+def infer_unknown_types(program, expect_completion=False):
+    """Infer types on temporaries and arguments."""
+    from loopy.kernel import LoopKernel
+    if isinstance(program, LoopKernel):
+        # FIXME: deprecate warning needed here
+        from loopy.program import make_program_from_kernel
+        program = make_program_from_kernel(program)
+
+    program_callables_info = program.program_callables_info
+
+    type_uninferred_knl_callable = (
+            program_callables_info[program.name])
+    type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel
+
+    program_callables_info = (
+            program.program_callables_info.with_edit_callables_mode())
+    root_kernel, program_callables_info = (
+            infer_unknown_types_for_a_single_kernel(
+                type_uninferred_root_kernel,
+                program_callables_info, expect_completion))
+
+    type_inferred_knl_callable = type_uninferred_knl_callable.copy(
+            subkernel=root_kernel)
+
+    program_callables_info, _ = (
+            program_callables_info.with_callable(
+                program.name,
+                type_inferred_knl_callable))
+
+    program_callables_info = (
+            program_callables_info.with_exit_edit_callables_mode())
+
+    # FIXME: maybe put all of this in a function?
+    # need to infer functions that were left out during inference
+    return program.copy(program_callables_info=program_callables_info)
+
 # }}}
 
 
 # {{{ reduction expression helper
 
 def infer_arg_and_reduction_dtypes_for_reduction_expression(
-        kernel, expr, unknown_types_ok):
-    type_inf_mapper = TypeInferenceMapper(kernel)
+        kernel, expr, program_callables_info, unknown_types_ok):
+    type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info)
     import loopy as lp
 
     if expr.is_tuple_typed:
@@ -682,7 +967,8 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression(
             if dt is not lp.auto else dt
             for dt in reduction_dtypes)
 
-    return tuple(arg_dtypes), reduction_dtypes
+    return tuple(arg_dtypes), reduction_dtypes, (
+            type_inf_mapper.program_callables_info)
 
 # }}}
 
diff --git a/test/test_apps.py b/test/test_apps.py
index e7f4004f..a9c3bf2a 100644
--- a/test/test_apps.py
+++ b/test/test_apps.py
@@ -216,7 +216,8 @@ def test_rob_stroud_bernstein(ctx_factory):
                 lp.GlobalArg("coeffs", None, shape=None),
                 "..."
                 ],
-            assumptions="deg>=0 and nels>=1"
+            assumptions="deg>=0 and nels>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0])
             )
 
     knl = lp.fix_parameters(knl, nqp1d=7, deg=4)
@@ -224,13 +225,12 @@ def test_rob_stroud_bernstein(ctx_factory):
     knl = lp.split_iname(knl, "el_outer", 2, outer_tag="g.0", inner_tag="ilp",
             slabs=(0, 1))
     knl = lp.tag_inames(knl, dict(i2="l.1", alpha1="unr", alpha2="unr"))
-
-    print(lp.CompiledKernel(ctx, knl).get_highlighted_code(
-            dict(
+    knl = lp.add_dtypes(knl, dict(
                 qpts=np.float32,
                 coeffs=np.float32,
                 tmp=np.float32,
-                )))
+                ))
+    print(lp.generate_code_v2(knl))
 
 
 def test_rob_stroud_bernstein_full(ctx_factory):
@@ -296,7 +296,8 @@ def test_rob_stroud_bernstein_full(ctx_factory):
             lp.GlobalArg("coeffs", None, shape=None),
             "..."
             ],
-        assumptions="deg>=0 and nels>=1"
+        assumptions="deg>=0 and nels>=1",
+        target=lp.PyOpenCLTarget(ctx.devices[0])
         )
 
     knl = lp.fix_parameters(knl, nqp1d=7, deg=4)
@@ -310,14 +311,14 @@ def test_rob_stroud_bernstein_full(ctx_factory):
     from pickle import dumps, loads
     knl = loads(dumps(knl))
 
-    knl = lp.CompiledKernel(ctx, knl).get_highlighted_code(
+    knl = lp.add_dtypes(knl,
             dict(
                 qpts=np.float32,
                 tmp=np.float32,
                 coeffs=np.float32,
                 result=np.float32,
                 ))
-    print(knl)
+    print(lp.generate_code_v2(knl))
 
 
 def test_stencil(ctx_factory):
@@ -660,7 +661,7 @@ def test_domain_tree_nesting():
         lp.GlobalArg('B', shape=(100, 31), dtype=np.float64),
         lp.GlobalArg('out', shape=(100, 12), dtype=np.float64)])
 
-    parents_per_domain = knl.parents_per_domain()
+    parents_per_domain = knl.root_kernel.parents_per_domain()
 
     def depth(i):
         if parents_per_domain[i] is None:
diff --git a/test/test_c_execution.py b/test/test_c_execution.py
index c355893e..7c7df255 100644
--- a/test/test_c_execution.py
+++ b/test/test_c_execution.py
@@ -76,6 +76,7 @@ def test_c_target_strides():
 
     # test with C-order
     knl = __get_kernel('C')
+    lp.generate_code_v2(knl)
     a_np = np.reshape(np.arange(16 * 16, dtype=np.float32), (16, -1),
                       order='C')
 
diff --git a/test/test_diff.py b/test/test_diff.py
index b735ab17..a7fd9298 100644
--- a/test/test_diff.py
+++ b/test/test_diff.py
@@ -55,7 +55,7 @@ def test_diff(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
-    knl = lp.make_kernel(
+    knl = lp.make_kernel_function(
          """{ [i,j]: 0<=i,j<n }""",
          """
          <> a = 1/(1+sinh(x[i] + y[j])**2)
@@ -66,6 +66,7 @@ def test_diff(ctx_factory):
 
     from loopy.transform.diff import diff_kernel
     dknl, diff_map = diff_kernel(knl, "z", "x")
+    dknl = lp.make_program_from_kernel(dknl)
     dknl = lp.remove_unused_arguments(dknl)
 
     dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a")
diff --git a/test/test_domain.py b/test/test_domain.py
index ebfde850..dd789d2c 100644
--- a/test/test_domain.py
+++ b/test/test_domain.py
@@ -61,20 +61,15 @@ def test_assume(ctx_factory):
     knl = lp.make_kernel(
             "{[i]: 0<=i<n}",
             "a[i] = a[i] + 1",
-            [lp.GlobalArg("a", np.float32, shape="n"), "..."])
+            [lp.GlobalArg("a", np.float32, shape="n"), "..."],
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     knl = lp.split_iname(knl, "i", 16)
     knl = lp.prioritize_loops(knl, "i_outer,i_inner")
     knl = lp.assume(knl, "n mod 16 = 0")
     knl = lp.assume(knl, "n > 10")
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    for gen_knl in kernel_gen:
-        print(gen_knl)
-        compiled = lp.CompiledKernel(ctx, gen_knl)
-        print(compiled.get_code())
-        assert "if" not in compiled.get_code()
+    code = lp.generate_code_v2(knl).device_code()
+    assert "if" not in code
 
 
 def test_divisibility_assumption(ctx_factory):
@@ -90,16 +85,14 @@ def test_divisibility_assumption(ctx_factory):
                 lp.GlobalArg("b", np.float32, shape=("n",)),
                 lp.ValueArg("n", np.int32),
                 ],
-            assumptions="n>=1 and (exists zz: n = 16*zz)")
+            assumptions="n>=1 and (exists zz: n = 16*zz)",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     ref_knl = knl
 
     knl = lp.split_iname(knl, "i", 16)
-
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    for k in lp.generate_loop_schedules(knl):
-        code = lp.generate_code(k)
-        assert "if" not in code
+    code = lp.generate_code_v2(knl).device_code()
+    assert "if" not in code
 
     lp.auto_test_vs_ref(ref_knl, ctx, knl,
             parameters={"n": 16**3})
@@ -118,16 +111,12 @@ def test_eq_constraint(ctx_factory):
             [
                 lp.GlobalArg("a", np.float32, shape=(1000,)),
                 lp.GlobalArg("b", np.float32, shape=(1000,))
-                ])
+                ],
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     knl = lp.split_iname(knl, "i", 16, outer_tag="g.0")
     knl = lp.split_iname(knl, "i_inner", 16, outer_tag=None, inner_tag="l.0")
-
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    for knl in kernel_gen:
-        print(lp.generate_code(knl))
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_dependent_loop_bounds(ctx_factory):
@@ -150,12 +139,10 @@ def test_dependent_loop_bounds(ctx_factory):
                 lp.GlobalArg("a_sum", dtype, shape=lp.auto),
                 lp.ValueArg("n", np.int32),
                 ],
-            assumptions="n>=1 and row_len>=1")
+            assumptions="n>=1 and row_len>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
-    cknl = lp.CompiledKernel(ctx, knl)
-    print("---------------------------------------------------")
-    print(cknl.get_highlighted_code())
-    print("---------------------------------------------------")
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_dependent_loop_bounds_2(ctx_factory):
@@ -179,14 +166,13 @@ def test_dependent_loop_bounds_2(ctx_factory):
                 lp.GlobalArg("ax", dtype, shape=lp.auto),
                 lp.ValueArg("n", np.int32),
                 ],
-            assumptions="n>=1 and row_len>=1")
+            assumptions="n>=1 and row_len>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     knl = lp.split_iname(knl, "i", 128, outer_tag="g.0",
             inner_tag="l.0")
-    cknl = lp.CompiledKernel(ctx, knl)
-    print("---------------------------------------------------")
-    print(cknl.get_highlighted_code())
-    print("---------------------------------------------------")
+
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_dependent_loop_bounds_3(ctx_factory):
@@ -211,25 +197,21 @@ def test_dependent_loop_bounds_3(ctx_factory):
                 lp.GlobalArg("a_row_lengths", np.int32, shape=lp.auto),
                 lp.GlobalArg("a", dtype, shape=("n,n"), order="C"),
                 lp.ValueArg("n", np.int32),
-                ])
+                ],
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
-    assert knl.parents_per_domain()[1] == 0
+    assert knl.root_kernel.parents_per_domain()[1] == 0
 
     knl = lp.split_iname(knl, "i", 128, outer_tag="g.0",
             inner_tag="l.0")
 
-    cknl = lp.CompiledKernel(ctx, knl)
-    print("---------------------------------------------------")
-    print(cknl.get_highlighted_code())
-    print("---------------------------------------------------")
+    print(lp.generate_code_v2(knl).device_code())
 
     knl_bad = lp.split_iname(knl, "jj", 128, outer_tag="g.1",
             inner_tag="l.1")
 
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-
     with pytest.raises(RuntimeError):
-        list(lp.generate_loop_schedules(knl_bad))
+        list(lp.generate_code_v2(knl_bad))
 
 
 def test_dependent_loop_bounds_4():
@@ -291,11 +273,10 @@ def test_independent_multi_domain(ctx_factory):
             inner_tag="l.0")
     knl = lp.split_iname(knl, "j", 16, outer_tag="g.0",
             inner_tag="l.0")
-    assert knl.parents_per_domain() == 2*[None]
+    assert knl.root_kernel.parents_per_domain() == 2*[None]
 
     n = 50
-    cknl = lp.CompiledKernel(ctx, knl)
-    evt, (a, b) = cknl(queue, n=n, out_host=True)
+    evt, (a, b) = knl(queue, n=n, out_host=True)
 
     assert a.shape == (50,)
     assert b.shape == (50,)
@@ -396,10 +377,11 @@ def test_triangle_domain(ctx_factory):
     knl = lp.make_kernel(
             "{[i,j]: 0<=i,j<n and i <= j}",
             "a[i,j] = 17",
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     print(knl)
-    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
+    print(lp.generate_code_v2(knl).device_code())
 
 
 if __name__ == "__main__":
diff --git a/test/test_fortran.py b/test/test_fortran.py
index e0803336..5d5f7f0b 100644
--- a/test/test_fortran.py
+++ b/test/test_fortran.py
@@ -71,7 +71,7 @@ def test_fill(ctx_factory):
     knl, = lp.parse_transformed_fortran(fortran_src,
             pre_transform_code="split_amount = 128")
 
-    assert "i_inner" in knl.all_inames()
+    assert "i_inner" in knl.root_kernel.all_inames()
 
     ctx = ctx_factory()
 
@@ -200,9 +200,9 @@ def test_assignment_to_subst_indices(ctx_factory):
 
     ref_knl = knl
 
-    assert "a" in knl.temporary_variables
+    assert "a" in knl.root_kernel.temporary_variables
     knl = lp.assignment_to_subst(knl, "a")
-    assert "a" not in knl.temporary_variables
+    assert "a" not in knl.root_kernel.temporary_variables
 
     ctx = ctx_factory()
     lp.auto_test_vs_ref(ref_knl, ctx, knl)
@@ -295,7 +295,7 @@ def test_matmul(ctx_factory, buffer_inames):
 
     knl, = lp.parse_fortran(fortran_src)
 
-    assert len(knl.domains) == 1
+    assert len(knl.root_kernel.domains) == 1
 
     ref_knl = knl
 
@@ -410,7 +410,7 @@ def test_fuse_kernels(ctx_factory):
     knl = lp.fuse_kernels((xderiv, yderiv), data_flow=[("result", 0, 1)])
     knl = lp.prioritize_loops(knl, "e,i,j,k")
 
-    assert len(knl.temporary_variables) == 2
+    assert len(knl.root_kernel.temporary_variables) == 2
 
     ctx = ctx_factory()
     lp.auto_test_vs_ref(xyderiv, ctx, knl, parameters=dict(nelements=20, ndofs=4))
@@ -472,7 +472,7 @@ def test_precompute_some_exist(ctx_factory):
 
     knl, = lp.parse_fortran(fortran_src)
 
-    assert len(knl.domains) == 1
+    assert len(knl.root_kernel.domains) == 1
 
     knl = lp.split_iname(knl, "i", 8,
             outer_tag="g.0", inner_tag="l.1")
diff --git a/test/test_loopy.py b/test/test_loopy.py
index accf9c1d..02eeda13 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -97,7 +97,7 @@ def test_complicated_subst(ctx_factory):
 
     print(knl)
 
-    sr_keys = list(knl.substitutions.keys())
+    sr_keys = list(knl.root_kernel.substitutions.keys())
     for letter, how_many in [
             ("f", 1),
             ("g", 1),
@@ -110,7 +110,7 @@ def test_complicated_subst(ctx_factory):
 def test_type_inference_no_artificial_doubles(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
             "{[i]: 0<=i<n}",
             """
                 <> bb = a[i] - b[i]
@@ -122,16 +122,15 @@ def test_type_inference_no_artificial_doubles(ctx_factory):
                 lp.GlobalArg("c", np.float32, shape=("n",)),
                 lp.ValueArg("n", np.int32),
                 ],
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    for k in lp.generate_loop_schedules(knl):
-        code = lp.generate_code(k)
-        assert "double" not in code
+    code = lp.generate_code_v2(prog).device_code()
+    assert "double" not in code
 
 
 def test_type_inference_with_type_dependencies():
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
             "{[i]: i=0}",
             """
             <>a = 99
@@ -143,13 +142,17 @@ def test_type_inference_with_type_dependencies():
             <>d = b + 2 + 1j
             """,
             "...")
-    knl = lp.infer_unknown_types(knl)
+    prog = lp.infer_unknown_types(prog)
 
     from loopy.types import to_loopy_type
-    assert knl.temporary_variables["a"].dtype == to_loopy_type(np.int32)
-    assert knl.temporary_variables["b"].dtype == to_loopy_type(np.float32)
-    assert knl.temporary_variables["c"].dtype == to_loopy_type(np.float32)
-    assert knl.temporary_variables["d"].dtype == to_loopy_type(np.complex128)
+    assert prog.root_kernel.temporary_variables["a"].dtype == to_loopy_type(
+            np.int32)
+    assert prog.root_kernel.temporary_variables["b"].dtype == to_loopy_type(
+            np.float32)
+    assert prog.root_kernel.temporary_variables["c"].dtype == to_loopy_type(
+            np.float32)
+    assert prog.root_kernel.temporary_variables["d"].dtype == to_loopy_type(
+            np.complex128)
 
 
 def test_sized_and_complex_literals(ctx_factory):
@@ -183,16 +186,12 @@ def test_simple_side_effect(ctx_factory):
             """
                 a[i] = a[i] + 1
                 """,
-            [lp.GlobalArg("a", np.float32, shape=(100,))]
+            [lp.GlobalArg("a", np.float32, shape=(100,))],
+            target=lp.PyOpenCLTarget(ctx.devices[0])
             )
 
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    for gen_knl in kernel_gen:
-        print(gen_knl)
-        compiled = lp.CompiledKernel(ctx, gen_knl)
-        print(compiled.get_code())
+    print(knl)
+    print(lp.generate_code_v2(knl))
 
 
 def test_owed_barriers(ctx_factory):
@@ -203,17 +202,14 @@ def test_owed_barriers(ctx_factory):
             [
                 "<float32> z[i] = a[i]"
                 ],
-            [lp.GlobalArg("a", np.float32, shape=(100,))]
+            [lp.GlobalArg("a", np.float32, shape=(100,))],
+            target=lp.PyOpenCLTarget(ctx.devices[0])
             )
 
     knl = lp.tag_inames(knl, dict(i="l.0"))
 
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    for gen_knl in kernel_gen:
-        compiled = lp.CompiledKernel(ctx, gen_knl)
-        print(compiled.get_code())
+    print(knl)
+    print(lp.generate_code_v2(knl))
 
 
 def test_wg_too_small(ctx_factory):
@@ -225,17 +221,14 @@ def test_wg_too_small(ctx_factory):
                 "<float32> z[i] = a[i] {id=copy}"
                 ],
             [lp.GlobalArg("a", np.float32, shape=(100,))],
+            target=lp.PyOpenCLTarget(ctx.devices[0]),
             local_sizes={0: 16})
 
     knl = lp.tag_inames(knl, dict(i="l.0"))
 
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    import pytest
-    for gen_knl in kernel_gen:
-        with pytest.raises(RuntimeError):
-            lp.CompiledKernel(ctx, gen_knl).get_code()
+    print(knl)
+    with pytest.raises(RuntimeError):
+        print(lp.generate_code_v2(knl))
 
 
 def test_multi_cse(ctx_factory):
@@ -247,17 +240,14 @@ def test_multi_cse(ctx_factory):
                 "<float32> z[i] = a[i] + a[i]**2"
                 ],
             [lp.GlobalArg("a", np.float32, shape=(100,))],
+            target=lp.PyOpenCLTarget(ctx.devices[0]),
             local_sizes={0: 16})
 
     knl = lp.split_iname(knl, "i", 16, inner_tag="l.0")
     knl = lp.add_prefetch(knl, "a", [])
 
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    kernel_gen = lp.generate_loop_schedules(knl)
-
-    for gen_knl in kernel_gen:
-        compiled = lp.CompiledKernel(ctx, gen_knl)
-        print(compiled.get_code())
+    print(knl)
+    print(lp.generate_code_v2(knl))
 
 
 # {{{ code generator fuzzing
@@ -414,17 +404,16 @@ def test_ilp_write_race_detection_global(ctx_factory):
                 lp.GlobalArg("a", np.float32),
                 lp.ValueArg("n", np.int32, approximately=1000),
                 ],
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     knl = lp.tag_inames(knl, dict(j="ilp"))
 
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-
     with lp.CacheMode(False):
         from loopy.diagnostic import WriteRaceConditionWarning
         from warnings import catch_warnings
         with catch_warnings(record=True) as warn_list:
-            list(lp.generate_loop_schedules(knl))
+            lp.generate_code_v2(knl)
 
             assert any(isinstance(w.message, WriteRaceConditionWarning)
                     for w in warn_list)
@@ -438,13 +427,13 @@ def test_ilp_write_race_avoidance_local(ctx_factory):
             [
                 "<> a[i] = 5+i+j",
                 ],
-            [])
+            [],
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     knl = lp.tag_inames(knl, dict(i="l.0", j="ilp"))
 
     knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    for k in lp.generate_loop_schedules(knl):
-        assert k.temporary_variables["a"].shape == (16, 17)
+    assert knl.root_kernel.temporary_variables["a"].shape == (16, 17)
 
 
 def test_ilp_write_race_avoidance_private(ctx_factory):
@@ -455,13 +444,13 @@ def test_ilp_write_race_avoidance_private(ctx_factory):
             [
                 "<> a = 5+j",
                 ],
-            [])
+            [],
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     knl = lp.tag_inames(knl, dict(j="ilp"))
 
-    knl = lp.preprocess_kernel(knl, ctx.devices[0])
-    for k in lp.generate_loop_schedules(knl):
-        assert k.temporary_variables["a"].shape == (16,)
+    knl = lp.preprocess_kernel(knl)
+    assert knl.root_kernel.temporary_variables["a"].shape == (16,)
 
 # }}}
 
@@ -482,11 +471,12 @@ def test_write_parameter(ctx_factory):
                 lp.GlobalArg("b", dtype, shape=()),
                 lp.ValueArg("n", np.int32, approximately=1000),
                 ],
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     import pytest
     with pytest.raises(RuntimeError):
-        lp.CompiledKernel(ctx, knl).get_code()
+        lp.generate_code_v2(knl).device_code()
 
 
 # {{{ arg guessing
@@ -507,10 +497,11 @@ def test_arg_shape_guessing(ctx_factory):
                 lp.GlobalArg("c", shape=lp.auto),
                 lp.ValueArg("n"),
                 ],
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     print(knl)
-    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_arg_guessing(ctx_factory):
@@ -523,10 +514,11 @@ def test_arg_guessing(ctx_factory):
                 b[i, j] = i*j
                 c[i+j, j] = b[j,i]
                 """,
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     print(knl)
-    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_arg_guessing_with_reduction(ctx_factory):
@@ -541,16 +533,16 @@ def test_arg_guessing_with_reduction(ctx_factory):
                 b[i, j] = i*j
                 c[i+j, j] = b[j,i]
                 """,
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     print(knl)
-    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_unknown_arg_shape(ctx_factory):
     ctx = ctx_factory()
     from loopy.target.pyopencl import PyOpenCLTarget
-    from loopy.compiled import CompiledKernel
     bsize = [256, 0]
 
     knl = lp.make_kernel(
@@ -566,11 +558,11 @@ def test_unknown_arg_shape(ctx_factory):
         """,
         seq_dependencies=True,
         name="uniform_l",
-        target=PyOpenCLTarget(),
+        target=PyOpenCLTarget(ctx.devices[0]),
         assumptions="m<=%d and m>=1 and n mod %d = 0" % (bsize[0], bsize[0]))
 
     knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
-    kernel_info = CompiledKernel(ctx, knl).kernel_info(frozenset())  # noqa
+    print(lp.generate_code_v2(knl).device_code())
 
 # }}}
 
@@ -587,10 +579,11 @@ def test_nonlinear_index(ctx_factory):
                 lp.GlobalArg("a", shape="n"),
                 lp.ValueArg("n"),
                 ],
-            assumptions="n>=1")
+            assumptions="n>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     print(knl)
-    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_offsets_and_slicing(ctx_factory):
@@ -607,9 +600,7 @@ def test_offsets_and_slicing(ctx_factory):
             assumptions="n>=1 and m>=1",
             default_offset=lp.auto)
 
-    knl = lp.tag_data_axes(knl, "a,b", "stride:auto,stride:1")
-
-    cknl = lp.CompiledKernel(ctx, knl)
+    knl = lp.tag_array_axes(knl, "a,b", "stride:auto,stride:1")
 
     a_full = cl.clrandom.rand(queue, (n, n), np.float64)
     a_full_h = a_full.get()
@@ -624,8 +615,10 @@ def test_offsets_and_slicing(ctx_factory):
 
     b_full_h[b_sub] = 2*a_full_h[a_sub]
 
-    print(cknl.get_highlighted_code({"a": a.dtype}))
-    cknl(queue, a=a, b=b)
+    knl = lp.add_dtypes(knl, {"a": a.dtype})
+
+    print(lp.generate_code_v2(knl))
+    knl(queue, a=a, b=b)
 
     import numpy.linalg as la
     assert la.norm(b_full.get() - b_full_h) < 1e-13
@@ -642,18 +635,16 @@ def test_vector_ilp_with_prefetch(ctx_factory):
                 # argument guessing.
                 lp.GlobalArg("out,a", np.float32, shape=lp.auto),
                 "..."
-                ])
+                ],
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     knl = lp.split_iname(knl, "i", 128, inner_tag="l.0")
     knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
     knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"],
             default_tag="l.auto")
 
-    cknl = lp.CompiledKernel(ctx, knl)
-    cknl.kernel_info()
-
     import re
-    code = cknl.get_code()
+    code = lp.generate_code_v2(knl).device_code()
     assert len(list(re.finditer("barrier", code))) == 1
 
 
@@ -674,18 +665,18 @@ def test_c_instruction(ctx_factory):
                 lp.TemporaryVariable("x", np.float32),
                 "...",
                 ],
-            assumptions="n>=1")
+            assumptions="n>=1", target=lp.PyOpenCLTarget(ctx.devices[0]))
 
     knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
 
     print(knl)
-    print(lp.CompiledKernel(ctx, knl).get_highlighted_code())
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_dependent_domain_insn_iname_finding(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel([
+    prog = lp.make_kernel([
             "{[isrc_box]: 0<=isrc_box<nsrc_boxes}",
             "{[isrc,idim]: isrc_start<=isrc<isrc_end and 0<=idim<dim}",
             ],
@@ -700,23 +691,24 @@ def test_dependent_domain_insn_iname_finding(ctx_factory):
                     None, shape=None),
                 lp.GlobalArg("strengths",
                     None, shape="nsources"),
-                "..."])
+                "..."],
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
-    print(knl)
-    assert "isrc_box" in knl.insn_inames("set_strength")
+    print(prog)
+    assert "isrc_box" in prog.root_kernel.insn_inames("set_strength")
 
-    print(lp.CompiledKernel(ctx, knl).get_highlighted_code(
-            dict(
-                source_boxes=np.int32,
-                box_source_starts=np.int32,
-                box_source_counts_nonchild=np.int32,
-                strengths=np.float64,
-                nsources=np.int32,
-                )))
+    prog = lp.add_dtypes(prog,
+        dict(
+            source_boxes=np.int32,
+            box_source_starts=np.int32,
+            box_source_counts_nonchild=np.int32,
+            strengths=np.float64,
+            nsources=np.int32))
+    print(lp.generate_code_v2(prog).device_code())
 
 
 def test_inames_deps_from_write_subscript(ctx_factory):
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
             "{[i,j]: 0<=i,j<n}",
             """
                 <> src_ibox = source_boxes[i]
@@ -728,8 +720,8 @@ def test_inames_deps_from_write_subscript(ctx_factory):
                     None, shape=None),
                 "..."])
 
-    print(knl)
-    assert "i" in knl.insn_inames("myred")
+    print(prog)
+    assert "i" in prog.root_kernel.insn_inames("myred")
 
 
 def test_modulo_indexing(ctx_factory):
@@ -743,14 +735,12 @@ def test_modulo_indexing(ctx_factory):
             [
                 lp.GlobalArg("a", None, shape="n"),
                 "..."
-                ]
+                ], target=lp.PyOpenCLTarget(ctx.devices[0])
             )
 
     print(knl)
-    print(lp.CompiledKernel(ctx, knl).get_highlighted_code(
-            dict(
-                a=np.float32,
-                )))
+    knl = lp.add_dtypes(knl, {"a": np.float32})
+    print(lp.generate_code_v2(knl).device_code())
 
 
 @pytest.mark.parametrize("vec_len", [2, 3, 4, 8, 16])
@@ -770,7 +760,7 @@ def test_vector_types(ctx_factory, vec_len):
 
     ref_knl = knl
 
-    knl = lp.tag_data_axes(knl, "out", "c,vec")
+    knl = lp.tag_array_axes(knl, "out", "c,vec")
     knl = lp.tag_inames(knl, dict(j="unr"))
 
     knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
@@ -898,11 +888,7 @@ def test_multiple_writes_to_local_temporary():
         temp[i, 1] = 15
         """)
     knl = lp.tag_inames(knl, dict(i="l.0"))
-
-    knl = lp.preprocess_kernel(knl)
-    for k in lp.generate_loop_schedules(knl):
-        code, _ = lp.generate_code(k)
-        print(code)
+    print(lp.generate_code_v2(knl).device_code())
 
 
 def test_make_copy_kernel(ctx_factory):
@@ -980,9 +966,7 @@ def test_variable_size_temporary():
 
     # Make sure that code generation succeeds even if
     # there are variable-length arrays.
-    knl = lp.preprocess_kernel(knl)
-    for k in lp.generate_loop_schedules(knl):
-        lp.generate_code(k)
+    lp.generate_code_v2(knl).device_code()
 
 
 def test_indexof(ctx_factory):
@@ -1014,7 +998,7 @@ def test_indexof_vec(ctx_factory):
          ''' out[i,j,k] = indexof_vec(out[i,j,k])''')
 
     knl = lp.tag_inames(knl, {"i": "vec"})
-    knl = lp.tag_data_axes(knl, "out", "vec,c,c")
+    knl = lp.tag_array_axes(knl, "out", "vec,c,c")
     knl = lp.set_options(knl, write_cl=True)
 
     (evt, (out,)) = knl(queue)
@@ -1156,7 +1140,7 @@ def test_within_inames_and_reduction():
             within_inames=frozenset(),
             within_inames_is_final=True)
 
-    k = lp.make_kernel("{[i,j] : 0<=i,j<n}",
+    prog = lp.make_kernel("{[i,j] : 0<=i,j<n}",
             [i1, i2],
             [
                 lp.GlobalArg("a", dtype=np.float32, shape=()),
@@ -1166,10 +1150,10 @@ def test_within_inames_and_reduction():
             target=lp.CTarget(),
             )
 
-    k = lp.preprocess_kernel(k)
+    prog = lp.preprocess_kernel(prog)
 
-    assert 'i' not in k.insn_inames("insn_0_j_update")
-    print(k.stringify(with_dependencies=True))
+    assert 'i' not in prog.root_kernel.insn_inames("insn_0_j_update")
+    print(prog.root_kernel.stringify(with_dependencies=True))
 
 
 def test_literal_local_barrier(ctx_factory):
@@ -1232,13 +1216,6 @@ def test_kernel_splitting(ctx_factory):
 
     knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
 
-    # schedule
-    from loopy.preprocess import preprocess_kernel
-    knl = preprocess_kernel(knl)
-
-    from loopy.schedule import get_one_scheduled_kernel
-    knl = get_one_scheduled_kernel(knl)
-
     # map schedule onto host or device
     print(knl)
 
@@ -1273,13 +1250,6 @@ def test_kernel_splitting_with_loop(ctx_factory):
 
     knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
 
-    # schedule
-    from loopy.preprocess import preprocess_kernel
-    knl = preprocess_kernel(knl)
-
-    from loopy.schedule import get_one_scheduled_kernel
-    knl = get_one_scheduled_kernel(knl)
-
     # map schedule onto host or device
     print(knl)
 
@@ -1293,25 +1263,20 @@ def test_kernel_splitting_with_loop(ctx_factory):
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=5))
 
 
-def save_and_reload_temporaries_test(queue, knl, out_expect, debug=False):
-    from loopy.preprocess import preprocess_kernel
-    from loopy.schedule import get_one_scheduled_kernel
-
-    knl = preprocess_kernel(knl)
-    knl = get_one_scheduled_kernel(knl)
+def save_and_reload_temporaries_test(queue, prog, out_expect, debug=False):
 
     from loopy.transform.save import save_and_reload_temporaries
-    knl = save_and_reload_temporaries(knl)
-    knl = get_one_scheduled_kernel(knl)
+    prog = save_and_reload_temporaries(prog)
+    prog = prog.with_root_kernel(lp.get_one_scheduled_kernel(prog.root_kernel,
+        prog.program_callables_info))
 
     if debug:
-        print(knl)
-        cgr = lp.generate_code_v2(knl)
+        print(prog)
+        cgr = lp.generate_code_v2(prog)
         print(cgr.device_code())
         print(cgr.host_code())
-        1/0
 
-    _, (out,) = knl(queue, out_host=True)
+    _, (out,) = prog(queue, out_host=True)
     assert (out == out_expect).all(), (out, out_expect)
 
 
@@ -1320,7 +1285,7 @@ def test_save_of_private_scalar(ctx_factory, hw_loop, debug=False):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
         "{ [i]: 0<=i<8 }",
         """
         for i
@@ -1331,9 +1296,9 @@ def test_save_of_private_scalar(ctx_factory, hw_loop, debug=False):
         """, seq_dependencies=True)
 
     if hw_loop:
-        knl = lp.tag_inames(knl, dict(i="g.0"))
+        prog = lp.tag_inames(prog, dict(i="g.0"))
 
-    save_and_reload_temporaries_test(queue, knl, np.arange(8), debug)
+    save_and_reload_temporaries_test(queue, prog, np.arange(8), debug)
 
 
 def test_save_of_private_array(ctx_factory, debug=False):
@@ -1565,9 +1530,6 @@ def test_save_ambiguous_storage_requirements():
     knl = lp.duplicate_inames(knl, "j", within="writes:out", tags={"j": "l.0"})
     knl = lp.set_temporary_scope(knl, "a", "local")
 
-    knl = lp.preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
-
     from loopy.diagnostic import LoopyError
     with pytest.raises(LoopyError):
         lp.save_and_reload_temporaries(knl)
@@ -1801,12 +1763,12 @@ def test_unschedulable_kernel_detection():
 
 def test_regression_no_ret_call_removal(ctx_factory):
     # https://github.com/inducer/loopy/issues/32
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
             "{[i] : 0<=i<n}",
             "f(sum(i, x[i]))")
-    knl = lp.add_and_infer_dtypes(knl, {"x": np.float32})
-    knl = lp.preprocess_kernel(knl)
-    assert len(knl.instructions) == 3
+    prog = lp.add_and_infer_dtypes(prog, {"x": np.float32})
+    prog = lp.preprocess_kernel(prog)
+    assert len(prog.root_kernel.instructions) == 3
 
 
 def test_regression_persistent_hash():
@@ -1819,14 +1781,15 @@ def test_regression_persistent_hash():
             "cse_exprvar = d[0]*d[0]")
     from loopy.tools import LoopyKeyBuilder
     lkb = LoopyKeyBuilder()
-    assert lkb(knl1.instructions[0]) != lkb(knl2.instructions[0])
+    assert (lkb(knl1.root_kernel.instructions[0]) !=
+            lkb(knl2.root_kernel.instructions[0]))
     assert lkb(knl1) != lkb(knl2)
 
 
 def test_sequential_dependencies(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
             "{[i]: 0<=i<n}",
             """
             for i
@@ -1838,9 +1801,9 @@ def test_sequential_dependencies(ctx_factory):
             end
             """, seq_dependencies=True)
 
-    print(knl.stringify(with_dependencies=True))
+    print(prog.root_kernel.stringify(with_dependencies=True))
 
-    lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(n=5))
+    lp.auto_test_vs_ref(prog, ctx, prog, parameters=dict(n=5))
 
 
 def test_nop(ctx_factory):
@@ -1895,8 +1858,12 @@ def test_global_barrier(ctx_factory):
     print(knl)
 
     knl = lp.preprocess_kernel(knl)
-    assert knl.temporary_variables["z"].address_space == lp.AddressSpace.GLOBAL
-    assert knl.temporary_variables["v"].address_space == lp.AddressSpace.GLOBAL
+    assert (
+            knl.root_kernel.temporary_variables["z"].address_space ==
+            lp.AddressSpace.GLOBAL)
+    assert (
+            knl.root_kernel.temporary_variables["v"].address_space ==
+            lp.AddressSpace.GLOBAL)
 
     print(knl)
 
@@ -1919,11 +1886,12 @@ def test_missing_global_barrier():
 
     knl = lp.set_temporary_scope(knl, "z", "global")
     knl = lp.split_iname(knl, "i", 256, outer_tag="g.0")
+    knl = lp.add_dtypes(knl, {'z': np.float32, 'v': np.float32})
     knl = lp.preprocess_kernel(knl)
 
     from loopy.diagnostic import MissingBarrierError
     with pytest.raises(MissingBarrierError):
-        lp.get_one_scheduled_kernel(knl)
+        lp.generate_code_v2(knl)
 
 
 def test_index_cse(ctx_factory):
@@ -2038,7 +2006,7 @@ def test_temp_initializer(ctx_factory, src_order, tmp_order):
 
 
 def test_const_temp_with_initializer_not_saved():
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
         "{[i]: 0<=i<10}",
         """
         ... gbarrier
@@ -2054,12 +2022,11 @@ def test_const_temp_with_initializer_not_saved():
             ],
         seq_dependencies=True)
 
-    knl = lp.preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
-    knl = lp.save_and_reload_temporaries(knl)
+    prog = lp.preprocess_kernel(prog)
+    prog = lp.save_and_reload_temporaries(prog)
 
     # This ensures no save slot was added.
-    assert len(knl.temporary_variables) == 1
+    assert len(prog.root_kernel.temporary_variables) == 1
 
 
 def test_header_extract():
@@ -2244,20 +2211,24 @@ def test_tight_loop_bounds_codegen():
 
 
 def test_unscheduled_insn_detection():
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
         "{ [i]: 0 <= i < 10 }",
         """
         out[i] = i {id=insn1}
         """,
         "...")
 
-    knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
-    insn1, = lp.find_instructions(knl, "id:insn1")
-    knl.instructions.append(insn1.copy(id="insn2"))
+    prog = lp.preprocess_kernel(prog)
+    knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info)
+    prog = prog.with_root_kernel(knl)
+    insn1, = lp.find_instructions(prog, "id:insn1")
+    insns = prog.root_kernel.instructions[:]
+    insns.append(insn1.copy(id="insn2"))
+    prog = prog.with_root_kernel(prog.root_kernel.copy(instructions=insns))
 
     from loopy.diagnostic import UnscheduledInstructionError
     with pytest.raises(UnscheduledInstructionError):
-        lp.generate_code(knl)
+        lp.generate_code(prog)
 
 
 def test_integer_reduction(ctx_factory):
@@ -2300,6 +2271,7 @@ def test_integer_reduction(ctx_factory):
             knl = lp.make_kernel('{[k]: 0<=k<n}',
                                 kstr,
                                 [var_lp, '...'])
+            knl = lp.set_options(knl, "write_cl")
 
             knl = lp.fix_parameters(knl, n=200)
 
@@ -2399,7 +2371,7 @@ def barrier_between(knl, id1, id2, ignore_barriers_in_levels=()):
 
 
 def test_barrier_insertion_near_top_of_loop():
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
         "{[i,j]: 0 <= i,j < 10 }",
         """
         for i
@@ -2413,10 +2385,11 @@ def test_barrier_insertion_near_top_of_loop():
         """,
         seq_dependencies=True)
 
-    knl = lp.tag_inames(knl, dict(i="l.0"))
-    knl = lp.set_temporary_scope(knl, "a", "local")
-    knl = lp.set_temporary_scope(knl, "b", "local")
-    knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
+    prog = lp.tag_inames(prog, dict(i="l.0"))
+    prog = lp.set_temporary_scope(prog, "a", "local")
+    prog = lp.set_temporary_scope(prog, "b", "local")
+    prog = lp.preprocess_kernel(prog)
+    knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info)
 
     print(knl)
 
@@ -2426,7 +2399,7 @@ def test_barrier_insertion_near_top_of_loop():
 
 
 def test_barrier_insertion_near_bottom_of_loop():
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
         ["{[i]: 0 <= i < 10 }",
          "[jmax] -> {[j]: 0 <= j < jmax}"],
         """
@@ -2440,10 +2413,11 @@ def test_barrier_insertion_near_bottom_of_loop():
         end
         """,
         seq_dependencies=True)
-    knl = lp.tag_inames(knl, dict(i="l.0"))
-    knl = lp.set_temporary_scope(knl, "a", "local")
-    knl = lp.set_temporary_scope(knl, "b", "local")
-    knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
+    prog = lp.tag_inames(prog, dict(i="l.0"))
+    prog = lp.set_temporary_scope(prog, "a", "local")
+    prog = lp.set_temporary_scope(prog, "b", "local")
+    prog = lp.preprocess_kernel(prog)
+    knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info)
 
     print(knl)
 
@@ -2453,7 +2427,7 @@ def test_barrier_insertion_near_bottom_of_loop():
 
 def test_barrier_in_overridden_get_grid_size_expanded_kernel():
     # make simple barrier'd kernel
-    knl = lp.make_kernel('{[i]: 0 <= i < 10}',
+    prog = lp.make_kernel('{[i]: 0 <= i < 10}',
                    """
               for i
                     a[i] = i {id=a}
@@ -2468,15 +2442,17 @@ def test_barrier_in_overridden_get_grid_size_expanded_kernel():
 
     # split into kernel w/ vesize larger than iname domain
     vecsize = 16
-    knl = lp.split_iname(knl, 'i', vecsize, inner_tag='l.0')
+    prog = lp.split_iname(prog, 'i', vecsize, inner_tag='l.0')
 
     from testlib import GridOverride
 
     # artifically expand via overridden_get_grid_sizes_for_insn_ids
+    knl = prog.root_kernel
     knl = knl.copy(overridden_get_grid_sizes_for_insn_ids=GridOverride(
         knl.copy(), vecsize))
+    prog = prog.with_root_kernel(knl)
     # make sure we can generate the code
-    lp.generate_code_v2(knl)
+    lp.generate_code_v2(prog)
 
 
 def test_multi_argument_reduction_type_inference():
@@ -2485,7 +2461,7 @@ def test_multi_argument_reduction_type_inference():
     from loopy.types import to_loopy_type
     op = SegmentedSumReductionOperation()
 
-    knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=j<i}", "")
+    prog = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=j<i}", "")
 
     int32 = to_loopy_type(np.int32)
 
@@ -2499,7 +2475,8 @@ def test_multi_argument_reduction_type_inference():
                 allow_simultaneous=True),
             allow_simultaneous=True)
 
-    t_inf_mapper = TypeInferenceMapper(knl)
+    t_inf_mapper = TypeInferenceMapper(prog.root_kernel,
+            prog.program_callables_info)
 
     assert (
             t_inf_mapper(expr, return_tuple=True, return_dtype_set=True)
@@ -2515,7 +2492,7 @@ def test_multi_argument_reduction_parsing():
 
 
 def test_global_barrier_order_finding():
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
             "{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}",
             """
             for i
@@ -2532,7 +2509,8 @@ def test_global_barrier_order_finding():
             end
             """)
 
-    assert lp.get_global_barrier_order(knl) == ("top", "yoink", "postloop")
+    assert (lp.get_global_barrier_order(prog.root_kernel) == ("top", "yoink",
+        "postloop"))
 
     for insn, barrier in (
             ("nop", None),
@@ -2542,12 +2520,12 @@ def test_global_barrier_order_finding():
             ("yoink", "top"),
             ("postloop", "yoink"),
             ("zzzv", "postloop")):
-        assert lp.find_most_recent_global_barrier(knl, insn) == barrier
+        assert lp.find_most_recent_global_barrier(prog.root_kernel, insn) == barrier
 
 
 def test_global_barrier_error_if_unordered():
     # FIXME: Should be illegal to declare this
-    knl = lp.make_kernel("{[i]: 0 <= i < 10}",
+    prog = lp.make_kernel("{[i]: 0 <= i < 10}",
             """
             ... gbarrier
             ... gbarrier
@@ -2555,7 +2533,7 @@ def test_global_barrier_error_if_unordered():
 
     from loopy.diagnostic import LoopyError
     with pytest.raises(LoopyError):
-        lp.get_global_barrier_order(knl)
+        lp.get_global_barrier_order(prog.root_kernel)
 
 
 def test_struct_assignment(ctx_factory):
@@ -2617,14 +2595,14 @@ def test_inames_conditional_generation(ctx_factory):
 
 
 def test_kernel_var_name_generator():
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
             "{[i]: 0 <= i <= 10}",
             """
             <>a = 0
             <>b_s0 = 0
             """)
 
-    vng = knl.get_var_name_generator()
+    vng = prog.root_kernel.get_var_name_generator()
 
     assert vng("a_s0") != "a_s0"
     assert vng("b") != "b"
@@ -2647,7 +2625,7 @@ def test_fixed_parameters(ctx_factory):
 
 def test_parameter_inference():
     knl = lp.make_kernel("{[i]: 0 <= i < n and i mod 2 = 0}", "")
-    assert knl.all_params() == set(["n"])
+    assert knl.root_kernel.all_params() == set(["n"])
 
 
 def test_execution_backend_can_cache_dtypes(ctx_factory):
@@ -2666,7 +2644,7 @@ def test_execution_backend_can_cache_dtypes(ctx_factory):
 
 
 def test_wildcard_dep_matching():
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
             "{[i]: 0 <= i < 10}",
             """
             <>a = 0 {id=insn1}
@@ -2679,11 +2657,15 @@ def test_wildcard_dep_matching():
 
     all_insns = set("insn%d" % i for i in range(1, 6))
 
-    assert knl.id_to_insn["insn1"].depends_on == set()
-    assert knl.id_to_insn["insn2"].depends_on == all_insns - set(["insn2"])
-    assert knl.id_to_insn["insn3"].depends_on == all_insns - set(["insn3"])
-    assert knl.id_to_insn["insn4"].depends_on == set(["insn1", "insn2"])
-    assert knl.id_to_insn["insn5"].depends_on == all_insns - set(["insn1", "insn5"])
+    assert prog.root_kernel.id_to_insn["insn1"].depends_on == set()
+    assert (prog.root_kernel.id_to_insn["insn2"].depends_on == all_insns -
+            set(["insn2"]))
+    assert (prog.root_kernel.id_to_insn["insn3"].depends_on == all_insns -
+            set(["insn3"]))
+    assert (prog.root_kernel.id_to_insn["insn4"].depends_on == set(["insn1",
+        "insn2"]))
+    assert (prog.root_kernel.id_to_insn["insn5"].depends_on == all_insns -
+            set(["insn1", "insn5"]))
 
 
 def test_preamble_with_separate_temporaries(ctx_factory):
@@ -2777,7 +2759,7 @@ def test_relaxed_stride_checks(ctx_factory):
 
 
 def test_add_prefetch_works_in_lhs_index():
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
             "{ [n,k,l,k1,l1,k2,l2]: "
             "start<=n<end and 0<=k,k1,k2<3 and 0<=l,l1,l2<2 }",
             """
@@ -2793,10 +2775,10 @@ def test_add_prefetch_works_in_lhs_index():
                 "..."
             ])
 
-    knl = lp.add_prefetch(knl, "a1_map", "k", default_tag="l.auto")
+    prog = lp.add_prefetch(prog, "a1_map", "k", default_tag="l.auto")
 
     from loopy.symbolic import get_dependencies
-    for insn in knl.instructions:
+    for insn in prog.root_kernel.instructions:
         assert "a1_map" not in get_dependencies(insn.assignees)
 
 
@@ -2808,11 +2790,9 @@ def test_check_for_variable_access_ordering():
             a[i+1] = 13
             """)
 
-    knl = lp.preprocess_kernel(knl)
-
     from loopy.diagnostic import VariableAccessNotOrdered
     with pytest.raises(VariableAccessNotOrdered):
-        lp.get_one_scheduled_kernel(knl)
+        lp.generate_code_v2(knl)
 
 
 def test_check_for_variable_access_ordering_with_aliasing():
@@ -2827,11 +2807,9 @@ def test_check_for_variable_access_ordering_with_aliasing():
                 lp.TemporaryVariable("b", shape="n+1", base_storage="tmp"),
                 ])
 
-    knl = lp.preprocess_kernel(knl)
-
     from loopy.diagnostic import VariableAccessNotOrdered
     with pytest.raises(VariableAccessNotOrdered):
-        lp.get_one_scheduled_kernel(knl)
+        lp.generate_code_v2(knl)
 
 
 @pytest.mark.parametrize(("second_index", "expect_barrier"),
@@ -2840,7 +2818,7 @@ def test_check_for_variable_access_ordering_with_aliasing():
             ("2*i+1", False),
             ])
 def test_no_barriers_for_nonoverlapping_access(second_index, expect_barrier):
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
             "{[i]: 0<=i<128}",
             """
             a[2*i] = 12  {id=first}
@@ -2851,10 +2829,11 @@ def test_no_barriers_for_nonoverlapping_access(second_index, expect_barrier):
                     scope=lp.AddressSpace.LOCAL),
                 ])
 
-    knl = lp.tag_inames(knl, "i:l.0")
+    prog = lp.tag_inames(prog, "i:l.0")
+    prog = lp.preprocess_kernel(prog)
 
-    knl = lp.preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
+    knl = lp.get_one_scheduled_kernel(prog.root_kernel,
+            prog.program_callables_info)
 
     assert barrier_between(knl, "first", "second") == expect_barrier
 
@@ -2905,7 +2884,7 @@ def test_dep_cycle_printing_and_error():
 
     from loopy.diagnostic import DependencyCycleFound
     with pytest.raises(DependencyCycleFound):
-        print(lp.generate_code(knl)[0])
+        print(lp.generate_code(knl).device_code())
 
 
 if __name__ == "__main__":
diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index 6b578838..4f802f8b 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -246,7 +246,9 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
          "-cl-no-signed-zeros",
          ])
 
-    hsv = hsv.copy(name="horizontalStrongVolumeKernel")
+    # FIXME: Renaming is a bit tricky in this program model.
+    # Add a simple transformation for it, then re-enable the line below.
+    # hsv = hsv.copy(name="horizontalStrongVolumeKernel")
 
     results = lp.auto_test_vs_ref(ref_hsv, ctx, hsv, parameters=dict(elements=300),
             quiet=True)
diff --git a/test/test_reduction.py b/test/test_reduction.py
index 78eca4d0..96dab405 100644
--- a/test/test_reduction.py
+++ b/test/test_reduction.py
@@ -80,7 +80,7 @@ def test_empty_reduction(ctx_factory):
             "a[i] = sum(j, j)",
             )
 
-    knl = lp.realize_reduction(knl)
+    knl = lp.preprocess_kernel(knl)
     print(knl)
 
     knl = lp.set_options(knl, write_cl=True)
@@ -109,11 +109,9 @@ def test_nested_dependent_reduction(ctx_factory):
                 lp.GlobalArg("ell", np.int32, ("n",)),
                 ])
 
-    cknl = lp.CompiledKernel(ctx, knl)
-
     n = 330
     ell = np.arange(n, dtype=np.int32)
-    evt, (a,) = cknl(queue, ell=ell, n=n, out_host=True)
+    evt, (a,) = knl(queue, ell=ell, n=n, out_host=True)
 
     tgt_result = (2*ell-1)*2*ell/2
     assert (a == tgt_result).all()
@@ -144,10 +142,10 @@ def test_multi_nested_dependent_reduction(ctx_factory):
                 lp.ValueArg("ntgts", np.int32),
                 lp.ValueArg("nboxes", np.int32),
                 ],
-            assumptions="ntgts>=1")
+            assumptions="ntgts>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
-    cknl = lp.CompiledKernel(ctx, knl)
-    print(cknl.get_code())
+    print(lp.generate_code_v2(knl).device_code())
     # FIXME: Actually test functionality.
 
 
@@ -177,10 +175,10 @@ def test_recursive_nested_dependent_reduction(ctx_factory):
                 lp.ValueArg("ntgts", np.int32),
                 lp.ValueArg("nboxes", np.int32),
                 ],
-            assumptions="ntgts>=1")
+            assumptions="ntgts>=1",
+            target=lp.PyOpenCLTarget(ctx.devices[0]))
 
-    cknl = lp.CompiledKernel(ctx, knl)
-    print(cknl.get_code())
+    print(lp.generate_code_v2(knl).device_code())
     # FIXME: Actually test functionality.
 
 
@@ -221,32 +219,32 @@ def test_local_parallel_reduction(ctx_factory, size):
 def test_global_parallel_reduction(ctx_factory, size):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
             "{[i]: 0 <= i < n }",
             """
             # Using z[0] instead of z works around a bug in ancient PyOpenCL.
             z[0] = sum(i, i/13)
             """)
 
-    ref_knl = knl
+    ref_prog = prog
 
     gsize = 128
-    knl = lp.split_iname(knl, "i", gsize * 20)
-    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
-    knl = lp.split_reduction_inward(knl, "i_inner_inner")
-    knl = lp.split_reduction_inward(knl, "i_inner_outer")
+    prog = lp.split_iname(prog, "i", gsize * 20)
+    prog = lp.split_iname(prog, "i_inner", gsize, outer_tag="l.0")
+    prog = lp.split_reduction_inward(prog, "i_inner_inner")
+    prog = lp.split_reduction_inward(prog, "i_inner_outer")
     from loopy.transform.data import reduction_arg_to_subst_rule
-    knl = reduction_arg_to_subst_rule(knl, "i_outer")
-    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer",
+    prog = reduction_arg_to_subst_rule(prog, "i_outer")
+    prog = lp.precompute(prog, "red_i_outer_arg", "i_outer",
             temporary_scope=lp.temp_var_scope.GLOBAL,
             default_tag="l.auto")
-    knl = lp.realize_reduction(knl)
-    knl = lp.add_dependency(
-            knl, "writes:acc_i_outer",
+    prog = lp.realize_reduction(prog)
+    prog = lp.add_dependency(
+            prog, "writes:acc_i_outer",
             "id:red_i_outer_arg_barrier")
 
     lp.auto_test_vs_ref(
-            ref_knl, ctx, knl, parameters={"n": size},
+            ref_prog, ctx, prog, parameters={"n": size},
             print_ref_code=True)
 
 
@@ -270,6 +268,7 @@ def test_global_mc_parallel_reduction(ctx_factory, size):
             """)
 
     ref_knl = knl
+    ref_knl = lp.add_dtypes(ref_knl, {"n": np.int32})
 
     gsize = 128
     knl = lp.split_iname(knl, "i", gsize * 20)
@@ -281,7 +280,7 @@ def test_global_mc_parallel_reduction(ctx_factory, size):
     knl = lp.precompute(knl, "red_i_outer_arg", "i_outer",
             temporary_scope=lp.temp_var_scope.GLOBAL,
             default_tag="l.auto")
-    knl = lp.realize_reduction(knl)
+    knl = lp.preprocess_kernel(knl)
     knl = lp.add_dependency(
             knl, "writes:acc_i_outer",
             "id:red_i_outer_arg_barrier")
@@ -406,7 +405,6 @@ def test_parallel_multi_output_reduction(ctx_factory):
                 """)
     knl = lp.tag_inames(knl, dict(i="l.0"))
     knl = lp.add_dtypes(knl, dict(a=np.float64))
-    knl = lp.realize_reduction(knl)
 
     ctx = ctx_factory()
 
diff --git a/test/test_target.py b/test/test_target.py
index 7c0d003e..7b9d4f40 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -72,9 +72,7 @@ def test_ispc_target(occa_mode=False):
     knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"],
             default_tag="l.auto")
 
-    codegen_result = lp.generate_code_v2(
-                lp.get_one_scheduled_kernel(
-                    lp.preprocess_kernel(knl)))
+    codegen_result = lp.generate_code_v2(knl)
 
     print(codegen_result.device_code())
     print(codegen_result.host_code())
@@ -98,9 +96,8 @@ def test_cuda_target():
             default_tag="l.auto")
 
     print(
-            lp.generate_code(
-                lp.get_one_scheduled_kernel(
-                    lp.preprocess_kernel(knl)))[0])
+            lp.generate_code_v2(
+                knl).device_code())
 
 
 def test_generate_c_snippet():
@@ -140,10 +137,7 @@ def test_generate_c_snippet():
 
     knl = lp.split_iname(knl, "k", 4, inner_tag="unr", slabs=(0, 1))
     knl = lp.prioritize_loops(knl, "I,k_outer,k_inner")
-
-    knl = lp.preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
-    print(lp.generate_body(knl))
+    print(lp.generate_code_v2(knl))
 
 
 @pytest.mark.parametrize("target", [CTarget, OpenCLTarget])
diff --git a/test/test_transform.py b/test/test_transform.py
index ed184fb5..d54a820a 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -127,7 +127,7 @@ def test_to_batched(ctx_factory):
 def test_to_batched_temp(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
          ''' { [i,j]: 0<=i,j<n } ''',
          ''' cnst = 2.0
          out[i] = sum(j, cnst*a[i,j]*x[j])''',
@@ -136,28 +136,28 @@ def test_to_batched_temp(ctx_factory):
              dtype=np.float32,
              shape=(),
              scope=lp.temp_var_scope.PRIVATE), '...'])
-    knl = lp.add_and_infer_dtypes(knl, dict(out=np.float32,
+    prog = lp.add_and_infer_dtypes(prog, dict(out=np.float32,
                                             x=np.float32,
                                             a=np.float32))
-    ref_knl = lp.make_kernel(
+    ref_prog = lp.make_kernel(
          ''' { [i,j]: 0<=i,j<n } ''',
          '''out[i] = sum(j, 2.0*a[i,j]*x[j])''')
-    ref_knl = lp.add_and_infer_dtypes(ref_knl, dict(out=np.float32,
+    ref_prog = lp.add_and_infer_dtypes(ref_prog, dict(out=np.float32,
                                                     x=np.float32,
                                                     a=np.float32))
 
-    bknl = lp.to_batched(knl, "nbatches", "out,x")
-    bref_knl = lp.to_batched(ref_knl, "nbatches", "out,x")
+    bprog = lp.to_batched(prog, "nbatches", "out,x")
+    bref_prog = lp.to_batched(ref_prog, "nbatches", "out,x")
 
     # checking that cnst is not being bathced
-    assert bknl.temporary_variables['cnst'].shape == ()
+    assert bprog.root_kernel.temporary_variables['cnst'].shape == ()
 
     a = np.random.randn(5, 5)
     x = np.random.randn(7, 5)
 
     # Checking that the program compiles and the logic is correct
     lp.auto_test_vs_ref(
-            bref_knl, ctx, bknl,
+            bref_prog, ctx, bprog,
             parameters=dict(a=a, x=x, n=5, nbatches=7))
 
 
@@ -255,18 +255,17 @@ def test_vectorize(ctx_factory):
         a[i] = temp
         """)
     knl = lp.add_and_infer_dtypes(knl, dict(b=np.float32))
-    knl = lp.set_array_dim_names(knl, "a,b", "i")
+    knl = lp.set_array_axis_names(knl, "a,b", "i")
     knl = lp.split_array_dim(knl, [("a", 0), ("b", 0)], 4,
             split_kwargs=dict(slabs=(0, 1)))
 
-    knl = lp.tag_data_axes(knl, "a,b", "c,vec")
+    knl = lp.tag_array_axes(knl, "a,b", "c,vec")
     ref_knl = knl
     ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"})
 
     knl = lp.tag_inames(knl, {"i_inner": "vec"})
 
     knl = lp.preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
     code, inf = lp.generate_code(knl)
 
     lp.auto_test_vs_ref(
@@ -275,19 +274,19 @@ def test_vectorize(ctx_factory):
 
 
 def test_extract_subst(ctx_factory):
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
             "{[i]: 0<=i<n}",
             """
                 a[i] = 23*b[i]**2 + 25*b[i]**2
                 """)
 
-    knl = lp.extract_subst(knl, "bsquare", "alpha*b[i]**2", "alpha")
+    prog = lp.extract_subst(prog, "bsquare", "alpha*b[i]**2", "alpha")
 
-    print(knl)
+    print(prog)
 
     from loopy.symbolic import parse
 
-    insn, = knl.instructions
+    insn, = prog.root_kernel.instructions
     assert insn.expression == parse("bsquare(23) + bsquare(25)")
 
 
@@ -323,12 +322,12 @@ def test_tag_data_axes(ctx_factory):
     ref_knl = knl
 
     with pytest.raises(lp.LoopyError):
-        lp.tag_data_axes(knl, "out", "N1,N0,N5")
+        lp.tag_array_axes(knl, "out", "N1,N0,N5")
 
     with pytest.raises(lp.LoopyError):
-        lp.tag_data_axes(knl, "out", "N1,N0,c")
+        lp.tag_array_axes(knl, "out", "N1,N0,c")
 
-    knl = lp.tag_data_axes(knl, "out", "N1,N0,N2")
+    knl = lp.tag_array_axes(knl, "out", "N1,N0,N2")
     knl = lp.tag_inames(knl, dict(j="g.0", i="g.1"))
 
     lp.auto_test_vs_ref(ref_knl, ctx, knl,
@@ -358,33 +357,34 @@ def test_affine_map_inames():
 def test_precompute_confusing_subst_arguments(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
         "{[i,j]: 0<=i<n and 0<=j<5}",
         """
         D(i):=a[i+1]-a[i]
         b[i,j] = D(j)
         """)
 
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
+    prog = lp.add_and_infer_dtypes(prog, dict(a=np.float32))
 
-    ref_knl = knl
+    ref_prog = prog
 
-    knl = lp.tag_inames(knl, dict(j="g.1"))
-    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
+    prog = lp.tag_inames(prog, dict(j="g.1"))
+    prog = lp.split_iname(prog, "i", 128, outer_tag="g.0", inner_tag="l.0")
 
     from loopy.symbolic import get_dependencies
-    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
-    knl = lp.precompute(knl, "D")
+    assert "i_inner" not in get_dependencies(
+            prog.root_kernel.substitutions["D"].expression)
+    prog = lp.precompute(prog, "D")
 
     lp.auto_test_vs_ref(
-            ref_knl, ctx, knl,
+            ref_prog, ctx, prog,
             parameters=dict(n=12345))
 
 
 def test_precompute_nested_subst(ctx_factory):
     ctx = ctx_factory()
 
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
         "{[i,j]: 0<=i<n and 0<=j<5}",
         """
         E:=a[i]
@@ -392,29 +392,31 @@ def test_precompute_nested_subst(ctx_factory):
         b[i] = D
         """)
 
-    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32))
+    prog = lp.add_and_infer_dtypes(prog, dict(a=np.float32))
 
-    ref_knl = knl
+    ref_prog = prog
 
-    knl = lp.tag_inames(knl, dict(j="g.1"))
-    knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
+    prog = lp.tag_inames(prog, dict(j="g.1"))
+    prog = lp.split_iname(prog, "i", 128, outer_tag="g.0", inner_tag="l.0")
 
     from loopy.symbolic import get_dependencies
-    assert "i_inner" not in get_dependencies(knl.substitutions["D"].expression)
-    knl = lp.precompute(knl, "D", "i_inner", default_tag="l.auto")
+    assert "i_inner" not in get_dependencies(
+            prog.root_kernel.substitutions["D"].expression)
+    prog = lp.precompute(prog, "D", "i_inner", default_tag="l.auto")
 
     # There's only one surviving 'E' rule.
     assert len([
         rule_name
-        for rule_name in knl.substitutions
+        for rule_name in prog.root_kernel.substitutions
         if rule_name.startswith("E")]) == 1
 
     # That rule should use the newly created prefetch inames,
     # not the prior 'i_inner'
-    assert "i_inner" not in get_dependencies(knl.substitutions["E"].expression)
+    assert "i_inner" not in get_dependencies(
+            prog.root_kernel.substitutions["E"].expression)
 
     lp.auto_test_vs_ref(
-            ref_knl, ctx, knl,
+            ref_prog, ctx, prog,
             parameters=dict(n=12345))
 
 
@@ -480,7 +482,7 @@ def test_precompute_with_preexisting_inames_fail():
 
 
 def test_add_nosync():
-    orig_knl = lp.make_kernel("{[i]: 0<=i<10}",
+    orig_prog = lp.make_kernel("{[i]: 0<=i<10}",
         """
         <>tmp[i] = 10 {id=insn1}
         <>tmp2[i] = 10 {id=insn2}
@@ -492,28 +494,34 @@ def test_add_nosync():
         tmp5[i] = 1 {id=insn6,conflicts=g1}
         """)
 
-    orig_knl = lp.set_temporary_scope(orig_knl, "tmp3", "local")
-    orig_knl = lp.set_temporary_scope(orig_knl, "tmp5", "local")
+    orig_prog = lp.set_temporary_scope(orig_prog, "tmp3", "local")
+    orig_prog = lp.set_temporary_scope(orig_prog, "tmp5", "local")
 
     # No dependency present - don't add nosync
-    knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2",
+    prog = lp.add_nosync(orig_prog, "any", "writes:tmp", "writes:tmp2",
             empty_ok=True)
-    assert frozenset() == knl.id_to_insn["insn2"].no_sync_with
+    assert frozenset() == (
+            prog.root_kernel.id_to_insn["insn2"].no_sync_with)
 
     # Dependency present
-    knl = lp.add_nosync(orig_knl, "local", "writes:tmp3", "reads:tmp3")
-    assert frozenset() == knl.id_to_insn["insn3"].no_sync_with
-    assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with
+    prog = lp.add_nosync(orig_prog, "local", "writes:tmp3", "reads:tmp3")
+    assert frozenset() == (
+            prog.root_kernel.id_to_insn["insn3"].no_sync_with)
+    assert frozenset([("insn3", "local")]) == (
+            prog.root_kernel.id_to_insn["insn4"].no_sync_with)
 
     # Bidirectional
-    knl = lp.add_nosync(
-            orig_knl, "local", "writes:tmp3", "reads:tmp3", bidirectional=True)
-    assert frozenset([("insn4", "local")]) == knl.id_to_insn["insn3"].no_sync_with
-    assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with
+    prog = lp.add_nosync(
+            orig_prog, "local", "writes:tmp3", "reads:tmp3", bidirectional=True)
+    assert frozenset([("insn4", "local")]) == (
+            prog.root_kernel.id_to_insn["insn3"].no_sync_with)
+    assert frozenset([("insn3", "local")]) == (
+            prog.root_kernel.id_to_insn["insn4"].no_sync_with)
 
     # Groups
-    knl = lp.add_nosync(orig_knl, "local", "insn5", "insn6")
-    assert frozenset([("insn5", "local")]) == knl.id_to_insn["insn6"].no_sync_with
+    prog = lp.add_nosync(orig_prog, "local", "insn5", "insn6")
+    assert frozenset([("insn5", "local")]) == (
+            prog.root_kernel.id_to_insn["insn6"].no_sync_with)
 
 
 def test_uniquify_instruction_ids():
@@ -522,12 +530,14 @@ def test_uniquify_instruction_ids():
     i3 = lp.Assignment("b", 1, id=lp.UniqueName("b"))
     i4 = lp.Assignment("b", 1, id=lp.UniqueName("b"))
 
-    knl = lp.make_kernel("{[i]: i = 1}", []).copy(instructions=[i1, i2, i3, i4])
+    prog = lp.make_kernel("{[i]: i = 1}", [])
+    new_root_kernel = prog.root_kernel.copy(instructions=[i1, i2, i3, i4])
+    prog = prog.with_root_kernel(new_root_kernel)
 
     from loopy.transform.instruction import uniquify_instruction_ids
-    knl = uniquify_instruction_ids(knl)
+    prog = uniquify_instruction_ids(prog)
 
-    insn_ids = set(insn.id for insn in knl.instructions)
+    insn_ids = set(insn.id for insn in prog.root_kernel.instructions)
 
     assert len(insn_ids) == 4
     assert all(isinstance(id, str) for id in insn_ids)
diff --git a/test/testlib.py b/test/testlib.py
index ad290ee7..eebc792d 100644
--- a/test/testlib.py
+++ b/test/testlib.py
@@ -1,4 +1,5 @@
 import loopy as lp
+import numpy as np
 
 
 # {{{ test_barrier_in_overridden_get_grid_size_expanded_kernel
@@ -8,8 +9,9 @@ class GridOverride(object):
         self.clean = clean
         self.vecsize = vecsize
 
-    def __call__(self, insn_ids, ignore_auto=True):
-        gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids, ignore_auto)
+    def __call__(self, insn_ids, program_callables_info, ignore_auto=True):
+        gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids,
+                program_callables_info, ignore_auto)
         return gsize, (self.vecsize,)
 
 # }}}
@@ -132,4 +134,48 @@ class SeparateTemporariesPreambleTestPreambleGenerator(
 
 # }}}
 
+
+# {{{ test_register_function_lookup
+
+class Log2Callable(lp.ScalarCallable):
+
+    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+
+        if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None:
+            # the types provided aren't mature enough to specialize the
+            # callable
+            return (
+                    self.copy(arg_id_to_dtype=arg_id_to_dtype),
+                    program_callables_info)
+
+        dtype = arg_id_to_dtype[0].numpy_dtype
+
+        if dtype.kind in ('u', 'i'):
+            # ints and unsigned casted to float32
+            dtype = np.float32
+
+        from loopy.target.opencl import OpenCLTarget
+        name_in_target = "log2"
+        if not isinstance(kernel.target, OpenCLTarget):
+            # for CUDA, C Targets the name must be modified
+            if dtype == np.float32:
+                name_in_target = "log2f"
+            elif dtype == np.float128:
+                name_in_target = "log2l"
+
+        from loopy.types import NumpyType
+        return (
+                self.copy(name_in_target=name_in_target,
+                    arg_id_to_dtype={0: NumpyType(dtype), -1:
+                        NumpyType(dtype)}),
+                program_callables_info)
+
+
+def register_log2_lookup(target, identifier):
+    if identifier == 'log2':
+        return Log2Callable(name='log2')
+    return None
+
+# }}}
+
 # vim: foldmethod=marker
-- 
GitLab


From ee6214767d96b9b4a7d240c5ed8affed2137ec6e Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Sun, 12 Aug 2018 16:38:50 +0530
Subject: [PATCH 04/80] adding untracked files.

---
 doc/ref_call.rst                   | 191 +++++++
 loopy/kernel/function_interface.py | 867 +++++++++++++++++++++++++++++
 loopy/program.py                   | 684 +++++++++++++++++++++++
 loopy/transform/callable.py        | 707 +++++++++++++++++++++++
 test/test_callables.py             | 414 ++++++++++++++
 5 files changed, 2863 insertions(+)
 create mode 100644 doc/ref_call.rst
 create mode 100644 loopy/kernel/function_interface.py
 create mode 100644 loopy/program.py
 create mode 100644 loopy/transform/callable.py
 create mode 100644 test/test_callables.py

diff --git a/doc/ref_call.rst b/doc/ref_call.rst
new file mode 100644
index 00000000..4ff1ef2f
--- /dev/null
+++ b/doc/ref_call.rst
@@ -0,0 +1,191 @@
+Calling Loopy Kernels and External Functions
+============================================
+
+Goals of a function interface
+-----------------------------
+
+- Must be able to have complete information of the function just through the
+  expression node.
+- Must adhere to :mod:`loopy` semantics of immutability.
+- Must have a class instance linked with the expression node which would record
+  the properties of the function.
+- Must indicate in the expression if the function is known to the kernel. (This
+  is intended to be done by making the function expression node an instance of
+  ``ResolvedFunction`` as soon as the function definition is resolved by the
+  kernel)
+- Function overloading is not encouraged in :mod:`loopy` as it gives rise to
+  contention while debugging with the help of the kernel intermediate
+  representation and hence if the expression nodes point to different function
+  instances they must differ in their representation. For example: ``float
+  sin(float )`` and ``double sin(double )`` should diverge by having different
+  identifiers as soon as data type of the argument is inferred.
+- Must have an interface to register external functions.
+
+
+Scoped Function and resolving
+-----------------------------
+
+``ResolvedFunctions`` are pymbolic nodes within expressions in a ``Loo.py``
+kernel, whose name has been resolved by the kernel. The process of matching a
+function identifier with the function definition is called "resolving".
+
+A pymbolic ``Call`` node can be converted to a ``ResolvedFunction`` if it
+is "resolved" by one of the ``function_scoper`` in a
+:attr:`LoopKernel.scoped_functions`
+
+-  Functions already registered by the target. Some examples include --
+   ``sin()``, ``cos()``, ``exp()``, ``max()`` (for C-Targets.)
+-  Functions that are defined in ``Loo.py`` and are realized into
+   different set of instructions during code generation. Some examples
+   include ``make_tuple``, ``ArgExtOp``, ``index_of``, ...
+-  Functions registered as ``CallableKernels`` using
+   ``lp.register_callable_kernel(...)``.
+-  Functions that have been provided through
+   ``lp.register_function_scoper(...)``
+-  Functions that can be made known from the user through
+   ``lp.register_function_mangler``. This is planned to be deprecated,
+   as its functionality is superseded by
+   ``lp.register_function_scoper(...)``.
+
+Expressions after a function is scoped
+--------------------------------------
+
+Consider the following expression.
+
+::
+
+    sin(a[i]) + unknown_func(b[i]) + callable_knl_func(c[i])*mangler_call(d[i])
+
+During the kernel creation phase, the kernel would know that ``sin`` is
+a function known to the target and hence it should be scoped. And as
+expected, after ``make_kernel`` has been called the above expression
+would get converted to:
+
+::
+
+    ResolvedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) +
+    callable_knl_func(c[i])*mangler_call(d[i])
+
+This would also make an entry in the kernel's ``scoped_functions``
+dictionary as:
+
+::
+
+    {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None,
+    arg_id_to_descr=None, name_in_target=None)}
+
+It might be noteworthy that at this step, it only scopes functions
+through their names without any information about the types of the
+function.
+
+Once the user calls the transformation:
+``lp.register_callable_kernel(knl, 'callable_knl_func', callee_knl)``,
+the expression gets converted to:
+
+::
+
+    ResolvedFunction(Variable('sin'))(a[i]) + unknown_func(b[i]) +
+    ResolvedFunction('callable_knl_func')(c[i])*mangler_call(d[i])
+
+This also makes an entry in the ``scoped_functions`` dictionary as --
+
+::
+
+    {Variable('sin'): ScalarCallable(name='sin', arg_id_to_dtype=None,
+    arg_id_to_descr=None, name_in_target=None),
+    Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...),
+    arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None)}
+
+Now, if the user calls
+``register_function_mangler(knl, 'mangler_call')``, one might expect
+that the mangler call function should get scoped, but that does **not**
+happen, because the "old" ``function_manglers``, would return a match
+only if all the parameters of the function match viz. name, argument
+arity and argument types. Hence, the ``scoped_functions`` dictionary
+would remain unchanged.
+
+``ResolvedFunctions`` and specializations
+-----------------------------------------
+
+Consider the same ``ResolvedFunction('sin')`` as above. This function,
+although scoped, does not yet know its types, i.e. it does not yet know
+whether, for a ``C-Target``, it should emit ``sin`` or ``sinf`` or
+``sinl``. Hence, right now the function can be called a
+"type-generic" function, as further in the pipeline it can take any one
+of the above definitions. The functions go through a "specialization"
+process at various points in the pipeline, where the attributes of the
+callables are resolved.
+
+-  During type inference, the functions go through type specialization,
+   wherein the ``arg_id_to_dtype`` of the functions is realized.
+-  During descriptor inference, the functions go through a description
+   specialization where the ``arg_id_to_descr`` is populated. The
+   ``arg_id_to_descr`` contains important information regarding shape,
+   strides and scope of the arguments, which form an important part of
+   ``CallableKernel``, as this information would be helpful to
+   generate the function signature and make changes to the data access
+   pattern of the variables in the callee kernel.
+-  Whenever a ``ResolvedFunction`` goes through a specialization, this is
+   indicated by changing the name in the ``pymbolic`` node.
+
+If, during type inference, it is inferred that the type of ``a[i]`` is
+``np.float32``, the new ``pymbolic`` node would be:
+
+::
+
+    ResolvedFunction('sin_0')(a[i]) + ...
+
+This name change is done so that it indicates that the node points to a
+different ``ScalarCallable`` in the dictionary. And hence a new entry is
+added to the ``scoped_functions`` dictionary as:
+
+::
+
+    {'sin': ScalarCallable(name='sin', arg_id_to_dtype=None,
+    arg_id_to_descr=None, name_in_target=None),
+    Variable('callable_knl_func'): CallableKernel(subkernel=LoopKernel(...),
+    arg_id_to_dtype=None, arg_id_to_descr=None, name_in_target=None),
+    'sin_0': ScalarCallable(name='sin', arg_id_to_dtype={0:np.float32,
+    -1: np.float32}, arg_id_to_descr=None, name_in_target='sinf')}
+
+Description Inference
+---------------------
+
+Although this step has no significance for a ``ScalarCallable``, it
+forms a very important part of ``CallableKernel``, in which the
+``dim_tags``, ``shape`` and ``address_space`` of the arguments of the
+callable kernel are altered.
+
+-  The ``dim_tags`` attribute helps to ensure that the memory layout
+   between the caller and the callee kernel is coherent.
+-  The ``address_space`` attribute ensures that, while writing the device
+   code we emit the appropriate scope qualifiers for the function
+   declaration arguments.
+-  The ``shape`` attribute helps in:
+
+   -  Storage allocation.
+   -  Memory layout.
+   -  Out of bounds accesses to be caught in ``Loo.py``.
+
+Hence, in the ``Loo.py`` pipeline, one might expect the following
+developments of the ``sin`` pymbolic call expression node.
+
+::
+
+    sin -> (Kernel creation) -> ResolvedFunction(Variable('sin')) ->
+    (Type Inference) -> ResolvedFunction(Variable('sin_0')) ->
+    (Descriptor Inference) -> ResolvedFunction(Variable('sin_1'))
+
+Changes on the target side to accommodate the new function interface
+--------------------------------------------------------------------
+
+The earlier "function\_mangler" as a member method of the class
+``lp.ASTBuilderBase`` will be replaced by ``function_scopers``. The
+function scopers would return a list of functions with the signature
+``(target, identifier)->lp.InKernelCallable``.
+
+An example: Calling BLAS
+------------------------
+
+.. literalinclude:: ../examples/python/external-call.py
+
diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
new file mode 100644
index 00000000..2ea26065
--- /dev/null
+++ b/loopy/kernel/function_interface.py
@@ -0,0 +1,867 @@
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2018 Andreas Klöckner, Kaushik Kulkarni"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+
+import re
+import six
+
+from six.moves import zip
+
+from pytools import ImmutableRecord
+from loopy.diagnostic import LoopyError
+
+from loopy.symbolic import parse_tagged_name
+
+from loopy.symbolic import (ResolvedFunction, SubstitutionRuleMappingContext,
+        RuleAwareIdentityMapper, SubstitutionRuleExpander)
+
+from loopy.kernel import LoopKernel
+
+
+# {{{ argument descriptors
+
+class ValueArgDescriptor(ImmutableRecord):
+    hash_fields = ()
+
+    update_persistent_hash = LoopKernel.update_persistent_hash
+
+
+class ArrayArgDescriptor(ImmutableRecord):
+    """
+    Records information about an array argument to an in-kernel callable, to be
+    passed to and returned from
+    :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`, used
+    for matching shape and scope of caller and callee kernels.
+
+    .. attribute:: shape
+
+        Shape of the array.
+
+    .. attribute:: address_space
+
+        An attribute of :class:`loopy.kernel.data.AddressSpace`.
+
+    .. attribute:: dim_tags
+
+        A tuple of instances of :class:`loopy.kernel.array._StrideArrayDimTagBase`
+    """
+
+    fields = set(['shape', 'address_space', 'dim_tags'])
+
+    def __init__(self, shape, address_space, dim_tags):
+
+        # {{{ sanity checks
+
+        from loopy.kernel.array import FixedStrideArrayDimTag
+
+        assert isinstance(shape, tuple)
+        assert isinstance(dim_tags, tuple)
+
+        # FIXME at least vector dim tags should be supported
+        assert all(isinstance(dim_tag, FixedStrideArrayDimTag) for dim_tag in
+                dim_tags)
+
+        # }}}
+
+        super(ArrayArgDescriptor, self).__init__(
+                shape=shape,
+                address_space=address_space,
+                dim_tags=dim_tags)
+
+    hash_fields = (
+            "shape",
+            "address_space",
+            "dim_tags")
+
+    update_persistent_hash = LoopKernel.update_persistent_hash
+
+# }}}
+
+
+# {{{ helper function for in-kernel callables
+
+def get_kw_pos_association(kernel):
+    """
+    Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in
+    *kernel*.
+    """
+    from loopy.kernel.tools import infer_arg_is_output_only
+    kernel = infer_arg_is_output_only(kernel)
+    kw_to_pos = {}
+    pos_to_kw = {}
+
+    read_count = 0
+    write_count = -1
+
+    for arg in kernel.args:
+        if not arg.is_output_only:
+            kw_to_pos[arg.name] = read_count
+            pos_to_kw[read_count] = arg.name
+            read_count += 1
+        else:
+            kw_to_pos[arg.name] = write_count
+            pos_to_kw[write_count] = arg.name
+            write_count -= 1
+
+    return kw_to_pos, pos_to_kw
+
+
+class GridOverrideForCalleeKernel(ImmutableRecord):
+    """
+    Helper class to set the
+    :attr:`loopy.kernel.LoopKernel.overridden_get_grid_sizes_for_insn_ids` of the
+    callee kernels. Refer
+    :func:`loopy.kernel.function_interface.GridOverrideForCalleeKernel.__call__`,
+    :func:`loopy.kernel.function_interface.CallableKernel.with_hw_axes_sizes`.
+
+    .. attribute:: local_size
+
+        The local work group size that has to be set in the callee kernel.
+
+    .. attribute:: global_size
+
+        The global work group size that has to be set in the callee kernel.
+
+    .. note::
+
+        This class acts as a pseudo-callable and its significance lies in
+        solving picklability issues.
+    """
+    fields = set(["local_size", "global_size"])
+
+    def __init__(self, local_size, global_size):
+        self.local_size = local_size
+        self.global_size = global_size
+
+    def __call__(self, insn_ids, program_callables_info, ignore_auto=True):
+        return self.local_size, self.global_size
+
+# }}}
+
+
+# {{{ template class
+
+class InKernelCallable(ImmutableRecord):
+    """
+    An abstract interface to define a callable encountered in a kernel.
+
+    .. attribute:: name
+
+        The name of the callable which can be encountered within a kernel.
+
+    .. attribute:: arg_id_to_dtype
+
+        A mapping which indicates the arguments types and result types it would
+        be handling. This would be set once the callable is type specialized.
+
+    .. attribute:: arg_id_to_descr
+
+        A mapping which indicates the argument shapes and ``dim_tags`` it
+        would be responsible for generating code for. These parameters would
+        be set once it is shape and stride (``dim_tags``) specialized.
+
+    .. note::
+
+        Negative "id" values ``-i-1`` in the mapping attributes indicate
+        return value with (0-based) index *i*.
+
+    .. automethod:: __init__
+    .. automethod:: with_types
+    .. automethod:: with_descrs
+    .. automethod:: with_target
+    .. automethod:: with_hw_axes_sizes
+    .. automethod:: generate_preambles
+    .. automethod:: emit_call
+    .. automethod:: emit_call_insn
+    .. automethod:: is_ready_for_codegen
+    """
+
+    fields = set(["arg_id_to_dtype", "arg_id_to_descr"])
+    init_arg_names = ("arg_id_to_dtype", "arg_id_to_descr")
+
+    def __init__(self, arg_id_to_dtype=None, arg_id_to_descr=None):
+
+        super(InKernelCallable, self).__init__(
+                arg_id_to_dtype=arg_id_to_dtype,
+                arg_id_to_descr=arg_id_to_descr)
+
+    def __getinitargs__(self):
+        return (self.arg_id_to_dtype, self.arg_id_to_descr)
+
+    update_persistent_hash = LoopKernel.update_persistent_hash
+
+    def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info):
+        """
+        :arg arg_id_to_dtype: a mapping from argument identifiers
+            (integers for positional arguments, names for keyword
+            arguments) to :class:`loopy.types.LoopyType` instances.
+            Unspecified/unknown types are not represented in *arg_id_to_dtype*.
+
+            Return values are denoted by negative integers, with the
+            first returned value identified as *-1*.
+
+        :returns: a tuple ``(new_self, program_callables_info)``, where
+            *new_self* is a new :class:`InKernelCallable` specialized for the
+            given types, whose :attr:`arg_id_to_dtype` is a mapping of the same
+            form as the argument above, however it may have more information
+            present. Any argument information exists both by its positional
+            and its keyword identifier.
+        """
+        # FIXME: In all these with_** functions add that also passes a
+        # program_callables_info
+
+        raise NotImplementedError()
+
+    def with_descrs(self, arg_id_to_descr, program_callables_info):
+        """
+        :arg arg_id_to_descr: a mapping from argument identifiers
+            (integers for positional arguments, names for keyword
+            arguments) to :class:`loopy.ArrayArgDescriptor` instances.
+            Unspecified/unknown types are not represented in *arg_id_to_descr*.
+
+            Return values are denoted by negative integers, with the
+            first returned value identified as *-1*.
+
+        :returns: a tuple ``(new_self, program_callables_info)``, where
+            *new_self* is a copy of *self* specialized for the given descriptors,
+            whose :attr:`arg_id_to_descr` is a mapping of the same form as the
+            argument above, however it may have more information present.  Any
+            argument information exists by both positional and keyword identifier.
+        """
+
+        raise NotImplementedError()
+
+    def with_target(self, target):
+        """
+        Returns a copy of *self* with all the ``dtypes`` in
+        ``in_knl_callable.arg_id_to_dtype`` associated with the *target*. Refer
+        :meth:`loopy.types.LoopyType.with_target`.
+
+        :arg target: An instance of :class:`loopy.target.TargetBase`.
+        """
+
+        if target is None:
+            raise LoopyError("target cannot be None for with_target")
+
+        def with_target_if_not_None(dtype):
+            """
+            Returns a copy of :arg:`dtype` associated with the target. If
+            ``dtype`` is *None* returns *None*.
+            """
+            if dtype:
+                return dtype.with_target(target)
+            else:
+                return None
+
+        new_arg_id_to_dtype = None
+        if self.arg_id_to_dtype is not None:
+            new_arg_id_to_dtype = dict((id, with_target_if_not_None(dtype)) for id,
+                    dtype in self.arg_id_to_dtype.items())
+
+        return self.copy(arg_id_to_dtype=new_arg_id_to_dtype)
+
+    def with_hw_axes_sizes(self, local_size, global_size):
+        """
+        Returns a copy of *self* with modifications to comply with the grid
+        sizes ``(local_size, global_size)`` of the kernel in which it is
+        supposed to be called.
+
+        :arg local_size: An instance of :class:`islpy.PwAff`.
+        :arg global_size: An instance of :class:`islpy.PwAff`.
+        """
+        raise NotImplementedError()
+
+    def is_ready_for_codegen(self):
+
+        return (self.arg_id_to_dtype is not None and
+                self.arg_id_to_descr is not None)
+
+    def generate_preambles(self, target):
+        """ Yields the target specific preamble.
+        """
+        raise NotImplementedError()
+
+    def emit_call(self, expression_to_code_mapper, expression, target):
+
+        raise NotImplementedError()
+
+    def emit_call_insn(self, insn, target, expression_to_code_mapper):
+        """
+        Returns a tuple of ``(call, assignee_is_returned)`` which is the target
+        facing function call that would be seen in the generated code. ``call``
+        is an instance of ``pymbolic.primitives.Call``; ``assignee_is_returned``
+        is an instance of :class:`bool` to indicate if the assignee is returned
+        by value in C-type targets.
+
+        *Example:* If ``assignee_is_returned=True``, then ``a, b = f(c, d)`` is
+            interpreted in the target as ``a = f(c, d, &b)``. If
+            ``assignee_is_returned=False``, then ``a, b = f(c, d)`` is interpreted
+            in the target as the statement ``f(c, d, &a, &b)``.
+        """
+
+        raise NotImplementedError()
+
+    def __hash__(self):
+
+        return hash(tuple(self.fields))
+
+# }}}
+
+
+# {{{ scalar callable
+
+class ScalarCallable(InKernelCallable):
+    """
+    An abstract interface to a scalar callable encountered in a kernel.
+
+    .. note::
+
+        The :meth:`ScalarCallable.with_types` is intended to assist with type
+        specialization of the function and is expected to be supplemented in the
+        derived subclasses.
+    """
+
+    fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr", "name_in_target"])
+    init_arg_names = ("name", "arg_id_to_dtype", "arg_id_to_descr",
+            "name_in_target")
+    hash_fields = fields
+
+    def __init__(self, name, arg_id_to_dtype=None,
+            arg_id_to_descr=None, name_in_target=None):
+
+        super(ScalarCallable, self).__init__(
+                arg_id_to_dtype=arg_id_to_dtype,
+                arg_id_to_descr=arg_id_to_descr)
+
+        self.name = name
+        self.name_in_target = name_in_target
+
+    def __getinitargs__(self):  # order must mirror init_arg_names
+        return (self.name, self.arg_id_to_dtype, self.arg_id_to_descr,
+                self.name_in_target)
+
+    def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info):
+        raise LoopyError("No type inference information present for "
+                "the function %s." % (self.name))
+
+    def with_descrs(self, arg_id_to_descr, program_callables_info):
+        # work on a copy so that the caller's mapping is not modified in place
+        arg_id_to_descr = dict(arg_id_to_descr)
+        arg_id_to_descr[-1] = ValueArgDescriptor()
+        return (self.copy(arg_id_to_descr=arg_id_to_descr),
+                program_callables_info)
+
+    def with_hw_axes_sizes(self, global_size, local_size):
+        return self.copy()
+
+    def is_ready_for_codegen(self):
+
+        return (self.arg_id_to_dtype is not None and
+                self.arg_id_to_descr is not None)
+
+    # {{{ code generation
+
+    def emit_call(self, expression_to_code_mapper, expression, target):
+
+        assert self.is_ready_for_codegen()
+
+        # must have single assignee
+        assert len(expression.parameters) == len(self.arg_id_to_dtype) - 1
+        arg_dtypes = tuple(self.arg_id_to_dtype[id] for id in
+                range(len(self.arg_id_to_dtype)-1))
+
+        par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in
+                expression.parameters)
+
+        from loopy.expression import dtype_to_type_context
+        # processing the parameters with the required dtypes
+        processed_parameters = tuple(
+                expression_to_code_mapper.rec(par,
+                    dtype_to_type_context(target, tgt_dtype),
+                    tgt_dtype)
+                for par, par_dtype, tgt_dtype in zip(
+                    expression.parameters, par_dtypes, arg_dtypes))
+
+        from pymbolic import var
+        return var(self.name_in_target)(*processed_parameters)
+
+    def emit_call_insn(self, insn, target, expression_to_code_mapper):
+        """
+        Returns a tuple of ``(call, assignee_is_returned)`` for C-based targets,
+        where *call* is a pymbolic call with the required type casting applied.
+        The first assignee is returned, but the rest of them are appended to
+        the parameters and passed by reference.
+
+        *Example:* ``c, d = f(a, b)`` is returned as ``c = f(a, b, &d)``
+
+        :arg insn: An instance of :class:`loopy.kernel.instruction.CallInstruction`.
+        :arg target: An instance of :class:`loopy.target.TargetBase`.
+        :arg expression_to_code_mapper: An instance of :class:`IdentityMapper`
+            responsible for code mapping from :mod:`loopy` syntax to the
+            **target syntax**.
+        """
+
+        # Currently this is formulated such that the first assignee is returned
+        # and all the others are passed by reference as arguments to the function.
+        assert self.is_ready_for_codegen()
+
+        from loopy.kernel.instruction import CallInstruction
+
+        assert isinstance(insn, CallInstruction)
+
+        parameters = insn.expression.parameters
+        assignees = insn.assignees[1:]
+
+        par_dtypes = tuple(expression_to_code_mapper.infer_type(par) for par in
+                parameters)
+        arg_dtypes = tuple(self.arg_id_to_dtype[i] for i, _ in
+                enumerate(parameters))
+
+        assignee_dtypes = tuple(self.arg_id_to_dtype[-i-2] for i, _ in
+                enumerate(assignees))
+
+        from loopy.expression import dtype_to_type_context
+        from pymbolic.mapper.stringifier import PREC_NONE
+        from pymbolic import var
+
+        c_parameters = [
+                expression_to_code_mapper(par, PREC_NONE,
+                    dtype_to_type_context(target, tgt_dtype),
+                    tgt_dtype).expr
+                for par, par_dtype, tgt_dtype in zip(
+                    parameters, par_dtypes, arg_dtypes)]
+
+        for i, (a, tgt_dtype) in enumerate(zip(assignees, assignee_dtypes)):
+            if tgt_dtype != expression_to_code_mapper.infer_type(a):
+                raise LoopyError("Type Mismatch in function %s. Expected: %s, "
+                        "Got: %s" % (self.name, tgt_dtype,
+                            expression_to_code_mapper.infer_type(a)))
+            c_parameters.append(
+                        var("&")(
+                            expression_to_code_mapper(a, PREC_NONE,
+                                dtype_to_type_context(target, tgt_dtype),
+                                tgt_dtype).expr))
+
+        # the first assignee is returned whenever the instruction has any assignee.
+        assignee_is_returned = len(insn.assignees) > 0
+
+        return var(self.name_in_target)(*c_parameters), assignee_is_returned
+
+    def generate_preambles(self, target):
+        return
+        yield
+
+    # }}}
+
+# }}}
+
+
+# {{{ callable kernel
+
+class CallableKernel(InKernelCallable):
+    """
+    Records information about a callee kernel. Also provides an interface through
+    member methods to make the callee kernel compatible to be called from a
+    caller kernel. The :meth:`loopy.register_callable_kernel` should be called
+    in order to initiate association between a function in caller kernel and
+    the callee kernel.
+
+    :meth:`CallableKernel.with_types` should be called in order to match
+    the ``dtypes`` of the arguments that are shared between the caller and the
+    callee kernel.
+
+    :meth:`CallableKernel.with_descrs` should be called in order to match
+    :attr:`ArrayArgDescriptor.dim_tags`, :attr:`ArrayArgDescriptor.shape`,
+    :attr:`ArrayArgDescriptor.address_space` of the arguments shared between the
+    caller and the callee kernel.
+
+    :meth:`CallableKernel.with_hw_axes_sizes` should be called to set the grid
+    sizes for the :attr:`subkernel` of the callable.
+    """
+
+    fields = set(["subkernel", "arg_id_to_dtype", "arg_id_to_descr"])
+    init_arg_names = ("subkernel", "arg_id_to_dtype", "arg_id_to_descr")
+    hash_fields = fields
+
+    def __init__(self, subkernel, arg_id_to_dtype=None,
+            arg_id_to_descr=None):
+        assert isinstance(subkernel, LoopKernel)
+
+        super(CallableKernel, self).__init__(
+                arg_id_to_dtype=arg_id_to_dtype,
+                arg_id_to_descr=arg_id_to_descr)
+
+        self.subkernel = subkernel.copy(
+                args=[arg.copy(dtype=arg.dtype.with_target(subkernel.target))
+                    if arg.dtype is not None else arg for arg in subkernel.args])
+
+    def __getinitargs__(self):
+        return (self.subkernel, self.arg_id_to_dtype,
+                self.arg_id_to_descr)
+
+    @property
+    def name(self):
+        return self.subkernel.name
+
+    def with_types(self, arg_id_to_dtype, caller_kernel,
+            program_callables_info):
+        kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)
+
+        new_args = []
+        for arg in self.subkernel.args:
+            kw = arg.name
+            if kw in arg_id_to_dtype:
+                # id exists as kw
+                new_args.append(arg.copy(dtype=arg_id_to_dtype[kw]))
+            elif kw_to_pos[kw] in arg_id_to_dtype:
+                # id exists as positional argument
+                new_args.append(arg.copy(
+                    dtype=arg_id_to_dtype[kw_to_pos[kw]]))
+            else:
+                new_args.append(arg)
+
+        from loopy.type_inference import (
+                infer_unknown_types_for_a_single_kernel)
+        pre_specialized_subkernel = self.subkernel.copy(
+                args=new_args)
+
+        # infer the types of the written variables based on the knowledge
+        # of the types of the arguments supplied
+        specialized_kernel, program_callables_info = (
+                infer_unknown_types_for_a_single_kernel(
+                    pre_specialized_subkernel,
+                    program_callables_info,
+                    expect_completion=True))
+
+        new_arg_id_to_dtype = {}
+        for arg in specialized_kernel.args:
+            # associate the updated_arg_id_to_dtype with keyword as well as
+            # positional id.
+            new_arg_id_to_dtype[arg.name] = arg.dtype
+            new_arg_id_to_dtype[kw_to_pos[arg.name]] = arg.dtype
+
+        # Return the kernel call with specialized subkernel and the corresponding
+        # new arg_id_to_dtype
+        return self.copy(subkernel=specialized_kernel,
+                arg_id_to_dtype=new_arg_id_to_dtype), program_callables_info
+
+    def with_descrs(self, arg_id_to_descr, program_callables_info):
+
+        # tune the subkernel so that we have the matching shapes and
+        # dim_tags
+
+        new_args = self.subkernel.args[:]
+        kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)
+
+        for arg_id, descr in arg_id_to_descr.items():
+            if isinstance(arg_id, int):
+                arg_id = pos_to_kw[arg_id]
+            assert isinstance(arg_id, str)
+
+            if isinstance(descr, ArrayArgDescriptor):
+                new_arg = self.subkernel.arg_dict[arg_id].copy(
+                        shape=descr.shape,
+                        dim_tags=descr.dim_tags,
+                        address_space=descr.address_space)
+                # replacing the new arg with the arg of the same name
+                new_args = [new_arg if arg.name == arg_id else arg for arg in
+                        new_args]
+            elif isinstance(descr, ValueArgDescriptor):
+                pass
+            else:
+                raise LoopyError("Descriptor must be either an instance of "
+                        "ArrayArgDescriptor or ValueArgDescriptor -- got %s." %
+                        type(descr))
+        descriptor_specialized_knl = self.subkernel.copy(args=new_args)
+        from loopy.preprocess import traverse_to_infer_arg_descr
+        descriptor_specialized_knl, program_callables_info = (
+                traverse_to_infer_arg_descr(descriptor_specialized_knl,
+                    program_callables_info))
+
+        return (
+                self.copy(
+                    subkernel=descriptor_specialized_knl,
+                    arg_id_to_descr=arg_id_to_descr),
+                program_callables_info)
+
+    def with_packing_for_args(self):
+        from loopy.kernel.data import AddressSpace
+        kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)
+
+        arg_id_to_descr = {}
+
+        for pos, kw in pos_to_kw.items():
+            arg = self.subkernel.arg_dict[kw]
+            arg_id_to_descr[pos] = ArrayArgDescriptor(
+                    shape=arg.shape,
+                    dim_tags=arg.dim_tags,
+                    address_space=AddressSpace.GLOBAL)
+
+        return self.copy(subkernel=self.subkernel,
+                arg_id_to_descr=arg_id_to_descr)
+
+    def with_hw_axes_sizes(self, gsize, lsize):
+        return self.copy(
+                subkernel=self.subkernel.copy(
+                    overridden_get_grid_sizes_for_insn_ids=(
+                        GridOverrideForCalleeKernel(lsize, gsize))))
+
+    def is_ready_for_codegen(self):
+        return (self.arg_id_to_dtype is not None and
+                self.arg_id_to_descr is not None)
+
+    def generate_preambles(self, target):
+        """ Yields the *target* specific preambles.
+        """
+        # FIXME Check that this is correct.
+
+        return
+        yield
+
+    def emit_call_insn(self, insn, target, expression_to_code_mapper):
+
+        assert self.is_ready_for_codegen()
+
+        from loopy.kernel.instruction import CallInstruction
+        from pymbolic.primitives import CallWithKwargs
+
+        assert isinstance(insn, CallInstruction)
+
+        parameters = insn.expression.parameters
+        kw_parameters = {}
+        if isinstance(insn.expression, CallWithKwargs):
+            kw_parameters = insn.expression.kw_parameters
+
+        assignees = insn.assignees
+
+        parameters = list(parameters)
+        par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)]
+        kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)
+        for i in range(len(parameters), len(parameters)+len(kw_parameters)):
+            parameters.append(kw_parameters[pos_to_kw[i]])
+            par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]])
+
+        # insert the assigness at the required positions
+        assignee_write_count = -1
+        for i, arg in enumerate(self.subkernel.args):
+            if arg.is_output_only:
+                assignee = assignees[-assignee_write_count-1]
+                parameters.insert(i, assignee)
+                par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count])
+                assignee_write_count -= 1
+
+        # no type casting in array calls
+        from loopy.expression import dtype_to_type_context
+        from pymbolic.mapper.stringifier import PREC_NONE
+        from loopy.symbolic import SubArrayRef
+        from pymbolic import var
+
+        c_parameters = [
+                expression_to_code_mapper(par, PREC_NONE,
+                    dtype_to_type_context(target, par_dtype),
+                    par_dtype).expr if isinstance(par, SubArrayRef) else
+                expression_to_code_mapper(par, PREC_NONE,
+                    dtype_to_type_context(target, par_dtype),
+                    par_dtype).expr
+                for par, par_dtype in zip(
+                    parameters, par_dtypes)]
+
+        return var(self.subkernel.name)(*c_parameters), False
+
+# }}}
+
+
+# {{{ mangler callable
+
+class ManglerCallable(ScalarCallable):
+    """
+    A callable whose characteristic is defined by a function mangler.
+
+    .. attribute:: function_mangler
+
+        A function of signature ``(kernel, name, arg_dtypes)`` that returns an
+        instance of ``loopy.CallMangleInfo``.
+    """
+    fields = set(["name", "function_mangler", "arg_id_to_dtype", "arg_id_to_descr",
+        "name_in_target"])
+    init_arg_names = ("name", "function_mangler", "arg_id_to_dtype",
+            "arg_id_to_descr", "name_in_target")
+    hash_fields = set(["name", "arg_id_to_dtype", "arg_id_to_descr",
+        "name_in_target"])
+
+    def __init__(self, name, function_mangler, arg_id_to_dtype=None,
+            arg_id_to_descr=None, name_in_target=None):
+
+        self.function_mangler = function_mangler
+
+        super(ManglerCallable, self).__init__(
+                name=name,
+                arg_id_to_dtype=arg_id_to_dtype,
+                arg_id_to_descr=arg_id_to_descr,
+                name_in_target=name_in_target)
+
+    def __getinitargs__(self):
+        return (self.name, self.function_mangler, self.arg_id_to_dtype,
+                self.arg_id_to_descr, self.name_in_target)
+
+    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+        if self.arg_id_to_dtype is not None:
+            # specializing an already specialized function.
+            for arg_id, dtype in arg_id_to_dtype.items():
+                # only the ids which have been provided are checked;
+                # if one of them does not match, an error is raised.
+                if self.arg_id_to_dtype[arg_id] != arg_id_to_dtype[arg_id]:
+                    raise LoopyError("Overwriting a specialized"
+                            " function is illegal--maybe start with new instance of"
+                            " ManglerCallable?")
+
+        sorted_keys = sorted(arg_id_to_dtype.keys())
+        arg_dtypes = tuple(arg_id_to_dtype[key] for key in sorted_keys if
+                key >= 0)
+
+        mangle_result = self.function_mangler(kernel, self.name,
+                arg_dtypes)
+        if mangle_result:
+            new_arg_id_to_dtype = dict(enumerate(mangle_result.arg_dtypes))
+            new_arg_id_to_dtype.update(dict((-i-1, dtype) for i, dtype in
+                enumerate(mangle_result.result_dtypes)))
+            return (
+                    self.copy(name_in_target=mangle_result.target_name,
+                        arg_id_to_dtype=new_arg_id_to_dtype),
+                    program_callables_info)
+        else:
+            # The function mangler does not agree with the arg id to dtypes
+            # provided. Indicating that is illegal.
+            raise LoopyError("Function %s not coherent with the provided types."
+                    % self.name)
+
+    def mangle_result(self, kernel):
+        """
+        Returns an instance of :class:`loopy.kernel.data.CallMangleInfo` for
+        the given pair :attr:`function_mangler` and :attr:`arg_id_to_dtype`.
+        """
+        sorted_keys = sorted(self.arg_id_to_dtype.keys())
+        arg_dtypes = tuple(self.arg_id_to_dtype[key] for key in sorted_keys if
+                key >= 0)
+
+        return self.function_mangler(kernel, self.name, arg_dtypes)
+
+# }}}
+
+
+# {{{ new pymbolic calls to scoped functions
+
+def next_indexed_variable(function):
+    """
+    Returns an instance of :class:`str` with the next indexed-name in the
+    sequence for the name of *function*.
+
+    *Example:* ``Variable('sin_0')`` will return ``'sin_1'``.
+
+    :arg function: Either an instance of :class:`pymbolic.primitives.Variable`
+        or :class:`loopy.library.reduction.ArgExtOp` or
+        :class:`loopy.library.reduction.SegmentedOp`.
+    """
+    from loopy.library.reduction import ArgExtOp, SegmentedOp
+    if isinstance(function, (ArgExtOp, SegmentedOp)):
+        return function.copy()
+    func_name = re.compile(r"^(?P<alpha>\S+?)_(?P<num>\d+?)$")
+
+    match = func_name.match(function.name)
+
+    if match is None:
+        if function.name[-1] == '_':
+            return "{old_name}0".format(old_name=function.name)
+        else:
+            return "{old_name}_0".format(old_name=function.name)
+
+    return "{alpha}_{num}".format(alpha=match.group('alpha'),
+            num=int(match.group('num'))+1)
+
+
+class FunctionNameChanger(RuleAwareIdentityMapper):
+    """
+    Changes the names of scoped functions in calls of expressions according to
+    the mapping ``calls_to_new_names``
+    """
+
+    def __init__(self, rule_mapping_context, calls_to_new_names,
+            subst_expander):
+        super(FunctionNameChanger, self).__init__(rule_mapping_context)
+        self.calls_to_new_names = calls_to_new_names
+        self.subst_expander = subst_expander
+
+    def map_call(self, expr, expn_state):
+        name, tag = parse_tagged_name(expr.function)
+
+        if name not in self.rule_mapping_context.old_subst_rules:
+            expanded_expr = self.subst_expander(expr)
+            if expr in self.calls_to_new_names:
+                return type(expr)(
+                        ResolvedFunction(self.calls_to_new_names[expr]),
+                        tuple(self.rec(child, expn_state)
+                            for child in expr.parameters))
+            elif expanded_expr in self.calls_to_new_names:
+                # FIXME: this is horribly wrong logic.
+                # investigate how to make edits to a substitution rule
+                return type(expr)(
+                        ResolvedFunction(self.calls_to_new_names[expanded_expr]),
+                        tuple(self.rec(child, expn_state)
+                            for child in expanded_expr.parameters))
+            else:
+                return super(FunctionNameChanger, self).map_call(
+                        expr, expn_state)
+        else:
+            return self.map_substitution(name, tag, expr.parameters, expn_state)
+
+    def map_call_with_kwargs(self, expr, expn_state):
+
+        if expr in self.calls_to_new_names:
+            return type(expr)(
+                ResolvedFunction(self.calls_to_new_names[expr]),
+                tuple(self.rec(child, expn_state)
+                    for child in expr.parameters),
+                dict(
+                    (key, self.rec(val, expn_state))
+                    for key, val in six.iteritems(expr.kw_parameters))
+                    )
+        else:
+            return super(FunctionNameChanger, self).map_call_with_kwargs(
+                    expr, expn_state)
+
+
+def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names):
+    rule_mapping_context = SubstitutionRuleMappingContext(
+                    kernel.substitutions, kernel.get_var_name_generator())
+    subst_expander = SubstitutionRuleExpander(kernel.substitutions)
+    name_changer = FunctionNameChanger(rule_mapping_context,
+            pymbolic_calls_to_new_names, subst_expander)
+
+    return rule_mapping_context.finish_kernel(
+            name_changer.map_kernel(kernel))
+
+# }}}
+
+
+# vim: foldmethod=marker
diff --git a/loopy/program.py b/loopy/program.py
new file mode 100644
index 00000000..096bd1ec
--- /dev/null
+++ b/loopy/program.py
@@ -0,0 +1,684 @@
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import six
+import re
+
+from pytools import ImmutableRecord, memoize_method
+from pymbolic.primitives import Variable
+from functools import wraps
+
+from loopy.symbolic import RuleAwareIdentityMapper, ResolvedFunction
+from loopy.kernel.function_interface import (
+        CallableKernel, ScalarCallable)
+from loopy.diagnostic import LoopyError
+
+from loopy.kernel import LoopKernel
+
+
+class ResolvedFunctionMarker(RuleAwareIdentityMapper):
+    """
+    Mapper that replaces the ``function`` attribute of every
+    :class:`pymbolic.primitives.Call` known in the kernel by an instance of
+    :class:`loopy.symbolic.ResolvedFunction`. A function is known if one of
+    the *function_id_to_in_knl_callable_mappers* returns a callable for it.
+
+    **Example:** Given ``sin(x) + unknown_function(y) + log(z)``, the mapper
+    returns ``ResolvedFunction('sin')(x) + unknown_function(y) +
+    ResolvedFunction('log')(z)``.
+
+    :arg rule_mapping_context: An instance of
+        :class:`loopy.symbolic.RuleMappingContext`.
+    :arg kernel: Kernel whose ``target`` is handed to the mappers.
+    :arg program_callables_info: Updated (via ``with_callable``) on each hit.
+    :arg function_id_to_in_knl_callable_mappers: Iterable of callables taking
+        ``(target, identifier)``, returning an in-kernel callable or *None*.
+    """
+    def __init__(self, rule_mapping_context, kernel, program_callables_info,
+            function_id_to_in_knl_callable_mappers):
+        super(ResolvedFunctionMarker, self).__init__(rule_mapping_context)
+        self.kernel = kernel
+        self.program_callables_info = program_callables_info
+        # FIXME: function_resolvers looks like a very bad name, change it
+        self.function_id_to_in_knl_callable_mappers = (
+                function_id_to_in_knl_callable_mappers)
+
+    def find_in_knl_callable_from_identifier(self, identifier):
+        """
+        Return the result of the first mapper in
+        ``self.function_id_to_in_knl_callable_mappers`` that yields something
+        other than *None* for ``(self.kernel.target, identifier)``; return
+        *None* if no mapper does.
+        """
+        # Mappers are tried in order; the first hit wins.
+        for func_id_to_in_knl_callable_mapper in (
+                self.function_id_to_in_knl_callable_mappers):
+            # FIXME: do we really need to pass the target to the mapper?
+            in_knl_callable = func_id_to_in_knl_callable_mapper(
+                    self.kernel.target, identifier)
+            if in_knl_callable is not None:
+                return in_knl_callable
+
+        return None
+
+    def map_call(self, expr, expn_state):
+        from pymbolic.primitives import Call, CallWithKwargs
+        from loopy.symbolic import parse_tagged_name
+        # Non-rule calls are re-dispatched as CallWithKwargs with empty kwargs.
+        name, tag = parse_tagged_name(expr.function)
+        if name not in self.rule_mapping_context.old_subst_rules:
+            new_call_with_kwargs = self.rec(CallWithKwargs(
+                function=expr.function, parameters=expr.parameters,
+                kw_parameters={}), expn_state)
+            return Call(new_call_with_kwargs.function,
+                    new_call_with_kwargs.parameters)
+        else:
+            return self.map_substitution(name, tag, expr.parameters, expn_state)
+
+    def map_call_with_kwargs(self, expr, expn_state):
+        """Resolve ``expr.function`` if some mapper knows its name."""
+        if not isinstance(expr.function, ResolvedFunction):
+
+            # search the kernel for the function.
+            in_knl_callable = self.find_in_knl_callable_from_identifier(
+                    expr.function.name)
+
+            if in_knl_callable:
+                # associate the newly created ResolvedFunction with the
+                # resolved in-kernel callable
+
+                self.program_callables_info, new_func_id = (
+                        self.program_callables_info.with_callable(expr.function,
+                            in_knl_callable, True))
+                return type(expr)(
+                        ResolvedFunction(new_func_id),
+                        tuple(self.rec(child, expn_state)
+                            for child in expr.parameters),
+                        dict(
+                            (key, self.rec(val, expn_state))
+                            for key, val in six.iteritems(expr.kw_parameters))
+                            )
+
+        # already resolved or (as of yet) unknown: defer to the parent mapper
+        return super(ResolvedFunctionMarker, self).map_call_with_kwargs(expr,
+                expn_state)
+
+    def map_reduction(self, expr, expn_state):
+        for func_id in (
+                expr.operation.get_scalar_callables()):
+            in_knl_callable = self.find_in_knl_callable_from_identifier(func_id)
+            assert in_knl_callable is not None
+            self.program_callables_info, _ = (
+                    self.program_callables_info.with_callable(func_id,
+                        in_knl_callable, True))
+        return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state)
+
+
+def initialize_program_callables_info_from_kernel(
+        kernel, func_id_to_kernel_callable_mappers):
+    """
+    Return a :class:`ProgramCallablesInfo` holding the callables resolved in
+    *kernel* by *func_id_to_kernel_callable_mappers*, plus *kernel* itself
+    (with its functions resolved), added for ``Variable(kernel.name)``.
+    """
+    from loopy.symbolic import SubstitutionRuleMappingContext
+
+    callables_info = ProgramCallablesInfo({}).with_edit_callables_mode()
+    subst_context = SubstitutionRuleMappingContext(
+            kernel.substitutions, kernel.get_var_name_generator())
+
+    marker = ResolvedFunctionMarker(
+            subst_context, kernel, callables_info,
+            func_id_to_kernel_callable_mappers)
+
+    # resolve the functions and collect the callables found on the way
+    resolved_kernel = subst_context.finish_kernel(marker.map_kernel(kernel))
+
+    # the kernel itself is added as a callable as well
+    callables_info, _ = marker.program_callables_info.with_callable(
+            Variable(kernel.name), CallableKernel(resolved_kernel), True)
+
+    return callables_info.with_exit_edit_callables_mode()
+
+
+# {{{ program definition
+
+class Program(ImmutableRecord):
+    """Bundles *program_callables_info* with the *name* of its root kernel."""
+
+    def __init__(self, name, program_callables_info, target,
+            func_id_to_in_knl_callable_mappers):
+        # *name* has to be a key of *program_callables_info* (asserted below)
+        assert isinstance(program_callables_info, ProgramCallablesInfo)
+
+        # FIXME: check if all sanity checks have been covered?
+        # FIXME: The comments over here may need some attention.
+        assert name in program_callables_info
+
+        super(Program, self).__init__(
+                name=name,
+                program_callables_info=program_callables_info,
+                target=target,
+                func_id_to_in_knl_callable_mappers=(
+                    func_id_to_in_knl_callable_mappers))
+
+        self._program_executor_cache = {}
+
+    hash_fields = (
+            "name",
+            "program_callables_info",
+            "target",)
+
+    update_persistent_hash = LoopKernel.update_persistent_hash
+
+    def copy(self, **kwargs):
+        if 'target' in kwargs:  # propagate the new target to all subkernels
+            target = kwargs['target']
+            new_self = super(Program, self).copy(**kwargs)
+            new_resolved_functions = {}
+            for func_id, in_knl_callable in (
+                    new_self.program_callables_info.items()):
+                if isinstance(in_knl_callable, CallableKernel):
+                    subkernel = in_knl_callable.subkernel
+                    new_resolved_functions[func_id] = in_knl_callable.copy(
+                            subkernel=subkernel.copy(target=target))
+                else:
+                    new_resolved_functions[func_id] = in_knl_callable
+
+            program_callables_info = new_self.program_callables_info.copy(
+                    resolved_functions=new_resolved_functions)
+            # bypass this override: 'target' is not among the kwargs anymore
+            return super(Program, new_self).copy(
+                    program_callables_info=program_callables_info)
+        else:
+            return super(Program, self).copy(**kwargs)
+
+    def get_grid_size_upper_bounds(self, ignore_auto=False):
+        """Return a tuple (global_size, local_size) containing a grid that
+        could accommodate execution of *all* instructions in the kernel.
+
+        *global_size* and *local_size* are :class:`islpy.PwAff` objects.
+        """
+        return self.root_kernel.get_grid_size_upper_bounds(
+                self.program_callables_info,
+                ignore_auto=ignore_auto)
+
+    def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False):
+        """Return a tuple (global_size, local_size) containing a grid that
+        could accommodate execution of *all* instructions in the kernel.
+
+        *global_size* and *local_size* are :mod:`pymbolic` expressions
+        """
+        return self.root_kernel.get_grid_size_upper_bounds_as_exprs(
+                self.program_callables_info,
+                ignore_auto=ignore_auto)
+
+    # {{{ implementation arguments
+
+    @property
+    @memoize_method
+    def impl_arg_to_arg(self):
+        """Map argument names (or sub-argument names, if any) to arguments."""
+        from loopy.kernel.array import ArrayBase
+
+        result = {}
+        for arg in self.args:
+            if not isinstance(arg, ArrayBase):
+                result[arg.name] = arg
+                continue
+
+            if arg.shape is None or arg.dim_tags is None:
+                result[arg.name] = arg
+                continue
+
+            subscripts_and_names = arg.subscripts_and_names()
+            if subscripts_and_names is None:
+                result[arg.name] = arg
+                continue
+
+            for _, sub_arg_name in subscripts_and_names:
+                result[sub_arg_name] = arg
+
+        return result
+
+    # }}}
+
+    @property
+    def root_kernel(self):
+        return self.program_callables_info[self.name].subkernel
+
+    @property
+    def arg_dict(self):
+        return self.root_kernel.arg_dict
+
+    def with_root_kernel(self, root_kernel):
+        new_in_knl_callable = self.program_callables_info[
+                self.name].copy(subkernel=root_kernel)
+        new_resolved_functions = (
+                self.program_callables_info.resolved_functions.copy())
+        new_resolved_functions[self.name] = new_in_knl_callable
+
+        return self.copy(
+                program_callables_info=self.program_callables_info.copy(
+                    resolved_functions=new_resolved_functions))
+
+    @property
+    def args(self):
+        return self.root_kernel.args[:]
+
+    def __call__(self, *args, **kwargs):
+        key = self.target.get_kernel_executor_cache_key(*args, **kwargs)
+        try:
+            pex = self._program_executor_cache[key]
+        except KeyError:
+            pex = self.target.get_kernel_executor(self, *args, **kwargs)
+            self._program_executor_cache[key] = pex
+
+        return pex(*args, **kwargs)
+
+    def __str__(self):
+        """Return the root kernel's text plus the resolved function ids."""
+        # FIXME: make this better
+        # NOTE: must stay free of side effects -- no printing from here.
+        return (
+                str(self.root_kernel) +
+                '\nResolved Functions: ' +
+                str(self.program_callables_info.resolved_functions.keys()) +
+                '\n' + 75*'-' + '\n')
+
+# }}}
+
+
+def next_indexed_function_identifier(function):
+    """
+    Return the identifier following *function* in the sequence ``f``,
+    ``f_0``, ``f_1``, ... as a :class:`str`. Reduction operations are not
+    renamed; a copy of the operation is returned for them instead.
+
+    *Example:* ``Variable('sin_0')`` yields ``'sin_1'``.
+
+    :arg function: A :class:`str`, a :class:`pymbolic.primitives.Variable`,
+        a :class:`loopy.library.reduction.ArgExtOp` or a
+        :class:`loopy.library.reduction.SegmentedOp`.
+    """
+    from loopy.library.reduction import ArgExtOp, SegmentedOp
+
+    if isinstance(function, (ArgExtOp, SegmentedOp)):
+        return function.copy()
+    if isinstance(function, str):
+        function = Variable(function)
+
+    assert isinstance(function, Variable)
+    current_name = function.name
+
+    indexed = re.match(r"^(?P<alpha>\S+?)_(?P<num>\d+?)$", current_name)
+    if indexed is not None:
+        return "{alpha}_{num}".format(alpha=indexed.group('alpha'),
+                num=int(indexed.group('num'))+1)
+
+    # un-indexed so far: start counting at 0, never doubling a trailing '_'
+    template = "{old_name}0" if current_name[-1] == '_' else "{old_name}_0"
+    return template.format(old_name=current_name)
+
+
+class ResolvedFunctionRenamer(RuleAwareIdentityMapper):
+    """Mapper renaming resolved functions according to *renaming_dict*."""
+    def __init__(self, rule_mapping_context, renaming_dict):
+        super(ResolvedFunctionRenamer, self).__init__(rule_mapping_context)
+        self.renaming_dict = renaming_dict
+
+    def map_resolved_function(self, expr, expn_state):
+        if expr.name not in self.renaming_dict:
+            return super(ResolvedFunctionRenamer, self).map_resolved_function(
+                    expr, expn_state)
+
+        return ResolvedFunction(self.renaming_dict[expr.name])
+
+
+def rename_resolved_functions_in_a_single_kernel(kernel,
+        renaming_dict):
+    """Return *kernel* with resolved functions renamed per *renaming_dict*."""
+    from loopy.symbolic import SubstitutionRuleMappingContext
+
+    context = SubstitutionRuleMappingContext(
+            kernel.substitutions, kernel.get_var_name_generator())
+    renamer = ResolvedFunctionRenamer(context, renaming_dict)
+
+    return context.finish_kernel(renamer.map_kernel(kernel))
+
+
+# {{{ program callables info
+
+class ProgramCallablesInfo(ImmutableRecord):
+    # FIXME: don't evaluate num_times_called, rather compute it from the
+    # resolved_functions
+    # FIXME: make the edit callables thing a ContextManager.
+    def __init__(self, resolved_functions, num_times_callables_called=None,
+            history=None, is_being_edited=False,
+            num_times_hit_during_editing=None,
+            renames_needed_after_editing=None):
+        if num_times_callables_called is None:
+            num_times_callables_called = dict.fromkeys(resolved_functions, 1)
+        if history is None:
+            history = dict((fid, set([fid])) for fid in resolved_functions)
+        # fresh dicts instead of mutable default arguments shared by instances
+        num_times_hit_during_editing = num_times_hit_during_editing or {}
+        renames_needed_after_editing = renames_needed_after_editing or {}
+
+        super(ProgramCallablesInfo, self).__init__(
+                resolved_functions=resolved_functions,
+                num_times_callables_called=num_times_callables_called,
+                history=history,
+                is_being_edited=is_being_edited,
+                num_times_hit_during_editing=num_times_hit_during_editing,
+                renames_needed_after_editing=renames_needed_after_editing)
+
+    hash_fields = (
+            "resolved_functions",
+            "num_times_callables_called",
+            "is_being_edited",
+            "num_times_hit_during_editing",
+            "renames_needed_after_editing",
+            "history")
+
+    update_persistent_hash = LoopKernel.update_persistent_hash
+
+    def with_edit_callables_mode(self):
+        """Return a copy in edit mode with every editing hit count reset to 0."""
+        return self.copy(is_being_edited=True, num_times_hit_during_editing=(
+            dict.fromkeys(self.resolved_functions, 0)))
+
+    def with_callable(self, function, in_kernel_callable,
+            resolved_for_the_first_time=False):
+        """
+        :arg function: A :class:`str`, :class:`pymbolic.primitives.Variable`,
+            or a reduction operation (``ArgExtOp`` or ``SegmentedOp``).
+        :arg in_kernel_callable: An instance of :class:`loopy.InKernelCallable`.
+        :returns: A tuple of the updated copy of *self* and the identifier
+            under which *in_kernel_callable* is registered in it.
+
+        .. note::
+
+            Assumes that each callable is touched at most once, the internal
+            working of this function fails if that is violated.
+        """
+        # FIXME: add a note about using enter and exit. ~KK
+        # FIXME: think about a better idea of "with_added_callable" this would
+        # be more convenient for developer-faced usage. ~KK
+
+        if not self.is_being_edited:
+            # outside edit mode only a no-op re-registration is permitted
+            func_id = getattr(function, "name", function)
+            if func_id in self.resolved_functions and (
+                    self.resolved_functions[func_id] == in_kernel_callable):
+                return self, function
+            raise LoopyError(
+                    "Use 'with_edit_callables_mode' before editing callables.")
+
+        from loopy.library.reduction import ArgExtOp, SegmentedOp
+
+        # {{{ sanity checks
+
+        if isinstance(function, str):
+            function = Variable(function)
+
+        assert isinstance(function, (Variable, ArgExtOp, SegmentedOp))
+
+        # }}}
+
+        renames_needed_after_editing = self.renames_needed_after_editing.copy()
+        num_times_hit_during_editing = self.num_times_hit_during_editing.copy()
+        num_times_callables_called = self.num_times_callables_called.copy()
+        history = self.history.copy()
+
+        if not resolved_for_the_first_time:
+            if isinstance(function, (ArgExtOp, SegmentedOp)):
+                num_times_hit_during_editing[function] += 1
+            else:
+                num_times_hit_during_editing[function.name] += 1
+
+        if isinstance(function, (ArgExtOp, SegmentedOp)):
+            unique_function_identifier = function.copy()
+            if not resolved_for_the_first_time:
+                num_times_callables_called[function] -= 1
+
+            num_times_callables_called[unique_function_identifier] = 1
+
+            updated_resolved_functions = self.resolved_functions.copy()
+            updated_resolved_functions[unique_function_identifier] = (
+                    in_kernel_callable)
+
+            return (
+                    self.copy(
+                        resolved_functions=updated_resolved_functions,
+                        num_times_callables_called=num_times_callables_called,
+                        num_times_hit_during_editing=(
+                            num_times_hit_during_editing),
+                        renames_needed_after_editing=(
+                            renames_needed_after_editing)),
+                    unique_function_identifier)
+
+        if in_kernel_callable in self.resolved_functions.values():
+            # the callable already exists, implies return the function
+            # identifier corresponding to that callable.
+            for func_id, in_knl_callable in self.resolved_functions.items():
+                if in_knl_callable == in_kernel_callable:
+                    num_times_callables_called[func_id] += 1
+                    if not resolved_for_the_first_time:
+                        num_times_callables_called[function.name] -= 1
+                        if num_times_callables_called[function.name] == 0:
+                            renames_needed_after_editing[func_id] = function.name
+
+                        history[func_id] = history[func_id] | set([function.name])
+                    return (
+                            self.copy(
+                                history=history,
+                                num_times_hit_during_editing=(
+                                    num_times_hit_during_editing),
+                                num_times_callables_called=(
+                                    num_times_callables_called),
+                                renames_needed_after_editing=(
+                                    renames_needed_after_editing)),
+                            func_id)
+        else:
+            # FIXME: maybe deal with the history over here?
+            # FIXME: once the code logic is running beautify this part.
+            # many "ifs" can be avoided
+            unique_function_identifier = function.name
+            if (resolved_for_the_first_time or
+                    self.num_times_callables_called[function.name] > 1):
+                while unique_function_identifier in self.resolved_functions:
+                    unique_function_identifier = (
+                            next_indexed_function_identifier(
+                                unique_function_identifier))
+
+            if not resolved_for_the_first_time:
+                num_times_callables_called[function.name] -= 1
+
+            num_times_callables_called[unique_function_identifier] = 1
+
+        updated_resolved_functions = self.resolved_functions.copy()
+        updated_resolved_functions[unique_function_identifier] = (
+                in_kernel_callable)
+
+        if not resolved_for_the_first_time:
+            history[unique_function_identifier] = (
+                    history[function.name] | set([unique_function_identifier]))
+        else:
+            history[unique_function_identifier] = set(
+                    [unique_function_identifier])
+
+        return (
+                self.copy(
+                    history=history,
+                    resolved_functions=updated_resolved_functions,
+                    num_times_callables_called=num_times_callables_called,
+                    num_times_hit_during_editing=num_times_hit_during_editing,
+                    renames_needed_after_editing=renames_needed_after_editing),
+                Variable(unique_function_identifier))
+
+    def with_exit_edit_callables_mode(self):
+        """Leave edit mode, applying the renames recorded while editing."""
+        assert self.is_being_edited
+        num_times_callables_called = {}
+        resolved_functions = {}
+        history = self.history.copy()
+        # NOTE(review): *history* is edited below but never passed to copy().
+        for func_id, in_knl_callable in self.resolved_functions.items():
+            if isinstance(in_knl_callable, CallableKernel):
+                old_subkernel = in_knl_callable.subkernel
+                new_subkernel = rename_resolved_functions_in_a_single_kernel(
+                        old_subkernel, self.renames_needed_after_editing)
+                in_knl_callable = (
+                        in_knl_callable.copy(subkernel=new_subkernel))
+            elif isinstance(in_knl_callable, ScalarCallable):
+                pass
+            else:
+                raise NotImplementedError("Unknown callable type %s." %
+                        type(in_knl_callable).__name__)
+
+            if func_id in self.renames_needed_after_editing:
+                history.pop(func_id)
+
+                new_func_id = self.renames_needed_after_editing[func_id]
+                resolved_functions[new_func_id] = (
+                        in_knl_callable)
+                num_times_callables_called[new_func_id] = (
+                        self.num_times_callables_called[func_id])
+
+            else:
+                resolved_functions[func_id] = in_knl_callable
+                num_times_callables_called[func_id] = (
+                        self.num_times_callables_called[func_id])
+
+        return self.copy(
+                is_being_edited=False,
+                resolved_functions=resolved_functions,
+                num_times_callables_called=num_times_callables_called,
+                num_times_hit_during_editing={},
+                renames_needed_after_editing={})
+
+    def with_deleted_callable(self, func_id, instances=1):
+        """Drop *instances* uses of *func_id*, removing it once unused."""
+        num_times_callables_called = self.num_times_callables_called.copy()
+        history = self.history.copy()
+        resolved_functions = self.resolved_functions.copy()
+        assert instances <= num_times_callables_called[func_id]
+
+        num_times_callables_called[func_id] -= instances
+
+        if num_times_callables_called[func_id] == 0:
+            num_times_callables_called.pop(func_id)
+            history.pop(func_id)
+            resolved_functions.pop(func_id)
+
+        return self.copy(
+                resolved_functions=resolved_functions,
+                num_times_callables_called=num_times_callables_called,
+                history=history)
+
+    def __getitem__(self, item):
+        return self.resolved_functions[item]
+
+    def __contains__(self, item):
+        return item in self.resolved_functions
+
+    def items(self):
+        return self.resolved_functions.items()
+
+    def values(self):
+        return self.resolved_functions.values()
+
+
+# }}}
+
+
+def default_func_id_to_kernel_callable_mappers(target):
+    """Return loopy's own function scopers followed by those of *target*."""
+    from loopy.library.function import loopy_specific_callable_scopers
+    return (
+            [loopy_specific_callable_scopers] + (
+                target.get_device_ast_builder().function_scopers()))
+
+
+def make_program_from_kernel(kernel):
+    """Wrap *kernel* in a :class:`Program` with the default function mappers."""
+
+    # computed once so that resolution and the Program share the same mappers
+    mappers = default_func_id_to_kernel_callable_mappers(kernel.target)
+    program_callables_info = initialize_program_callables_info_from_kernel(
+            kernel, mappers)
+
+    return Program(
+            name=kernel.name,
+            program_callables_info=program_callables_info,
+            func_id_to_in_knl_callable_mappers=mappers,
+            target=kernel.target)
+
+
+def iterate_over_kernels_if_given_program(transform_for_single_kernel):
+    """
+    Decorator: make a kernel transformation also accept a :class:`Program`,
+    in which case it is applied to the subkernel of every callable kernel.
+    """
+    def _collective_transform(program_or_kernel, *args, **kwargs):
+        if not isinstance(program_or_kernel, Program):
+            assert isinstance(program_or_kernel, LoopKernel)
+            return transform_for_single_kernel(
+                    program_or_kernel, *args, **kwargs)
+
+        callables_info = program_or_kernel.program_callables_info
+        transformed = {}
+        for func_id, clbl in callables_info.items():
+            if isinstance(clbl, CallableKernel):
+                clbl = clbl.copy(
+                        subkernel=transform_for_single_kernel(
+                            clbl.subkernel, *args, **kwargs))
+            elif not isinstance(clbl, ScalarCallable):
+                raise NotImplementedError("Unknown type of callable %s." % (
+                    type(clbl).__name__))
+
+            transformed[func_id] = clbl
+
+        return program_or_kernel.copy(
+                program_callables_info=callables_info.copy(
+                    resolved_functions=transformed))
+
+    return wraps(transform_for_single_kernel)(_collective_transform)
+
+
+# {{{ ignoring this for now
+
+# if False and isinstance(function, (ArgExtOp, SegmentedOp)):
+# FIXME: ignoring this case for now
+# FIXME: If a kernel has two flavors of ArgExtOp then they are
+# overwritten and hence not supported.(for now).
+# updated_resolved_functions = self.scoped_functions.copy()
+# updated_resolved_functions[function] = in_kernel_callable
+# return self.copy(updated_resolved_functions), function.copy()
+
+# }}}
+
+
+# vim: foldmethod=marker
diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py
new file mode 100644
index 00000000..b5b80ad8
--- /dev/null
+++ b/loopy/transform/callable.py
@@ -0,0 +1,707 @@
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import six
+
+import islpy as isl
+from pymbolic.primitives import CallWithKwargs
+
+from loopy.kernel import LoopKernel
+from pytools import ImmutableRecord
+from loopy.diagnostic import LoopyError
+from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase,
+        CInstruction, _DataObliviousInstruction)
+from loopy.symbolic import IdentityMapper, SubstitutionMapper
+from loopy.isl_helpers import simplify_via_aff
+from loopy.kernel.function_interface import (get_kw_pos_association,
+        change_names_of_pymbolic_calls, CallableKernel, ScalarCallable)
+from loopy.program import Program, ResolvedFunctionMarker
+
+__doc__ = """
+.. currentmodule:: loopy
+
+.. autofunction:: register_function_id_to_in_knl_callable_mapper
+
+.. autofunction:: register_callable_kernel
+"""
+
+
+# {{{ register function lookup
+
+def resolved_callables_from_function_lookup(program,
+        func_id_to_kernel_callable_mapper):
+    program_callables_info = program.program_callables_info
+    program_callables_info = program_callables_info.with_edit_callables_mode()
+
+    callable_knls = dict(
+            (func_id, in_knl_callable) for func_id, in_knl_callable in
+            program_callables_info.items() if isinstance(in_knl_callable,
+                CallableKernel))
+    edited_callable_knls = {}  # func_id -> callable with re-resolved subkernel
+
+    for func_id, in_knl_callable in callable_knls.items():
+        kernel = in_knl_callable.subkernel
+
+        from loopy.symbolic import SubstitutionRuleMappingContext
+        rule_mapping_context = SubstitutionRuleMappingContext(
+                kernel.substitutions, kernel.get_var_name_generator())
+
+        resolved_function_marker = ResolvedFunctionMarker(
+                rule_mapping_context, kernel, program_callables_info,
+                [func_id_to_kernel_callable_mapper])
+
+        # resolve the functions in *kernel* and collect the resolved callables
+        new_subkernel = rule_mapping_context.finish_kernel(
+                resolved_function_marker.map_kernel(kernel))
+        program_callables_info = resolved_function_marker.program_callables_info
+
+        edited_callable_knls[func_id] = in_knl_callable.copy(
+                subkernel=new_subkernel)
+
+    program_callables_info = (
+            program_callables_info.with_exit_edit_callables_mode())
+
+    new_resolved_functions = {}
+
+    for func_id, in_knl_callable in program_callables_info.items():
+        if func_id in edited_callable_knls:
+            new_resolved_functions[func_id] = edited_callable_knls[func_id]
+        else:
+            new_resolved_functions[func_id] = in_knl_callable
+
+    program_callables_info = program_callables_info.copy(
+            resolved_functions=new_resolved_functions)
+
+    return program.copy(program_callables_info=program_callables_info)
+
+
+def register_function_id_to_in_knl_callable_mapper(program,
+        func_id_to_in_knl_callable_mapper):
+    """
+    Returns a copy of *program* with *func_id_to_in_knl_callable_mapper*
+    registered (at most once) as a function lookup.
+
+    :arg func_id_to_in_knl_callable_mapper: A function of signature ``(target,
+        identifier)`` returning a
+        :class:`loopy.kernel.function_interface.InKernelCallable` or *None* if
+        the *identifier* is not known.
+    """
+
+    # start from the registered mappers so that an already-known mapper is
+    # not appended again (and the name below is always bound).
+    new_func_id_mappers = program.func_id_to_in_knl_callable_mappers
+    if func_id_to_in_knl_callable_mapper not in new_func_id_mappers:
+        from loopy.tools import unpickles_equally
+        if not unpickles_equally(func_id_to_in_knl_callable_mapper):
+            raise LoopyError("function '%s' does not "
+                    "compare equally after being upickled "
+                    "and would disrupt loopy's caches"
+                    % func_id_to_in_knl_callable_mapper)
+        new_func_id_mappers = new_func_id_mappers + (
+                [func_id_to_in_knl_callable_mapper])
+
+    program = resolved_callables_from_function_lookup(program,
+            func_id_to_in_knl_callable_mapper)
+
+    return program.copy(
+            func_id_to_in_knl_callable_mappers=new_func_id_mappers)
+
+# }}}
+
+
+# {{{ register_callable_kernel
+
+class _RegisterCalleeKernel(ImmutableRecord):
+    """
+    Picklable function lookup used by :func:`register_callable_kernel`: maps
+    the callee kernel's name to its callable. (A class is used because Python
+    cannot pickle lexical closures.)
+    """
+    fields = set(['callable_kernel'])
+
+    def __init__(self, callable_kernel):
+        self.callable_kernel = callable_kernel
+
+    def __call__(self, target, identifier):
+        if identifier == self.callable_kernel.subkernel.name:
+            return self.callable_kernel
+        return None  # identifier is not the callee kernel's name
+
+
+def register_callable_kernel(program, callee_kernel):
+    """Returns a copy of *program* in which calls to a function named
+    ``callee_kernel.name`` are resolved as calls to *callee_kernel*.
+
+    :arg program: An instance of :class:`loopy.program.Program`.
+    :arg callee_kernel: An instance of :class:`loopy.kernel.LoopKernel`.
+    :raises LoopyError: on assignee/parameter count mismatch at a call site.
+    """
+
+    # {{{ sanity checks
+
+    assert isinstance(program, Program)
+    assert isinstance(callee_kernel, LoopKernel)
+
+    # check that the number of 'out'-direction arguments of the callee equals
+    # the number of assignees in each instruction that calls the callee.
+    expected_num_assignees = len([arg for arg in callee_kernel.args if
+        arg.is_output_only])
+    expected_num_parameters = len(callee_kernel.args) - expected_num_assignees
+    for in_knl_callable in program.program_callables_info.values():
+        if isinstance(in_knl_callable, CallableKernel):
+            caller_kernel = in_knl_callable.subkernel
+            for insn in caller_kernel.instructions:
+                if isinstance(insn, CallInstruction) and (
+                        insn.expression.function.name == callee_kernel.name):
+                    if isinstance(insn.expression, CallWithKwargs):
+                        kw_parameters = insn.expression.kw_parameters
+                    else:
+                        kw_parameters = {}
+                    if len(insn.assignees) != expected_num_assignees:
+                        raise LoopyError("The number of arguments with 'out' "
+                                "direction " "in callee kernel %s and the number "
+                                "of assignees in " "instruction %s do not "
+                                "match." % (
+                                    callee_kernel.name, insn.id))
+                    if len(insn.expression.parameters+tuple(
+                            kw_parameters.values())) != expected_num_parameters:
+                        raise LoopyError("The number of expected arguments "
+                                "for the callee kernel %s and the number of "
+                                "parameters in instruction %s do not match."
+                                % (callee_kernel.name, insn.id))
+
+                elif isinstance(insn, (MultiAssignmentBase, CInstruction,
+                        _DataObliviousInstruction)):
+                    pass
+                else:
+                    raise NotImplementedError("unknown instruction %s" % type(insn))
+        elif isinstance(in_knl_callable, ScalarCallable):
+            pass
+        else:
+            raise NotImplementedError("Unknown callable type %s." %
+                    type(in_knl_callable).__name__)
+
+    # }}}
+
+    # take the function resolvers from the Program and resolve the functions in
+    # the callee kernel
+    program_callables_info = (
+            program.program_callables_info.with_edit_callables_mode())
+
+    from loopy.symbolic import SubstitutionRuleMappingContext
+    rule_mapping_context = SubstitutionRuleMappingContext(
+            callee_kernel.substitutions,
+            callee_kernel.get_var_name_generator())
+
+    resolved_function_marker = ResolvedFunctionMarker(
+            rule_mapping_context, callee_kernel, program_callables_info,
+            program.func_id_to_in_knl_callable_mappers)
+
+    callee_kernel = rule_mapping_context.finish_kernel(
+            resolved_function_marker.map_kernel(callee_kernel))
+    program_callables_info = resolved_function_marker.program_callables_info
+
+    program_callables_info = (
+            program_callables_info.with_exit_edit_callables_mode())
+    program = program.copy(program_callables_info=program_callables_info)
+
+    # FIXME disabling global barriers for callee kernel (for now). This has to
+    # happen before the kernel is wrapped below, or the option is lost.
+    from loopy import set_options
+    callee_kernel = set_options(callee_kernel, "disable_global_barriers")
+
+    # make the target of the callee kernel the same as the program's target.
+    callable_kernel = CallableKernel(subkernel=callee_kernel.copy(
+                        target=program.target,
+                        is_called_from_host=False))
+
+    # FIXME: the number of callables is wrong. This is horrible please
+    # compensate.
+
+    return register_function_id_to_in_knl_callable_mapper(
+            program,
+            _RegisterCalleeKernel(callable_kernel))
+
+# }}}
+
+
+# {{{ kernel inliner mapper
+
+class KernelInliner(SubstitutionMapper):
+    """Mapper to replace variables (indices, temporaries, arguments) in the
+    callee kernel with variables in the caller kernel.
+
+    :arg caller: the caller kernel
+    :arg arg_map: dict of argument name to variables in caller
+    :arg arg_dict: dict of argument name to arguments in callee
+    """
+
+    def __init__(self, subst_func, caller, arg_map, arg_dict):
+        super(KernelInliner, self).__init__(subst_func)
+        self.caller = caller
+        self.arg_map = arg_map
+        self.arg_dict = arg_dict
+
+    def map_subscript(self, expr):
+        if expr.aggregate.name in self.arg_map:
+
+            aggregate = self.subst_func(expr.aggregate)
+            sar = self.arg_map[expr.aggregate.name]  # SubArrayRef in caller
+            callee_arg = self.arg_dict[expr.aggregate.name]  # Arg in callee
+            if aggregate.name in self.caller.arg_dict:
+                caller_arg = self.caller.arg_dict[aggregate.name]  # Arg in caller
+            else:
+                caller_arg = self.caller.temporary_variables[aggregate.name]
+
+            # Firstly, map inner inames to outer inames.
+            outer_indices = self.map_tuple(expr.index_tuple)
+
+            # Next, reshape to match dimension of outer arrays.
+            # We can have e.g. A[3, 2] from outside and B[6] from inside
+            from numbers import Integral
+            if not all(isinstance(d, Integral) for d in callee_arg.shape):
+                raise LoopyError(
+                    "Argument: {0} in callee kernel does not have "
+                    "constant shape.".format(callee_arg))
+
+            flatten_index = 0  # accumulated (index * stride) offset
+            for i, idx in enumerate(sar.get_begin_subscript().index_tuple):
+                flatten_index += idx*caller_arg.dim_tags[i].stride
+
+            flatten_index += sum(
+                idx * tag.stride
+                for idx, tag in zip(outer_indices, callee_arg.dim_tags))
+
+            from loopy.isl_helpers import simplify_via_aff
+            flatten_index = simplify_via_aff(flatten_index)
+
+            new_indices = []
+            for dim_tag in caller_arg.dim_tags:
+                ind = flatten_index // dim_tag.stride
+                flatten_index -= (dim_tag.stride * ind)
+                new_indices.append(ind)
+
+            new_indices = tuple(simplify_via_aff(i) for i in new_indices)
+
+            return aggregate.index(tuple(new_indices))
+        else:
+            return super(KernelInliner, self).map_subscript(expr)
+
+# }}}
+
+
+# {{{ inlining of a single call instruction
+
+def _inline_call_instruction(caller_kernel, callee_knl, instruction):
+    """
+    Returns a copy of *caller_kernel* with the call *instruction* replaced
+    by the (renamed) instructions, inames and temporaries of *callee_knl*.
+    """
+    callee_label = callee_knl.name[:4] + "_"  # prefix for inlined names
+
+    # {{{ duplicate and rename inames
+
+    vng = caller_kernel.get_var_name_generator()
+    ing = caller_kernel.get_instruction_id_generator()
+    dim_type = isl.dim_type.set
+
+    iname_map = {}
+    for iname in callee_knl.all_inames():
+        iname_map[iname] = vng(callee_label+iname)
+
+    new_domains = []
+    new_iname_to_tags = caller_kernel.iname_to_tags.copy()
+
+    # transferring iname tags info from the callee to the caller kernel
+    for domain in callee_knl.domains:
+        new_domain = domain.copy()
+        for i in range(new_domain.n_dim()):
+            iname = new_domain.get_dim_name(dim_type, i)
+
+            if iname in callee_knl.iname_to_tags:
+                new_iname_to_tags[iname_map[iname]] = (
+                        callee_knl.iname_to_tags[iname])
+            new_domain = new_domain.set_dim_name(
+                dim_type, i, iname_map[iname])
+        new_domains.append(new_domain)
+
+    kernel = caller_kernel.copy(domains=caller_kernel.domains + new_domains,
+            iname_to_tags=new_iname_to_tags)
+
+    # }}}
+
+    # {{{ rename temporaries
+
+    temp_map = {}
+    new_temps = kernel.temporary_variables.copy()
+    for name, temp in six.iteritems(callee_knl.temporary_variables):
+        new_name = vng(callee_label+name)
+        temp_map[name] = new_name
+        new_temps[new_name] = temp.copy(name=new_name)
+
+    kernel = kernel.copy(temporary_variables=new_temps)
+
+    # }}}
+
+    # {{{ match kernel arguments
+
+    arg_map = {}  # callee arg name -> caller symbols (e.g. SubArrayRef)
+
+    assignees = instruction.assignees  # writes
+    parameters = instruction.expression.parameters  # reads
+
+    # add keyword parameters
+    from pymbolic.primitives import CallWithKwargs
+
+    if isinstance(instruction.expression, CallWithKwargs):
+        from loopy.kernel.function_interface import get_kw_pos_association
+
+        _, pos_to_kw = get_kw_pos_association(callee_knl)
+        kw_parameters = instruction.expression.kw_parameters
+        for i in range(len(parameters), len(parameters) + len(kw_parameters)):
+            parameters = parameters + (kw_parameters[pos_to_kw[i]],)
+
+    assignee_pos = 0
+    parameter_pos = 0
+    for i, arg in enumerate(callee_knl.args):
+        if arg.is_output_only:
+            arg_map[arg.name] = assignees[assignee_pos]
+            assignee_pos += 1
+        else:
+            arg_map[arg.name] = parameters[parameter_pos]
+            parameter_pos += 1
+
+    # }}}
+
+    # {{{ rewrite instructions
+
+    import pymbolic.primitives as p
+    from pymbolic.mapper.substitutor import make_subst_func
+
+    var_map = dict((p.Variable(k), p.Variable(v))
+                   for k, v in six.iteritems(iname_map))
+    var_map.update(dict((p.Variable(k), p.Variable(v))
+                        for k, v in six.iteritems(temp_map)))
+    var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name))
+                        for k, v in six.iteritems(arg_map)))
+    subst_mapper = KernelInliner(
+        make_subst_func(var_map), kernel, arg_map, callee_knl.arg_dict)
+
+    insn_id = {}
+    for insn in callee_knl.instructions:
+        insn_id[insn.id] = ing(callee_label+insn.id)
+
+    # {{{ root and leaf instructions in callee kernel
+
+    dep_map = callee_knl.recursive_insn_dep_map()
+    # roots depend on nothing
+    heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps)
+    # leaves have nothing that depends on them
+    tails = set(dep_map.keys())
+    for insn, deps in six.iteritems(dep_map):
+        tails = tails - deps
+
+    # }}}
+
+    # {{{ use NoOp to mark the start and end of callee kernel
+
+    from loopy.kernel.instruction import NoOpInstruction
+
+    noop_start = NoOpInstruction(
+        id=ing(callee_label+"_start"),
+        within_inames=instruction.within_inames,
+        depends_on=instruction.depends_on
+    )
+    noop_end = NoOpInstruction(
+        id=instruction.id,  # reuses the id of the replaced call instruction
+        within_inames=instruction.within_inames,
+        depends_on=frozenset(insn_id[insn] for insn in tails)
+    )
+    # }}}
+
+    inner_insns = [noop_start]
+
+    for insn in callee_knl.instructions:
+        insn = insn.with_transformed_expressions(subst_mapper)
+        within_inames = frozenset(map(iname_map.get, insn.within_inames))
+        within_inames = within_inames | instruction.within_inames
+        depends_on = frozenset(map(insn_id.get, insn.depends_on)) | (
+                instruction.depends_on)
+        if insn.id in heads:
+            depends_on = depends_on | set([noop_start.id])
+        insn = insn.copy(
+            id=insn_id[insn.id],
+            within_inames=within_inames,
+            # TODO: probably need to keep priority in callee kernel
+            priority=instruction.priority,
+            depends_on=depends_on
+        )
+        inner_insns.append(insn)
+
+    inner_insns.append(noop_end)
+
+    new_insns = []
+    for insn in kernel.instructions:
+        if insn == instruction:
+            new_insns.extend(inner_insns)
+        else:
+            new_insns.append(insn)
+
+    kernel = kernel.copy(instructions=new_insns)
+
+    # }}}
+
+    return kernel
+
+# }}}
+
+
+# {{{ inline callable kernel
+
+def _inline_single_callable_kernel(caller_kernel, function_name,
+        program_callables_info):
+    old_insns = caller_kernel.instructions  # caller_kernel is rebound below
+    for insn in old_insns:
+        if isinstance(insn, CallInstruction):
+            # FIXME This seems to use identifiers across namespaces. Why not
+            # check whether the function is a scoped function first? ~AK
+            if insn.expression.function.name in program_callables_info:
+                history_of_identifier = program_callables_info.history[
+                        insn.expression.function.name]
+
+                if function_name in history_of_identifier:
+                    in_knl_callable = program_callables_info[
+                            insn.expression.function.name]
+                    assert isinstance(in_knl_callable, CallableKernel)
+                    caller_kernel = _inline_call_instruction(
+                            caller_kernel, in_knl_callable.subkernel, insn)
+                    program_callables_info = (
+                            program_callables_info.with_deleted_callable(
+                                insn.expression.function.name,
+                                program_callables_info.num_times_callables_called[
+                                    caller_kernel.name]))
+        elif isinstance(insn, (MultiAssignmentBase, CInstruction,
+                _DataObliviousInstruction)):
+            pass  # not a call instruction: nothing to inline
+        else:
+            raise NotImplementedError(
+                    "Unknown instruction type %s"
+                    % type(insn).__name__)
+
+    return caller_kernel, program_callables_info
+
+
+# FIXME This should take a 'within' parameter to be able to only inline
+# *some* calls to a kernel, but not others.
+def inline_callable_kernel(program, function_name):
+    """
+    Returns a copy of *program* with the callable kernel addressed by
+    (scoped) name *function_name* inlined.
+    """
+    from loopy.preprocess import infer_arg_descr
+    program = infer_arg_descr(program)
+    program_callables_info = program.program_callables_info
+    old_program_callables_info = program_callables_info.copy()
+
+    edited_callable_kernels = {}  # func_id -> callable with inlined subkernel
+
+    for func_id, in_knl_callable in old_program_callables_info.items():
+        if function_name not in old_program_callables_info.history[func_id] and (
+                isinstance(in_knl_callable, CallableKernel)):
+            caller_kernel = in_knl_callable.subkernel
+            caller_kernel, program_callables_info = (
+                    _inline_single_callable_kernel(caller_kernel,
+                        function_name,
+                        program_callables_info))
+            edited_callable_kernels[func_id] = in_knl_callable.copy(
+                    subkernel=caller_kernel)
+
+    new_resolved_functions = {}
+    for func_id, in_knl_callable in program_callables_info.items():
+        if func_id in edited_callable_kernels:
+            new_resolved_functions[func_id] = edited_callable_kernels[func_id]
+        else:
+            new_resolved_functions[func_id] = in_knl_callable
+
+    program_callables_info = program_callables_info.copy(
+            resolved_functions=new_resolved_functions)
+
+    return program.copy(program_callables_info=program_callables_info)
+
+# }}}
+
+
+# {{{ tools to match caller to callee args by (guessed) automatic reshaping
+
+# (This is undocumented and not recommended, but it is currently needed
+# to support Firedrake.)
+
+class DimChanger(IdentityMapper):
+    """
+    Mapper to change the dimensions of an argument.
+
+    .. attribute:: callee_arg_dict
+
+        A mapping from the argument name (:class:`str`) to instances of
+        :class:`loopy.kernel.array.ArrayBase`.
+
+    .. attribute:: desired_shape
+
+        A mapping from argument name (:class:`str`) to an instance of
+        :class:`tuple`.
+    """
+    def __init__(self, callee_arg_dict, desired_shape):
+        self.callee_arg_dict = callee_arg_dict
+        self.desired_shape = desired_shape
+
+    def map_subscript(self, expr):
+        if expr.aggregate.name not in self.callee_arg_dict:
+            # not a callee argument (e.g. a temporary): nothing to reshape
+            return super(DimChanger, self).map_subscript(expr)
+        callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags
+        flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in
+                zip(callee_arg_dim_tags, expr.index_tuple))
+        new_indices = []
+        from operator import mul
+        from functools import reduce
+        stride = reduce(mul, self.desired_shape[expr.aggregate.name], 1)
+        for length in self.desired_shape[expr.aggregate.name]:
+            stride //= length  # exact: stride is a product that includes length
+            ind = flattened_index // int(stride)
+            flattened_index -= (int(stride) * ind)
+            new_indices.append(simplify_via_aff(ind))
+        return expr.aggregate.index(tuple(new_indices))
+
+
+def _match_caller_callee_argument_dimension_for_single_kernel(
+        caller_knl, program_callables_info, callee_function_name):
+    """
+    Returns a copy of *caller_knl* with the instance of
+    :class:`loopy.kernel.function_interface.CallableKernel` addressed by
+    *callee_function_name* in the *caller_knl* aligned with the argument
+    dimensions required by *caller_knl*.
+    """
+    pymbolic_calls_to_new_callables = {}
+    for insn in caller_knl.instructions:
+        if not isinstance(insn, CallInstruction) or (
+                insn.expression.function.name not in
+                program_callables_info):
+            # Call to a callable kernel can only occur through a
+            # CallInstruction.
+            continue
+
+        in_knl_callable = program_callables_info[
+                insn.expression.function.name]
+        if not isinstance(in_knl_callable, CallableKernel) or (
+                in_knl_callable.subkernel.name != callee_function_name):
+            # Not the callable we're looking for.
+            continue
+
+        # getting the caller->callee arg association
+
+        parameters = insn.expression.parameters[:]
+        kw_parameters = {}
+        if isinstance(insn.expression, CallWithKwargs):
+            kw_parameters = insn.expression.kw_parameters
+
+        assignees = insn.assignees
+
+        parameter_shapes = [par.get_array_arg_descriptor(caller_knl).shape
+                for par in parameters]
+        kw_to_pos, pos_to_kw = get_kw_pos_association(in_knl_callable.subkernel)
+        for i in range(len(parameters), len(parameters)+len(kw_parameters)):
+            parameter_shapes.append(kw_parameters[pos_to_kw[i]]
+                    .get_array_arg_descriptor(caller_knl).shape)
+
+        # inserting the assignees at the required positions.
+        assignee_write_count = -1
+        for i, arg in enumerate(in_knl_callable.subkernel.args):
+            if arg.is_output_only:
+                assignee = assignees[-assignee_write_count-1]
+                parameter_shapes.insert(i, assignee
+                        .get_array_arg_descriptor(caller_knl).shape)
+                assignee_write_count -= 1
+
+        callee_arg_to_desired_dim_tag = dict(zip([arg.name for arg in
+            in_knl_callable.subkernel.args], parameter_shapes))
+        dim_changer = DimChanger(in_knl_callable.subkernel.arg_dict,
+                callee_arg_to_desired_dim_tag)
+        new_callee_insns = []
+        for callee_insn in in_knl_callable.subkernel.instructions:
+            if isinstance(callee_insn, MultiAssignmentBase):
+                new_callee_insns.append(callee_insn.copy(expression=dim_changer(
+                    callee_insn.expression),
+                    assignee=dim_changer(callee_insn.assignee)))
+            elif isinstance(callee_insn, (CInstruction,
+                    _DataObliviousInstruction)):
+                new_callee_insns.append(callee_insn)  # keep, not rewritten
+            else:
+                raise NotImplementedError("Unknwon instruction %s." %
+                        type(callee_insn))
+
+        # subkernel with instructions adjusted according to the new dimensions.
+        new_subkernel = in_knl_callable.subkernel.copy(instructions=new_callee_insns)
+
+        new_in_knl_callable = in_knl_callable.copy(subkernel=new_subkernel)
+
+        pymbolic_calls_to_new_callables[insn.expression] = new_in_knl_callable
+
+    if not pymbolic_calls_to_new_callables:
+        # complain if no matching function found.
+        raise LoopyError("No CallableKernel with the name %s found in %s." % (
+            callee_function_name, caller_knl.name))
+
+    return change_names_of_pymbolic_calls(caller_knl,
+            pymbolic_calls_to_new_callables)
+
+
+def _match_caller_callee_argument_dimension_(program, *args, **kwargs):
+    assert isinstance(program, Program)
+
+    new_resolved_functions = {}  # func_id -> (possibly updated) callable
+    for func_id, in_knl_callable in program.program_callables_info.items():
+        if isinstance(in_knl_callable, CallableKernel):
+            new_subkernel = (
+                    _match_caller_callee_argument_dimension_for_single_kernel(
+                        in_knl_callable.subkernel, program.program_callables_info,
+                        *args, **kwargs))
+            in_knl_callable = in_knl_callable.copy(
+                    subkernel=new_subkernel)
+
+        elif isinstance(in_knl_callable, ScalarCallable):
+            pass  # carried over unchanged
+        else:
+            raise NotImplementedError("Unknown type of callable %s." % (
+                type(in_knl_callable).__name__))
+
+        new_resolved_functions[func_id] = in_knl_callable
+
+    new_program_callables_info = program.program_callables_info.copy(
+            resolved_functions=new_resolved_functions)
+    return program.copy(program_callables_info=new_program_callables_info)
+
+# }}}
+
+
+# vim: foldmethod=marker
diff --git a/test/test_callables.py b/test/test_callables.py
new file mode 100644
index 00000000..f25bbbe6
--- /dev/null
+++ b/test/test_callables.py
@@ -0,0 +1,414 @@
+from __future__ import division, absolute_import, print_function
+
+__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import numpy as np
+import pyopencl as cl
+import pyopencl.clrandom  # noqa: F401
+import loopy as lp
+import pytest
+import sys
+
+
+from pyopencl.tools import (  # noqa: F401
+        pytest_generate_tests_for_pyopencl
+        as pytest_generate_tests)
+
+from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa: F401
+
+
+def test_register_function_lookup(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    from testlib import register_log2_lookup
+
+    x = np.random.rand(10)
+    ctx = cl.create_some_context()
+    queue = cl.CommandQueue(ctx)
+
+    prog = lp.make_kernel(
+            "{[i]: 0<=i<10}",
+            """
+            y[i] = log2(x[i])
+            """)
+    prog = lp.register_function_id_to_in_knl_callable_mapper(prog,
+            register_log2_lookup)
+
+    evt, (out, ) = prog(queue, x=x)
+
+    assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15
+
+
+@pytest.mark.parametrize("inline", [False, True])
+def test_register_knl(ctx_factory, inline):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+    n = 2 ** 4
+
+    x = np.random.rand(n, n, n, n, n)
+    y = np.random.rand(n, n, n, n, n)
+
+    grandchild_knl = lp.make_kernel_function(
+            "{[i, j]:0<= i, j< 16}",
+            """
+            c[i, j] = 2*a[i, j] + 3*b[i, j]
+            """, name='linear_combo1')
+
+    child_knl = lp.make_kernel_function(
+            "{[i, j]:0<=i, j < 16}",
+            """
+            [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j])
+            """, name='linear_combo2')
+
+    parent_knl = lp.make_kernel(
+            "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}",
+            """
+            [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m],
+                                                     [j, l]: y[i, j, k, l, m])
+            """,
+            kernel_data=[
+                lp.GlobalArg(
+                    name='x',
+                    dtype=np.float64,
+                    shape=(16, 16, 16, 16, 16)),
+                lp.GlobalArg(
+                    name='y',
+                    dtype=np.float64,
+                    shape=(16, 16, 16, 16, 16)), '...'],
+            )
+
+    knl = lp.register_callable_kernel(
+            parent_knl, child_knl)
+    knl = lp.register_callable_kernel(
+            knl, grandchild_knl)
+    if inline:
+        knl = lp.inline_callable_kernel(knl, 'linear_combo2')
+        knl = lp.inline_callable_kernel(knl, 'linear_combo1')
+
+    evt, (out, ) = knl(queue, x=x, y=y)
+
+    assert (np.linalg.norm(2*x+3*y-out)/(
+        np.linalg.norm(2*x+3*y))) < 1e-15
+
+
+@pytest.mark.parametrize("inline", [False, True])
+def test_slices_with_negative_step(ctx_factory, inline):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+    n = 2 ** 4
+
+    x = np.random.rand(n, n, n, n, n)
+    y = np.random.rand(n, n, n, n, n)
+
+    child_knl = lp.make_kernel_function(
+            "{[i, j]:0<=i, j < 16}",
+            """
+            g[i, j] = 2*e[i, j] + 3*f[i, j]
+            """, name="linear_combo")
+
+    parent_knl = lp.make_kernel(
+            "{[i, k, m]: 0<=i, k, m<16}",
+            """
+            z[i, 15:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m],
+                                                   y[i, :, k, :, m])
+            """,
+            kernel_data=[
+                lp.GlobalArg(
+                    name='x',
+                    dtype=np.float64,
+                    shape=(16, 16, 16, 16, 16)),
+                lp.GlobalArg(
+                    name='y',
+                    dtype=np.float64,
+                    shape=(16, 16, 16, 16, 16)),
+                lp.GlobalArg(
+                    name='z',
+                    dtype=np.float64,
+                    shape=(16, 16, 16, 16, 16)), '...'],
+            )
+
+    knl = lp.register_callable_kernel(
+            parent_knl, child_knl)
+    if inline:
+        knl = lp.inline_callable_kernel(knl, 'linear_combo')
+
+    evt, (out, ) = knl(queue, x=x, y=y)
+
+    assert (np.linalg.norm(2*x+3*y-out[:, ::-1, :, :, :])/(
+        np.linalg.norm(2*x+3*y))) < 1e-15
+
+
+@pytest.mark.parametrize("inline", [False, True])
+def test_register_knl_with_call_with_kwargs(ctx_factory, inline):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    n = 2 ** 2
+
+    a_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32)
+    b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32)
+    c_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64)
+
+    callee_knl = lp.make_kernel_function(
+            "{[i, j]:0<=i, j < %d}" % n,
+            """
+            h[i, j] = 2 * e[i, j] + 3*f[i, j] + 4*g[i, j]
+            <>f1[i, j] = 2*f[i, j]
+            p[i, j] = 7 * e[i, j] + 4*f1[i, j] + 2*g[i, j]
+            """,
+            [
+                lp.GlobalArg('f, e, h, g'), '...'],
+            name='linear_combo')
+
+    caller_knl = lp.make_kernel(
+            "{[i, j, k, l, m]: 0<=i, j, k, l, m<%d}" % n,
+            """
+            <> d[i, j, k, l, m] = 2*b[i, j, k, l, m]
+            [j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m]  = linear_combo(
+                                                     f=[j, l]: a[i, j, k, l, m],
+                                                     g=[j, l]: d[i, j, k, l, m],
+                                                     e=[j, l]: c[i, j, k, l, m])
+            """)
+
+    knl = lp.register_callable_kernel(
+            caller_knl, callee_knl)
+    if inline:
+        knl = lp.inline_callable_kernel(knl, 'linear_combo')
+
+    evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev)
+
+    a = a_dev.get()
+    b = b_dev.get()
+    c = c_dev.get()
+
+    h = out1.get()  # h = 2c + 3a +  8b
+    p = out2.get()  # p = 7c + 8a + 4b
+    h_exact = 3*a + 8*b + 2*c
+    p_exact = 8*a + 4*b + 7*c
+
+    assert np.linalg.norm(h-h_exact)/np.linalg.norm(h_exact) < 1e-7
+    assert np.linalg.norm(p-p_exact)/np.linalg.norm(p_exact) < 1e-7
+
+
+@pytest.mark.parametrize("inline", [False, True])
+def test_register_knl_with_hw_axes(ctx_factory, inline):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    n = 2 ** 4
+
+    x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64)
+    y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64)
+
+    callee_knl = lp.make_kernel_function(
+            "{[i, j]:0<=i, j < 16}",
+            """
+            g[i, j] = 2*e[i, j] + 3*f[i, j]
+            """, name='linear_combo')
+
+    callee_knl = lp.split_iname(callee_knl, "i", 4, inner_tag="l.0", outer_tag="g.0")
+
+    caller_knl = lp.make_kernel(
+            "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}",
+            """
+            [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m],
+                                                     [j, l]: y[i, j, k, l, m])
+            """
+            )
+    caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1")
+
+    knl = lp.register_callable_kernel(
+            caller_knl, callee_knl)
+
+    if inline:
+        knl = lp.inline_callable_kernel(knl, 'linear_combo')
+
+    evt, (out, ) = knl(queue, x=x_dev, y=y_dev)
+
+    x_host = x_dev.get()
+    y_host = y_dev.get()
+
+    assert np.linalg.norm(2*x_host+3*y_host-out.get())/np.linalg.norm(
+            2*x_host+3*y_host) < 1e-15
+
+
+@pytest.mark.parametrize("inline", [False, True])
+def test_shape_translation_through_sub_array_ref(ctx_factory, inline):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64)
+    x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64)
+    x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64)
+
+    callee1 = lp.make_kernel_function(
+            "{[i]: 0<=i<6}",
+            """
+            a[i] = 2*abs(b[i])
+            """, name="callee_fn1")
+
+    callee2 = lp.make_kernel_function(
+            "{[i, j]: 0<=i<3 and 0 <= j < 2}",
+            """
+            a[i, j] = 3*b[i, j]
+            """, name="callee_fn2")
+
+    callee3 = lp.make_kernel_function(
+            "{[i]: 0<=i<6}",
+            """
+            a[i] = 5*b[i]
+            """, name="callee_fn3")
+
+    knl = lp.make_kernel(
+            "{[i, j, k, l]:  0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}",
+            """
+            [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2])
+            [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k])
+            [l]: y3[l, l] = callee_fn3([l]: x3[l, l])
+            """)
+
+    knl = lp.register_callable_kernel(knl, callee1)
+    knl = lp.register_callable_kernel(knl, callee2)
+    knl = lp.register_callable_kernel(knl, callee3)
+
+    if inline:
+        knl = lp.inline_callable_kernel(knl, 'callee_fn1')
+        knl = lp.inline_callable_kernel(knl, 'callee_fn2')
+        knl = lp.inline_callable_kernel(knl, 'callee_fn3')
+
+    knl = lp.set_options(knl, "write_cl")
+    knl = lp.set_options(knl, "return_dict")
+    evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3)
+
+    y1 = out_dict['y1'].get()
+    y2 = out_dict['y2'].get()
+    y3 = out_dict['y3'].get()
+
+    assert (np.linalg.norm(y1-2*x1.get())) < 1e-15
+    assert (np.linalg.norm(y2-3*x2.get())) < 1e-15
+    assert (np.linalg.norm(np.diag(y3-5*x3.get()))) < 1e-15
+
+
+def test_multi_arg_array_call(ctx_factory):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+    import pymbolic.primitives as p
+    n = 10
+    acc_i = p.Variable("acc_i")
+    i = p.Variable("i")
+    index = p.Variable("index")
+    a_i = p.Subscript(p.Variable("a"), p.Variable("i"))
+    argmin_kernel = lp.make_kernel_function(
+            "{[i]: 0 <= i < n}",
+            [
+                lp.Assignment(id="init2", assignee=index,
+                    expression=0),
+                lp.Assignment(id="init1", assignee=acc_i,
+                    expression="214748367"),
+                lp.Assignment(id="insn", assignee=index,
+                    expression=p.If(p.Expression.eq(acc_i, a_i), i, index),
+                    depends_on="update"),
+                lp.Assignment(id="update", assignee=acc_i,
+                    expression=p.Variable("min")(acc_i, a_i),
+                    depends_on="init1,init2")],
+            name="custom_argmin")
+
+    argmin_kernel = lp.fix_parameters(argmin_kernel, n=n)
+
+    knl = lp.make_kernel(
+            "{[i]:0<=i<n}",
+            """
+            min_val, min_index = custom_argmin([i]:b[i])
+            """)
+
+    knl = lp.fix_parameters(knl, n=n)
+    knl = lp.set_options(knl, return_dict=True)
+
+    knl = lp.register_callable_kernel(knl, argmin_kernel)
+    b = np.random.randn(n)
+    evt, out_dict = knl(queue, b=b)
+    tol = 1e-15
+    from numpy.linalg import norm
+    assert(norm(out_dict['min_val'][0] - np.min(b)) < tol)
+    assert(norm(out_dict['min_index'][0] - np.argmin(b)) < tol)
+
+
+@pytest.mark.parametrize("inline", [False, True])
+def test_packing_unpacking(ctx_factory, inline):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64)
+    x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64)
+
+    callee1 = lp.make_kernel_function(
+            "{[i]: 0<=i<6}",
+            """
+            a[i] = 2*b[i]
+            """, name="callee_fn1")
+
+    callee2 = lp.make_kernel_function(
+            "{[i, j]: 0<=i<2 and 0 <= j < 3}",
+            """
+            a[i, j] = 3*b[i, j]
+            """, name="callee_fn2")
+
+    knl = lp.make_kernel(
+            "{[i, j, k]:  0<= i < 3 and 0 <= j < 2 and 0 <= k < 6}",
+            """
+            [i, j]: y1[i, j] = callee_fn1([i, j]: x1[i, j])
+            [k]: y2[k] = callee_fn2([k]: x2[k])
+            """)
+
+    knl = lp.register_callable_kernel(knl, callee1)
+    knl = lp.register_callable_kernel(knl, callee2)
+
+    knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn1')
+    knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn2')
+
+    if inline:
+        knl = lp.inline_callable_kernel(knl, 'callee_fn1')
+        knl = lp.inline_callable_kernel(knl, 'callee_fn2')
+
+    knl = lp.set_options(knl, "write_cl")
+    knl = lp.set_options(knl, "return_dict")
+    evt, out_dict = knl(queue, x1=x1, x2=x2)
+
+    y1 = out_dict['y1'].get()
+    y2 = out_dict['y2'].get()
+
+    assert np.linalg.norm(2*x1.get()-y1)/np.linalg.norm(
+            2*x1.get()) < 1e-15
+    assert np.linalg.norm(3*x2.get()-y2)/np.linalg.norm(
+            3*x2.get()) < 1e-15
+
+
+if __name__ == "__main__":
+    if len(sys.argv) > 1:
+        exec(sys.argv[1])
+    else:
+        from pytest import main
+        main([__file__])
+
+# vim: foldmethod=marker
-- 
GitLab


From 28bb8efd90784545444c705c7820d26e4ef2a555 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Sun, 12 Aug 2018 16:45:18 +0530
Subject: [PATCH 05/80] removing unused part of code.

---
 loopy/kernel/function_interface.py | 103 -----
 loopy/transform/callable.py        | 592 +----------------------------
 test/test_callables.py             | 345 -----------------
 3 files changed, 2 insertions(+), 1038 deletions(-)

diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
index 2ea26065..8b24da21 100644
--- a/loopy/kernel/function_interface.py
+++ b/loopy/kernel/function_interface.py
@@ -524,109 +524,6 @@ class CallableKernel(InKernelCallable):
     def name(self):
         return self.subkernel.name
 
-    def with_types(self, arg_id_to_dtype, caller_kernel,
-            program_callables_info):
-        kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)
-
-        new_args = []
-        for arg in self.subkernel.args:
-            kw = arg.name
-            if kw in arg_id_to_dtype:
-                # id exists as kw
-                new_args.append(arg.copy(dtype=arg_id_to_dtype[kw]))
-            elif kw_to_pos[kw] in arg_id_to_dtype:
-                # id exists as positional argument
-                new_args.append(arg.copy(
-                    dtype=arg_id_to_dtype[kw_to_pos[kw]]))
-            else:
-                new_args.append(arg)
-
-        from loopy.type_inference import (
-                infer_unknown_types_for_a_single_kernel)
-        pre_specialized_subkernel = self.subkernel.copy(
-                args=new_args)
-
-        # infer the types of the written variables based on the knowledge
-        # of the types of the arguments supplied
-        specialized_kernel, program_callables_info = (
-                infer_unknown_types_for_a_single_kernel(
-                    pre_specialized_subkernel,
-                    program_callables_info,
-                    expect_completion=True))
-
-        new_arg_id_to_dtype = {}
-        for arg in specialized_kernel.args:
-            # associate the updated_arg_id_to_dtype with keyword as well as
-            # positional id.
-            new_arg_id_to_dtype[arg.name] = arg.dtype
-            new_arg_id_to_dtype[kw_to_pos[arg.name]] = arg.dtype
-
-        # Return the kernel call with specialized subkernel and the corresponding
-        # new arg_id_to_dtype
-        return self.copy(subkernel=specialized_kernel,
-                arg_id_to_dtype=new_arg_id_to_dtype), program_callables_info
-
-    def with_descrs(self, arg_id_to_descr, program_callables_info):
-
-        # tune the subkernel so that we have the matching shapes and
-        # dim_tags
-
-        new_args = self.subkernel.args[:]
-        kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)
-
-        for arg_id, descr in arg_id_to_descr.items():
-            if isinstance(arg_id, int):
-                arg_id = pos_to_kw[arg_id]
-            assert isinstance(arg_id, str)
-
-            if isinstance(descr, ArrayArgDescriptor):
-                new_arg = self.subkernel.arg_dict[arg_id].copy(
-                        shape=descr.shape,
-                        dim_tags=descr.dim_tags,
-                        address_space=descr.address_space)
-                # replacing the new arg with the arg of the same name
-                new_args = [new_arg if arg.name == arg_id else arg for arg in
-                        new_args]
-            elif isinstance(descr, ValueArgDescriptor):
-                pass
-            else:
-                raise LoopyError("Descriptor must be either an instance of "
-                        "ArrayArgDescriptor or ValueArgDescriptor -- got %s." %
-                        type(descr))
-        descriptor_specialized_knl = self.subkernel.copy(args=new_args)
-        from loopy.preprocess import traverse_to_infer_arg_descr
-        descriptor_specialized_knl, program_callables_info = (
-                traverse_to_infer_arg_descr(descriptor_specialized_knl,
-                    program_callables_info))
-
-        return (
-                self.copy(
-                    subkernel=descriptor_specialized_knl,
-                    arg_id_to_descr=arg_id_to_descr),
-                program_callables_info)
-
-    def with_packing_for_args(self):
-        from loopy.kernel.data import AddressSpace
-        kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)
-
-        arg_id_to_descr = {}
-
-        for pos, kw in pos_to_kw.items():
-            arg = self.subkernel.arg_dict[kw]
-            arg_id_to_descr[pos] = ArrayArgDescriptor(
-                    shape=arg.shape,
-                    dim_tags=arg.dim_tags,
-                    address_space=AddressSpace.GLOBAL)
-
-        return self.copy(subkernel=self.subkernel,
-                arg_id_to_descr=arg_id_to_descr)
-
-    def with_hw_axes_sizes(self, gsize, lsize):
-        return self.copy(
-                subkernel=self.subkernel.copy(
-                    overridden_get_grid_sizes_for_insn_ids=(
-                        GridOverrideForCalleeKernel(lsize, gsize))))
-
     def is_ready_for_codegen(self):
         return (self.arg_id_to_dtype is not None and
                 self.arg_id_to_descr is not None)
diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py
index b5b80ad8..9d9935ab 100644
--- a/loopy/transform/callable.py
+++ b/loopy/transform/callable.py
@@ -21,29 +21,15 @@ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
-
-import six
-
-import islpy as isl
-from pymbolic.primitives import CallWithKwargs
-
-from loopy.kernel import LoopKernel
-from pytools import ImmutableRecord
 from loopy.diagnostic import LoopyError
-from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase,
-        CInstruction, _DataObliviousInstruction)
-from loopy.symbolic import IdentityMapper, SubstitutionMapper
-from loopy.isl_helpers import simplify_via_aff
-from loopy.kernel.function_interface import (get_kw_pos_association,
-        change_names_of_pymbolic_calls, CallableKernel, ScalarCallable)
-from loopy.program import Program, ResolvedFunctionMarker
+from loopy.kernel.function_interface import CallableKernel
+from loopy.program import ResolvedFunctionMarker
 
 __doc__ = """
 .. currentmodule:: loopy
 
 .. autofunction:: register_function_id_to_in_knl_callable_mapper
 
-.. autofunction:: register_callable_kernel
 """
 
 
@@ -130,578 +116,4 @@ def register_function_id_to_in_knl_callable_mapper(program,
 # }}}
 
 
-# {{{ register_callable_kernel
-
-class _RegisterCalleeKernel(ImmutableRecord):
-    """
-    Helper class to make the function scoper from
-    :func:`loopy.transform.register_callable_kernel` picklable. As python
-    cannot pickle lexical closures.
-    """
-    fields = set(['callable_kernel'])
-
-    def __init__(self, callable_kernel):
-        self.callable_kernel = callable_kernel
-
-    def __call__(self, target, identifier):
-        if identifier == self.callable_kernel.subkernel.name:
-            return self.callable_kernel
-        return None
-
-
-def register_callable_kernel(program, callee_kernel):
-    """Returns a copy of *caller_kernel*, which would resolve *function_name* in an
-    expression as a call to *callee_kernel*.
-
-    :arg caller_kernel: An instance of :class:`loopy.kernel.LoopKernel`.
-    :arg function_name: An instance of :class:`str`.
-    :arg callee_kernel: An instance of :class:`loopy.kernel.LoopKernel`.
-    """
-
-    # {{{ sanity checks
-
-    assert isinstance(program, Program)
-    assert isinstance(callee_kernel, LoopKernel)
-
-    # check to make sure that the variables with 'out' direction is equal to
-    # the number of assigness in the callee kernel intructions.
-    expected_num_assignees = len([arg for arg in callee_kernel.args if
-        arg.is_output_only])
-    expected_num_parameters = len(callee_kernel.args) - expected_num_assignees
-    for in_knl_callable in program.program_callables_info.values():
-        if isinstance(in_knl_callable, CallableKernel):
-            caller_kernel = in_knl_callable.subkernel
-            for insn in caller_kernel.instructions:
-                if isinstance(insn, CallInstruction) and (
-                        insn.expression.function.name == callee_kernel.name):
-                    if isinstance(insn.expression, CallWithKwargs):
-                        kw_parameters = insn.expression.kw_parameters
-                    else:
-                        kw_parameters = {}
-                    if len(insn.assignees) != expected_num_assignees:
-                        raise LoopyError("The number of arguments with 'out' "
-                                "direction " "in callee kernel %s and the number "
-                                "of assignees in " "instruction %s do not "
-                                "match." % (
-                                    callee_kernel.name, insn.id))
-                    if len(insn.expression.parameters+tuple(
-                            kw_parameters.values())) != expected_num_parameters:
-                        raise LoopyError("The number of expected arguments "
-                                "for the callee kernel %s and the number of "
-                                "parameters in instruction %s do not match."
-                                % (callee_kernel.name, insn.id))
-
-                elif isinstance(insn, (MultiAssignmentBase, CInstruction,
-                        _DataObliviousInstruction)):
-                    pass
-                else:
-                    raise NotImplementedError("unknown instruction %s" % type(insn))
-        elif isinstance(in_knl_callable, ScalarCallable):
-            pass
-        else:
-            raise NotImplementedError("Unknown callable type %s." %
-                    type(in_knl_callable).__name__)
-
-    # }}}
-
-    # take the function resolvers from the Program and resolve the functions in
-    # the callee kernel
-    program_callables_info = (
-            program.program_callables_info.with_edit_callables_mode())
-
-    from loopy.symbolic import SubstitutionRuleMappingContext
-    rule_mapping_context = SubstitutionRuleMappingContext(
-            callee_kernel.substitutions,
-            callee_kernel.get_var_name_generator())
-
-    resolved_function_marker = ResolvedFunctionMarker(
-            rule_mapping_context, callee_kernel, program_callables_info,
-            program.func_id_to_in_knl_callable_mappers)
-
-    callee_kernel = rule_mapping_context.finish_kernel(
-            resolved_function_marker.map_kernel(callee_kernel))
-    program_callables_info = resolved_function_marker.program_callables_info
-
-    program_callables_info = (
-            program_callables_info.with_exit_edit_callables_mode())
-    program = program.copy(program_callables_info=program_callables_info)
-
-    # making the target of the child kernel to be same as the target of parent
-    # kernel.
-    callable_kernel = CallableKernel(subkernel=callee_kernel.copy(
-                        target=program.target,
-                        is_called_from_host=False))
-
-    # FIXME disabling global barriers for callee kernel (for now)
-    from loopy import set_options
-    callee_kernel = set_options(callee_kernel, "disable_global_barriers")
-
-    # FIXME: the number of callables is wrong. This is horrible please
-    # compensate.
-
-    return register_function_id_to_in_knl_callable_mapper(
-            program,
-            _RegisterCalleeKernel(callable_kernel))
-
-# }}}
-
-
-# {{{ kernel inliner mapper
-
-class KernelInliner(SubstitutionMapper):
-    """Mapper to replace variables (indices, temporaries, arguments) in the
-    callee kernel with variables in the caller kernel.
-
-    :arg caller: the caller kernel
-    :arg arg_map: dict of argument name to variables in caller
-    :arg arg_dict: dict of argument name to arguments in callee
-    """
-
-    def __init__(self, subst_func, caller, arg_map, arg_dict):
-        super(KernelInliner, self).__init__(subst_func)
-        self.caller = caller
-        self.arg_map = arg_map
-        self.arg_dict = arg_dict
-
-    def map_subscript(self, expr):
-        if expr.aggregate.name in self.arg_map:
-
-            aggregate = self.subst_func(expr.aggregate)
-            sar = self.arg_map[expr.aggregate.name]  # SubArrayRef in caller
-            callee_arg = self.arg_dict[expr.aggregate.name]  # Arg in callee
-            if aggregate.name in self.caller.arg_dict:
-                caller_arg = self.caller.arg_dict[aggregate.name]  # Arg in caller
-            else:
-                caller_arg = self.caller.temporary_variables[aggregate.name]
-
-            # Firstly, map inner inames to outer inames.
-            outer_indices = self.map_tuple(expr.index_tuple)
-
-            # Next, reshape to match dimension of outer arrays.
-            # We can have e.g. A[3, 2] from outside and B[6] from inside
-            from numbers import Integral
-            if not all(isinstance(d, Integral) for d in callee_arg.shape):
-                raise LoopyError(
-                    "Argument: {0} in callee kernel: {1} does not have "
-                    "constant shape.".format(callee_arg))
-
-            flatten_index = 0
-            for i, idx in enumerate(sar.get_begin_subscript().index_tuple):
-                flatten_index += idx*caller_arg.dim_tags[i].stride
-
-            flatten_index += sum(
-                idx * tag.stride
-                for idx, tag in zip(outer_indices, callee_arg.dim_tags))
-
-            from loopy.isl_helpers import simplify_via_aff
-            flatten_index = simplify_via_aff(flatten_index)
-
-            new_indices = []
-            for dim_tag in caller_arg.dim_tags:
-                ind = flatten_index // dim_tag.stride
-                flatten_index -= (dim_tag.stride * ind)
-                new_indices.append(ind)
-
-            new_indices = tuple(simplify_via_aff(i) for i in new_indices)
-
-            return aggregate.index(tuple(new_indices))
-        else:
-            return super(KernelInliner, self).map_subscript(expr)
-
-# }}}
-
-
-# {{{ inlining of a single call instruction
-
-def _inline_call_instruction(caller_kernel, callee_knl, instruction):
-    """
-    Returns a copy of *kernel* with the *instruction* in the *kernel*
-    replaced by inlining :attr:`subkernel` within it.
-    """
-    callee_label = callee_knl.name[:4] + "_"
-
-    # {{{ duplicate and rename inames
-
-    vng = caller_kernel.get_var_name_generator()
-    ing = caller_kernel.get_instruction_id_generator()
-    dim_type = isl.dim_type.set
-
-    iname_map = {}
-    for iname in callee_knl.all_inames():
-        iname_map[iname] = vng(callee_label+iname)
-
-    new_domains = []
-    new_iname_to_tags = caller_kernel.iname_to_tags.copy()
-
-    # transferring iname tags info from the callee to the caller kernel
-    for domain in callee_knl.domains:
-        new_domain = domain.copy()
-        for i in range(new_domain.n_dim()):
-            iname = new_domain.get_dim_name(dim_type, i)
-
-            if iname in callee_knl.iname_to_tags:
-                new_iname_to_tags[iname_map[iname]] = (
-                        callee_knl.iname_to_tags[iname])
-            new_domain = new_domain.set_dim_name(
-                dim_type, i, iname_map[iname])
-        new_domains.append(new_domain)
-
-    kernel = caller_kernel.copy(domains=caller_kernel.domains + new_domains,
-            iname_to_tags=new_iname_to_tags)
-
-    # }}}
-
-    # {{{ rename temporaries
-
-    temp_map = {}
-    new_temps = kernel.temporary_variables.copy()
-    for name, temp in six.iteritems(callee_knl.temporary_variables):
-        new_name = vng(callee_label+name)
-        temp_map[name] = new_name
-        new_temps[new_name] = temp.copy(name=new_name)
-
-    kernel = kernel.copy(temporary_variables=new_temps)
-
-    # }}}
-
-    # {{{ match kernel arguments
-
-    arg_map = {}  # callee arg name -> caller symbols (e.g. SubArrayRef)
-
-    assignees = instruction.assignees  # writes
-    parameters = instruction.expression.parameters  # reads
-
-    # add keyword parameters
-    from pymbolic.primitives import CallWithKwargs
-
-    if isinstance(instruction.expression, CallWithKwargs):
-        from loopy.kernel.function_interface import get_kw_pos_association
-
-        _, pos_to_kw = get_kw_pos_association(callee_knl)
-        kw_parameters = instruction.expression.kw_parameters
-        for i in range(len(parameters), len(parameters) + len(kw_parameters)):
-            parameters = parameters + (kw_parameters[pos_to_kw[i]],)
-
-    assignee_pos = 0
-    parameter_pos = 0
-    for i, arg in enumerate(callee_knl.args):
-        if arg.is_output_only:
-            arg_map[arg.name] = assignees[assignee_pos]
-            assignee_pos += 1
-        else:
-            arg_map[arg.name] = parameters[parameter_pos]
-            parameter_pos += 1
-
-    # }}}
-
-    # {{{ rewrite instructions
-
-    import pymbolic.primitives as p
-    from pymbolic.mapper.substitutor import make_subst_func
-
-    var_map = dict((p.Variable(k), p.Variable(v))
-                   for k, v in six.iteritems(iname_map))
-    var_map.update(dict((p.Variable(k), p.Variable(v))
-                        for k, v in six.iteritems(temp_map)))
-    var_map.update(dict((p.Variable(k), p.Variable(v.subscript.aggregate.name))
-                        for k, v in six.iteritems(arg_map)))
-    subst_mapper = KernelInliner(
-        make_subst_func(var_map), kernel, arg_map, callee_knl.arg_dict)
-
-    insn_id = {}
-    for insn in callee_knl.instructions:
-        insn_id[insn.id] = ing(callee_label+insn.id)
-
-    # {{{ root and leave instructions in callee kernel
-
-    dep_map = callee_knl.recursive_insn_dep_map()
-    # roots depend on nothing
-    heads = set(insn for insn, deps in six.iteritems(dep_map) if not deps)
-    # leaves have nothing that depends on them
-    tails = set(dep_map.keys())
-    for insn, deps in six.iteritems(dep_map):
-        tails = tails - deps
-
-    # }}}
-
-    # {{{ use NoOp to mark the start and end of callee kernel
-
-    from loopy.kernel.instruction import NoOpInstruction
-
-    noop_start = NoOpInstruction(
-        id=ing(callee_label+"_start"),
-        within_inames=instruction.within_inames,
-        depends_on=instruction.depends_on
-    )
-    noop_end = NoOpInstruction(
-        id=instruction.id,
-        within_inames=instruction.within_inames,
-        depends_on=frozenset(insn_id[insn] for insn in tails)
-    )
-    # }}}
-
-    inner_insns = [noop_start]
-
-    for insn in callee_knl.instructions:
-        insn = insn.with_transformed_expressions(subst_mapper)
-        within_inames = frozenset(map(iname_map.get, insn.within_inames))
-        within_inames = within_inames | instruction.within_inames
-        depends_on = frozenset(map(insn_id.get, insn.depends_on)) | (
-                instruction.depends_on)
-        if insn.id in heads:
-            depends_on = depends_on | set([noop_start.id])
-        insn = insn.copy(
-            id=insn_id[insn.id],
-            within_inames=within_inames,
-            # TODO: probaby need to keep priority in callee kernel
-            priority=instruction.priority,
-            depends_on=depends_on
-        )
-        inner_insns.append(insn)
-
-    inner_insns.append(noop_end)
-
-    new_insns = []
-    for insn in kernel.instructions:
-        if insn == instruction:
-            new_insns.extend(inner_insns)
-        else:
-            new_insns.append(insn)
-
-    kernel = kernel.copy(instructions=new_insns)
-
-    # }}}
-
-    return kernel
-
-# }}}
-
-
-# {{{ inline callable kernel
-
-def _inline_single_callable_kernel(caller_kernel, function_name,
-        program_callables_info):
-    old_insns = caller_kernel.instructions
-    for insn in old_insns:
-        if isinstance(insn, CallInstruction):
-            # FIXME This seems to use identifiers across namespaces. Why not
-            # check whether the function is a scoped function first? ~AK
-            if insn.expression.function.name in program_callables_info:
-                history_of_identifier = program_callables_info.history[
-                        insn.expression.function.name]
-
-                if function_name in history_of_identifier:
-                    in_knl_callable = program_callables_info[
-                            insn.expression.function.name]
-                    assert isinstance(in_knl_callable, CallableKernel)
-                    caller_kernel = _inline_call_instruction(
-                            caller_kernel, in_knl_callable.subkernel, insn)
-                    program_callables_info = (
-                            program_callables_info.with_deleted_callable(
-                                insn.expression.function.name,
-                                program_callables_info.num_times_callables_called[
-                                    caller_kernel.name]))
-        elif isinstance(insn, (MultiAssignmentBase, CInstruction,
-                _DataObliviousInstruction)):
-            pass
-        else:
-            raise NotImplementedError(
-                    "Unknown instruction type %s"
-                    % type(insn).__name__)
-
-    return caller_kernel, program_callables_info
-
-
-# FIXME This should take a 'within' parameter to be able to only inline
-# *some* calls to a kernel, but not others.
-def inline_callable_kernel(program, function_name):
-    """
-    Returns a copy of *kernel* with the callable kernel addressed by
-    (scoped) name *function_name* inlined.
-    """
-    from loopy.preprocess import infer_arg_descr
-    program = infer_arg_descr(program)
-    program_callables_info = program.program_callables_info
-    old_program_callables_info = program_callables_info.copy()
-
-    edited_callable_kernels = {}
-
-    for func_id, in_knl_callable in old_program_callables_info.items():
-        if function_name not in old_program_callables_info.history[func_id] and (
-                isinstance(in_knl_callable, CallableKernel)):
-            caller_kernel = in_knl_callable.subkernel
-            caller_kernel, program_callables_info = (
-                    _inline_single_callable_kernel(caller_kernel,
-                        function_name,
-                        program_callables_info))
-            edited_callable_kernels[func_id] = in_knl_callable.copy(
-                    subkernel=caller_kernel)
-
-    new_resolved_functions = {}
-    for func_id, in_knl_callable in program_callables_info.items():
-        if func_id in edited_callable_kernels:
-            new_resolved_functions[func_id] = edited_callable_kernels[func_id]
-        else:
-            new_resolved_functions[func_id] = in_knl_callable
-
-    program_callables_info = program_callables_info.copy(
-            resolved_functions=new_resolved_functions)
-
-    return program.copy(program_callables_info=program_callables_info)
-
-# }}}
-
-
-# {{{ tools to match caller to callee args by (guessed) automatic reshaping
-
-# (This is undocumented and not recommended, but it is currently needed
-# to support Firedrake.)
-
-class DimChanger(IdentityMapper):
-    """
-    Mapper to change the dimensions of an argument.
-
-    .. attribute:: callee_arg_dict
-
-        A mapping from the argument name (:class:`str`) to instances of
-        :class:`loopy.kernel.array.ArrayBase`.
-
-    .. attribute:: desried_shape
-
-        A mapping from argument name (:class:`str`) to an instance of
-        :class:`tuple`.
-    """
-    def __init__(self, callee_arg_dict, desired_shape):
-        self.callee_arg_dict = callee_arg_dict
-        self.desired_shape = desired_shape
-
-    def map_subscript(self, expr):
-        callee_arg_dim_tags = self.callee_arg_dict[expr.aggregate.name].dim_tags
-        flattened_index = sum(dim_tag.stride*idx for dim_tag, idx in
-                zip(callee_arg_dim_tags, expr.index_tuple))
-        new_indices = []
-
-        from operator import mul
-        from functools import reduce
-        stride = reduce(mul, self.desired_shape[expr.aggregate.name], 1)
-
-        for length in self.desired_shape[expr.aggregate.name]:
-            stride /= length
-            ind = flattened_index // int(stride)
-            flattened_index -= (int(stride) * ind)
-            new_indices.append(simplify_via_aff(ind))
-
-        return expr.aggregate.index(tuple(new_indices))
-
-
-def _match_caller_callee_argument_dimension_for_single_kernel(
-        caller_knl, program_callables_info, callee_function_name):
-    """
-    Returns a copy of *caller_knl* with the instance of
-    :class:`loopy.kernel.function_interface.CallableKernel` addressed by
-    *callee_function_name* in the *caller_knl* aligned with the argument
-    dimesnsions required by *caller_knl*.
-    """
-    pymbolic_calls_to_new_callables = {}
-    for insn in caller_knl.instructions:
-        if not isinstance(insn, CallInstruction) or (
-                insn.expression.function.name not in
-                program_callables_info):
-            # Call to a callable kernel can only occur through a
-            # CallInstruction.
-            continue
-
-        in_knl_callable = program_callables_info[
-                insn.expression.function.name]
-
-        if in_knl_callable.subkernel.name != callee_function_name:
-            # Not the callable we're looking for.
-            continue
-
-        # getting the caller->callee arg association
-
-        parameters = insn.expression.parameters[:]
-        kw_parameters = {}
-        if isinstance(insn.expression, CallWithKwargs):
-            kw_parameters = insn.expression.kw_parameters
-
-        assignees = insn.assignees
-
-        parameter_shapes = [par.get_array_arg_descriptor(caller_knl).shape
-                for par in parameters]
-        kw_to_pos, pos_to_kw = get_kw_pos_association(in_knl_callable.subkernel)
-        for i in range(len(parameters), len(parameters)+len(kw_parameters)):
-            parameter_shapes.append(kw_parameters[pos_to_kw[i]]
-                    .get_array_arg_descriptor(caller_knl).shape)
-
-        # inserting the assigness at the required positions.
-        assignee_write_count = -1
-        for i, arg in enumerate(in_knl_callable.subkernel.args):
-            if arg.is_output_only:
-                assignee = assignees[-assignee_write_count-1]
-                parameter_shapes.insert(i, assignee
-                        .get_array_arg_descriptor(caller_knl).shape)
-                assignee_write_count -= 1
-
-        callee_arg_to_desired_dim_tag = dict(zip([arg.name for arg in
-            in_knl_callable.subkernel.args], parameter_shapes))
-        dim_changer = DimChanger(in_knl_callable.subkernel.arg_dict,
-                callee_arg_to_desired_dim_tag)
-        new_callee_insns = []
-        for callee_insn in in_knl_callable.subkernel.instructions:
-            if isinstance(callee_insn, MultiAssignmentBase):
-                new_callee_insns.append(callee_insn.copy(expression=dim_changer(
-                    callee_insn.expression),
-                    assignee=dim_changer(callee_insn.assignee)))
-            elif isinstance(callee_insn, (CInstruction,
-                    _DataObliviousInstruction)):
-                pass
-            else:
-                raise NotImplementedError("Unknwon instruction %s." %
-                        type(insn))
-
-        # subkernel with instructions adjusted according to the new dimensions.
-        new_subkernel = in_knl_callable.subkernel.copy(instructions=new_callee_insns)
-
-        new_in_knl_callable = in_knl_callable.copy(subkernel=new_subkernel)
-
-        pymbolic_calls_to_new_callables[insn.expression] = new_in_knl_callable
-
-    if not pymbolic_calls_to_new_callables:
-        # complain if no matching function found.
-        raise LoopyError("No CallableKernel with the name %s found in %s." % (
-            callee_function_name, caller_knl.name))
-
-    return change_names_of_pymbolic_calls(caller_knl,
-            pymbolic_calls_to_new_callables)
-
-
-def _match_caller_callee_argument_dimension_(program, *args, **kwargs):
-    assert isinstance(program, Program)
-
-    new_resolved_functions = {}
-    for func_id, in_knl_callable in program.program_callables_info.items():
-        if isinstance(in_knl_callable, CallableKernel):
-            new_subkernel = (
-                    _match_caller_callee_argument_dimension_for_single_kernel(
-                        in_knl_callable.subkernel, program.program_callables_info,
-                        *args, **kwargs))
-            in_knl_callable = in_knl_callable.copy(
-                    subkernel=new_subkernel)
-
-        elif isinstance(in_knl_callable, ScalarCallable):
-            pass
-        else:
-            raise NotImplementedError("Unknown type of callable %s." % (
-                type(in_knl_callable).__name__))
-
-        new_resolved_functions[func_id] = in_knl_callable
-
-    new_program_callables_info = program.program_callables_info.copy(
-            resolved_functions=new_resolved_functions)
-    return program.copy(program_callables_info=new_program_callables_info)
-
-# }}}
-
-
 # vim: foldmethod=marker
diff --git a/test/test_callables.py b/test/test_callables.py
index f25bbbe6..d2ca9b71 100644
--- a/test/test_callables.py
+++ b/test/test_callables.py
@@ -26,7 +26,6 @@ import numpy as np
 import pyopencl as cl
 import pyopencl.clrandom  # noqa: F401
 import loopy as lp
-import pytest
 import sys
 
 
@@ -60,350 +59,6 @@ def test_register_function_lookup(ctx_factory):
     assert np.linalg.norm(np.log2(x)-out)/np.linalg.norm(np.log2(x)) < 1e-15
 
 
-@pytest.mark.parametrize("inline", [False, True])
-def test_register_knl(ctx_factory, inline):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-    n = 2 ** 4
-
-    x = np.random.rand(n, n, n, n, n)
-    y = np.random.rand(n, n, n, n, n)
-
-    grandchild_knl = lp.make_kernel_function(
-            "{[i, j]:0<= i, j< 16}",
-            """
-            c[i, j] = 2*a[i, j] + 3*b[i, j]
-            """, name='linear_combo1')
-
-    child_knl = lp.make_kernel_function(
-            "{[i, j]:0<=i, j < 16}",
-            """
-            [i, j]: g[i, j] = linear_combo1([i, j]: e[i, j], [i, j]: f[i, j])
-            """, name='linear_combo2')
-
-    parent_knl = lp.make_kernel(
-            "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}",
-            """
-            [j, l]: z[i, j, k, l, m] = linear_combo2([j, l]: x[i, j, k, l, m],
-                                                     [j, l]: y[i, j, k, l, m])
-            """,
-            kernel_data=[
-                lp.GlobalArg(
-                    name='x',
-                    dtype=np.float64,
-                    shape=(16, 16, 16, 16, 16)),
-                lp.GlobalArg(
-                    name='y',
-                    dtype=np.float64,
-                    shape=(16, 16, 16, 16, 16)), '...'],
-            )
-
-    knl = lp.register_callable_kernel(
-            parent_knl, child_knl)
-    knl = lp.register_callable_kernel(
-            knl, grandchild_knl)
-    if inline:
-        knl = lp.inline_callable_kernel(knl, 'linear_combo2')
-        knl = lp.inline_callable_kernel(knl, 'linear_combo1')
-
-    evt, (out, ) = knl(queue, x=x, y=y)
-
-    assert (np.linalg.norm(2*x+3*y-out)/(
-        np.linalg.norm(2*x+3*y))) < 1e-15
-
-
-@pytest.mark.parametrize("inline", [False, True])
-def test_slices_with_negative_step(ctx_factory, inline):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-    n = 2 ** 4
-
-    x = np.random.rand(n, n, n, n, n)
-    y = np.random.rand(n, n, n, n, n)
-
-    child_knl = lp.make_kernel_function(
-            "{[i, j]:0<=i, j < 16}",
-            """
-            g[i, j] = 2*e[i, j] + 3*f[i, j]
-            """, name="linear_combo")
-
-    parent_knl = lp.make_kernel(
-            "{[i, k, m]: 0<=i, k, m<16}",
-            """
-            z[i, 15:-1:-1, k, :, m] = linear_combo(x[i, :, k, :, m],
-                                                   y[i, :, k, :, m])
-            """,
-            kernel_data=[
-                lp.GlobalArg(
-                    name='x',
-                    dtype=np.float64,
-                    shape=(16, 16, 16, 16, 16)),
-                lp.GlobalArg(
-                    name='y',
-                    dtype=np.float64,
-                    shape=(16, 16, 16, 16, 16)),
-                lp.GlobalArg(
-                    name='z',
-                    dtype=np.float64,
-                    shape=(16, 16, 16, 16, 16)), '...'],
-            )
-
-    knl = lp.register_callable_kernel(
-            parent_knl, child_knl)
-    if inline:
-        knl = lp.inline_callable_kernel(knl, 'linear_combo')
-
-    evt, (out, ) = knl(queue, x=x, y=y)
-
-    assert (np.linalg.norm(2*x+3*y-out[:, ::-1, :, :, :])/(
-        np.linalg.norm(2*x+3*y))) < 1e-15
-
-
-@pytest.mark.parametrize("inline", [False, True])
-def test_register_knl_with_call_with_kwargs(ctx_factory, inline):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    n = 2 ** 2
-
-    a_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32)
-    b_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float32)
-    c_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64)
-
-    callee_knl = lp.make_kernel_function(
-            "{[i, j]:0<=i, j < %d}" % n,
-            """
-            h[i, j] = 2 * e[i, j] + 3*f[i, j] + 4*g[i, j]
-            <>f1[i, j] = 2*f[i, j]
-            p[i, j] = 7 * e[i, j] + 4*f1[i, j] + 2*g[i, j]
-            """,
-            [
-                lp.GlobalArg('f, e, h, g'), '...'],
-            name='linear_combo')
-
-    caller_knl = lp.make_kernel(
-            "{[i, j, k, l, m]: 0<=i, j, k, l, m<%d}" % n,
-            """
-            <> d[i, j, k, l, m] = 2*b[i, j, k, l, m]
-            [j, l]: x[i, j, k, l, m], [j, l]: y[i, j, k, l, m]  = linear_combo(
-                                                     f=[j, l]: a[i, j, k, l, m],
-                                                     g=[j, l]: d[i, j, k, l, m],
-                                                     e=[j, l]: c[i, j, k, l, m])
-            """)
-
-    knl = lp.register_callable_kernel(
-            caller_knl, callee_knl)
-    if inline:
-        knl = lp.inline_callable_kernel(knl, 'linear_combo')
-
-    evt, (out1, out2, ) = knl(queue, a=a_dev, b=b_dev, c=c_dev)
-
-    a = a_dev.get()
-    b = b_dev.get()
-    c = c_dev.get()
-
-    h = out1.get()  # h = 2c + 3a +  8b
-    p = out2.get()  # p = 7c + 8a + 4b
-    h_exact = 3*a + 8*b + 2*c
-    p_exact = 8*a + 4*b + 7*c
-
-    assert np.linalg.norm(h-h_exact)/np.linalg.norm(h_exact) < 1e-7
-    assert np.linalg.norm(p-p_exact)/np.linalg.norm(p_exact) < 1e-7
-
-
-@pytest.mark.parametrize("inline", [False, True])
-def test_register_knl_with_hw_axes(ctx_factory, inline):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    n = 2 ** 4
-
-    x_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64)
-    y_dev = cl.clrandom.rand(queue, (n, n, n, n, n), np.float64)
-
-    callee_knl = lp.make_kernel_function(
-            "{[i, j]:0<=i, j < 16}",
-            """
-            g[i, j] = 2*e[i, j] + 3*f[i, j]
-            """, name='linear_combo')
-
-    callee_knl = lp.split_iname(callee_knl, "i", 4, inner_tag="l.0", outer_tag="g.0")
-
-    caller_knl = lp.make_kernel(
-            "{[i, j, k, l, m]: 0<=i, j, k, l, m<16}",
-            """
-            [j, l]: z[i, j, k, l, m] = linear_combo([j, l]: x[i, j, k, l, m],
-                                                     [j, l]: y[i, j, k, l, m])
-            """
-            )
-    caller_knl = lp.split_iname(caller_knl, "i", 4, inner_tag="l.1", outer_tag="g.1")
-
-    knl = lp.register_callable_kernel(
-            caller_knl, callee_knl)
-
-    if inline:
-        knl = lp.inline_callable_kernel(knl, 'linear_combo')
-
-    evt, (out, ) = knl(queue, x=x_dev, y=y_dev)
-
-    x_host = x_dev.get()
-    y_host = y_dev.get()
-
-    assert np.linalg.norm(2*x_host+3*y_host-out.get())/np.linalg.norm(
-            2*x_host+3*y_host) < 1e-15
-
-
-@pytest.mark.parametrize("inline", [False, True])
-def test_shape_translation_through_sub_array_ref(ctx_factory, inline):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64)
-    x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64)
-    x3 = cl.clrandom.rand(queue, (6, 6), dtype=np.float64)
-
-    callee1 = lp.make_kernel_function(
-            "{[i]: 0<=i<6}",
-            """
-            a[i] = 2*abs(b[i])
-            """, name="callee_fn1")
-
-    callee2 = lp.make_kernel_function(
-            "{[i, j]: 0<=i<3 and 0 <= j < 2}",
-            """
-            a[i, j] = 3*b[i, j]
-            """, name="callee_fn2")
-
-    callee3 = lp.make_kernel_function(
-            "{[i]: 0<=i<6}",
-            """
-            a[i] = 5*b[i]
-            """, name="callee_fn3")
-
-    knl = lp.make_kernel(
-            "{[i, j, k, l]:  0<= i < 6 and 0 <= j < 3 and 0 <= k < 2 and 0<=l<6}",
-            """
-            [i]: y1[i//2, i%2] = callee_fn1([i]: x1[i//2, i%2])
-            [j, k]: y2[2*j+k] = callee_fn2([j, k]: x2[2*j+k])
-            [l]: y3[l, l] = callee_fn3([l]: x3[l, l])
-            """)
-
-    knl = lp.register_callable_kernel(knl, callee1)
-    knl = lp.register_callable_kernel(knl, callee2)
-    knl = lp.register_callable_kernel(knl, callee3)
-
-    if inline:
-        knl = lp.inline_callable_kernel(knl, 'callee_fn1')
-        knl = lp.inline_callable_kernel(knl, 'callee_fn2')
-        knl = lp.inline_callable_kernel(knl, 'callee_fn3')
-
-    knl = lp.set_options(knl, "write_cl")
-    knl = lp.set_options(knl, "return_dict")
-    evt, out_dict = knl(queue, x1=x1, x2=x2, x3=x3)
-
-    y1 = out_dict['y1'].get()
-    y2 = out_dict['y2'].get()
-    y3 = out_dict['y3'].get()
-
-    assert (np.linalg.norm(y1-2*x1.get())) < 1e-15
-    assert (np.linalg.norm(y2-3*x2.get())) < 1e-15
-    assert (np.linalg.norm(np.diag(y3-5*x3.get()))) < 1e-15
-
-
-def test_multi_arg_array_call(ctx_factory):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-    import pymbolic.primitives as p
-    n = 10
-    acc_i = p.Variable("acc_i")
-    i = p.Variable("i")
-    index = p.Variable("index")
-    a_i = p.Subscript(p.Variable("a"), p.Variable("i"))
-    argmin_kernel = lp.make_kernel_function(
-            "{[i]: 0 <= i < n}",
-            [
-                lp.Assignment(id="init2", assignee=index,
-                    expression=0),
-                lp.Assignment(id="init1", assignee=acc_i,
-                    expression="214748367"),
-                lp.Assignment(id="insn", assignee=index,
-                    expression=p.If(p.Expression.eq(acc_i, a_i), i, index),
-                    depends_on="update"),
-                lp.Assignment(id="update", assignee=acc_i,
-                    expression=p.Variable("min")(acc_i, a_i),
-                    depends_on="init1,init2")],
-            name="custom_argmin")
-
-    argmin_kernel = lp.fix_parameters(argmin_kernel, n=n)
-
-    knl = lp.make_kernel(
-            "{[i]:0<=i<n}",
-            """
-            min_val, min_index = custom_argmin([i]:b[i])
-            """)
-
-    knl = lp.fix_parameters(knl, n=n)
-    knl = lp.set_options(knl, return_dict=True)
-
-    knl = lp.register_callable_kernel(knl, argmin_kernel)
-    b = np.random.randn(n)
-    evt, out_dict = knl(queue, b=b)
-    tol = 1e-15
-    from numpy.linalg import norm
-    assert(norm(out_dict['min_val'][0] - np.min(b)) < tol)
-    assert(norm(out_dict['min_index'][0] - np.argmin(b)) < tol)
-
-
-@pytest.mark.parametrize("inline", [False, True])
-def test_packing_unpacking(ctx_factory, inline):
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    x1 = cl.clrandom.rand(queue, (3, 2), dtype=np.float64)
-    x2 = cl.clrandom.rand(queue, (6, ), dtype=np.float64)
-
-    callee1 = lp.make_kernel_function(
-            "{[i]: 0<=i<6}",
-            """
-            a[i] = 2*b[i]
-            """, name="callee_fn1")
-
-    callee2 = lp.make_kernel_function(
-            "{[i, j]: 0<=i<2 and 0 <= j < 3}",
-            """
-            a[i, j] = 3*b[i, j]
-            """, name="callee_fn2")
-
-    knl = lp.make_kernel(
-            "{[i, j, k]:  0<= i < 3 and 0 <= j < 2 and 0 <= k < 6}",
-            """
-            [i, j]: y1[i, j] = callee_fn1([i, j]: x1[i, j])
-            [k]: y2[k] = callee_fn2([k]: x2[k])
-            """)
-
-    knl = lp.register_callable_kernel(knl, callee1)
-    knl = lp.register_callable_kernel(knl, callee2)
-
-    knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn1')
-    knl = lp.pack_and_unpack_args_for_call(knl, 'callee_fn2')
-
-    if inline:
-        knl = lp.inline_callable_kernel(knl, 'callee_fn1')
-        knl = lp.inline_callable_kernel(knl, 'callee_fn2')
-
-    knl = lp.set_options(knl, "write_cl")
-    knl = lp.set_options(knl, "return_dict")
-    evt, out_dict = knl(queue, x1=x1, x2=x2)
-
-    y1 = out_dict['y1'].get()
-    y2 = out_dict['y2'].get()
-
-    assert np.linalg.norm(2*x1.get()-y1)/np.linalg.norm(
-            2*x1.get()) < 1e-15
-    assert np.linalg.norm(3*x2.get()-y2)/np.linalg.norm(
-            3*x2.get()) < 1e-15
-
-
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
-- 
GitLab


From 5ed57fe2f50af100a75c08ff1f876c938123d666 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Sun, 12 Aug 2018 18:44:11 +0530
Subject: [PATCH 06/80] minor error handling.

---
 loopy/codegen/__init__.py          | 18 ++++------
 loopy/kernel/__init__.py           | 56 +++++-------------------------
 loopy/kernel/creation.py           |  9 ++---
 loopy/kernel/function_interface.py |  4 ---
 loopy/kernel/instruction.py        | 12 ++-----
 loopy/preprocess.py                | 11 ++----
 loopy/type_inference.py            | 19 ++--------
 7 files changed, 25 insertions(+), 104 deletions(-)

diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 3e675db7..7a25b67e 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -150,7 +150,6 @@ class SeenFunction(ImmutableRecord):
 class CodeGenerationState(object):
     """
     .. attribute:: kernel
-    .. attribute:: target
     .. attribute:: implemented_data_info
 
         a list of :class:`ImplementedDataInfo` objects.
@@ -196,7 +195,7 @@ class CodeGenerationState(object):
     .. attribute:: program_callables_info
     """
 
-    def __init__(self, kernel, target,
+    def __init__(self, kernel,
             implemented_data_info, implemented_domain, implemented_predicates,
             seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map,
             allow_complex,
@@ -206,7 +205,6 @@ class CodeGenerationState(object):
             gen_program_name=None,
             schedule_index_end=None):
         self.kernel = kernel
-        self.target = target
         self.implemented_data_info = implemented_data_info
         self.implemented_domain = implemented_domain
         self.implemented_predicates = implemented_predicates
@@ -224,7 +222,7 @@ class CodeGenerationState(object):
 
     # {{{ copy helpers
 
-    def copy(self, kernel=None, target=None, implemented_data_info=None,
+    def copy(self, kernel=None, implemented_data_info=None,
             implemented_domain=None, implemented_predicates=frozenset(),
             var_subst_map=None, vectorization_info=None,
             is_generating_device_code=None,
@@ -234,9 +232,6 @@ class CodeGenerationState(object):
         if kernel is None:
             kernel = self.kernel
 
-        if target is None:
-            target = self.target
-
         if implemented_data_info is None:
             implemented_data_info = self.implemented_data_info
 
@@ -257,7 +252,6 @@ class CodeGenerationState(object):
 
         return CodeGenerationState(
                 kernel=kernel,
-                target=target,
                 implemented_data_info=implemented_data_info,
                 implemented_domain=implemented_domain or self.implemented_domain,
                 implemented_predicates=(
@@ -389,7 +383,7 @@ class PreambleInfo(ImmutableRecord):
 
 # {{{ main code generation entrypoint
 
-def generate_code_for_a_single_kernel(kernel, program_callables_info, target):
+def generate_code_for_a_single_kernel(kernel, program_callables_info):
     """
     :returns: a :class:`CodeGenerationResult`
     """
@@ -477,7 +471,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info, target):
             gen_program_name=(
                 kernel.target.host_program_name_prefix
                 + kernel.name
-                + target.host_program_name_suffix),
+                + kernel.target.host_program_name_suffix),
             schedule_index_end=len(kernel.schedule),
             program_callables_info=program_callables_info)
 
@@ -512,7 +506,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info, target):
             )
 
     preamble_generators = (kernel.preamble_generators
-            + target.get_device_ast_builder().preamble_generators())
+            + kernel.target.get_device_ast_builder().preamble_generators())
     for prea_gen in preamble_generators:
         preambles.extend(prea_gen(preamble_info))
 
@@ -555,7 +549,7 @@ def generate_code_v2(program):
         if isinstance(in_knl_callable, CallableKernel):
             codegen_results[func_id] = (
                     generate_code_for_a_single_kernel(in_knl_callable.subkernel,
-                        program.program_callables_info, program.target))
+                        program.program_callables_info))
 
     device_preambles = set()
     for cgr in codegen_results.values():
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index d2723c57..f686e58f 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1036,25 +1036,19 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 self.get_iname_bounds(iname, constants_only=True).size,
                 constants_only=True)))
 
-    def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids,
-            program_callables_info, ignore_auto=False):
+    def get_grid_sizes_for_insn_ids(self, insn_ids, program_callables_info,
+            ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
         could accommodate execution of all instructions whose IDs are given
         in *insn_ids*.
-
         :arg insn_ids: a :class:`frozenset` of instruction IDs
-
-        *global_size* and *local_size* are instances of :class:`dict` with
-        mapping of the form from ``axis`` to :class:`islpy.PwAff` objects.
+        *global_size* and *local_size* are :class:`islpy.PwAff` objects.
         """
 
-        # {{{ collecting the callee kernels in insn_ids
-
-        from loopy.kernel.tools import get_direct_callee_kernels
-        callee_kernels = get_direct_callee_kernels(self,
-                program_callables_info, insn_ids)
-
-        # }}}
+        if self.overridden_get_grid_sizes_for_insn_ids:
+            return self.overridden_get_grid_sizes_for_insn_ids(
+                    insn_ids,
+                    ignore_auto=ignore_auto)
 
         all_inames_by_insns = set()
         for insn_id in insn_ids:
@@ -1069,15 +1063,6 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         global_sizes = {}
         local_sizes = {}
 
-        # updating the grid sizes from the callee_kernels.
-        for callee_kernel in callee_kernels:
-            gsize, lsize = callee_kernel.get_grid_sizes_for_insn_ids_as_dicts(
-                    frozenset(insn.id for insn in callee_kernel.instructions),
-                    program_callables_info, ignore_auto)
-
-            global_sizes.update(gsize)
-            local_sizes.update(lsize)
-
         from loopy.kernel.data import (
                 GroupIndexTag, LocalIndexTag,
                 AutoLocalIndexTagBase)
@@ -1118,31 +1103,6 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
             tgt_dict[tag.axis] = size
 
-        return global_sizes, local_sizes
-
-    def get_grid_sizes_for_insn_ids(self, insn_ids, program_callables_info,
-            ignore_auto=False):
-        """Return a tuple (global_size, local_size) containing a grid that
-        could accommodate execution of all instructions whose IDs are given
-        in *insn_ids*.
-
-        :arg insn_ids: a :class:`frozenset` of instruction IDs
-
-        *global_size* and *local_size* are :class:`islpy.PwAff` objects.
-        """
-
-        if self.overridden_get_grid_sizes_for_insn_ids:
-            return self.overridden_get_grid_sizes_for_insn_ids(
-                    insn_ids,
-                    program_callables_info=program_callables_info,
-                    ignore_auto=ignore_auto)
-
-        assert self.is_called_from_host, ("Callee kernels do not have sufficient "
-                "information to compute grid sizes.")
-
-        global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts(
-                insn_ids, program_callables_info, ignore_auto=ignore_auto)
-
         def to_dim_tuple(size_dict, which, forced_sizes={}):
             forced_sizes = forced_sizes.copy()
 
@@ -1172,6 +1132,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         return (to_dim_tuple(global_sizes, "global"),
                 to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes))
 
+
+
     def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids,
             program_callables_info, ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index bac4afc8..bc996d9c 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -27,16 +27,13 @@ THE SOFTWARE.
 import numpy as np
 
 from pymbolic.mapper import CSECachingMapperMixin
-from pymbolic.primitives import Slice, Variable, Subscript
 from loopy.tools import intern_frozenset_of_ids
 from loopy.symbolic import (
-        IdentityMapper, WalkMapper, SubArrayRef)
+        IdentityMapper, WalkMapper)
 from loopy.kernel.data import (
         InstructionBase,
         MultiAssignmentBase, Assignment,
         SubstitutionRule, AddressSpace)
-from loopy.kernel.instruction import (CInstruction, _DataObliviousInstruction,
-        CallInstruction)
 from loopy.diagnostic import LoopyError, warn_with_kernel
 import islpy as isl
 from islpy import dim_type
@@ -507,11 +504,9 @@ def parse_insn(groups, insn_options):
             assignee_names.append(inner_lhs_i.name)
         elif isinstance(inner_lhs_i, (Subscript, LinearSubscript)):
             assignee_names.append(inner_lhs_i.aggregate.name)
-        elif isinstance(inner_lhs_i, SubArrayRef):
-            assignee_names.append(inner_lhs_i.subscript.aggregate.name)
         else:
             raise LoopyError("left hand side of assignment '%s' must "
-                    "be variable, subscript or a SubArrayRef" % (lhs_i,))
+                    "be variable or subscript" % (lhs_i,))
 
         new_lhs.append(lhs_i)
 
diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
index 8b24da21..e0954fb7 100644
--- a/loopy/kernel/function_interface.py
+++ b/loopy/kernel/function_interface.py
@@ -571,13 +571,9 @@ class CallableKernel(InKernelCallable):
         # no type casting in array calls
         from loopy.expression import dtype_to_type_context
         from pymbolic.mapper.stringifier import PREC_NONE
-        from loopy.symbolic import SubArrayRef
         from pymbolic import var
 
         c_parameters = [
-                expression_to_code_mapper(par, PREC_NONE,
-                    dtype_to_type_context(target, par_dtype),
-                    par_dtype).expr if isinstance(par, SubArrayRef) else
                 expression_to_code_mapper(par, PREC_NONE,
                     dtype_to_type_context(target, par_dtype),
                     par_dtype).expr
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 0f548bba..2a03ad63 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -487,7 +487,7 @@ class InstructionBase(ImmutableRecord):
 
 def _get_assignee_var_name(expr):
     from pymbolic.primitives import Variable, Subscript, Lookup
-    from loopy.symbolic import LinearSubscript, SubArrayRef
+    from loopy.symbolic import LinearSubscript
 
     if isinstance(expr, Lookup):
         expr = expr.aggregate
@@ -507,19 +507,13 @@ def _get_assignee_var_name(expr):
 
         return agg.name
 
-    elif isinstance(expr, SubArrayRef):
-        agg = expr.subscript.aggregate
-        assert isinstance(agg, Variable)
-
-        return agg.name
-
     else:
         raise RuntimeError("invalid lvalue '%s'" % expr)
 
 
 def _get_assignee_subscript_deps(expr):
     from pymbolic.primitives import Variable, Subscript, Lookup
-    from loopy.symbolic import LinearSubscript, get_dependencies, SubArrayRef
+    from loopy.symbolic import LinearSubscript, get_dependencies
 
     if isinstance(expr, Lookup):
         expr = expr.aggregate
@@ -530,8 +524,6 @@ def _get_assignee_subscript_deps(expr):
         return get_dependencies(expr.index)
     elif isinstance(expr, LinearSubscript):
         return get_dependencies(expr.index)
-    elif isinstance(expr, SubArrayRef):
-        return get_dependencies(expr.get_begin_subscript().index)
     else:
         raise RuntimeError("invalid lvalue '%s'" % expr)
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 3657967a..bf23c4a4 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -2165,7 +2165,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper):
     def map_call(self, expr, expn_state, **kwargs):
         from pymbolic.primitives import Call, CallWithKwargs
         from loopy.kernel.function_interface import ValueArgDescriptor
-        from loopy.symbolic import ResolvedFunction, SubArrayRef
+        from loopy.symbolic import ResolvedFunction
 
         if not isinstance(expr.function, ResolvedFunction):
             # ignore if the call is not to a ResolvedFunction
@@ -2178,8 +2178,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper):
             kw_parameters = expr.kw_parameters
 
         # descriptors for the args and kwargs of the Call
-        arg_id_to_descr = dict((i, par.get_array_arg_descriptor(self.caller_kernel))
-                if isinstance(par, SubArrayRef) else (i, ValueArgDescriptor())
+        arg_id_to_descr = dict((i, ValueArgDescriptor())
                 for i, par in tuple(enumerate(expr.parameters)) +
                 tuple(kw_parameters.items()))
 
@@ -2190,11 +2189,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper):
             assignees = kwargs['assignees']
             assert isinstance(assignees, tuple)
             for i, par in enumerate(assignees):
-                if isinstance(par, SubArrayRef):
-                    assignee_id_to_descr[-i-1] = (
-                            par.get_array_arg_descriptor(self.caller_kernel))
-                else:
-                    assignee_id_to_descr[-i-1] = ValueArgDescriptor()
+                assignee_id_to_descr[-i-1] = ValueArgDescriptor()
 
         # gathering all the descriptors
         combined_arg_id_to_descr = arg_id_to_descr.copy()
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 0e8fa305..3ae9a142 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -36,7 +36,7 @@ from loopy.diagnostic import (
 from loopy.kernel.instruction import _DataObliviousInstruction
 
 from loopy.program import ProgramCallablesInfo
-from loopy.symbolic import SubArrayRef, LinearSubscript
+from loopy.symbolic import LinearSubscript
 from pymbolic.primitives import Variable, Subscript, Lookup
 
 import logging
@@ -548,10 +548,6 @@ class TypeInferenceMapper(CombineMapper):
             return [expr.operation.result_dtypes(self.kernel, rec_result)[0]
                     for rec_result in rec_results]
 
-    def map_sub_array_ref(self, expr):
-        return self.rec(expr.get_begin_subscript())
-
-
 # }}}
 
 
@@ -831,17 +827,8 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
                             assignee.aggregate.name].dtype is None:
                         return False
             else:
-                assert isinstance(assignee, SubArrayRef)
-                if assignee.subscript.aggregate.name in kernel.arg_dict:
-                    if kernel.arg_dict[
-                            assignee.subscript.aggregate.name].dtype is None:
-                        return False
-                else:
-                    assert assignee.subscript.aggregate.name in (
-                            kernel.temporary_variables)
-                    if kernel.temporary_variables[
-                            assignee.subscript.aggregate.name] is None:
-                        return False
+                raise NotImplementedError("Unknown assignee type %s" %
+                        type(assignee))
 
         return True
 
-- 
GitLab


From 79fed9786ce5ae90c367ac6cbff1192678aa1014 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Sun, 12 Aug 2018 18:55:30 +0530
Subject: [PATCH 07/80] Flake8

---
 loopy/isl_helpers.py     |  2 +-
 loopy/kernel/__init__.py | 11 -----------
 loopy/target/opencl.py   |  5 -----
 3 files changed, 1 insertion(+), 17 deletions(-)

diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index ef07b7e2..5a747d07 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -27,7 +27,7 @@ THE SOFTWARE.
 
 from six.moves import range, zip
 
-from loopy.diagnostic import StaticValueFindingError, LoopyError
+from loopy.diagnostic import StaticValueFindingError
 
 import islpy as isl
 from islpy import dim_type
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index f686e58f..f5e105c7 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -221,11 +221,6 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         A subclass of :class:`loopy.TargetBase`.
 
-    .. attribute:: is_called_from_host
-
-        An instance of :class:`bool`. Will be set *False* for the kernel which
-        would be called from another top level kernels. Default value is
-        *True*.
     """
 
     # {{{ constructor
@@ -254,8 +249,6 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             state=KernelState.INITIAL,
             target=None,
 
-            is_called_from_host=True,
-
             overridden_get_grid_sizes_for_insn_ids=None,
             _cached_written_variables=None):
         """
@@ -368,7 +361,6 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 options=options,
                 state=state,
                 target=target,
-                is_called_from_host=is_called_from_host,
                 overridden_get_grid_sizes_for_insn_ids=(
                     overridden_get_grid_sizes_for_insn_ids),
                 _cached_written_variables=_cached_written_variables)
@@ -1132,8 +1124,6 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         return (to_dim_tuple(global_sizes, "global"),
                 to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes))
 
-
-
     def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids,
             program_callables_info, ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
@@ -1456,7 +1446,6 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             "silenced_warnings",
             "options",
             "state",
-            "is_called_from_host",
             "target",
             )
 
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 44f782a7..44bf9c4c 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -470,11 +470,6 @@ class OpenCLCASTBuilder(CASTBuilder):
 
         from loopy.target.c import FunctionDeclarationWrapper
         assert isinstance(fdecl, FunctionDeclarationWrapper)
-        if not codegen_state.kernel.is_called_from_host:
-            # auxiliary kernels need not mention opencl speicific qualifiers
-            # for a functions signature
-            return fdecl
-
         fdecl = fdecl.subdecl
 
         from cgen.opencl import CLKernel, CLRequiredWorkGroupSize
-- 
GitLab


From ec84ad60427fa2ebf2accf03e4b9432bece54be6 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Sun, 12 Aug 2018 19:21:46 +0530
Subject: [PATCH 08/80] adds program_callables_info to grid_override...

---
 loopy/kernel/__init__.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index f5e105c7..be66cf85 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1040,6 +1040,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         if self.overridden_get_grid_sizes_for_insn_ids:
             return self.overridden_get_grid_sizes_for_insn_ids(
                     insn_ids,
+                    program_callables_info,
                     ignore_auto=ignore_auto)
 
         all_inames_by_insns = set()
-- 
GitLab


From dd995d883c7ea00950f7121533c86a0638cd2b10 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Sun, 12 Aug 2018 19:47:04 +0530
Subject: [PATCH 09/80] took the test to the earlier state.

---
 test/test_loopy.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index 02eeda13..43371c8a 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -409,11 +409,14 @@ def test_ilp_write_race_detection_global(ctx_factory):
 
     knl = lp.tag_inames(knl, dict(j="ilp"))
 
+    knl = lp.preprocess_kernel(knl)
+
     with lp.CacheMode(False):
         from loopy.diagnostic import WriteRaceConditionWarning
         from warnings import catch_warnings
         with catch_warnings(record=True) as warn_list:
-            lp.generate_code_v2(knl)
+            list(lp.generate_loop_schedules(knl.root_kernel,
+                    knl.program_callables_info))
 
             assert any(isinstance(w.message, WriteRaceConditionWarning)
                     for w in warn_list)
-- 
GitLab


From 82a16b6cc6709b5a9f516ef5b1da376b92782b8d Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Mon, 13 Aug 2018 11:27:00 +0530
Subject: [PATCH 10/80] fix the style of code to get started with changing
 ProgramCallablesInfo

---
 loopy/kernel/__init__.py           |  3 +-
 loopy/kernel/function_interface.py |  4 +-
 loopy/library/reduction.py         |  2 +-
 loopy/program.py                   | 70 +++++++-----------------------
 loopy/statistics.py                |  6 +--
 loopy/symbolic.py                  |  8 ++--
 6 files changed, 27 insertions(+), 66 deletions(-)

diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index be66cf85..3f637e53 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1360,7 +1360,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
     # {{{ direct execution
 
     def __call__(self, *args, **kwargs):
-        # FIXME: scream and then convert to a program
+        raise LoopyError("Calling a LoopKernel is deprecated, call a Program "
+                "instead.")
         from loopy.program import make_program_from_kernel
         program = make_program_from_kernel(self)
         return program(*args, **kwargs)
diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
index e0954fb7..8c3a6911 100644
--- a/loopy/kernel/function_interface.py
+++ b/loopy/kernel/function_interface.py
@@ -676,8 +676,8 @@ def next_indexed_variable(function):
         or :class:`loopy.reduction.ArgExtOp` or
         :class:`loopy.reduction.SegmentedOp`.
     """
-    from loopy.library.reduction import ArgExtOp, SegmentedOp
-    if isinstance(function, (ArgExtOp, SegmentedOp)):
+    from loopy.library.reduction import ReductionOpFunction
+    if isinstance(function, ReductionOpFunction):
         return function.copy()
     func_name = re.compile(r"^(?P<alpha>\S+?)_(?P<num>\d+?)$")
 
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index 6ec8e4b2..b968192e 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -504,7 +504,7 @@ class ReductionCallable(ScalarCallable):
 
 
 def reduction_scoper(target, identifier):
-    if isinstance(identifier, (ArgExtOp, SegmentedOp)):
+    if isinstance(identifier, ReductionOpFunction):
         return ReductionCallable(name=identifier)
 
     return None
diff --git a/loopy/program.py b/loopy/program.py
index 096bd1ec..279228af 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -298,14 +298,7 @@ class Program(ImmutableRecord):
         return pex(*args, **kwargs)
 
     def __str__(self):
-        # FIXME: make this better
-        print(self.program_callables_info.num_times_callables_called)
-        return (
-                (self.program_callables_info[
-                    self.name].subkernel).__str__() +
-                '\nResolved Functions: ' +
-                (self.program_callables_info.resolved_functions.keys()).__str__() +
-                '\n' + 75*'-' + '\n')
+        return self.root_kernel.__str__()
 
 # }}}
 
@@ -315,14 +308,14 @@ def next_indexed_function_identifier(function):
     Returns an instance of :class:`str` with the next indexed-name in the
     sequence for the name of *function*.
 
-    *Example:* ``Variable('sin_0')`` will return ``'sin_1'``.
+    *Example:* ``'sin_0'`` will return ``'sin_1'``.
 
-    :arg function: Either an instance of :class:`pymbolic.primitives.Variable`
-        or :class:`loopy.reduction.ArgExtOp` or
-        :class:`loopy.reduction.SegmentedOp`.
+    :arg function: Either an instance of :class:`str`,
+        :class:`pymbolic.primitives.Variable`, or
+        :class:`loopy.library.reduction.ReductionOpFunction`.
     """
-    from loopy.library.reduction import ArgExtOp, SegmentedOp
-    if isinstance(function, (ArgExtOp, SegmentedOp)):
+    from loopy.library.reduction import ReductionOpFunction
+    if isinstance(function, ReductionOpFunction):
         return function.copy()
     elif isinstance(function, str):
         function = Variable(function)
@@ -371,12 +364,8 @@ def rename_resolved_functions_in_a_single_kernel(kernel,
 # {{{ program callables info
 
 class ProgramCallablesInfo(ImmutableRecord):
-    # FIXME: dont evalutate num_times_called, rahter compute it from the
-    # resolved_functions
-    # FIXME: make the edit callables thing a ContextManager.
     def __init__(self, resolved_functions, num_times_callables_called=None,
             history=None, is_being_edited=False,
-            num_times_hit_during_editing={},
             renames_needed_after_editing={}):
 
         if num_times_callables_called is None:
@@ -391,23 +380,19 @@ class ProgramCallablesInfo(ImmutableRecord):
                 num_times_callables_called=num_times_callables_called,
                 history=history,
                 is_being_edited=is_being_edited,
-                num_times_hit_during_editing=num_times_hit_during_editing,
                 renames_needed_after_editing=renames_needed_after_editing)
 
     hash_fields = (
             "resolved_functions",
             "num_times_callables_called",
             "is_being_edited",
-            "num_times_hit_during_editing",
             "renames_needed_after_editing",
             "history")
 
     update_persistent_hash = LoopKernel.update_persistent_hash
 
     def with_edit_callables_mode(self):
-        return self.copy(is_being_edited=True,
-                num_times_hit_during_editing=dict((func_id, 0) for func_id in
-                    self.resolved_functions))
+        return self.copy(is_being_edited=True)
 
     def with_callable(self, function, in_kernel_callable,
             resolved_for_the_first_time=False):
@@ -426,6 +411,10 @@ class ProgramCallablesInfo(ImmutableRecord):
         # FIXME: add a note about using enter and exit. ~KK
         # FIXME: think about a better idea of "with_added_callable" this would
         # be more convenient for developer-faced usage. ~KK
+        # FIXME: Is this bad code? Yes.
+        # Is there a better alternative to it? Definitely maybe.
+        # But I don't want to spend the next 182 years of my life optimizing
+        # some scheme without even implementing it for some problem!
 
         if not self.is_being_edited:
             if function.name in self.resolved_functions and (
@@ -436,29 +425,22 @@ class ProgramCallablesInfo(ImmutableRecord):
                 print('New: ', in_kernel_callable)
                 raise LoopyError("Use 'enter_edit_callables_mode' first.")
 
-        from loopy.library.reduction import ArgExtOp, SegmentedOp
+        from loopy.library.reduction import ReductionOpFunction
 
         # {{{ sanity checks
 
         if isinstance(function, str):
             function = Variable(function)
 
-        assert isinstance(function, (Variable, ArgExtOp, SegmentedOp))
+        assert isinstance(function, (Variable, ReductionOpFunction))
 
         # }}}
 
         renames_needed_after_editing = self.renames_needed_after_editing.copy()
-        num_times_hit_during_editing = self.num_times_hit_during_editing.copy()
         num_times_callables_called = self.num_times_callables_called.copy()
         history = self.history.copy()
 
-        if not resolved_for_the_first_time:
-            if isinstance(function, (ArgExtOp, SegmentedOp)):
-                num_times_hit_during_editing[function] += 1
-            else:
-                num_times_hit_during_editing[function.name] += 1
-
-        if isinstance(function, (ArgExtOp, SegmentedOp)):
+        if isinstance(function, ReductionOpFunction):
             unique_function_identifier = function.copy()
             if not resolved_for_the_first_time:
                 num_times_callables_called[function] -= 1
@@ -473,8 +455,6 @@ class ProgramCallablesInfo(ImmutableRecord):
                     self.copy(
                         resolved_functions=updated_resolved_functions,
                         num_times_callables_called=num_times_callables_called,
-                        num_times_hit_during_editing=(
-                            num_times_hit_during_editing),
                         renames_needed_after_editing=(
                             renames_needed_after_editing)),
                     unique_function_identifier)
@@ -494,17 +474,12 @@ class ProgramCallablesInfo(ImmutableRecord):
                     return (
                             self.copy(
                                 history=history,
-                                num_times_hit_during_editing=(
-                                    num_times_hit_during_editing),
                                 num_times_callables_called=(
                                     num_times_callables_called),
                                 renames_needed_after_editing=(
                                     renames_needed_after_editing)),
                             func_id)
         else:
-            # FIXME: maybe deal with the history over here?
-            # FIXME: once the code logic is running beautify this part.
-            # many "ifs" can be avoided
             unique_function_identifier = function.name
             if (resolved_for_the_first_time or
                     self.num_times_callables_called[function.name] > 1):
@@ -534,7 +509,6 @@ class ProgramCallablesInfo(ImmutableRecord):
                     history=history,
                     resolved_functions=updated_resolved_functions,
                     num_times_callables_called=num_times_callables_called,
-                    num_times_hit_during_editing=num_times_hit_during_editing,
                     renames_needed_after_editing=renames_needed_after_editing),
                 Variable(unique_function_identifier))
 
@@ -576,7 +550,6 @@ class ProgramCallablesInfo(ImmutableRecord):
                 is_being_edited=False,
                 resolved_functions=resolved_functions,
                 num_times_callables_called=num_times_callables_called,
-                num_times_hit_during_editing={},
                 renames_needed_after_editing={})
 
     def with_deleted_callable(self, func_id, instances=1):
@@ -668,17 +641,4 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel):
     return wraps(transform_for_single_kernel)(_collective_transform)
 
 
-# {{{ ingoring this for now
-
-# if False and isinstance(function, (ArgExtOp, SegmentedOp)):
-# FIXME: ignoring this casse for now
-# FIXME: If a kernel has two flavors of ArgExtOp then they are
-# overwritten and hence not supported.(for now).
-# updated_resolved_functions = self.scoped_functions.copy()
-# updated_resolved_functions[function] = in_kernel_callable
-# return self.copy(updated_resolved_functions), function.copy()
-
-# }}}
-
-
 # vim: foldmethod=marker
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 08b7f89e..95e9f62a 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -64,9 +64,9 @@ __doc__ = """
 # Qns:
 # - The variable name, what if multiple kernels use the same name?
 # - We should also add the cumulative effect on the arguments of callee kernels
-# into the caller kernel.
-# FIXME: add an error that there is only one callable kernel. disable for
-# multiple callable kernels.
+# into the caller kernel
+# - Make changes to MemAccessInfo to include the effect of several kernels.
+# - Renovate `count`.
 
 # {{{ GuardedPwQPolynomial
 
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 7a268d06..92b209ac 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -677,16 +677,16 @@ class ResolvedFunction(p.Expression):
     def __init__(self, function):
         if isinstance(function, str):
             function = p.Variable(function)
-        from loopy.library.reduction import ArgExtOp, SegmentedOp
-        assert isinstance(function, (p.Variable, ArgExtOp, SegmentedOp))
+        from loopy.library.reduction import ReductionOpFunction
+        assert isinstance(function, (p.Variable, ReductionOpFunction))
         self.function = function
 
     @property
     def name(self):
-        from loopy.library.reduction import ArgExtOp, SegmentedOp
+        from loopy.library.reduction import ReductionOpFunction
         if isinstance(self.function, p.Variable):
             return self.function.name
-        elif isinstance(self.function, (ArgExtOp, SegmentedOp)):
+        elif isinstance(self.function, ReductionOpFunction):
             return self.function
         else:
             raise LoopyError("Unexpected function type %s in ResolvedFunction." %
-- 
GitLab


From 88d746d0d041435d33aebd2a301855647c054ebe Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Mon, 13 Aug 2018 20:38:16 +0530
Subject: [PATCH 11/80] started with beautifying code.

---
 loopy/program.py | 108 ++++++++++++++++++++++++++++++++++++++++++++---
 1 file changed, 102 insertions(+), 6 deletions(-)

diff --git a/loopy/program.py b/loopy/program.py
index 279228af..1b9d03d4 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -165,6 +165,35 @@ def initialize_program_callables_info_from_kernel(
 # {{{ program definition
 
 class Program(ImmutableRecord):
+    """
+    Records the information about all the callables in a :mod:`loopy` program.
+
+    .. attribute:: name
+
+        An instance of :class:`str`, also the name of the top-most level
+        :class:`loopy.LoopKernel`.
+
+    .. attribute:: program_callables_info
+
+        An instance of :class:`loopy.program.ProgramCallablesInfo`.
+
+    .. attribute:: target
+
+        An instance of :class:`loopy.target.TargetBase`.
+
+    .. attribute:: func_id_to_in_knl_callable_mappers
+
+        A list of functions of the signature ``(target: TargetBase,
+        function_identifier: str)`` that would return an instance of
+        :class:`loopy.kernel.function_interface.InKernelCallable` or *None*.
+
+    .. note::
+
+        - To create an instance of :class:`loopy.Program`, it is recommended to
+          go through :func:`loopy.make_kernel`.
+        - This data structure and its attributes should be considered
+          immutable; any modifications should be done through :meth:`copy`.
+    """
     def __init__(self,
             name,
             program_callables_info,
@@ -172,8 +201,6 @@ class Program(ImmutableRecord):
             func_id_to_in_knl_callable_mappers):
         assert isinstance(program_callables_info, ProgramCallablesInfo)
 
-        # FIXME: check if all sanity checks have been covered?
-        # FIXME: The comments over here may need some attention.
         assert name in program_callables_info
 
         super(Program, self).__init__(
@@ -194,6 +221,7 @@ class Program(ImmutableRecord):
 
     def copy(self, **kwargs):
         if 'target' in kwargs:
+            # target attribute of all the callable kernels should be updated.
             target = kwargs['target']
             new_self = super(Program, self).copy(**kwargs)
             new_resolved_functions = {}
@@ -266,13 +294,43 @@ class Program(ImmutableRecord):
 
     @property
     def root_kernel(self):
+        """
+        Returns an instance of :class:`loopy.LoopKernel` denoting the
+        topmost-level kernel in code generation.
+
+        .. note::
+
+            Syntactic sugar.
+        """
         return self.program_callables_info[self.name].subkernel
 
     @property
     def arg_dict(self):
+        """
+        Returns ``arg_dict`` of the ``root_kernel``.
+
+        .. note::
+
+            Syntactic sugar.
+        """
         return self.root_kernel.arg_dict
 
+    @property
+    def args(self):
+        """
+        Returns ``args`` of the ``root_kernel``.
+
+        .. note::
+
+            Syntactic sugar.
+        """
+        return self.root_kernel.args[:]
+
     def with_root_kernel(self, root_kernel):
+        """
+        Returns a copy of *self* with the topmost level kernel as
+        *root_kernel*.
+        """
         new_in_knl_callable = self.program_callables_info[
                 self.name].copy(subkernel=root_kernel)
         new_resolved_functions = (
@@ -283,10 +341,6 @@ class Program(ImmutableRecord):
                 program_callables_info=self.program_callables_info.copy(
                     resolved_functions=new_resolved_functions))
 
-    @property
-    def args(self):
-        return self.root_kernel.args[:]
-
     def __call__(self, *args, **kwargs):
         key = self.target.get_kernel_executor_cache_key(*args, **kwargs)
         try:
@@ -336,6 +390,10 @@ def next_indexed_function_identifier(function):
 
 
 class ResolvedFunctionRenamer(RuleAwareIdentityMapper):
+    """
+    Mapper to rename the resolved functions in an expression according to
+    *renaming_dict*.
+    """
     def __init__(self, rule_mapping_context, renaming_dict):
         super(ResolvedFunctionRenamer, self).__init__(
                 rule_mapping_context)
@@ -351,6 +409,10 @@ class ResolvedFunctionRenamer(RuleAwareIdentityMapper):
 
 def rename_resolved_functions_in_a_single_kernel(kernel,
         renaming_dict):
+    """
+    Returns a copy of *kernel* with the instances of :class:`ResolvedFunction`
+    renamed according to *renaming_dict*.
+    """
     from loopy.symbolic import SubstitutionRuleMappingContext
     rule_mapping_context = SubstitutionRuleMappingContext(
                 kernel.substitutions, kernel.get_var_name_generator())
@@ -364,6 +426,40 @@ def rename_resolved_functions_in_a_single_kernel(kernel,
 # {{{ program callables info
 
 class ProgramCallablesInfo(ImmutableRecord):
+    """
+    Records the information of all the callables called in a :class:`loopy.Program`.
+
+    .. attribute:: resolved_functions
+
+        An instance of :class:`dict` that contains a mapping from function
+        identifier to instances of
+        :class:`loopy.kernel.function_interface.InKernelCallable`
+
+    .. attribute:: num_times_callables_called
+
+        An instance of :class:`dict` that contains a mapping from function
+        identifier to :class:`int`, that denotes the number of times the
+        callable is being called in the entire :class:`loopy.Program`.
+
+    .. attribute:: history
+
+        An instance of :class:`dict` that contains a mapping from function
+        identifier to an instance of :class:`list` that would contain all the
+        names taken by a function before the current name. (For example: one
+        possibility could be ``{'sin_1': ['sin', 'sin_0', 'sin_1']}``.)
+
+    .. attribute:: is_being_edited
+
+        An instance of :class:`bool` which is intended to aid the working of
+        :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and
+        :meth:`with_exit_edit_callables_mode`.
+
+    .. attribute:: renames_needed_after_editing
+
+        An instance of :class:`dict` which is intended to aid the working of
+        :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and
+        :meth:`with_exit_edit_callables_mode`.
+    """
     def __init__(self, resolved_functions, num_times_callables_called=None,
             history=None, is_being_edited=False,
             renames_needed_after_editing={}):
-- 
GitLab


From e3277fa2d162f773072109a951f05e24816a88e0 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Mon, 13 Aug 2018 21:00:10 +0530
Subject: [PATCH 12/80] changes in program_callables_info design.

---
 loopy/kernel/__init__.py |  7 +++++++
 loopy/program.py         | 42 ++++++++++++++++++++++------------------
 2 files changed, 30 insertions(+), 19 deletions(-)

diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 3f637e53..3b189da5 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -221,6 +221,11 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         A subclass of :class:`loopy.TargetBase`.
 
+    .. attribute:: is_called_from_host
+        An instance of :class:`bool`. Set to *False* for a kernel that is
+        called from another top-level kernel. Default value is
+        *True*.
+
     """
 
     # {{{ constructor
@@ -248,6 +253,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
             state=KernelState.INITIAL,
             target=None,
+            is_called_from_host=True,
 
             overridden_get_grid_sizes_for_insn_ids=None,
             _cached_written_variables=None):
@@ -361,6 +367,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 options=options,
                 state=state,
                 target=target,
+                is_called_from_host=is_called_from_host,
                 overridden_get_grid_sizes_for_insn_ids=(
                     overridden_get_grid_sizes_for_insn_ids),
                 _cached_written_variables=_cached_written_variables)
diff --git a/loopy/program.py b/loopy/program.py
index 1b9d03d4..0dc327aa 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -460,9 +460,9 @@ class ProgramCallablesInfo(ImmutableRecord):
         :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and
         :meth:`with_exit_edit_callables_mode`.
     """
-    def __init__(self, resolved_functions, num_times_callables_called=None,
-            history=None, is_being_edited=False,
-            renames_needed_after_editing={}):
+    def __init__(self, resolved_functions,
+            num_times_callables_called=None, history=None,
+            is_being_edited=False, renames_needed_after_editing={}):
 
         if num_times_callables_called is None:
             num_times_callables_called = dict((func_id, 1) for func_id in
@@ -487,11 +487,22 @@ class ProgramCallablesInfo(ImmutableRecord):
 
     update_persistent_hash = LoopKernel.update_persistent_hash
 
+    def add_callable(self, function, in_kernel_callable):
+
+        history[unique_function_identifier] = set(
+                [unique_function_identifier])
+        pass
+
+    def with_updated_num_times_being_called(self):
+        root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable
+                in self.resolved_functions.values() if
+                isinstance(in_knl_callable, CallableKernel) and
+                in_knl_callable.is_called_from_host]
+
     def with_edit_callables_mode(self):
         return self.copy(is_being_edited=True)
 
-    def with_callable(self, function, in_kernel_callable,
-            resolved_for_the_first_time=False):
+    def with_callable(self, function, in_kernel_callable):
         """
         :arg function: An instance of :class:`pymbolic.primitives.Variable` or
             :class:`loopy.library.reduction.ReductionOpFunction`.
@@ -538,8 +549,7 @@ class ProgramCallablesInfo(ImmutableRecord):
 
         if isinstance(function, ReductionOpFunction):
             unique_function_identifier = function.copy()
-            if not resolved_for_the_first_time:
-                num_times_callables_called[function] -= 1
+            num_times_callables_called[function] -= 1
 
             num_times_callables_called[unique_function_identifier] = 1
 
@@ -561,12 +571,11 @@ class ProgramCallablesInfo(ImmutableRecord):
             for func_id, in_knl_callable in self.resolved_functions.items():
                 if in_knl_callable == in_kernel_callable:
                     num_times_callables_called[func_id] += 1
-                    if not resolved_for_the_first_time:
-                        num_times_callables_called[function.name] -= 1
-                        if num_times_callables_called[function.name] == 0:
-                            renames_needed_after_editing[func_id] = function.name
+                    num_times_callables_called[function.name] -= 1
+                    if num_times_callables_called[function.name] == 0:
+                        renames_needed_after_editing[func_id] = function.name
 
-                        history[func_id] = history[func_id] | set([function.name])
+                    history[func_id] = history[func_id] | set([function.name])
                     return (
                             self.copy(
                                 history=history,
@@ -577,16 +586,13 @@ class ProgramCallablesInfo(ImmutableRecord):
                             func_id)
         else:
             unique_function_identifier = function.name
-            if (resolved_for_the_first_time or
-                    self.num_times_callables_called[function.name] > 1):
+            if self.num_times_callables_called[function.name] > 1:
                 while unique_function_identifier in self.resolved_functions:
                     unique_function_identifier = (
                             next_indexed_function_identifier(
                                 unique_function_identifier))
 
-            if not resolved_for_the_first_time:
-                num_times_callables_called[function.name] -= 1
-
+            num_times_callables_called[function.name] -= 1
             num_times_callables_called[unique_function_identifier] = 1
 
         updated_resolved_functions = self.resolved_functions.copy()
@@ -597,8 +603,6 @@ class ProgramCallablesInfo(ImmutableRecord):
             history[unique_function_identifier] = (
                     history[function.name] | set([unique_function_identifier]))
         else:
-            history[unique_function_identifier] = set(
-                    [unique_function_identifier])
 
         return (
                 self.copy(
-- 
GitLab


From a4ebe862bb8e434fc67d85c4b9201bad12577975 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 14 Aug 2018 09:17:03 +0530
Subject: [PATCH 13/80] new design to interface with program callables info.

---
 loopy/preprocess.py         |   6 +-
 loopy/program.py            | 448 ++++++++++++++++++++++++------------
 loopy/transform/callable.py |  24 +-
 loopy/transform/fusion.py   | 117 +++++-----
 loopy/type_inference.py     |  10 +-
 5 files changed, 384 insertions(+), 221 deletions(-)

diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index bf23c4a4..56db777b 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -2269,6 +2269,9 @@ def traverse_to_infer_arg_descr(kernel, program_callables_info):
 
 def infer_arg_descr(program):
     root_kernel_callable = program.program_callables_info[program.name]
+    from loopy.program import count_callables_in_program_callables_info
+    old_callables_count = count_callables_in_program_callables_info(
+            program.program_callables_info)
     program_callables_info = (
             program.program_callables_info.with_edit_callables_mode())
     root_kernel = program.root_kernel
@@ -2280,7 +2283,8 @@ def infer_arg_descr(program):
     program_callables_info, _ = program_callables_info.with_callable(program.name,
             new_root_kernel_callable)
 
-    program_callables_info = program_callables_info.with_exit_edit_callables_mode()
+    program_callables_info = program_callables_info.with_exit_edit_callables_mode(
+            old_callables_count)
 
     return program.copy(program_callables_info=program_callables_info)
 
diff --git a/loopy/program.py b/loopy/program.py
index 0dc327aa..32869d26 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -29,12 +29,20 @@ from pytools import ImmutableRecord, memoize_method
 from pymbolic.primitives import Variable
 from functools import wraps
 
-from loopy.symbolic import RuleAwareIdentityMapper, ResolvedFunction
+from loopy.symbolic import (
+        RuleAwareIdentityMapper, ResolvedFunction, CombineMapper)
 from loopy.kernel.function_interface import (
         CallableKernel, ScalarCallable)
+from loopy.kernel.instruction import (
+        MultiAssignmentBase, CInstruction, _DataObliviousInstruction)
 from loopy.diagnostic import LoopyError
+from loopy.library.reduction import ReductionOpFunction
 
 from loopy.kernel import LoopKernel
+from collections import Counter
+from pymbolic.primitives import Call, CallWithKwargs
+
+# FIXME: autofunction/autoclass?? ~KK
 
 
 class ResolvedFunctionMarker(RuleAwareIdentityMapper):
@@ -60,7 +68,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper):
         super(ResolvedFunctionMarker, self).__init__(rule_mapping_context)
         self.kernel = kernel
         self.program_callables_info = program_callables_info
-        # FIXME: function_resolvesrs looks like a very bad name change it
         self.function_id_to_in_knl_callable_mappers = (
                 function_id_to_in_knl_callable_mappers)
 
@@ -71,7 +78,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper):
         :arg:`identifier` is known to any kernel function scoper, otherwise returns
         *None*.
         """
-        # FIXME change docs
         for func_id_to_in_knl_callable_mapper in (
                 self.function_id_to_in_knl_callable_mappers):
             # fixme: do we really need to given target for the function
@@ -83,7 +89,6 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper):
         return None
 
     def map_call(self, expr, expn_state):
-        from pymbolic.primitives import Call, CallWithKwargs
         from loopy.symbolic import parse_tagged_name
 
         name, tag = parse_tagged_name(expr.function)
@@ -109,8 +114,8 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper):
                 # resolved in-kernel callable
 
                 self.program_callables_info, new_func_id = (
-                        self.program_callables_info.with_callable(expr.function,
-                            in_knl_callable, True))
+                        self.program_callables_info.with_add_callable(expr.function,
+                            in_knl_callable))
                 return type(expr)(
                         ResolvedFunction(new_func_id),
                         tuple(self.rec(child, expn_state)
@@ -135,10 +140,15 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper):
         return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state)
 
 
-def initialize_program_callables_info_from_kernel(
-        kernel, func_id_to_kernel_callable_mappers):
+def initialize_program_callables_info_from_kernel(kernel):
+    """
+    Returns an instance of :class:`loopy.ProgramCallablesInfo`, by resolving
+    the functions based on :mod:`loopy`'s default function resolvers.
+    """
+    # collect the default function resolvers
+    func_id_to_kernel_callable_mappers = (
+            default_func_id_to_kernel_callable_mappers(kernel.target))
     program_callables_info = ProgramCallablesInfo({})
-    program_callables_info = program_callables_info.with_edit_callables_mode()
 
     from loopy.symbolic import SubstitutionRuleMappingContext
     rule_mapping_context = SubstitutionRuleMappingContext(
@@ -148,16 +158,17 @@ def initialize_program_callables_info_from_kernel(
             rule_mapping_context, kernel, program_callables_info,
             func_id_to_kernel_callable_mappers)
 
-    # scoping fucntions and collecting the scoped functions
+    # mark the functions as "Resolved" in the expression nodes.
     kernel_with_functions_resolved = rule_mapping_context.finish_kernel(
             resolved_function_marker.map_kernel(kernel))
+    # collect the updated program_callables_info
     program_callables_info = resolved_function_marker.program_callables_info
 
     callable_kernel = CallableKernel(kernel_with_functions_resolved)
-    program_callables_info, _ = program_callables_info.with_callable(
-            Variable(kernel.name), callable_kernel, True)
-    program_callables_info = (
-            program_callables_info.with_exit_edit_callables_mode())
+
+    # add the callable kernel to the program_callables_info
+    program_callables_info, _ = program_callables_info.with_add_callable(
+            Variable(kernel.name), callable_kernel)
 
     return program_callables_info
 
@@ -357,33 +368,31 @@ class Program(ImmutableRecord):
 # }}}
 
 
-def next_indexed_function_identifier(function):
+def next_indexed_function_identifier(function_id):
     """
     Returns an instance of :class:`str` with the next indexed-name in the
     sequence for the name of *function*.
 
     *Example:* ``'sin_0'`` will return ``'sin_1'``.
 
-    :arg function: Either an instance of :class:`str`,
-        :class:`pymbolic.primitives.Variable` ,
-        :class:`loopy.reduction.ReductionOpFunction`.
+    :arg function_id: An instance of :class:`str`.
     """
-    from loopy.library.reduction import ReductionOpFunction
-    if isinstance(function, ReductionOpFunction):
-        return function.copy()
-    elif isinstance(function, str):
-        function = Variable(function)
 
-    assert isinstance(function, Variable)
+    # {{{ sanity checks
+
+    assert isinstance(function_id, str)
+
+    # }}}
+
     func_name = re.compile(r"^(?P<alpha>\S+?)_(?P<num>\d+?)$")
 
-    match = func_name.match(function.name)
+    match = func_name.match(function_id)
 
     if match is None:
-        if function.name[-1] == '_':
-            return "{old_name}0".format(old_name=function.name)
+        if function_id[-1] == '_':
+            return "{old_name}0".format(old_name=function_id)
         else:
-            return "{old_name}_0".format(old_name=function.name)
+            return "{old_name}_0".format(old_name=function_id)
 
     return "{alpha}_{num}".format(alpha=match.group('alpha'),
             num=int(match.group('num'))+1)
@@ -423,6 +432,115 @@ def rename_resolved_functions_in_a_single_kernel(kernel,
                 resolved_function_renamer.map_kernel(kernel)))
 
 
+# {{{ counting helpers
+
+class CallablesCountingMapper(CombineMapper):
+    """
+    Returns an instance of :class:`collections.Counter` with the count of
+    callables registered in *program_callables_info*.
+
+    .. attribute:: program_callables_info
+
+        An instance of :class:`loopy.program.ProgramCallablesInfo`.
+    """
+    def __init__(self, program_callables_info):
+        self.program_callables_info = program_callables_info
+
+    def combine(self, values):
+        return sum(values, Counter())
+
+    def map_call(self, expr):
+
+        if isinstance(expr, CallWithKwargs):
+            kw_parameters = expr.kw_parameters
+        else:
+            assert isinstance(expr, Call)
+            kw_parameters = {}
+
+        if isinstance(expr.function, (ResolvedFunction)):
+            in_knl_callable = self.program_callables_info[expr.function.name]
+            if isinstance(in_knl_callable, ScalarCallable):
+                return (Counter([expr.function.name]) +
+                        self.combine((self.rec(child) for child in expr.parameters
+                            + tuple(kw_parameters.values()))))
+
+            elif isinstance(in_knl_callable, CallableKernel):
+
+                # callable kernels have more callables in them.
+                callables_count_in_subkernel = (
+                        count_callables_in_kernel(
+                            in_knl_callable.subkernel,
+                            self.program_callables_info))
+
+                return (Counter([expr.function.name]) +
+                        self.combine((self.rec(child) for child in expr.parameters
+                            + tuple(kw_parameters.values())))) + (
+                                    callables_count_in_subkernel)
+            else:
+                raise NotImplementedError("Unknown callable type %s." % (
+                    type))
+        else:
+            return (
+                    self.combine((self.rec(child) for child in expr.parameters
+                        + tuple(kw_parameters.values()))))
+
+    map_call_with_kwargs = map_call
+
+    def map_constant(self, expr):
+        return Counter()
+
+    map_variable = map_constant
+    map_function_symbol = map_constant
+    map_tagged_variable = map_constant
+    map_type_cast = map_constant
+
+
+# FIXME: @memoize_method
+def count_callables_in_kernel(kernel, program_callables_info):
+    """
+    Returns an instance of :class:`collections.Counter` representing the number
+    of callables in the *kernel* that are registered in
+    *program_callables_info*.
+    """
+    assert isinstance(kernel, LoopKernel)
+    callables_count = Counter()
+    callables_counting_mapper = CallablesCountingMapper(
+            program_callables_info)
+
+    for insn in kernel.instructions:
+        if isinstance(insn, MultiAssignmentBase):
+            callables_count += (
+                    callables_counting_mapper(insn.expression))
+        elif isinstance(insn, (_DataObliviousInstruction, CInstruction)):
+            pass
+        else:
+            raise NotImplementedError("Unknown instruction type %s." % (
+                type(insn)))
+
+    return callables_count
+
+
+# FIXME: @memoize_method
+def count_callables_in_program_callables_info(program_callables_info):
+    """
+    Returns an instance of :class:`collections.Counter` representing the number
+    of times each callable is called in *program_callables_info*.
+    """
+    root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable
+            in program_callables_info.values() if
+            isinstance(in_knl_callable, CallableKernel) and
+            in_knl_callable.subkernel.is_called_from_host]
+
+    from collections import Counter
+    callables_count = Counter([root_kernel_name])
+    callables_count += (
+            count_callables_in_kernel(program_callables_info[
+                root_kernel_name].subkernel, program_callables_info))
+    return callables_count
+
+# }}}
+
+
 # {{{ program callables info
 
 class ProgramCallablesInfo(ImmutableRecord):
@@ -435,12 +553,6 @@ class ProgramCallablesInfo(ImmutableRecord):
         identifier to instances of
         :class:`loopy.kernel.function_interface.InKernelCallable`
 
-    .. attribute:: num_times_callables_called
-
-        An instace of :class:`dict` that contains a mapping from function
-        identifier to :class:`int`, that denotes the number of times the
-        callable is being called in the entire :class:`loopy.Program`.
-
     .. attribute:: history
 
         An instance of :class:`dict` that contains a mapping from function
@@ -453,54 +565,92 @@ class ProgramCallablesInfo(ImmutableRecord):
         An instance of :class:`bool` which is intended to aid the working of
         :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and
         :meth:`with_exit_edit_callables_mode`.
-
-    .. attribute:: renames_needed_after_editing
-
-        An instance of :class:`dict` which is intended to aid the working of
-        :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and
-        :meth:`with_exit_edit_callables_mode`.
     """
     def __init__(self, resolved_functions,
-            num_times_callables_called=None, history=None,
-            is_being_edited=False, renames_needed_after_editing={}):
+            history=None, is_being_edited=False):
 
-        if num_times_callables_called is None:
-            num_times_callables_called = dict((func_id, 1) for func_id in
-                    resolved_functions)
         if history is None:
             history = dict((func_id, set([func_id])) for func_id in
                     resolved_functions)
 
         super(ProgramCallablesInfo, self).__init__(
                 resolved_functions=resolved_functions,
-                num_times_callables_called=num_times_callables_called,
                 history=history,
-                is_being_edited=is_being_edited,
-                renames_needed_after_editing=renames_needed_after_editing)
+                is_being_edited=is_being_edited)
 
     hash_fields = (
             "resolved_functions",
-            "num_times_callables_called",
             "is_being_edited",
-            "renames_needed_after_editing",
             "history")
 
     update_persistent_hash = LoopKernel.update_persistent_hash
 
-    def add_callable(self, function, in_kernel_callable):
+    def with_add_callable(self, function, in_kernel_callable):
+        """
+        Returns a copy of *self* with the *function* associated with the
+        *in_kernel_callable*.
+        """
+        # note: this does not require the edit mode to be true.
+        # the reason for the edit mode is that we need to take care of the
+        # renaming that might be needed to be done
+        # PS: delete this note?
+        history = self.history.copy()
+
+        if in_kernel_callable in self.resolved_functions.values():
+            # the callable already exists, hence return the function
+            # identifier corresponding to that callable.
+            for func_id, in_knl_callable in self.resolved_functions.items():
+                if in_knl_callable == in_kernel_callable:
+                    history[func_id] = history[func_id] | set([function.name])
+                    return (
+                            self.copy(
+                                history=history),
+                            func_id)
+        else:
+
+            # {{{ handle ReductionOpFunction
+
+            if isinstance(function, ReductionOpFunction):
+                unique_function_identifier = function.copy()
+                updated_resolved_functions = self.resolved_functions.copy()
+                updated_resolved_functions[unique_function_identifier] = (
+                        in_kernel_callable)
+
+                return (
+                        self.copy(
+                            resolved_functions=updated_resolved_functions),
+                        unique_function_identifier)
+
+            # }}}
+
+            unique_function_identifier = function.name
+            while unique_function_identifier in self.resolved_functions:
+                unique_function_identifier = (
+                        next_indexed_function_identifier(
+                            unique_function_identifier))
+
+        updated_resolved_functions = self.resolved_functions.copy()
+        updated_resolved_functions[unique_function_identifier] = (
+                in_kernel_callable)
 
         history[unique_function_identifier] = set(
                 [unique_function_identifier])
-        pass
 
-    def with_updated_num_times_being_called(self):
-        root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable
-                in self.resolved_functions.values() if
-                isinstance(in_knl_callable, CallableKernel) and
-                in_knl_callable.is_called_from_host]
+        return (
+                self.copy(
+                    history=history,
+                    resolved_functions=updated_resolved_functions),
+                Variable(unique_function_identifier))
 
     def with_edit_callables_mode(self):
-        return self.copy(is_being_edited=True)
+        """
+        Initiates *self* for a walk traversal through all the callables.
+        """
+        # PS: I don't see a need for this method right now.
+        # This is just for validation purposes; it may need to disappear if a
+        # better solution is found.
+        return self.copy(
+                is_being_edited=True)
 
     def with_callable(self, function, in_kernel_callable):
         """
@@ -512,27 +662,24 @@ class ProgramCallablesInfo(ImmutableRecord):
 
         .. note::
 
-            Assumes that each callable is touched atmost once, the internal
-            working of this function fails if that is violated.
+            - Use :meth:`with_add_callable` if a callable is being resolved for the
+                first time.
         """
-        # FIXME: add a note about using enter and exit. ~KK
-        # FIXME: think about a better idea of "with_added_callable" this would
-        # be more convenient for developer-faced usage. ~KK
-        # FIXME: Is this is a bad code? Yes.
-        # Is there a better alternative to it. Definitely maybe.
-        # But I don't want to spend the next 182 years of my life optimizing
-        # some scheme, without even implmenting it to some problem!
+
+        # {{{ non-edit mode
 
         if not self.is_being_edited:
             if function.name in self.resolved_functions and (
                     self.resolved_functions[function.name] == in_kernel_callable):
+                # if not being edited, check that the given function is
+                # equal to the old version of the callable.
                 return self, function
             else:
                 print('Old: ', self.resolved_functions[function.name])
                 print('New: ', in_kernel_callable)
-                raise LoopyError("Use 'enter_edit_callables_mode' first.")
+                raise LoopyError("Use 'with_enter_edit_callables_mode' first.")
 
-        from loopy.library.reduction import ReductionOpFunction
+        # }}}
 
         # {{{ sanity checks
 
@@ -543,87 +690,90 @@ class ProgramCallablesInfo(ImmutableRecord):
 
         # }}}
 
-        renames_needed_after_editing = self.renames_needed_after_editing.copy()
-        num_times_callables_called = self.num_times_callables_called.copy()
         history = self.history.copy()
 
-        if isinstance(function, ReductionOpFunction):
-            unique_function_identifier = function.copy()
-            num_times_callables_called[function] -= 1
-
-            num_times_callables_called[unique_function_identifier] = 1
-
-            updated_resolved_functions = self.resolved_functions.copy()
-            updated_resolved_functions[unique_function_identifier] = (
-                    in_kernel_callable)
-
-            return (
-                    self.copy(
-                        resolved_functions=updated_resolved_functions,
-                        num_times_callables_called=num_times_callables_called,
-                        renames_needed_after_editing=(
-                            renames_needed_after_editing)),
-                    unique_function_identifier)
-
         if in_kernel_callable in self.resolved_functions.values():
-            # the callable already exists, implies return the function
-            # identifier corresposing to that callable.
+
+            # the callable already exists, hence return the function
+            # identifier corresponding to that callable.
             for func_id, in_knl_callable in self.resolved_functions.items():
                 if in_knl_callable == in_kernel_callable:
-                    num_times_callables_called[func_id] += 1
-                    num_times_callables_called[function.name] -= 1
-                    if num_times_callables_called[function.name] == 0:
-                        renames_needed_after_editing[func_id] = function.name
-
                     history[func_id] = history[func_id] | set([function.name])
                     return (
                             self.copy(
-                                history=history,
-                                num_times_callables_called=(
-                                    num_times_callables_called),
-                                renames_needed_after_editing=(
-                                    renames_needed_after_editing)),
+                                history=history),
                             func_id)
         else:
-            unique_function_identifier = function.name
-            if self.num_times_callables_called[function.name] > 1:
-                while unique_function_identifier in self.resolved_functions:
-                    unique_function_identifier = (
-                            next_indexed_function_identifier(
-                                unique_function_identifier))
+            # {{{ handle ReductionOpFunction
+
+            if isinstance(function, ReductionOpFunction):
+                unique_function_identifier = function.copy()
+                updated_resolved_functions = self.resolved_functions.copy()
+                updated_resolved_functions[unique_function_identifier] = (
+                        in_kernel_callable)
 
-            num_times_callables_called[function.name] -= 1
-            num_times_callables_called[unique_function_identifier] = 1
+                return (
+                        self.copy(
+                            resolved_functions=updated_resolved_functions),
+                        unique_function_identifier)
+
+            # }}}
+            unique_function_identifier = function.name
+            while unique_function_identifier in self.resolved_functions:
+                unique_function_identifier = (
+                        next_indexed_function_identifier(
+                            unique_function_identifier))
 
         updated_resolved_functions = self.resolved_functions.copy()
         updated_resolved_functions[unique_function_identifier] = (
                 in_kernel_callable)
 
-        if not resolved_for_the_first_time:
-            history[unique_function_identifier] = (
-                    history[function.name] | set([unique_function_identifier]))
-        else:
+        history[unique_function_identifier] = (
+                history[function.name] | set([unique_function_identifier]))
 
         return (
                 self.copy(
                     history=history,
-                    resolved_functions=updated_resolved_functions,
-                    num_times_callables_called=num_times_callables_called,
-                    renames_needed_after_editing=renames_needed_after_editing),
+                    resolved_functions=updated_resolved_functions),
                 Variable(unique_function_identifier))
 
-    def with_exit_edit_callables_mode(self):
+    def with_exit_edit_callables_mode(self, old_callables_count):
+        """
+        Returns a copy of *self* with renaming of the callables done whenever
+        possible.
+
+        *For example:* If all the ``sin`` calls diverged into ``sin_0, sin_1``,
+        then the renaming is done such that one of the flavors of the function
+        is renamed back to ``sin``.
+        """
+
+        new_callables_count = count_callables_in_program_callables_info(
+                self)
+        history = self.history.copy()
+        renames_needed = {}
+
         assert self.is_being_edited
 
-        num_times_callables_called = {}
+        # NOTE:(to self by KK)
+        # all we need to do is change the name of the variables that were seen
+        # in old_callables_count but are no longer available.
+        # Using these 2 figure out the renames needed.
+        for old_func_id in old_callables_count-new_callables_count:
+            # this implies that all the function instances having the name
+            # "func_id" have been renamed to something else.
+            for new_func_id in (
+                    new_callables_count.keys()-renames_needed.keys()):
+                if old_func_id in history[new_func_id]:
+                    renames_needed[new_func_id] = old_func_id
+
         resolved_functions = {}
-        history = self.history.copy()
 
         for func_id, in_knl_callable in self.resolved_functions.items():
             if isinstance(in_knl_callable, CallableKernel):
+                # If callable kernel, perform renames.
                 old_subkernel = in_knl_callable.subkernel
                 new_subkernel = rename_resolved_functions_in_a_single_kernel(
-                        old_subkernel, self.renames_needed_after_editing)
+                        old_subkernel, renames_needed)
                 in_knl_callable = (
                         in_knl_callable.copy(subkernel=new_subkernel))
             elif isinstance(in_knl_callable, ScalarCallable):
@@ -632,44 +782,22 @@ class ProgramCallablesInfo(ImmutableRecord):
                 raise NotImplementedError("Unknown callable type %s." %
                         type(in_knl_callable).__name__)
 
-            if func_id in self.renames_needed_after_editing:
+            if func_id in renames_needed:
+                # If the function name itself is in renames, change the key
+                # of the dict.
                 history.pop(func_id)
 
-                new_func_id = self.renames_needed_after_editing[func_id]
+                new_func_id = renames_needed[func_id]
                 resolved_functions[new_func_id] = (
                         in_knl_callable)
-                num_times_callables_called[new_func_id] = (
-                        self.num_times_callables_called[func_id])
-
             else:
                 resolved_functions[func_id] = in_knl_callable
-                num_times_callables_called[func_id] = (
-                        self.num_times_callables_called[func_id])
 
         return self.copy(
                 is_being_edited=False,
-                resolved_functions=resolved_functions,
-                num_times_callables_called=num_times_callables_called,
-                renames_needed_after_editing={})
-
-    def with_deleted_callable(self, func_id, instances=1):
-        num_times_callables_called = self.num_times_callables_called.copy()
-        history = self.history.copy()
-        resolved_functions = self.resolved_functions.copy()
-
-        assert instances <= num_times_callables_called[func_id]
+                resolved_functions=resolved_functions)
 
-        num_times_callables_called[func_id] -= instances
-
-        if num_times_callables_called[func_id] == 0:
-            num_times_callables_called.pop(func_id)
-            history.pop(func_id)
-            resolved_functions.pop(func_id)
-
-        return self.copy(
-                resolved_functions=resolved_functions,
-                num_times_callables_called=num_times_callables_called,
-                history=history)
+    # {{{ behave like a dict(syntactic sugar)
 
     def __getitem__(self, item):
         return self.resolved_functions[item]
@@ -683,11 +811,16 @@ class ProgramCallablesInfo(ImmutableRecord):
     def values(self):
         return self.resolved_functions.values()
 
+    # }}}
 
 # }}}
 
 
 def default_func_id_to_kernel_callable_mappers(target):
+    """
+    Returns a list of functions that are provided through *target* by default.
+    """
+    # FIXME: name scopers is confusing!(change it to something else.)
 
     from loopy.library.function import loopy_specific_callable_scopers
     return (
@@ -695,11 +828,18 @@ def default_func_id_to_kernel_callable_mappers(target):
                 target.get_device_ast_builder().function_scopers()))
 
 
+# {{{ helper functions
+
 def make_program_from_kernel(kernel):
+    """
+    Returns an instance of :class:`loopy.Program` with the *kernel* as the root
+    kernel.
+    """
 
-    program_callables_info = initialize_program_callables_info_from_kernel(kernel,
-            default_func_id_to_kernel_callable_mappers(kernel.target))
+    # get the program callables info
+    program_callables_info = initialize_program_callables_info_from_kernel(kernel)
 
+    # get the program from program callables info
     program = Program(
             name=kernel.name,
             program_callables_info=program_callables_info,
@@ -711,6 +851,12 @@ def make_program_from_kernel(kernel):
 
 
 def iterate_over_kernels_if_given_program(transform_for_single_kernel):
+    """
+    Function wrapper for transformations of the type ``transform(kernel:
+    LoopKernel, *args, **kwargs): LoopKernel``. Returns a function with the
+    ``transform`` being implemented on all of the callable kernels in a
+    :class:`loopy.Program`.
+    """
     def _collective_transform(program_or_kernel, *args, **kwargs):
         if isinstance(program_or_kernel, Program):
             program = program_or_kernel
@@ -740,5 +886,7 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel):
 
     return wraps(transform_for_single_kernel)(_collective_transform)
 
+# }}}
+
 
 # vim: foldmethod=marker
diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py
index 9d9935ab..90f53095 100644
--- a/loopy/transform/callable.py
+++ b/loopy/transform/callable.py
@@ -35,10 +35,18 @@ __doc__ = """
 
 # {{{ register function lookup
 
-def resolved_callables_from_function_lookup(program,
-        func_id_to_kernel_callable_mapper):
+def _resolved_callables_from_function_lookup(program,
+        func_id_to_in_kernel_callable_mapper):
+    """
+    Returns a copy of *program* with the expression nodes marked "Resolved"
+    if any match is found through the given
+    *func_id_to_in_kernel_callable_mapper*.
+
+    :arg func_id_to_in_kernel_callable_mapper: A function with signature
+        ``(target, identifier)`` that returns either an instance of
+        :class:`loopy.InKernelCallable` or *None*.
+    """
     program_callables_info = program.program_callables_info
-    program_callables_info = program_callables_info.with_edit_callables_mode()
 
     callable_knls = dict(
             (func_id, in_knl_callable) for func_id, in_knl_callable in
@@ -55,9 +63,8 @@ def resolved_callables_from_function_lookup(program,
 
         resolved_function_marker = ResolvedFunctionMarker(
                 rule_mapping_context, kernel, program_callables_info,
-                [func_id_to_kernel_callable_mapper])
+                [func_id_to_in_kernel_callable_mapper])
 
-        # scoping fucntions and collecting the scoped functions
         new_subkernel = rule_mapping_context.finish_kernel(
                 resolved_function_marker.map_kernel(kernel))
         program_callables_info = resolved_function_marker.program_callables_info
@@ -65,9 +72,6 @@ def resolved_callables_from_function_lookup(program,
         edited_callable_knls[func_id] = in_knl_callable.copy(
                 subkernel=new_subkernel)
 
-    program_callables_info = (
-            program_callables_info.with_exit_edit_callables_mode())
-
     new_resolved_functions = {}
 
     for func_id, in_knl_callable in program_callables_info.items():
@@ -85,7 +89,7 @@ def resolved_callables_from_function_lookup(program,
 def register_function_id_to_in_knl_callable_mapper(program,
         func_id_to_in_knl_callable_mapper):
     """
-    Returns a copy of *kernel* with the *function_lookup* registered.
+    Returns a copy of *program* with the *function_lookup* registered.
 
     :arg func_id_to_in_knl_callable_mapper: A function of signature ``(target,
         identifier)`` returning a
@@ -105,7 +109,7 @@ def register_function_id_to_in_knl_callable_mapper(program,
         new_func_id_mappers = program.func_id_to_in_knl_callable_mappers + (
                 [func_id_to_in_knl_callable_mapper])
 
-    program = resolved_callables_from_function_lookup(program,
+    program = _resolved_callables_from_function_lookup(program,
             func_id_to_in_knl_callable_mapper)
 
     new_program = program.copy(
diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py
index d43ce025..f2e62368 100644
--- a/loopy/transform/fusion.py
+++ b/loopy/transform/fusion.py
@@ -292,50 +292,6 @@ def _fuse_two_kernels(knla, knlb):
 
 
 def fuse_loop_kernels(kernels, suffixes=None, data_flow=None):
-    """Return a kernel that performs all the operations in all entries
-    of *kernels*.
-
-    :arg kernels: A list of :class:`loopy.LoopKernel` instances to be fused.
-    :arg suffixes: If given, must be a list of strings of a length matching
-        that of *kernels*. This will be used to disambiguate the names
-        of temporaries, as described below.
-    :arg data_flow: A list of data dependencies
-        ``[(var_name, from_kernel, to_kernel), ...]``.
-        Based on this, the fuser will create dependencies between all
-        writers of *var_name* in ``kernels[from_kernel]`` to
-        readers of *var_name* in ``kernels[to_kernel]``.
-        *from_kernel* and *to_kernel* are indices into *kernels*.
-
-    The components of the kernels are fused as follows:
-
-    *   The resulting kernel will have a domain involving all the inames
-        and parameters occurring across *kernels*.
-        Inames with matching names across *kernels* are fused in such a way
-        that they remain a single iname in the fused kernel.
-        Use :func:`loopy.rename_iname` if this is not desired.
-
-    *   The projection of the domains of each pair of kernels onto their
-        common subset of inames must match in order for fusion to
-        succeed.
-
-    *   Assumptions are fused by taking their conjunction.
-
-    *   If kernel arguments with matching names are encountered across
-        *kernels*, their declarations must match in order for fusion to
-        succeed.
-
-    *   Temporaries are automatically renamed to remain uniquely associated
-        with each instruction stream.
-
-    *   The resulting kernel will contain all instructions from each entry
-        of *kernels*. Clashing instruction IDs will be renamed to ensure
-        uniqueness.
-
-    .. versionchanged:: 2016.2
-
-        *data_flow* was added in version 2016.2
-    """
-
     assert all(isinstance(knl, LoopKernel) for knl in kernels)
     kernels = list(kernels)
 
@@ -419,8 +375,54 @@ def fuse_loop_kernels(kernels, suffixes=None, data_flow=None):
 
 
 def fuse_kernels(programs, suffixes=None, data_flow=None):
+    """Return a kernel that performs all the operations in all entries
+    of *kernels*.
+
+    :arg kernels: A list of :class:`loopy.LoopKernel` instances to be fused.
+    :arg suffixes: If given, must be a list of strings of a length matching
+        that of *kernels*. This will be used to disambiguate the names
+        of temporaries, as described below.
+    :arg data_flow: A list of data dependencies
+        ``[(var_name, from_kernel, to_kernel), ...]``.
+        Based on this, the fuser will create dependencies between all
+        writers of *var_name* in ``kernels[from_kernel]`` to
+        readers of *var_name* in ``kernels[to_kernel]``.
+        *from_kernel* and *to_kernel* are indices into *kernels*.
+
+    The components of the kernels are fused as follows:
+
+    *   The resulting kernel will have a domain involving all the inames
+        and parameters occurring across *kernels*.
+        Inames with matching names across *kernels* are fused in such a way
+        that they remain a single iname in the fused kernel.
+        Use :func:`loopy.rename_iname` if this is not desired.
+
+    *   The projection of the domains of each pair of kernels onto their
+        common subset of inames must match in order for fusion to
+        succeed.
+
+    *   Assumptions are fused by taking their conjunction.
+
+    *   If kernel arguments with matching names are encountered across
+        *kernels*, their declarations must match in order for fusion to
+        succeed.
+
+    *   Temporaries are automatically renamed to remain uniquely associated
+        with each instruction stream.
+
+    *   The resulting kernel will contain all instructions from each entry
+        of *kernels*. Clashing instruction IDs will be renamed to ensure
+        uniqueness.
+
+    .. versionchanged:: 2016.2
+
+        *data_flow* was added in version 2016.2
+    """
+
+    # all the resolved functions in programs must be registered in
+    # main_program_callables_info
     main_prog_callables_info = (
-            programs[0].program_callables_info.with_edit_callables_mode())
+            programs[0].program_callables_info)
     old_root_kernel_callable = (
             programs[0].program_callables_info[programs[0].name])
     kernels = [programs[0].root_kernel]
@@ -431,17 +433,22 @@ def fuse_kernels(programs, suffixes=None, data_flow=None):
         renames_needed = {}
         for old_func_id, in_knl_callable in prog.program_callables_info.items():
             if isinstance(in_knl_callable, CallableKernel):
+                # Fusing programs with multiple callable kernels is tough.
+                # Reason: Need to first figure out the order in which the
+                # callable kernels must be resolved into
+                # main_program_callables_info, because renaming needs
+                # to be done in the callable kernels before registering.
+                # Hence disabling it until required.
                 if in_knl_callable.name != prog.name:
                     raise LoopyError("fuse_kernels cannot fuse programs with "
                             "multiple callable kernels.")
+
+                # The root kernel is dealt with at the end, after performing
+                # all the renaming.
                 continue
-            num_times_called = (
-                    prog.program_callables_info.num_times_callables_called[
-                        old_func_id])
-            for i in range(num_times_called):
-                main_prog_callables_info, new_func_id = (
-                        main_prog_callables_info.with_callables(var(old_func_id),
-                            in_knl_callable, True))
+            main_prog_callables_info, new_func_id = (
+                    main_prog_callables_info.with_add_callable(var(old_func_id),
+                        in_knl_callable))
 
             if old_func_id != new_func_id:
                 renames_needed[old_func_id] = new_func_id
@@ -456,12 +463,10 @@ def fuse_kernels(programs, suffixes=None, data_flow=None):
     new_root_kernel_callable = old_root_kernel_callable.copy(
             subkernel=new_root_kernel.copy(name=programs[0].name))
 
-    main_prog_callables_info, _ = main_prog_callables_info.with_callable(
+    # TODO: change the name of the final root kernel.
+    main_prog_callables_info, _ = main_prog_callables_info.with_add_callable(
             var(programs[0].name), new_root_kernel_callable)
 
-    main_prog_callables_info = (
-            main_prog_callables_info.with_exit_edit_callables_mode())
-
     return programs[0].copy(
             program_callables_info=main_prog_callables_info)
 
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 3ae9a142..ab37519e 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -52,7 +52,7 @@ def _debug(kernel, s, *args):
 def get_return_types_as_tuple(arg_id_to_dtype):
     """Returns the types of arguments in  a tuple format.
 
-    :param arg_id_to_dtype: An instance of :class:`dict` which denotes a
+    :arg arg_id_to_dtype: An instance of :class:`dict` which denotes a
                             mapping from the arguments to their inferred types.
     """
     return_arg_id_to_dtype = dict((id, dtype) for id, dtype in
@@ -894,6 +894,9 @@ def infer_unknown_types(program, expect_completion=False):
             program_callables_info[program.name])
     type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel
 
+    from loopy.program import count_callables_in_program_callables_info
+    old_callables_count = count_callables_in_program_callables_info(
+            program_callables_info)
     program_callables_info = (
             program.program_callables_info.with_edit_callables_mode())
     root_kernel, program_callables_info = (
@@ -910,10 +913,9 @@ def infer_unknown_types(program, expect_completion=False):
                 type_inferred_knl_callable))
 
     program_callables_info = (
-            program_callables_info.with_exit_edit_callables_mode())
+            program_callables_info.with_exit_edit_callables_mode(
+                old_callables_count))
 
-    # FIXME: maybe put all of this in a function?
-    # need to infer functions that were left out during inference
     return program.copy(program_callables_info=program_callables_info)
 
 # }}}
-- 
GitLab


From 42229e028ba32c132fde98deee8edec002354131 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 14 Aug 2018 11:23:35 +0530
Subject: [PATCH 14/80] much better design for program callables info.

---
 loopy/program.py | 19 +++++++++++++++----
 1 file changed, 15 insertions(+), 4 deletions(-)

diff --git a/loopy/program.py b/loopy/program.py
index 32869d26..e3a527ee 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -526,6 +526,8 @@ def count_callables_in_program_callables_info(program_callables_info):
     Returns an instance of :class:`collection.Counter` representing the number
     of times the callables is called in program_callables_info.
     """
+    # should raise an error if there are more than  one root kernels(which is
+    # illegal)
     root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable
             in program_callables_info.values() if
             isinstance(in_knl_callable, CallableKernel) and
@@ -636,6 +638,9 @@ class ProgramCallablesInfo(ImmutableRecord):
         history[unique_function_identifier] = set(
                 [unique_function_identifier])
 
+        if unique_function_identifier == 'loopy_kernel_0':
+            1/0
+
         return (
                 self.copy(
                     history=history,
@@ -719,10 +724,16 @@ class ProgramCallablesInfo(ImmutableRecord):
 
             # }}}
             unique_function_identifier = function.name
-            while unique_function_identifier in self.resolved_functions:
-                unique_function_identifier = (
-                        next_indexed_function_identifier(
-                            unique_function_identifier))
+
+            if isinstance(in_kernel_callable, CallableKernel) and (
+                    in_kernel_callable.subkernel.is_called_from_host):
+                # special treatment if the callable is the root kernel
+                pass
+            else:
+                while unique_function_identifier in self.resolved_functions:
+                    unique_function_identifier = (
+                            next_indexed_function_identifier(
+                                unique_function_identifier))
 
         updated_resolved_functions = self.resolved_functions.copy()
         updated_resolved_functions[unique_function_identifier] = (
-- 
GitLab


From fa0fb70b114f3727a3683488e2cc55c900081873 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 14 Aug 2018 12:22:50 +0530
Subject: [PATCH 15/80] deal with reduction callables.

---
 loopy/program.py | 29 +++++++++++++++++++++++------
 1 file changed, 23 insertions(+), 6 deletions(-)

diff --git a/loopy/program.py b/loopy/program.py
index e3a527ee..7010e110 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -135,8 +135,9 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper):
             in_knl_callable = self.find_in_knl_callable_from_identifier(func_id)
             assert in_knl_callable is not None
             self.program_callables_info, _ = (
-                    self.program_callables_info.with_callable(func_id,
-                        in_knl_callable, True))
+                    self.program_callables_info.with_add_callable(func_id,
+                        in_knl_callable))
+            # FIXME: where do you deal with the parameters? ~KK
         return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state)
 
 
@@ -486,6 +487,10 @@ class CallablesCountingMapper(CombineMapper):
 
     map_call_with_kwargs = map_call
 
+    def map_reduction(self, expr):
+        return Counter(expr.operation.get_scalar_callables()) + (
+                super(CallablesCountingMapper, self).map_reduction(expr))
+
     def map_constant(self, expr):
         return Counter()
 
@@ -592,10 +597,21 @@ class ProgramCallablesInfo(ImmutableRecord):
         Returns a copy of *self* with the *function* associated with the
         *in_kernel_callable*.
         """
+        # FIXME: please write better docs. ~KK
         # note: this does not require the edit mode to be true.
         # the reason for the edit mode is that we need to take care of the
         # renaming that might be needed to be done
         # PS: delete this note?
+
+        # {{{ sanity checks
+
+        if isinstance(function, str):
+            function = Variable(function)
+
+        assert isinstance(function, (Variable, ReductionOpFunction))
+
+        # }}}
+
         history = self.history.copy()
 
         if in_kernel_callable in self.resolved_functions.values():
@@ -617,9 +633,12 @@ class ProgramCallablesInfo(ImmutableRecord):
                 updated_resolved_functions = self.resolved_functions.copy()
                 updated_resolved_functions[unique_function_identifier] = (
                         in_kernel_callable)
+                history[unique_function_identifier] = set(
+                        [unique_function_identifier])
 
                 return (
                         self.copy(
+                            history=history,
                             resolved_functions=updated_resolved_functions),
                         unique_function_identifier)
 
@@ -638,9 +657,6 @@ class ProgramCallablesInfo(ImmutableRecord):
         history[unique_function_identifier] = set(
                 [unique_function_identifier])
 
-        if unique_function_identifier == 'loopy_kernel_0':
-            1/0
-
         return (
                 self.copy(
                     history=history,
@@ -779,7 +795,8 @@ class ProgramCallablesInfo(ImmutableRecord):
 
         resolved_functions = {}
 
-        for func_id, in_knl_callable in self.resolved_functions.items():
+        for func_id in new_callables_count:
+            in_knl_callable = self.resolved_functions[func_id]
             if isinstance(in_knl_callable, CallableKernel):
                 # If callable kernel, perform renames.
                 old_subkernel = in_knl_callable.subkernel
-- 
GitLab


From a161a4854c2b800884fc12269062f60cafe8b95e Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 14 Aug 2018 12:26:34 +0530
Subject: [PATCH 16/80] removes wrong invocation of with_callable for
 ManglerCallable.

---
 loopy/type_inference.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index ab37519e..8b5a656c 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -408,8 +408,8 @@ class TypeInferenceMapper(CombineMapper):
                         identifier, function_mangler, arg_id_to_dtype,
                         arg_id_to_descr, mangle_result.target_name)
                 self.program_callables_info, new_function_id = (
-                        self.program_callables_info.with_callable(
-                            expr.function, in_knl_callable, True))
+                        self.program_callables_info.with_add_callable(
+                            expr.function, in_knl_callable))
 
                 if isinstance(expr, Call):
                     self.old_calls_to_new_calls[expr] = new_function_id
-- 
GitLab


From 76336791d7b6cb6919ec97b02a32f4e74740c7db Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 14 Aug 2018 12:50:27 +0530
Subject: [PATCH 17/80] count callables in expression after expanding for
 substitutions.

---
 loopy/kernel/__init__.py | 4 ++--
 loopy/program.py         | 8 +++++---
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 3b189da5..89aef660 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1367,8 +1367,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
     # {{{ direct execution
 
     def __call__(self, *args, **kwargs):
-        raise LoopyError("Calling a LoopKernel is deprecated, call a Program "
-                "instead.")
+        warn("Calling a LoopKernel is deprecated, call a Program "
+                "instead.", DeprecationWarning, stacklevel=2)
         from loopy.program import make_program_from_kernel
         program = make_program_from_kernel(self)
         return program(*args, **kwargs)
diff --git a/loopy/program.py b/loopy/program.py
index 7010e110..12fe756d 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -29,8 +29,8 @@ from pytools import ImmutableRecord, memoize_method
 from pymbolic.primitives import Variable
 from functools import wraps
 
-from loopy.symbolic import (
-        RuleAwareIdentityMapper, ResolvedFunction, CombineMapper)
+from loopy.symbolic import (RuleAwareIdentityMapper, ResolvedFunction,
+        CombineMapper, SubstitutionRuleExpander)
 from loopy.kernel.function_interface import (
         CallableKernel, ScalarCallable)
 from loopy.kernel.instruction import (
@@ -511,11 +511,13 @@ def count_callables_in_kernel(kernel, program_callables_info):
     callables_count = Counter()
     callables_counting_mapper = CallablesCountingMapper(
             program_callables_info)
+    subst_expander = SubstitutionRuleExpander(kernel.substitutions)
 
     for insn in kernel.instructions:
         if isinstance(insn, MultiAssignmentBase):
             callables_count += (
-                    callables_counting_mapper(insn.expression))
+                    callables_counting_mapper(subst_expander(
+                        insn.expression)))
         elif isinstance(insn, (_DataObliviousInstruction, CInstruction)):
             pass
         else:
-- 
GitLab


From ab8bebf0a06bc3661396d0b49176ae47c7ee40f1 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 14 Aug 2018 13:16:30 +0530
Subject: [PATCH 18/80] pass statistics

---
 loopy/preprocess.py     |  4 +---
 loopy/program.py        | 49 ++++++++++++++++++++++-------------------
 loopy/statistics.py     | 28 ++++++++++-------------
 loopy/type_inference.py |  4 +---
 4 files changed, 40 insertions(+), 45 deletions(-)

diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 56db777b..472c74db 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -2269,9 +2269,7 @@ def traverse_to_infer_arg_descr(kernel, program_callables_info):
 
 def infer_arg_descr(program):
     root_kernel_callable = program.program_callables_info[program.name]
-    from loopy.program import count_callables_in_program_callables_info
-    old_callables_count = count_callables_in_program_callables_info(
-            program.program_callables_info)
+    old_callables_count = program.program_callables_info.callables_count()
     program_callables_info = (
             program.program_callables_info.with_edit_callables_mode())
     root_kernel = program.root_kernel
diff --git a/loopy/program.py b/loopy/program.py
index 12fe756d..a0477bdf 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -526,27 +526,6 @@ def count_callables_in_kernel(kernel, program_callables_info):
 
     return callables_count
 
-
-# FIXME: @memoize_method
-def count_callables_in_program_callables_info(program_callables_info):
-    """
-    Returns an instance of :class:`collection.Counter` representing the number
-    of times the callables is called in program_callables_info.
-    """
-    # should raise an error if there are more than  one root kernels(which is
-    # illegal)
-    root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable
-            in program_callables_info.values() if
-            isinstance(in_knl_callable, CallableKernel) and
-            in_knl_callable.subkernel.is_called_from_host]
-
-    from collections import Counter
-    callables_count = Counter([root_kernel_name])
-    callables_count += (
-            count_callables_in_kernel(program_callables_info[
-                root_kernel_name].subkernel, program_callables_info))
-    return callables_count
-
 # }}}
 
 
@@ -594,6 +573,29 @@ class ProgramCallablesInfo(ImmutableRecord):
 
     update_persistent_hash = LoopKernel.update_persistent_hash
 
+    # FIXME: @memoize_method
+    def callables_count(self):
+        """
+        Returns an instance of :class:`collections.Counter` representing the number
+        of times each callable is called in program_callables_info.
+        """
+        # should raise an error if there is more than one root kernel (which is
+        # illegal)
+        root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable
+                in self.values() if
+                isinstance(in_knl_callable, CallableKernel) and
+                in_knl_callable.subkernel.is_called_from_host]
+
+        from collections import Counter
+        callables_count = Counter([root_kernel_name])
+        callables_count += (
+                count_callables_in_kernel(self[
+                    root_kernel_name].subkernel, self))
+
+        return callables_count
+
+    # {{{ interface to perform edits on callables
+
     def with_add_callable(self, function, in_kernel_callable):
         """
         Returns a copy of *self* with the *function* associated with the
@@ -776,8 +778,7 @@ class ProgramCallablesInfo(ImmutableRecord):
         is renamed back to ``sin``.
         """
 
-        new_callables_count = count_callables_in_program_callables_info(
-                self)
+        new_callables_count = self.callables_count()
         history = self.history.copy()
         renames_needed = {}
 
@@ -827,6 +828,8 @@ class ProgramCallablesInfo(ImmutableRecord):
                 is_being_edited=False,
                 resolved_functions=resolved_functions)
 
+    # }}}
+
     # {{{ behave like a dict(syntactic sugar)
 
     def __getitem__(self, item):
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 95e9f62a..3799967b 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1396,17 +1396,17 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False,
 
     op_map = ToCountMap()
 
+    callables_count = (
+                program.program_callables_info.callables_count())
+
     for func_id, in_knl_callable in program.program_callables_info.items():
         if isinstance(in_knl_callable, CallableKernel):
-            num_times_called = (
-                    program.program_callables_info.num_times_callables_called[
-                        func_id])
             knl = in_knl_callable.subkernel
             knl_op_map = get_op_map_for_single_kernel(knl,
                         program.program_callables_info, numpy_types,
                         count_redundant_work, subgroup_size)
 
-            for i in range(num_times_called):
+            for i in range(callables_count[func_id]):
                 op_map += knl_op_map
         elif isinstance(in_knl_callable, ScalarCallable):
             pass
@@ -1684,18 +1684,17 @@ def get_mem_access_map(program, numpy_types=True, count_redundant_work=False,
 
     access_map = ToCountMap()
 
+    callables_count = program.program_callables_info.callables_count()
+
     for func_id, in_knl_callable in program.program_callables_info.items():
         if isinstance(in_knl_callable, CallableKernel):
-            num_times_called = (
-                    program.program_callables_info.num_times_callables_called[
-                        func_id])
             knl = in_knl_callable.subkernel
             knl_access_map = get_access_map_for_single_kernel(knl,
                         program.program_callables_info, numpy_types,
                         count_redundant_work, subgroup_size)
 
             # FIXME: didn't see any easy way to multiply
-            for i in range(num_times_called):
+            for i in range(callables_count[func_id]):
                 access_map += knl_access_map
         elif isinstance(in_knl_callable, ScalarCallable):
             pass
@@ -1809,18 +1808,16 @@ def get_synchronization_map(program, subgroup_size=None):
     program = preprocess_program(program)
 
     sync_map = ToCountMap()
+    callables_count = program.program_callables_info.callables_count()
 
     for func_id, in_knl_callable in program.program_callables_info.items():
         if isinstance(in_knl_callable, CallableKernel):
-            num_times_called = (
-                    program.program_callables_info.num_times_callables_called[
-                        func_id])
             knl = in_knl_callable.subkernel
             knl_sync_map = get_synchronization_map_for_single_kernel(knl,
                     program.program_callables_info, subgroup_size)
 
             # FIXME: didn't see any easy way to multiply
-            for i in range(num_times_called):
+            for i in range(callables_count[func_id]):
                 sync_map += knl_sync_map
         elif isinstance(in_knl_callable, ScalarCallable):
             pass
@@ -1887,18 +1884,17 @@ def gather_access_footprints(program, ignore_uncountable=False):
     write_footprints = []
     read_footprints = []
 
+    callables_count = program.program_callables_info.callables_count()
+
     for func_id, in_knl_callable in program.program_callables_info.items():
         if isinstance(in_knl_callable, CallableKernel):
-            num_times_called = (
-                    program.program_callables_info.num_times_callables_called[
-                        func_id])
             knl = in_knl_callable.subkernel
             knl_write_footprints, knl_read_footprints = (
                     gather_access_footprints_for_single_kernel(knl,
                         ignore_uncountable))
 
             # FIXME: didn't see any easy way to multiply
-            for i in range(num_times_called):
+            for i in range(callables_count[func_id]):
                 write_footprints.extend(knl_write_footprints)
                 read_footprints.extend(knl_read_footprints)
 
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 8b5a656c..76d4a579 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -894,9 +894,7 @@ def infer_unknown_types(program, expect_completion=False):
             program_callables_info[program.name])
     type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel
 
-    from loopy.program import count_callables_in_program_callables_info
-    old_callables_count = count_callables_in_program_callables_info(
-            program_callables_info)
+    old_callables_count = program_callables_info.callables_count()
     program_callables_info = (
             program.program_callables_info.with_edit_callables_mode())
     root_kernel, program_callables_info = (
-- 
GitLab


From 44b247dc760d6f2eeb9e06b0cf375ce24262b68b Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 14 Aug 2018 14:28:48 +0530
Subject: [PATCH 19/80] don't rename if given a root kernel.

---
 loopy/program.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/loopy/program.py b/loopy/program.py
index a0477bdf..efc66b5a 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -649,15 +649,25 @@ class ProgramCallablesInfo(ImmutableRecord):
             # }}}
 
             unique_function_identifier = function.name
-            while unique_function_identifier in self.resolved_functions:
-                unique_function_identifier = (
-                        next_indexed_function_identifier(
-                            unique_function_identifier))
+
+            if isinstance(in_kernel_callable, CallableKernel) and (
+                    in_kernel_callable.subkernel.is_called_from_host):
+                # special treatment if the callable is the root kernel
+                pass
+            else:
+                while unique_function_identifier in self.resolved_functions:
+                    unique_function_identifier = (
+                            next_indexed_function_identifier(
+                                unique_function_identifier))
 
         updated_resolved_functions = self.resolved_functions.copy()
         updated_resolved_functions[unique_function_identifier] = (
                 in_kernel_callable)
 
+        if 'strongVolumeKernelR_0' in updated_resolved_functions:
+            import pudb
+            pudb.set_trace()
+
         history[unique_function_identifier] = set(
                 [unique_function_identifier])
 
@@ -759,6 +769,10 @@ class ProgramCallablesInfo(ImmutableRecord):
         updated_resolved_functions[unique_function_identifier] = (
                 in_kernel_callable)
 
+        if 'strongVolumeKernelR_0' in updated_resolved_functions:
+            import pudb
+            pudb.set_trace()
+
         history[unique_function_identifier] = (
                 history[function.name] | set([unique_function_identifier]))
 
-- 
GitLab


From 01e42c10b6e3b362d2dc325c7e1d177e0b7377a0 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 14 Aug 2018 20:31:08 +0530
Subject: [PATCH 20/80] perform only one rename!

---
 loopy/program.py        | 1 +
 loopy/type_inference.py | 5 -----
 2 files changed, 1 insertion(+), 5 deletions(-)

diff --git a/loopy/program.py b/loopy/program.py
index efc66b5a..911667df 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -809,6 +809,7 @@ class ProgramCallablesInfo(ImmutableRecord):
                     new_callables_count.keys()-renames_needed.keys()):
                 if old_func_id in history[new_func_id]:
                     renames_needed[new_func_id] = old_func_id
+                    break
 
         resolved_functions = {}
 
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 76d4a579..52150dcd 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -882,11 +882,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
 
 def infer_unknown_types(program, expect_completion=False):
     """Infer types on temporaries and arguments."""
-    from loopy.kernel import LoopKernel
-    if isinstance(program, LoopKernel):
-        # FIXME: deprecate warning needed here
-        from loopy.program import make_program_from_kernel
-        program = make_program_from_kernel(program)
 
     program_callables_info = program.program_callables_info
 
-- 
GitLab


From 50dc2fe4b266a968360fb03749705478372342d6 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 14 Aug 2018 20:38:25 +0530
Subject: [PATCH 21/80] replace keys() by six.viewkeys() for py2.7.

---
 loopy/program.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/program.py b/loopy/program.py
index 911667df..3872a83e 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -806,7 +806,7 @@ class ProgramCallablesInfo(ImmutableRecord):
             # this implies that all the function instances having the name
             # "func_id" have been renamed to something else.
             for new_func_id in (
-                    new_callables_count.keys()-renames_needed.keys()):
+                    six.viewkeys(new_callables_count)-six.viewkeys(renames_needed)):
                 if old_func_id in history[new_func_id]:
                     renames_needed[new_func_id] = old_func_id
                     break
-- 
GitLab


From 7ab71c675f472e2daa94f02a53c9fa61e8b5e2ff Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 14 Aug 2018 21:34:23 +0530
Subject: [PATCH 22/80] make ProgramCallablesInfo hashable.

---
 loopy/kernel/__init__.py |  2 ++
 loopy/program.py         | 23 +++++++++++++++--------
 2 files changed, 17 insertions(+), 8 deletions(-)

diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 89aef660..8b2cf3dd 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1035,6 +1035,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 self.get_iname_bounds(iname, constants_only=True).size,
                 constants_only=True)))
 
+    @memoize_method
     def get_grid_sizes_for_insn_ids(self, insn_ids, program_callables_info,
             ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
@@ -1132,6 +1133,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         return (to_dim_tuple(global_sizes, "global"),
                 to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes))
 
+    @memoize_method
     def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids,
             program_callables_info, ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
diff --git a/loopy/program.py b/loopy/program.py
index 3872a83e..d19cd4e8 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -500,7 +500,7 @@ class CallablesCountingMapper(CombineMapper):
     map_type_cast = map_constant
 
 
-# FIXME: @memoize_method
+@memoize_method
 def count_callables_in_kernel(kernel, program_callables_info):
     """
     Returns an instance of :class:`collections.Counter` representing the number
@@ -558,7 +558,7 @@ class ProgramCallablesInfo(ImmutableRecord):
             history=None, is_being_edited=False):
 
         if history is None:
-            history = dict((func_id, set([func_id])) for func_id in
+            history = dict((func_id, frozenset([func_id])) for func_id in
                     resolved_functions)
 
         super(ProgramCallablesInfo, self).__init__(
@@ -571,9 +571,16 @@ class ProgramCallablesInfo(ImmutableRecord):
             "is_being_edited",
             "history")
 
+    def __hash__(self):
+        return hash((
+            frozenset(six.iteritems(self.resolved_functions)),
+            frozenset(six.iteritems(self.history)),
+            self.is_being_edited
+            ))
+
     update_persistent_hash = LoopKernel.update_persistent_hash
 
-    # FIXME: @memoize_method
+    @memoize_method
     def callables_count(self):
         """
         Returns an instance of :class:`collection.Counter` representing the number
@@ -623,7 +630,7 @@ class ProgramCallablesInfo(ImmutableRecord):
             # identifier corresposing to that callable.
             for func_id, in_knl_callable in self.resolved_functions.items():
                 if in_knl_callable == in_kernel_callable:
-                    history[func_id] = history[func_id] | set([function.name])
+                    history[func_id] = history[func_id] | frozenset([function.name])
                     return (
                             self.copy(
                                 history=history),
@@ -637,7 +644,7 @@ class ProgramCallablesInfo(ImmutableRecord):
                 updated_resolved_functions = self.resolved_functions.copy()
                 updated_resolved_functions[unique_function_identifier] = (
                         in_kernel_callable)
-                history[unique_function_identifier] = set(
+                history[unique_function_identifier] = frozenset(
                         [unique_function_identifier])
 
                 return (
@@ -668,7 +675,7 @@ class ProgramCallablesInfo(ImmutableRecord):
             import pudb
             pudb.set_trace()
 
-        history[unique_function_identifier] = set(
+        history[unique_function_identifier] = frozenset(
                 [unique_function_identifier])
 
         return (
@@ -733,7 +740,7 @@ class ProgramCallablesInfo(ImmutableRecord):
             # identifier corresponding to that callable.
             for func_id, in_knl_callable in self.resolved_functions.items():
                 if in_knl_callable == in_kernel_callable:
-                    history[func_id] = history[func_id] | set([function.name])
+                    history[func_id] = history[func_id] | frozenset([function.name])
                     return (
                             self.copy(
                                 history=history),
@@ -774,7 +781,7 @@ class ProgramCallablesInfo(ImmutableRecord):
             pudb.set_trace()
 
         history[unique_function_identifier] = (
-                history[function.name] | set([unique_function_identifier]))
+                history[function.name] | frozenset([unique_function_identifier]))
 
         return (
                 self.copy(
-- 
GitLab


From 8d4af7a2a89e7cff3db9c2a351733abfeb0161ef Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 14 Aug 2018 22:24:31 +0530
Subject: [PATCH 23/80] update persistent dict key builder to handle frozenset.

---
 loopy/library/reduction.py | 1 -
 loopy/tools.py             | 5 +++++
 2 files changed, 5 insertions(+), 1 deletion(-)

diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index b968192e..b3deba65 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -229,7 +229,6 @@ class ReductionOpFunction(FunctionIdentifier):
 
     update_persistent_hash = LoopKernel.update_persistent_hash
 
-
 # }}}
 
 
diff --git a/loopy/tools.py b/loopy/tools.py
index b243a794..5eabe6c3 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -79,6 +79,11 @@ class LoopyKeyBuilder(KeyBuilderBase):
 
     update_for_defaultdict = update_for_dict
 
+    def update_for_frozenset(self, key_hash, key):
+        for set_key in sorted(key,
+                key=lambda obj: type(obj).__name__ + str(obj)):
+            self.rec(key_hash, set_key)
+
     def update_for_BasicSet(self, key_hash, key):  # noqa
         from islpy import Printer
         prn = Printer.to_str(key.get_ctx())
-- 
GitLab


From f8307a0ed463312a6eb162f7b8ab054babad97f3 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 15 Aug 2018 08:32:16 +0530
Subject: [PATCH 24/80] minor cleanup/comments.

---
 loopy/preprocess.py | 91 +++++++++++++++++++++++++++------------------
 loopy/program.py    |  7 +++-
 2 files changed, 59 insertions(+), 39 deletions(-)

diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 472c74db..e9e55cc4 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -2149,10 +2149,7 @@ def check_atomic_loads(kernel):
 
 class ArgDescrInferenceMapper(RuleAwareIdentityMapper):
     """
-    Returns a set of instances of :class:`tuple` (expr,
-    in_kernel_callable). The mapped `in_kernel_callable` of the
-    :class:`InKernelCallable` are descriptor specialized for the given
-    arguments.
+    Infers the :attr:`loopy`
     """
 
     def __init__(self, rule_mapping_context, caller_kernel,
@@ -2250,9 +2247,11 @@ def traverse_to_infer_arg_descr(kernel, program_callables_info):
     Returns a copy of *kernel* with the argument shapes and strides matching for
     scoped functions in the *kernel*. Refer
     :meth:`loopy.kernel.function_interface.InKernelCallable.with_descrs`.
-    """
-    # FIXME: update this docs, once the design is finalized
 
+    .. note::
+
+        Initiates a walk starting from *kernel* to all its callee kernels.
+    """
     from loopy.symbolic import SubstitutionRuleMappingContext
 
     rule_mapping_context = SubstitutionRuleMappingContext(
@@ -2268,6 +2267,11 @@ def traverse_to_infer_arg_descr(kernel, program_callables_info):
 
 
 def infer_arg_descr(program):
+    """
+    Returns a copy of *program* with the
+    :attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the
+    callables.
+    """
     root_kernel_callable = program.program_callables_info[program.name]
     old_callables_count = program.program_callables_info.callables_count()
     program_callables_info = (
@@ -2397,28 +2401,60 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None):
     return kernel
 
 
-def preprocess_kernel(kernel, device=None):
-    # FIXME: error message?
-    return preprocess_program(kernel, device)
+# {{{ hw axes inference
+
+def infer_hw_axes_sizes(program):
+    """
+    Returns copy of *program* with the hardware axes sizes inferred.
+
+    .. note::
+
+        - Firstly, computes the collective hardware axes sizes from all the
+          callable kernels.
+        - Then, overrides the grid sizes of all the callable kernels to the
+          collective value.
+    """
+
+    local_size, global_size = program.get_grid_size_upper_bounds()
+
+    resolved_function_with_hw_axes_sizes_inferred = {}
+
+    for func_id, in_knl_callable in (
+            program.program_callables_info.items()):
+        if func_id == program.name:
+            resolved_function_with_hw_axes_sizes_inferred[func_id] = (
+                    in_knl_callable)
+        else:
+            resolved_function_with_hw_axes_sizes_inferred[func_id] = (
+                    in_knl_callable.with_hw_axes_sizes(local_size, global_size))
+
+    new_program_callables_info = (
+            program.program_callables_info.copy(
+                resolved_functions=resolved_function_with_hw_axes_sizes_inferred))
+
+    program = program.copy(program_callables_info=new_program_callables_info)
+
+# }}}
 
 
 def preprocess_program(program, device=None):
 
     if device is not None:
+        # FIXME: Time to remove this? (Git blame shows 5 years ago)
         from warnings import warn
         warn("passing 'device' to preprocess_kernel() is deprecated",
                 DeprecationWarning, stacklevel=2)
 
     program = infer_unknown_types(program, expect_completion=False)
 
-    # {{{ preprocess the root kernel
+    # {{{ preprocess callable kernels
 
     # Callable editing restrictions:
     #
-    # - cannot edit program_callables_info in :meth:`preprocess_single_kernel`
-    #   as we are iterating over it.
+    # - should not edit program_callables_info in :meth:`preprocess_single_kernel`
+    #   as we are iterating over it.[1]
     #
-    # Refer: https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects
+    # [1] https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects
 
     new_resolved_functions = {}
     for func_id, in_knl_callable in program.program_callables_info.items():
@@ -2431,7 +2467,7 @@ def preprocess_program(program, device=None):
         elif isinstance(in_knl_callable, ScalarCallable):
             pass
         else:
-            raise NotImplementedError("Unknown type of callable %s." % (
+            raise NotImplementedError("Unknown callable type %s." % (
                 type(in_knl_callable).__name__))
 
         new_resolved_functions[func_id] = in_knl_callable
@@ -2445,32 +2481,13 @@ def preprocess_program(program, device=None):
     # infer arg descrs of the callables
     program = infer_arg_descr(program)
 
-    # {{{ hw axes inference
-
-    # FIXME: think of wrapping this in a function?
+    program = infer_hw_axes_sizes(program)
 
-    local_size, global_size = program.get_grid_size_upper_bounds()
-
-    resolved_function_with_hw_axes_sizes_set = {}
-
-    for func_id, in_knl_callable in (
-            program.program_callables_info.items()):
-        if func_id == program.name:
-            resolved_function_with_hw_axes_sizes_set[func_id] = (
-                    in_knl_callable)
-        else:
-            resolved_function_with_hw_axes_sizes_set[func_id] = (
-                    in_knl_callable.with_hw_axes_sizes(local_size, global_size))
-
-    new_program_callables_info = (
-            program.program_callables_info.copy(
-                resolved_functions=resolved_function_with_hw_axes_sizes_set))
+    return program
 
-    program = program.copy(program_callables_info=new_program_callables_info)
 
-    # }}}
-
-    return program
+# FIXME: Do we add a deprecation warning?
+preprocess_kernel = preprocess_program
 
 
 # vim: foldmethod=marker
diff --git a/loopy/program.py b/loopy/program.py
index d19cd4e8..eec8157c 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -861,10 +861,13 @@ class ProgramCallablesInfo(ImmutableRecord):
         return item in self.resolved_functions
 
     def items(self):
-        return self.resolved_functions.items()
+        return six.iteritems(self.resolved_functions)
 
     def values(self):
-        return self.resolved_functions.values()
+        return six.itervalues(self.resolved_functions)
+
+    def keys(self):
+        return six.iterkeys(self.resolved_functions)
 
     # }}}
 
-- 
GitLab


From caec9506a1b42bddb2ce57e009c207aaad4d7dc9 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 15 Aug 2018 08:46:50 +0530
Subject: [PATCH 25/80] with_add_callable -> with_added_callable

---
 loopy/program.py          | 10 +++++-----
 loopy/transform/fusion.py |  4 ++--
 loopy/type_inference.py   |  2 +-
 3 files changed, 8 insertions(+), 8 deletions(-)

diff --git a/loopy/program.py b/loopy/program.py
index eec8157c..90eb64e9 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -114,7 +114,7 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper):
                 # resolved in-kernel callable
 
                 self.program_callables_info, new_func_id = (
-                        self.program_callables_info.with_add_callable(expr.function,
+                        self.program_callables_info.with_added_callable(expr.function,
                             in_knl_callable))
                 return type(expr)(
                         ResolvedFunction(new_func_id),
@@ -135,7 +135,7 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper):
             in_knl_callable = self.find_in_knl_callable_from_identifier(func_id)
             assert in_knl_callable is not None
             self.program_callables_info, _ = (
-                    self.program_callables_info.with_add_callable(func_id,
+                    self.program_callables_info.with_added_callable(func_id,
                         in_knl_callable))
             # FIXME: where do you deal with the parameters? ~KK
         return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state)
@@ -168,7 +168,7 @@ def initialize_program_callables_info_from_kernel(kernel):
     callable_kernel = CallableKernel(kernel_with_functions_resolved)
 
     # add the callable kernel to the program_callables_info
-    program_callables_info, _ = program_callables_info.with_add_callable(
+    program_callables_info, _ = program_callables_info.with_added_callable(
             Variable(kernel.name), callable_kernel)
 
     return program_callables_info
@@ -603,7 +603,7 @@ class ProgramCallablesInfo(ImmutableRecord):
 
     # {{{ interface to perfrom edits on callables
 
-    def with_add_callable(self, function, in_kernel_callable):
+    def with_added_callable(self, function, in_kernel_callable):
         """
         Returns a copy of *self* with the *function* associated with the
         *in_kernel_callable*.
@@ -704,7 +704,7 @@ class ProgramCallablesInfo(ImmutableRecord):
 
         .. note::
 
-            - Use :meth:`with_add_callable` if a callable is being resolved for the
+            - Use :meth:`with_added_callable` if a callable is being resolved for the
                 first time.
         """
 
diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py
index f2e62368..b0d67764 100644
--- a/loopy/transform/fusion.py
+++ b/loopy/transform/fusion.py
@@ -447,7 +447,7 @@ def fuse_kernels(programs, suffixes=None, data_flow=None):
                 # renaming.
                 continue
             main_prog_callables_info, new_func_id = (
-                    main_prog_callables_info.with_add_callable(var(old_func_id),
+                    main_prog_callables_info.with_added_callable(var(old_func_id),
                         in_knl_callable))
 
             if old_func_id != new_func_id:
@@ -464,7 +464,7 @@ def fuse_kernels(programs, suffixes=None, data_flow=None):
             subkernel=new_root_kernel.copy(name=programs[0].name))
 
     # TODO: change the name of the final root kernel.
-    main_prog_callables_info, _ = main_prog_callables_info.with_add_callable(
+    main_prog_callables_info, _ = main_prog_callables_info.with_added_callable(
             var(programs[0].name), new_root_kernel_callable)
 
     return programs[0].copy(
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 52150dcd..04392d8d 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -408,7 +408,7 @@ class TypeInferenceMapper(CombineMapper):
                         identifier, function_mangler, arg_id_to_dtype,
                         arg_id_to_descr, mangle_result.target_name)
                 self.program_callables_info, new_function_id = (
-                        self.program_callables_info.with_add_callable(
+                        self.program_callables_info.with_added_callable(
                             expr.function, in_knl_callable))
 
                 if isinstance(expr, Call):
-- 
GitLab


From f041d166645c5d7f72413f45200b475a4b2bc150 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 15 Aug 2018 09:47:06 +0530
Subject: [PATCH 26/80] Minimalized CallableKernel for MR271

---
 loopy/kernel/function_interface.py | 169 +----------------------------
 loopy/preprocess.py                |   2 +-
 loopy/type_inference.py            | 138 ++++++++++++++++++++++-
 3 files changed, 138 insertions(+), 171 deletions(-)

diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
index 8c3a6911..5efc44ad 100644
--- a/loopy/kernel/function_interface.py
+++ b/loopy/kernel/function_interface.py
@@ -23,19 +23,11 @@ THE SOFTWARE.
 """
 
 
-import re
-import six
-
 from six.moves import zip
 
 from pytools import ImmutableRecord
 from loopy.diagnostic import LoopyError
 
-from loopy.symbolic import parse_tagged_name
-
-from loopy.symbolic import (ResolvedFunction, SubstitutionRuleMappingContext,
-        RuleAwareIdentityMapper, SubstitutionRuleExpander)
-
 from loopy.kernel import LoopKernel
 
 
@@ -145,7 +137,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord):
 
     .. note::
 
-        This class acts as a pseduo-callable and its significance lies in
+        This class acts as a pseudo-callable and its significance lies in
         solving picklability issues.
     """
     fields = set(["local_size", "global_size"])
@@ -228,8 +220,6 @@ class InKernelCallable(ImmutableRecord):
             Any argument information exists both by its positional and
             its keyword identifier.
         """
-        # FIXME: In all these with_** functions add that also passes a
-        # program_callables_info
 
         raise NotImplementedError()
 
@@ -333,12 +323,12 @@ class InKernelCallable(ImmutableRecord):
 
 class ScalarCallable(InKernelCallable):
     """
-    An abstranct interface the to a scalar callable encountered in a kernel.
+    An abstract interface the to a scalar callable encountered in a kernel.
 
     .. note::
 
         The :meth:`ScalarCallable.with_types` is intended to assist with type
-        specialization of the funciton and is expected to be supplemented in the
+        specialization of the function and is expected to be supplemented in the
         derived subclasses.
     """
 
@@ -520,68 +510,12 @@ class CallableKernel(InKernelCallable):
         return (self.subkernel, self.arg_id_to_dtype,
                 self.arg_id_to_descr)
 
-    @property
-    def name(self):
-        return self.subkernel.name
-
-    def is_ready_for_codegen(self):
-        return (self.arg_id_to_dtype is not None and
-                self.arg_id_to_descr is not None)
-
     def generate_preambles(self, target):
         """ Yields the *target* specific preambles.
         """
-        # FIXME Check that this is correct.
-
         return
         yield
 
-    def emit_call_insn(self, insn, target, expression_to_code_mapper):
-
-        assert self.is_ready_for_codegen()
-
-        from loopy.kernel.instruction import CallInstruction
-        from pymbolic.primitives import CallWithKwargs
-
-        assert isinstance(insn, CallInstruction)
-
-        parameters = insn.expression.parameters
-        kw_parameters = {}
-        if isinstance(insn.expression, CallWithKwargs):
-            kw_parameters = insn.expression.kw_parameters
-
-        assignees = insn.assignees
-
-        parameters = list(parameters)
-        par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)]
-        kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)
-        for i in range(len(parameters), len(parameters)+len(kw_parameters)):
-            parameters.append(kw_parameters[pos_to_kw[i]])
-            par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]])
-
-        # insert the assigness at the required positions
-        assignee_write_count = -1
-        for i, arg in enumerate(self.subkernel.args):
-            if arg.is_output_only:
-                assignee = assignees[-assignee_write_count-1]
-                parameters.insert(i, assignee)
-                par_dtypes.insert(i, self.arg_id_to_dtype[assignee_write_count])
-                assignee_write_count -= 1
-
-        # no type casting in array calls
-        from loopy.expression import dtype_to_type_context
-        from pymbolic.mapper.stringifier import PREC_NONE
-        from pymbolic import var
-
-        c_parameters = [
-                expression_to_code_mapper(par, PREC_NONE,
-                    dtype_to_type_context(target, par_dtype),
-                    par_dtype).expr
-                for par, par_dtype in zip(
-                    parameters, par_dtypes)]
-
-        return var(self.subkernel.name)(*c_parameters), False
-
 # }}}
 
 
@@ -589,7 +523,7 @@ class CallableKernel(InKernelCallable):
 
 class ManglerCallable(ScalarCallable):
     """
-    A callable whose characateristic is defined by a function mangler.
+    A callable whose characteristic is defined by a function mangler.
 
     .. attribute:: function_mangler
 
@@ -662,99 +596,4 @@ class ManglerCallable(ScalarCallable):
 
 # }}}
 
-
-# {{{ new pymbolic calls to scoped functions
-
-def next_indexed_variable(function):
-    """
-    Returns an instance of :class:`str` with the next indexed-name in the
-    sequence for the name of *function*.
-
-    *Example:* ``Variable('sin_0')`` will return ``'sin_1'``.
-
-    :arg function: Either an instance of :class:`pymbolic.primitives.Variable`
-        or :class:`loopy.reduction.ArgExtOp` or
-        :class:`loopy.reduction.SegmentedOp`.
-    """
-    from loopy.library.reduction import ReductionOpFunction
-    if isinstance(function, ReductionOpFunction):
-        return function.copy()
-    func_name = re.compile(r"^(?P<alpha>\S+?)_(?P<num>\d+?)$")
-
-    match = func_name.match(function.name)
-
-    if match is None:
-        if function.name[-1] == '_':
-            return "{old_name}0".format(old_name=function.name)
-        else:
-            return "{old_name}_0".format(old_name=function.name)
-
-    return "{alpha}_{num}".format(alpha=match.group('alpha'),
-            num=int(match.group('num'))+1)
-
-
-class FunctionNameChanger(RuleAwareIdentityMapper):
-    """
-    Changes the names of scoped functions in calls of expressions according to
-    the mapping ``calls_to_new_functions``
-    """
-
-    def __init__(self, rule_mapping_context, calls_to_new_names,
-            subst_expander):
-        super(FunctionNameChanger, self).__init__(rule_mapping_context)
-        self.calls_to_new_names = calls_to_new_names
-        self.subst_expander = subst_expander
-
-    def map_call(self, expr, expn_state):
-        name, tag = parse_tagged_name(expr.function)
-
-        if name not in self.rule_mapping_context.old_subst_rules:
-            expanded_expr = self.subst_expander(expr)
-            if expr in self.calls_to_new_names:
-                return type(expr)(
-                        ResolvedFunction(self.calls_to_new_names[expr]),
-                        tuple(self.rec(child, expn_state)
-                            for child in expr.parameters))
-            elif expanded_expr in self.calls_to_new_names:
-                # FIXME: this is horribly wrong logic.
-                # investigate how to make edits to a substitution rule
-                return type(expr)(
-                        ResolvedFunction(self.calls_to_new_names[expanded_expr]),
-                        tuple(self.rec(child, expn_state)
-                            for child in expanded_expr.parameters))
-            else:
-                return super(FunctionNameChanger, self).map_call(
-                        expr, expn_state)
-        else:
-            return self.map_substitution(name, tag, expr.parameters, expn_state)
-
-    def map_call_with_kwargs(self, expr, expn_state):
-
-        if expr in self.calls_to_new_names:
-            return type(expr)(
-                ResolvedFunction(self.calls_to_new_names[expr]),
-                tuple(self.rec(child, expn_state)
-                    for child in expr.parameters),
-                dict(
-                    (key, self.rec(val, expn_state))
-                    for key, val in six.iteritems(expr.kw_parameters))
-                    )
-        else:
-            return super(FunctionNameChanger, self).map_call_with_kwargs(
-                    expr, expn_state)
-
-
-def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names):
-    rule_mapping_context = SubstitutionRuleMappingContext(
-                    kernel.substitutions, kernel.get_var_name_generator())
-    subst_expander = SubstitutionRuleExpander(kernel.substitutions)
-    name_changer = FunctionNameChanger(rule_mapping_context,
-            pymbolic_calls_to_new_names, subst_expander)
-
-    return rule_mapping_context.finish_kernel(
-            name_changer.map_kernel(kernel))
-
-# }}}
-
-
 # vim: foldmethod=marker
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index e9e55cc4..41674ed9 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -2432,7 +2432,7 @@ def infer_hw_axes_sizes(program):
             program.program_callables_info.copy(
                 resolved_functions=resolved_function_with_hw_axes_sizes_inferred))
 
-    program = program.copy(program_callables_info=new_program_callables_info)
+    return program.copy(program_callables_info=new_program_callables_info)
 
 # }}}
 
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 04392d8d..e5c17886 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -36,7 +36,10 @@ from loopy.diagnostic import (
 from loopy.kernel.instruction import _DataObliviousInstruction
 
 from loopy.program import ProgramCallablesInfo
-from loopy.symbolic import LinearSubscript
+from loopy.symbolic import (
+        LinearSubscript, parse_tagged_name, RuleAwareIdentityMapper,
+        SubstitutionRuleExpander, ResolvedFunction,
+        SubstitutionRuleMappingContext)
 from pymbolic.primitives import Variable, Subscript, Lookup
 
 import logging
@@ -62,6 +65,135 @@ def get_return_types_as_tuple(arg_id_to_dtype):
     return tuple(return_arg_id_to_dtype[id] for id in return_arg_pos)
 
 
+# {{{ renaming helpers
+
+class FunctionNameChanger(RuleAwareIdentityMapper):
+    """
+    Changes the names of scoped functions in calls of expressions according to
+    the mapping ``calls_to_new_names``
+    """
+
+    def __init__(self, rule_mapping_context, calls_to_new_names,
+            subst_expander):
+        super(FunctionNameChanger, self).__init__(rule_mapping_context)
+        self.calls_to_new_names = calls_to_new_names
+        self.subst_expander = subst_expander
+
+    def map_call(self, expr, expn_state):
+        name, tag = parse_tagged_name(expr.function)
+
+        if name not in self.rule_mapping_context.old_subst_rules:
+            expanded_expr = self.subst_expander(expr)
+            if expr in self.calls_to_new_names:
+                return type(expr)(
+                        ResolvedFunction(self.calls_to_new_names[expr]),
+                        tuple(self.rec(child, expn_state)
+                            for child in expr.parameters))
+            elif expanded_expr in self.calls_to_new_names:
+                # FIXME: This is killing the substitution.
+                # Maybe using a RuleAwareIdentityMapper for TypeInferenceMapper
+                # would help.
+                return type(expr)(
+                        ResolvedFunction(self.calls_to_new_names[expanded_expr]),
+                        tuple(self.rec(child, expn_state)
+                            for child in expanded_expr.parameters))
+            else:
+                return super(FunctionNameChanger, self).map_call(
+                        expr, expn_state)
+        else:
+            return self.map_substitution(name, tag, expr.parameters, expn_state)
+
+    def map_call_with_kwargs(self, expr, expn_state):
+
+        if expr in self.calls_to_new_names:
+            return type(expr)(
+                ResolvedFunction(self.calls_to_new_names[expr]),
+                tuple(self.rec(child, expn_state)
+                    for child in expr.parameters),
+                dict(
+                    (key, self.rec(val, expn_state))
+                    for key, val in six.iteritems(expr.kw_parameters))
+                    )
+        else:
+            return super(FunctionNameChanger, self).map_call_with_kwargs(
+                    expr, expn_state)
+
+
+def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names):
+    """
+    Returns a copy of *kernel* with the names of pymbolic calls changed
+    according to the mapping given by *pymbolic_calls_to_new_names*.
+
+    :arg pymbolic_calls_to_new_names: A mapping from instances of
+        :class:`pymbolic.primitives.Call` to :class:`str`.
+
+    **Example:**
+
+        - Given a *kernel* --
+
+        .. code::
+
+            -------------------------------------------------------------
+            KERNEL: loopy_kernel
+            -------------------------------------------------------------
+            ARGUMENTS:
+            x: type: <auto/runtime>, shape: (10), dim_tags: (N0:stride:1)
+            y: type: <auto/runtime>, shape: (10), dim_tags: (N0:stride:1)
+            -------------------------------------------------------------
+            DOMAINS:
+            { [i] : 0 <= i <= 9 }
+            -------------------------------------------------------------
+            INAME IMPLEMENTATION TAGS:
+            i: None
+            -------------------------------------------------------------
+            INSTRUCTIONS:
+            for i
+                y[i] = ResolvedFunction('sin')(x[i])
+            end i
+            -------------------------------------------------------------
+
+        - And given a *pymbolic_calls_to_new_names* --
+
+        .. code::
+
+            {Call(ResolvedFunction(Variable('sin')), (Subscript(Variable('x'),
+            Variable('i')),)): 'sin_1'}
+
+        - The following *kernel* is returned --
+
+        .. code::
+
+            -------------------------------------------------------------
+            KERNEL: loopy_kernel
+            -------------------------------------------------------------
+            ARGUMENTS:
+            x: type: <auto/runtime>, shape: (10), dim_tags: (N0:stride:1)
+            y: type: <auto/runtime>, shape: (10), dim_tags: (N0:stride:1)
+            -------------------------------------------------------------
+            DOMAINS:
+            { [i] : 0 <= i <= 9 }
+            -------------------------------------------------------------
+            INAME IMPLEMENTATION TAGS:
+            i: None
+            -------------------------------------------------------------
+            INSTRUCTIONS:
+            for i
+                y[i] = ResolvedFunction('sin_1')(x[i])
+            end i
+            -------------------------------------------------------------
+    """
+    rule_mapping_context = SubstitutionRuleMappingContext(
+                    kernel.substitutions, kernel.get_var_name_generator())
+    subst_expander = SubstitutionRuleExpander(kernel.substitutions)
+    name_changer = FunctionNameChanger(rule_mapping_context,
+            pymbolic_calls_to_new_names, subst_expander)
+
+    return rule_mapping_context.finish_kernel(
+            name_changer.map_kernel(kernel))
+
+# }}}
+
+
 # {{{ type inference mapper
 
 class TypeInferenceMapper(CombineMapper):
@@ -276,7 +408,6 @@ class TypeInferenceMapper(CombineMapper):
     def map_call(self, expr, return_tuple=False):
 
         from pymbolic.primitives import Variable, CallWithKwargs, Call
-        from loopy.symbolic import ResolvedFunction
 
         if isinstance(expr, CallWithKwargs):
             kw_parameters = expr.kw_parameters
@@ -862,9 +993,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
             args=[new_arg_dict[arg.name] for arg in kernel.args],
             )
 
-    # this has to be subsitutition
-    from loopy.kernel.function_interface import (
-            change_names_of_pymbolic_calls)
     type_specialized_kernel = change_names_of_pymbolic_calls(
             pre_type_specialized_knl, old_calls_to_new_calls)
 
-- 
GitLab


From 4f8ec6989ef1e515fa956214702f7ef11b300305 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 15 Aug 2018 10:42:01 +0530
Subject: [PATCH 27/80] added autofunction/class/methods

---
 loopy/kernel/function_interface.py |  13 +++
 loopy/program.py                   | 143 +++++++++++++++++------------
 2 files changed, 96 insertions(+), 60 deletions(-)

diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
index 5efc44ad..e4e8c1d5 100644
--- a/loopy/kernel/function_interface.py
+++ b/loopy/kernel/function_interface.py
@@ -30,6 +30,19 @@ from loopy.diagnostic import LoopyError
 
 from loopy.kernel import LoopKernel
 
+__doc__ = """
+
+.. currentmodule:: loopy
+
+.. autoclass:: ValueArgDescriptor
+.. autoclass:: ArrayArgDescriptor
+.. autoclass:: InKernelCallable
+.. autoclass:: CallableKernel
+.. autoclass:: ScalarCallable
+.. autoclass:: ManglerCallable
+
+"""
+
 
 # {{{ argument descriptors
 
diff --git a/loopy/program.py b/loopy/program.py
index 90eb64e9..e5d033e0 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -42,7 +42,17 @@ from loopy.kernel import LoopKernel
 from collections import Counter
 from pymbolic.primitives import Call, CallWithKwargs
 
-# FIXME: autofunction/autoclass?? ~KK
+__doc__ = """
+
+.. currentmodule:: loopy
+
+.. autoclass:: Program
+.. autoclass:: ProgramCallablesInfo
+
+.. autofunction:: make_program_from_kernel
+.. autofunction:: iterate_over_kernels_if_given_program
+
+"""
 
 
 class ResolvedFunctionMarker(RuleAwareIdentityMapper):
@@ -114,8 +124,8 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper):
                 # resolved in-kernel callable
 
                 self.program_callables_info, new_func_id = (
-                        self.program_callables_info.with_added_callable(expr.function,
-                            in_knl_callable))
+                        self.program_callables_info.with_added_callable(
+                            expr.function, in_knl_callable))
                 return type(expr)(
                         ResolvedFunction(new_func_id),
                         tuple(self.rec(child, expn_state)
@@ -137,10 +147,21 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper):
             self.program_callables_info, _ = (
                     self.program_callables_info.with_added_callable(func_id,
                         in_knl_callable))
-            # FIXME: where do you deal with the parameters? ~KK
         return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state)
 
 
+def _default_func_id_to_kernel_callable_mappers(target):
+    """
+    Returns a list of functions that are provided through *target* by default.
+    """
+    # FIXME: the name -- scopers is no longer used!(change it) ~KK
+
+    from loopy.library.function import loopy_specific_callable_scopers
+    return (
+            [loopy_specific_callable_scopers] + (
+                target.get_device_ast_builder().function_scopers()))
+
+
 def initialize_program_callables_info_from_kernel(kernel):
     """
     Returns an instance of :class:`loopy.ProgramCallablesInfo`, by resolving
@@ -148,7 +169,7 @@ def initialize_program_callables_info_from_kernel(kernel):
     """
     # collect the default function resolvers
     func_id_to_kernel_callable_mappers = (
-            default_func_id_to_kernel_callable_mappers(kernel.target))
+            _default_func_id_to_kernel_callable_mappers(kernel.target))
     program_callables_info = ProgramCallablesInfo({})
 
     from loopy.symbolic import SubstitutionRuleMappingContext
@@ -553,6 +574,9 @@ class ProgramCallablesInfo(ImmutableRecord):
         An instance of :class:`bool` which is intended to aid the working of
         :meth:`with_enter_edit_callables_mode`, :meth:`with_callable` and
         :meth:`with_exit_edit_callables_mode`.
+
+    .. automethod:: __init__
+    .. automethod:: callables_count
     """
     def __init__(self, resolved_functions,
             history=None, is_being_edited=False):
@@ -580,6 +604,7 @@ class ProgramCallablesInfo(ImmutableRecord):
 
     update_persistent_hash = LoopKernel.update_persistent_hash
 
+    @property
     @memoize_method
     def callables_count(self):
         """
@@ -601,18 +626,36 @@ class ProgramCallablesInfo(ImmutableRecord):
 
         return callables_count
 
-    # {{{ interface to perfrom edits on callables
+    # {{{ interface to perform edits on callables
 
     def with_added_callable(self, function, in_kernel_callable):
         """
         Returns a copy of *self* with the *function* associated with the
         *in_kernel_callable*.
+
+        .. note::
+
+            - Always checks whether the
+              :attr:`loopy.ProgramCallablesInfo.resolved_functions` has
+              *in_kernel_callable*, does not introduce copies.
+
+            - The difference between
+              :meth:`loopy.ProgramCallablesInfo.with_added_callable`
+              and :meth:`ProgramCallablesInfo.with_callable` is that
+              the former has no support for renaming the callable back i.e.
+              ``with_callable`` supports renaming from ``sin_0`` to ``sin``,
+              if possible, through the member method
+              ``loopy.ProgramCallablesInfo.with_exit_edit_callables_mode``
+
+              This subtle difference makes --
+
+              - :meth:`loopy.ProgramCallablesInfo.with_added_callable` suitable
+                for usage while resolving the functions first time, where no
+                renaming is needed.
+
+              - :meth:`loopy.ProgramCallablesInfo.with_callable` suitable for
+                implementing edits in callables during inference-walks.
         """
-        # FIXME: pleasse better docs.. ~KK
-        # note: this does not require the edit mode to be true.
-        # the reason for the edit mode is that we need to take care of the
-        # renaming that might be needed to be done
-        # PS: delete this note?
 
         # {{{ sanity checks
 
@@ -627,7 +670,7 @@ class ProgramCallablesInfo(ImmutableRecord):
 
         if in_kernel_callable in self.resolved_functions.values():
             # the callable already exists, implies return the function
-            # identifier corresposing to that callable.
+            # identifier corresponding to that callable.
             for func_id, in_knl_callable in self.resolved_functions.items():
                 if in_knl_callable == in_kernel_callable:
                     history[func_id] = history[func_id] | frozenset([function.name])
@@ -659,7 +702,7 @@ class ProgramCallablesInfo(ImmutableRecord):
 
             if isinstance(in_kernel_callable, CallableKernel) and (
                     in_kernel_callable.subkernel.is_called_from_host):
-                # special treatment if the callable is the root kernel
+                # do not rename root kernel
                 pass
             else:
                 while unique_function_identifier in self.resolved_functions:
@@ -671,10 +714,6 @@ class ProgramCallablesInfo(ImmutableRecord):
         updated_resolved_functions[unique_function_identifier] = (
                 in_kernel_callable)
 
-        if 'strongVolumeKernelR_0' in updated_resolved_functions:
-            import pudb
-            pudb.set_trace()
-
         history[unique_function_identifier] = frozenset(
                 [unique_function_identifier])
 
@@ -688,24 +727,26 @@ class ProgramCallablesInfo(ImmutableRecord):
         """
         Initiates *self* for a walk traversal through all the callables.
         """
-        # PS: I don't see a need for this method right now.
-        # This is just for validation purposes, maybe needs to disapper if you
-        # find a better solution?
         return self.copy(
                 is_being_edited=True)
 
     def with_callable(self, function, in_kernel_callable):
         """
+        Returns a copy of *self* with the *function* associated with the
+        *in_kernel_callable*. Also refer --
+        :meth:`loopy.ProgramCallablesInfo.with_added_callable`
+
+
         :arg function: An instance of :class:`pymbolic.primitives.Variable` or
             :class:`loopy.library.reduction.ReductionOpFunction`.
 
-        :arg in_kernel_callables: An instance of
+        :arg in_kernel_callable: An instance of
             :class:`loopy.InKernelCallable`.
 
         .. note::
 
             - Use :meth:`with_added_callable` if a callable is being resolved for the
-                first time.
+              first time.
         """
 
         # {{{ non-edit mode
@@ -714,7 +755,7 @@ class ProgramCallablesInfo(ImmutableRecord):
             if function.name in self.resolved_functions and (
                     self.resolved_functions[function.name] == in_kernel_callable):
                 # if not being edited, check that the given function is
-                # equal to the the old version of the callable.
+                # equal to the old version of the callable.
                 return self, function
             else:
                 print('Old: ', self.resolved_functions[function.name])
@@ -764,7 +805,7 @@ class ProgramCallablesInfo(ImmutableRecord):
 
             if isinstance(in_kernel_callable, CallableKernel) and (
                     in_kernel_callable.subkernel.is_called_from_host):
-                # special treatment if the callable is the root kernel
+                # do not rename root kernel
                 pass
             else:
                 while unique_function_identifier in self.resolved_functions:
@@ -776,10 +817,6 @@ class ProgramCallablesInfo(ImmutableRecord):
         updated_resolved_functions[unique_function_identifier] = (
                 in_kernel_callable)
 
-        if 'strongVolumeKernelR_0' in updated_resolved_functions:
-            import pudb
-            pudb.set_trace()
-
         history[unique_function_identifier] = (
                 history[function.name] | frozenset([unique_function_identifier]))
 
@@ -791,39 +828,38 @@ class ProgramCallablesInfo(ImmutableRecord):
 
     def with_exit_edit_callables_mode(self, old_callables_count):
         """
-        Returns a copy of *self* with renaming of the callables done whenver
+        Returns a copy of *self* with renaming of the callables done whenever
         possible.
 
         *For example: * If all the ``sin`` got diverged as ``sin_0, sin_1``,
-        then all the renaming is done such that one of flavors of the function
+        then all the renaming is done such that one of the flavors of the callable
         is renamed back to ``sin``.
         """
 
+        assert self.is_being_edited
+
         new_callables_count = self.callables_count()
-        history = self.history.copy()
-        renames_needed = {}
 
-        assert self.is_being_edited
+        # {{{ calculate the renames needed
 
-        # NOTE:(to self by KK)
-        # all we need to do is change the name of the variables that were seen
-        # in old_callables_count but are no longer available.
-        # Using these 2 figure out the renames needed.
+        renames_needed = {}
         for old_func_id in old_callables_count-new_callables_count:
             # this implies that all the function instances having the name
             # "func_id" have been renamed to something else.
             for new_func_id in (
                     six.viewkeys(new_callables_count)-six.viewkeys(renames_needed)):
-                if old_func_id in history[new_func_id]:
+                if old_func_id in self.history[new_func_id]:
                     renames_needed[new_func_id] = old_func_id
                     break
+        # }}}
 
-        resolved_functions = {}
+        new_resolved_functions = {}
+        new_history = {}
 
         for func_id in new_callables_count:
             in_knl_callable = self.resolved_functions[func_id]
             if isinstance(in_knl_callable, CallableKernel):
-                # If callable kernel, perform renames.
+                # if callable kernel, perform renames inside its expressions.
                 old_subkernel = in_knl_callable.subkernel
                 new_subkernel = rename_resolved_functions_in_a_single_kernel(
                         old_subkernel, renames_needed)
@@ -836,19 +872,18 @@ class ProgramCallablesInfo(ImmutableRecord):
                         type(in_knl_callable).__name__)
 
             if func_id in renames_needed:
-                # If function name itself in renames change the key of the
-                # dict.
-                history.pop(func_id)
-
                 new_func_id = renames_needed[func_id]
-                resolved_functions[new_func_id] = (
+                new_resolved_functions[new_func_id] = (
                         in_knl_callable)
+                new_history[new_func_id] = self.history[func_id]
             else:
-                resolved_functions[func_id] = in_knl_callable
+                new_resolved_functions[func_id] = in_knl_callable
+                new_history[func_id] = self.history[func_id]
 
         return self.copy(
                 is_being_edited=False,
-                resolved_functions=resolved_functions)
+                resolved_functions=new_resolved_functions,
+                history=new_history)
 
     # }}}
 
@@ -874,18 +909,6 @@ class ProgramCallablesInfo(ImmutableRecord):
 # }}}
 
 
-def default_func_id_to_kernel_callable_mappers(target):
-    """
-    Returns a list of functions that are provided through *target* by deafault.
-    """
-    # FIXME: name scopers is confusing!(change it to something else.)
-
-    from loopy.library.function import loopy_specific_callable_scopers
-    return (
-            [loopy_specific_callable_scopers] + (
-                target.get_device_ast_builder().function_scopers()))
-
-
 # {{{ helper functions
 
 def make_program_from_kernel(kernel):
@@ -902,7 +925,7 @@ def make_program_from_kernel(kernel):
             name=kernel.name,
             program_callables_info=program_callables_info,
             func_id_to_in_knl_callable_mappers=(
-                default_func_id_to_kernel_callable_mappers(kernel.target)),
+                _default_func_id_to_kernel_callable_mappers(kernel.target)),
             target=kernel.target)
 
     return program
-- 
GitLab


From a28164f965eedd1611752e9d7540d108c2ae8d76 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 15 Aug 2018 10:43:14 +0530
Subject: [PATCH 28/80] made callables count a property.

---
 loopy/preprocess.py     | 2 +-
 loopy/program.py        | 2 +-
 loopy/statistics.py     | 8 ++++----
 loopy/type_inference.py | 2 +-
 4 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 41674ed9..44653316 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -2273,7 +2273,7 @@ def infer_arg_descr(program):
     callables.
     """
     root_kernel_callable = program.program_callables_info[program.name]
-    old_callables_count = program.program_callables_info.callables_count()
+    old_callables_count = program.program_callables_info.callables_count
     program_callables_info = (
             program.program_callables_info.with_edit_callables_mode())
     root_kernel = program.root_kernel
diff --git a/loopy/program.py b/loopy/program.py
index e5d033e0..bdf40a1b 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -838,7 +838,7 @@ class ProgramCallablesInfo(ImmutableRecord):
 
         assert self.is_being_edited
 
-        new_callables_count = self.callables_count()
+        new_callables_count = self.callables_count
 
         # {{{ calculate the renames needed
 
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 3799967b..71a62986 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1397,7 +1397,7 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False,
     op_map = ToCountMap()
 
     callables_count = (
-                program.program_callables_info.callables_count())
+                program.program_callables_info.callables_count)
 
     for func_id, in_knl_callable in program.program_callables_info.items():
         if isinstance(in_knl_callable, CallableKernel):
@@ -1684,7 +1684,7 @@ def get_mem_access_map(program, numpy_types=True, count_redundant_work=False,
 
     access_map = ToCountMap()
 
-    callables_count = program.program_callables_info.callables_count()
+    callables_count = program.program_callables_info.callables_count
 
     for func_id, in_knl_callable in program.program_callables_info.items():
         if isinstance(in_knl_callable, CallableKernel):
@@ -1808,7 +1808,7 @@ def get_synchronization_map(program, subgroup_size=None):
     program = preprocess_program(program)
 
     sync_map = ToCountMap()
-    callables_count = program.program_callables_info.callables_count()
+    callables_count = program.program_callables_info.callables_count
 
     for func_id, in_knl_callable in program.program_callables_info.items():
         if isinstance(in_knl_callable, CallableKernel):
@@ -1884,7 +1884,7 @@ def gather_access_footprints(program, ignore_uncountable=False):
     write_footprints = []
     read_footprints = []
 
-    callables_count = program.program_callables_info.callables_count()
+    callables_count = program.program_callables_info.callables_count
 
     for func_id, in_knl_callable in program.program_callables_info.items():
         if isinstance(in_knl_callable, CallableKernel):
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index e5c17886..d5df36bf 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -1017,7 +1017,7 @@ def infer_unknown_types(program, expect_completion=False):
             program_callables_info[program.name])
     type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel
 
-    old_callables_count = program_callables_info.callables_count()
+    old_callables_count = program_callables_info.callables_count
     program_callables_info = (
             program.program_callables_info.with_edit_callables_mode())
     root_kernel, program_callables_info = (
-- 
GitLab


From 621ef9f8c05abe5f9ba64adc2ecbeae9cdd92e58 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 15 Aug 2018 10:56:22 +0530
Subject: [PATCH 29/80] docs cleanup for Program

---
 loopy/program.py | 11 +++++++++--
 1 file changed, 9 insertions(+), 2 deletions(-)

diff --git a/loopy/program.py b/loopy/program.py
index bdf40a1b..236bbc44 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -222,10 +222,13 @@ class Program(ImmutableRecord):
 
     .. note::
 
-        - To create an instance of :class:`loopy.Program`, it is recommeneded to
+        - To create an instance of :class:`loopy.Program`, it is recommended to
             go through :method:`loopy.make_kernel`.
         - This data structure and its attributes should be considered
           immutable, any modifications should be done through :method:`copy`.
+
+    .. automethod:: __init__
+    .. automethod:: with_root_kernel
     """
     def __init__(self,
             name,
@@ -329,7 +332,7 @@ class Program(ImmutableRecord):
     def root_kernel(self):
         """
         Returns an instance of :class:`loopy.LoopKernel` denoting the topmost
-        level kernel in codegeneration.
+        level kernel.
 
         .. note::
 
@@ -577,6 +580,10 @@ class ProgramCallablesInfo(ImmutableRecord):
 
     .. automethod:: __init__
     .. automethod:: callables_count
+    .. automethod:: with_added_callable
+    .. automethod:: with_edit_callables_mode
+    .. automethod:: with_callable
+    .. automethod:: with_exit_edit_callables_mode
     """
     def __init__(self, resolved_functions,
             history=None, is_being_edited=False):
-- 
GitLab


From 8e64c24f8d0669faaca742138a1982cda56c52cf Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 15 Aug 2018 12:07:20 +0530
Subject: [PATCH 30/80] small error in docs.

---
 doc/tutorial.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 71b8f438..4c67e3d3 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -334,7 +334,7 @@ that these dependencies show up there, too:
 
 .. doctest::
 
-    >>> print(knl.stringify(with_dependencies=True))
+    >>> print(knl.root_kernel.stringify(with_dependencies=True))
     ---------------------------------------------------------------------------
     KERNEL: loopy_kernel
     ---------------------------------------------------------------------------
-- 
GitLab


From 3293f6ae0b24ce1206487835ac52aeb37a06a174 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 15 Aug 2018 12:16:30 +0530
Subject: [PATCH 31/80] callable kernel no longer has a name.

---
 loopy/transform/fusion.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py
index b0d67764..44e69ecf 100644
--- a/loopy/transform/fusion.py
+++ b/loopy/transform/fusion.py
@@ -439,7 +439,7 @@ def fuse_kernels(programs, suffixes=None, data_flow=None):
                 # main_program_callables_info, because of renaming is
                 # needed to be done in the callable kernels before registering.
                 # Hence disabling it until required.
-                if in_knl_callable.name != prog.name:
+                if in_knl_callable.subkernel.name != prog.name:
                     raise LoopyError("fuse_kernels cannot fuse programs with "
                             "multiple callable kernels.")
 
-- 
GitLab


From 70ada3da326053a6023fa050008284aec9d277eb Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 15 Aug 2018 12:32:00 +0530
Subject: [PATCH 32/80] minor changes in docs

---
 doc/tutorial.rst | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 4c67e3d3..8e20dbc2 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1207,7 +1207,8 @@ This happens due to the kernel splitting done by :mod:`loopy`. The splitting
 happens when the instruction schedule is generated. To see the schedule, we
 should call :func:`loopy.get_one_scheduled_kernel`:
 
-   >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
+   >>> knl = lp.preprocess_kernel(knl)
+   >>> knl = lp.get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info)
    >>> print(knl)
    ---------------------------------------------------------------------------
    KERNEL: rotate_v2
@@ -1237,9 +1238,8 @@ function adds instructions to the kernel without scheduling them. That means
 that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to
 put those instructions into the schedule.
 
-   >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl))
    >>> knl = lp.save_and_reload_temporaries(knl)
-   >>> knl = lp.get_one_scheduled_kernel(knl)  # Schedule added instructions
+   >>> knl = lp.get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info)  # Schedule added instructions
    >>> print(knl)
    ---------------------------------------------------------------------------
    KERNEL: rotate_v2
-- 
GitLab


From 66b9f4275979426e6e6c9ced76f51c4fc84ebc3a Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 15 Aug 2018 12:49:01 +0530
Subject: [PATCH 33/80] Pass docs.

---
 doc/tutorial.rst | 22 ++++++++++++----------
 1 file changed, 12 insertions(+), 10 deletions(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 8e20dbc2..597240cc 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1179,7 +1179,7 @@ Let us start with an example. Consider the kernel from above with a
 
 .. doctest::
 
-   >>> knl = lp.make_kernel(
+   >>> prog = lp.make_kernel(
    ...     "[n] -> {[i] : 0<=i<n}",
    ...     """
    ...     for i
@@ -1194,11 +1194,11 @@ Let us start with an example. Consider the kernel from above with a
    ...      ],
    ...     name="rotate_v2",
    ...     assumptions="n mod 16 = 0")
-   >>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0")
+   >>> prog = lp.split_iname(prog, "i", 16, inner_tag="l.0", outer_tag="g.0")
 
 Here is what happens when we try to generate code for the kernel:
 
-   >>> cgr = lp.generate_code_v2(knl)
+   >>> cgr = lp.generate_code_v2(prog)
    Traceback (most recent call last):
    ...
    loopy.diagnostic.MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?)
@@ -1207,9 +1207,10 @@ This happens due to the kernel splitting done by :mod:`loopy`. The splitting
 happens when the instruction schedule is generated. To see the schedule, we
 should call :func:`loopy.get_one_scheduled_kernel`:
 
-   >>> knl = lp.preprocess_kernel(knl)
-   >>> knl = lp.get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info)
-   >>> print(knl)
+   >>> prog = lp.preprocess_kernel(prog)
+   >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info)
+   >>> prog = prog.with_root_kernel(knl)
+   >>> print(prog)
    ---------------------------------------------------------------------------
    KERNEL: rotate_v2
    ---------------------------------------------------------------------------
@@ -1238,9 +1239,10 @@ function adds instructions to the kernel without scheduling them. That means
 that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to
 put those instructions into the schedule.
 
-   >>> knl = lp.save_and_reload_temporaries(knl)
-   >>> knl = lp.get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info)  # Schedule added instructions
-   >>> print(knl)
+   >>> prog = lp.save_and_reload_temporaries(prog)
+   >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info)  # Schedule added instructions
+   >>> prog = prog.with_root_kernel(knl)
+   >>> print(prog)
    ---------------------------------------------------------------------------
    KERNEL: rotate_v2
    ---------------------------------------------------------------------------
@@ -1279,7 +1281,7 @@ does in more detail:
 
 The kernel translates into two OpenCL kernels.
 
-   >>> cgr = lp.generate_code_v2(knl)
+   >>> cgr = lp.generate_code_v2(prog)
    >>> print(cgr.device_code())
    #define lid(N) ((int) get_local_id(N))
    #define gid(N) ((int) get_group_id(N))
-- 
GitLab


From fba32ca309e7ac03bd521816a08dc98d9695c1df Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 15 Aug 2018 21:11:09 +0530
Subject: [PATCH 34/80] change credits of program.py

---
 loopy/program.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/program.py b/loopy/program.py
index 236bbc44..54d13343 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -1,6 +1,6 @@
 from __future__ import division, absolute_import
 
-__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni"
 
 __license__ = """
 Permission is hereby granted, free of charge, to any person obtaining a copy
-- 
GitLab


From 2636fe29c3e574ff14fb1f66764c5f6b34cc54cd Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 28 Aug 2018 21:30:11 -0500
Subject: [PATCH 35/80] better function naming, no more usage of "scoped"
 terminology.

---
 doc/ref_call.rst           |  2 +-
 loopy/library/function.py  | 16 +++++++++++++---
 loopy/library/reduction.py |  2 +-
 loopy/program.py           |  6 +++---
 loopy/target/__init__.py   |  2 +-
 loopy/target/c/__init__.py |  4 ++--
 loopy/target/cuda.py       |  4 ++--
 loopy/target/opencl.py     |  4 ++--
 loopy/target/pyopencl.py   |  4 ++--
 loopy/target/python.py     |  4 ++--
 10 files changed, 29 insertions(+), 19 deletions(-)

diff --git a/doc/ref_call.rst b/doc/ref_call.rst
index 4ff1ef2f..147363a1 100644
--- a/doc/ref_call.rst
+++ b/doc/ref_call.rst
@@ -180,7 +180,7 @@ Changes on the target side to accommodate the new function interface
 --------------------------------------------------------------------
 
 The earlier "function\_mangler" as a member method of the class
-``lp.ASTBuilderBase`` will be replaced by ``function_scopers``. The
+``lp.ASTBuilderBase`` will be replaced by ``function_id_in_knl_callable_mapper``. The
 function scopers would return a list of functions with the signature
 ``(target, identifier)->lp.InKernelCallable``.
 
diff --git a/loopy/library/function.py b/loopy/library/function.py
index 8338875d..f3fb5f8c 100644
--- a/loopy/library/function.py
+++ b/loopy/library/function.py
@@ -55,15 +55,25 @@ class IndexOfCallable(ScalarCallable):
                 program_callables_info)
 
 
-def loopy_specific_callable_scopers(target, identifier):
+def loopy_specific_callable_func_id_to_knl_callable_mappers(target, identifier):
+    """
+    Returns an instance of :class:`InKernelCallable` for the *identifier*
+    which is not present in *target*, but whose interface is given by
+    :mod:`loo.py`. Callables that fall in this category are --
+
+    - reductions leading to function calls like ``argmin``, ``argmax``.
+    - callables that have a predefined meaning in :mod:`loo.py` like
+      ``make_tuple``, ``index_of``, ``indexof_vec``.
+    """
     if identifier == "make_tuple":
         return MakeTupleCallable(name="make_tuple")
 
     if identifier in ["indexof", "indexof_vec"]:
         return IndexOfCallable(name=identifier)
 
-    from loopy.library.reduction import reduction_scoper
-    return reduction_scoper(target, identifier)
+    from loopy.library.reduction import (
+            reduction_func_id_to_in_knl_callable_mapper)
+    return reduction_func_id_to_in_knl_callable_mapper(target, identifier)
 
 
 # vim: foldmethod=marker
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index b3deba65..70df864d 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -502,7 +502,7 @@ class ReductionCallable(ScalarCallable):
         return
 
 
-def reduction_scoper(target, identifier):
+def reduction_func_id_to_in_knl_callable_mapper(target, identifier):
     if isinstance(identifier, ReductionOpFunction):
         return ReductionCallable(name=identifier)
 
diff --git a/loopy/program.py b/loopy/program.py
index 54d13343..fd4ae63f 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -156,10 +156,10 @@ def _default_func_id_to_kernel_callable_mappers(target):
     """
     # FIXME: the name -- scopers is no longer used!(change it) ~KK
 
-    from loopy.library.function import loopy_specific_callable_scopers
+    from loopy.library.function import loopy_specific_callable_func_id_to_knl_callable_mappers
     return (
-            [loopy_specific_callable_scopers] + (
-                target.get_device_ast_builder().function_scopers()))
+            [loopy_specific_callable_func_id_to_knl_callable_mappers] + (
+                target.get_device_ast_builder().function_id_in_knl_callable_mapper()))
 
 
 def initialize_program_callables_info_from_kernel(kernel):
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index e3b4853c..92ee2dc5 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -150,7 +150,7 @@ class ASTBuilderBase(object):
 
     # {{{ library
 
-    def function_scopers(self):
+    def function_id_in_knl_callable_mapper(self):
         """
         Returns an instance of list of the functions of signature
         ``(target, identifiers)`` returning either an instance of
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 1579bb31..418ce025 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -484,9 +484,9 @@ class CASTBuilder(ASTBuilderBase):
                     _preamble_generator,
                     ])
 
-    def function_scopers(self):
+    def function_id_in_knl_callable_mapper(self):
         return (
-                super(CASTBuilder, self).function_scopers() + [
+                super(CASTBuilder, self).function_id_in_knl_callable_mapper() + [
                     scope_c_math_functions])
 
     # }}}
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 89cbfd03..e6abf73f 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -274,9 +274,9 @@ class CUDACASTBuilder(CASTBuilder):
 
     # {{{ library
 
-    def function_scopers(self):
+    def function_id_in_knl_callable_mapper(self):
         return [scope_cuda_functions] + (
-                super(CUDACASTBuilder, self).function_scopers())
+                super(CUDACASTBuilder, self).function_id_in_knl_callable_mapper())
 
     # }}}
 
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 44bf9c4c..d8c195de 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -442,10 +442,10 @@ class OpenCLTarget(CTarget):
 class OpenCLCASTBuilder(CASTBuilder):
     # {{{ library
 
-    def function_scopers(self):
+    def function_id_in_knl_callable_mapper(self):
         return (
                 [scope_opencl_functions] + super(
-                    OpenCLCASTBuilder, self).function_scopers())
+                    OpenCLCASTBuilder, self).function_id_in_knl_callable_mapper())
 
     def symbol_manglers(self):
         return (
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 03ba2693..0e955648 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -792,11 +792,11 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder):
 
     # {{{ library
 
-    def function_scopers(self):
+    def function_id_in_knl_callable_mapper(self):
         from loopy.library.random123 import random123_function_scoper
         return (
                 [pyopencl_function_scoper, random123_function_scoper] + super(
-                    PyOpenCLCASTBuilder, self).function_scopers())
+                    PyOpenCLCASTBuilder, self).function_id_in_knl_callable_mapper())
 
     def preamble_generators(self):
         return ([
diff --git a/loopy/target/python.py b/loopy/target/python.py
index cd6e6116..0dbecce2 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -180,10 +180,10 @@ class PythonASTBuilderBase(ASTBuilderBase):
 
     # {{{ code generation guts
 
-    def function_scopers(self):
+    def function_id_in_knl_callable_mapper(self):
         from loopy.target.c import scope_c_math_functions
         return (
-                super(PythonASTBuilderBase, self).function_scopers() +
+                super(PythonASTBuilderBase, self).function_id_in_knl_callable_mapper() +
                 [scope_c_math_functions])
 
     def preamble_generators(self):
-- 
GitLab


From d923227ed2d2557e0b3dcdc505546ada4069a142 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 28 Aug 2018 21:34:07 -0500
Subject: [PATCH 36/80] flake8 fixes after `sed`

---
 loopy/program.py       | 6 ++++--
 loopy/target/python.py | 3 ++-
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/loopy/program.py b/loopy/program.py
index fd4ae63f..a18d9076 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -156,10 +156,12 @@ def _default_func_id_to_kernel_callable_mappers(target):
     """
     # FIXME: the name -- scopers is no longer used!(change it) ~KK
 
-    from loopy.library.function import loopy_specific_callable_func_id_to_knl_callable_mappers
+    from loopy.library.function import (
+            loopy_specific_callable_func_id_to_knl_callable_mappers)
     return (
             [loopy_specific_callable_func_id_to_knl_callable_mappers] + (
-                target.get_device_ast_builder().function_id_in_knl_callable_mapper()))
+                target.get_device_ast_builder().function_id_in_knl_callable_mapper(
+                    )))
 
 
 def initialize_program_callables_info_from_kernel(kernel):
diff --git a/loopy/target/python.py b/loopy/target/python.py
index 0dbecce2..2e6712ec 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -183,7 +183,8 @@ class PythonASTBuilderBase(ASTBuilderBase):
     def function_id_in_knl_callable_mapper(self):
         from loopy.target.c import scope_c_math_functions
         return (
-                super(PythonASTBuilderBase, self).function_id_in_knl_callable_mapper() +
+                super(PythonASTBuilderBase,
+                    self).function_id_in_knl_callable_mapper() +
                 [scope_c_math_functions])
 
     def preamble_generators(self):
-- 
GitLab


From 906e1e2eb9a2ee0e850d28f57cccdb5e904ffd57 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 28 Aug 2018 21:35:03 -0500
Subject: [PATCH 37/80] replaces unnecessary old logic in
 unscoped_call_collector.

---
 loopy/check.py | 7 +------
 1 file changed, 1 insertion(+), 6 deletions(-)

diff --git a/loopy/check.py b/loopy/check.py
index ae5599bc..7033b62d 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -68,10 +68,6 @@ class UnscopedCallCollector(CombineMapper):
     :returns:
         An :class:`frozenset` of function names that are not scoped in
         the kernel.
-
-    .. note::
-        :class:`loopy.library.reduction.ArgExtOp` are ignored, as they are
-        never scoped in the pipeline.
     """
 
     def combine(self, values):
@@ -85,8 +81,7 @@ class UnscopedCallCollector(CombineMapper):
             kw_parameters={}))
 
     def map_call_with_kwargs(self, expr):
-        from loopy.library.reduction import ArgExtOp
-        if not isinstance(expr.function, (ResolvedFunction, ArgExtOp)):
+        if not isinstance(expr.function, ResolvedFunction):
             return (frozenset([expr.function.name]) |
                     self.combine((self.rec(child) for child in expr.parameters
                         + tuple(expr.kw_parameters.values()))))
-- 
GitLab


From eeae2d861228796110337b8b5ccacddf84b53543 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 28 Aug 2018 22:00:36 -0500
Subject: [PATCH 38/80] Comment rewording, scoper->
 function_id_to_in_knl_callable_mapper

---
 doc/ref_call.rst                   | 6 +++---
 loopy/check.py                     | 4 ++--
 loopy/kernel/__init__.py           | 2 +-
 loopy/kernel/function_interface.py | 2 +-
 loopy/library/random123.py         | 2 +-
 loopy/target/pyopencl.py           | 8 +++++---
 6 files changed, 13 insertions(+), 11 deletions(-)

diff --git a/doc/ref_call.rst b/doc/ref_call.rst
index 147363a1..ab810137 100644
--- a/doc/ref_call.rst
+++ b/doc/ref_call.rst
@@ -30,7 +30,7 @@ kernel, whose name has been resolved by the kernel. The process of matching a
 function idenitifier with the function definition is called "resolving".
 
 A pymbolic ``Call`` node can be converted to a ``ResolvedFunction`` if it
-is "resolved" by one of the ``function_scoper`` in a
+is "resolved" by one of the ``function_id_to_in_knl_callable_mapper`` in a
 :attr:`LoopKernel.scoped_functions`
 
 -  Functions already registered by the target. Some examples include --
@@ -41,11 +41,11 @@ is "resolved" by one of the ``function_scoper`` in a
 -  Functions registered as ``CallableKernels`` using
    ``lp.register_callable_kernel(...)``.
 -  Functions that have been provided through
-   ``lp.register_function_scoper(...)``
+   ``lp.register_function_id_to_in_knl_callable_mapper(...)``
 -  Functions that can be made known from the user through
    ``lp.register_function_mangler``. This is planned to be deprecated,
    as its functionality is superseded by
-   ``lp.register_function_scoper(...)``.
+   ``lp.register_function_id_to_in_knl_callable_mapper(...)``.
 
 Expressions after a function is scoped
 --------------------------------------
diff --git a/loopy/check.py b/loopy/check.py
index 7033b62d..76a56c08 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -181,8 +181,8 @@ def check_loop_priority_inames_known(kernel):
 
 
 def _get_all_unique_iname_tags(kernel):
-    """Returns a set of all the iname tags used in *kernel* that
-    inherit from :class:`loopy.kernel.data.UniqueTag`.
+    """Returns an instance of :class:`set` of all the iname tags used in
+    *kernel* that inherit from :class:`loopy.kernel.data.UniqueTag`.
     """
     from loopy.kernel.data import UniqueTag
     iname_tags = [kernel.iname_to_tag.get(iname) for iname in
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 8b2cf3dd..410f1332 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -223,7 +223,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
     .. attribute:: is_called_from_host
         An instance of :class:`bool`. Will be set *False* for the kernel which
-        would be called from another top level kernels. Default value is
+        would be called from other top level kernels. Default value is
         *True*.
 
     """
diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
index e4e8c1d5..c8b5a953 100644
--- a/loopy/kernel/function_interface.py
+++ b/loopy/kernel/function_interface.py
@@ -287,7 +287,7 @@ class InKernelCallable(ImmutableRecord):
     def with_hw_axes_sizes(self, local_size, global_size):
         """
         Returns a copy of *self* with modifications to comply with the grid
-        sizes ``(local_size, global_size)`` of the kernel in which it is
+        sizes ``(local_size, global_size)`` of the program in which it is
         supposed to be called.
 
         :arg local_size: An instance of :class:`islpy.PwAff`.
diff --git a/loopy/library/random123.py b/loopy/library/random123.py
index 59ca72df..397e985b 100644
--- a/loopy/library/random123.py
+++ b/loopy/library/random123.py
@@ -231,7 +231,7 @@ class Random123Callable(ScalarCallable):
         return
 
 
-def random123_function_scoper(target, identifier):
+def random123_function_id_to_in_knl_callable_mapper(target, identifier):
     if identifier in FUNC_NAMES_TO_RNG:
         return Random123Callable(name=identifier)
 
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 0e955648..435a5e79 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -274,7 +274,7 @@ class PyOpenCLCallable(ScalarCallable):
                 program_callables_info)
 
 
-def pyopencl_function_scoper(target, identifier):
+def pyopencl_function_id_to_in_knl_callable_mapper(target, identifier):
     if identifier in ["sqrt", "exp", "log", "sin", "cos", "tan", "sinh", "cosh",
             "tanh", "conj", "real", "imag", "abs"]:
         return PyOpenCLCallable(name=identifier)
@@ -793,9 +793,11 @@ class PyOpenCLCASTBuilder(OpenCLCASTBuilder):
     # {{{ library
 
     def function_id_in_knl_callable_mapper(self):
-        from loopy.library.random123 import random123_function_scoper
+        from loopy.library.random123 import (
+                random123_function_id_to_in_knl_callable_mapper)
         return (
-                [pyopencl_function_scoper, random123_function_scoper] + super(
+                [pyopencl_function_id_to_in_knl_callable_mapper,
+                    random123_function_id_to_in_knl_callable_mapper] + super(
                     PyOpenCLCASTBuilder, self).function_id_in_knl_callable_mapper())
 
     def preamble_generators(self):
-- 
GitLab


From 481573be0b9ebca023ce2994ed866c66cb85d6e3 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 28 Aug 2018 22:02:41 -0500
Subject: [PATCH 39/80] removes FIXME.

---
 loopy/program.py | 2 --
 1 file changed, 2 deletions(-)

diff --git a/loopy/program.py b/loopy/program.py
index a18d9076..161249e0 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -154,8 +154,6 @@ def _default_func_id_to_kernel_callable_mappers(target):
     """
     Returns a list of functions that are provided through *target* by deafault.
     """
-    # FIXME: the name -- scopers is no longer used!(change it) ~KK
-
     from loopy.library.function import (
             loopy_specific_callable_func_id_to_knl_callable_mappers)
     return (
-- 
GitLab


From 46d1502bf2372803eaaa0483a07190d4cfef60cd Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 28 Aug 2018 22:34:27 -0500
Subject: [PATCH 40/80] adds a comment that the ref_call needs one more
 revamping, removed unnecessary fixme in type_inference, some other minor
 comment rewording.

---
 doc/ref_call.rst        |  2 ++
 loopy/program.py        | 14 +++++++++-----
 loopy/statistics.py     |  4 ++--
 loopy/type_inference.py |  2 --
 4 files changed, 13 insertions(+), 9 deletions(-)

diff --git a/doc/ref_call.rst b/doc/ref_call.rst
index ab810137..5a59e842 100644
--- a/doc/ref_call.rst
+++ b/doc/ref_call.rst
@@ -4,6 +4,8 @@ Calling Loopy Kernels and External Functions
 Goals of a function interface
 -----------------------------
 
+- *FIXME:* Needs to change after the new design of program.
+
 - Must be able to have complete information of the function just through the
   epxression node.
 - Must adhere to :mod:`loopy` semantics of immutability.
diff --git a/loopy/program.py b/loopy/program.py
index 161249e0..7479ee04 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -556,6 +556,8 @@ def count_callables_in_kernel(kernel, program_callables_info):
 # {{{ program callables info
 
 class ProgramCallablesInfo(ImmutableRecord):
+    # FIXME: is CallablesTable a better name?(similar to symbol table in
+    # compilers.)
     """
     Records the information of all the callables called in a :class:`loopy.Program`.
 
@@ -637,8 +639,11 @@ class ProgramCallablesInfo(ImmutableRecord):
 
     def with_added_callable(self, function, in_kernel_callable):
         """
-        Returns a copy of *self* with the *function* associated with the
-        *in_kernel_callable*.
+        Returns an instance of :class:`tuple` of ``(new_self, new_function)``.
+        ``new_self`` is a copy of *self* with the *function* associated with the
+        *in_kernel_callable*. ``new_function`` is the function identifier that
+        should be noted in the expression node so that it could be associated
+        with an instance of :class:`InKernelCallable`.
 
         .. note::
 
@@ -739,9 +744,8 @@ class ProgramCallablesInfo(ImmutableRecord):
 
     def with_callable(self, function, in_kernel_callable):
         """
-        Returns a copy of *self* with the *function* associated with the
-        *in_kernel_callable*. Also refer --
-        :meth:`loopy.ProgramCallablesInfo.with_added_callable`
+        Returns an instance of :class:`tuple` ``(new_self, new_function)``.
+        Also refer -- :meth:`loopy.ProgramCallablesInfo.with_added_callable`
 
 
         :arg function: An instance of :class:`pymbolic.primitives.Variable` or
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 71a62986..000f651a 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -61,8 +61,8 @@ __doc__ = """
 
 
 # FIXME: this is broken for the callable kernel design.
-# Qns:
-# - The variable name, what if multiple kernels use the same name?
+# - The variable name, what if multiple kernels use the same name?(needs a
+# different MemAccessInfo)
 # - We should also add the cumulative effect on the arguments of callee kernels
 # into the caller kernel
 # - Make changes to MemAccessInfo to include the effect of several kernels.
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index d5df36bf..a2174181 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -969,8 +969,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
         if isinstance(insn, lp.MultiAssignmentBase):
             # just a dummy run over the expression, to pass over all the
             # functions
-            # FIXME: need a check over here which checks the instruction for
-            # unseen cases
             if _instruction_missed_during_inference(insn):
                 type_inf_mapper(insn.expression, return_tuple=isinstance(insn,
                     lp.CallInstruction), return_dtype_set=True)
-- 
GitLab


From e5b0303aea50dbbea889c0f16f2bea724c8c8fa1 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kgk2@illinois.edu>
Date: Fri, 31 Aug 2018 19:36:54 -0400
Subject: [PATCH 41/80] actually use `for_atomic` in the constructor

---
 loopy/kernel/array.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 186597c6..6bf733a8 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -833,6 +833,7 @@ class ArrayBase(ImmutableRecord):
                 dim_names=dim_names,
                 order=order,
                 alignment=alignment,
+                for_atomic=for_atomic,
                 **kwargs)
 
     def __eq__(self, other):
-- 
GitLab


From 5137aded65aa9e7f55219eba4b66b86055a4f627 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Sat, 1 Sep 2018 07:33:45 -0500
Subject: [PATCH 42/80] actually update the dtype target, for array base
 sub-classes.

---
 loopy/preprocess.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index fc950c78..2afcd3db 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -53,7 +53,7 @@ def prepare_for_caching(kernel):
     for arg in kernel.args:
         dtype = arg.dtype
         if dtype is not None and dtype is not lp.auto and dtype.target is not tgt:
-            arg = arg.copy(dtype=dtype.with_target(kernel.target))
+            arg = arg.copy(dtype=dtype.with_target(tgt), target=tgt)
 
         new_args.append(arg)
 
@@ -61,7 +61,7 @@ def prepare_for_caching(kernel):
     for name, temp in six.iteritems(kernel.temporary_variables):
         dtype = temp.dtype
         if dtype is not None and dtype is not lp.auto and dtype.target is not tgt:
-            temp = temp.copy(dtype=dtype.with_target(tgt))
+            temp = temp.copy(dtype=dtype.with_target(tgt), target=tgt)
 
         new_temporary_variables[name] = temp
 
-- 
GitLab


From d6b4b615ecf049314a75fc2662eef8068cf99f6a Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Mon, 3 Sep 2018 18:57:10 -0500
Subject: [PATCH 43/80] removed commented out code w/previous count granularity

---
 loopy/statistics.py | 12 ------------
 1 file changed, 12 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index f8999367..c233ab09 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -715,7 +715,6 @@ class ExpressionOpCounter(CounterBase):
         return ToCountMap(
                     {Op(dtype=self.type_inf(expr),
                         name='func:'+str(expr.function),
-                        #count_granularity=CountGranularity.WORKITEM): 1}
                         count_granularity=CountGranularity.SUBGROUP): 1}
                     ) + self.rec(expr.parameters)
 
@@ -727,7 +726,6 @@ class ExpressionOpCounter(CounterBase):
         return ToCountMap(
                     {Op(dtype=self.type_inf(expr),
                         name='add',
-                        #count_granularity=CountGranularity.WORKITEM):
                         count_granularity=CountGranularity.SUBGROUP):
                      len(expr.children)-1}
                     ) + sum(self.rec(child) for child in expr.children)
@@ -737,20 +735,17 @@ class ExpressionOpCounter(CounterBase):
         assert expr.children
         return sum(ToCountMap({Op(dtype=self.type_inf(expr),
                                   name='mul',
-                                  #count_granularity=CountGranularity.WORKITEM): 1})
                                   count_granularity=CountGranularity.SUBGROUP): 1})
                    + self.rec(child)
                    for child in expr.children
                    if not is_zero(child + 1)) + \
                    ToCountMap({Op(dtype=self.type_inf(expr),
                                   name='mul',
-                                  #count_granularity=CountGranularity.WORKITEM): -1})
                                   count_granularity=CountGranularity.SUBGROUP): -1})
 
     def map_quotient(self, expr, *args):
         return ToCountMap({Op(dtype=self.type_inf(expr),
                               name='div',
-                              #count_granularity=CountGranularity.WORKITEM): 1}) \
                               count_granularity=CountGranularity.SUBGROUP): 1}) \
                                 + self.rec(expr.numerator) \
                                 + self.rec(expr.denominator)
@@ -761,7 +756,6 @@ class ExpressionOpCounter(CounterBase):
     def map_power(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
                               name='pow',
-                              #count_granularity=CountGranularity.WORKITEM): 1}) \
                               count_granularity=CountGranularity.SUBGROUP): 1}) \
                                 + self.rec(expr.base) \
                                 + self.rec(expr.exponent)
@@ -769,7 +763,6 @@ class ExpressionOpCounter(CounterBase):
     def map_left_shift(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
                               name='shift',
-                              #count_granularity=CountGranularity.WORKITEM): 1}) \
                               count_granularity=CountGranularity.SUBGROUP): 1}) \
                                 + self.rec(expr.shiftee) \
                                 + self.rec(expr.shift)
@@ -779,14 +772,12 @@ class ExpressionOpCounter(CounterBase):
     def map_bitwise_not(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
                               name='bw',
-                              #count_granularity=CountGranularity.WORKITEM): 1}) \
                               count_granularity=CountGranularity.SUBGROUP): 1}) \
                                 + self.rec(expr.child)
 
     def map_bitwise_or(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
                               name='bw',
-                              #count_granularity=CountGranularity.WORKITEM):
                               count_granularity=CountGranularity.SUBGROUP):
                            len(expr.children)-1}) \
                                 + sum(self.rec(child) for child in expr.children)
@@ -811,7 +802,6 @@ class ExpressionOpCounter(CounterBase):
     def map_min(self, expr):
         return ToCountMap({Op(dtype=self.type_inf(expr),
                               name='maxmin',
-                              #count_granularity=CountGranularity.WORKITEM):
                               count_granularity=CountGranularity.SUBGROUP):
                            len(expr.children)-1}) \
                + sum(self.rec(child) for child in expr.children)
@@ -928,7 +918,6 @@ class LocalMemAccessCounter(MemAccessCounter):
                     sub_map[MemAccess(
                                 mtype='local',
                                 dtype=dtype,
-                                #count_granularity=CountGranularity.WORKITEM)
                                 count_granularity=CountGranularity.SUBGROUP)
                             ] = 1
                     return sub_map
@@ -949,7 +938,6 @@ class LocalMemAccessCounter(MemAccessCounter):
                         lid_strides=dict(sorted(six.iteritems(lid_strides))),
                         gid_strides=dict(sorted(six.iteritems(gid_strides))),
                         variable=name,
-                        #count_granularity=CountGranularity.WORKITEM)] = 1
                         count_granularity=CountGranularity.SUBGROUP)] = 1
 
         return sub_map
-- 
GitLab


From bd9973f33cbbbf9faf75d1fc71ec9ecbca36ed9a Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Mon, 3 Sep 2018 19:47:35 -0500
Subject: [PATCH 44/80] combined duplicate implementations of get_insn_count
 into single function

---
 loopy/statistics.py | 185 +++++++++++++++-----------------------------
 1 file changed, 63 insertions(+), 122 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index c233ab09..194775db 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -32,7 +32,7 @@ from functools import reduce
 from loopy.kernel.data import (
         MultiAssignmentBase, TemporaryVariable, AddressSpace)
 from loopy.diagnostic import warn_with_kernel, LoopyError
-from pytools import Record
+from pytools import Record, memoize_method
 
 
 __doc__ = """
@@ -1255,6 +1255,59 @@ def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False)
     else:
         return c
 
+
+@memoize_method
+def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work,
+                    count_granularity=CountGranularity.WORKITEM):
+    """Count runs of instruction *insn_id* of *knl* (via count_insn_runs) at
+    *count_granularity*; *None* falls back to work-item with a warning."""
+    insn = knl.id_to_insn[insn_id]
+
+    if count_granularity is None:
+        warn_with_kernel(knl, "get_insn_count_assumes_granularity",
+                         "get_insn_count: No count granularity passed, "
+                         "assuming %s granularity."
+                         % (CountGranularity.WORKITEM))
+        count_granularity = CountGranularity.WORKITEM  # fall back as warned
+
+    if count_granularity == CountGranularity.WORKITEM:
+        return count_insn_runs(
+            knl, insn, count_redundant_work=count_redundant_work,
+            disregard_local_axes=False)
+
+    ct_disregard_local = count_insn_runs(
+            knl, insn, disregard_local_axes=True,
+            count_redundant_work=count_redundant_work)
+
+    if count_granularity == CountGranularity.WORKGROUP:
+        return ct_disregard_local
+    elif count_granularity == CountGranularity.SUBGROUP:
+        # work-group size upper bound: product of the local axis sizes
+        from loopy.symbolic import aff_to_expr
+        _, local_size = knl.get_grid_size_upper_bounds()
+        workgroup_size = 1
+        if local_size:
+            for size in local_size:
+                s = aff_to_expr(size)
+                if not isinstance(s, int):
+                    raise LoopyError("Cannot count insn with %s granularity, "
+                                     "work-group size is not integer: %s"
+                                     % (CountGranularity.SUBGROUP, local_size))
+                workgroup_size *= s
+        warn_with_kernel(knl, "insn_count_subgroups_upper_bound",
+                "get_insn_count: when counting instruction %s with "
+                "count_granularity=%s, using upper bound for work-group size "
+                "(%d work-items) to compute sub-groups per work-group. When "
+                "multiple device programs present, actual sub-group count may be "
+                "lower." % (insn_id, CountGranularity.SUBGROUP, workgroup_size))
+        from pytools import div_ceil
+        return ct_disregard_local*div_ceil(workgroup_size, subgroup_size)
+    else:
+        # this should not happen since this is enforced in Op/MemAccess
+        raise ValueError("get_insn_count: count_granularity '%s' is "
+                "not allowed. count_granularity options: %s"
+                % (count_granularity, CountGranularity.ALL+[None]))
+
 # }}}
 
 
@@ -1360,77 +1413,18 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False,
                              "must be integer, 'guess', or, if you're feeling "
                              "lucky, None." % (subgroup_size))
 
-    # ------------------------------
-    #class CacheHolder(object):
-    #    pass
-
-    #cache_holder = CacheHolder()
-    #from pytools import memoize_in
-
-    #@memoize_in(cache_holder, "insn_count")
-    def get_insn_count(knl, insn, count_granularity=CountGranularity.WORKITEM):
-
-        if count_granularity is None:
-            warn_with_kernel(knl, "get_insn_count_assumes_granularity",
-                             "get_insn_count: No count granularity passed for "
-                             "Op, assuming %s granularity."
-                             % (CountGranularity.WORKITEM))
-            count_granularity == CountGranularity.WORKITEM
-
-        if count_granularity == CountGranularity.WORKITEM:
-            return count_insn_runs(
-                knl, insn, count_redundant_work=count_redundant_work,
-                disregard_local_axes=False)
-
-        ct_disregard_local = count_insn_runs(
-                knl, insn, disregard_local_axes=True,
-                count_redundant_work=count_redundant_work)
-
-        if count_granularity == CountGranularity.WORKGROUP:
-            return ct_disregard_local
-        elif count_granularity == CountGranularity.SUBGROUP:
-            # get the group size
-            from loopy.symbolic import aff_to_expr
-            _, local_size = knl.get_grid_size_upper_bounds()
-            workgroup_size = 1
-            if local_size:
-                for size in local_size:
-                    s = aff_to_expr(size)
-                    if not isinstance(s, int):
-                        raise LoopyError("Cannot count insn with %s granularity, "
-                                         "work-group size is not integer: %s"
-                                         % (CountGranularity.SUBGROUP, local_size))
-                    workgroup_size *= s
-
-            warn_with_kernel(knl, "insn_count_subgroups_upper_bound",
-                    "get_insn_count: when counting instruction %s with "
-                    "count_granularity=%s, using upper bound for work-group size "
-                    "(%d work-items) to compute sub-groups per work-group. When "
-                    "multiple device programs present, actual sub-group count may be"
-                    "lower." % (insn, CountGranularity.SUBGROUP, workgroup_size))
-
-            from pytools import div_ceil
-            return ct_disregard_local*div_ceil(workgroup_size, subgroup_size)
-        else:
-            # this should not happen since this is enforced in Op
-            raise ValueError("get_insn_count: count_granularity '%s' is"
-                    "not allowed. count_granularity options: %s"
-                    % (count_granularity, CountGranularity.ALL+[None]))
-    # ------------------------------
-
     op_map = ToCountMap()
     op_counter = ExpressionOpCounter(knl)
     for insn in knl.instructions:
         if isinstance(insn, (CallInstruction, CInstruction, Assignment)):
             ops = op_counter(insn.assignee) + op_counter(insn.expression)
-            #op_map = op_map + ops*count_insn_runs(
-            #        knl, insn,
-            #        count_redundant_work=count_redundant_work)
             for key, val in six.iteritems(ops):
                 op_map = (
                         op_map
                         + ToCountMap({key: val})
-                        * get_insn_count(knl, insn, key.count_granularity))
+                        * _get_insn_count(knl, insn.id, subgroup_size,
+                                         count_redundant_work,
+                                         key.count_granularity))
 
         elif isinstance(insn, (NoOpInstruction, BarrierInstruction)):
             pass
@@ -1594,63 +1588,6 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
                              "must be integer, 'guess', or, if you're feeling "
                              "lucky, None." % (subgroup_size))
 
-    class CacheHolder(object):
-        pass
-
-    cache_holder = CacheHolder()
-    from pytools import memoize_in
-
-    @memoize_in(cache_holder, "insn_count")
-    def get_insn_count(knl, insn_id, count_granularity=CountGranularity.WORKITEM):
-        insn = knl.id_to_insn[insn_id]
-
-        if count_granularity is None:
-            warn_with_kernel(knl, "get_insn_count_assumes_granularity",
-                             "get_insn_count: No count granularity passed for "
-                             "MemAccess, assuming %s granularity."
-                             % (CountGranularity.WORKITEM))
-            count_granularity == CountGranularity.WORKITEM
-
-        if count_granularity == CountGranularity.WORKITEM:
-            return count_insn_runs(
-                knl, insn, count_redundant_work=count_redundant_work,
-                disregard_local_axes=False)
-
-        ct_disregard_local = count_insn_runs(
-                knl, insn, disregard_local_axes=True,
-                count_redundant_work=count_redundant_work)
-
-        if count_granularity == CountGranularity.WORKGROUP:
-            return ct_disregard_local
-        elif count_granularity == CountGranularity.SUBGROUP:
-            # get the group size
-            from loopy.symbolic import aff_to_expr
-            _, local_size = knl.get_grid_size_upper_bounds()
-            workgroup_size = 1
-            if local_size:
-                for size in local_size:
-                    s = aff_to_expr(size)
-                    if not isinstance(s, int):
-                        raise LoopyError("Cannot count insn with %s granularity, "
-                                         "work-group size is not integer: %s"
-                                         % (CountGranularity.SUBGROUP, local_size))
-                    workgroup_size *= s
-
-            warn_with_kernel(knl, "insn_count_subgroups_upper_bound",
-                    "get_insn_count: when counting instruction %s with "
-                    "count_granularity=%s, using upper bound for work-group size "
-                    "(%d work-items) to compute sub-groups per work-group. When "
-                    "multiple device programs present, actual sub-group count may be"
-                    "lower." % (insn_id, CountGranularity.SUBGROUP, workgroup_size))
-
-            from pytools import div_ceil
-            return ct_disregard_local*div_ceil(workgroup_size, subgroup_size)
-        else:
-            # this should not happen since this is enforced in MemAccess
-            raise ValueError("get_insn_count: count_granularity '%s' is"
-                    "not allowed. count_granularity options: %s"
-                    % (count_granularity, CountGranularity.ALL+[None]))
-
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
 
@@ -1679,14 +1616,18 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
                 access_map = (
                         access_map
                         + ToCountMap({key: val})
-                        * get_insn_count(knl, insn.id, key.count_granularity))
+                        * _get_insn_count(knl, insn.id, subgroup_size,
+                                          count_redundant_work,
+                                          key.count_granularity))
 
             for key, val in six.iteritems(access_assignee.count_map):
 
                 access_map = (
                         access_map
                         + ToCountMap({key: val})
-                        * get_insn_count(knl, insn.id, key.count_granularity))
+                        * _get_insn_count(knl, insn.id, subgroup_size,
+                                          count_redundant_work,
+                                          key.count_granularity))
 
         elif isinstance(insn, (NoOpInstruction, BarrierInstruction)):
             pass
-- 
GitLab


From de02e453215cec23136b31bcb3741bbe881f6b04 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Mon, 3 Sep 2018 19:53:01 -0500
Subject: [PATCH 45/80] moved import statements in get_op_map and
 get_mem_access_map closer to where they are used

---
 loopy/statistics.py | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 194775db..02b79a81 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1375,13 +1375,6 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False,
         raise LoopyError("Kernel '%s': Using operation counting requires the option "
                 "ignore_boostable_into to be set." % knl.name)
 
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    from loopy.kernel.instruction import (
-            CallInstruction, CInstruction, Assignment,
-            NoOpInstruction, BarrierInstruction)
-    knl = infer_unknown_types(knl, expect_completion=True)
-    knl = preprocess_kernel(knl)
-
     if not isinstance(subgroup_size, int):
         # try to find subgroup_size
         subgroup_size_guess = _find_subgroup_size_for_knl(knl)
@@ -1413,8 +1406,17 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False,
                              "must be integer, 'guess', or, if you're feeling "
                              "lucky, None." % (subgroup_size))
 
+    from loopy.preprocess import preprocess_kernel, infer_unknown_types
+    knl = infer_unknown_types(knl, expect_completion=True)
+    knl = preprocess_kernel(knl)
+
     op_map = ToCountMap()
     op_counter = ExpressionOpCounter(knl)
+
+    from loopy.kernel.instruction import (
+            CallInstruction, CInstruction, Assignment,
+            NoOpInstruction, BarrierInstruction)
+
     for insn in knl.instructions:
         if isinstance(insn, (CallInstruction, CInstruction, Assignment)):
             ops = op_counter(insn.assignee) + op_counter(insn.expression)
@@ -1551,7 +1553,6 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
         # (now use these counts to, e.g., predict performance)
 
     """
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
 
     if not knl.options.ignore_boostable_into:
         raise LoopyError("Kernel '%s': Using operation counting requires the option "
@@ -1588,6 +1589,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
                              "must be integer, 'guess', or, if you're feeling "
                              "lucky, None." % (subgroup_size))
 
+    from loopy.preprocess import preprocess_kernel, infer_unknown_types
     knl = infer_unknown_types(knl, expect_completion=True)
     knl = preprocess_kernel(knl)
 
-- 
GitLab


From 0cf111d21244988e976ac7d6a26591b22f60cd11 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Mon, 3 Sep 2018 20:23:06 -0500
Subject: [PATCH 46/80] combined duplicate code processing subgroup_size into
 single function

---
 loopy/statistics.py | 100 ++++++++++++++++++--------------------------
 1 file changed, 40 insertions(+), 60 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 02b79a81..f71e1d91 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1375,36 +1375,7 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False,
         raise LoopyError("Kernel '%s': Using operation counting requires the option "
                 "ignore_boostable_into to be set." % knl.name)
 
-    if not isinstance(subgroup_size, int):
-        # try to find subgroup_size
-        subgroup_size_guess = _find_subgroup_size_for_knl(knl)
-
-        if subgroup_size is None:
-            if subgroup_size_guess is None:
-                # 'guess' was not passed and either no target device found
-                # or get_simd_group_size returned None
-                raise ValueError("No sub-group size passed, no target device found. "
-                                 "Either (1) pass integer value for subgroup_size, "
-                                 "(2) ensure that kernel.target is PyOpenClTarget "
-                                 "and kernel.target.device is set, or (3) pass "
-                                 "subgroup_size='guess' and hope for the best.")
-            else:
-                subgroup_size = subgroup_size_guess
-
-        elif subgroup_size == 'guess':
-            if subgroup_size_guess is None:
-                # unable to get subgroup_size from device, so guess
-                subgroup_size = 32
-                warn_with_kernel(knl, "get_op_map_guessing_subgroup_size",
-                                 "get_op_map: 'guess' sub-group size "
-                                 "passed, no target device found, wildly guessing "
-                                 "that sub-group size is %d." % (subgroup_size))
-            else:
-                subgroup_size = subgroup_size_guess
-        else:
-            raise ValueError("Invalid value for subgroup_size: %s. subgroup_size "
-                             "must be integer, 'guess', or, if you're feeling "
-                             "lucky, None." % (subgroup_size))
+    subgroup_size = _process_subgroup_size(knl, subgroup_size)
 
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
     knl = infer_unknown_types(knl, expect_completion=True)
@@ -1465,6 +1436,44 @@ def _find_subgroup_size_for_knl(knl):
         return None
 
 
+@memoize_method
+def _process_subgroup_size(knl, subgroup_size_requested):
+    """Resolve *subgroup_size_requested* (int, 'guess' or None) to a size."""
+    if isinstance(subgroup_size_requested, int):
+        return subgroup_size_requested
+    else:
+        # try to find subgroup_size
+        subgroup_size_guess = _find_subgroup_size_for_knl(knl)
+
+        if subgroup_size_requested is None:
+            if subgroup_size_guess is None:
+                # 'guess' was not passed and either no target device found
+                # or get_simd_group_size returned None
+                raise ValueError("No sub-group size passed, no target device found. "
+                                 "Either (1) pass integer value for subgroup_size, "
+                                 "(2) ensure that kernel.target is PyOpenClTarget "
+                                 "and kernel.target.device is set, or (3) pass "
+                                 "subgroup_size='guess' and hope for the best.")
+            else:
+                return subgroup_size_guess
+
+        elif subgroup_size_requested == 'guess':
+            if subgroup_size_guess is None:
+                # unable to get subgroup_size from device, so guess
+                subgroup_size_guess = 32
+                warn_with_kernel(knl, "get_x_map_guessing_subgroup_size",
+                                 "'guess' sub-group size passed, no target device "
+                                 "found, wildly guessing that sub-group size is %d."
+                                 % (subgroup_size_guess))
+                return subgroup_size_guess
+            else:
+                return subgroup_size_guess
+        else:
+            raise ValueError("Invalid value for subgroup_size: %s. subgroup_size "
+                             "must be integer, 'guess', or, if you're feeling "
+                             "lucky, None." % (subgroup_size_requested))
+
+
 # {{{ get_mem_access_map
 
 def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
@@ -1558,36 +1567,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
         raise LoopyError("Kernel '%s': Using operation counting requires the option "
                 "ignore_boostable_into to be set." % knl.name)
 
-    if not isinstance(subgroup_size, int):
-        # try to find subgroup_size
-        subgroup_size_guess = _find_subgroup_size_for_knl(knl)
-
-        if subgroup_size is None:
-            if subgroup_size_guess is None:
-                # 'guess' was not passed and either no target device found
-                # or get_simd_group_size returned None
-                raise ValueError("No sub-group size passed, no target device found. "
-                                 "Either (1) pass integer value for subgroup_size, "
-                                 "(2) ensure that kernel.target is PyOpenClTarget "
-                                 "and kernel.target.device is set, or (3) pass "
-                                 "subgroup_size='guess' and hope for the best.")
-            else:
-                subgroup_size = subgroup_size_guess
-
-        elif subgroup_size == 'guess':
-            if subgroup_size_guess is None:
-                # unable to get subgroup_size from device, so guess
-                subgroup_size = 32
-                warn_with_kernel(knl, "get_mem_access_map_guessing_subgroup_size",
-                                 "get_mem_access_map: 'guess' sub-group size "
-                                 "passed, no target device found, wildly guessing "
-                                 "that sub-group size is %d." % (subgroup_size))
-            else:
-                subgroup_size = subgroup_size_guess
-        else:
-            raise ValueError("Invalid value for subgroup_size: %s. subgroup_size "
-                             "must be integer, 'guess', or, if you're feeling "
-                             "lucky, None." % (subgroup_size))
+    subgroup_size = _process_subgroup_size(knl, subgroup_size)
 
     from loopy.preprocess import preprocess_kernel, infer_unknown_types
     knl = infer_unknown_types(knl, expect_completion=True)
-- 
GitLab


From b9d8034bb415ca91a2d392db51d278d8b34de0c0 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Mon, 3 Sep 2018 20:30:02 -0500
Subject: [PATCH 47/80] added subgroup_size arg to get_op_map in tutorial

---
 doc/tutorial.rst | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 71b8f438..3019a47a 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1551,7 +1551,7 @@ information provided. Now we will count the operations:
 
 .. doctest::
 
-    >>> op_map = lp.get_op_map(knl)
+    >>> op_map = lp.get_op_map(knl, subgroup_size=32)
     >>> print(lp.stringify_stats_mapping(op_map))
     Op(np:dtype('float32'), add, workitem) : ...
 
-- 
GitLab


From eb4cfc8ad59d84bbc21cf3c719680e8fdda3c857 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Mon, 3 Sep 2018 20:45:21 -0500
Subject: [PATCH 48/80] updated doctests to reflect count granularity change
 for local ops (workitem->subgroup)

---
 doc/tutorial.rst | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 3019a47a..1272d2a5 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1553,11 +1553,11 @@ information provided. Now we will count the operations:
 
     >>> op_map = lp.get_op_map(knl, subgroup_size=32)
     >>> print(lp.stringify_stats_mapping(op_map))
-    Op(np:dtype('float32'), add, workitem) : ...
+    Op(np:dtype('float32'), add, subgroup) : ...
 
 Each line of output will look roughly like::
 
-    Op(np:dtype('float32'), add, workitem) : [l, m, n] -> { l * m * n : l > 0 and m > 0 and n > 0 }
+    Op(np:dtype('float32'), add, subgroup) : [l, m, n] -> { l * m * n : l > 0 and m > 0 and n > 0 }
 
 :func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{**
 :class:`loopy.Op` **:** :class:`islpy.PwQPolynomial` **}**. A
@@ -1579,12 +1579,12 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`:
 
     >>> param_dict = {'n': 256, 'm': 256, 'l': 8}
     >>> from loopy.statistics import CountGranularity as CG
-    >>> f32add = op_map[lp.Op(np.float32, 'add', CG.WORKITEM)].eval_with_dict(param_dict)
-    >>> f32div = op_map[lp.Op(np.float32, 'div', CG.WORKITEM)].eval_with_dict(param_dict)
-    >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.WORKITEM)].eval_with_dict(param_dict)
-    >>> f64add = op_map[lp.Op(np.float64, 'add', CG.WORKITEM)].eval_with_dict(param_dict)
-    >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.WORKITEM)].eval_with_dict(param_dict)
-    >>> i32add = op_map[lp.Op(np.int32, 'add', CG.WORKITEM)].eval_with_dict(param_dict)
+    >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict)
+    >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP)].eval_with_dict(param_dict)
+    >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict)
+    >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP)].eval_with_dict(param_dict)
+    >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP)].eval_with_dict(param_dict)
+    >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP)].eval_with_dict(param_dict)
     >>> print("%i\n%i\n%i\n%i\n%i\n%i" %
     ...     (f32add, f32div, f32mul, f64add, f64mul, i32add))
     524288
-- 
GitLab


From 6b34b8aa3a259b969514a9671fa91663a767c8a7 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Mon, 3 Sep 2018 21:05:36 -0500
Subject: [PATCH 49/80] passing subgroup_size to get_op_map

---
 test/test_numa_diff.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index 6b578838..15d5ea7c 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -231,7 +231,7 @@ def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
 
     if 1:
         print("OPS")
-        op_map = lp.get_op_map(hsv)
+        op_map = lp.get_op_map(hsv, subgroup_size=32)
         print(lp.stringify_stats_mapping(op_map))
 
         print("MEM")
-- 
GitLab


From 6e798d058cddda0ca5dcb6c9519f4c42c59db97b Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Mon, 3 Sep 2018 21:22:59 -0500
Subject: [PATCH 50/80] bug fix, calling iteritems on dict within ToCountMap,
 rather than ToCountMap itself

---
 loopy/statistics.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index f71e1d91..3fecfb77 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1391,7 +1391,7 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False,
     for insn in knl.instructions:
         if isinstance(insn, (CallInstruction, CInstruction, Assignment)):
             ops = op_counter(insn.assignee) + op_counter(insn.expression)
-            for key, val in six.iteritems(ops):
+            for key, val in six.iteritems(ops.count_map):
                 op_map = (
                         op_map
                         + ToCountMap({key: val})
-- 
GitLab


From aee564f107546e8db9d5b0186c26aca10f2d3b8a Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Thu, 6 Sep 2018 13:38:08 -0500
Subject: [PATCH 51/80] Fix and test generation of ISPC streaming stores

---
 loopy/target/ispc.py |  4 ++--
 test/test_target.py  | 29 +++++++++++++++++++++++++++++
 2 files changed, 31 insertions(+), 2 deletions(-)

diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 261475eb..9009b144 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -437,9 +437,9 @@ class ISPCASTBuilder(CASTBuilder):
                 else:
                     for dep in get_dependencies(term):
                         if filter_iname_tags_by_type(
-                                kernel.iname_to_tags[dep], LocalIndexTag):
+                                kernel.iname_to_tags.get(dep, []), LocalIndexTag):
                             tag, = filter_iname_tags_by_type(
-                                kernel.iname_to_tags[dep], LocalIndexTag, 1)
+                                kernel.iname_to_tags.get(dep, []), LocalIndexTag, 1)
                             if tag.axis == 0:
                                 raise LoopyError(
                                     "streaming store must have stride 1 in "
diff --git a/test/test_target.py b/test/test_target.py
index eb94bdc8..a0011426 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -327,6 +327,35 @@ def test_target_invalid_type_cast():
         lp.TypeCast(dtype, 1)
 
 
+def test_ispc_streaming_stores():
+    stream_dtype = np.float32
+    index_dtype = np.int32
+
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<n}",
+            "a[i] = b[i] + scalar * c[i]",
+            target=lp.ISPCTarget(), index_dtype=index_dtype,
+            name="stream_triad")
+
+    vars = ["a", "b", "c", "scalar"]
+    knl = lp.assume(knl, "n>0")
+    knl = lp.split_iname(
+        knl, "i", 2**18, outer_tag="g.0", slabs=(0, 1))
+    knl = lp.split_iname(knl, "i_inner", 8, inner_tag="l.0")
+    knl = lp.tag_instructions(knl, "!streaming_store")
+
+    knl = lp.add_and_infer_dtypes(knl, {
+        var: stream_dtype
+        for var in vars
+        })
+
+    knl = lp.set_argument_order(knl, vars + ["n"])
+    # no assertions below: the test passes if code generation does not raise
+    knl = lp.preprocess_kernel(knl)
+    knl = lp.get_one_scheduled_kernel(knl)
+    lp.generate_code_v2(knl).all_code()
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
-- 
GitLab


From 680842c5612a8422cba5e7a6286d37e20cdfdaf6 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Thu, 6 Sep 2018 20:59:39 -0500
Subject: [PATCH 52/80] split_iname handles within correctly

---
 loopy/transform/iname.py | 22 ++++++++++++++++++----
 test/test_transform.py   | 17 +++++++++++++++++
 2 files changed, 35 insertions(+), 4 deletions(-)

diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index 2b618a46..65f1c2ec 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -177,6 +177,22 @@ def _split_iname_backend(kernel, split_iname,
         for syntax.
     """
 
+    from loopy.match import parse_stack_match
+    within = parse_stack_match(within)
+
+    # {{{ return the same kernel if no kernel matches
+
+    def _do_not_transform_if_no_within_matches():
+        for insn in kernel.instructions:
+            if within(kernel, insn, ()):
+                return
+
+        return kernel
+
+    _do_not_transform_if_no_within_matches()
+
+    # }}}
+
     existing_tags = kernel.iname_tags(split_iname)
     from loopy.kernel.data import ForceSequentialTag, filter_iname_tags_by_type
     if (do_tagged_check and existing_tags
@@ -249,7 +265,8 @@ def _split_iname_backend(kernel, split_iname,
 
     new_insns = []
     for insn in kernel.instructions:
-        if split_iname in insn.within_inames:
+        if split_iname in insn.within_inames and (
+                within(kernel, insn, ())):
             new_within_inames = (
                     (insn.within_inames.copy()
                     - frozenset([split_iname]))
@@ -284,9 +301,6 @@ def _split_iname_backend(kernel, split_iname,
             applied_iname_rewrites=applied_iname_rewrites,
             loop_priority=frozenset(new_priorities))
 
-    from loopy.match import parse_stack_match
-    within = parse_stack_match(within)
-
     rule_mapping_context = SubstitutionRuleMappingContext(
             kernel.substitutions, kernel.get_var_name_generator())
     ins = _InameSplitter(rule_mapping_context, within,
diff --git a/test/test_transform.py b/test/test_transform.py
index ed184fb5..394cf668 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -533,6 +533,23 @@ def test_uniquify_instruction_ids():
     assert all(isinstance(id, str) for id in insn_ids)
 
 
+def test_split_iname_only_if_in_within():
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<10}",
+            """
+            c[i] = 3*d[i] {id=to_split}
+            a[i] = 2*b[i] {id=not_to_split}
+            """)
+
+    knl = lp.split_iname(knl, "i", 4, within='id:to_split')
+    # only 'to_split' should see the split inames; 'not_to_split' keeps 'i'
+    for insn in knl.instructions:
+        if insn.id == 'to_split':
+            assert insn.within_inames == frozenset({'i_outer', 'i_inner'})
+        if insn.id == 'not_to_split':
+            assert insn.within_inames == frozenset({'i'})
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
-- 
GitLab


From 796c89fcd819efe80e4c868996cd688559c482d9 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Thu, 6 Sep 2018 22:49:25 -0500
Subject: [PATCH 53/80] improve the logic of project_out

---
 loopy/transform/iname.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index 65f1c2ec..fb3609b6 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -177,14 +177,15 @@ def _split_iname_backend(kernel, split_iname,
         for syntax.
     """
 
-    from loopy.match import parse_stack_match
-    within = parse_stack_match(within)
+    from loopy.match import parse_stack_match, parse_match
+    stacked_within = parse_stack_match(within)
+    within = parse_match(within)
 
     # {{{ return the same kernel if no kernel matches
 
     def _do_not_transform_if_no_within_matches():
         for insn in kernel.instructions:
-            if within(kernel, insn, ()):
+            if within(kernel, insn):
                 return
 
         return kernel
@@ -246,10 +247,15 @@ def _split_iname_backend(kernel, split_iname,
         name_dim_type, name_idx = space.get_var_dict()[split_iname]
         s = s.intersect(fixed_constraint_set)
 
-        if within is None:
-            s = s.project_out(name_dim_type, name_idx, 1)
+        def _project_out_only_if_all_instructions_in_within():
+            for insn in kernel.instructions:
+                if split_iname in insn.within_inames and (
+                        not within(kernel, insn)):
+                    return s
+
+            return s.project_out(name_dim_type, name_idx, 1)
 
-        return s
+        return _project_out_only_if_all_instructions_in_within()
 
     new_domains = [process_set(dom) for dom in kernel.domains]
 
@@ -266,7 +272,7 @@ def _split_iname_backend(kernel, split_iname,
     new_insns = []
     for insn in kernel.instructions:
         if split_iname in insn.within_inames and (
-                within(kernel, insn, ())):
+                within(kernel, insn)):
             new_within_inames = (
                     (insn.within_inames.copy()
                     - frozenset([split_iname]))
@@ -303,7 +309,7 @@ def _split_iname_backend(kernel, split_iname,
 
     rule_mapping_context = SubstitutionRuleMappingContext(
             kernel.substitutions, kernel.get_var_name_generator())
-    ins = _InameSplitter(rule_mapping_context, within,
+    ins = _InameSplitter(rule_mapping_context, stacked_within,
             split_iname, outer_iname, inner_iname, new_loop_index)
 
     kernel = ins.map_kernel(kernel)
-- 
GitLab


From 190e41dbfc534c0126e2a5e73659c1120dbdb43d Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Sat, 8 Sep 2018 11:05:01 -0500
Subject: [PATCH 54/80] corrects get_highlighted_code call

---
 loopy/target/c/c_execution.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py
index 6b80bae2..f7622936 100644
--- a/loopy/target/c/c_execution.py
+++ b/loopy/target/c/c_execution.py
@@ -402,7 +402,7 @@ class CKernelExecutor(KernelExecutorBase):
         if self.kernel.options.write_cl:
             output = all_code
             if self.kernel.options.highlight_cl:
-                output = get_highlighted_code(code=output)
+                output = get_highlighted_code(output)
 
             if self.kernel.options.write_cl is True:
                 print(output)
-- 
GitLab


From 2cac2bf1c91e87a97b673b907604146a91d2e696 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Mon, 10 Sep 2018 15:58:17 -0500
Subject: [PATCH 55/80] disables support for parse_stack_match in split_iname

---
 loopy/transform/iname.py | 13 +++++--------
 1 file changed, 5 insertions(+), 8 deletions(-)

diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index fb3609b6..ad1da3e7 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -139,8 +139,7 @@ class _InameSplitter(RuleAwareIdentityMapper):
                 and self.split_iname not in expn_state.arg_context
                 and self.within(
                     expn_state.kernel,
-                    expn_state.instruction,
-                    expn_state.stack)):
+                    expn_state.instruction)):
             new_inames = list(expr.inames)
             new_inames.remove(self.split_iname)
             new_inames.extend([self.outer_iname, self.inner_iname])
@@ -157,8 +156,7 @@ class _InameSplitter(RuleAwareIdentityMapper):
                 and self.split_iname not in expn_state.arg_context
                 and self.within(
                     expn_state.kernel,
-                    expn_state.instruction,
-                    expn_state.stack)):
+                    expn_state.instruction)):
             return self.replacement_index
         else:
             return super(_InameSplitter, self).map_variable(expr, expn_state)
@@ -177,8 +175,7 @@ def _split_iname_backend(kernel, split_iname,
         for syntax.
     """
 
-    from loopy.match import parse_stack_match, parse_match
-    stacked_within = parse_stack_match(within)
+    from loopy.match import parse_match
     within = parse_match(within)
 
     # {{{ return the same kernel if no kernel matches
@@ -309,7 +306,7 @@ def _split_iname_backend(kernel, split_iname,
 
     rule_mapping_context = SubstitutionRuleMappingContext(
             kernel.substitutions, kernel.get_var_name_generator())
-    ins = _InameSplitter(rule_mapping_context, stacked_within,
+    ins = _InameSplitter(rule_mapping_context, within,
             split_iname, outer_iname, inner_iname, new_loop_index)
 
     kernel = ins.map_kernel(kernel)
@@ -349,7 +346,7 @@ def split_iname(kernel, split_iname, inner_length,
     :arg inner_tag: The iname tag (see :ref:`iname-tags`) to apply to
         *inner_iname*.
     :arg within: a stack match as understood by
-        :func:`loopy.match.parse_stack_match`.
+        :func:`loopy.match.parse_match`.
     """
     def make_new_loop_index(inner, outer):
         return inner + outer*inner_length
-- 
GitLab


From eb42917a6d5b7a923384ae91902cb7cc89dc63ba Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Mon, 17 Sep 2018 11:50:31 -0500
Subject: [PATCH 56/80] fixes the statistics tests

---
 loopy/statistics.py | 39 ++++++++++++++++-----------------------
 1 file changed, 16 insertions(+), 23 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 9894656b..5dddd49e 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1286,8 +1286,8 @@ def count_insn_runs(knl, program_callables_info, insn, count_redundant_work,
 
 
 @memoize_method
-def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work,
-                    count_granularity=CountGranularity.WORKITEM):
+def _get_insn_count(knl, program_callables_info, insn_id, subgroup_size,
+        count_redundant_work, count_granularity=CountGranularity.WORKITEM):
     insn = knl.id_to_insn[insn_id]
 
     if count_granularity is None:
@@ -1299,11 +1299,12 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work,
 
     if count_granularity == CountGranularity.WORKITEM:
         return count_insn_runs(
-            knl, insn, count_redundant_work=count_redundant_work,
+            knl, program_callables_info, insn,
+            count_redundant_work=count_redundant_work,
             disregard_local_axes=False)
 
     ct_disregard_local = count_insn_runs(
-            knl, insn, disregard_local_axes=True,
+            knl, program_callables_info, insn, disregard_local_axes=True,
             count_redundant_work=count_redundant_work)
 
     if count_granularity == CountGranularity.WORKGROUP:
@@ -1311,7 +1312,7 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work,
     elif count_granularity == CountGranularity.SUBGROUP:
         # get the group size
         from loopy.symbolic import aff_to_expr
-        _, local_size = knl.get_grid_size_upper_bounds()
+        _, local_size = knl.get_grid_size_upper_bounds(program_callables_info)
         workgroup_size = 1
         if local_size:
             for size in local_size:
@@ -1353,12 +1354,8 @@ def get_op_map_for_single_kernel(knl, program_callables_info,
 
     subgroup_size = _process_subgroup_size(knl, subgroup_size)
 
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    knl = infer_unknown_types(knl, expect_completion=True)
-    knl = preprocess_kernel(knl)
-
     op_map = ToCountMap()
-    op_counter = ExpressionOpCounter(knl)
+    op_counter = ExpressionOpCounter(knl, program_callables_info)
 
     from loopy.kernel.instruction import (
             CallInstruction, CInstruction, Assignment,
@@ -1371,9 +1368,9 @@ def get_op_map_for_single_kernel(knl, program_callables_info,
                 op_map = (
                         op_map
                         + ToCountMap({key: val})
-                        * _get_insn_count(knl, insn.id, subgroup_size,
-                                         count_redundant_work,
-                                         key.count_granularity))
+                        * _get_insn_count(knl, program_callables_info, insn.id,
+                            subgroup_size, count_redundant_work,
+                            key.count_granularity))
 
         elif isinstance(insn, (NoOpInstruction, BarrierInstruction)):
             pass
@@ -1547,10 +1544,6 @@ def get_mem_access_map_for_single_kernel(knl, program_callables_info,
 
     subgroup_size = _process_subgroup_size(knl, subgroup_size)
 
-    from loopy.preprocess import preprocess_kernel, infer_unknown_types
-    knl = infer_unknown_types(knl, expect_completion=True)
-    knl = preprocess_kernel(knl)
-
     access_map = ToCountMap()
     access_counter_g = GlobalMemAccessCounter(knl, program_callables_info)
     access_counter_l = LocalMemAccessCounter(knl, program_callables_info)
@@ -1576,18 +1569,18 @@ def get_mem_access_map_for_single_kernel(knl, program_callables_info,
                 access_map = (
                         access_map
                         + ToCountMap({key: val})
-                        * _get_insn_count(knl, insn.id, subgroup_size,
-                                          count_redundant_work,
-                                          key.count_granularity))
+                        * _get_insn_count(knl, program_callables_info, insn.id,
+                            subgroup_size, count_redundant_work,
+                            key.count_granularity))
 
             for key, val in six.iteritems(access_assignee.count_map):
 
                 access_map = (
                         access_map
                         + ToCountMap({key: val})
-                        * _get_insn_count(knl, insn.id, subgroup_size,
-                                          count_redundant_work,
-                                          key.count_granularity))
+                        * _get_insn_count(knl, program_callables_info, insn.id,
+                            subgroup_size, count_redundant_work,
+                            key.count_granularity))
 
         elif isinstance(insn, (NoOpInstruction, BarrierInstruction)):
             pass
-- 
GitLab


From 7389731759bb8b5d8978a7368a2236e7a9554631 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Mon, 17 Sep 2018 12:57:09 -0500
Subject: [PATCH 57/80] make the test adapt to the program model

---
 test/test_target.py    | 2 --
 test/test_transform.py | 6 +++---
 2 files changed, 3 insertions(+), 5 deletions(-)

diff --git a/test/test_target.py b/test/test_target.py
index 0eee835c..a5186c71 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -347,8 +347,6 @@ def test_ispc_streaming_stores():
 
     knl = lp.set_argument_order(knl, vars + ["n"])
 
-    knl = lp.preprocess_kernel(knl)
-    knl = lp.get_one_scheduled_kernel(knl)
     lp.generate_code_v2(knl).all_code()
 
 
diff --git a/test/test_transform.py b/test/test_transform.py
index f67cb927..04162331 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -544,16 +544,16 @@ def test_uniquify_instruction_ids():
 
 
 def test_split_iname_only_if_in_within():
-    knl = lp.make_kernel(
+    prog = lp.make_kernel(
             "{[i]: 0<=i<10}",
             """
             c[i] = 3*d[i] {id=to_split}
             a[i] = 2*b[i] {id=not_to_split}
             """)
 
-    knl = lp.split_iname(knl, "i", 4, within='id:to_split')
+    prog = lp.split_iname(prog, "i", 4, within='id:to_split')
 
-    for insn in knl.instructions:
+    for insn in prog.root_kernel.instructions:
         if insn.id == 'to_split':
             assert insn.within_inames == frozenset({'i_outer', 'i_inner'})
         if insn.id == 'not_to_split':
-- 
GitLab


From ba27e5defa26d171e5039de2fa877fc1e1b144d0 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Sun, 14 Oct 2018 20:17:13 -0500
Subject: [PATCH 58/80] minor changes after the review

---
 examples/python/hello-loopy.py |  3 +--
 loopy/auto_test.py             |  2 +-
 loopy/check.py                 |  4 ++--
 loopy/codegen/__init__.py      | 11 +++++++++++
 loopy/type_inference.py        |  4 ++--
 5 files changed, 17 insertions(+), 7 deletions(-)

diff --git a/examples/python/hello-loopy.py b/examples/python/hello-loopy.py
index 764cea0e..9098c544 100644
--- a/examples/python/hello-loopy.py
+++ b/examples/python/hello-loopy.py
@@ -16,8 +16,7 @@ a = cl.array.arange(queue, n, dtype=np.float32)
 # ------
 knl = lp.make_kernel(
         "{ [i]: 0<=i<n }",
-        "out[i] = 2*a[i]",
-        target=lp.PyOpenCLTarget(ctx.devices[0]))
+        "out[i] = 2*a[i]")
 
 # transform
 # ---------
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index 5ce80ed8..bee1b72f 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -118,7 +118,7 @@ def make_ref_args(program, impl_arg_info, queue, parameters):
             shape = evaluate_shape(arg.unvec_shape, parameters)
             dtype = kernel_arg.dtype
 
-            is_output = arg.base_name in program.root_kernel.get_written_variables()
+            is_output = arg.base_name in kernel_arg.is_output_only
 
             if arg.arg_class is ImageArg:
                 storage_array = ary = cl_array.empty(
diff --git a/loopy/check.py b/loopy/check.py
index 76a56c08..bfcd7aa2 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -98,7 +98,7 @@ class UnscopedCallCollector(CombineMapper):
     map_type_cast = map_constant
 
 
-def check_functions_are_scoped(kernel):
+def check_functions_are_resolved(kernel):
     """ Checks if all the calls in the instruction expression have been scoped,
     otherwise indicates to what all calls we await signature. Refer
     :class:`loopy.symbolic.ResolvedFunction` for a detailed explanation of a
@@ -120,7 +120,7 @@ def check_functions_are_scoped(kernel):
             pass
         else:
             raise NotImplementedError(
-                    "Unknown type of instruction %s" % type(insn).__name__)
+                    "Unsupported instruction type %s." % type(insn).__name__)
 
 # }}}
 
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 7a25b67e..d0b19a1e 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -193,6 +193,8 @@ class CodeGenerationState(object):
     .. attribute:: schedule_index_end
 
     .. attribute:: program_callables_info
+
+        An instance of :class:`loopy.ProgramCallablesInfo`.
     """
 
     def __init__(self, kernel,
@@ -386,6 +388,10 @@ class PreambleInfo(ImmutableRecord):
 def generate_code_for_a_single_kernel(kernel, program_callables_info):
     """
     :returns: a :class:`CodeGenerationResult`
+
+    :param kernel: An instance of :class:`loopy.LoopKernel`.
+    :param program_callables_info: An instance of
+        :class:`loopy.ProgramCallablesInfo`.
     """
 
     from loopy.kernel import KernelState
@@ -529,6 +535,11 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info):
 
 
 def generate_code_v2(program):
+    """
+    Returns an instance of :class:`CodeGenerationResult`.
+
+    :param program: An instance of :class:`loopy.Program`.
+    """
     from loopy.kernel import LoopKernel
     from loopy.program import make_program_from_kernel
 
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index a2174181..43986640 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -1000,8 +1000,8 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
     if expect_completion:
         # if completion is expected, then it is important that all the
         # callables are scoped.
-        from loopy.check import check_functions_are_scoped
-        check_functions_are_scoped(type_specialized_kernel)
+        from loopy.check import check_functions_are_resolved
+        check_functions_are_resolved(type_specialized_kernel)
 
     return type_specialized_kernel, program_callables_info
 
-- 
GitLab


From 0e9627d8a27ae27be5ae559befdfc52d93806a8e Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Sun, 14 Oct 2018 20:19:03 -0500
Subject: [PATCH 59/80] arg_is_output_only -> args_are_output_only

---
 loopy/kernel/creation.py           | 4 ++--
 loopy/kernel/function_interface.py | 4 ++--
 loopy/kernel/tools.py              | 2 +-
 3 files changed, 5 insertions(+), 5 deletions(-)

diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index bc996d9c..685232c6 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -2166,8 +2166,8 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     check_for_duplicate_names(knl)
     check_written_variable_names(knl)
 
-    from loopy.kernel.tools import infer_arg_is_output_only
-    knl = infer_arg_is_output_only(knl)
+    from loopy.kernel.tools import infer_args_are_output_only
+    knl = infer_args_are_output_only(knl)
 
     from loopy.preprocess import prepare_for_caching
     knl = prepare_for_caching(knl)
diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
index c8b5a953..323690af 100644
--- a/loopy/kernel/function_interface.py
+++ b/loopy/kernel/function_interface.py
@@ -111,8 +111,8 @@ def get_kw_pos_association(kernel):
     Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in
     *kernel*.
     """
-    from loopy.kernel.tools import infer_arg_is_output_only
-    kernel = infer_arg_is_output_only(kernel)
+    from loopy.kernel.tools import infer_args_are_output_only
+    kernel = infer_args_are_output_only(kernel)
     kw_to_pos = {}
     pos_to_kw = {}
 
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 3c0c2443..3f4defc5 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -1866,7 +1866,7 @@ def find_aliasing_equivalence_classes(kernel):
 
 # {{{ direction helper tools
 
-def infer_arg_is_output_only(kernel):
+def infer_args_are_output_only(kernel):
     """
     Returns a copy of *kernel* with the attribute ``is_output_only`` set.
 
-- 
GitLab


From 111a5eb42b33b3d080027175533a06f57d32283a Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Sun, 14 Oct 2018 20:28:15 -0500
Subject: [PATCH 60/80] minor changes after review

---
 loopy/kernel/function_interface.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
index 323690af..268bdaa1 100644
--- a/loopy/kernel/function_interface.py
+++ b/loopy/kernel/function_interface.py
@@ -111,8 +111,6 @@ def get_kw_pos_association(kernel):
     Returns a tuple of ``(kw_to_pos, pos_to_kw)`` for the arguments in
     *kernel*.
     """
-    from loopy.kernel.tools import infer_args_are_output_only
-    kernel = infer_args_are_output_only(kernel)
     kw_to_pos = {}
     pos_to_kw = {}
 
@@ -136,7 +134,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord):
     """
     Helper class to set the
     :attr:`loopy.kernel.LoopKernel.override_get_grid_size_for_insn_ids` of the
-    callee kernels. Refer
+    callee kernels. Refer to
     :func:`loopy.kernel.function_interface.GridOverrideForCalleeKernel.__call__`,
     :func:`loopy.kernel.function_interface.CallbleKernel.with_hw_axes_sizes`.
 
@@ -301,7 +299,8 @@ class InKernelCallable(ImmutableRecord):
                 self.arg_id_to_descr is not None)
 
     def generate_preambles(self, target):
-        """ Yields the target specific preamble.
+        """
+        Yields the target specific preamble.
         """
         raise NotImplementedError()
 
-- 
GitLab


From c194c74e22513140f9e0afd92a428c42ba3fcfb6 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Sun, 14 Oct 2018 20:30:27 -0500
Subject: [PATCH 61/80] program_callables_info, ProgramCallablesInfo ->
 callables_table, CallablesTable

---
 doc/tutorial.rst                          |   4 +-
 examples/python/global_barrier_removal.py |   2 +-
 loopy/check.py                            |  24 ++---
 loopy/codegen/__init__.py                 |  28 +++---
 loopy/codegen/control.py                  |   2 +-
 loopy/codegen/loop.py                     |   2 +-
 loopy/kernel/__init__.py                  |  16 +--
 loopy/kernel/function_interface.py        |  16 +--
 loopy/kernel/tools.py                     |  12 +--
 loopy/library/function.py                 |  12 +--
 loopy/library/random123.py                |  12 +--
 loopy/library/reduction.py                |   8 +-
 loopy/preprocess.py                       |  98 +++++++++----------
 loopy/program.py                          | 114 +++++++++++-----------
 loopy/schedule/__init__.py                |  18 ++--
 loopy/statistics.py                       |  76 +++++++--------
 loopy/target/__init__.py                  |   2 +-
 loopy/target/c/__init__.py                |  14 +--
 loopy/target/c/codegen/expression.py      |  10 +-
 loopy/target/cuda.py                      |  14 +--
 loopy/target/execution.py                 |   2 +-
 loopy/target/ispc.py                      |   4 +-
 loopy/target/opencl.py                    |  22 ++---
 loopy/target/pyopencl.py                  |  20 ++--
 loopy/target/python.py                    |   6 +-
 loopy/transform/buffer.py                 |  12 +--
 loopy/transform/callable.py               |  14 +--
 loopy/transform/data.py                   |  12 +--
 loopy/transform/fusion.py                 |  12 +--
 loopy/transform/iname.py                  |   4 +-
 loopy/transform/instruction.py            |   2 +-
 loopy/transform/precompute.py             |  12 +--
 loopy/transform/save.py                   |  12 +--
 loopy/transform/subst.py                  |   2 +-
 loopy/type_inference.py                   |  80 +++++++--------
 test/test_loopy.py                        |  14 +--
 test/testlib.py                           |  10 +-
 37 files changed, 362 insertions(+), 362 deletions(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 6a7a977a..25082f88 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1208,7 +1208,7 @@ happens when the instruction schedule is generated. To see the schedule, we
 should call :func:`loopy.get_one_scheduled_kernel`:
 
    >>> prog = lp.preprocess_kernel(prog)
-   >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info)
+   >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table)
    >>> prog = prog.with_root_kernel(knl)
    >>> print(prog)
    ---------------------------------------------------------------------------
@@ -1240,7 +1240,7 @@ that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to
 put those instructions into the schedule.
 
    >>> prog = lp.save_and_reload_temporaries(prog)
-   >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info)  # Schedule added instructions
+   >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table)  # Schedule added instructions
    >>> prog = prog.with_root_kernel(knl)
    >>> print(prog)
    ---------------------------------------------------------------------------
diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py
index cc4926fe..884fb0bd 100644
--- a/examples/python/global_barrier_removal.py
+++ b/examples/python/global_barrier_removal.py
@@ -24,7 +24,7 @@ from loopy.preprocess import preprocess_kernel
 knl = preprocess_kernel(knl)
 
 from loopy.schedule import get_one_scheduled_kernel
-knl = get_one_scheduled_kernel(knl.root_kernel, knl.program_callables_info)
+knl = get_one_scheduled_kernel(knl.root_kernel, knl.callables_table)
 
 # map schedule onto host or device
 print(knl)
diff --git a/loopy/check.py b/loopy/check.py
index bfcd7aa2..64cf80a4 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -206,7 +206,7 @@ def check_multiple_tags_allowed(kernel):
                                  "tags: {1}".format(iname, tags))
 
 
-def check_for_double_use_of_hw_axes(kernel, program_callables_info):
+def check_for_double_use_of_hw_axes(kernel, callables_table):
     from loopy.kernel.data import UniqueTag
     from loopy.kernel.instruction import CallInstruction
     from loopy.kernel.function_interface import CallableKernel
@@ -224,7 +224,7 @@ def check_for_double_use_of_hw_axes(kernel, program_callables_info):
 
         # check usage of iname tags in the callee kernel
         if isinstance(insn, CallInstruction):
-            in_knl_callable = program_callables_info[
+            in_knl_callable = callables_table[
                     insn.expression.function.name]
             if isinstance(in_knl_callable, CallableKernel):
                 # check for collision in iname_tag keys in the instruction
@@ -712,13 +712,13 @@ def check_variable_access_ordered(kernel):
 # }}}
 
 
-def pre_schedule_checks(kernel, program_callables_info):
+def pre_schedule_checks(kernel, callables_table):
     try:
         logger.debug("%s: pre-schedule check: start" % kernel.name)
 
         check_for_duplicate_insn_ids(kernel)
         check_for_orphaned_user_hardware_axes(kernel)
-        check_for_double_use_of_hw_axes(kernel, program_callables_info)
+        check_for_double_use_of_hw_axes(kernel, callables_table)
         check_insn_attributes(kernel)
         check_loop_priority_inames_known(kernel)
         check_multiple_tags_allowed(kernel)
@@ -746,7 +746,7 @@ def pre_schedule_checks(kernel, program_callables_info):
 
 # {{{ check for unused hw axes
 
-def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info,
+def _check_for_unused_hw_axes_in_kernel_chunk(kernel, callables_table,
         sched_index=None):
     from loopy.schedule import (CallKernel, RunInstruction,
             Barrier, EnterLoop, LeaveLoop, ReturnFromKernel,
@@ -763,7 +763,7 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info,
         _, past_end_i = gather_schedule_block(kernel.schedule, sched_index)
         group_size, local_size = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                 get_insn_ids_for_block_at(kernel.schedule, sched_index),
-                program_callables_info)
+                callables_table)
 
         group_axes = set(ax for ax, length in enumerate(group_size))
         local_axes = set(ax for ax, length in enumerate(local_size))
@@ -781,7 +781,7 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info,
         sched_item = kernel.schedule[i]
         if isinstance(sched_item, CallKernel):
             i = _check_for_unused_hw_axes_in_kernel_chunk(kernel,
-                    program_callables_info, i)
+                    callables_table, i)
 
         elif isinstance(sched_item, RunInstruction):
             insn = kernel.id_to_insn[sched_item.insn_id]
@@ -832,10 +832,10 @@ def _check_for_unused_hw_axes_in_kernel_chunk(kernel, program_callables_info,
     return past_end_i
 
 
-def check_for_unused_hw_axes_in_insns(kernel, program_callables_info):
+def check_for_unused_hw_axes_in_insns(kernel, callables_table):
     if kernel.schedule:
         _check_for_unused_hw_axes_in_kernel_chunk(kernel,
-                program_callables_info)
+                callables_table)
 
 # }}}
 
@@ -989,15 +989,15 @@ def check_that_shapes_and_strides_are_arguments(kernel):
 # }}}
 
 
-def pre_codegen_checks(kernel, program_callables_info):
+def pre_codegen_checks(kernel, callables_table):
     try:
         logger.debug("pre-codegen check %s: start" % kernel.name)
 
-        check_for_unused_hw_axes_in_insns(kernel, program_callables_info)
+        check_for_unused_hw_axes_in_insns(kernel, callables_table)
         check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel)
         check_that_temporaries_are_defined_in_subkernels_where_used(kernel)
         check_that_all_insns_are_scheduled(kernel)
-        kernel.target.pre_codegen_check(kernel, program_callables_info)
+        kernel.target.pre_codegen_check(kernel, callables_table)
         check_that_shapes_and_strides_are_arguments(kernel)
 
         logger.debug("pre-codegen check %s: done" % kernel.name)
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index d0b19a1e..250e7215 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -192,16 +192,16 @@ class CodeGenerationState(object):
 
     .. attribute:: schedule_index_end
 
-    .. attribute:: program_callables_info
+    .. attribute:: callables_table
 
-        An instance of :class:`loopy.ProgramCallablesInfo`.
+        An instance of :class:`loopy.CallablesTable`.
     """
 
     def __init__(self, kernel,
             implemented_data_info, implemented_domain, implemented_predicates,
             seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map,
             allow_complex,
-            program_callables_info,
+            callables_table,
             vectorization_info=None, var_name_generator=None,
             is_generating_device_code=None,
             gen_program_name=None,
@@ -215,7 +215,7 @@ class CodeGenerationState(object):
         self.seen_atomic_dtypes = seen_atomic_dtypes
         self.var_subst_map = var_subst_map.copy()
         self.allow_complex = allow_complex
-        self.program_callables_info = program_callables_info
+        self.callables_table = callables_table
         self.vectorization_info = vectorization_info
         self.var_name_generator = var_name_generator
         self.is_generating_device_code = is_generating_device_code
@@ -263,7 +263,7 @@ class CodeGenerationState(object):
                 seen_atomic_dtypes=self.seen_atomic_dtypes,
                 var_subst_map=var_subst_map or self.var_subst_map,
                 allow_complex=self.allow_complex,
-                program_callables_info=self.program_callables_info,
+                callables_table=self.callables_table,
                 vectorization_info=vectorization_info,
                 var_name_generator=self.var_name_generator,
                 is_generating_device_code=is_generating_device_code,
@@ -385,19 +385,19 @@ class PreambleInfo(ImmutableRecord):
 
 # {{{ main code generation entrypoint
 
-def generate_code_for_a_single_kernel(kernel, program_callables_info):
+def generate_code_for_a_single_kernel(kernel, callables_table):
     """
     :returns: a :class:`CodeGenerationResult`
 
     :param kernel: An instance of :class:`loopy.LoopKernel`.
-    :param program_callables_info: An instance of
-        :class:`loopy.ProgramCallablesInfo`.
+    :param callables_table: An instance of
+        :class:`loopy.CallablesTable`.
     """
 
     from loopy.kernel import KernelState
     if kernel.schedule is None:
         from loopy.schedule import get_one_scheduled_kernel
-        kernel = get_one_scheduled_kernel(kernel, program_callables_info)
+        kernel = get_one_scheduled_kernel(kernel, callables_table)
 
     if kernel.state != KernelState.SCHEDULED:
         raise LoopyError("cannot generate code for a kernel that has not been "
@@ -419,7 +419,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info):
     # }}}
 
     from loopy.check import pre_codegen_checks
-    pre_codegen_checks(kernel, program_callables_info)
+    pre_codegen_checks(kernel, callables_table)
 
     logger.info("%s: generate code: start" % kernel.name)
 
@@ -479,7 +479,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info):
                 + kernel.name
                 + kernel.target.host_program_name_suffix),
             schedule_index_end=len(kernel.schedule),
-            program_callables_info=program_callables_info)
+            callables_table=callables_table)
 
     from loopy.codegen.result import generate_host_or_device_program
 
@@ -556,17 +556,17 @@ def generate_code_v2(program):
 
     codegen_results = {}
 
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             codegen_results[func_id] = (
                     generate_code_for_a_single_kernel(in_knl_callable.subkernel,
-                        program.program_callables_info))
+                        program.callables_table))
 
     device_preambles = set()
     for cgr in codegen_results.values():
         device_preambles.update(cgr.device_preambles)
 
-    for in_knl_callable in program.program_callables_info.values():
+    for in_knl_callable in program.callables_table.values():
         for preamble in in_knl_callable.generate_preambles(program.target):
             device_preambles.update([preamble])
 
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index 90bdbda3..81a672a1 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -116,7 +116,7 @@ def generate_code_for_sched_index(codegen_state, sched_index):
 
         glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                 get_insn_ids_for_block_at(kernel.schedule, sched_index),
-                codegen_state.program_callables_info)
+                codegen_state.callables_table)
 
         return merge_codegen_results(codegen_state, [
             codegen_result,
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 39cf20c7..c282de79 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -248,7 +248,7 @@ def set_up_hw_parallel_loops(codegen_state, schedule_index, next_func,
         return next_func(codegen_state)
 
     global_size, local_size = kernel.get_grid_sizes_for_insn_ids(
-            insn_ids_for_block, codegen_state.program_callables_info)
+            insn_ids_for_block, codegen_state.callables_table)
 
     hw_inames_left = hw_inames_left[:]
     iname = hw_inames_left.pop()
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 410f1332..70079d31 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1036,7 +1036,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 constants_only=True)))
 
     @memoize_method
-    def get_grid_sizes_for_insn_ids(self, insn_ids, program_callables_info,
+    def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table,
             ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
         could accommodate execution of all instructions whose IDs are given
@@ -1048,7 +1048,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         if self.overridden_get_grid_sizes_for_insn_ids:
             return self.overridden_get_grid_sizes_for_insn_ids(
                     insn_ids,
-                    program_callables_info,
+                    callables_table,
                     ignore_auto=ignore_auto)
 
         all_inames_by_insns = set()
@@ -1135,7 +1135,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
     @memoize_method
     def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids,
-            program_callables_info, ignore_auto=False):
+            callables_table, ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
         could accommodate execution of all instructions whose IDs are given
         in *insn_ids*.
@@ -1146,7 +1146,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         """
 
         grid_size, group_size = self.get_grid_sizes_for_insn_ids(
-                insn_ids, program_callables_info, ignore_auto)
+                insn_ids, callables_table, ignore_auto)
 
         def tup_to_exprs(tup):
             from loopy.symbolic import pw_aff_to_expr
@@ -1154,7 +1154,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         return tup_to_exprs(grid_size), tup_to_exprs(group_size)
 
-    def get_grid_size_upper_bounds(self, program_callables_info, ignore_auto=False):
+    def get_grid_size_upper_bounds(self, callables_table, ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
         could accommodate execution of *all* instructions in the kernel.
 
@@ -1162,10 +1162,10 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         """
         return self.get_grid_sizes_for_insn_ids(
                 frozenset(insn.id for insn in self.instructions),
-                program_callables_info,
+                callables_table,
                 ignore_auto=ignore_auto)
 
-    def get_grid_size_upper_bounds_as_exprs(self, program_callables_info,
+    def get_grid_size_upper_bounds_as_exprs(self, callables_table,
             ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
         could accommodate execution of *all* instructions in the kernel.
@@ -1175,7 +1175,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         return self.get_grid_sizes_for_insn_ids_as_exprs(
                 frozenset(insn.id for insn in self.instructions),
-                program_callables_info,
+                callables_table,
                 ignore_auto=ignore_auto)
 
     # }}}
diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
index 268bdaa1..362fbcef 100644
--- a/loopy/kernel/function_interface.py
+++ b/loopy/kernel/function_interface.py
@@ -157,7 +157,7 @@ class GridOverrideForCalleeKernel(ImmutableRecord):
         self.local_size = local_size
         self.global_size = global_size
 
-    def __call__(self, insn_ids, program_callables_info, ignore_auto=True):
+    def __call__(self, insn_ids, callables_table, ignore_auto=True):
         return self.local_size, self.global_size
 
 # }}}
@@ -214,7 +214,7 @@ class InKernelCallable(ImmutableRecord):
 
     update_persistent_hash = LoopKernel.update_persistent_hash
 
-    def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, caller_kernel, callables_table):
         """
         :arg arg_id_to_type: a mapping from argument identifiers
             (integers for positional arguments, names for keyword
@@ -234,7 +234,7 @@ class InKernelCallable(ImmutableRecord):
 
         raise NotImplementedError()
 
-    def with_descrs(self, arg_id_to_descr, program_callables_info):
+    def with_descrs(self, arg_id_to_descr, callables_table):
         """
         :arg arg_id_to_descr: a mapping from argument identifiers
             (integers for positional arguments, names for keyword
@@ -363,16 +363,16 @@ class ScalarCallable(InKernelCallable):
         return (self.arg_id_to_dtype, self.arg_id_to_descr,
                 self.name_in_target)
 
-    def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, caller_kernel, callables_table):
         raise LoopyError("No type inference information present for "
                 "the function %s." % (self.name))
 
-    def with_descrs(self, arg_id_to_descr, program_callables_info):
+    def with_descrs(self, arg_id_to_descr, callables_table):
 
         arg_id_to_descr[-1] = ValueArgDescriptor()
         return (
                 self.copy(arg_id_to_descr=arg_id_to_descr),
-                program_callables_info)
+                callables_table)
 
     def with_hw_axes_sizes(self, global_size, local_size):
         return self.copy()
@@ -564,7 +564,7 @@ class ManglerCallable(ScalarCallable):
         return (self.name, self.function_mangler, self.arg_id_to_dtype,
                 self.arg_id_to_descr, self.name_in_target)
 
-    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, kernel, callables_table):
         if self.arg_id_to_dtype is not None:
             # specializing an already specialized function.
             for arg_id, dtype in arg_id_to_dtype.items():
@@ -588,7 +588,7 @@ class ManglerCallable(ScalarCallable):
             return (
                     self.copy(name_in_target=mangle_result.target_name,
                         arg_id_to_dtype=new_arg_id_to_dtype),
-                    program_callables_info)
+                    callables_table)
         else:
             # The function mangler does not agree with the arg id to dtypes
             # provided. Indicating that is illegal.
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 3f4defc5..006ac6ba 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -755,7 +755,7 @@ def get_auto_axis_iname_ranking_by_stride(kernel, insn):
 # }}}
 
 
-def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=None):
+def assign_automatic_axes(kernel, callables_table, axis=0, local_size=None):
     logger.debug("%s: assign automatic axes" % kernel.name)
     # TODO: do the tag removal rigorously, might be easier after switching
     # to set() from tuple()
@@ -769,7 +769,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non
 
     if local_size is None:
         _, local_size = kernel.get_grid_size_upper_bounds_as_exprs(
-                program_callables_info, ignore_auto=True)
+                callables_table, ignore_auto=True)
 
     # {{{ axis assignment helper function
 
@@ -797,7 +797,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non
 
             return assign_automatic_axes(
                     kernel.copy(iname_to_tags=new_iname_to_tags),
-                    program_callables_info,
+                    callables_table,
                     axis=recursion_axis)
 
         if axis is None:
@@ -849,7 +849,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non
                             iname, inner_length=local_size[axis],
                             outer_tag=None, inner_tag=new_tag,
                             do_tagged_check=False),
-                        program_callables_info=program_callables_info,
+                        callables_table=callables_table,
                         axis=recursion_axis, local_size=local_size)
 
         if not kernel.iname_tags_of_type(iname, AutoLocalIndexTagBase):
@@ -871,7 +871,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non
             del new_iname_to_tags[iname]
 
         return assign_automatic_axes(kernel.copy(iname_to_tags=new_iname_to_tags),
-                program_callables_info, axis=recursion_axis, local_size=local_size)
+                callables_table, axis=recursion_axis, local_size=local_size)
 
     # }}}
 
@@ -940,7 +940,7 @@ def assign_automatic_axes(kernel, program_callables_info, axis=0, local_size=Non
         return kernel
     else:
         return assign_automatic_axes(kernel,
-                program_callables_info=program_callables_info, axis=axis+1,
+                callables_table=callables_table, axis=axis+1,
                 local_size=local_size)
 
 # }}}
diff --git a/loopy/library/function.py b/loopy/library/function.py
index f3fb5f8c..f225b62f 100644
--- a/loopy/library/function.py
+++ b/loopy/library/function.py
@@ -26,33 +26,33 @@ from loopy.kernel.function_interface import ScalarCallable
 
 
 class MakeTupleCallable(ScalarCallable):
-    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, kernel, callables_table):
         new_arg_id_to_dtype = arg_id_to_dtype.copy()
         for i in range(len(arg_id_to_dtype)):
             if i in arg_id_to_dtype and arg_id_to_dtype[i] is not None:
                 new_arg_id_to_dtype[-i-1] = new_arg_id_to_dtype[i]
 
         return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
-            name_in_target="loopy_make_tuple"), program_callables_info)
+            name_in_target="loopy_make_tuple"), callables_table)
 
-    def with_descrs(self, arg_id_to_descr, program_callables_info):
+    def with_descrs(self, arg_id_to_descr, callables_table):
         from loopy.kernel.function_interface import ValueArgDescriptor
         new_arg_id_to_descr = dict(((id, ValueArgDescriptor()),
             (-id-1, ValueArgDescriptor())) for id in arg_id_to_descr.keys())
 
         return (
                 self.copy(arg_id_to_descr=new_arg_id_to_descr),
-                program_callables_info)
+                callables_table)
 
 
 class IndexOfCallable(ScalarCallable):
-    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, kernel, callables_table):
         new_arg_id_to_dtype = dict((i, dtype) for i, dtype in
                 arg_id_to_dtype.items() if dtype is not None)
         new_arg_id_to_dtype[-1] = kernel.index_dtype
 
         return (self.copy(arg_id_to_dtype=new_arg_id_to_dtype),
-                program_callables_info)
+                callables_table)
 
 
 def loopy_specific_callable_func_id_to_knl_callable_mappers(target, identifier):
diff --git a/loopy/library/random123.py b/loopy/library/random123.py
index 397e985b..e59a892b 100644
--- a/loopy/library/random123.py
+++ b/loopy/library/random123.py
@@ -169,14 +169,14 @@ class Random123Callable(ScalarCallable):
     Records information about for the random123 functions.
     """
 
-    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, kernel, callables_table):
 
         if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype or (
                 arg_id_to_dtype[0] is None or arg_id_to_dtype[1] is None):
             # the types provided aren't mature enough to specialize the
             # callable
             return (self.copy(),
-                    program_callables_info)
+                    callables_table)
 
         name = self.name
         target = kernel.target
@@ -195,7 +195,7 @@ class Random123Callable(ScalarCallable):
             return (
                     self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
                         name_in_target=fn+"_gen"),
-                    program_callables_info)
+                    callables_table)
 
         elif name == fn + "_f32":
             new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float32),
@@ -203,7 +203,7 @@ class Random123Callable(ScalarCallable):
                     -2: ctr_dtype, 0: ctr_dtype, 1:
                     key_dtype}
             return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
-                    name_in_target=name), program_callables_info
+                    name_in_target=name), callables_table
 
         elif name == fn + "_f64":
             new_arg_id_to_dtype = {-1: target.vector_dtype(NumpyType(np.float64),
@@ -211,10 +211,10 @@ class Random123Callable(ScalarCallable):
                     -2: ctr_dtype, 0: ctr_dtype, 1:
                     key_dtype}
             return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
-                    name_in_target=name), program_callables_info
+                    name_in_target=name), callables_table
 
         return (self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                program_callables_info)
+                callables_table)
 
     def generate_preambles(self, target):
         rng_variant = FUNC_NAMES_TO_RNG[self.name]
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index 70df864d..7c32d0be 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -424,7 +424,7 @@ def parse_reduction_op(name):
 # {{{ reduction specific callables
 
 class ReductionCallable(ScalarCallable):
-    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, kernel, callables_table):
         scalar_dtype = arg_id_to_dtype[0]
         index_dtype = arg_id_to_dtype[1]
         result_dtypes = self.name.reduction_op.result_dtypes(kernel, scalar_dtype,
@@ -436,15 +436,15 @@ class ReductionCallable(ScalarCallable):
                 index_dtype) + "_op"
 
         return self.copy(arg_id_to_dtype=new_arg_id_to_dtype,
-                name_in_target=name_in_target), program_callables_info
+                name_in_target=name_in_target), callables_table
 
-    def with_descr(self, arg_id_to_descr, program_callables_info):
+    def with_descr(self, arg_id_to_descr, callables_table):
         from loopy.library.kernel.function_interface import ValueArgDescriptor
         new_arg_id_to_descr = arg_id_to_descr.copy()
         new_arg_id_to_descr[-1] = ValueArgDescriptor()
         return (
                 self.copy(arg_id_to_descr=arg_id_to_descr),
-                program_callables_info)
+                callables_table)
 
     def generate_preambles(self, target):
         if isinstance(self.name, ArgExtOp):
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 1042c857..85b0c6d4 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -890,7 +890,7 @@ def _insert_subdomain_into_domain_tree(kernel, domains, subdomain):
 # }}}
 
 
-def realize_reduction_for_single_kernel(kernel, program_callables_info,
+def realize_reduction_for_single_kernel(kernel, callables_table,
         insn_id_filter=None, unknown_types_ok=True, automagic_scans_ok=False,
         force_scan=False, force_outer_iname_for_scan=None):
     """Rewrites reductions into their imperative form. With *insn_id_filter*
@@ -1012,7 +1012,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
 
     # {{{ sequential
 
-    def map_reduction_seq(expr, rec, program_callables_info, nresults, arg_dtypes,
+    def map_reduction_seq(expr, rec, callables_table, nresults, arg_dtypes,
             reduction_dtypes):
         outer_insn_inames = temp_kernel.insn_inames(insn)
 
@@ -1130,7 +1130,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
                 v[iname].lt_set(v[0] + ubound)).get_basic_sets()
         return bs
 
-    def map_reduction_local(expr, rec, program_callables_info, nresults, arg_dtypes,
+    def map_reduction_local(expr, rec, callables_table, nresults, arg_dtypes,
             reduction_dtypes):
         red_iname, = expr.inames
 
@@ -1370,7 +1370,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
 
     # {{{ sequential scan
 
-    def map_scan_seq(expr, rec, program_callables_info, nresults, arg_dtypes,
+    def map_scan_seq(expr, rec, callables_table, nresults, arg_dtypes,
             reduction_dtypes, sweep_iname, scan_iname, sweep_min_value,
             scan_min_value, stride):
         outer_insn_inames = temp_kernel.insn_inames(insn)
@@ -1459,7 +1459,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
 
     # {{{ local-parallel scan
 
-    def map_scan_local(expr, rec, program_callables_info, nresults, arg_dtypes,
+    def map_scan_local(expr, rec, callables_table, nresults, arg_dtypes,
             reduction_dtypes, sweep_iname, scan_iname, sweep_min_value,
             scan_min_value, stride):
 
@@ -1468,7 +1468,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
         assert scan_size > 0
 
         if scan_size == 1:
-            return map_reduction_seq(expr, rec, program_callables_info,
+            return map_reduction_seq(expr, rec, callables_table,
                     nresults, arg_dtypes, reduction_dtypes)
 
         outer_insn_inames = temp_kernel.insn_inames(insn)
@@ -1668,15 +1668,15 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
 
     # {{{ seq/par dispatch
 
-    def map_reduction(expr, rec, program_callables_info, nresults=1):
+    def map_reduction(expr, rec, callables_table, nresults=1):
         # Only expand one level of reduction at a time, going from outermost to
         # innermost. Otherwise we get the (iname + insn) dependencies wrong.
 
         from loopy.type_inference import (
                 infer_arg_and_reduction_dtypes_for_reduction_expression)
-        arg_dtypes, reduction_dtypes, program_callables_info = (
+        arg_dtypes, reduction_dtypes, callables_table = (
                 infer_arg_and_reduction_dtypes_for_reduction_expression(
-                    temp_kernel, expr, program_callables_info, unknown_types_ok))
+                    temp_kernel, expr, callables_table, unknown_types_ok))
 
         outer_insn_inames = temp_kernel.insn_inames(insn)
         bad_inames = frozenset(expr.inames) & outer_insn_inames
@@ -1785,7 +1785,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
                             for tag in temp_kernel.iname_tags(sweep_iname))))
                 elif parallel:
                     return map_scan_local(
-                            expr, rec, program_callables_info, nresults,
+                            expr, rec, callables_table, nresults,
                             arg_dtypes, reduction_dtypes,
                             sweep_iname, scan_param.scan_iname,
                             scan_param.sweep_lower_bound,
@@ -1793,7 +1793,7 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
                             scan_param.stride)
                 elif sequential:
                     return map_scan_seq(
-                            expr, rec, program_callables_info, nresults,
+                            expr, rec, callables_table, nresults,
                             arg_dtypes, reduction_dtypes, sweep_iname,
                             scan_param.scan_iname,
                             scan_param.sweep_lower_bound,
@@ -1814,12 +1814,12 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
 
         if n_sequential:
             assert n_local_par == 0
-            return map_reduction_seq(expr, rec, program_callables_info,
+            return map_reduction_seq(expr, rec, callables_table,
                     nresults, arg_dtypes, reduction_dtypes)
         else:
             assert n_local_par > 0
             return map_reduction_local(
-                    expr, rec, program_callables_info, nresults, arg_dtypes,
+                    expr, rec, callables_table, nresults, arg_dtypes,
                     reduction_dtypes)
 
     # }}}
@@ -1854,12 +1854,12 @@ def realize_reduction_for_single_kernel(kernel, program_callables_info,
         from loopy.symbolic import Reduction
         if isinstance(insn.expression, Reduction) and nresults > 1:
             new_expressions = cb_mapper(insn.expression,
-                    program_callables_info=program_callables_info,
+                    callables_table=callables_table,
                     nresults=nresults)
         else:
             new_expressions = (
                     cb_mapper(insn.expression,
-                        program_callables_info=program_callables_info),)
+                        callables_table=callables_table),)
 
         if generated_insns:
             # An expansion happened, so insert the generated stuff plus
@@ -1952,10 +1952,10 @@ def realize_reduction(program, *args, **kwargs):
     assert isinstance(program, Program)
 
     new_resolved_functions = {}
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             new_subkernel = realize_reduction_for_single_kernel(
-                    in_knl_callable.subkernel, program.program_callables_info,
+                    in_knl_callable.subkernel, program.callables_table,
                     *args, **kwargs)
             in_knl_callable = in_knl_callable.copy(
                     subkernel=new_subkernel)
@@ -1968,9 +1968,9 @@ def realize_reduction(program, *args, **kwargs):
 
         new_resolved_functions[func_id] = in_knl_callable
 
-    new_program_callables_info = program.program_callables_info.copy(
+    new_callables_table = program.callables_table.copy(
             resolved_functions=new_resolved_functions)
-    return program.copy(program_callables_info=new_program_callables_info)
+    return program.copy(callables_table=new_callables_table)
 
 # }}}
 
@@ -2153,11 +2153,11 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper):
     """
 
     def __init__(self, rule_mapping_context, caller_kernel,
-            program_callables_info):
+            callables_table):
         super(ArgDescrInferenceMapper, self).__init__(
                 rule_mapping_context)
         self.caller_kernel = caller_kernel
-        self.program_callables_info = program_callables_info
+        self.callables_table = callables_table
 
     def map_call(self, expr, expn_state, **kwargs):
         from pymbolic.primitives import Call, CallWithKwargs
@@ -2193,12 +2193,12 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper):
         combined_arg_id_to_descr.update(assignee_id_to_descr)
 
         # specializing the function according to the parameter description
-        in_knl_callable = self.program_callables_info[expr.function.name]
-        new_in_knl_callable, self.program_callables_info = (
+        in_knl_callable = self.callables_table[expr.function.name]
+        new_in_knl_callable, self.callables_table = (
                 in_knl_callable.with_descrs(
-                    combined_arg_id_to_descr, self.program_callables_info))
-        self.program_callables_info, new_func_id = (
-                self.program_callables_info.with_callable(
+                    combined_arg_id_to_descr, self.callables_table))
+        self.callables_table, new_func_id = (
+                self.callables_table.with_callable(
                     expr.function.function,
                     new_in_knl_callable))
 
@@ -2242,7 +2242,7 @@ class ArgDescrInferenceMapper(RuleAwareIdentityMapper):
         return kernel.copy(instructions=new_insns)
 
 
-def traverse_to_infer_arg_descr(kernel, program_callables_info):
+def traverse_to_infer_arg_descr(kernel, callables_table):
     """
     Returns a copy of *kernel* with the argument shapes and strides matching for
     scoped functions in the *kernel*. Refer
@@ -2258,12 +2258,12 @@ def traverse_to_infer_arg_descr(kernel, program_callables_info):
             kernel.substitutions, kernel.get_var_name_generator())
 
     arg_descr_inf_mapper = ArgDescrInferenceMapper(rule_mapping_context,
-            kernel, program_callables_info)
+            kernel, callables_table)
 
     descr_inferred_kernel = rule_mapping_context.finish_kernel(
             arg_descr_inf_mapper.map_kernel(kernel))
 
-    return descr_inferred_kernel, arg_descr_inf_mapper.program_callables_info
+    return descr_inferred_kernel, arg_descr_inf_mapper.callables_table
 
 
 def infer_arg_descr(program):
@@ -2272,23 +2272,23 @@ def infer_arg_descr(program):
     :attr:`loopy.InKernelCallable.arg_id_to_descr` inferred for all the
     callables.
     """
-    root_kernel_callable = program.program_callables_info[program.name]
-    old_callables_count = program.program_callables_info.callables_count
-    program_callables_info = (
-            program.program_callables_info.with_edit_callables_mode())
+    root_kernel_callable = program.callables_table[program.name]
+    old_callables_count = program.callables_table.callables_count
+    callables_table = (
+            program.callables_table.with_edit_callables_mode())
     root_kernel = program.root_kernel
 
-    new_root_kernel, program_callables_info = traverse_to_infer_arg_descr(
-            root_kernel, program_callables_info)
+    new_root_kernel, callables_table = traverse_to_infer_arg_descr(
+            root_kernel, callables_table)
     new_root_kernel_callable = root_kernel_callable.copy(
             subkernel=new_root_kernel)
-    program_callables_info, _ = program_callables_info.with_callable(program.name,
+    callables_table, _ = callables_table.with_callable(program.name,
             new_root_kernel_callable)
 
-    program_callables_info = program_callables_info.with_exit_edit_callables_mode(
+    callables_table = callables_table.with_exit_edit_callables_mode(
             old_callables_count)
 
-    return program.copy(program_callables_info=program_callables_info)
+    return program.copy(callables_table=callables_table)
 
 # }}}
 
@@ -2298,7 +2298,7 @@ preprocess_cache = WriteOncePersistentDict(
         key_builder=LoopyKeyBuilder())
 
 
-def preprocess_single_kernel(kernel, program_callables_info, device=None):
+def preprocess_single_kernel(kernel, callables_table, device=None):
     from loopy.kernel import KernelState
     if kernel.state >= KernelState.PREPROCESSED:
         return kernel
@@ -2356,7 +2356,7 @@ def preprocess_single_kernel(kernel, program_callables_info, device=None):
     #   because it manipulates the depends_on field, which could prevent
     #   defaults from being applied.
     kernel = realize_reduction_for_single_kernel(kernel,
-            program_callables_info, unknown_types_ok=False)
+            callables_table, unknown_types_ok=False)
 
     # Ordering restriction:
     # add_axes_to_temporaries_for_ilp because reduction accumulators
@@ -2420,7 +2420,7 @@ def infer_hw_axes_sizes(program):
     resolved_function_with_hw_axes_sizes_inferred = {}
 
     for func_id, in_knl_callable in (
-            program.program_callables_info.items()):
+            program.callables_table.items()):
         if func_id == program.name:
             resolved_function_with_hw_axes_sizes_inferred[func_id] = (
                     in_knl_callable)
@@ -2428,11 +2428,11 @@ def infer_hw_axes_sizes(program):
             resolved_function_with_hw_axes_sizes_inferred[func_id] = (
                     in_knl_callable.with_hw_axes_sizes(local_size, global_size))
 
-    new_program_callables_info = (
-            program.program_callables_info.copy(
+    new_callables_table = (
+            program.callables_table.copy(
                 resolved_functions=resolved_function_with_hw_axes_sizes_inferred))
 
-    return program.copy(program_callables_info=new_program_callables_info)
+    return program.copy(callables_table=new_callables_table)
 
 # }}}
 
@@ -2451,16 +2451,16 @@ def preprocess_program(program, device=None):
 
     # Callable editing restrictions:
     #
-    # - should not edit program_callables_info in :meth:`preprocess_single_kernel`
+    # - should not edit callables_table in :meth:`preprocess_single_kernel`
     #   as we are iterating over it.[1]
     #
     # [1] https://docs.python.org/3/library/stdtypes.html#dictionary-view-objects
 
     new_resolved_functions = {}
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             new_subkernel = preprocess_single_kernel(
-                    in_knl_callable.subkernel, program.program_callables_info,
+                    in_knl_callable.subkernel, program.callables_table,
                     device)
             in_knl_callable = in_knl_callable.copy(
                     subkernel=new_subkernel)
@@ -2472,9 +2472,9 @@ def preprocess_program(program, device=None):
 
         new_resolved_functions[func_id] = in_knl_callable
 
-    new_program_callables_info = program.program_callables_info.copy(
+    new_callables_table = program.callables_table.copy(
             resolved_functions=new_resolved_functions)
-    program = program.copy(program_callables_info=new_program_callables_info)
+    program = program.copy(callables_table=new_callables_table)
 
     # }}}
 
diff --git a/loopy/program.py b/loopy/program.py
index 7479ee04..f7c399c1 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -47,7 +47,7 @@ __doc__ = """
 .. currentmodule:: loopy
 
 .. autoclass:: Program
-.. autoclass:: ProgramCallablesInfo
+.. autoclass:: CallablesTable
 
 .. autofunction:: make_program_from_kernel
 .. autofunction:: iterate_over_kernels_if_given_program
@@ -73,11 +73,11 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper):
     :arg function_ids: A container with instances of :class:`str` indicating
         the function identifiers to look for while scoping functions.
     """
-    def __init__(self, rule_mapping_context, kernel, program_callables_info,
+    def __init__(self, rule_mapping_context, kernel, callables_table,
             function_id_to_in_knl_callable_mappers):
         super(ResolvedFunctionMarker, self).__init__(rule_mapping_context)
         self.kernel = kernel
-        self.program_callables_info = program_callables_info
+        self.callables_table = callables_table
         self.function_id_to_in_knl_callable_mappers = (
                 function_id_to_in_knl_callable_mappers)
 
@@ -123,8 +123,8 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper):
                 # associate the newly created ResolvedFunction with the
                 # resolved in-kernel callable
 
-                self.program_callables_info, new_func_id = (
-                        self.program_callables_info.with_added_callable(
+                self.callables_table, new_func_id = (
+                        self.callables_table.with_added_callable(
                             expr.function, in_knl_callable))
                 return type(expr)(
                         ResolvedFunction(new_func_id),
@@ -144,8 +144,8 @@ class ResolvedFunctionMarker(RuleAwareIdentityMapper):
                 expr.operation.get_scalar_callables()):
             in_knl_callable = self.find_in_knl_callable_from_identifier(func_id)
             assert in_knl_callable is not None
-            self.program_callables_info, _ = (
-                    self.program_callables_info.with_added_callable(func_id,
+            self.callables_table, _ = (
+                    self.callables_table.with_added_callable(func_id,
                         in_knl_callable))
         return super(ResolvedFunctionMarker, self).map_reduction(expr, expn_state)
 
@@ -162,37 +162,37 @@ def _default_func_id_to_kernel_callable_mappers(target):
                     )))
 
 
-def initialize_program_callables_info_from_kernel(kernel):
+def initialize_callables_table_from_kernel(kernel):
     """
-    Returns an instance of :class:`loopy.ProgramCallablesInfo`, by resolving
+    Returns an instance of :class:`loopy.CallablesTable`, by resolving
     the functions based on :mod:`loopy`'s default function resolvers.
     """
     # collect the default function resolvers
     func_id_to_kernel_callable_mappers = (
             _default_func_id_to_kernel_callable_mappers(kernel.target))
-    program_callables_info = ProgramCallablesInfo({})
+    callables_table = CallablesTable({})
 
     from loopy.symbolic import SubstitutionRuleMappingContext
     rule_mapping_context = SubstitutionRuleMappingContext(
             kernel.substitutions, kernel.get_var_name_generator())
 
     resolved_function_marker = ResolvedFunctionMarker(
-            rule_mapping_context, kernel, program_callables_info,
+            rule_mapping_context, kernel, callables_table,
             func_id_to_kernel_callable_mappers)
 
     # mark the functions as "Resolved" in the expression nodes.
     kernel_with_functions_resolved = rule_mapping_context.finish_kernel(
             resolved_function_marker.map_kernel(kernel))
-    # collect the update program_callables_info
-    program_callables_info = resolved_function_marker.program_callables_info
+    # collect the updated callables_table
+    callables_table = resolved_function_marker.callables_table
 
     callable_kernel = CallableKernel(kernel_with_functions_resolved)
 
-    # add the callable kernel to the program_callables_info
-    program_callables_info, _ = program_callables_info.with_added_callable(
+    # add the callable kernel to the callables_table
+    callables_table, _ = callables_table.with_added_callable(
             Variable(kernel.name), callable_kernel)
 
-    return program_callables_info
+    return callables_table
 
 
 # {{{ program definition
@@ -206,9 +206,9 @@ class Program(ImmutableRecord):
         An instance of :class:`str`, also the name of the top-most level
         :class:`loopy.LoopKernel`.
 
-    .. attribute:: program_callables_info
+    .. attribute:: callables_table
 
-        An instance of :class:`loopy.program.ProgramCallablesInfo`.
+        An instance of :class:`loopy.program.CallablesTable`.
 
     .. attribute:: target
 
@@ -232,16 +232,16 @@ class Program(ImmutableRecord):
     """
     def __init__(self,
             name,
-            program_callables_info,
+            callables_table,
             target,
             func_id_to_in_knl_callable_mappers):
-        assert isinstance(program_callables_info, ProgramCallablesInfo)
+        assert isinstance(callables_table, CallablesTable)
 
-        assert name in program_callables_info
+        assert name in callables_table
 
         super(Program, self).__init__(
                 name=name,
-                program_callables_info=program_callables_info,
+                callables_table=callables_table,
                 target=target,
                 func_id_to_in_knl_callable_mappers=(
                     func_id_to_in_knl_callable_mappers))
@@ -250,7 +250,7 @@ class Program(ImmutableRecord):
 
     hash_fields = (
             "name",
-            "program_callables_info",
+            "callables_table",
             "target",)
 
     update_persistent_hash = LoopKernel.update_persistent_hash
@@ -262,7 +262,7 @@ class Program(ImmutableRecord):
             new_self = super(Program, self).copy(**kwargs)
             new_resolved_functions = {}
             for func_id, in_knl_callable in (
-                    new_self.program_callables_info.items()):
+                    new_self.callables_table.items()):
                 if isinstance(in_knl_callable, CallableKernel):
                     subkernel = in_knl_callable.subkernel
                     new_resolved_functions[func_id] = in_knl_callable.copy(
@@ -270,11 +270,11 @@ class Program(ImmutableRecord):
                 else:
                     new_resolved_functions[func_id] = in_knl_callable
 
-            program_callables_info = new_self.program_callables_info.copy(
+            callables_table = new_self.callables_table.copy(
                     resolved_functions=new_resolved_functions)
 
             return super(Program, new_self).copy(
-                    program_callables_info=program_callables_info)
+                    callables_table=callables_table)
         else:
             return super(Program, self).copy(**kwargs)
 
@@ -285,7 +285,7 @@ class Program(ImmutableRecord):
         *global_size* and *local_size* are :class:`islpy.PwAff` objects.
         """
         return self.root_kernel.get_grid_size_upper_bounds(
-                self.program_callables_info,
+                self.callables_table,
                 ignore_auto=ignore_auto)
 
     def get_grid_size_upper_bounds_as_exprs(self, ignore_auto=False):
@@ -295,7 +295,7 @@ class Program(ImmutableRecord):
         *global_size* and *local_size* are :mod:`pymbolic` expressions
         """
         return self.root_kernel.get_grid_size_upper_bounds_as_exprs(
-                self.program_callables_info,
+                self.callables_table,
                 ignore_auto=ignore_auto)
 
     # {{{ implementation arguments
@@ -338,7 +338,7 @@ class Program(ImmutableRecord):
 
             Syntactic sugar.
         """
-        return self.program_callables_info[self.name].subkernel
+        return self.callables_table[self.name].subkernel
 
     @property
     def arg_dict(self):
@@ -367,14 +367,14 @@ class Program(ImmutableRecord):
         Returns a copy of *self* with the topmost level kernel as
         *root_kernel*.
         """
-        new_in_knl_callable = self.program_callables_info[
+        new_in_knl_callable = self.callables_table[
                 self.name].copy(subkernel=root_kernel)
         new_resolved_functions = (
-                self.program_callables_info.resolved_functions.copy())
+                self.callables_table.resolved_functions.copy())
         new_resolved_functions[self.name] = new_in_knl_callable
 
         return self.copy(
-                program_callables_info=self.program_callables_info.copy(
+                callables_table=self.callables_table.copy(
                     resolved_functions=new_resolved_functions))
 
     def __call__(self, *args, **kwargs):
@@ -462,14 +462,14 @@ def rename_resolved_functions_in_a_single_kernel(kernel,
 class CallablesCountingMapper(CombineMapper):
     """
     Returns an instance of :class:`collections.Counter` with the count of
-    callables registered in *program_callables_info*.
+    callables registered in *callables_table*.
 
-    .. attribute:: program_callables_info
+    .. attribute:: callables_table
 
-        An instance of :class:`loopy.program.ProgramCallablesInfo`.
+        An instance of :class:`loopy.program.CallablesTable`.
     """
-    def __init__(self, program_callables_info):
-        self.program_callables_info = program_callables_info
+    def __init__(self, callables_table):
+        self.callables_table = callables_table
 
     def combine(self, values):
         return sum(values, Counter())
@@ -483,7 +483,7 @@ class CallablesCountingMapper(CombineMapper):
             kw_parameters = {}
 
         if isinstance(expr.function, (ResolvedFunction)):
-            in_knl_callable = self.program_callables_info[expr.function.name]
+            in_knl_callable = self.callables_table[expr.function.name]
             if isinstance(in_knl_callable, ScalarCallable):
                 return (Counter([expr.function.name]) +
                         self.combine((self.rec(child) for child in expr.parameters
@@ -495,7 +495,7 @@ class CallablesCountingMapper(CombineMapper):
                 callables_count_in_subkernel = (
                         count_callables_in_kernel(
                             in_knl_callable.subkernel,
-                            self.program_callables_info))
+                            self.callables_table))
 
                 return (Counter([expr.function.name]) +
                         self.combine((self.rec(child) for child in expr.parameters
@@ -525,16 +525,16 @@ class CallablesCountingMapper(CombineMapper):
 
 
 @memoize_method
-def count_callables_in_kernel(kernel, program_callables_info):
+def count_callables_in_kernel(kernel, callables_table):
     """
     Returns an instance of :class:`collections.Counter` representing the number
     of callables in the *kernel* that are registered in
-    *program_callables_info*.
+    *callables_table*.
     """
     assert isinstance(kernel, LoopKernel)
     callables_count = Counter()
     callables_counting_mapper = CallablesCountingMapper(
-            program_callables_info)
+            callables_table)
     subst_expander = SubstitutionRuleExpander(kernel.substitutions)
 
     for insn in kernel.instructions:
@@ -555,7 +555,7 @@ def count_callables_in_kernel(kernel, program_callables_info):
 
 # {{{ program callables info
 
-class ProgramCallablesInfo(ImmutableRecord):
+class CallablesTable(ImmutableRecord):
     # FIXME: is CallablesTable a better name?(similar to symbol table in
     # compilers.)
     """
@@ -594,7 +594,7 @@ class ProgramCallablesInfo(ImmutableRecord):
             history = dict((func_id, frozenset([func_id])) for func_id in
                     resolved_functions)
 
-        super(ProgramCallablesInfo, self).__init__(
+        super(CallablesTable, self).__init__(
                 resolved_functions=resolved_functions,
                 history=history,
                 is_being_edited=is_being_edited)
@@ -618,7 +618,7 @@ class ProgramCallablesInfo(ImmutableRecord):
     def callables_count(self):
         """
         Returns an instance of :class:`collection.Counter` representing the number
-        of times the callables is called in program_callables_info.
+        of times each callable is called in callables_table.
         """
         # should raise an error if there are more than  one root kernels(which is
         # illegal)
@@ -648,24 +648,24 @@ class ProgramCallablesInfo(ImmutableRecord):
         .. note::
 
             - Always checks whether the
-              :attr:``loopy.ProgramCallablesInfo.resolved_functions` has
+              :attr:`loopy.CallablesTable.resolved_functions` has
               *in_kernel_callable*, does not introduce copies.
 
             - The difference between
-              :meth:`loopy.ProgramCallablesInfo.with_added_callable`
-              and :meth:`ProgramCallablesInfo.with_callable` being that
+              :meth:`loopy.CallablesTable.with_added_callable`
+              and :meth:`CallablesTable.with_callable` being that
               the former has no support for renaming the callable back i.e.
               ``with_callable`` supports renaming from ``sin_0`` to ``sin``,
               if possible, through the member method
-              ``loopy.ProgramCallablesInfo.with_exit_edit_callables_mode``
+              ``loopy.CallablesTable.with_exit_edit_callables_mode``
 
               This subtle difference makes --
 
-              - :meth:`loopy.ProgramCallablesInfo.with_added_callable` suitable
+              - :meth:`loopy.CallablesTable.with_added_callable` suitable
                 for usage while resolving the functions first time, where no
                 renaming is needed.
 
-              - :meth:`loopy.ProgramCallablesInfo.with_callable` suitable for
+              - :meth:`loopy.CallablesTable.with_callable` suitable for
                 implementing edits in callables during inference-walks.
         """
 
@@ -745,7 +745,7 @@ class ProgramCallablesInfo(ImmutableRecord):
     def with_callable(self, function, in_kernel_callable):
         """
         Returns an instance of :class:`tuple` ``(new_self, new_function)``.
-        Also refer -- :meth:`loopy.ProgramCallablesInfo.with_added_callable`
+        Also refer -- :meth:`loopy.CallablesTable.with_added_callable`
 
 
         :arg function: An instance of :class:`pymbolic.primitives.Variable` or
@@ -929,12 +929,12 @@ def make_program_from_kernel(kernel):
     """
 
     # get the program callables info
-    program_callables_info = initialize_program_callables_info_from_kernel(kernel)
+    callables_table = initialize_callables_table_from_kernel(kernel)
 
     # get the program from program callables info
     program = Program(
             name=kernel.name,
-            program_callables_info=program_callables_info,
+            callables_table=callables_table,
             func_id_to_in_knl_callable_mappers=(
                 _default_func_id_to_kernel_callable_mappers(kernel.target)),
             target=kernel.target)
@@ -953,7 +953,7 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel):
         if isinstance(program_or_kernel, Program):
             program = program_or_kernel
             new_resolved_functions = {}
-            for func_id, in_knl_callable in program.program_callables_info.items():
+            for func_id, in_knl_callable in program.callables_table.items():
                 if isinstance(in_knl_callable, CallableKernel):
                     new_subkernel = transform_for_single_kernel(
                             in_knl_callable.subkernel, *args, **kwargs)
@@ -968,9 +968,9 @@ def iterate_over_kernels_if_given_program(transform_for_single_kernel):
 
                 new_resolved_functions[func_id] = in_knl_callable
 
-            new_program_callables_info = program.program_callables_info.copy(
+            new_callables_table = program.callables_table.copy(
                     resolved_functions=new_resolved_functions)
-            return program.copy(program_callables_info=new_program_callables_info)
+            return program.copy(callables_table=new_callables_table)
         else:
             assert isinstance(program_or_kernel, LoopKernel)
             kernel = program_or_kernel
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 201bcc25..2b3f7a3b 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -1832,7 +1832,7 @@ class MinRecursionLimitForScheduling(MinRecursionLimit):
 
 # {{{ main scheduling entrypoint
 
-def generate_loop_schedules(kernel, program_callables_info, debug_args={}):
+def generate_loop_schedules(kernel, callables_table, debug_args={}):
     """
     .. warning::
 
@@ -1846,18 +1846,18 @@ def generate_loop_schedules(kernel, program_callables_info, debug_args={}):
 
     with MinRecursionLimitForScheduling(kernel):
         for sched in generate_loop_schedules_inner(kernel,
-                program_callables_info, debug_args=debug_args):
+                callables_table, debug_args=debug_args):
             yield sched
 
 
-def generate_loop_schedules_inner(kernel, program_callables_info, debug_args={}):
+def generate_loop_schedules_inner(kernel, callables_table, debug_args={}):
     from loopy.kernel import KernelState
     if kernel.state not in (KernelState.PREPROCESSED, KernelState.SCHEDULED):
         raise LoopyError("cannot schedule a kernel that has not been "
                 "preprocessed")
 
     from loopy.check import pre_schedule_checks
-    pre_schedule_checks(kernel, program_callables_info)
+    pre_schedule_checks(kernel, callables_table)
 
     schedule_count = 0
 
@@ -1971,7 +1971,7 @@ def generate_loop_schedules_inner(kernel, program_callables_info, debug_args={})
                     kernel, gen_sched)
 
             gsize, lsize = (
-                    kernel.get_grid_size_upper_bounds(program_callables_info))
+                    kernel.get_grid_size_upper_bounds(callables_table))
 
             if (gsize or lsize):
                 if not kernel.options.disable_global_barriers:
@@ -2028,7 +2028,7 @@ schedule_cache = WriteOncePersistentDict(
         key_builder=LoopyKeyBuilder())
 
 
-def _get_one_scheduled_kernel_inner(kernel, program_callables_info):
+def _get_one_scheduled_kernel_inner(kernel, callables_table):
     # This helper function exists to ensure that the generator chain is fully
     # out of scope after the function returns. This allows it to be
     # garbage-collected in the exit handler of the
@@ -2038,10 +2038,10 @@ def _get_one_scheduled_kernel_inner(kernel, program_callables_info):
     #
     # See https://gitlab.tiker.net/inducer/sumpy/issues/31 for context.
 
-    return next(iter(generate_loop_schedules(kernel, program_callables_info)))
+    return next(iter(generate_loop_schedules(kernel, callables_table)))
 
 
-def get_one_scheduled_kernel(kernel, program_callables_info):
+def get_one_scheduled_kernel(kernel, callables_table):
     from loopy import CACHING_ENABLED
 
     sched_cache_key = kernel
@@ -2060,7 +2060,7 @@ def get_one_scheduled_kernel(kernel, program_callables_info):
         with ProcessLogger(logger, "%s: schedule" % kernel.name):
             with MinRecursionLimitForScheduling(kernel):
                 result = _get_one_scheduled_kernel_inner(kernel,
-                        program_callables_info)
+                        callables_table)
 
     if CACHING_ENABLED and not from_cache:
         schedule_cache.store_if_not_present(sched_cache_key, result)
diff --git a/loopy/statistics.py b/loopy/statistics.py
index 5dddd49e..d65387d1 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -648,11 +648,11 @@ class MemAccess(Record):
 # {{{ counter base
 
 class CounterBase(CombineMapper):
-    def __init__(self, knl, program_callables_info):
+    def __init__(self, knl, callables_table):
         self.knl = knl
-        self.program_callables_info = program_callables_info
+        self.callables_table = callables_table
         from loopy.type_inference import TypeInferenceMapper
-        self.type_inf = TypeInferenceMapper(knl, program_callables_info)
+        self.type_inf = TypeInferenceMapper(knl, callables_table)
 
     def combine(self, values):
         return sum(values)
@@ -707,11 +707,11 @@ class CounterBase(CombineMapper):
 # {{{ ExpressionOpCounter
 
 class ExpressionOpCounter(CounterBase):
-    def __init__(self, knl, program_callables_info):
+    def __init__(self, knl, callables_table):
         self.knl = knl
-        self.program_callables_info = program_callables_info
+        self.callables_table = callables_table
         from loopy.type_inference import TypeInferenceMapper
-        self.type_inf = TypeInferenceMapper(knl, program_callables_info)
+        self.type_inf = TypeInferenceMapper(knl, callables_table)
 
     def combine(self, values):
         return sum(values)
@@ -725,7 +725,7 @@ class ExpressionOpCounter(CounterBase):
     def map_call(self, expr):
         from loopy.symbolic import ResolvedFunction
         if isinstance(expr.function, ResolvedFunction):
-            function_identifier = self.program_callables_info[
+            function_identifier = self.callables_table[
                     expr.function.name].name
         else:
             function_identifier = expr.function.name
@@ -1111,7 +1111,7 @@ def count(kernel, set, space=None):
     from loopy.program import Program
     if isinstance(kernel, Program):
         if len([in_knl_callable for in_knl_callable in
-            kernel.program_callables_info.values() if isinstance(in_knl_callable,
+            kernel.callables_table.values() if isinstance(in_knl_callable,
                 CallableKernel)]) != 1:
             raise NotImplementedError("Currently only supported for program with "
                 "only one CallableKernel.")
@@ -1216,10 +1216,10 @@ def count(kernel, set, space=None):
     return add_assumptions_guard(kernel, count)
 
 
-def get_unused_hw_axes_factor(knl, program_callables_info, insn,
+def get_unused_hw_axes_factor(knl, callables_table, insn,
         disregard_local_axes, space=None):
     # FIXME: Multi-kernel support
-    gsize, lsize = knl.get_grid_size_upper_bounds(program_callables_info)
+    gsize, lsize = knl.get_grid_size_upper_bounds(callables_table)
 
     g_used = set()
     l_used = set()
@@ -1257,7 +1257,7 @@ def get_unused_hw_axes_factor(knl, program_callables_info, insn,
     return add_assumptions_guard(knl, result)
 
 
-def count_insn_runs(knl, program_callables_info, insn, count_redundant_work,
+def count_insn_runs(knl, callables_table, insn, count_redundant_work,
         disregard_local_axes=False):
 
     insn_inames = knl.insn_inames(insn)
@@ -1278,7 +1278,7 @@ def count_insn_runs(knl, program_callables_info, insn, count_redundant_work,
     c = count(knl, domain, space=space)
 
     if count_redundant_work:
-        unused_fac = get_unused_hw_axes_factor(knl, program_callables_info,
+        unused_fac = get_unused_hw_axes_factor(knl, callables_table,
                 insn, disregard_local_axes=disregard_local_axes, space=space)
         return c * unused_fac
     else:
@@ -1286,7 +1286,7 @@ def count_insn_runs(knl, program_callables_info, insn, count_redundant_work,
 
 
 @memoize_method
-def _get_insn_count(knl, program_callables_info, insn_id, subgroup_size,
+def _get_insn_count(knl, callables_table, insn_id, subgroup_size,
         count_redundant_work, count_granularity=CountGranularity.WORKITEM):
     insn = knl.id_to_insn[insn_id]
 
@@ -1299,12 +1299,12 @@ def _get_insn_count(knl, program_callables_info, insn_id, subgroup_size,
 
     if count_granularity == CountGranularity.WORKITEM:
         return count_insn_runs(
-            knl, program_callables_info, insn,
+            knl, callables_table, insn,
             count_redundant_work=count_redundant_work,
             disregard_local_axes=False)
 
     ct_disregard_local = count_insn_runs(
-            knl, program_callables_info, insn, disregard_local_axes=True,
+            knl, callables_table, insn, disregard_local_axes=True,
             count_redundant_work=count_redundant_work)
 
     if count_granularity == CountGranularity.WORKGROUP:
@@ -1312,7 +1312,7 @@ def _get_insn_count(knl, program_callables_info, insn_id, subgroup_size,
     elif count_granularity == CountGranularity.SUBGROUP:
         # get the group size
         from loopy.symbolic import aff_to_expr
-        _, local_size = knl.get_grid_size_upper_bounds(program_callables_info)
+        _, local_size = knl.get_grid_size_upper_bounds(callables_table)
         workgroup_size = 1
         if local_size:
             for size in local_size:
@@ -1344,7 +1344,7 @@ def _get_insn_count(knl, program_callables_info, insn_id, subgroup_size,
 # {{{ get_op_map
 
 
-def get_op_map_for_single_kernel(knl, program_callables_info,
+def get_op_map_for_single_kernel(knl, callables_table,
         numpy_types=True, count_redundant_work=False,
                subgroup_size=None):
 
@@ -1355,7 +1355,7 @@ def get_op_map_for_single_kernel(knl, program_callables_info,
     subgroup_size = _process_subgroup_size(knl, subgroup_size)
 
     op_map = ToCountMap()
-    op_counter = ExpressionOpCounter(knl, program_callables_info)
+    op_counter = ExpressionOpCounter(knl, callables_table)
 
     from loopy.kernel.instruction import (
             CallInstruction, CInstruction, Assignment,
@@ -1368,7 +1368,7 @@ def get_op_map_for_single_kernel(knl, program_callables_info,
                 op_map = (
                         op_map
                         + ToCountMap({key: val})
-                        * _get_insn_count(knl, program_callables_info, insn.id,
+                        * _get_insn_count(knl, callables_table, insn.id,
                             subgroup_size, count_redundant_work,
                             key.count_granularity))
 
@@ -1458,13 +1458,13 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False,
     op_map = ToCountMap()
 
     callables_count = (
-                program.program_callables_info.callables_count)
+                program.callables_table.callables_count)
 
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             knl = in_knl_callable.subkernel
             knl_op_map = get_op_map_for_single_kernel(knl,
-                        program.program_callables_info, numpy_types,
+                        program.callables_table, numpy_types,
                         count_redundant_work, subgroup_size)
 
             for i in range(callables_count[func_id]):
@@ -1535,7 +1535,7 @@ def _process_subgroup_size(knl, subgroup_size_requested):
 # {{{ get_mem_access_map
 
 
-def get_mem_access_map_for_single_kernel(knl, program_callables_info,
+def get_mem_access_map_for_single_kernel(knl, callables_table,
         numpy_types=True, count_redundant_work=False, subgroup_size=None):
 
     if not knl.options.ignore_boostable_into:
@@ -1545,8 +1545,8 @@ def get_mem_access_map_for_single_kernel(knl, program_callables_info,
     subgroup_size = _process_subgroup_size(knl, subgroup_size)
 
     access_map = ToCountMap()
-    access_counter_g = GlobalMemAccessCounter(knl, program_callables_info)
-    access_counter_l = LocalMemAccessCounter(knl, program_callables_info)
+    access_counter_g = GlobalMemAccessCounter(knl, callables_table)
+    access_counter_l = LocalMemAccessCounter(knl, callables_table)
 
     from loopy.kernel.instruction import (
             CallInstruction, CInstruction, Assignment,
@@ -1569,7 +1569,7 @@ def get_mem_access_map_for_single_kernel(knl, program_callables_info,
                 access_map = (
                         access_map
                         + ToCountMap({key: val})
-                        * _get_insn_count(knl, program_callables_info, insn.id,
+                        * _get_insn_count(knl, callables_table, insn.id,
                             subgroup_size, count_redundant_work,
                             key.count_granularity))
 
@@ -1578,7 +1578,7 @@ def get_mem_access_map_for_single_kernel(knl, program_callables_info,
                 access_map = (
                         access_map
                         + ToCountMap({key: val})
-                        * _get_insn_count(knl, program_callables_info, insn.id,
+                        * _get_insn_count(knl, callables_table, insn.id,
                             subgroup_size, count_redundant_work,
                             key.count_granularity))
 
@@ -1700,13 +1700,13 @@ def get_mem_access_map(program, numpy_types=True, count_redundant_work=False,
 
     access_map = ToCountMap()
 
-    callables_count = program.program_callables_info.callables_count
+    callables_count = program.callables_table.callables_count
 
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             knl = in_knl_callable.subkernel
             knl_access_map = get_mem_access_map_for_single_kernel(knl,
-                        program.program_callables_info, numpy_types,
+                        program.callables_table, numpy_types,
                         count_redundant_work, subgroup_size)
 
             # FIXME: didn't see any easy way to multiply
@@ -1726,7 +1726,7 @@ def get_mem_access_map(program, numpy_types=True, count_redundant_work=False,
 
 # {{{ get_synchronization_map
 
-def get_synchronization_map_for_single_kernel(knl, program_callables_info,
+def get_synchronization_map_for_single_kernel(knl, callables_table,
         subgroup_size=None):
 
     """Count the number of synchronization events each work-item encounters in
@@ -1772,7 +1772,7 @@ def get_synchronization_map_for_single_kernel(knl, program_callables_info,
     from loopy.schedule import (EnterLoop, LeaveLoop, Barrier,
             CallKernel, ReturnFromKernel, RunInstruction)
     from operator import mul
-    knl = lp.get_one_scheduled_kernel(knl, program_callables_info)
+    knl = lp.get_one_scheduled_kernel(knl, callables_table)
     iname_list = []
 
     result = ToCountMap()
@@ -1824,13 +1824,13 @@ def get_synchronization_map(program, subgroup_size=None):
     program = preprocess_program(program)
 
     sync_map = ToCountMap()
-    callables_count = program.program_callables_info.callables_count
+    callables_count = program.callables_table.callables_count
 
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             knl = in_knl_callable.subkernel
             knl_sync_map = get_synchronization_map_for_single_kernel(knl,
-                    program.program_callables_info, subgroup_size)
+                    program.callables_table, subgroup_size)
 
             # FIXME: didn't see any easy way to multiply
             for i in range(callables_count[func_id]):
@@ -1887,7 +1887,7 @@ def gather_access_footprints_for_single_kernel(kernel, ignore_uncountable=False)
 def gather_access_footprints(program, ignore_uncountable=False):
     # FIMXE: works only for one callable kernel till now.
     if len([in_knl_callable for in_knl_callable in
-        program.program_callables_info.values() if isinstance(in_knl_callable,
+        program.callables_table.values() if isinstance(in_knl_callable,
             CallableKernel)]) != 1:
         raise NotImplementedError("Currently only supported for program with "
             "only one CallableKernel.")
@@ -1900,9 +1900,9 @@ def gather_access_footprints(program, ignore_uncountable=False):
     write_footprints = []
     read_footprints = []
 
-    callables_count = program.program_callables_info.callables_count
+    callables_count = program.callables_table.callables_count
 
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             knl = in_knl_callable.subkernel
             knl_write_footprints, knl_read_footprints = (
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 92ee2dc5..f27ee4e9 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -80,7 +80,7 @@ class TargetBase(object):
     def preprocess(self, kernel):
         return kernel
 
-    def pre_codegen_check(self, kernel, program_callables_info):
+    def pre_codegen_check(self, kernel, callables_table):
         pass
 
     # }}}
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 418ce025..9b5aaf8e 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -362,7 +362,7 @@ class CMathCallable(ScalarCallable):
     C-Target.
     """
 
-    def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, caller_kernel, callables_table):
         name = self.name
 
         if name in ["abs", "min", "max"]:
@@ -381,7 +381,7 @@ class CMathCallable(ScalarCallable):
                 # callable
                 return (
                         self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                        program_callables_info)
+                        callables_table)
 
             dtype = arg_id_to_dtype[0]
             dtype = dtype.numpy_dtype
@@ -409,7 +409,7 @@ class CMathCallable(ScalarCallable):
                     self.copy(name_in_target=name,
                         arg_id_to_dtype={0: NumpyType(dtype), -1:
                             NumpyType(dtype)}),
-                    program_callables_info)
+                    callables_table)
 
         # binary functions
         if name in ["fmax", "fmin"]:
@@ -424,7 +424,7 @@ class CMathCallable(ScalarCallable):
                 # callable
                 return (
                         self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                        program_callables_info)
+                        callables_table)
 
             dtype = np.find_common_type(
                 [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items()
@@ -449,11 +449,11 @@ class CMathCallable(ScalarCallable):
             return (
                     self.copy(name_in_target=name,
                         arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}),
-                    program_callables_info)
+                    callables_table)
 
         return (
                 self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                program_callables_info)
+                callables_table)
 
 
 def scope_c_math_functions(target, identifier):
@@ -893,7 +893,7 @@ class CASTBuilder(ASTBuilderBase):
 
         ecm = codegen_state.expression_to_code_mapper
         func_id = insn.expression.function.name
-        in_knl_callable = codegen_state.program_callables_info[func_id]
+        in_knl_callable = codegen_state.callables_table[func_id]
 
         if isinstance(in_knl_callable, ScalarCallable) and (
                 in_knl_callable.name_in_target == 'loopy_make_tuple'):
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 65a8c202..289877d9 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -55,7 +55,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
 
         if type_inf_mapper is None:
             type_inf_mapper = TypeInferenceMapper(self.kernel,
-                    self.codegen_state.program_callables_info)
+                    self.codegen_state.callables_table)
         self.type_inf_mapper = type_inf_mapper
 
         self.allow_complex = codegen_state.allow_complex
@@ -389,7 +389,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
         # {{{ implement indexof, indexof_vec
 
         identifier_name = (
-                self.codegen_state.program_callables_info[expr.function.name].name)
+                self.codegen_state.callables_table[expr.function.name].name)
         if identifier_name in ["indexof", "indexof_vec"]:
             if len(expr.parameters) != 1:
                 raise LoopyError("%s takes exactly one argument" % identifier_name)
@@ -432,11 +432,11 @@ class ExpressionToCExpressionMapper(IdentityMapper):
         # }}}
 
         from loopy.kernel.function_interface import ManglerCallable
-        if isinstance(self.codegen_state.program_callables_info[expr.function.name],
+        if isinstance(self.codegen_state.callables_table[expr.function.name],
                 ManglerCallable):
             from loopy.codegen import SeenFunction
             in_knl_callable = (
-                    self.codegen_state.program_callables_info[
+                    self.codegen_state.callables_table[
                         expr.function.name])
             mangle_result = in_knl_callable.mangle_result(self.kernel)
             self.codegen_state.seen_functions.add(
@@ -445,7 +445,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                         mangle_result.arg_dtypes))
 
         return (
-                self.codegen_state.program_callables_info[
+                self.codegen_state.callables_table[
                     expr.function.name].emit_call(
                         expression_to_code_mapper=self,
                     expression=expr,
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index e6abf73f..32b810eb 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -123,7 +123,7 @@ _CUDA_SPECIFIC_FUNCTIONS = {
 class CudaCallable(ScalarCallable):
 
     def cuda_with_types(self, arg_id_to_dtype, caller_kernel,
-            program_callables_info):
+            callables_table):
 
         name = self.name
 
@@ -138,7 +138,7 @@ class CudaCallable(ScalarCallable):
                 # callable
                 return (
                         self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                        program_callables_info)
+                        callables_table)
 
             dtype = arg_id_to_dtype[0]
             scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["x"]
@@ -146,7 +146,7 @@ class CudaCallable(ScalarCallable):
                     self.copy(name_in_target=name, arg_id_to_dtype={-1:
                         NumpyType(scalar_dtype),
                         0: dtype, 1: dtype}),
-                    program_callables_info)
+                    callables_table)
 
         if name in _CUDA_SPECIFIC_FUNCTIONS:
             num_args = _CUDA_SPECIFIC_FUNCTIONS[name]
@@ -161,7 +161,7 @@ class CudaCallable(ScalarCallable):
                     # callable
                     return (
                             self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                            program_callables_info)
+                            callables_table)
 
             dtype = np.find_common_type(
                     [], [dtype.numpy_dtype for id, dtype in
@@ -177,11 +177,11 @@ class CudaCallable(ScalarCallable):
             return (
                     self.copy(name_in_target=name,
                         arg_id_to_dtype=updated_arg_id_to_dtype),
-                    program_callables_info)
+                    callables_table)
 
         return (
                 self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                program_callables_info)
+                callables_table)
 
 
 def scope_cuda_functions(target, identifier):
@@ -303,7 +303,7 @@ class CUDACASTBuilder(CASTBuilder):
                 codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
                         get_insn_ids_for_block_at(
                             codegen_state.kernel.schedule, schedule_index),
-                        codegen_state.program_callables_info)
+                        codegen_state.callables_table)
 
         from loopy.symbolic import get_dependencies
         if not get_dependencies(local_grid_size):
diff --git a/loopy/target/execution.py b/loopy/target/execution.py
index 43963ddb..c067bc4b 100644
--- a/loopy/target/execution.py
+++ b/loopy/target/execution.py
@@ -763,7 +763,7 @@ class KernelExecutorBase(object):
             from loopy.schedule import get_one_scheduled_kernel
             program = program.with_root_kernel(
                     get_one_scheduled_kernel(program.root_kernel,
-                        program.program_callables_info))
+                        program.callables_table))
 
         return program
 
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index f8c42ad6..94a81a65 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -172,9 +172,9 @@ class ISPCTarget(CTarget):
     host_program_name_suffix = ""
     device_program_name_suffix = "_inner"
 
-    def pre_codegen_check(self, kernel, program_callables_info):
+    def pre_codegen_check(self, kernel, callables_table):
         gsize, lsize = kernel.get_grid_size_upper_bounds_as_exprs(
-                program_callables_info)
+                callables_table)
         if len(lsize) > 1:
             for i, ls_i in enumerate(lsize[1:]):
                 if ls_i != 1:
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index d8c195de..ea29665a 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -172,7 +172,7 @@ class OpenCLCallable(ScalarCallable):
     :class:`loopy.target.c.CMathCallable`.
     """
 
-    def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, caller_kernel, callables_table):
         name = self.name
 
         if name in ["max", "min"]:
@@ -182,7 +182,7 @@ class OpenCLCallable(ScalarCallable):
             if 0 not in arg_id_to_dtype or 1 not in arg_id_to_dtype:
                 return (
                         self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                        program_callables_info)
+                        callables_table)
 
             dtype = np.find_common_type(
                     [], [dtype.numpy_dtype for id, dtype in arg_id_to_dtype.items()
@@ -195,7 +195,7 @@ class OpenCLCallable(ScalarCallable):
                 return (
                         self.copy(name_in_target=name,
                             arg_id_to_dtype={-1: dtype, 0: dtype, 1: dtype}),
-                        program_callables_info)
+                        callables_table)
             else:
                 # Unsupported type.
                 raise LoopyError("%s function not supported for the types %s" %
@@ -212,14 +212,14 @@ class OpenCLCallable(ScalarCallable):
                 # callable
                 return (
                         self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                        program_callables_info)
+                        callables_table)
 
             dtype = arg_id_to_dtype[0]
             scalar_dtype, offset, field_name = dtype.numpy_dtype.fields["s0"]
             return (
                     self.copy(name_in_target=name, arg_id_to_dtype={-1:
                         NumpyType(scalar_dtype), 0: dtype, 1: dtype}),
-                    program_callables_info)
+                    callables_table)
 
         if name in _CL_SIMPLE_MULTI_ARG_FUNCTIONS:
             num_args = _CL_SIMPLE_MULTI_ARG_FUNCTIONS[name]
@@ -234,7 +234,7 @@ class OpenCLCallable(ScalarCallable):
                     # callable
                     return (
                             self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                            program_callables_info)
+                            callables_table)
 
             dtype = np.find_common_type(
                     [], [dtype.numpy_dtype for id, dtype in
@@ -250,7 +250,7 @@ class OpenCLCallable(ScalarCallable):
             return (
                     self.copy(name_in_target=name,
                         arg_id_to_dtype=updated_arg_id_to_dtype),
-                    program_callables_info)
+                    callables_table)
 
         if name in VECTOR_LITERAL_FUNCS:
             base_tp_name, dtype, count = VECTOR_LITERAL_FUNCS[name]
@@ -266,7 +266,7 @@ class OpenCLCallable(ScalarCallable):
                     # callable
                     return (
                             self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                            program_callables_info)
+                            callables_table)
 
             updated_arg_id_to_dtype = dict((id, NumpyType(dtype)) for id in
                     range(count))
@@ -276,13 +276,13 @@ class OpenCLCallable(ScalarCallable):
             return (
                     self.copy(name_in_target="(%s%d) " % (base_tp_name, count),
                         arg_id_to_dtype=updated_arg_id_to_dtype),
-                    program_callables_info)
+                    callables_table)
 
         # does not satisfy any of the conditions needed for specialization.
         # hence just returning a copy of the callable.
         return (
                 self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                program_callables_info)
+                callables_table)
 
 
 def scope_opencl_functions(target, identifier):
@@ -479,7 +479,7 @@ class OpenCLCASTBuilder(CASTBuilder):
         _, local_sizes = codegen_state.kernel.get_grid_sizes_for_insn_ids_as_exprs(
                 get_insn_ids_for_block_at(
                     codegen_state.kernel.schedule, schedule_index),
-                codegen_state.program_callables_info)
+                codegen_state.callables_table)
 
         from loopy.symbolic import get_dependencies
         if not get_dependencies(local_sizes):
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 435a5e79..d98b6cdd 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -134,7 +134,7 @@ def adjust_local_temp_var_storage(kernel, device):
 
 # {{{ check sizes against device properties
 
-def check_sizes(kernel, program_callables_info, device):
+def check_sizes(kernel, callables_table, device):
     import loopy as lp
 
     from loopy.diagnostic import LoopyAdvisory, LoopyError
@@ -152,7 +152,7 @@ def check_sizes(kernel, program_callables_info, device):
             parameters[arg.name] = arg.approximately
 
     glens, llens = (
-            kernel.get_grid_size_upper_bounds_as_exprs(program_callables_info))
+            kernel.get_grid_size_upper_bounds_as_exprs(callables_table))
 
     if (max(len(glens), len(llens))
             > device.max_work_item_dimensions):
@@ -207,7 +207,7 @@ class PyOpenCLCallable(ScalarCallable):
     Records information about the callables which are not covered by
     :class:`loopy.target.opencl.OpenCLCallable`
     """
-    def with_types(self, arg_id_to_dtype, caller_kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, caller_kernel, callables_table):
 
         name = self.name
 
@@ -221,7 +221,7 @@ class PyOpenCLCallable(ScalarCallable):
             # callable
             return (
                     self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                    program_callables_info)
+                    callables_table)
 
         dtype = arg_id_to_dtype[0]
 
@@ -238,7 +238,7 @@ class PyOpenCLCallable(ScalarCallable):
                         self.copy(name_in_target="%s_%s" % (tpname, name),
                             arg_id_to_dtype={0: dtype, -1: NumpyType(
                                 np.dtype(dtype.numpy_dtype.type(0).real))}),
-                        program_callables_info)
+                        callables_table)
 
         if name in ["sqrt", "exp", "log",
                 "sin", "cos", "tan",
@@ -256,7 +256,7 @@ class PyOpenCLCallable(ScalarCallable):
                 return (
                         self.copy(name_in_target="%s_%s" % (tpname, name),
                             arg_id_to_dtype={0: dtype, -1: dtype}),
-                        program_callables_info)
+                        callables_table)
             else:
                 # function calls for floating parameters.
                 numpy_dtype = dtype.numpy_dtype
@@ -267,11 +267,11 @@ class PyOpenCLCallable(ScalarCallable):
                 return (
                         self.copy(name_in_target=name,
                             arg_id_to_dtype={0: dtype, -1: dtype}),
-                        program_callables_info)
+                        callables_table)
 
         return (
                 self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                program_callables_info)
+                callables_table)
 
 
 def pyopencl_function_id_to_in_knl_callable_mapper(target, identifier):
@@ -397,8 +397,8 @@ class PyOpenCLTarget(OpenCLTarget):
             kernel = adjust_local_temp_var_storage(kernel, self.device)
         return kernel
 
-    def pre_codegen_check(self, kernel, program_callables_info):
-        check_sizes(kernel, program_callables_info, self.device)
+    def pre_codegen_check(self, kernel, callables_table):
+        check_sizes(kernel, callables_table, self.device)
 
     def get_host_ast_builder(self):
         return PyOpenCLPythonASTBuilder(self)
diff --git a/loopy/target/python.py b/loopy/target/python.py
index 2e6712ec..1f83112f 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -45,7 +45,7 @@ class ExpressionToPythonMapper(StringifyMapper):
 
         if type_inf_mapper is None:
             type_inf_mapper = TypeInferenceMapper(self.kernel,
-                    self.codegen_state.program_callables_info)
+                    self.codegen_state.callables_table)
         self.type_inf_mapper = type_inf_mapper
 
     def handle_unsupported_expression(self, victim, enclosing_prec):
@@ -85,7 +85,7 @@ class ExpressionToPythonMapper(StringifyMapper):
     def map_call(self, expr, enclosing_prec):
         from pymbolic.mapper.stringifier import PREC_NONE
 
-        identifier_name = self.codegen_state.program_callables_info[
+        identifier_name = self.codegen_state.callables_table[
                 expr.function.name].name
 
         if identifier_name in ["indexof", "indexof_vec"]:
@@ -93,7 +93,7 @@ class ExpressionToPythonMapper(StringifyMapper):
                     "indexof, indexof_vec not yet supported in Python")
 
         from loopy.kernel.function_interface import ManglerCallable
-        in_knl_callable = self.codegen_state.program_callables_info[
+        in_knl_callable = self.codegen_state.callables_table[
                 expr.function.name]
         if isinstance(in_knl_callable, ManglerCallable):
             from loopy.codegen import SeenFunction
diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py
index 57c4397f..2519b6a1 100644
--- a/loopy/transform/buffer.py
+++ b/loopy/transform/buffer.py
@@ -133,7 +133,7 @@ buffer_array_cache = WriteOncePersistentDict(
 
 
 # Adding an argument? also add something to the cache_key below.
-def buffer_array_for_single_kernel(kernel, program_callables_info, var_name,
+def buffer_array_for_single_kernel(kernel, callables_table, var_name,
         buffer_inames, init_expression=None, store_expression=None,
         within=None, default_tag="l.auto", temporary_scope=None,
         temporary_is_local=None, fetch_bounding_box=False):
@@ -534,7 +534,7 @@ def buffer_array_for_single_kernel(kernel, program_callables_info, var_name,
     kernel = tag_inames(kernel, new_iname_to_tag)
 
     from loopy.kernel.tools import assign_automatic_axes
-    kernel = assign_automatic_axes(kernel, program_callables_info)
+    kernel = assign_automatic_axes(kernel, callables_table)
 
     if CACHING_ENABLED:
         from loopy.preprocess import prepare_for_caching
@@ -548,10 +548,10 @@ def buffer_array(program, *args, **kwargs):
     assert isinstance(program, Program)
 
     new_resolved_functions = {}
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             new_subkernel = buffer_array_for_single_kernel(
-                    in_knl_callable.subkernel, program.program_callables_info,
+                    in_knl_callable.subkernel, program.callables_table,
                     *args, **kwargs)
             in_knl_callable = in_knl_callable.copy(
                     subkernel=new_subkernel)
@@ -564,8 +564,8 @@ def buffer_array(program, *args, **kwargs):
 
         new_resolved_functions[func_id] = in_knl_callable
 
-    new_program_callables_info = program.program_callables_info.copy(
+    new_callables_table = program.callables_table.copy(
             resolved_functions=new_resolved_functions)
-    return program.copy(program_callables_info=new_program_callables_info)
+    return program.copy(callables_table=new_callables_table)
 
 # vim: foldmethod=marker
diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py
index 90f53095..0013de1d 100644
--- a/loopy/transform/callable.py
+++ b/loopy/transform/callable.py
@@ -46,11 +46,11 @@ def _resolved_callables_from_function_lookup(program,
         ``(target, identifier)`` that returns either an instance of
         :class:`loopy.InKernelCallable` or *None*.
     """
-    program_callables_info = program.program_callables_info
+    callables_table = program.callables_table
 
     callable_knls = dict(
             (func_id, in_knl_callable) for func_id, in_knl_callable in
-            program_callables_info.items() if isinstance(in_knl_callable,
+            callables_table.items() if isinstance(in_knl_callable,
                 CallableKernel))
     edited_callable_knls = {}
 
@@ -62,28 +62,28 @@ def _resolved_callables_from_function_lookup(program,
                 kernel.substitutions, kernel.get_var_name_generator())
 
         resolved_function_marker = ResolvedFunctionMarker(
-                rule_mapping_context, kernel, program_callables_info,
+                rule_mapping_context, kernel, callables_table,
                 [func_id_to_in_kernel_callable_mapper])
 
         new_subkernel = rule_mapping_context.finish_kernel(
                 resolved_function_marker.map_kernel(kernel))
-        program_callables_info = resolved_function_marker.program_callables_info
+        callables_table = resolved_function_marker.callables_table
 
         edited_callable_knls[func_id] = in_knl_callable.copy(
                 subkernel=new_subkernel)
 
     new_resolved_functions = {}
 
-    for func_id, in_knl_callable in program_callables_info.items():
+    for func_id, in_knl_callable in callables_table.items():
         if func_id in edited_callable_knls:
             new_resolved_functions[func_id] = edited_callable_knls[func_id]
         else:
             new_resolved_functions[func_id] = in_knl_callable
 
-    program_callables_info = program_callables_info.copy(
+    callables_table = callables_table.copy(
             resolved_functions=new_resolved_functions)
 
-    return program.copy(program_callables_info=program_callables_info)
+    return program.copy(callables_table=callables_table)
 
 
 def register_function_id_to_in_knl_callable_mapper(program,
diff --git a/loopy/transform/data.py b/loopy/transform/data.py
index 5f4f2f2a..888bedc1 100644
--- a/loopy/transform/data.py
+++ b/loopy/transform/data.py
@@ -143,7 +143,7 @@ class _not_provided:  # noqa: N801
     pass
 
 
-def add_prefetch_for_single_kernel(kernel, program_callables_info, var_name,
+def add_prefetch_for_single_kernel(kernel, callables_table, var_name,
         sweep_inames=[], dim_arg_names=None,
 
         # "None" is a valid value here, distinct from the default.
@@ -334,7 +334,7 @@ def add_prefetch_for_single_kernel(kernel, program_callables_info, var_name,
     # warning message.
 
     from loopy.transform.precompute import precompute_for_single_kernel
-    new_kernel = precompute_for_single_kernel(kernel, program_callables_info,
+    new_kernel = precompute_for_single_kernel(kernel, callables_table,
             subst_use, sweep_inames, precompute_inames=dim_arg_names,
             default_tag=default_tag, dtype=arg.dtype,
             fetch_bounding_box=fetch_bounding_box,
@@ -373,10 +373,10 @@ def add_prefetch(program, *args, **kwargs):
     assert isinstance(program, Program)
 
     new_resolved_functions = {}
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             new_subkernel = add_prefetch_for_single_kernel(
-                    in_knl_callable.subkernel, program.program_callables_info,
+                    in_knl_callable.subkernel, program.callables_table,
                     *args, **kwargs)
             in_knl_callable = in_knl_callable.copy(
                     subkernel=new_subkernel)
@@ -389,9 +389,9 @@ def add_prefetch(program, *args, **kwargs):
 
         new_resolved_functions[func_id] = in_knl_callable
 
-    new_program_callables_info = program.program_callables_info.copy(
+    new_callables_table = program.callables_table.copy(
             resolved_functions=new_resolved_functions)
-    return program.copy(program_callables_info=new_program_callables_info)
+    return program.copy(callables_table=new_callables_table)
 
 # }}}
 
diff --git a/loopy/transform/fusion.py b/loopy/transform/fusion.py
index 44e69ecf..9b83f242 100644
--- a/loopy/transform/fusion.py
+++ b/loopy/transform/fusion.py
@@ -420,23 +420,23 @@ def fuse_kernels(programs, suffixes=None, data_flow=None):
     """
 
     # all the resolved functions in programs must be registered in
-    # main_program_callables_info
+    # main_callables_table
     main_prog_callables_info = (
-            programs[0].program_callables_info)
+            programs[0].callables_table)
     old_root_kernel_callable = (
-            programs[0].program_callables_info[programs[0].name])
+            programs[0].callables_table[programs[0].name])
     kernels = [programs[0].root_kernel]
 
     # removing the callable collisions that maybe present
     for prog in programs[1:]:
         root_kernel = prog.root_kernel
         renames_needed = {}
-        for old_func_id, in_knl_callable in prog.program_callables_info.items():
+        for old_func_id, in_knl_callable in prog.callables_table.items():
             if isinstance(in_knl_callable, CallableKernel):
                 # Fusing programs with multiple callable kernels is tough.
                 # Reason: Need to first figure out the order in which the
                 # callable kernels must be resolved into
-                # main_program_callables_info, because of renaming is
+                # main_callables_table, because of renaming is
                 # needed to be done in the callable kernels before registering.
                 # Hence disabling it until required.
                 if in_knl_callable.subkernel.name != prog.name:
@@ -468,6 +468,6 @@ def fuse_kernels(programs, suffixes=None, data_flow=None):
             var(programs[0].name), new_root_kernel_callable)
 
     return programs[0].copy(
-            program_callables_info=main_prog_callables_info)
+            callables_table=main_prog_callables_info)
 
 # vim: foldmethod=marker
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index b6a0454e..fb6682f4 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -1095,7 +1095,7 @@ def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=Fals
 
 
 def get_iname_duplication_options(program, use_boostable_into=False):
-    for in_knl_callable in program.program_callables_info.values():
+    for in_knl_callable in program.callables_table.values():
         if isinstance(in_knl_callable, CallableKernel):
             for option in get_iname_duplication_options_for_single_kernel(
                     in_knl_callable.subkernel, use_boostable_into):
@@ -1121,7 +1121,7 @@ def has_schedulable_iname_nesting_for_single_kernel(knl):
 def has_schedulable_iname_nesting(program):
     return all(has_schedulable_iname_nesting_for_single_kernel(
         in_knl_callable.subkernel) for in_knl_callable in
-        program.program_callables_info.values() if isinstance(in_knl_callable,
+        program.callables_table.values() if isinstance(in_knl_callable,
             CallableKernel))
 
 # }}}
diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 93cf932b..f73110ec 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -42,7 +42,7 @@ def find_instructions_in_single_kernel(kernel, insn_match):
 def find_instructions(program, insn_match):
     assert isinstance(program, Program)
     insns = []
-    for in_knl_callable in program.program_callables_info.values():
+    for in_knl_callable in program.callables_table.values():
         if isinstance(in_knl_callable, CallableKernel):
             insns += (find_instructions_in_single_kernel(
                 in_knl_callable.subkernel, insn_match))
diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py
index 66c7114a..71b11fa2 100644
--- a/loopy/transform/precompute.py
+++ b/loopy/transform/precompute.py
@@ -261,7 +261,7 @@ class _not_provided(object):  # noqa: N801
     pass
 
 
-def precompute_for_single_kernel(kernel, program_callables_info, subst_use,
+def precompute_for_single_kernel(kernel, callables_table, subst_use,
         sweep_inames=[], within=None, storage_axes=None, temporary_name=None,
         precompute_inames=None, precompute_outer_inames=None,
         storage_axis_to_tag={},
@@ -1047,7 +1047,7 @@ def precompute_for_single_kernel(kernel, program_callables_info, subst_use,
 
     if filter_iname_tags_by_type(new_iname_to_tag.values(), AutoFitLocalIndexTag):
         from loopy.kernel.tools import assign_automatic_axes
-        kernel = assign_automatic_axes(kernel, program_callables_info)
+        kernel = assign_automatic_axes(kernel, callables_table)
 
     return kernel
 
@@ -1056,10 +1056,10 @@ def precompute(program, *args, **kwargs):
     assert isinstance(program, Program)
 
     new_resolved_functions = {}
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             new_subkernel = precompute_for_single_kernel(
-                    in_knl_callable.subkernel, program.program_callables_info,
+                    in_knl_callable.subkernel, program.callables_table,
                     *args, **kwargs)
             in_knl_callable = in_knl_callable.copy(
                     subkernel=new_subkernel)
@@ -1072,8 +1072,8 @@ def precompute(program, *args, **kwargs):
 
         new_resolved_functions[func_id] = in_knl_callable
 
-    new_program_callables_info = program.program_callables_info.copy(
+    new_callables_table = program.callables_table.copy(
             resolved_functions=new_resolved_functions)
-    return program.copy(program_callables_info=new_program_callables_info)
+    return program.copy(callables_table=new_callables_table)
 
 # vim: foldmethod=marker
diff --git a/loopy/transform/save.py b/loopy/transform/save.py
index 4b957b03..e463353e 100644
--- a/loopy/transform/save.py
+++ b/loopy/transform/save.py
@@ -235,9 +235,9 @@ class TemporarySaver(object):
         def new_shape(self):
             return self.hw_dims + self.non_hw_dims
 
-    def __init__(self, kernel, program_callables_info):
+    def __init__(self, kernel, callables_table):
         self.kernel = kernel
-        self.program_callables_info = program_callables_info
+        self.callables_table = callables_table
         self.var_name_gen = kernel.get_var_name_generator()
         self.insn_name_gen = kernel.get_instruction_id_generator()
 
@@ -441,7 +441,7 @@ class TemporarySaver(object):
 
         group_sizes, local_sizes = (
             self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids,
-                self.program_callables_info))
+                self.callables_table))
 
         if temporary.address_space == lp.AddressSpace.LOCAL:
             # Elide local axes in the save slot for local temporaries.
@@ -630,7 +630,7 @@ class TemporarySaver(object):
                     kernel = lp.add_nosync(kernel, "global", source, sink)
 
         from loopy.kernel.tools import assign_automatic_axes
-        return assign_automatic_axes(kernel, self.program_callables_info)
+        return assign_automatic_axes(kernel, self.callables_table)
 
     def save(self, temporary, subkernel):
         self.save_or_reload_impl(temporary, subkernel, "save")
@@ -754,12 +754,12 @@ def save_and_reload_temporaries(program):
         program = lp.preprocess_program(program)
         from loopy.schedule import get_one_scheduled_kernel
         knl = get_one_scheduled_kernel(program.root_kernel,
-                program.program_callables_info)
+                program.callables_table)
 
     assert knl.schedule is not None
 
     liveness = LivenessAnalysis(knl)
-    saver = TemporarySaver(knl, program.program_callables_info)
+    saver = TemporarySaver(knl, program.callables_table)
 
     from loopy.schedule.tools import (
         temporaries_read_in_subkernel, temporaries_written_in_subkernel)
diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py
index afe3fec5..acdf5b2a 100644
--- a/loopy/transform/subst.py
+++ b/loopy/transform/subst.py
@@ -510,7 +510,7 @@ def find_rules_matching(knl, pattern):
 
 def find_one_rule_matching(program, pattern):
     rules = []
-    for in_knl_callable in program.program_callables_info.values():
+    for in_knl_callable in program.callables_table.values():
         if isinstance(in_knl_callable, CallableKernel):
             knl = in_knl_callable.subkernel
             rules.extend(find_rules_matching(knl, pattern))
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 43986640..029381d8 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -35,7 +35,7 @@ from loopy.diagnostic import (
         TypeInferenceFailure, DependencyTypeInferenceFailure)
 from loopy.kernel.instruction import _DataObliviousInstruction
 
-from loopy.program import ProgramCallablesInfo
+from loopy.program import CallablesTable
 from loopy.symbolic import (
         LinearSubscript, parse_tagged_name, RuleAwareIdentityMapper,
         SubstitutionRuleExpander, ResolvedFunction,
@@ -197,7 +197,7 @@ def change_names_of_pymbolic_calls(kernel, pymbolic_calls_to_new_names):
 # {{{ type inference mapper
 
 class TypeInferenceMapper(CombineMapper):
-    def __init__(self, kernel, program_callables_info, new_assignments=None):
+    def __init__(self, kernel, callables_table, new_assignments=None):
         """
         :arg new_assignments: mapping from names to either
             :class:`loopy.kernel.data.TemporaryVariable`
@@ -206,12 +206,12 @@ class TypeInferenceMapper(CombineMapper):
             instances
         """
         self.kernel = kernel
-        assert isinstance(program_callables_info, ProgramCallablesInfo)
+        assert isinstance(callables_table, CallablesTable)
         if new_assignments is None:
             new_assignments = {}
         self.new_assignments = new_assignments
         self.symbols_with_unknown_types = set()
-        self.program_callables_info = program_callables_info
+        self.callables_table = callables_table
         self.old_calls_to_new_calls = {}
 
     def __call__(self, expr, return_tuple=False, return_dtype_set=False):
@@ -245,16 +245,16 @@ class TypeInferenceMapper(CombineMapper):
     # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x)
     # are Python-equal (for many common constants such as integers).
 
-    def copy(self, program_callables_info=None):
-        if program_callables_info is None:
-            program_callables_info = self.program_callables_info
-        return type(self)(self.kernel, program_callables_info,
+    def copy(self, callables_table=None):
+        if callables_table is None:
+            callables_table = self.callables_table
+        return type(self)(self.kernel, callables_table,
                 self.new_assignments)
 
     def with_assignments(self, names_to_vars):
         new_ass = self.new_assignments.copy()
         new_ass.update(names_to_vars)
-        return type(self)(self.kernel, self.program_callables_info, new_ass)
+        return type(self)(self.kernel, self.callables_table, new_ass)
 
     @staticmethod
     def combine(dtype_sets):
@@ -431,7 +431,7 @@ class TypeInferenceMapper(CombineMapper):
 
         # specializing the known function wrt type
         if isinstance(expr.function, ResolvedFunction):
-            in_knl_callable = self.program_callables_info[expr.function.name]
+            in_knl_callable = self.callables_table[expr.function.name]
 
             # {{{ checking that there is no overwriting of types of in_knl_callable
 
@@ -465,17 +465,17 @@ class TypeInferenceMapper(CombineMapper):
 
             # }}}
 
-            in_knl_callable, self.program_callables_info = (
+            in_knl_callable, self.callables_table = (
                     in_knl_callable.with_types(
                         arg_id_to_dtype, self.kernel,
-                        self.program_callables_info))
+                        self.callables_table))
 
             in_knl_callable = in_knl_callable.with_target(self.kernel.target)
 
             # storing the type specialized function so that it can be used for
             # later use
-            self.program_callables_info, new_function_id = (
-                    self.program_callables_info.with_callable(
+            self.callables_table, new_function_id = (
+                    self.callables_table.with_callable(
                         expr.function.function,
                         in_knl_callable))
 
@@ -538,8 +538,8 @@ class TypeInferenceMapper(CombineMapper):
                 in_knl_callable = ManglerCallable(
                         identifier, function_mangler, arg_id_to_dtype,
                         arg_id_to_descr, mangle_result.target_name)
-                self.program_callables_info, new_function_id = (
-                        self.program_callables_info.with_added_callable(
+                self.callables_table, new_function_id = (
+                        self.callables_table.with_added_callable(
                             expr.function, in_knl_callable))
 
                 if isinstance(expr, Call):
@@ -688,7 +688,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
 
     if var_name in kernel.all_params():
         return [kernel.index_dtype], [], {}, (
-                type_inf_mapper.program_callables_info)
+                type_inf_mapper.callables_table)
 
     from functools import partial
     debug = partial(_debug, kernel)
@@ -735,13 +735,13 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
     if not dtype_sets:
         return (
                 None, type_inf_mapper.symbols_with_unknown_types, None,
-                type_inf_mapper.program_callables_info)
+                type_inf_mapper.callables_table)
 
     result = type_inf_mapper.combine(dtype_sets)
 
     return (result, type_inf_mapper.symbols_with_unknown_types,
             type_inf_mapper.old_calls_to_new_calls,
-            type_inf_mapper.program_callables_info)
+            type_inf_mapper.callables_table)
 
 # }}}
 
@@ -768,7 +768,7 @@ class _DictUnionView:
 
 # {{{ infer_unknown_types
 
-def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
+def infer_unknown_types_for_a_single_kernel(kernel, callables_table,
         expect_completion=False):
     """Infer types on temporaries and arguments."""
 
@@ -831,7 +831,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
             new_temp_vars,
             new_arg_dict
             ])
-    type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info,
+    type_inf_mapper = TypeInferenceMapper(kernel, callables_table,
             item_lookup)
 
     from loopy.symbolic import SubstitutionRuleExpander
@@ -867,11 +867,11 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
             debug("inferring type for %s %s", type(item).__name__, item.name)
 
             (result, symbols_with_unavailable_types,
-                    new_old_calls_to_new_calls, program_callables_info) = (
+                    new_old_calls_to_new_calls, callables_table) = (
                     _infer_var_type(
                             kernel, item.name, type_inf_mapper, subst_expander))
             type_inf_mapper = type_inf_mapper.copy(
-                    program_callables_info=program_callables_info)
+                    callables_table=callables_table)
 
             failed = not result
             if not failed:
@@ -979,7 +979,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
             raise NotImplementedError("Unknown instructions type %s." % (
                 type(insn).__name__))
 
-    program_callables_info = type_inf_mapper.program_callables_info
+    callables_table = type_inf_mapper.callables_table
     old_calls_to_new_calls.update(type_inf_mapper.old_calls_to_new_calls)
 
     end_time = time.time()
@@ -1003,39 +1003,39 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
         from loopy.check import check_functions_are_resolved
         check_functions_are_resolved(type_specialized_kernel)
 
-    return type_specialized_kernel, program_callables_info
+    return type_specialized_kernel, callables_table
 
 
 def infer_unknown_types(program, expect_completion=False):
     """Infer types on temporaries and arguments."""
 
-    program_callables_info = program.program_callables_info
+    callables_table = program.callables_table
 
     type_uninferred_knl_callable = (
-            program_callables_info[program.name])
+            callables_table[program.name])
     type_uninferred_root_kernel = type_uninferred_knl_callable.subkernel
 
-    old_callables_count = program_callables_info.callables_count
-    program_callables_info = (
-            program.program_callables_info.with_edit_callables_mode())
-    root_kernel, program_callables_info = (
+    old_callables_count = callables_table.callables_count
+    callables_table = (
+            program.callables_table.with_edit_callables_mode())
+    root_kernel, callables_table = (
             infer_unknown_types_for_a_single_kernel(
                 type_uninferred_root_kernel,
-                program_callables_info, expect_completion))
+                callables_table, expect_completion))
 
     type_inferred_knl_callable = type_uninferred_knl_callable.copy(
             subkernel=root_kernel)
 
-    program_callables_info, _ = (
-            program_callables_info.with_callable(
+    callables_table, _ = (
+            callables_table.with_callable(
                 program.name,
                 type_inferred_knl_callable))
 
-    program_callables_info = (
-            program_callables_info.with_exit_edit_callables_mode(
+    callables_table = (
+            callables_table.with_exit_edit_callables_mode(
                 old_callables_count))
 
-    return program.copy(program_callables_info=program_callables_info)
+    return program.copy(callables_table=callables_table)
 
 # }}}
 
@@ -1043,8 +1043,8 @@ def infer_unknown_types(program, expect_completion=False):
 # {{{ reduction expression helper
 
 def infer_arg_and_reduction_dtypes_for_reduction_expression(
-        kernel, expr, program_callables_info, unknown_types_ok):
-    type_inf_mapper = TypeInferenceMapper(kernel, program_callables_info)
+        kernel, expr, callables_table, unknown_types_ok):
+    type_inf_mapper = TypeInferenceMapper(kernel, callables_table)
     import loopy as lp
 
     if expr.is_tuple_typed:
@@ -1076,7 +1076,7 @@ def infer_arg_and_reduction_dtypes_for_reduction_expression(
             for dt in reduction_dtypes)
 
     return tuple(arg_dtypes), reduction_dtypes, (
-            type_inf_mapper.program_callables_info)
+            type_inf_mapper.callables_table)
 
 # }}}
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 43371c8a..fa32ca04 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -416,7 +416,7 @@ def test_ilp_write_race_detection_global(ctx_factory):
         from warnings import catch_warnings
         with catch_warnings(record=True) as warn_list:
             list(lp.generate_loop_schedules(knl.root_kernel,
-                    knl.program_callables_info))
+                    knl.callables_table))
 
             assert any(isinstance(w.message, WriteRaceConditionWarning)
                     for w in warn_list)
@@ -1271,7 +1271,7 @@ def save_and_reload_temporaries_test(queue, prog, out_expect, debug=False):
     from loopy.transform.save import save_and_reload_temporaries
     prog = save_and_reload_temporaries(prog)
     prog = prog.with_root_kernel(lp.get_one_scheduled_kernel(prog.root_kernel,
-        prog.program_callables_info))
+        prog.callables_table))
 
     if debug:
         print(prog)
@@ -2222,7 +2222,7 @@ def test_unscheduled_insn_detection():
         "...")
 
     prog = lp.preprocess_kernel(prog)
-    knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info)
+    knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table)
     prog = prog.with_root_kernel(knl)
     insn1, = lp.find_instructions(prog, "id:insn1")
     insns = prog.root_kernel.instructions[:]
@@ -2392,7 +2392,7 @@ def test_barrier_insertion_near_top_of_loop():
     prog = lp.set_temporary_scope(prog, "a", "local")
     prog = lp.set_temporary_scope(prog, "b", "local")
     prog = lp.preprocess_kernel(prog)
-    knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info)
+    knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table)
 
     print(knl)
 
@@ -2420,7 +2420,7 @@ def test_barrier_insertion_near_bottom_of_loop():
     prog = lp.set_temporary_scope(prog, "a", "local")
     prog = lp.set_temporary_scope(prog, "b", "local")
     prog = lp.preprocess_kernel(prog)
-    knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.program_callables_info)
+    knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table)
 
     print(knl)
 
@@ -2479,7 +2479,7 @@ def test_multi_argument_reduction_type_inference():
             allow_simultaneous=True)
 
     t_inf_mapper = TypeInferenceMapper(prog.root_kernel,
-            prog.program_callables_info)
+            prog.callables_table)
 
     assert (
             t_inf_mapper(expr, return_tuple=True, return_dtype_set=True)
@@ -2836,7 +2836,7 @@ def test_no_barriers_for_nonoverlapping_access(second_index, expect_barrier):
     prog = lp.preprocess_kernel(prog)
 
     knl = lp.get_one_scheduled_kernel(prog.root_kernel,
-            prog.program_callables_info)
+            prog.callables_table)
 
     assert barrier_between(knl, "first", "second") == expect_barrier
 
diff --git a/test/testlib.py b/test/testlib.py
index eebc792d..853e2584 100644
--- a/test/testlib.py
+++ b/test/testlib.py
@@ -9,9 +9,9 @@ class GridOverride(object):
         self.clean = clean
         self.vecsize = vecsize
 
-    def __call__(self, insn_ids, program_callables_info, ignore_auto=True):
+    def __call__(self, insn_ids, callables_table, ignore_auto=True):
         gsize, _ = self.clean.get_grid_sizes_for_insn_ids(insn_ids,
-                program_callables_info, ignore_auto)
+                callables_table, ignore_auto)
         return gsize, (self.vecsize,)
 
 # }}}
@@ -139,14 +139,14 @@ class SeparateTemporariesPreambleTestPreambleGenerator(
 
 class Log2Callable(lp.ScalarCallable):
 
-    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, kernel, callables_table):
 
         if 0 not in arg_id_to_dtype or arg_id_to_dtype[0] is None:
             # the types provided aren't mature enough to specialize the
             # callable
             return (
                     self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                    program_callables_info)
+                    callables_table)
 
         dtype = arg_id_to_dtype[0].numpy_dtype
 
@@ -168,7 +168,7 @@ class Log2Callable(lp.ScalarCallable):
                 self.copy(name_in_target=name_in_target,
                     arg_id_to_dtype={0: NumpyType(dtype), -1:
                         NumpyType(dtype)}),
-                program_callables_info)
+                callables_table)
 
 
 def register_log2_lookup(target, identifier):
-- 
GitLab


From 17bba4838c931a59b539a4bcb5cd9fa09925cad7 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Mon, 15 Oct 2018 14:59:36 -0500
Subject: [PATCH 62/80] minor changes after review

---
 loopy/kernel/__init__.py           | 11 ++---------
 loopy/kernel/function_interface.py | 11 ++++++-----
 loopy/library/reduction.py         | 12 ++++++------
 loopy/program.py                   |  9 ++++-----
 loopy/tools.py                     | 11 +++++++++++
 5 files changed, 29 insertions(+), 25 deletions(-)

diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 70079d31..9f14dafc 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -38,7 +38,7 @@ import re
 from pytools import UniqueNameGenerator, generate_unique_names
 
 from loopy.diagnostic import CannotBranchDomainTree, LoopyError
-from loopy.tools import natsorted
+from loopy.tools import natsorted, update_persistent_hash
 from loopy.diagnostic import StaticValueFindingError
 from loopy.kernel.data import filter_iname_tags_by_type
 from warnings import warn
@@ -1476,14 +1476,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             "symbol_manglers",
             )
 
-    def update_persistent_hash(self, key_hash, key_builder):
-        """Custom hash computation function for use with
-        :class:`pytools.persistent_dict.PersistentDict`.
-
-        Only works in conjunction with :class:`loopy.tools.KeyBuilder`.
-        """
-        for field_name in self.hash_fields:
-            key_builder.rec(key_hash, getattr(self, field_name))
+    update_persistent_hash = update_persistent_hash
 
     def __hash__(self):
         from loopy.tools import LoopyKeyBuilder
diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
index 362fbcef..636d152d 100644
--- a/loopy/kernel/function_interface.py
+++ b/loopy/kernel/function_interface.py
@@ -28,7 +28,7 @@ from six.moves import zip
 from pytools import ImmutableRecord
 from loopy.diagnostic import LoopyError
 
-from loopy.kernel import LoopKernel
+from loopy.tools import update_persistent_hash
 
 __doc__ = """
 
@@ -49,7 +49,7 @@ __doc__ = """
 class ValueArgDescriptor(ImmutableRecord):
     hash_fields = ()
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    update_persistent_hash = update_persistent_hash
 
 
 class ArrayArgDescriptor(ImmutableRecord):
@@ -99,7 +99,7 @@ class ArrayArgDescriptor(ImmutableRecord):
             "address_space",
             "dim_tags")
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    update_persistent_hash =update_persistent_hash
 
 # }}}
 
@@ -171,7 +171,8 @@ class InKernelCallable(ImmutableRecord):
 
     .. attribute:: name
 
-        The name of the callable which can be encountered within a kernel.
+        The name of the callable which can be encountered within expressions in
+        a kernel.
 
     .. attribute:: arg_id_to_dtype
 
@@ -212,7 +213,7 @@ class InKernelCallable(ImmutableRecord):
     def __getinitargs__(self):
         return (self.arg_id_to_dtype, self.arg_id_to_descr)
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    update_persistent_hash = update_persistent_hash
 
     def with_types(self, arg_id_to_dtype, caller_kernel, callables_table):
         """
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index 7c32d0be..dd0e1e3e 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -31,7 +31,7 @@ import numpy as np
 from loopy.symbolic import FunctionIdentifier
 from loopy.diagnostic import LoopyError
 from loopy.types import NumpyType
-from loopy.kernel import LoopKernel
+from loopy.tools import update_persistent_hash
 
 
 class ReductionOperation(object):
@@ -227,7 +227,7 @@ class ReductionOpFunction(FunctionIdentifier):
     hash_fields = (
             "reduction_op",)
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    update_persistent_hash = update_persistent_hash
 
 # }}}
 
@@ -285,7 +285,7 @@ class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation):
             "which",
             "op",)
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    update_persistent_hash = update_persistent_hash
 
 
 class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation):
@@ -298,7 +298,7 @@ class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation):
             "op",
             "base_reduction_class",)
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    update_persistent_hash = update_persistent_hash
 
 # }}}
 
@@ -354,7 +354,7 @@ class ArgMaxReductionOperation(_ArgExtremumReductionOperation):
             "update_comparison",
             "neutral_sign",)
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    update_persistent_hash = update_persistent_hash
 
 
 class ArgMinReductionOperation(_ArgExtremumReductionOperation):
@@ -366,7 +366,7 @@ class ArgMinReductionOperation(_ArgExtremumReductionOperation):
             "update_comparison",
             "neutral_sign",)
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    update_persistent_hash = update_persistent_hash
 
 # }}}
 
diff --git a/loopy/program.py b/loopy/program.py
index f7c399c1..aee2378f 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -39,6 +39,7 @@ from loopy.diagnostic import LoopyError
 from loopy.library.reduction import ReductionOpFunction
 
 from loopy.kernel import LoopKernel
+from loopy.tools import update_persistent_hash
 from collections import Counter
 from pymbolic.primitives import Call, CallWithKwargs
 
@@ -253,7 +254,7 @@ class Program(ImmutableRecord):
             "callables_table",
             "target",)
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    update_persistent_hash = update_persistent_hash
 
     def copy(self, **kwargs):
         if 'target' in kwargs:
@@ -611,7 +612,7 @@ class CallablesTable(ImmutableRecord):
             self.is_being_edited
             ))
 
-    update_persistent_hash = LoopKernel.update_persistent_hash
+    update_persistent_hash = update_persistent_hash
 
     @property
     @memoize_method
@@ -620,8 +621,6 @@ class CallablesTable(ImmutableRecord):
         Returns an instance of :class:`collection.Counter` representing the number
         of times the callables is called in callables_table.
         """
-        # should raise an error if there are more than  one root kernels(which is
-        # illegal)
         root_kernel_name, = [in_knl_callable.subkernel.name for in_knl_callable
                 in self.values() if
                 isinstance(in_knl_callable, CallableKernel) and
@@ -737,7 +736,7 @@ class CallablesTable(ImmutableRecord):
 
     def with_edit_callables_mode(self):
         """
-        Initiates *self* for a walk traversal through all the callables.
+        Returns a copy of *self* for a walk traversal through all the callables.
         """
         return self.copy(
                 is_being_edited=True)
diff --git a/loopy/tools.py b/loopy/tools.py
index 5eabe6c3..52fc7d3c 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -43,6 +43,17 @@ else:
         return isinstance(obj, (int, np.integer))
 
 
+def update_persistent_hash(obj, key_hash, key_builder):
+    """
+    Custom hash computation function for use with
+    :class:`pytools.persistent_dict.PersistentDict`.
+
+    Only works in conjunction with :class:`loopy.tools.KeyBuilder`.
+    """
+    for field_name in obj.hash_fields:
+        key_builder.rec(key_hash, getattr(obj, field_name))
+
+
 # {{{ custom KeyBuilder subclass
 
 class PersistentHashWalkMapper(LoopyWalkMapper, PersistentHashWalkMapperBase):
-- 
GitLab


From dc458ada6a51a10c6283f1b90087fd722f13d00f Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 14 Nov 2018 17:41:51 -0600
Subject: [PATCH 63/80] renaming: make_program_from_kernel -> make_program

---
 loopy/__init__.py         |  4 ++--
 loopy/codegen/__init__.py |  4 ++--
 loopy/kernel/__init__.py  |  4 ++--
 loopy/kernel/creation.py  | 12 ++++++------
 loopy/program.py          |  4 ++--
 test/test_diff.py         |  2 +-
 6 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/loopy/__init__.py b/loopy/__init__.py
index 8ebd4d0e..9faa28bc 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -51,7 +51,7 @@ from loopy.kernel.data import (
 from loopy.kernel.function_interface import (
         CallableKernel, ScalarCallable)
 from loopy.program import (
-        Program, make_program_from_kernel)
+        Program, make_program)
 
 from loopy.kernel import LoopKernel, KernelState, kernel_state
 from loopy.kernel.tools import (
@@ -175,7 +175,7 @@ __all__ = [
 
         "ScalarCallable", "CallableKernel",
 
-        "Program", "make_program_from_kernel",
+        "Program", "make_program",
 
         "KernelArgument",
         "ValueArg", "ArrayArg", "GlobalArg", "ConstantArg", "ImageArg",
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 250e7215..55161ebb 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -541,10 +541,10 @@ def generate_code_v2(program):
     :param program: An instance of :class:`loopy.Program`.
     """
     from loopy.kernel import LoopKernel
-    from loopy.program import make_program_from_kernel
+    from loopy.program import make_program
 
     if isinstance(program, LoopKernel):
-        program = make_program_from_kernel(program)
+        program = make_program(program)
 
     from loopy.kernel import KernelState
     if program.root_kernel.state == KernelState.INITIAL:
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 9f14dafc..dd7acf25 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1371,8 +1371,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
     def __call__(self, *args, **kwargs):
         warn("Calling a LoopKernel is deprecated, call a Program "
                 "instead.", DeprecationWarning, stacklevel=2)
-        from loopy.program import make_program_from_kernel
-        program = make_program_from_kernel(self)
+        from loopy.program import make_program
+        program = make_program(self)
         return program(*args, **kwargs)
 
     # }}}
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 685232c6..b794cfb8 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -1954,7 +1954,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
     target = kwargs.pop("target", None)
     seq_dependencies = kwargs.pop("seq_dependencies", False)
     fixed_parameters = kwargs.pop("fixed_parameters", {})
-    make_program = kwargs.pop("make_program", True)
+    is_callee_kernel = kwargs.pop("is_callee_kernel", False)
 
     if defines:
         from warnings import warn
@@ -2174,15 +2174,15 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
 
     creation_plog.done()
 
-    if make_program:
-        from loopy.program import make_program_from_kernel
-        return make_program_from_kernel(knl)
-    else:
+    if is_callee_kernel:
         return knl
+    else:
+        from loopy.program import make_program
+        return make_program(knl)
 
 
 def make_kernel_function(*args, **kwargs):
-    kwargs['make_program'] = False
+    kwargs['is_callee_kernel'] = False
     return make_kernel(*args, **kwargs)
 
 # }}}
diff --git a/loopy/program.py b/loopy/program.py
index aee2378f..c8534f05 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -50,7 +50,7 @@ __doc__ = """
 .. autoclass:: Program
 .. autoclass:: CallablesTable
 
-.. autofunction:: make_program_from_kernel
+.. autofunction:: make_program
 .. autofunction:: iterate_over_kernels_if_given_program
 
 """
@@ -921,7 +921,7 @@ class CallablesTable(ImmutableRecord):
 
 # {{{ helper functions
 
-def make_program_from_kernel(kernel):
+def make_program(kernel):
     """
     Returns an instance of :class:`loopy.Program` with the *kernel* as the root
     kernel.
diff --git a/test/test_diff.py b/test/test_diff.py
index a7fd9298..49efc261 100644
--- a/test/test_diff.py
+++ b/test/test_diff.py
@@ -66,7 +66,7 @@ def test_diff(ctx_factory):
 
     from loopy.transform.diff import diff_kernel
     dknl, diff_map = diff_kernel(knl, "z", "x")
-    dknl = lp.make_program_from_kernel(dknl)
+    dknl = lp.make_program(dknl)
     dknl = lp.remove_unused_arguments(dknl)
 
     dknl = lp.add_inames_to_insn(dknl, "diff_i0", "writes:a_dx or writes:a")
-- 
GitLab


From eca2a3ed2dc9bcae43362dcbf7cf1f1ea3419a1f Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 14 Nov 2018 21:47:43 -0600
Subject: [PATCH 64/80] some changes after review

---
 loopy/__init__.py                  |  4 ++--
 loopy/kernel/creation.py           |  2 +-
 loopy/kernel/function_interface.py | 16 ++++++++++------
 test/test_diff.py                  |  2 +-
 4 files changed, 14 insertions(+), 10 deletions(-)

diff --git a/loopy/__init__.py b/loopy/__init__.py
index 9faa28bc..c2ffe5bf 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -64,7 +64,7 @@ from loopy.kernel.tools import (
         get_subkernels,
         get_subkernel_to_insn_id_map)
 from loopy.types import to_loopy_type
-from loopy.kernel.creation import make_kernel, UniqueName, make_kernel_function
+from loopy.kernel.creation import make_kernel, UniqueName, make_function
 from loopy.library.reduction import register_reduction_parser
 
 # {{{ import transforms
@@ -184,7 +184,7 @@ __all__ = [
         "SubstitutionRule",
         "CallMangleInfo",
 
-        "make_kernel", "UniqueName", "make_kernel_function",
+        "make_kernel", "UniqueName", "make_function",
 
         "register_reduction_parser",
 
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index b794cfb8..823fb1b3 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -2181,7 +2181,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
         return make_program(knl)
 
 
-def make_kernel_function(*args, **kwargs):
+def make_function(*args, **kwargs):
     kwargs['is_callee_kernel'] = False
     return make_kernel(*args, **kwargs)
 
diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
index 636d152d..17057691 100644
--- a/loopy/kernel/function_interface.py
+++ b/loopy/kernel/function_interface.py
@@ -29,6 +29,7 @@ from pytools import ImmutableRecord
 from loopy.diagnostic import LoopyError
 
 from loopy.tools import update_persistent_hash
+from loopy.kernel import LoopKernel
 
 __doc__ = """
 
@@ -99,7 +100,7 @@ class ArrayArgDescriptor(ImmutableRecord):
             "address_space",
             "dim_tags")
 
-    update_persistent_hash =update_persistent_hash
+    update_persistent_hash = update_persistent_hash
 
 # }}}
 
@@ -176,18 +177,21 @@ class InKernelCallable(ImmutableRecord):
 
     .. attribute:: arg_id_to_dtype
 
-        A mapping which indicates the arguments types and result types it would
-        be handling. This would be set once the callable is type specialized.
+        A mapping which indicates the arguments types and result types of the
+        callable.
 
     .. attribute:: arg_id_to_descr
 
         A mapping which gives indicates the argument shape and ``dim_tags`` it
-        would be responsible for generating code. These parameters would be set,
-        once it is shape and stride(``dim_tags``) specialized.
+        would be responsible for generating code.
 
     .. note::
+        - "``arg_id`` can either be an instance of :class:`int` integer
+          corresponding to the position of the argument or an instance of
+          :class:`str` corresponding to the name of keyword argument accepted
+          by the function.
 
-        Negative "id" values ``-i`` in the mapping attributes indicate
+        - Negative "arg_id" values ``-i`` in the mapping attributes indicate
         return value with (0-based) index *i*.
 
     .. automethod:: __init__
diff --git a/test/test_diff.py b/test/test_diff.py
index 49efc261..d001233c 100644
--- a/test/test_diff.py
+++ b/test/test_diff.py
@@ -55,7 +55,7 @@ def test_diff(ctx_factory):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
-    knl = lp.make_kernel_function(
+    knl = lp.make_function(
          """{ [i,j]: 0<=i,j<n }""",
          """
          <> a = 1/(1+sinh(x[i] + y[j])**2)
-- 
GitLab


From 8b04d088d54806652d3ffaf19364cac1e4aaba2c Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Thu, 15 Nov 2018 00:22:11 -0600
Subject: [PATCH 65/80] small fix to make the tests runnable again

---
 loopy/auto_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index bee1b72f..7e23ef06 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -118,7 +118,7 @@ def make_ref_args(program, impl_arg_info, queue, parameters):
             shape = evaluate_shape(arg.unvec_shape, parameters)
             dtype = kernel_arg.dtype
 
-            is_output = arg.base_name in kernel_arg.is_output_only
+            is_output = kernel_arg.is_output_only
 
             if arg.arg_class is ImageArg:
                 storage_array = ary = cl_array.empty(
-- 
GitLab


From 408bb384ec47af2cd464e303458f9017fdf40494 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Thu, 15 Nov 2018 09:21:32 -0600
Subject: [PATCH 66/80] asserts that callees do not generate host program

---
 loopy/codegen/__init__.py |  2 ++
 loopy/codegen/control.py  | 23 ++++++++++---------
 loopy/codegen/result.py   | 47 ++++++++++++++++++++++-----------------
 3 files changed, 41 insertions(+), 31 deletions(-)

diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 55161ebb..3fd94aa2 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -561,6 +561,8 @@ def generate_code_v2(program):
             codegen_results[func_id] = (
                     generate_code_for_a_single_kernel(in_knl_callable.subkernel,
                         program.callables_table))
+            if not in_knl_callable.subkernel.is_called_from_host:
+                assert codegen_results[func_id].host_program is None
 
     device_preambles = set()
     for cgr in codegen_results.values():
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index 81a672a1..5dfd9cb4 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -117,16 +117,19 @@ def generate_code_for_sched_index(codegen_state, sched_index):
         glob_grid, loc_grid = kernel.get_grid_sizes_for_insn_ids_as_exprs(
                 get_insn_ids_for_block_at(kernel.schedule, sched_index),
                 codegen_state.callables_table)
-
-        return merge_codegen_results(codegen_state, [
-            codegen_result,
-
-            codegen_state.ast_builder.get_kernel_call(
-                codegen_state,
-                sched_item.kernel_name,
-                glob_grid, loc_grid,
-                extra_args),
-            ])
+        if kernel.is_called_from_host:
+            return merge_codegen_results(codegen_state, [
+                codegen_result,
+
+                codegen_state.ast_builder.get_kernel_call(
+                    codegen_state,
+                    sched_item.kernel_name,
+                    glob_grid, loc_grid,
+                    extra_args),
+                ])
+        else:
+            # do not generate host code for callee kernels
+            return codegen_result
 
     elif isinstance(sched_item, EnterLoop):
         tags = kernel.iname_tags(sched_item.iname)
diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index 00f19d99..7950c56b 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -292,27 +292,32 @@ def generate_host_or_device_program(codegen_state, schedule_index):
     else:
         codegen_result = build_loop_nest(codegen_state, schedule_index)
 
-    codegen_result = merge_codegen_results(
-            codegen_state,
-            ast_builder.generate_top_of_body(codegen_state)
-            + temp_decls
-            + [codegen_result],
-            collapse=False)
-
-    cur_prog = codegen_result.current_program(codegen_state)
-    body_ast = cur_prog.ast
-    fdecl_ast = ast_builder.get_function_declaration(
-            codegen_state, codegen_result, schedule_index)
-
-    fdef_ast = ast_builder.get_function_definition(
-            codegen_state, codegen_result,
-            schedule_index, fdecl_ast, body_ast)
-
-    codegen_result = codegen_result.with_new_program(
-            codegen_state,
-            cur_prog.copy(
-                ast=ast_builder.process_ast(fdef_ast),
-                body_ast=ast_builder.process_ast(body_ast)))
+    if (codegen_state.is_generating_device_code) or (
+            codegen_state.kernel.is_called_from_host):
+        codegen_result = merge_codegen_results(
+                codegen_state,
+                ast_builder.generate_top_of_body(codegen_state)
+                + temp_decls
+                + [codegen_result],
+                collapse=False)
+
+        cur_prog = codegen_result.current_program(codegen_state)
+        body_ast = cur_prog.ast
+        fdecl_ast = ast_builder.get_function_declaration(
+                codegen_state, codegen_result, schedule_index)
+
+        fdef_ast = ast_builder.get_function_definition(
+                codegen_state, codegen_result,
+                schedule_index, fdecl_ast, body_ast)
+
+        codegen_result = codegen_result.with_new_program(
+                codegen_state,
+                cur_prog.copy(
+                    ast=ast_builder.process_ast(fdef_ast),
+                    body_ast=ast_builder.process_ast(body_ast)))
+    else:
+        codegen_result = codegen_result.copy(
+                host_program=None)
 
     return codegen_result
 
-- 
GitLab


From 3f0d8b5461723c4b365a8ecc03784f8dcaf7c223 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Thu, 15 Nov 2018 09:52:28 -0600
Subject: [PATCH 67/80] store the fdecls in AST format

---
 loopy/codegen/__init__.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 3fd94aa2..00397906 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -568,20 +568,25 @@ def generate_code_v2(program):
     for cgr in codegen_results.values():
         device_preambles.update(cgr.device_preambles)
 
+    # collecting the function declarations of callee kernels
     for in_knl_callable in program.callables_table.values():
         for preamble in in_knl_callable.generate_preambles(program.target):
             device_preambles.update([preamble])
 
     collective_device_program = codegen_results[program.name].device_programs[0]
+    callee_fdecls = []
+
     for func_id, callee_cgr in codegen_results.items():
         if func_id != program.name:
             assert len(callee_cgr.device_programs) == 1
             callee_prog_ast = callee_cgr.device_programs[0].ast
             collective_device_program = collective_device_program.copy(
                     ast=Collection([callee_prog_ast, collective_device_program.ast]))
+            callee_fdecls.append(callee_prog_ast.fdecl)
 
-            device_preambles.update([('98_%s' % func_id,
-                str(callee_prog_ast.fdecl)), ])
+    for callee_fdecl in callee_fdecls:
+        collective_device_program = collective_device_program.copy(
+                ast=Collection([callee_fdecl, collective_device_program.ast]))
 
     collective_device_programs = [collective_device_program] + (
             codegen_results[program.name].device_programs[1:])
-- 
GitLab


From d191d34ff87d44e7ad72f8f3b2f2324a28a399fe Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Thu, 15 Nov 2018 09:53:52 -0600
Subject: [PATCH 68/80] removes asymmetry between host and device preambles

---
 loopy/codegen/result.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index 7950c56b..268a70b2 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -133,7 +133,7 @@ class CodeGenerationResult(ImmutableRecord):
         preamble_codes = process_preambles(
                 getattr(self, "host_preambles", [])
                 +
-                list(getattr(self, "device_preambles", []))
+                getattr(self, "device_preambles", [])
                 )
 
         return (
-- 
GitLab


From b2903df6c6227960e720ea35cff174df877d4dd7 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 20 Nov 2018 11:46:56 -0600
Subject: [PATCH 69/80] small typo, to re-enable making callee kernels

---
 loopy/kernel/creation.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 823fb1b3..c7991873 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -2182,7 +2182,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
 
 
 def make_function(*args, **kwargs):
-    kwargs['is_callee_kernel'] = False
+    kwargs['is_callee_kernel'] = True
     return make_kernel(*args, **kwargs)
 
 # }}}
-- 
GitLab


From 95ee6fed7549c36dd421b8eb9fcd768d53a139a5 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 20 Nov 2018 12:19:34 -0600
Subject: [PATCH 70/80] make device preambles a list again

---
 loopy/codegen/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 00397906..d8a7effc 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -564,14 +564,14 @@ def generate_code_v2(program):
             if not in_knl_callable.subkernel.is_called_from_host:
                 assert codegen_results[func_id].host_program is None
 
-    device_preambles = set()
+    device_preambles = []
     for cgr in codegen_results.values():
-        device_preambles.update(cgr.device_preambles)
+        device_preambles.extend(cgr.device_preambles)
 
     # collecting the function declarations of callee kernels
     for in_knl_callable in program.callables_table.values():
         for preamble in in_knl_callable.generate_preambles(program.target):
-            device_preambles.update([preamble])
+            device_preambles.append(preamble)
 
     collective_device_program = codegen_results[program.name].device_programs[0]
     callee_fdecls = []
-- 
GitLab


From c12c610978b2b1ecab1a6b619f64315b241bfa0e Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 20 Nov 2018 12:45:04 -0600
Subject: [PATCH 71/80] Merge 'master' into 'new_function_interface'

---
 .gitlab-ci.yml                                | 19 ++++++++++-
 LICENSE                                       | 21 ++++++++++++
 .../make-linux-build-docker-inner-part-2.sh   |  4 +++
 loopy/frontend/fortran/tree.py                |  2 +-
 loopy/kernel/tools.py                         |  4 +--
 loopy/schedule/__init__.py                    | 10 ++++--
 loopy/statistics.py                           | 20 ++++++++----
 loopy/symbolic.py                             |  2 +-
 loopy/target/cuda.py                          |  2 +-
 loopy/target/pyopencl.py                      |  3 +-
 requirements.txt                              |  5 +--
 setup.cfg                                     |  2 +-
 test/test_loopy.py                            | 19 +++++++++++
 test/test_numa_diff.py                        |  2 +-
 test/test_reduction.py                        | 32 +++++++++++--------
 test/test_statistics.py                       | 14 +++++---
 test/test_target.py                           | 17 ++++++++++
 17 files changed, 137 insertions(+), 41 deletions(-)
 create mode 100644 LICENSE

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1caef802..ea69114d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -12,6 +12,10 @@ Python 2.7 POCL:
   - pocl
   except:
   - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
 
 Python 2.7 with legacy PyOpenCL:
   script:
@@ -29,6 +33,10 @@ Python 2.7 with legacy PyOpenCL:
   except:
   - tags
   retry: 2
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
 
 Python 3.6 POCL:
   script:
@@ -43,6 +51,10 @@ Python 3.6 POCL:
   - pocl
   except:
   - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
 
 Python 3.6 POCL Twice With Cache:
   script:
@@ -59,6 +71,10 @@ Python 3.6 POCL Twice With Cache:
   - pocl
   except:
   - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
 
 # PyPy POCL:
 #   script:
@@ -77,7 +93,7 @@ Python 3.6 POCL Examples:
   script:
   - export PY_EXE=python3.6
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib jupyter nbconvert"
+  - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert"
   - ". ./build-py-project-and-run-examples.sh"
   tags:
   - python3.6
@@ -87,6 +103,7 @@ Python 3.6 POCL Examples:
   except:
   - tags
 
+
 CentOS binary:
   script:
   - (cd build-helpers; ./make-linux-build-docker.sh --nodate)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 00000000..601df74b
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018 Andreas Klöckner and contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/build-helpers/make-linux-build-docker-inner-part-2.sh b/build-helpers/make-linux-build-docker-inner-part-2.sh
index 1e35a1e1..035634b1 100755
--- a/build-helpers/make-linux-build-docker-inner-part-2.sh
+++ b/build-helpers/make-linux-build-docker-inner-part-2.sh
@@ -23,6 +23,10 @@ git clone --recursive git://github.com/inducer/loopy
 cd loopy
 
 grep -v pyopencl requirements.txt > myreq.txt
+
+# needed for pyinstaller package to be usable
+echo packaging >> myreq.txt
+
 pip install -r myreq.txt
 python setup.py install
 
diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py
index b1df6e3d..6939bb6a 100644
--- a/loopy/frontend/fortran/tree.py
+++ b/loopy/frontend/fortran/tree.py
@@ -53,7 +53,7 @@ class FTreeWalkerBase(object):
 
     ENTITY_RE = re.compile(
             r"^(?P<name>[_0-9a-zA-Z]+)"
-            "(\((?P<shape>[-+*0-9:a-zA-Z, \t]+)\))?$")
+            r"(\((?P<shape>[-+*0-9:a-zA-Z, \t]+)\))?$")
 
     def parse_dimension_specs(self, node, dim_decls):
         def parse_bounds(bounds_str):
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 006ac6ba..3aaa8d56 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -1253,7 +1253,7 @@ def draw_dependencies_as_unicode_arrows(
         for dep in insn.depends_on:
             reverse_deps.setdefault(dep, set()).add(insn.id)
 
-    # mapping of (from_id, to_id) tuples to column_index
+    # mapping of to_id tuples to column_index
     dep_to_column = {}
 
     # {{{ find column assignments
@@ -1330,7 +1330,7 @@ def draw_dependencies_as_unicode_arrows(
 
             elif insn.id in starts:
                 starts.remove(insn.id)
-                if starts:
+                if starts or pointed_at_insn_id not in processed_ids:
                     # will continue downward
                     row[col] = do_flag_downward(u"├", pointed_at_insn_id)
 
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 2b3f7a3b..3dc1c0bb 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -794,9 +794,13 @@ def generate_loop_schedules_internal(
 
         if not is_ready:
             if debug_mode:
-                print("instruction '%s' is missing insn depedencies '%s'" % (
-                        format_insn(kernel, insn.id), ",".join(
-                            insn.depends_on - sched_state.scheduled_insn_ids)))
+                # These are not that interesting when understanding scheduler
+                # failures.
+
+                # print("instruction '%s' is missing insn depedencies '%s'" % (
+                #         format_insn(kernel, insn.id), ",".join(
+                #             insn.depends_on - sched_state.scheduled_insn_ids)))
+                pass
             continue
 
         want = kernel.insn_inames(insn) - sched_state.parallel_inames
diff --git a/loopy/statistics.py b/loopy/statistics.py
index d65387d1..454cca18 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -707,9 +707,10 @@ class CounterBase(CombineMapper):
 # {{{ ExpressionOpCounter
 
 class ExpressionOpCounter(CounterBase):
-    def __init__(self, knl, callables_table):
+    def __init__(self, knl, callables_table, count_within_subscripts=True):
         self.knl = knl
         self.callables_table = callables_table
+        self.count_within_subscripts = count_within_subscripts
         from loopy.type_inference import TypeInferenceMapper
         self.type_inf = TypeInferenceMapper(knl, callables_table)
 
@@ -737,7 +738,10 @@ class ExpressionOpCounter(CounterBase):
                     ) + self.rec(expr.parameters)
 
     def map_subscript(self, expr):
-        return self.rec(expr.index)
+        if self.count_within_subscripts:
+            return self.rec(expr.index)
+        else:
+            return ToCountMap()
 
     def map_sum(self, expr):
         assert expr.children
@@ -1343,10 +1347,9 @@ def _get_insn_count(knl, callables_table, insn_id, subgroup_size,
 
 # {{{ get_op_map
 
-
 def get_op_map_for_single_kernel(knl, callables_table,
         numpy_types=True, count_redundant_work=False,
-               subgroup_size=None):
+        count_within_subscripts=True, subgroup_size=None):
 
     if not knl.options.ignore_boostable_into:
         raise LoopyError("Kernel '%s': Using operation counting requires the option "
@@ -1394,7 +1397,7 @@ def get_op_map_for_single_kernel(knl, callables_table,
 
 
 def get_op_map(program, numpy_types=True, count_redundant_work=False,
-               subgroup_size=None):
+               count_within_subscripts=True, subgroup_size=None):
 
     """Count the number of operations in a loopy kernel.
 
@@ -1410,6 +1413,9 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False,
         (Likely desirable for performance modeling, but undesirable for code
         optimization.)
 
+    :arg count_within_subscripts: A :class:`bool` specifying whether to
+        count operations inside array indices.
+
     :arg subgroup_size: (currently unused) An :class:`int`, :class:`str`
         ``'guess'``, or *None* that specifies the sub-group size. An OpenCL
         sub-group is an implementation-dependent grouping of work-items within
@@ -1464,8 +1470,8 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False,
         if isinstance(in_knl_callable, CallableKernel):
             knl = in_knl_callable.subkernel
             knl_op_map = get_op_map_for_single_kernel(knl,
-                        program.callables_table, numpy_types,
-                        count_redundant_work, subgroup_size)
+                    program.callables_table, numpy_types, count_redundant_work,
+                    count_within_subscripts, subgroup_size)
 
             for i in range(callables_count[func_id]):
                 op_map += knl_op_map
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 92b209ac..04cf2d02 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -1696,7 +1696,7 @@ def get_access_range(domain, subscript, assumptions, shape=None,
             if shape is not None:
                 try:
                     shape_aff = guarded_aff_from_expr(access_map.space, shape[idim])
-                except ExpressionToAffineConversionError as sub_err:
+                except ExpressionToAffineConversionError:
                     pass
 
             if shape_aff is None:
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 32b810eb..6b4385bf 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -344,7 +344,7 @@ class CUDACASTBuilder(CASTBuilder):
     _VEC_AXES = "xyzw"
 
     def add_vector_access(self, access_expr, index):
-        return access_expr.a(self._VEC_AXES[index])
+        return access_expr.attr(self._VEC_AXES[index])
 
     def emit_barrier(self, synchronization_kind, mem_kind, comment):
         """
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index d98b6cdd..5ef56457 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -125,7 +125,8 @@ def adjust_local_temp_var_storage(kernel, device):
 
             new_storage_shape = storage_shape
 
-        new_temp_vars[temp_var.name] = temp_var.copy(storage_shape=new_storage_shape)
+        new_temp_vars[temp_var.name] = temp_var.copy(
+                storage_shape=tuple(new_storage_shape))
 
     return kernel.copy(temporary_variables=new_temp_vars)
 
diff --git a/requirements.txt b/requirements.txt
index a3e88cfe..97c20247 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,4 @@ git+https://github.com/inducer/codepy.git
 git+https://github.com/inducer/f2py
 
 # Optional, needed for using the C preprocessor on Fortran
-ply>=3.6
-
-# This is needed for the pyinstaller executable to be usable.
-packaging
+ply>=3.6
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index b939ce0c..eec3dfd1 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [flake8]
-ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814
+ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814,W504
 max-line-length=85
 exclude=
     loopy/target/c/compyte/ndarray,
diff --git a/test/test_loopy.py b/test/test_loopy.py
index fa32ca04..b770497f 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2890,6 +2890,25 @@ def test_dep_cycle_printing_and_error():
         print(lp.generate_code(knl).device_code())
 
 
+def test_backwards_dep_printing_and_error():
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<n}",
+            """
+            c[i] = a[i] + b[i]                       {id=insn1}
+            c[i] = 2*c[i]                            {id=insn2, dep=insn1}
+            c[i] = 7*c[i] + a[i]*a[i] + b[i]*b[i]    {id=insn3, dep=insn2}
+            b[i] = b[i] + c[i]                                 {id=insn4, dep=insn3}
+            d[i] = 7*a[i ]                                     {id=insn5, dep=insn4}
+            a[i] = a[i] + d[i]                                 {id=insn6, dep=insn5}
+            """, [
+                lp.GlobalArg('a, b', dtype=np.float64),
+                "..."
+            ])
+
+    # Used to crash with KeyError
+    print(knl)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index 62f490ce..1ba44e77 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -47,8 +47,8 @@ __all__ = [
 from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa
 
 
-@pytest.mark.parametrize("Nq", [7])
 @pytest.mark.parametrize("ilp_multiple", [1, 2])
+@pytest.mark.parametrize("Nq", [7])
 @pytest.mark.parametrize("opt_level", [11])
 def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
     ctx = ctx_factory()
diff --git a/test/test_reduction.py b/test/test_reduction.py
index 96dab405..aaf11ee2 100644
--- a/test/test_reduction.py
+++ b/test/test_reduction.py
@@ -219,32 +219,38 @@ def test_local_parallel_reduction(ctx_factory, size):
 def test_global_parallel_reduction(ctx_factory, size):
     ctx = ctx_factory()
 
-    prog = lp.make_kernel(
+    knl = lp.make_kernel(
             "{[i]: 0 <= i < n }",
             """
             # Using z[0] instead of z works around a bug in ancient PyOpenCL.
-            z[0] = sum(i, i/13)
+            z[0] = sum(i, a[i])
             """)
 
-    ref_prog = prog
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
+    ref_knl = knl
 
     gsize = 128
-    prog = lp.split_iname(prog, "i", gsize * 20)
-    prog = lp.split_iname(prog, "i_inner", gsize, outer_tag="l.0")
-    prog = lp.split_reduction_inward(prog, "i_inner_inner")
-    prog = lp.split_reduction_inward(prog, "i_inner_outer")
+    knl = lp.split_iname(knl, "i", gsize * 20)
+    knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0")
+    knl = lp.split_reduction_outward(knl, "i_outer")
+    knl = lp.split_reduction_inward(knl, "i_inner_outer")
     from loopy.transform.data import reduction_arg_to_subst_rule
-    prog = reduction_arg_to_subst_rule(prog, "i_outer")
-    prog = lp.precompute(prog, "red_i_outer_arg", "i_outer",
+    knl = reduction_arg_to_subst_rule(knl, "i_outer")
+
+    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer",
             temporary_scope=lp.temp_var_scope.GLOBAL,
             default_tag="l.auto")
-    prog = lp.realize_reduction(prog)
-    prog = lp.add_dependency(
-            prog, "writes:acc_i_outer",
+    knl = lp.realize_reduction(knl)
+    knl = lp.tag_inames(knl, "i_outer_0:g.0")
+
+    # Keep the i_outer accumulator on the  correct (lower) side of the barrier,
+    # otherwise there will be useless save/reload code generated.
+    knl = lp.add_dependency(
+            knl, "writes:acc_i_outer",
             "id:red_i_outer_arg_barrier")
 
     lp.auto_test_vs_ref(
-            ref_prog, ctx, prog, parameters={"n": size},
+            ref_knl, ctx, knl, parameters={"n": size},
             print_ref_code=True)
 
 
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 3f236652..41b44b5a 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -57,7 +57,8 @@ def test_op_counter_basic():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+                           count_within_subscripts=True)
     n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, SGS)
@@ -161,7 +162,8 @@ def test_op_counter_specialops():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+                           count_within_subscripts=True)
     n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, SGS)
@@ -206,7 +208,8 @@ def test_op_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int64, h=np.int64))
 
-    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+                           count_within_subscripts=False)
     n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, SGS)
@@ -226,7 +229,7 @@ def test_op_counter_bitwise():
     i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP)
                       ].eval_with_dict(params)
     # (count-per-sub-group)*n_subgroups
-    assert i32add == n*m+n*m*ell*n_subgroups
+    assert i32add == n*m*ell*n_subgroups
     assert i32bw == 2*n*m*ell*n_subgroups
     assert i64bw == 2*n*m*n_subgroups
     assert i64add == i64mul == n*m*n_subgroups
@@ -1153,7 +1156,8 @@ def test_summations_and_filters():
     assert f32lall == (3*n*m*ell)*n_subgroups
     assert f64lall == (2*n*m)*n_subgroups
 
-    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+                           count_within_subscripts=True)
     #for k, v in op_map.items():
     #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)
 
diff --git a/test/test_target.py b/test/test_target.py
index a5186c71..095bf093 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -350,6 +350,23 @@ def test_ispc_streaming_stores():
     lp.generate_code_v2(knl).all_code()
 
 
+def test_cuda_short_vector():
+    knl = lp.make_kernel(
+        "{ [i]: 0<=i<n }",
+        "out[i] = 2*a[i]",
+        target=lp.CudaTarget())
+
+    knl = lp.set_options(knl, write_code=True)
+    knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec")
+    knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4)
+    knl = lp.tag_array_axes(knl, "a,out", "C,vec")
+
+    knl = lp.set_options(knl, write_wrapper=True)
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
+
+    print(lp.generate_code_v2(knl).device_code())
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
-- 
GitLab


From cb151a4bdae8a1a9643ce6a6c93da80e5b5e56de Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 20 Nov 2018 13:23:59 -0600
Subject: [PATCH 72/80] another one of ArrayBase errors

---
 loopy/kernel/array.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 6bf733a8..0ed1f940 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -834,6 +834,7 @@ class ArrayBase(ImmutableRecord):
                 order=order,
                 alignment=alignment,
                 for_atomic=for_atomic,
+                target=target,
                 **kwargs)
 
     def __eq__(self, other):
-- 
GitLab


From a385bd0632e26896a55978e4064a145fbf24a93b Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 21 Nov 2018 05:27:09 -0600
Subject: [PATCH 73/80] import changes from statistics to count within
 subscripts

---
 loopy/statistics.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 454cca18..88aa49bb 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -1358,7 +1358,8 @@ def get_op_map_for_single_kernel(knl, callables_table,
     subgroup_size = _process_subgroup_size(knl, subgroup_size)
 
     op_map = ToCountMap()
-    op_counter = ExpressionOpCounter(knl, callables_table)
+    op_counter = ExpressionOpCounter(knl, callables_table,
+            count_within_subscripts)
 
     from loopy.kernel.instruction import (
             CallInstruction, CInstruction, Assignment,
-- 
GitLab


From dc0f57d8bb1fee4ed9fd4a7f6ccb39dc9a81d502 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 21 Nov 2018 09:06:27 -0600
Subject: [PATCH 74/80] Some more merge leftovers from new_function_interface

---
 loopy/kernel/__init__.py    | 67 ++++++++++++++++++++++++++++++++-----
 loopy/kernel/creation.py    |  7 +++-
 loopy/transform/callable.py | 64 ++++++++++++++++++-----------------
 3 files changed, 97 insertions(+), 41 deletions(-)

diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 928eed26..26db6ec4 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1036,20 +1036,17 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 constants_only=True)))
 
     @memoize_method
-    def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table,
-            ignore_auto=False):
+    def get_grid_sizes_for_insn_ids_as_dicts(self, insn_ids,
+            callables_table, ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
         could accommodate execution of all instructions whose IDs are given
         in *insn_ids*.
+
         :arg insn_ids: a :class:`frozenset` of instruction IDs
-        *global_size* and *local_size* are :class:`islpy.PwAff` objects.
-        """
 
-        if self.overridden_get_grid_sizes_for_insn_ids:
-            return self.overridden_get_grid_sizes_for_insn_ids(
-                    insn_ids,
-                    callables_table,
-                    ignore_auto=ignore_auto)
+        *global_size* and *local_size* are instances of :class:`dict` that
+        map each ``axis`` to an :class:`islpy.PwAff` object.
+        """
 
         # {{{ collecting the callee kernels in insn_ids
 
@@ -1124,6 +1121,58 @@ class LoopKernel(ImmutableRecordWithoutPickling):
         return global_sizes, local_sizes
 
     @memoize_method
+    def get_grid_sizes_for_insn_ids(self, insn_ids, callables_table,
+            ignore_auto=False):
+        """Return a tuple (global_size, local_size) containing a grid that
+        could accommodate execution of all instructions whose IDs are given
+        in *insn_ids*.
+
+        :arg insn_ids: a :class:`frozenset` of instruction IDs
+
+        *global_size* and *local_size* are :class:`islpy.PwAff` objects.
+        """
+
+        if self.overridden_get_grid_sizes_for_insn_ids:
+            return self.overridden_get_grid_sizes_for_insn_ids(
+                    insn_ids,
+                    callables_table=callables_table,
+                    ignore_auto=ignore_auto)
+
+        assert self.is_called_from_host, ("Callee kernels do not have sufficient "
+                "information to compute grid sizes.")
+
+        global_sizes, local_sizes = self.get_grid_sizes_for_insn_ids_as_dicts(
+                insn_ids, callables_table, ignore_auto=ignore_auto)
+
+        def to_dim_tuple(size_dict, which, forced_sizes={}):
+            forced_sizes = forced_sizes.copy()
+
+            size_list = []
+            sorted_axes = sorted(six.iterkeys(size_dict))
+
+            while sorted_axes or forced_sizes:
+                if sorted_axes:
+                    cur_axis = sorted_axes.pop(0)
+                else:
+                    cur_axis = None
+
+                if len(size_list) in forced_sizes:
+                    size_list.append(forced_sizes.pop(len(size_list)))
+                    continue
+
+                assert cur_axis is not None
+
+                if cur_axis > len(size_list):
+                    raise LoopyError("%s axis %d unused for %s" % (
+                        which, len(size_list), self.name))
+
+                size_list.append(size_dict[cur_axis])
+
+            return tuple(size_list)
+
+        return (to_dim_tuple(global_sizes, "global"),
+                to_dim_tuple(local_sizes, "local", forced_sizes=self.local_sizes))
+
     def get_grid_sizes_for_insn_ids_as_exprs(self, insn_ids,
             callables_table, ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 060b5d76..52e299b6 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -2146,7 +2146,7 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
 
     # {{{ handle kernel language version
 
-    if is_callee_kernel:
+    if not is_callee_kernel:
         from loopy.version import LANGUAGE_VERSION_SYMBOLS
 
         version_to_symbol = dict(
@@ -2353,6 +2353,11 @@ def make_kernel(domains, instructions, kernel_data=["..."], **kwargs):
 
 
 def make_function(*args, **kwargs):
+    lang_version = kwargs.pop('lang_version', None)
+    if lang_version:
+        raise LoopyError("lang_version should be set for program, not "
+                "functions.")
+
     kwargs['is_callee_kernel'] = True
     return make_kernel(*args, **kwargs)
 
diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py
index 532f6021..e293543f 100644
--- a/loopy/transform/callable.py
+++ b/loopy/transform/callable.py
@@ -173,7 +173,7 @@ def register_callable_kernel(program, callee_kernel):
     expected_num_assignees = len([arg for arg in callee_kernel.args if
         arg.is_output_only])
     expected_num_parameters = len(callee_kernel.args) - expected_num_assignees
-    for in_knl_callable in program.program_callables_info.values():
+    for in_knl_callable in program.callables_table.values():
         if isinstance(in_knl_callable, CallableKernel):
             caller_kernel = in_knl_callable.subkernel
             for insn in caller_kernel.instructions:
@@ -211,8 +211,9 @@ def register_callable_kernel(program, callee_kernel):
 
     # take the function resolvers from the Program and resolve the functions in
     # the callee kernel
-    program_callables_info = (
-            program.program_callables_info.with_edit_callables_mode())
+    old_callables_count = program.callables_table.callables_count
+    callables_table = (
+            program.callables_table.with_edit_callables_mode())
 
     from loopy.symbolic import SubstitutionRuleMappingContext
     rule_mapping_context = SubstitutionRuleMappingContext(
@@ -220,16 +221,17 @@ def register_callable_kernel(program, callee_kernel):
             callee_kernel.get_var_name_generator())
 
     resolved_function_marker = ResolvedFunctionMarker(
-            rule_mapping_context, callee_kernel, program_callables_info,
+            rule_mapping_context, callee_kernel, callables_table,
             program.func_id_to_in_knl_callable_mappers)
 
     callee_kernel = rule_mapping_context.finish_kernel(
             resolved_function_marker.map_kernel(callee_kernel))
-    program_callables_info = resolved_function_marker.program_callables_info
+    callables_table = resolved_function_marker.callables_table
 
-    program_callables_info = (
-            program_callables_info.with_exit_edit_callables_mode())
-    program = program.copy(program_callables_info=program_callables_info)
+    callables_table = (
+            callables_table.with_exit_edit_callables_mode(
+                old_callables_count))
+    program = program.copy(callables_table=callables_table)
 
     # making the target of the child kernel to be same as the target of parent
     # kernel.
@@ -492,26 +494,26 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction):
 # {{{ inline callable kernel
 
 def _inline_single_callable_kernel(caller_kernel, function_name,
-        program_callables_info):
+        callables_table):
     old_insns = caller_kernel.instructions
     for insn in old_insns:
         if isinstance(insn, CallInstruction):
             # FIXME This seems to use identifiers across namespaces. Why not
             # check whether the function is a scoped function first? ~AK
-            if insn.expression.function.name in program_callables_info:
-                history_of_identifier = program_callables_info.history[
+            if insn.expression.function.name in callables_table:
+                history_of_identifier = callables_table.history[
                         insn.expression.function.name]
 
                 if function_name in history_of_identifier:
-                    in_knl_callable = program_callables_info[
+                    in_knl_callable = callables_table[
                             insn.expression.function.name]
                     assert isinstance(in_knl_callable, CallableKernel)
                     caller_kernel = _inline_call_instruction(
                             caller_kernel, in_knl_callable.subkernel, insn)
-                    program_callables_info = (
-                            program_callables_info.with_deleted_callable(
+                    callables_table = (
+                            callables_table.with_deleted_callable(
                                 insn.expression.function.name,
-                                program_callables_info.num_times_callables_called[
+                                callables_table.num_times_callables_called[
                                     caller_kernel.name]))
         elif isinstance(insn, (MultiAssignmentBase, CInstruction,
                 _DataObliviousInstruction)):
@@ -521,7 +523,7 @@ def _inline_single_callable_kernel(caller_kernel, function_name,
                     "Unknown instruction type %s"
                     % type(insn).__name__)
 
-    return caller_kernel, program_callables_info
+    return caller_kernel, callables_table
 
 
 # FIXME This should take a 'within' parameter to be able to only inline
@@ -533,33 +535,33 @@ def inline_callable_kernel(program, function_name):
     """
     from loopy.preprocess import infer_arg_descr
     program = infer_arg_descr(program)
-    program_callables_info = program.program_callables_info
-    old_program_callables_info = program_callables_info.copy()
+    callables_table = program.callables_table
+    old_callables_table = callables_table.copy()
 
     edited_callable_kernels = {}
 
-    for func_id, in_knl_callable in old_program_callables_info.items():
-        if function_name not in old_program_callables_info.history[func_id] and (
+    for func_id, in_knl_callable in old_callables_table.items():
+        if function_name not in old_callables_table.history[func_id] and (
                 isinstance(in_knl_callable, CallableKernel)):
             caller_kernel = in_knl_callable.subkernel
-            caller_kernel, program_callables_info = (
+            caller_kernel, callables_table = (
                     _inline_single_callable_kernel(caller_kernel,
                         function_name,
-                        program_callables_info))
+                        callables_table))
             edited_callable_kernels[func_id] = in_knl_callable.copy(
                     subkernel=caller_kernel)
 
     new_resolved_functions = {}
-    for func_id, in_knl_callable in program_callables_info.items():
+    for func_id, in_knl_callable in callables_table.items():
         if func_id in edited_callable_kernels:
             new_resolved_functions[func_id] = edited_callable_kernels[func_id]
         else:
             new_resolved_functions[func_id] = in_knl_callable
 
-    program_callables_info = program_callables_info.copy(
+    callables_table = callables_table.copy(
             resolved_functions=new_resolved_functions)
 
-    return program.copy(program_callables_info=program_callables_info)
+    return program.copy(callables_table=callables_table)
 
 # }}}
 
@@ -719,20 +721,20 @@ def _match_caller_callee_argument_dimension_(program, callee_function_name):
             callee_function_name).map_kernel
 
     caller_knl,  = [in_knl_callable.subkernel for in_knl_callable in
-            program.program_callables_info.values() if isinstance(in_knl_callable,
+            program.callables_table.values() if isinstance(in_knl_callable,
                 CallableKernel) and
             is_invoking_callee(in_knl_callable.subkernel)]
 
-    old_callee_knl = program.program_callables_info[
+    old_callee_knl = program.callables_table[
             callee_function_name].subkernel
     new_callee_kernel = _match_caller_callee_argument_dimension_for_single_kernel(
             caller_knl, old_callee_knl)
 
-    new_program_callables_info = program.program_callables_info.copy()
-    new_program_callables_info.resolved_functions[callee_function_name] = (
-            new_program_callables_info[callee_function_name].copy(
+    new_callables_table = program.callables_table.copy()
+    new_callables_table.resolved_functions[callee_function_name] = (
+            new_callables_table[callee_function_name].copy(
                 subkernel=new_callee_kernel))
-    return program.copy(program_callables_info=new_program_callables_info)
+    return program.copy(callables_table=new_callables_table)
 
 # }}}
 
-- 
GitLab


From 20371326ee0fad5ad62217231bb35e7aa65fe11b Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 21 Nov 2018 10:03:36 -0600
Subject: [PATCH 75/80] some more program_callables_info -> callables_table

---
 loopy/transform/callable.py             | 46 ++++++++++++-------------
 loopy/transform/pack_and_unpack_args.py | 14 ++++----
 2 files changed, 29 insertions(+), 31 deletions(-)

diff --git a/loopy/transform/callable.py b/loopy/transform/callable.py
index e293543f..f812b8ea 100644
--- a/loopy/transform/callable.py
+++ b/loopy/transform/callable.py
@@ -31,7 +31,7 @@ from loopy.kernel import LoopKernel
 from pytools import ImmutableRecord
 from loopy.diagnostic import LoopyError
 from loopy.kernel.instruction import (CallInstruction, MultiAssignmentBase,
-        CInstruction, _DataObliviousInstruction)
+        Assignment, CInstruction, _DataObliviousInstruction)
 from loopy.symbolic import IdentityMapper, SubstitutionMapper, CombineMapper
 from loopy.isl_helpers import simplify_via_aff
 from loopy.kernel.function_interface import (get_kw_pos_association,
@@ -211,26 +211,19 @@ def register_callable_kernel(program, callee_kernel):
 
     # take the function resolvers from the Program and resolve the functions in
     # the callee kernel
-    old_callables_count = program.callables_table.callables_count
-    callables_table = (
-            program.callables_table.with_edit_callables_mode())
-
     from loopy.symbolic import SubstitutionRuleMappingContext
     rule_mapping_context = SubstitutionRuleMappingContext(
             callee_kernel.substitutions,
             callee_kernel.get_var_name_generator())
 
     resolved_function_marker = ResolvedFunctionMarker(
-            rule_mapping_context, callee_kernel, callables_table,
+            rule_mapping_context, callee_kernel, program.callables_table,
             program.func_id_to_in_knl_callable_mappers)
 
     callee_kernel = rule_mapping_context.finish_kernel(
             resolved_function_marker.map_kernel(callee_kernel))
-    callables_table = resolved_function_marker.callables_table
+    callables_table = resolved_function_marker.callables_table.copy()
 
-    callables_table = (
-            callables_table.with_exit_edit_callables_mode(
-                old_callables_count))
     program = program.copy(callables_table=callables_table)
 
     # making the target of the child kernel to be same as the target of parent
@@ -462,15 +455,25 @@ def _inline_call_instruction(caller_kernel, callee_knl, instruction):
                 type(atomicity)(var_map[p.Variable(atomicity.var_name)].name)
                 for atomicity in insn.atomicity)
 
-        insn = insn.copy(
-            id=insn_id[insn.id],
-            within_inames=within_inames,
-            # TODO: probaby need to keep priority in callee kernel
-            priority=instruction.priority,
-            depends_on=depends_on,
-            tags=insn.tags | instruction.tags,
-            atomicity=new_atomicity
-        )
+        if isinstance(insn, Assignment):
+            insn = insn.copy(
+                id=insn_id[insn.id],
+                within_inames=within_inames,
+                # TODO: probably need to keep priority in callee kernel
+                priority=instruction.priority,
+                depends_on=depends_on,
+                tags=insn.tags | instruction.tags,
+                atomicity=new_atomicity
+            )
+        else:
+            insn = insn.copy(
+                id=insn_id[insn.id],
+                within_inames=within_inames,
+                # TODO: probably need to keep priority in callee kernel
+                priority=instruction.priority,
+                depends_on=depends_on,
+                tags=insn.tags | instruction.tags,
+            )
         inner_insns.append(insn)
 
     inner_insns.append(noop_end)
@@ -510,11 +513,6 @@ def _inline_single_callable_kernel(caller_kernel, function_name,
                     assert isinstance(in_knl_callable, CallableKernel)
                     caller_kernel = _inline_call_instruction(
                             caller_kernel, in_knl_callable.subkernel, insn)
-                    callables_table = (
-                            callables_table.with_deleted_callable(
-                                insn.expression.function.name,
-                                callables_table.num_times_callables_called[
-                                    caller_kernel.name]))
         elif isinstance(insn, (MultiAssignmentBase, CInstruction,
                 _DataObliviousInstruction)):
             pass
diff --git a/loopy/transform/pack_and_unpack_args.py b/loopy/transform/pack_and_unpack_args.py
index 73407257..e5ed850c 100644
--- a/loopy/transform/pack_and_unpack_args.py
+++ b/loopy/transform/pack_and_unpack_args.py
@@ -37,7 +37,7 @@ __doc__ = """
 
 
 def pack_and_unpack_args_for_call_for_single_kernel(kernel,
-        program_callables_info, call_name, args_to_pack=None,
+        callables_table, call_name, args_to_pack=None,
         args_to_unpack=None):
     """
     Returns a a copy of *kernel* with instructions appended to copy the
@@ -63,10 +63,10 @@ def pack_and_unpack_args_for_call_for_single_kernel(kernel,
         if not isinstance(insn, CallInstruction):
             # pack and unpack call only be done for CallInstructions.
             continue
-        if insn.expression.function.name not in program_callables_info:
+        if insn.expression.function.name not in callables_table:
             continue
 
-        in_knl_callable = program_callables_info[
+        in_knl_callable = callables_table[
                 insn.expression.function.name]
 
         if in_knl_callable.name != call_name:
@@ -324,10 +324,10 @@ def pack_and_unpack_args_for_call(program, *args, **kwargs):
     assert isinstance(program, Program)
 
     new_resolved_functions = {}
-    for func_id, in_knl_callable in program.program_callables_info.items():
+    for func_id, in_knl_callable in program.callables_table.items():
         if isinstance(in_knl_callable, CallableKernel):
             new_subkernel = pack_and_unpack_args_for_call_for_single_kernel(
-                    in_knl_callable.subkernel, program.program_callables_info,
+                    in_knl_callable.subkernel, program.callables_table,
                     *args, **kwargs)
             in_knl_callable = in_knl_callable.copy(
                     subkernel=new_subkernel)
@@ -340,8 +340,8 @@ def pack_and_unpack_args_for_call(program, *args, **kwargs):
 
         new_resolved_functions[func_id] = in_knl_callable
 
-    new_program_callables_info = program.program_callables_info.copy(
+    new_callables_table = program.callables_table.copy(
             resolved_functions=new_resolved_functions)
-    return program.copy(program_callables_info=new_program_callables_info)
+    return program.copy(callables_table=new_callables_table)
 
 # vim: foldmethod=marker
-- 
GitLab


From 600f9d1bdcf3f9f46fb7a56cd9c5fc00ce84a555 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Wed, 21 Nov 2018 10:42:01 -0600
Subject: [PATCH 76/80] re-adds some missing checks

---
 loopy/check.py             | 4 ++--
 loopy/target/c/__init__.py | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/loopy/check.py b/loopy/check.py
index 82b99a43..659e210f 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -729,8 +729,8 @@ def pre_schedule_checks(kernel, callables_table):
         check_for_data_dependent_parallel_bounds(kernel)
         check_bounds(kernel)
         check_write_destinations(kernel)
-        # check_has_schedulable_iname_nesting(kernel)
-        # check_variable_access_ordered(kernel)
+        check_has_schedulable_iname_nesting(kernel)
+        check_variable_access_ordered(kernel)
 
         logger.debug("%s: pre-schedule check: done" % kernel.name)
     except KeyboardInterrupt:
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index ca4d6b00..ac3dec32 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -435,7 +435,7 @@ class CMathCallable(ScalarCallable):
             if dtype.kind == "c":
                 raise LoopyTypeError("%s does not support complex numbers")
 
-            elif dtype.kind == "f" and name in ["fmax", "fmin"]:
+            elif dtype.kind == "f" or name in ["fmax", "fmin"]:
                 from loopy.target.opencl import OpenCLTarget
                 if not isinstance(caller_kernel.target, OpenCLTarget):
                     if dtype == np.float64:
-- 
GitLab


From 1d48377532bc8092bbc613fa09a63f166047ef10 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Thu, 22 Nov 2018 04:17:28 -0600
Subject: [PATCH 77/80] reverted the changes in type inference

---
 loopy/target/c/__init__.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index ac3dec32..58051e42 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -435,7 +435,7 @@ class CMathCallable(ScalarCallable):
             if dtype.kind == "c":
                 raise LoopyTypeError("%s does not support complex numbers")
 
-            elif dtype.kind == "f" or name in ["fmax", "fmin"]:
+            elif dtype.kind == "f":
                 from loopy.target.opencl import OpenCLTarget
                 if not isinstance(caller_kernel.target, OpenCLTarget):
                     if dtype == np.float64:
-- 
GitLab


From a840eed1fed2dd3f0ba636f7f2cd9ae446d55531 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Thu, 22 Nov 2018 05:55:49 -0600
Subject: [PATCH 78/80] minor changes to relax type inference

---
 loopy/statistics.py     | 5 +++++
 loopy/type_inference.py | 4 ++++
 2 files changed, 9 insertions(+)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 965c164e..c621ea72 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -34,6 +34,8 @@ from loopy.kernel.data import (
 from loopy.diagnostic import warn_with_kernel, LoopyError
 from pytools import Record, memoize_method
 from loopy.kernel.function_interface import ScalarCallable, CallableKernel
+from loopy.kernel import LoopKernel
+from loopy.program import make_program
 
 
 __doc__ = """
@@ -1458,6 +1460,9 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False,
 
     """
 
+    if isinstance(program, LoopKernel):
+        program = make_program(program)
+
     from loopy.preprocess import preprocess_program, infer_unknown_types
     program = infer_unknown_types(program, expect_completion=True)
     program = preprocess_program(program)
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 4137709e..5047dcc2 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -457,6 +457,10 @@ class TypeInferenceMapper(CombineMapper):
                                         np.int64):
                             continue
 
+                        if np.can_cast(arg_id_to_dtype[id].dtype.type,
+                                in_knl_callable.arg_id_to_dtype[id].dtype.type):
+                            continue
+
                         # }}}
 
                         raise LoopyError("Overwriting a specialized function "
-- 
GitLab


From 237b7ef44125410dd3d7a23f75fa3a838331e560 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Thu, 22 Nov 2018 06:04:25 -0600
Subject: [PATCH 79/80] some more leftover program_callables_info ->
 callables_table

---
 examples/python/call-external.py   |  6 +++---
 loopy/kernel/function_interface.py | 16 ++++++++--------
 loopy/kernel/tools.py              |  6 +++---
 3 files changed, 14 insertions(+), 14 deletions(-)

diff --git a/examples/python/call-external.py b/examples/python/call-external.py
index 68618a7e..c13d99bd 100644
--- a/examples/python/call-external.py
+++ b/examples/python/call-external.py
@@ -7,14 +7,14 @@ from loopy.target.c import CTarget
 # {{{ blas callable
 
 class BLASCallable(lp.ScalarCallable):
-    def with_types(self, arg_id_to_dtype, kernel, program_callables_info):
+    def with_types(self, arg_id_to_dtype, kernel, callables_table):
         for i in range(0, 2):
             if i not in arg_id_to_dtype or arg_id_to_dtype[i] is None:
                 # the types provided aren't mature enough to specialize the
                 # callable
                 return (
                         self.copy(arg_id_to_dtype=arg_id_to_dtype),
-                        program_callables_info)
+                        callables_table)
 
         mat_dtype = arg_id_to_dtype[0].numpy_dtype
         vec_dtype = arg_id_to_dtype[1].numpy_dtype
@@ -34,7 +34,7 @@ class BLASCallable(lp.ScalarCallable):
         from loopy.types import NumpyType
         return self.copy(name_in_target=name_in_target,
                 arg_id_to_dtype={0: NumpyType(vec_dtype), 1: NumpyType(vec_dtype),
-                    -1: NumpyType(vec_dtype)}), program_callables_info
+                    -1: NumpyType(vec_dtype)}), callables_table
 
     def emit_call_insn(self, insn, target, expression_to_code_mapper):
         assert self.is_ready_for_codegen()
diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
index fa7a87fe..3e628f5c 100644
--- a/loopy/kernel/function_interface.py
+++ b/loopy/kernel/function_interface.py
@@ -532,7 +532,7 @@ class CallableKernel(InKernelCallable):
         return self.subkernel.name
 
     def with_types(self, arg_id_to_dtype, caller_kernel,
-            program_callables_info):
+            callables_table):
         kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)
 
         new_args = []
@@ -555,10 +555,10 @@ class CallableKernel(InKernelCallable):
 
         # infer the types of the written variables based on the knowledge
         # of the types of the arguments supplied
-        specialized_kernel, program_callables_info = (
+        specialized_kernel, callables_table = (
                 infer_unknown_types_for_a_single_kernel(
                     pre_specialized_subkernel,
-                    program_callables_info,
+                    callables_table,
                     expect_completion=True))
 
         new_arg_id_to_dtype = {}
@@ -571,9 +571,9 @@ class CallableKernel(InKernelCallable):
         # Return the kernel call with specialized subkernel and the corresponding
         # new arg_id_to_dtype
         return self.copy(subkernel=specialized_kernel,
-                arg_id_to_dtype=new_arg_id_to_dtype), program_callables_info
+                arg_id_to_dtype=new_arg_id_to_dtype), callables_table
 
-    def with_descrs(self, arg_id_to_descr, program_callables_info):
+    def with_descrs(self, arg_id_to_descr, callables_table):
 
         # tune the subkernel so that we have the matching shapes and
         # dim_tags
@@ -602,15 +602,15 @@ class CallableKernel(InKernelCallable):
                         type(descr))
         descriptor_specialized_knl = self.subkernel.copy(args=new_args)
         from loopy.preprocess import traverse_to_infer_arg_descr
-        descriptor_specialized_knl, program_callables_info = (
+        descriptor_specialized_knl, callables_table = (
                 traverse_to_infer_arg_descr(descriptor_specialized_knl,
-                    program_callables_info))
+                    callables_table))
 
         return (
                 self.copy(
                     subkernel=descriptor_specialized_knl,
                     arg_id_to_descr=arg_id_to_descr),
-                program_callables_info)
+                callables_table)
 
     def with_packing_for_args(self):
         from loopy.kernel.data import AddressSpace
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 125577c9..26856d64 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -1866,7 +1866,7 @@ def find_aliasing_equivalence_classes(kernel):
 
 # {{{ callee kernel tools
 
-def get_direct_callee_kernels(kernel, program_callables_info, insn_ids=None,):
+def get_direct_callee_kernels(kernel, callables_table, insn_ids=None,):
     """
     Returns an instance of :class:`frozenset` of all the callee kernels
     called in instructions in the *kernel* whose IDs are given in *insn_ids*.
@@ -1892,8 +1892,8 @@ def get_direct_callee_kernels(kernel, program_callables_info, insn_ids=None,):
         from loopy.kernel.instruction import (CallInstruction,
                 MultiAssignmentBase, CInstruction, _DataObliviousInstruction)
         if isinstance(insn, CallInstruction):
-            if insn.expression.function.name in program_callables_info:
-                in_knl_callable = program_callables_info[
+            if insn.expression.function.name in callables_table:
+                in_knl_callable = callables_table[
                         insn.expression.function.name]
                 if isinstance(in_knl_callable, CallableKernel):
                     return in_knl_callable.subkernel
-- 
GitLab


From 608ac4016fdba92e87a7df384560dac9d2979eb4 Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Thu, 22 Nov 2018 06:29:06 -0600
Subject: [PATCH 80/80] ArrayArg->GlobalArg

---
 doc/tutorial.rst | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index c134e4fb..25082f88 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1145,7 +1145,7 @@ the right by 1 in parallel:
    ...     end
    ...     """,
    ...      [
-   ...         lp.ArrayArg("arr", shape=("n",), dtype=np.int32),
+   ...         lp.GlobalArg("arr", shape=("n",), dtype=np.int32),
    ...          "...",
    ...      ],
    ...     name="rotate_v1",
@@ -1189,7 +1189,7 @@ Let us start with an example. Consider the kernel from above with a
    ...     end
    ...     """,
    ...      [
-   ...         lp.ArrayArg("arr", shape=("n",), dtype=np.int32),
+   ...         lp.GlobalArg("arr", shape=("n",), dtype=np.int32),
    ...          "...",
    ...      ],
    ...     name="rotate_v2",
@@ -1323,8 +1323,8 @@ tagged, as in the following example::
             "{ [i]: 0<=i<n }",
             "out[i%20] = out[i%20] + 2*a[i] {atomic}",
             [
-                lp.ArrayArg("out", dtype, shape=lp.auto, for_atomic=True),
-                lp.ArrayArg("a", dtype, shape=lp.auto),
+                lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True),
+                lp.GlobalArg("a", dtype, shape=lp.auto),
                 "..."
                 ],
             assumptions="n>0")
-- 
GitLab