diff --git a/loopy/__init__.py b/loopy/__init__.py
index 89683e0b466714700f18b090ec365d5861ea4d05..4fa8c5fc5a4dbef134eae0d237961fe495ca681d 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -116,6 +116,8 @@ from loopy.transform.batch import to_batched
 from loopy.transform.parameter import assume, fix_parameters
 from loopy.transform.save import save_and_reload_temporaries
 from loopy.transform.add_barrier import add_barrier
+from loopy.transform.register_knl import register_callable_kernel
+
 # }}}
 
 from loopy.type_inference import infer_unknown_types
@@ -222,6 +224,8 @@ __all__ = [
 
         "add_barrier",
 
+        "register_callable_kernel",
+
         # }}}
 
         "get_dot_dependency_graph",
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index d716f0b785f83a84d78475f71ecc76ec23c4c683..25737786cdf2fb4fdda115a22c5e644bfabbebe6 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1339,7 +1339,6 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             "temporary_variables",
             "iname_to_tag",
             "substitutions",
-            "scoped_functions",
             "iname_slab_increments",
             "loop_priority",
             "silenced_warnings",
@@ -1362,6 +1361,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             "preamble_generators",
             "function_manglers",
             "symbol_manglers",
+            "scoped_functions",
             )
 
     def update_persistent_hash(self, key_hash, key_builder):
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index c90e8a64b6f47a87e87c5e64d2ef930232d34894..59297e4752f944f751111e8c4ece2f2141afbc03 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -607,6 +607,13 @@ class SubstitutionRule(ImmutableRecord):
 # {{{ function call mangling
 
 class CallMangleInfo(ImmutableRecord):
+    def __init__(self):
+        raise NotImplementedError("New Mangler interface expected")
+
+
+# FIXME: Uncomment this once everything is done.
+# KK: Disabled until the new mangler interface starts working.
+'''
     """
     .. attribute:: target_name
 
@@ -631,6 +638,7 @@ class CallMangleInfo(ImmutableRecord):
                 target_name=target_name,
                 result_dtypes=result_dtypes,
                 arg_dtypes=arg_dtypes)
+'''
 
 # }}}
 
diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
index 7127d142b9443062553d92b8f1c5eba1182e7b22..bb88cc0916de1264ede05360554dfc1be1e7dbf0 100644
--- a/loopy/kernel/function_interface.py
+++ b/loopy/kernel/function_interface.py
@@ -54,6 +54,13 @@ class ArrayArgDescriptor(ArgDescriptor):
             shape=None,
             mem_scope=None,
             dim_tags=None):
+
+        # {{{ sanity checks
+
+        assert isinstance(shape, tuple)
+
+        # }}}
+
         super(ArgDescriptor, self).__init__(shape=None,
                 mem_scope=mem_scope,
                 dim_tags=dim_tags)
@@ -299,11 +306,11 @@ class InKernelCallable(ImmutableRecord):
                 raise NotImplementedError("InKernelCallable.with_types() for"
                         " %s target" % target)
 
-        # }}}
+            if new_arg_id_to_dtype is not None:
+                # got our specialized function
+                return self.copy(arg_id_to_dtype=new_arg_id_to_dtype)
 
-        if new_arg_id_to_dtype is not None:
-            # got our speciliazed function
-            return self.copy(arg_id_to_dtype=new_arg_id_to_dtype)
+        # }}}
 
         if self.subkernel is None:
             # did not find a scalar function and function prototype does not
@@ -326,7 +333,7 @@ class InKernelCallable(ImmutableRecord):
                 new_args.append(arg.copy(
                     dtype=arg_id_to_dtype[kw_to_pos[kw]]))
             else:
-                if kw in self.subkernel.read_variables():
+                if kw in self.subkernel.get_read_variables():
                     # need to know the type of the input arguments for type
                     # inference
                     raise LoopyError("Type of %s variable not supplied to the"
@@ -395,7 +402,7 @@ class InKernelCallable(ImmutableRecord):
             # in the array call.
 
             # Collecting the parameters
-            new_args = self.args.copy()
+            new_args = self.subkernel.args.copy()
             kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)
 
             for id, descr in arg_id_to_descr.items():
@@ -441,20 +448,59 @@ class InKernelCallable(ImmutableRecord):
 
     def get_target_specific_name(self, target):
 
+        if self.subkernel is None:
+            raise NotImplementedError()
+        else:
+            return self.subkernel.name
+
         raise NotImplementedError()
 
-    def emit_call(self, target):
-        # two varieties of this call, when obtained in between a function and
-        # when obtained as a separate instruction statement.
+    def emit_call(self, insn, target, expression_to_code_mapper):
 
-        raise NotImplementedError()
+        from loopy.kernel.instruction import CallInstruction
+        from pymbolic.primitives import CallWithKwargs
+
+        assert isinstance(insn, CallInstruction)
+
+        parameters = insn.expression.parameters
+        kw_parameters = {}
+        if isinstance(insn.expression, CallWithKwargs):
+            kw_parameters = insn.expression.kw_parameters
+
+        assignees = insn.assignees
+
+        parameters = list(parameters)
+        par_dtypes = [self.arg_id_to_dtype[i] for i, _ in enumerate(parameters)]
+        kw_to_pos, pos_to_kw = get_kw_pos_association(self.subkernel)
+        for i in range(len(parameters), len(parameters)+len(kw_parameters)):
+            parameters.append(kw_parameters[pos_to_kw[i]])
+            par_dtypes.append(self.arg_id_to_dtype[pos_to_kw[i]])
+
+        # TODO: currently no support for insn keywords.
+        parameters = parameters + list(assignees)
+        par_dtypes = par_dtypes + [self.arg_id_to_dtype[-i-1] for i, _ in
+                enumerate(assignees)]
+
+        # Note that we are not going to do any type casting in array calls.
+        from loopy.expression import dtype_to_type_context
+        from pymbolic.mapper.stringifier import PREC_NONE
+        c_parameters = [
+                expression_to_code_mapper(par, PREC_NONE,
+                    dtype_to_type_context(target, par_dtype),
+                    par_dtype).expr
+                for par, par_dtype in zip(
+                    parameters, par_dtypes)]
+
+        from pymbolic import var
+        return var(self.get_target_specific_name(target))(*c_parameters)
 
     # }}}
 
     def __eq__(self, other):
         return (self.name == other.name
                 and self.arg_id_to_descr == other.arg_id_to_descr
-                and self.arg_id_to_dtype == other.arg_id_to_dtype)
+                and self.arg_id_to_dtype == other.arg_id_to_dtype
+                and self.subkernel == other.subkernel)
 
     def __hash__(self):
         return hash((self.name, self.subkernel))
@@ -640,6 +686,13 @@ def register_pymbolic_calls_to_knl_callables(kernel,
                 unique_name = next_indexed_name(unique_name)
 
             # book-keeping of the functions and names mappings for later use
+            if in_knl_callable.subkernel is not None:
+                # changing the name of the subkernel so that it emits a function
+                # with the same name as the one being used in the
+                # scoped_function.
+                new_subkernel = in_knl_callable.subkernel.copy(
+                        name=unique_name)
+                in_knl_callable = in_knl_callable.copy(subkernel=new_subkernel)
             scoped_names_to_functions[unique_name] = in_knl_callable
             scoped_functions_to_names[in_knl_callable] = unique_name
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 01eeb513046be661646d440d7f3a5e7d691ae1b6..068953a52709f9cf869a88dad425168fa6c67cb2 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -2135,7 +2135,7 @@ def get_arg_description_from_sub_array_ref(sub_array, kernel):
     """
     from loopy.kernel.function_interface import ArrayArgDescriptor
 
-    name = sub_array.subscript.attribute.name
+    name = sub_array.subscript.aggregate.name
 
     if name in kernel.temporary_variables:
         mem_scope = "LOCAL"
@@ -2161,8 +2161,8 @@ class ArgDescriptionInferer(CombineMapper):
     arguments.
     """
 
-    def __init__(self, scoped_functions):
-        self.scoped_functions = scoped_functions
+    def __init__(self, kernel):
+        self.kernel = kernel
 
     def combine(self, values):
         import operator
@@ -2173,7 +2173,8 @@ class ArgDescriptionInferer(CombineMapper):
         from loopy.symbolic import SubArrayRef
 
         # descriptors for the args
-        arg_id_to_descr = dict((i, get_arg_description_from_sub_array_ref(par))
+        arg_id_to_descr = dict((i,
+            get_arg_description_from_sub_array_ref(par, self.kernel))
                 if isinstance(par, SubArrayRef) else (i, ValueArgDescriptor())
                 for i, par in enumerate(expr.parameters))
 
@@ -2187,7 +2188,8 @@ class ArgDescriptionInferer(CombineMapper):
             for i, par in enumerate(assignees):
                 if isinstance(par, SubArrayRef):
                     assignee_id_to_descr[-i-1] = (
-                            get_arg_description_from_sub_array_ref(par))
+                            get_arg_description_from_sub_array_ref(par,
+                                self.kernel))
                 else:
                     assignee_id_to_descr[-i-1] = ValueArgDescriptor()
 
@@ -2196,20 +2198,21 @@ class ArgDescriptionInferer(CombineMapper):
 
         # specializing the function according to the parameter description
         new_scoped_function = (
-                self.scoped_functions[expr.function.name].with_descrs(
+                self.kernel.scoped_functions[expr.function.name].with_descrs(
                     combined_arg_id_to_dtype))
 
         # collecting the descriptors for args, kwargs, assignees
-        return (
-                frozenset(((expr, new_scoped_function), )) |
-                self.combine((self.rec(child) for child in expr.parameters)))
+        a = frozenset(((expr, new_scoped_function), ))
+        b = self.combine((self.rec(child) for child in expr.parameters))
+        return (a | b)
 
     def map_call_with_kwargs(self, expr, **kwargs):
         from loopy.kernel.function_intergace import ValueArgDescriptor
         from loopy.symbolic import SubArrayRef
 
         # descriptors for the args and kwargs:
-        arg_id_to_descr = dict((i, get_arg_description_from_sub_array_ref(par))
+        arg_id_to_descr = dict((i, get_arg_description_from_sub_array_ref(par,
+            self.kernel))
                 if isinstance(par, SubArrayRef) else ValueArgDescriptor()
                 for i, par in enumerate(expr.parameters) +
                 expr.kw_parameters.items())
@@ -2223,7 +2226,8 @@ class ArgDescriptionInferer(CombineMapper):
             for i, par in enumerate(assignees):
                 if isinstance(par, SubArrayRef):
                     assignee_id_to_descr[-i-1] = (
-                            get_arg_description_from_sub_array_ref(par))
+                            get_arg_description_from_sub_array_ref(par,
+                                self.kernel))
                 else:
                     assignee_id_to_descr[-i-1] = ValueArgDescriptor()
 
@@ -2232,7 +2236,7 @@ class ArgDescriptionInferer(CombineMapper):
 
         # specializing the function according to the parameter description
         new_scoped_function = (
-                self.scoped_functions[expr.function.name].with_descr(
+                self.kernel.scoped_functions[expr.function.name].with_descr(
                     combined_arg_id_to_descr))
 
         # collecting the descriptors for args, kwargs, assignees
@@ -2252,7 +2256,7 @@ def infer_arg_descr(kernel):
     shape and dimensions of the arguments too.
     """
 
-    arg_description_modifier = ArgDescriptionInferer(kernel.scoped_functions)
+    arg_description_modifier = ArgDescriptionInferer(kernel)
     pymbolic_calls_to_functions = set()
 
     for insn in kernel.instructions:
@@ -2264,8 +2268,7 @@ def infer_arg_descr(kernel):
                     arg_description_modifier(insn.expression,
                         assignees=insn.assignees))
         if isinstance(insn, (MultiAssignmentBase, CInstruction)):
-            a = arg_description_modifier(insn.expression)
-            pymbolic_calls_to_functions.update(a)
+            pymbolic_calls_to_functions.update(arg_description_modifier(insn.expression))
         elif isinstance(insn, _DataObliviousInstruction):
             pass
         else:
@@ -2392,9 +2395,10 @@ def preprocess_kernel(kernel, device=None):
     print(75*'-')
     print('Linked Functions:')
     for name, func in kernel.scoped_functions.items():
-        print(name, "=>", func)
+        print(name, "=>", (func.name, func.arg_id_to_dtype,
+            func.arg_id_to_descr, func.subkernel.args))
+        print()
     print(75*'-')
-    1/0
 
     kernel = kernel.target.preprocess(kernel)
 
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 8abda0f2a641a4ab53d4cce05ba4d3ff4e2da6ef..bdfe57982ac3a457c87ce69886f48ec144841c73 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -189,6 +189,9 @@ class CombineMapper(CombineMapperBase):
     def map_reduction(self, expr):
         return self.rec(expr.expr)
 
+    def map_sub_array_ref(self, expr):
+        return self.rec(expr.get_begin_subscript())
+
     map_linear_subscript = CombineMapperBase.map_subscript
 
     map_scoped_function = CombineMapperBase.map_variable
@@ -738,7 +741,7 @@ class SubArrayRef(p.Expression):
                 sub_dim_tags.append(DimTag(dim_tag.stride))
                 sub_shape.append(axis_length)
 
-        return sub_dim_tags, sub_shape
+        return sub_dim_tags, tuple(sub_shape)
 
     def __getinitargs__(self):
         return (self.swept_inames, self.subscript)
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 2b5e394bbcc566510c27b069506b67f60d5cd911..28c346dcc7e0ef718bc729214587853c835dd0e6 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -822,6 +822,10 @@ class CASTBuilder(ASTBuilderBase):
             lhs_expr, rhs_expr, lhs_dtype):
         raise NotImplementedError("atomic updates in %s" % type(self).__name__)
 
+    # FIXME: With the new mangler interface this should not be present.
+    # Commenting this part out so that it does not get used anywhere in the
+    # meantime.
+    '''
     def emit_tuple_assignment(self, codegen_state, insn):
         ecm = codegen_state.expression_to_code_mapper
 
@@ -844,84 +848,23 @@ class CASTBuilder(ASTBuilderBase):
             assignments.append(Assign(lhs_code, rhs_code))
 
         return block_if_necessary(assignments)
+    '''
 
     def emit_multiple_assignment(self, codegen_state, insn):
         ecm = codegen_state.expression_to_code_mapper
 
-        from pymbolic.primitives import Variable
-        from pymbolic.mapper.stringifier import PREC_NONE
-
-        func_id = insn.expression.function
-        parameters = insn.expression.parameters
-
-        if isinstance(func_id, Variable):
-            func_id = func_id.name
-
-        assignee_var_descriptors = [
-                codegen_state.kernel.get_var_descriptor(a)
-                for a in insn.assignee_var_names()]
-
-        par_dtypes = tuple(ecm.infer_type(par) for par in parameters)
-
-        mangle_result = codegen_state.kernel.mangle_function(func_id, par_dtypes)
-        if mangle_result is None:
-            raise RuntimeError("function '%s' unknown--"
-                    "maybe you need to register a function mangler?"
-                    % func_id)
-
-        assert mangle_result.arg_dtypes is not None
-
-        if mangle_result.target_name == "loopy_make_tuple":
-            # This shorcut avoids actually having to emit a 'make_tuple' function.
-            return self.emit_tuple_assignment(codegen_state, insn)
-
-        from loopy.expression import dtype_to_type_context
-        c_parameters = [
-                ecm(par, PREC_NONE,
-                    dtype_to_type_context(self.target, tgt_dtype),
-                    tgt_dtype).expr
-                for par, par_dtype, tgt_dtype in zip(
-                    parameters, par_dtypes, mangle_result.arg_dtypes)]
-
-        from loopy.codegen import SeenFunction
-        codegen_state.seen_functions.add(
-                SeenFunction(func_id,
-                    mangle_result.target_name,
-                    mangle_result.arg_dtypes))
-
-        from pymbolic import var
-        for i, (a, tgt_dtype) in enumerate(
-                zip(insn.assignees[1:], mangle_result.result_dtypes[1:])):
-            if tgt_dtype != ecm.infer_type(a):
-                raise LoopyError("type mismatch in %d'th (1-based) left-hand "
-                        "side of instruction '%s'" % (i+1, insn.id))
-            c_parameters.append(
-                        # TODO Yuck: The "where-at function": &(...)
-                        var("&")(
-                            ecm(a, PREC_NONE,
-                                dtype_to_type_context(self.target, tgt_dtype),
-                                tgt_dtype).expr))
-
-        from pymbolic import var
-        result = var(mangle_result.target_name)(*c_parameters)
-
-        # In case of no assignees, we are done
-        if len(mangle_result.result_dtypes) == 0:
-            from cgen import ExpressionStatement
-            return ExpressionStatement(
-                    CExpression(self.get_c_expression_to_code_mapper(), result))
-
-        result = ecm.wrap_in_typecast(
-                mangle_result.result_dtypes[0],
-                assignee_var_descriptors[0].dtype,
-                result)
+        func_id = insn.expression.function.name
 
-        lhs_code = ecm(insn.assignees[0], prec=PREC_NONE, type_context=None)
+        in_knl_callable = codegen_state.kernel.scoped_functions[func_id]
+        in_knl_callable_as_call = in_knl_callable.emit_call(
+                insn=insn,
+                target=self.target,
+                expression_to_code_mapper=ecm)
 
-        from cgen import Assign
-        return Assign(
-                lhs_code,
-                CExpression(self.get_c_expression_to_code_mapper(), result))
+        from cgen import ExpressionStatement
+        return ExpressionStatement(
+                CExpression(self.get_c_expression_to_code_mapper(),
+                    in_knl_callable_as_call))
 
     def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
             lbound, ubound, inner):
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 59ed77f9c17fa04d67e251c22bec88fc8b15936c..17e48555512ef7a004f0ac9488b6cd7034657b7f 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -165,6 +165,10 @@ class ExpressionToCExpressionMapper(IdentityMapper):
     def map_tagged_variable(self, expr, type_context):
         return var(expr.name)
 
+    def map_sub_array_ref(self, expr, type_context):
+        return var("&")(self.rec(expr.get_begin_subscript(),
+            type_context))
+
     def map_subscript(self, expr, type_context):
         def base_impl(expr, type_context):
             return self.rec(expr.aggregate, type_context)[self.rec(expr.index, 'i')]
diff --git a/loopy/transform/register_knl.py b/loopy/transform/register_knl.py
index 691c0c51aacc5607f38ee7cf3ee94fe62304bbfb..f43550b5b59e888e7a8cfb4379723d82f361e5c0 100644
--- a/loopy/transform/register_knl.py
+++ b/loopy/transform/register_knl.py
@@ -25,9 +25,9 @@ THE SOFTWARE.
 from loopy.kernel import LoopKernel
 from loopy.kernel.creation import FunctionScoper
 from loopy.diagnostic import LoopyError
-from loopy.function_interface import InKernelCallable
+from loopy.kernel.function_interface import InKernelCallable
 
-from loopy.kenrel.instruction import (MultiAssignmentBase, CallInstruction,
+from loopy.kernel.instruction import (MultiAssignmentBase, CallInstruction,
         CInstruction, _DataObliviousInstruction)
 
 __doc__ = """
@@ -65,15 +65,11 @@ def register_callable_kernel(parent, function_name, child):
         tests so that both of them can be confirmed to be made for each other.
     """
 
-    # {{{ Sanity Checks
+    # {{{ sanity checks
 
     assert isinstance(parent, LoopKernel)
     assert isinstance(child, LoopKernel)
     assert isinstance(function_name, str)
-    assert function_name not in parent.auxiliary_kernels, (
-                "%s has already been used with some other kernel. One"
-                "function can only be associated with a single kernel" % (
-                    function_name))
 
     # }}}
 
@@ -105,7 +101,8 @@ def register_callable_kernel(parent, function_name, child):
         subkernel=child)
 
     # returning the parent kernel with the new scoped function dictionary
-    return parent.copy(scope_functions=scoped_functions)
+    return parent.copy(scoped_functions=scoped_functions,
+            instructions=new_insns)
 
 # }}}
 
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index bc8669528d1388d0e0e4afbbb1deb2e3bf9424f7..13460387226d79dcbc055f0eb245d11090145748 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -253,9 +253,10 @@ class TypeInferenceMapper(CombineMapper):
 
     def map_call(self, expr, return_tuple=False):
         from pymbolic.primitives import Variable
+        from loopy.symbolic import ScopedFunction
 
         identifier = expr.function
-        if isinstance(identifier, Variable):
+        if isinstance(identifier, (Variable, ScopedFunction)):
             identifier = identifier.name
 
         if identifier in ["indexof", "indexof_vec"]:
@@ -297,7 +298,7 @@ class TypeInferenceMapper(CombineMapper):
 
         """
         # Letting this stay over here, as it maybe needed later for maintaining
-        # backward compatibility
+        # backward compatibility: ~KK
         mangle_result = self.kernel.mangle_function(identifier, arg_dtypes)
         if return_tuple:
             if mangle_result is not None:
@@ -428,6 +429,10 @@ class TypeInferenceMapper(CombineMapper):
             return [expr.operation.result_dtypes(self.kernel, rec_result)[0]
                     for rec_result in rec_results]
 
+    def map_sub_array_ref(self, expr):
+        return self.rec(expr.get_begin_subscript())
+
+
 # }}}
 
 
@@ -457,9 +462,16 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
         if isinstance(writer_insn, lp.Assignment):
             result = type_inf_mapper(expr, return_dtype_set=True)
         elif isinstance(writer_insn, lp.CallInstruction):
-            return_dtype_set = type_inf_mapper(expr, return_tuple=True,
+            result = type_inf_mapper(expr, return_dtype_set=True)
+            """
+            # Maybe we need to alter this so that the type_inf_mapper returns a
+            # :class:`dict`?
+            # ask about this to Andreas Sir.
+            return_dtype_set = type_inf_mapper(expr, return_tuple=False,
                     return_dtype_set=True)
 
+            print(return_dtype_set)
+            print(writer_insn.assignee_var_names())
             result = []
             for return_dtype_set in return_dtype_set:
                 result_i = None
@@ -474,6 +486,7 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
                 assert found
                 if result_i is not None:
                     result.append(result_i)
+            """
 
         debug("             result: %s", result)
 
@@ -678,6 +691,18 @@ def infer_unknown_types(kernel, expect_completion=False):
             args=[new_arg_dict[arg.name] for arg in kernel.args],
             )
 
+    #------------------------------------------------------------------------
+    # KK:
+    # FIXME: further type specialization of scoped functions is needed here
+    # to specialize the in-kernel callables.
+    # For example, if an instruction is:
+    # `[i]:z[i] = a_kernel_function([j]:x[j], [k]: y[k])`
+    # and the user already provided the types of the args x, y, z,
+    # then the instruction would not go through the TypeInferenceMapper and hence
+    # the function `a_kernel_function` would not undergo type specialization,
+    # which would create problems later on.
+    #------------------------------------------------------------------------
+
     from loopy.kernel.function_interface import (
             register_pymbolic_calls_to_knl_callables)
     return register_pymbolic_calls_to_knl_callables(