diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index e83515d31f1c61e52569d8d0754ce79e7a7f602f..57bf4c6a8aa31c3c5aad3be3b34ccbab7caa9b37 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -32,6 +32,13 @@ from pytools.persistent_dict import WriteOncePersistentDict
 from loopy.tools import LoopyKeyBuilder
 from loopy.version import DATA_MODEL_VERSION
 
+from cgen import Collection
+
+from loopy.kernel.instruction import (
+        Assignment, NoOpInstruction, BarrierInstruction, CallInstruction,
+        CInstruction, _DataObliviousInstruction)
+
+
 import logging
 logger = logging.getLogger(__name__)
 
@@ -187,6 +194,12 @@ class CodeGenerationState(object):
         generated.
 
     .. attribute:: schedule_index_end
+
+    .. attribute:: is_generating_master_kernel
+
+        Either `True` or `False`, indicating whether code is being
+        generated for a master kernel or an auxiliary kernel.
+
     """
 
     def __init__(self, kernel,
@@ -196,7 +209,8 @@ class CodeGenerationState(object):
             vectorization_info=None, var_name_generator=None,
             is_generating_device_code=None,
             gen_program_name=None,
-            schedule_index_end=None):
+            schedule_index_end=None,
+            is_generating_master_kernel=None):
         self.kernel = kernel
         self.implemented_data_info = implemented_data_info
         self.implemented_domain = implemented_domain
@@ -211,6 +225,7 @@ class CodeGenerationState(object):
         self.is_generating_device_code = is_generating_device_code
         self.gen_program_name = gen_program_name
         self.schedule_index_end = schedule_index_end
+        self.is_generating_master_kernel = is_generating_master_kernel
 
     # {{{ copy helpers
 
@@ -219,7 +234,8 @@ class CodeGenerationState(object):
             var_subst_map=None, vectorization_info=None,
             is_generating_device_code=None,
             gen_program_name=None,
-            schedule_index_end=None):
+            schedule_index_end=None,
+            is_generating_master_kernel=None):
 
         if kernel is None:
             kernel = self.kernel
@@ -242,6 +258,9 @@ class CodeGenerationState(object):
         if schedule_index_end is None:
             schedule_index_end = self.schedule_index_end
 
+        if is_generating_master_kernel is None:
+            is_generating_master_kernel = self.is_generating_master_kernel
+
         return CodeGenerationState(
                 kernel=kernel,
                 implemented_data_info=implemented_data_info,
@@ -257,7 +276,8 @@ class CodeGenerationState(object):
                 var_name_generator=self.var_name_generator,
                 is_generating_device_code=is_generating_device_code,
                 gen_program_name=gen_program_name,
-                schedule_index_end=schedule_index_end)
+                schedule_index_end=schedule_index_end,
+                is_generating_master_kernel=is_generating_master_kernel)
 
     def copy_and_assign(self, name, value):
         """Make a copy of self with variable *name* fixed to *value*."""
@@ -470,13 +490,49 @@ def generate_code_v2(kernel):
                 kernel.target.host_program_name_prefix
                 + kernel.name
                 + kernel.target.host_program_name_suffix),
-            schedule_index_end=len(kernel.schedule))
+            schedule_index_end=len(kernel.schedule),
+            is_generating_master_kernel=True)
 
     from loopy.codegen.result import generate_host_or_device_program
+
+    # {{{ collecting ASTs of auxiliary kernels
+
+    auxiliary_dev_progs = []
+
+    from loopy.codegen.auxiliary_kernels import generate_auxiliary_kernel_device_code
+    for insn in kernel.instructions:
+        if isinstance(insn, CallInstruction):
+            in_knl_callable = kernel.scoped_functions[insn.expression.function.name]
+            if in_knl_callable.subkernel is not None:
+                auxiliary_dev_prog = generate_auxiliary_kernel_device_code(
+                        in_knl_callable.subkernel,
+                        kernel.target).device_programs[0].ast
+                auxiliary_dev_progs.append(auxiliary_dev_prog)
+        elif isinstance(insn, (Assignment, NoOpInstruction, Assignment,
+                               BarrierInstruction, CInstruction,
+                               _DataObliviousInstruction)):
+            pass
+        else:
+            raise NotImplementedError("register_knl not made for %s type of"
+                    "instruciton" % (str(type(insn))))
+
+    # }}}
+
     codegen_result = generate_host_or_device_program(
             codegen_state,
             schedule_index=0)
 
+    # {{{ pasting the auxiliary functions code to the first device program
+
+    new_dev_prog = codegen_result.device_programs[0]
+    for auxiliary_dev_prog in auxiliary_dev_progs:
+        new_dev_prog = new_dev_prog.copy(
+                ast=Collection([auxiliary_dev_prog, new_dev_prog.ast]))
+    new_device_programs = [new_dev_prog] + codegen_result.device_programs[1:]
+    codegen_result = codegen_result.copy(device_programs=new_device_programs)
+
+    # }}}
+
     device_code_str = codegen_result.device_code()
 
     from loopy.check import check_implemented_domains
diff --git a/loopy/codegen/auxiliary_kernels.py b/loopy/codegen/auxiliary_kernels.py
new file mode 100644
index 0000000000000000000000000000000000000000..799ab59bf5e78fcac58bb71e6e9b61ffc7aa4b22
--- /dev/null
+++ b/loopy/codegen/auxiliary_kernels.py
@@ -0,0 +1,208 @@
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2018 Kaushik Kulkarni"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import six
+import islpy as isl
+
+from loopy.codegen import (
+        ImplementedDataInfo,
+        CodeGenerationState)
+from loopy.diagnostic import LoopyError
+from loopy.kernel.instruction import (
+        Assignment, NoOpInstruction, BarrierInstruction, CallInstruction,
+        CInstruction, _DataObliviousInstruction)
+from cgen import Collection
+
+import logging
+logger = logging.getLogger(__name__)
+
+
+__doc__ = """
+.. currentmodule:: loopy
+
+.. autofunction:: generate_auxiliary_kernel_device_code
+
+"""
+
+
+# {{{ code generation for the auxiliary kernel
+
+def generate_auxiliary_kernel_device_code(kernel, target):
+    """
+    Generate device code for the auxiliary kernel *kernel*, using the
+    *target* specified by the parent kernel.
+
+    *kernel* is preprocessed and scheduled first if that has not happened
+    yet. Code for the auxiliary kernels called from *kernel* itself is
+    generated recursively and placed ahead of the first device program of
+    the result.
+
+    :arg kernel: the auxiliary kernel to generate code for
+    :arg target: the target to generate code for; it replaces the target
+        of *kernel*
+    :returns: a :class:`CodeGenerationResult`
+    :raises LoopyError: if *kernel* is not in the scheduled state after
+        preprocessing and scheduling
+    :raises ValueError: if an argument of *kernel* is neither an array nor
+        a :class:`loopy.kernel.data.ValueArg`
+    :raises NotImplementedError: if *kernel* contains an instruction of an
+        unsupported type
+    """
+    kernel = kernel.copy(target=target)
+
+    from loopy.kernel import kernel_state
+    if kernel.state == kernel_state.INITIAL:
+        from loopy.preprocess import preprocess_kernel
+        kernel = preprocess_kernel(kernel)
+
+    if kernel.schedule is None:
+        from loopy.schedule import get_one_scheduled_kernel
+        kernel = get_one_scheduled_kernel(kernel)
+
+    if kernel.state != kernel_state.SCHEDULED:
+        raise LoopyError(
+                "cannot generate code for a kernel that has not been "
+                "scheduled")
+
+    from loopy.type_inference import infer_unknown_types
+    kernel = infer_unknown_types(kernel, expect_completion=True)
+
+    from loopy.check import pre_codegen_checks
+    pre_codegen_checks(kernel)
+
+    logger.info("%s: generate Auxillary Kernel code: start", kernel.name)
+
+    # {{{ examine arg list
+
+    from loopy.kernel.data import ValueArg
+    from loopy.kernel.array import ArrayBase
+
+    # queried once up front instead of once per argument
+    written_vars = kernel.get_written_variables()
+
+    implemented_data_info = []
+
+    for arg in kernel.args:
+        is_written = arg.name in written_vars
+        if isinstance(arg, ArrayBase):
+            implemented_data_info.extend(
+                    arg.decl_info(
+                        kernel.target,
+                        is_written=is_written,
+                        index_dtype=kernel.index_dtype))
+
+        elif isinstance(arg, ValueArg):
+            implemented_data_info.append(ImplementedDataInfo(
+                target=kernel.target,
+                name=arg.name,
+                dtype=arg.dtype,
+                arg_class=ValueArg,
+                is_written=is_written))
+
+        else:
+            raise ValueError("argument type not understood: '%s'" % type(arg))
+
+    allow_complex = any(
+            var.dtype.involves_complex()
+            for var in
+            kernel.args + list(six.itervalues(kernel.temporary_variables)))
+
+    # }}}
+
+    initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
+    codegen_state = CodeGenerationState(
+            kernel=kernel,
+            implemented_data_info=implemented_data_info,
+            implemented_domain=initial_implemented_domain,
+            implemented_predicates=frozenset(),
+            seen_dtypes=set(),
+            seen_functions=set(),
+            seen_atomic_dtypes=set(),
+            var_subst_map={},
+            allow_complex=allow_complex,
+            var_name_generator=kernel.get_var_name_generator(),
+            is_generating_device_code=False,
+            gen_program_name=kernel.name,
+            schedule_index_end=len(kernel.schedule),
+            is_generating_master_kernel=False)
+
+    from loopy.codegen.result import generate_host_or_device_program
+
+    # {{{ collecting ASTs of auxiliary kernels
+
+    auxiliary_dev_progs = []
+
+    # A callable invoked by several instructions is only emitted once, as
+    # repeating it would duplicate its definition in the generated code.
+    handled_callable_names = set()
+
+    for insn in kernel.instructions:
+        if isinstance(insn, CallInstruction):
+            callable_name = insn.expression.function.name
+            if callable_name in handled_callable_names:
+                continue
+            handled_callable_names.add(callable_name)
+
+            in_knl_callable = kernel.scoped_functions[callable_name]
+            if in_knl_callable.subkernel is not None:
+                auxiliary_dev_prog = generate_auxiliary_kernel_device_code(
+                        in_knl_callable.subkernel,
+                        kernel.target).device_programs[0].ast
+                auxiliary_dev_progs.append(auxiliary_dev_prog)
+        elif not isinstance(insn, (Assignment, NoOpInstruction,
+                                   BarrierInstruction, CInstruction,
+                                   _DataObliviousInstruction)):
+            raise NotImplementedError("register_knl not made for %s type of "
+                    "instruction" % (str(type(insn))))
+
+    # }}}
+
+    codegen_result = generate_host_or_device_program(
+            codegen_state,
+            schedule_index=0)
+
+    # {{{ pasting the auxiliary functions code to the first device program
+
+    new_dev_prog = codegen_result.device_programs[0]
+    for auxiliary_dev_prog in auxiliary_dev_progs:
+        new_dev_prog = new_dev_prog.copy(
+                ast=Collection([auxiliary_dev_prog, new_dev_prog.ast]))
+    new_device_programs = [new_dev_prog] + codegen_result.device_programs[1:]
+    codegen_result = codegen_result.copy(device_programs=new_device_programs)
+
+    # }}}
+
+    # For faster unpickling in the common case when implemented_domains isn't needed.
+    from loopy.tools import LazilyUnpicklingDict
+    codegen_result = codegen_result.copy(
+            implemented_domains=LazilyUnpicklingDict(
+                    codegen_result.implemented_domains))
+
+    logger.info("%s: generate code: done", kernel.name)
+
+    return codegen_result
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/loopy/kernel/function_interface.py b/loopy/kernel/function_interface.py
index bb88cc0916de1264ede05360554dfc1be1e7dbf0..ee44d5ea412318f2fe49be3bc5f5556546b04aa4 100644
--- a/loopy/kernel/function_interface.py
+++ b/loopy/kernel/function_interface.py
@@ -61,7 +61,7 @@ class ArrayArgDescriptor(ArgDescriptor):
 
         # }}}
 
-        super(ArgDescriptor, self).__init__(shape=None,
+        super(ArgDescriptor, self).__init__(shape=shape,
                 mem_scope=mem_scope,
                 dim_tags=dim_tags)
 
@@ -412,6 +412,7 @@ class InKernelCallable(ImmutableRecord):
                 new_args[id] = new_args[id].copy(shape=descr.shape,
                         dim_tags=descr.dim_tags)
 
+
             descriptor_specialized_knl = self.subkernel.copy(args=new_args)
 
             return self.copy(subkernel=descriptor_specialized_knl,
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 068953a52709f9cf869a88dad425168fa6c67cb2..eedfca6f91ad890d1defb189778080279ebb6613 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -2202,9 +2202,8 @@ class ArgDescriptionInferer(CombineMapper):
                     combined_arg_id_to_dtype))
 
         # collecting the descriptors for args, kwargs, assignees
-        a = frozenset(((expr, new_scoped_function), ))
-        b = self.combine((self.rec(child) for child in expr.parameters))
-        return (a | b)
+        return (frozenset(((expr, new_scoped_function), )) |
+                self.combine((self.rec(child) for child in expr.parameters)))
 
     def map_call_with_kwargs(self, expr, **kwargs):
         from loopy.kernel.function_intergace import ValueArgDescriptor
@@ -2267,8 +2266,9 @@ def infer_arg_descr(kernel):
             pymbolic_calls_to_functions.update(
                     arg_description_modifier(insn.expression,
                         assignees=insn.assignees))
-        if isinstance(insn, (MultiAssignmentBase, CInstruction)):
-            pymbolic_calls_to_functions.update(arg_description_modifier(insn.expression))
+        elif isinstance(insn, (MultiAssignmentBase, CInstruction)):
+            pymbolic_calls_to_functions.update(arg_description_modifier(
+                insn.expression))
         elif isinstance(insn, _DataObliviousInstruction):
             pass
         else:
@@ -2386,20 +2386,10 @@ def preprocess_kernel(kernel, device=None):
     # have been established
     kernel = check_atomic_loads(kernel)
 
+    # inferring the shape and dim_tags of the arguments involved in a function
+    # call.
     kernel = infer_arg_descr(kernel)
 
-    print(75*'-')
-    print("This is after Type Inference")
-    for insn in kernel.instructions:
-        print(insn)
-    print(75*'-')
-    print('Linked Functions:')
-    for name, func in kernel.scoped_functions.items():
-        print(name, "=>", (func.name, func.arg_id_to_dtype,
-            func.arg_id_to_descr, func.subkernel.args))
-        print()
-    print(75*'-')
-
     kernel = kernel.target.preprocess(kernel)
 
     logger.info("%s: preprocess done" % kernel.name)
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 13460387226d79dcbc055f0eb245d11090145748..b1b1446db123fbfb0296f2bccbe14df1b9d4fb13 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -459,34 +459,8 @@ def _infer_var_type(kernel, var_name, type_inf_mapper, subst_expander):
         expr = subst_expander(writer_insn.expression)
 
         debug("             via expr %s", expr)
-        if isinstance(writer_insn, lp.Assignment):
-            result = type_inf_mapper(expr, return_dtype_set=True)
-        elif isinstance(writer_insn, lp.CallInstruction):
-            result = type_inf_mapper(expr, return_dtype_set=True)
-            """
-            # Maybe we need to alter this so that the type_inf_mapper returns a
-            # :class:`dict`?
-            # ask about this to Andreas Sir.
-            return_dtype_set = type_inf_mapper(expr, return_tuple=False,
-                    return_dtype_set=True)
-
-            print(return_dtype_set)
-            print(writer_insn.assignee_var_names())
-            result = []
-            for return_dtype_set in return_dtype_set:
-                result_i = None
-                found = False
-                for assignee, comp_dtype_set in zip(
-                        writer_insn.assignee_var_names(), return_dtype_set):
-                    if assignee == var_name:
-                        found = True
-                        result_i = comp_dtype_set
-                        break
 
-                assert found
-                if result_i is not None:
-                    result.append(result_i)
-            """
+        result = type_inf_mapper(expr, return_dtype_set=True)
 
         debug("             result: %s", result)