diff --git a/loopy/check.py b/loopy/check.py
index 727b02a85acf16c4a8ec4b5793ecc850c294fd14..f50ee5cfaa3c6a12ed542adf683beb660616dffc 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -486,11 +486,12 @@ def check_write_destinations(kernel):
 # {{{ check_has_schedulable_iname_nesting
 
 def check_has_schedulable_iname_nesting(kernel):
-    from loopy.transform.iname import (has_schedulable_iname_nesting,
-                                       get_iname_duplication_options)
-    if not has_schedulable_iname_nesting(kernel):
+    from loopy.transform.iname import (
+            has_schedulable_iname_nesting_for_single_kernel,
+            get_iname_duplication_options_for_single_kernel)
+    if not has_schedulable_iname_nesting_for_single_kernel(kernel):
         import itertools as it
-        opt = get_iname_duplication_options(kernel)
+        opt = get_iname_duplication_options_for_single_kernel(kernel)
         opt_str = "\n".join("* Duplicate %s within instructions %s" % (i, w)
                             for i, w in it.islice(opt, 3))
         raise LoopyError("Kernel does not have a schedulable iname nesting. "
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index ed1e7a5bc8da0ee79154e9053eaeb6a624545a65..e9e7c9a447afb559e3536ab3cb1219111a3a2e0d 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -154,6 +154,7 @@ class SeenFunction(ImmutableRecord):
 class CodeGenerationState(object):
     """
     .. attribute:: kernel
+    .. attribute:: target
     .. attribute:: implemented_data_info
 
         a list of :class:`ImplementedDataInfo` objects.
@@ -199,7 +200,7 @@ class CodeGenerationState(object):
     .. attribute:: program_callables_info
     """
 
-    def __init__(self, kernel,
+    def __init__(self, kernel, target,
             implemented_data_info, implemented_domain, implemented_predicates,
             seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map,
             allow_complex,
@@ -209,6 +210,7 @@ class CodeGenerationState(object):
             gen_program_name=None,
             schedule_index_end=None):
         self.kernel = kernel
+        self.target = target
         self.implemented_data_info = implemented_data_info
         self.implemented_domain = implemented_domain
         self.implemented_predicates = implemented_predicates
@@ -226,7 +228,7 @@ class CodeGenerationState(object):
 
     # {{{ copy helpers
 
-    def copy(self, kernel=None, implemented_data_info=None,
+    def copy(self, kernel=None, target=None, implemented_data_info=None,
             implemented_domain=None, implemented_predicates=frozenset(),
             var_subst_map=None, vectorization_info=None,
             is_generating_device_code=None,
@@ -236,6 +238,9 @@ class CodeGenerationState(object):
         if kernel is None:
             kernel = self.kernel
 
+        if target is None:
+            target = self.target
+
         if implemented_data_info is None:
             implemented_data_info = self.implemented_data_info
 
@@ -256,6 +261,7 @@ class CodeGenerationState(object):
 
         return CodeGenerationState(
                 kernel=kernel,
+                target=target,
                 implemented_data_info=implemented_data_info,
                 implemented_domain=implemented_domain or self.implemented_domain,
                 implemented_predicates=(
@@ -413,7 +419,7 @@ class PreambleInfo(ImmutableRecord):
 
 # {{{ main code generation entrypoint
 
-def generate_code_for_a_single_kernel(kernel, program_callables_info):
+def generate_code_for_a_single_kernel(kernel, program_callables_info, target):
     """
     :returns: a :class:`CodeGenerationResult`
     """
@@ -459,13 +465,13 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info):
         if isinstance(arg, ArrayBase):
             implemented_data_info.extend(
                     arg.decl_info(
-                        kernel.target,
+                        target,
                         is_written=is_written,
                         index_dtype=kernel.index_dtype))
 
         elif isinstance(arg, ValueArg):
             implemented_data_info.append(ImplementedDataInfo(
-                target=kernel.target,
+                target=target,
                 name=arg.name,
                 dtype=arg.dtype,
                 arg_class=ValueArg,
@@ -488,6 +494,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info):
     initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
     codegen_state = CodeGenerationState(
             kernel=kernel,
+            target=target,
             implemented_data_info=implemented_data_info,
             implemented_domain=initial_implemented_domain,
             implemented_predicates=frozenset(),
@@ -499,9 +506,9 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info):
             var_name_generator=kernel.get_var_name_generator(),
             is_generating_device_code=False,
             gen_program_name=(
-                kernel.target.host_program_name_prefix
+                target.host_program_name_prefix
                 + kernel.name
-                + kernel.target.host_program_name_suffix),
+                + target.host_program_name_suffix),
             schedule_index_end=len(kernel.schedule),
             program_callables_info=program_callables_info)
 
@@ -536,7 +543,7 @@ def generate_code_for_a_single_kernel(kernel, program_callables_info):
             )
 
     preamble_generators = (kernel.preamble_generators
-            + kernel.target.get_device_ast_builder().preamble_generators())
+            + target.get_device_ast_builder().preamble_generators())
     for prea_gen in preamble_generators:
         preambles.extend(prea_gen(preamble_info))
 
@@ -579,7 +586,7 @@ def generate_code_v2(program):
         if isinstance(in_knl_callable, CallableKernel):
             codegen_results[func_id] = (
                     generate_code_for_a_single_kernel(in_knl_callable.subkernel,
-                        program.program_callables_info))
+                        program.program_callables_info, program.target))
 
     device_preambles = set()
     for cgr in codegen_results.values():
diff --git a/loopy/program.py b/loopy/program.py
index bb5b9b1aca54137ded259ccc812f8ba7430ee13b..df7bd1bdd2fd04ca2a2061f6f700608590b5d773 100644
--- a/loopy/program.py
+++ b/loopy/program.py
@@ -192,6 +192,28 @@ class Program(ImmutableRecord):
 
     update_persistent_hash = LoopKernel.update_persistent_hash
 
+    def copy(self, **kwargs):
+        """Copy the program; a new *target* is pushed into every subkernel."""
+        if 'target' not in kwargs:
+            return super(Program, self).copy(**kwargs)
+
+        target = kwargs['target']
+        new_self = super(Program, self).copy(**kwargs)
+        new_resolved_functions = {}
+        for func_id, in_knl_callable in (
+                new_self.program_callables_info.items()):
+            if isinstance(in_knl_callable, CallableKernel):
+                in_knl_callable = in_knl_callable.copy(
+                        subkernel=in_knl_callable.subkernel.copy(target=target))
+            new_resolved_functions[func_id] = in_knl_callable
+
+        # Rebuild the callables table only after *all* entries were visited;
+        # returning from inside the loop would drop every callable but the first.
+        program_callables_info = new_self.program_callables_info.copy(
+                resolved_functions=new_resolved_functions)
+        return new_self.copy(
+                program_callables_info=program_callables_info)
+
     def get_grid_size_upper_bounds(self, ignore_auto=False):
         """Return a tuple (global_size, local_size) containing a grid that
         could accommodate execution of *all* instructions in the kernel.
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index caa02c17afaa75180328cadbc3ed307d1f49823f..75aa62467eef7f58591e640e9b4f3c80f97e37dc 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -36,6 +36,7 @@ from loopy.diagnostic import LoopyError
 
 from loopy.program import iterate_over_kernels_if_given_program
 from loopy.kernel import LoopKernel
+from loopy.kernel.function_interface import CallableKernel, ScalarCallable
 
 
 __doc__ = """
@@ -982,7 +983,7 @@ def _get_iname_duplication_options(insn_iname_sets, old_common_inames=frozenset(
     # If partitioning was empty, we have recursed successfully and yield nothing
 
 
-def get_iname_duplication_options(knl, use_boostable_into=False):
+def get_iname_duplication_options_for_single_kernel(knl, use_boostable_into=False):
     """List options for duplication of inames, if necessary for schedulability
 
     :returns: a generator listing all options to duplicate inames, if duplication
@@ -1048,7 +1049,7 @@ def get_iname_duplication_options(knl, use_boostable_into=False):
         # If we find a duplication option and to not use boostable_into
         # information, we restart this generator with use_boostable_into=True
         if not use_boostable_into and not knl.options.ignore_boostable_into:
-            for option in get_iname_duplication_options(knl, True):
+            for option in get_iname_duplication_options_for_single_kernel(knl, True):
                 yield option
 
             # Emit a warning that we needed boostable_into
@@ -1076,12 +1077,34 @@ def get_iname_duplication_options(knl, use_boostable_into=False):
             yield iname, within
 
 
-def has_schedulable_iname_nesting(knl):
+def get_iname_duplication_options(program, use_boostable_into=False):
+    """Yield the iname duplication options of every kernel in *program*."""
+    for in_knl_callable in program.program_callables_info.values():
+        if isinstance(in_knl_callable, CallableKernel):
+            # explicit loop (as in the single-kernel generator), not 'yield from'
+            for option in get_iname_duplication_options_for_single_kernel(
+                    in_knl_callable.subkernel, use_boostable_into):
+                yield option
+        elif not isinstance(in_knl_callable, ScalarCallable):
+            # scalar callables contribute no options; other kinds are rejected
+            raise NotImplementedError("Unknown type of in kernel callable %s."
+                    % (type(in_knl_callable)))
+
+
+def has_schedulable_iname_nesting_for_single_kernel(knl):
     """
     :returns: a :class:`bool` indicating whether this kernel needs
         an iname duplication in order to be schedulable.
     """
-    return not bool(next(get_iname_duplication_options(knl), False))
+    return not bool(next(get_iname_duplication_options_for_single_kernel(knl),
+        False))
+
+
+def has_schedulable_iname_nesting(program):
+    """Return *True* if every kernel in *program* has a schedulable nesting."""
+    return all(has_schedulable_iname_nesting_for_single_kernel(clbl.subkernel)
+        for clbl in program.program_callables_info.values()
+        if isinstance(clbl, CallableKernel))
 
 # }}}
 
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 13d9c722ed1fdb15f405d19c8e21389b974dcc9f..65c91871ad276d5e99c295971ca4ab2522176742 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -36,7 +36,7 @@ from loopy.diagnostic import (
 from loopy.kernel.instruction import _DataObliviousInstruction
 
 from loopy.program import ProgramCallablesInfo
-from loopy.symbolic import SubArrayRef
+from loopy.symbolic import SubArrayRef, LinearSubscript
 from pymbolic.primitives import Variable, Subscript
 
 import logging
@@ -819,7 +819,7 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
                     if kernel.temporary_variables[assignee.name].dtype is None:
                         return False
 
-            elif isinstance(assignee, Subscript):
+            elif isinstance(assignee, (Subscript, LinearSubscript)):
                 if assignee.aggregate.name in kernel.arg_dict:
                     if kernel.arg_dict[assignee.aggregate.name].dtype is None:
                         return False
@@ -828,7 +828,6 @@ def infer_unknown_types_for_a_single_kernel(kernel, program_callables_info,
                     if kernel.temporary_variables[
                             assignee.aggregate.name].dtype is None:
                         return False
-
             else:
                 assert isinstance(assignee, SubArrayRef)
                 if assignee.subscript.aggregate.name in kernel.arg_dict: