diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index 0947d00cd9af64ba0b07d2e0c3d420ff3995a6f6..3306d30e4486418b7b4f78e4f1d95a4fd39b45bc 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -241,7 +241,7 @@ def wrap_in_if(codegen_state, condition_exprs, inner):
 
 def generate_host_or_device_program(codegen_state, schedule_index):
     ast_builder = codegen_state.ast_builder
-    temp_decls = ast_builder.get_temporary_decls(codegen_state)
+    temp_decls = ast_builder.get_temporary_decls(codegen_state, schedule_index)
 
     from functools import partial
 
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 3ec3a50b11f72a2975ac4366d495326bfcb69b37..eb39539b9c489320b227da7c7397c0748a704159 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -156,7 +156,7 @@ class ASTBuilderBase(object):
     def generate_top_of_body(self, codegen_state):
         return []
 
-    def get_temporary_decls(self, codegen_state):
+    def get_temporary_decls(self, codegen_state, schedule_index):
         raise NotImplementedError
 
     def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):
@@ -239,7 +239,7 @@ class DummyHostASTBuilder(ASTBuilderBase):
             schedule_index):
         return None
 
-    def get_temporary_decls(self, codegen_state):
+    def get_temporary_decls(self, codegen_state, schedule_index):
         return []
 
     def get_expression_to_code_mapper(self, codegen_state):
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 6aca830d99c5637fc92e96f361c3c8bef5d65229..55741c76b58c80e4ad256ffee5a41fcd01b0f12d 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -233,7 +233,7 @@ class CASTBuilder(ASTBuilderBase):
                         [self.idi_to_cgen_declarator(codegen_state.kernel, idi)
                             for idi in codegen_state.implemented_data_info])
 
-    def get_temporary_decls(self, codegen_state):
+    def get_temporary_decls(self, codegen_state, schedule_index):
         from loopy.kernel.data import temp_var_scope
 
         kernel = codegen_state.kernel
@@ -261,7 +261,7 @@ class CASTBuilder(ASTBuilderBase):
                         temp_decls.append(
                                 self.wrap_temporary_decl(
                                     self.get_temporary_decl(
-                                        kernel, tv, idi), tv.scope))
+                                        kernel, schedule_index, tv, idi), tv.scope))
 
             else:
                 offset = 0
@@ -346,7 +346,7 @@ class CASTBuilder(ASTBuilderBase):
         return ExpressionToCMapper(
                 codegen_state, fortran_abi=self.target.fortran_abi)
 
-    def get_temporary_decl(self, knl, temp_var, decl_info):
+    def get_temporary_decl(self, knl, schedule_index, temp_var, decl_info):
         temp_var_decl = POD(self, decl_info.dtype, decl_info.name)
 
         if decl_info.shape:
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 896ea9158223435e3bef933818fbf3bc51a424b4..7f19bdbdf838bcf43b66eb9c1c4fbf58699634a9 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -74,12 +74,19 @@ class ExprToISPCMapper(ExpressionToCMapper):
                         "for constant '%s'" % expr)
 
     def map_variable(self, expr, enclosing_prec, type_context):
-        if expr.name in self.kernel.temporary_variables:
-            gsize, lsize = self.kernel.get_grid_sizes_as_exprs()
+        tv = self.kernel.temporary_variables.get(expr.name)
+
+        from loopy.kernel.data import temp_var_scope
+        if tv is not None and tv.scope == temp_var_scope.PRIVATE:
+            # FIXME: Coarse criterion: every private temporary is indexed by
+            # programIndex whenever the local size is non-empty. Refine?
+            # (See also ISPCASTBuilder.get_temporary_decl.)
+            _, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs()
             if lsize:
                 return "%s[programIndex]" % expr.name
             else:
                 return expr.name
+
         else:
             return super(ExprToISPCMapper, self).map_variable(
                     expr, enclosing_prec, type_context)
@@ -291,7 +298,7 @@ class ISPCASTBuilder(CASTBuilder):
         else:
             raise LoopyError("unknown barrier kind")
 
-    def get_temporary_decl(self, knl, temp_var, decl_info):
+    def get_temporary_decl(self, knl, sched_index, temp_var, decl_info):
         from loopy.target.c import POD  # uses the correct complex type
         temp_var_decl = POD(self, decl_info.dtype, decl_info.name)
 
@@ -299,7 +306,10 @@ class ISPCASTBuilder(CASTBuilder):
 
         from loopy.kernel.data import temp_var_scope
         if temp_var.scope == temp_var_scope.PRIVATE:
-            gsize, lsize = knl.get_grid_sizes_as_exprs()
+            # FIXME: This is a pretty coarse way of deciding what
+            # private temporaries get duplicated. Refine? (See also
+            # above in expr to code mapper)
+            _, lsize = knl.get_grid_size_upper_bounds_as_exprs()
             shape = lsize + shape
 
         if shape:
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 72147daf80fdfdb10e8e62d3b2163879a325e962..9179d0ec441b4e312d6c624c95b845cc05149897 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -631,8 +631,8 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase):
         # no such thing in Python
         return None
 
-    def get_temporary_decls(self, codegen_state):
-        # FIXME: Create global temporaries
+    def get_temporary_decls(self, codegen_state, schedule_index):
+        # Nothing to declare: temporaries allocated in get_function_definition
         return []
 
     def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):