diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 0947d00cd9af64ba0b07d2e0c3d420ff3995a6f6..3306d30e4486418b7b4f78e4f1d95a4fd39b45bc 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -241,7 +241,7 @@ def wrap_in_if(codegen_state, condition_exprs, inner): def generate_host_or_device_program(codegen_state, schedule_index): ast_builder = codegen_state.ast_builder - temp_decls = ast_builder.get_temporary_decls(codegen_state) + temp_decls = ast_builder.get_temporary_decls(codegen_state, schedule_index) from functools import partial diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 3ec3a50b11f72a2975ac4366d495326bfcb69b37..eb39539b9c489320b227da7c7397c0748a704159 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -156,7 +156,7 @@ class ASTBuilderBase(object): def generate_top_of_body(self, codegen_state): return [] - def get_temporary_decls(self, codegen_state): + def get_temporary_decls(self, codegen_state, schedule_index): raise NotImplementedError def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): @@ -239,7 +239,7 @@ class DummyHostASTBuilder(ASTBuilderBase): schedule_index): return None - def get_temporary_decls(self, codegen_state): + def get_temporary_decls(self, codegen_state, schedule_index): return [] def get_expression_to_code_mapper(self, codegen_state): diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 6aca830d99c5637fc92e96f361c3c8bef5d65229..55741c76b58c80e4ad256ffee5a41fcd01b0f12d 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -233,7 +233,7 @@ class CASTBuilder(ASTBuilderBase): [self.idi_to_cgen_declarator(codegen_state.kernel, idi) for idi in codegen_state.implemented_data_info]) - def get_temporary_decls(self, codegen_state): + def get_temporary_decls(self, codegen_state, schedule_index): from loopy.kernel.data import temp_var_scope kernel = codegen_state.kernel @@ -261,7 +261,7 @@ class 
CASTBuilder(ASTBuilderBase): temp_decls.append( self.wrap_temporary_decl( self.get_temporary_decl( - kernel, tv, idi), tv.scope)) + kernel, schedule_index, tv, idi), tv.scope)) else: offset = 0 @@ -346,7 +346,7 @@ class CASTBuilder(ASTBuilderBase): return ExpressionToCMapper( codegen_state, fortran_abi=self.target.fortran_abi) - def get_temporary_decl(self, knl, temp_var, decl_info): + def get_temporary_decl(self, knl, schedule_index, temp_var, decl_info): temp_var_decl = POD(self, decl_info.dtype, decl_info.name) if decl_info.shape: diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 896ea9158223435e3bef933818fbf3bc51a424b4..7f19bdbdf838bcf43b66eb9c1c4fbf58699634a9 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -74,12 +74,19 @@ class ExprToISPCMapper(ExpressionToCMapper): "for constant '%s'" % expr) def map_variable(self, expr, enclosing_prec, type_context): - if expr.name in self.kernel.temporary_variables: - gsize, lsize = self.kernel.get_grid_sizes_as_exprs() + tv = self.kernel.temporary_variables.get(expr.name) + + from loopy.kernel.data import temp_var_scope + if tv is not None and tv.scope == temp_var_scope.PRIVATE: + # FIXME: This is a pretty coarse way of deciding what + # private temporaries get duplicated. Refine? 
(See also + # below in decl generation) + _, lsize = self.kernel.get_grid_size_upper_bounds_as_exprs() if lsize: return "%s[programIndex]" % expr.name else: return expr.name + else: return super(ExprToISPCMapper, self).map_variable( expr, enclosing_prec, type_context) @@ -291,7 +298,7 @@ class ISPCASTBuilder(CASTBuilder): else: raise LoopyError("unknown barrier kind") - def get_temporary_decl(self, knl, temp_var, decl_info): + def get_temporary_decl(self, knl, schedule_index, temp_var, decl_info): from loopy.target.c import POD # uses the correct complex type temp_var_decl = POD(self, decl_info.dtype, decl_info.name) @@ -299,7 +306,10 @@ class ISPCASTBuilder(CASTBuilder): from loopy.kernel.data import temp_var_scope if temp_var.scope == temp_var_scope.PRIVATE: - gsize, lsize = knl.get_grid_sizes_as_exprs() + # FIXME: This is a pretty coarse way of deciding what + # private temporaries get duplicated. Refine? (See also + # above in expr to code mapper) + _, lsize = knl.get_grid_size_upper_bounds_as_exprs() shape = lsize + shape if shape: diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 72147daf80fdfdb10e8e62d3b2163879a325e962..9179d0ec441b4e312d6c624c95b845cc05149897 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -631,8 +631,8 @@ class PyOpenCLPythonASTBuilder(PythonASTBuilderBase): # no such thing in Python return None - def get_temporary_decls(self, codegen_state): - # FIXME: Create global temporaries + def get_temporary_decls(self, codegen_state, schedule_index): + # Temporaries allocated in get_function_definition return [] def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args):