diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 66210bb2ab326a48609debc0e491aeac4350ed65..dbd5ca28d1784f2ba8217838982c4fb22d09221e 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -175,7 +175,7 @@ class CTarget(TargetBase): # {{{ declare temporaries base_storage_sizes = {} - base_storage_to_is_local = {} + base_storage_to_scope = {} base_storage_to_align_bytes = {} from cgen import ArrayOf, Pointer, Initializer, AlignedAttribute, Value @@ -193,21 +193,16 @@ class CTarget(TargetBase): if not tv.base_storage: for idi in decl_info: - temp_var_decl = POD(self, idi.dtype, idi.name) - - if idi.shape: - temp_var_decl = ArrayOf(temp_var_decl, - " * ".join(str(s) for s in idi.shape)) - temp_decls.append( - self.wrap_temporary_decl(temp_var_decl, tv.is_local)) + self.wrap_temporary_decl( + self.get_temporary_decl(kernel, tv, idi), tv.scope)) else: offset = 0 base_storage_sizes.setdefault(tv.base_storage, []).append( tv.nbytes) - base_storage_to_is_local.setdefault(tv.base_storage, []).append( - tv.is_local) + base_storage_to_scope.setdefault(tv.base_storage, []).append( + tv.scope) align_size = tv.dtype.itemsize @@ -223,9 +218,9 @@ class CTarget(TargetBase): cast_decl = POD(self, idi.dtype, "") temp_var_decl = POD(self, idi.dtype, idi.name) - cast_decl = self.wrap_temporary_decl(cast_decl, tv.is_local) + cast_decl = self.wrap_temporary_decl(cast_decl, tv.scope) temp_var_decl = self.wrap_temporary_decl( - temp_var_decl, tv.is_local) + temp_var_decl, tv.scope) # The 'restrict' part of this is a complete lie--of course # all these temporaries are aliased. But we're promising to @@ -253,7 +248,7 @@ class CTarget(TargetBase): for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)): bs_var_decl = Value("char", bs_name) bs_var_decl = self.wrap_temporary_decl( - bs_var_decl, base_storage_to_is_local[bs_name]) + bs_var_decl, base_storage_to_scope[bs_name]) bs_var_decl = ArrayOf(bs_var_decl, max(bs_sizes)) alignment = max(base_storage_to_align_bytes[bs_name]) @@ -286,7 +281,18 @@ class CTarget(TargetBase): from loopy.target.c.codegen.expression import LoopyCCodeMapper return LoopyCCodeMapper(codegen_state, fortran_abi=self.fortran_abi) - def wrap_temporary_decl(self, decl, is_local): + def get_temporary_decl(self, knl, temp_var, decl_info): + from loopy.codegen import POD # uses the correct complex type + temp_var_decl = POD(self, decl_info.dtype, decl_info.name) + + if decl_info.shape: + from cgen import ArrayOf + temp_var_decl = ArrayOf(temp_var_decl, + " * ".join(str(s) for s in decl_info.shape)) + + return temp_var_decl + + def wrap_temporary_decl(self, decl, scope): return decl def get_value_arg_decl(self, name, shape, dtype, is_written): diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 6d3833a5cb719f8a0149f304256a738b7c1fc89a..93b5da96fc2380d34beea350667adafc23028fac 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -32,6 +32,7 @@ from loopy.target.c import CTarget from loopy.target.c.codegen.expression import LoopyCCodeMapper from loopy.diagnostic import LoopyError from loopy.types import NumpyType +from loopy.kernel.data import temp_var_scope # {{{ vector types @@ -284,12 +285,15 @@ class CudaTarget(CTarget): else: raise LoopyError("unknown barrier kind") - def wrap_temporary_decl(self, decl, is_local): - if is_local: + def wrap_temporary_decl(self, decl, scope): + if scope == temp_var_scope.LOCAL: from cgen.cuda import CudaShared return CudaShared(decl) - else: + elif scope == temp_var_scope.PRIVATE: return decl + else: + raise ValueError("unexpected temporary variable scope: %s" + % scope) def get_global_arg_decl(self, name, shape, dtype, is_written): from loopy.codegen import POD # uses the correct complex type diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 8ab556929b54a0e8b5721300041be0c55432e9af..20add6dfb32fa0d1d25461114286b43ca3fbcc35 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -29,6 +29,7 @@ import numpy as np # noqa from loopy.target.c import CTarget from loopy.target.c.codegen.expression import LoopyCCodeMapper from loopy.diagnostic import LoopyError +from pymbolic.mapper.stringifier import (PREC_SUM, PREC_CALL) from pytools import memoize_method @@ -71,6 +72,49 @@ class LoopyISPCCodeMapper(LoopyCCodeMapper): raise RuntimeError("don't know how to generated code " "for constant '%s'" % expr) + + def map_variable(self, expr, enclosing_prec, type_context): + if expr.name in self.kernel.temporary_variables: + gsize, lsize = self.kernel.get_grid_sizes_as_exprs() + if lsize: + return "%s[programIndex]" % expr.name + else: + return expr.name + else: + return super(LoopyISPCCodeMapper, self).map_variable( + expr, enclosing_prec, type_context) + + def map_subscript(self, expr, enclosing_prec, type_context): + from loopy.kernel.data import TemporaryVariable + + ary = self.find_array(expr) + + if isinstance(ary, TemporaryVariable): + gsize, lsize = self.kernel.get_grid_sizes_as_exprs() + if lsize: + from loopy.kernel.array import get_access_info + from pymbolic import evaluate + + access_info = get_access_info(self.kernel.target, ary, expr.index, + lambda expr: evaluate(expr, self.codegen_state.var_subst_map), + self.codegen_state.vectorization_info) + + subscript, = access_info.subscripts + result = self.parenthesize_if_needed( + "%s[programIndex + %s]" % ( + access_info.array_name, + self.rec(lsize*subscript, PREC_SUM, 'i')), + enclosing_prec, PREC_CALL) + + if access_info.vector_index is not None: + return self.kernel.target.add_vector_access( + result, access_info.vector_index) + else: + return result + + return super(LoopyISPCCodeMapper, self).map_subscript( + expr, enclosing_prec, type_context) + # }}} @@ -243,7 +287,25 @@ class ISPCTarget(CTarget): else: raise LoopyError("unknown barrier kind") - def wrap_temporary_decl(self, decl, is_local): + def get_temporary_decl(self, knl, temp_var, decl_info): + from loopy.codegen import POD # uses the correct complex type + temp_var_decl = POD(self, decl_info.dtype, decl_info.name) + + shape = decl_info.shape + + from loopy.kernel.data import temp_var_scope + if temp_var.scope == temp_var_scope.PRIVATE: + gsize, lsize = knl.get_grid_sizes_as_exprs() + shape = lsize + shape + + if shape: + from cgen import ArrayOf + temp_var_decl = ArrayOf(temp_var_decl, + " * ".join(str(s) for s in shape)) + + return temp_var_decl + + def wrap_temporary_decl(self, decl, scope): from cgen.ispc import ISPCUniform return ISPCUniform(decl) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index a87132cde442e452dd8f24b15e02570d3199641b..f083a70b0288d88d544285e0063eb2e677dfefcf 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -32,6 +32,7 @@ from pytools import memoize_method from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.target.c import DTypeRegistryWrapper +from loopy.kernel.data import temp_var_scope # {{{ dtype registry wrappers @@ -378,12 +379,15 @@ class OpenCLTarget(CTarget): else: raise LoopyError("unknown barrier kind") - def wrap_temporary_decl(self, decl, is_local): - if is_local: + def wrap_temporary_decl(self, decl, scope): + if scope == temp_var_scope.LOCAL: from cgen.opencl import CLLocal return CLLocal(decl) - else: + elif scope == temp_var_scope.PRIVATE: return decl + else: + raise ValueError("unexpected temporary variable scope: %s" + % scope) def get_global_arg_decl(self, name, shape, dtype, is_written): from cgen.opencl import CLGlobal