From 54db43d8d0f3700d175e0e3666516c9f086d5768 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Thu, 28 Apr 2016 19:51:41 -0500 Subject: [PATCH] ISPC backend: Generate an instance of each private temporary for each element of the workgroup --- loopy/target/c/__init__.py | 34 +++++++++++--------- loopy/target/cuda.py | 10 ++++-- loopy/target/ispc.py | 64 +++++++++++++++++++++++++++++++++++++- loopy/target/opencl.py | 10 ++++-- 4 files changed, 97 insertions(+), 21 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 66210bb2a..dbd5ca28d 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -175,7 +175,7 @@ class CTarget(TargetBase): # {{{ declare temporaries base_storage_sizes = {} - base_storage_to_is_local = {} + base_storage_to_scope = {} base_storage_to_align_bytes = {} from cgen import ArrayOf, Pointer, Initializer, AlignedAttribute, Value @@ -193,21 +193,16 @@ class CTarget(TargetBase): if not tv.base_storage: for idi in decl_info: - temp_var_decl = POD(self, idi.dtype, idi.name) - - if idi.shape: - temp_var_decl = ArrayOf(temp_var_decl, - " * ".join(str(s) for s in idi.shape)) - temp_decls.append( - self.wrap_temporary_decl(temp_var_decl, tv.is_local)) + self.wrap_temporary_decl( + self.get_temporary_decl(kernel, tv, idi), tv.scope)) else: offset = 0 base_storage_sizes.setdefault(tv.base_storage, []).append( tv.nbytes) - base_storage_to_is_local.setdefault(tv.base_storage, []).append( - tv.is_local) + base_storage_to_scope.setdefault(tv.base_storage, []).append( + tv.scope) align_size = tv.dtype.itemsize @@ -223,9 +218,9 @@ class CTarget(TargetBase): cast_decl = POD(self, idi.dtype, "") temp_var_decl = POD(self, idi.dtype, idi.name) - cast_decl = self.wrap_temporary_decl(cast_decl, tv.is_local) + cast_decl = self.wrap_temporary_decl(cast_decl, tv.scope) temp_var_decl = self.wrap_temporary_decl( - temp_var_decl, tv.is_local) + temp_var_decl, tv.scope) # The 'restrict' part of this is a complete lie--of course # all these temporaries are aliased. But we're promising to @@ -253,7 +248,7 @@ class CTarget(TargetBase): for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)): bs_var_decl = Value("char", bs_name) bs_var_decl = self.wrap_temporary_decl( - bs_var_decl, base_storage_to_is_local[bs_name]) + bs_var_decl, base_storage_to_scope[bs_name]) bs_var_decl = ArrayOf(bs_var_decl, max(bs_sizes)) alignment = max(base_storage_to_align_bytes[bs_name]) @@ -286,7 +281,18 @@ class CTarget(TargetBase): from loopy.target.c.codegen.expression import LoopyCCodeMapper return LoopyCCodeMapper(codegen_state, fortran_abi=self.fortran_abi) - def wrap_temporary_decl(self, decl, is_local): + def get_temporary_decl(self, knl, temp_var, decl_info): + from loopy.codegen import POD # uses the correct complex type + temp_var_decl = POD(self, decl_info.dtype, decl_info.name) + + if decl_info.shape: + from cgen import ArrayOf + temp_var_decl = ArrayOf(temp_var_decl, + " * ".join(str(s) for s in decl_info.shape)) + + return temp_var_decl + + def wrap_temporary_decl(self, decl, scope): return decl def get_value_arg_decl(self, name, shape, dtype, is_written): diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 6d3833a5c..93b5da96f 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -32,6 +32,7 @@ from loopy.target.c import CTarget from loopy.target.c.codegen.expression import LoopyCCodeMapper from loopy.diagnostic import LoopyError from loopy.types import NumpyType +from loopy.kernel.data import temp_var_scope # {{{ vector types @@ -284,12 +285,15 @@ class CudaTarget(CTarget): else: raise LoopyError("unknown barrier kind") - def wrap_temporary_decl(self, decl, is_local): - if is_local: + def wrap_temporary_decl(self, decl, scope): + if scope == temp_var_scope.LOCAL: from cgen.cuda import CudaShared return CudaShared(decl) - else: + elif scope == temp_var_scope.PRIVATE: return decl + else: + raise ValueError("unexpected temporary variable scope: %s" + % scope) def get_global_arg_decl(self, name, shape, dtype, is_written): from loopy.codegen import POD # uses the correct complex type diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 8ab556929..20add6dfb 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -29,6 +29,7 @@ import numpy as np # noqa from loopy.target.c import CTarget from loopy.target.c.codegen.expression import LoopyCCodeMapper from loopy.diagnostic import LoopyError +from pymbolic.mapper.stringifier import (PREC_SUM, PREC_CALL) from pytools import memoize_method @@ -71,6 +72,49 @@ class LoopyISPCCodeMapper(LoopyCCodeMapper): raise RuntimeError("don't know how to generated code " "for constant '%s'" % expr) + + def map_variable(self, expr, enclosing_prec, type_context): + if expr.name in self.kernel.temporary_variables: + gsize, lsize = self.kernel.get_grid_sizes_as_exprs() + if lsize: + return "%s[programIndex]" % expr.name + else: + return expr.name + else: + return super(LoopyISPCCodeMapper, self).map_variable( + expr, enclosing_prec, type_context) + + def map_subscript(self, expr, enclosing_prec, type_context): + from loopy.kernel.data import TemporaryVariable + + ary = self.find_array(expr) + + if isinstance(ary, TemporaryVariable): + gsize, lsize = self.kernel.get_grid_sizes_as_exprs() + if lsize: + from loopy.kernel.array import get_access_info + from pymbolic import evaluate + + access_info = get_access_info(self.kernel.target, ary, expr.index, + lambda expr: evaluate(expr, self.codegen_state.var_subst_map), + self.codegen_state.vectorization_info) + + subscript, = access_info.subscripts + result = self.parenthesize_if_needed( + "%s[programIndex + %s]" % ( + access_info.array_name, + self.rec(lsize*subscript, PREC_SUM, 'i')), + enclosing_prec, PREC_CALL) + + if access_info.vector_index is not None: + return self.kernel.target.add_vector_access( + result, access_info.vector_index) + else: + return result + + return super(LoopyISPCCodeMapper, self).map_subscript( + expr, enclosing_prec, type_context) + # }}} @@ -243,7 +287,25 @@ class ISPCTarget(CTarget): else: raise LoopyError("unknown barrier kind") - def wrap_temporary_decl(self, decl, is_local): + def get_temporary_decl(self, knl, temp_var, decl_info): + from loopy.codegen import POD # uses the correct complex type + temp_var_decl = POD(self, decl_info.dtype, decl_info.name) + + shape = decl_info.shape + + from loopy.kernel.data import temp_var_scope + if temp_var.scope == temp_var_scope.PRIVATE: + gsize, lsize = knl.get_grid_sizes_as_exprs() + shape = lsize + shape + + if shape: + from cgen import ArrayOf + temp_var_decl = ArrayOf(temp_var_decl, + " * ".join(str(s) for s in shape)) + + return temp_var_decl + + def wrap_temporary_decl(self, decl, scope): from cgen.ispc import ISPCUniform return ISPCUniform(decl) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index a87132cde..f083a70b0 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -32,6 +32,7 @@ from pytools import memoize_method from loopy.diagnostic import LoopyError from loopy.types import NumpyType from loopy.target.c import DTypeRegistryWrapper +from loopy.kernel.data import temp_var_scope # {{{ dtype registry wrappers @@ -378,12 +379,15 @@ class OpenCLTarget(CTarget): else: raise LoopyError("unknown barrier kind") - def wrap_temporary_decl(self, decl, is_local): - if is_local: + def wrap_temporary_decl(self, decl, scope): + if scope == temp_var_scope.LOCAL: from cgen.opencl import CLLocal return CLLocal(decl) - else: + elif scope == temp_var_scope.PRIVATE: return decl + else: + raise ValueError("unexpected temporary variable scope: %s" + % scope) def get_global_arg_decl(self, name, shape, dtype, is_written): from cgen.opencl import CLGlobal -- GitLab