From 54db43d8d0f3700d175e0e3666516c9f086d5768 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Thu, 28 Apr 2016 19:51:41 -0500
Subject: [PATCH] ISPC backend: Generate an instance of each private temporary
 for each element of the workgroup

---
 loopy/target/c/__init__.py | 34 +++++++++++---------
 loopy/target/cuda.py       | 10 ++++--
 loopy/target/ispc.py       | 64 +++++++++++++++++++++++++++++++++++++-
 loopy/target/opencl.py     | 10 ++++--
 4 files changed, 97 insertions(+), 21 deletions(-)

diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 66210bb2a..dbd5ca28d 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -175,7 +175,7 @@ class CTarget(TargetBase):
         # {{{ declare temporaries
 
         base_storage_sizes = {}
-        base_storage_to_is_local = {}
+        base_storage_to_scope = {}
         base_storage_to_align_bytes = {}
 
         from cgen import ArrayOf, Pointer, Initializer, AlignedAttribute, Value
@@ -193,21 +193,16 @@ class CTarget(TargetBase):
 
             if not tv.base_storage:
                 for idi in decl_info:
-                    temp_var_decl = POD(self, idi.dtype, idi.name)
-
-                    if idi.shape:
-                        temp_var_decl = ArrayOf(temp_var_decl,
-                                " * ".join(str(s) for s in idi.shape))
-
                     temp_decls.append(
-                            self.wrap_temporary_decl(temp_var_decl, tv.is_local))
+                            self.wrap_temporary_decl(
+                                self.get_temporary_decl(kernel, tv, idi), tv.scope))
 
             else:
                 offset = 0
                 base_storage_sizes.setdefault(tv.base_storage, []).append(
                         tv.nbytes)
-                base_storage_to_is_local.setdefault(tv.base_storage, []).append(
-                        tv.is_local)
+                base_storage_to_scope.setdefault(tv.base_storage, []).append(
+                        tv.scope)
 
                 align_size = tv.dtype.itemsize
 
@@ -223,9 +218,9 @@ class CTarget(TargetBase):
                     cast_decl = POD(self, idi.dtype, "")
                     temp_var_decl = POD(self, idi.dtype, idi.name)
 
-                    cast_decl = self.wrap_temporary_decl(cast_decl, tv.is_local)
+                    cast_decl = self.wrap_temporary_decl(cast_decl, tv.scope)
                     temp_var_decl = self.wrap_temporary_decl(
-                            temp_var_decl, tv.is_local)
+                            temp_var_decl, tv.scope)
 
                     # The 'restrict' part of this is a complete lie--of course
                     # all these temporaries are aliased. But we're promising to
@@ -253,7 +248,7 @@ class CTarget(TargetBase):
         for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
             bs_var_decl = Value("char", bs_name)
             bs_var_decl = self.wrap_temporary_decl(
-                    bs_var_decl, base_storage_to_is_local[bs_name])
+                    bs_var_decl, base_storage_to_scope[bs_name])
             bs_var_decl = ArrayOf(bs_var_decl, max(bs_sizes))
 
             alignment = max(base_storage_to_align_bytes[bs_name])
@@ -286,7 +281,18 @@ class CTarget(TargetBase):
         from loopy.target.c.codegen.expression import LoopyCCodeMapper
         return LoopyCCodeMapper(codegen_state, fortran_abi=self.fortran_abi)
 
-    def wrap_temporary_decl(self, decl, is_local):
+    def get_temporary_decl(self, knl, temp_var, decl_info):
+        """Return the cgen declaration (POD or array of POD) for a temporary."""
+        from loopy.codegen import POD  # uses the correct complex type
+        decl = POD(self, decl_info.dtype, decl_info.name)
+        if not decl_info.shape:
+            return decl
+
+        # knl and temp_var are unused here; the ISPC override reads them.
+        from cgen import ArrayOf
+        return ArrayOf(decl, " * ".join(map(str, decl_info.shape)))
+
+    def wrap_temporary_decl(self, decl, scope):
         return decl
 
     def get_value_arg_decl(self, name, shape, dtype, is_written):
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 6d3833a5c..93b5da96f 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -32,6 +32,7 @@ from loopy.target.c import CTarget
 from loopy.target.c.codegen.expression import LoopyCCodeMapper
 from loopy.diagnostic import LoopyError
 from loopy.types import NumpyType
+from loopy.kernel.data import temp_var_scope
 
 
 # {{{ vector types
@@ -284,12 +285,15 @@ class CudaTarget(CTarget):
         else:
             raise LoopyError("unknown barrier kind")
 
-    def wrap_temporary_decl(self, decl, is_local):
-        if is_local:
+    def wrap_temporary_decl(self, decl, scope):
+        if scope == temp_var_scope.LOCAL:
             from cgen.cuda import CudaShared
             return CudaShared(decl)
-        else:
+        elif scope == temp_var_scope.PRIVATE:
             return decl
+        else:
+            raise ValueError("unexpected temporary variable scope: %s"
+                    % scope)
 
     def get_global_arg_decl(self, name, shape, dtype, is_written):
         from loopy.codegen import POD  # uses the correct complex type
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 8ab556929..20add6dfb 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -29,6 +29,7 @@ import numpy as np  # noqa
 from loopy.target.c import CTarget
 from loopy.target.c.codegen.expression import LoopyCCodeMapper
 from loopy.diagnostic import LoopyError
+from pymbolic.mapper.stringifier import (PREC_SUM, PREC_CALL)
 
 from pytools import memoize_method
 
@@ -71,6 +72,49 @@ class LoopyISPCCodeMapper(LoopyCCodeMapper):
 
                 raise RuntimeError("don't know how to generated code "
                         "for constant '%s'" % expr)
+
+    def map_variable(self, expr, enclosing_prec, type_context):
+        """Index PRIVATE temporaries by programIndex, as their decl expects."""
+        from loopy.kernel.data import temp_var_scope
+        temps = self.kernel.temporary_variables
+        if expr.name not in temps:
+            return super(LoopyISPCCodeMapper, self).map_variable(
+                    expr, enclosing_prec, type_context)
+        lsize = self.kernel.get_grid_sizes_as_exprs()[1]
+        per_lane = lsize and temps[expr.name].scope == temp_var_scope.PRIVATE
+        return "%s[programIndex]" % expr.name if per_lane else expr.name
+
+    def map_subscript(self, expr, enclosing_prec, type_context):
+        """Emit a subscript; private temporaries get one slot per program
+        instance, matching the lsize-expanded declaration of such arrays."""
+        from loopy.kernel.data import TemporaryVariable, temp_var_scope
+
+        ary = self.find_array(expr)
+        # Only PRIVATE temporaries are declared with the extra lsize axis.
+        if (isinstance(ary, TemporaryVariable)
+                and ary.scope == temp_var_scope.PRIVATE):
+            gsize, lsize = self.kernel.get_grid_sizes_as_exprs()
+            if lsize:
+                from loopy.kernel.array import get_access_info
+                from pymbolic import evaluate
+                # lsize is a tuple; unpack its single entry to use as stride.
+                lsize, = lsize
+                access_info = get_access_info(self.kernel.target, ary, expr.index,
+                    lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
+                    self.codegen_state.vectorization_info)
+                subscript, = access_info.subscripts
+                result = self.parenthesize_if_needed(
+                        "%s[programIndex + %s]" % (access_info.array_name,
+                            self.rec(lsize*subscript, PREC_SUM, 'i')),
+                        enclosing_prec, PREC_CALL)
+                if access_info.vector_index is not None:
+                    return self.kernel.target.add_vector_access(
+                        result, access_info.vector_index)
+                return result
+
+        return super(LoopyISPCCodeMapper, self).map_subscript(
+                expr, enclosing_prec, type_context)
+
 # }}}
 
 
@@ -243,7 +287,25 @@ class ISPCTarget(CTarget):
         else:
             raise LoopyError("unknown barrier kind")
 
-    def wrap_temporary_decl(self, decl, is_local):
+    def get_temporary_decl(self, knl, temp_var, decl_info):
+        """Return the cgen declaration for a temporary; PRIVATE temporaries
+        get one instance per work-group element (extra leading extent)."""
+        from loopy.codegen import POD  # uses the correct complex type
+        from loopy.kernel.data import temp_var_scope
+
+        decl = POD(self, decl_info.dtype, decl_info.name)
+
+        dims = decl_info.shape
+        if temp_var.scope == temp_var_scope.PRIVATE:
+            # Prepend the second grid-size entry (presumably the local size).
+            dims = knl.get_grid_sizes_as_exprs()[1] + dims
+
+        if not dims:
+            return decl
+        from cgen import ArrayOf
+        return ArrayOf(decl, " * ".join(map(str, dims)))
+
+    def wrap_temporary_decl(self, decl, scope):
         from cgen.ispc import ISPCUniform
         return ISPCUniform(decl)
 
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index a87132cde..f083a70b0 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -32,6 +32,7 @@ from pytools import memoize_method
 from loopy.diagnostic import LoopyError
 from loopy.types import NumpyType
 from loopy.target.c import DTypeRegistryWrapper
+from loopy.kernel.data import temp_var_scope
 
 
 # {{{ dtype registry wrappers
@@ -378,12 +379,15 @@ class OpenCLTarget(CTarget):
         else:
             raise LoopyError("unknown barrier kind")
 
-    def wrap_temporary_decl(self, decl, is_local):
-        if is_local:
+    def wrap_temporary_decl(self, decl, scope):
+        if scope == temp_var_scope.LOCAL:
             from cgen.opencl import CLLocal
             return CLLocal(decl)
-        else:
+        elif scope == temp_var_scope.PRIVATE:
             return decl
+        else:
+            raise ValueError("unexpected temporary variable scope: %s"
+                    % scope)
 
     def get_global_arg_decl(self, name, shape, dtype, is_written):
         from cgen.opencl import CLGlobal
-- 
GitLab