From 5710fca2819dcca6c35757acb097e3cfe55f077d Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Mon, 7 Dec 2015 13:34:41 -0600
Subject: [PATCH] Finish ISPC backend

---
 loopy/codegen/__init__.py            |   3 +-
 loopy/codegen/loop.py                |  16 ++---
 loopy/kernel/data.py                 |   4 ++
 loopy/target/__init__.py             |   6 ++
 loopy/target/c/__init__.py           | 102 ++++++++++++++++++++++-----
 loopy/target/c/codegen/expression.py |  31 ++------
 loopy/target/ispc/__init__.py        |  95 +++++++++++++++++++++++--
 loopy/target/opencl/__init__.py      |  30 ++++----
 test/test_loopy.py                   |   4 +-
 9 files changed, 210 insertions(+), 81 deletions(-)

diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 225f7e7fe..2e136d7bb 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -499,7 +499,6 @@ def generate_code(kernel, device=None):
 
     from loopy.kernel.data import ValueArg
     from loopy.kernel.array import ArrayBase
-    from cgen import Const
 
     impl_arg_info = []
 
@@ -516,7 +515,7 @@ def generate_code(kernel, device=None):
                 target=kernel.target,
                 name=arg.name,
                 dtype=arg.dtype,
-                cgen_declarator=Const(POD(kernel.target, arg.dtype, arg.name)),
+                cgen_declarator=arg.get_arg_decl(kernel.target),
                 arg_class=ValueArg))
 
         else:
diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index eb5c00d29..74292ce79 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -423,7 +423,7 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
             from cgen import Comment
             result.append(Comment(cmt))
 
-        from cgen import Initializer, POD, Const, Line, For
+        from cgen import Initializer, POD, Const, Line
         from loopy.symbolic import aff_to_expr
 
         if (static_ubound - static_lbound).plain_is_zero():
@@ -436,16 +436,10 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state):
                 ]))
 
         else:
-            from loopy.codegen import wrap_in
-
-            result.append(wrap_in(For,
-                    "%s %s = %s"
-                    % (kernel.target.dtype_to_typename(kernel.index_dtype),
-                        loop_iname, ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
-                    "%s <= %s" % (
-                        loop_iname, ecm(aff_to_expr(static_ubound), PREC_NONE, "i")),
-                    "++%s" % loop_iname,
-                    inner))
+            result.append(
+                kernel.target.emit_sequential_loop(
+                       codegen_state, loop_iname, kernel.index_dtype,
+                       static_lbound, static_ubound, inner))
 
     return gen_code_block(result)
 
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index ccc2e378f..c95ca0e91 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -278,6 +278,10 @@ class ValueArg(KernelArgument):
         key_builder.rec(key_hash, self.name)
         key_builder.rec(key_hash, self.dtype)
 
+    def get_arg_decl(self, target):
+        return target.get_value_arg_decl(self.name, (),
+                self.dtype, False)
+
 # }}}
 
 
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 4c54570aa..17534bf6d 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -97,12 +97,18 @@ class TargetBase(object):
 
     # {{{ code generation guts
 
+    def get_expression_to_code_mapper(self, codegen_state):
+        raise NotImplementedError()
+
     def get_global_axis_expr(self, axis):
         raise NotImplementedError()
 
     def get_local_axis_expr(self, axis):
         raise NotImplementedError()
 
+    def add_vector_access(self, access_str, index):
+        raise NotImplementedError()
+
     def emit_barrier(self, kind, comment):
         """
         :arg kind: ``"local"`` or ``"global"``
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index e98c2af00..0cf179922 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -32,7 +32,32 @@ from loopy.target import TargetBase
 from pytools import memoize_method
 
 
+# {{{ preamble generator
+
+def _preamble_generator(kernel, seen_dtypes, seen_functions):
+    c_funcs = set(func.c_name for func in seen_functions)
+    if "int_floor_div" in c_funcs:
+        yield ("05_int_floor_div", """
+            #define int_floor_div(a,b) \
+              (( (a) - \
+                 ( ( (a)<0 ) != ( (b)<0 )) \
+                  *( (b) + ( (b)<0 ) - ( (b)>=0 ) )) \
+               / (b) )
+            """)
+
+    if "int_floor_div_pos_b" in c_funcs:
+        yield ("05_int_floor_div_pos_b", """
+            #define int_floor_div_pos_b(a,b) ( \
+                ( (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b) \
+                )
+            """)
+
+# }}}
+
+
 class CTarget(TargetBase):
+    # {{{ types
+
     @memoize_method
     def get_dtype_registry(self):
         from loopy.target.c.compyte.dtypes import (
@@ -49,14 +74,24 @@ class CTarget(TargetBase):
         raise KeyError()
 
     def get_or_register_dtype(self, names, dtype=None):
+        # These kind of shouldn't be here.
         return self.get_dtype_registry().get_or_register_dtype(names, dtype)
 
     def dtype_to_typename(self, dtype):
+        # These kind of shouldn't be here.
         return self.get_dtype_registry().dtype_to_ctype(dtype)
 
-    def get_expression_to_code_mapper(self, codegen_state):
-        from loopy.target.c.codegen.expression import LoopyCCodeMapper
-        return LoopyCCodeMapper(codegen_state)
+    # }}}
+
+    # {{{ library
+
+    def preamble_generators(self):
+        return (
+                super(CTarget, self).preamble_generators() + [
+                    _preamble_generator,
+                    ])
+
+    # }}}
 
     # {{{ code generation
 
@@ -95,7 +130,6 @@ class CTarget(TargetBase):
 
         from cgen import ArrayOf, Pointer, Initializer, AlignedAttribute
         from loopy.codegen import POD  # uses the correct complex type
-        from cgen.opencl import CLLocal
 
         class ConstRestrictPointer(Pointer):
             def get_decl_pair(self):
@@ -115,10 +149,8 @@ class CTarget(TargetBase):
                         temp_var_decl = ArrayOf(temp_var_decl,
                                 " * ".join(str(s) for s in idi.shape))
 
-                    if tv.is_local:
-                        temp_var_decl = CLLocal(temp_var_decl)
-
-                    temp_decls.append(temp_var_decl)
+                    temp_decls.append(
+                            self.wrap_temporary_decl(temp_var_decl, tv.is_local))
 
             else:
                 offset = 0
@@ -141,9 +173,9 @@ class CTarget(TargetBase):
                     cast_decl = POD(self, idi.dtype, "")
                     temp_var_decl = POD(self, idi.dtype, idi.name)
 
-                    if tv.is_local:
-                        cast_decl = CLLocal(cast_decl)
-                        temp_var_decl = CLLocal(temp_var_decl)
+                    cast_decl = self.wrap_temporary_decl(cast_decl, tv.is_local)
+                    temp_var_decl = self.wrap_temporary_decl(
+                            temp_var_decl, tv.is_local)
 
                     # The 'restrict' part of this is a complete lie--of course
                     # all these temporaries are aliased. But we're promising to
@@ -170,9 +202,8 @@ class CTarget(TargetBase):
 
         for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)):
             bs_var_decl = POD(self, np.int8, bs_name)
-            if base_storage_to_is_local[bs_name]:
-                bs_var_decl = CLLocal(bs_var_decl)
-
+            bs_var_decl = self.wrap_temporary_decl(
+                    bs_var_decl, base_storage_to_is_local[bs_name])
             bs_var_decl = ArrayOf(bs_var_decl, max(bs_sizes))
 
             alignment = max(base_storage_to_align_bytes[bs_name])
@@ -197,18 +228,57 @@ class CTarget(TargetBase):
 
         return body, gen_code.implemented_domains
 
+    # }}}
+
+    # {{{ code generation guts
+
+    def get_expression_to_code_mapper(self, codegen_state):
+        from loopy.target.c.codegen.expression import LoopyCCodeMapper
+        return LoopyCCodeMapper(codegen_state)
+
+    def wrap_temporary_decl(self, decl, is_local):
+        return decl  # base C target ignores is_local; subclasses override
+
+    def get_value_arg_decl(self, name, shape, dtype, is_written):
+        assert shape == ()  # value arguments are passed with an empty shape
+        # Build a POD declarator; arguments that are not written become const.
+        from loopy.codegen import POD  # uses the correct complex type
+        result = POD(self, dtype, name)
+        if not is_written:
+            from cgen import Const
+            result = Const(result)
+        return result
+
     def get_global_arg_decl(self, name, shape, dtype, is_written):
         from loopy.codegen import POD  # uses the correct complex type
         from cgen import RestrictPointer, Const
 
-        arg_decl = RestrictPointer(
-                POD(self, dtype, name))
+        arg_decl = RestrictPointer(POD(self, dtype, name))
 
         if not is_written:
             arg_decl = Const(arg_decl)
 
         return arg_decl
 
+    def emit_sequential_loop(self, codegen_state, iname, iname_dtype,
+            static_lbound, static_ubound, inner):
+        ecm = codegen_state.expression_to_code_mapper
+        # Wrap *inner* in a cgen For that steps *iname* from lbound to ubound.
+        from loopy.symbolic import aff_to_expr
+
+        from loopy.codegen import wrap_in
+        from pymbolic.mapper.stringifier import PREC_NONE
+        from cgen import For
+        # The upper bound is inclusive, hence '<=' in the loop condition.
+        return wrap_in(For,
+                "%s %s = %s"
+                % (self.dtype_to_typename(iname_dtype),
+                    iname, ecm(aff_to_expr(static_lbound), PREC_NONE, "i")),
+                "%s <= %s" % (
+                    iname, ecm(aff_to_expr(static_ubound), PREC_NONE, "i")),
+                "++%s" % iname,
+                inner)
+
     # }}}
 
 # vim: foldmethod=marker
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 112e7e5b9..97bec6e59 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -39,14 +39,6 @@ from loopy.diagnostic import LoopyError
 from loopy.tools import is_integer
 
 
-def get_opencl_vec_member(idx):
-    if idx is None:
-        return idx
-
-    # The 'int' avoids an 'L' suffix for long ints.
-    return "s%s" % hex(int(idx))[2:]
-
-
 # {{{ C code mapper
 
 class LoopyCCodeMapper(RecursiveMapper):
@@ -176,8 +168,6 @@ class LoopyCCodeMapper(RecursiveMapper):
                 lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
                 self.codegen_state.vectorization_info)
 
-        vec_member = get_opencl_vec_member(access_info.vector_index)
-
         from loopy.kernel.data import ImageArg, GlobalArg, TemporaryVariable
 
         if isinstance(ary, ImageArg):
@@ -200,21 +190,11 @@ class LoopyCCodeMapper(RecursiveMapper):
             if len(access_info.subscripts) == 0:
                 if isinstance(ary, GlobalArg):
                     # unsubscripted global args are pointers
-                    if vec_member is not None:
-                        return "%s->%s" % (
-                                access_info.array_name,
-                                vec_member)
-                    else:
-                        return "*" + access_info.array_name
+                    result = "*" + access_info.array_name
 
                 else:
                     # unsubscripted temp vars are scalars
-                    if vec_member is not None:
-                        return "%s.%s" % (
-                                access_info.array_name,
-                                vec_member)
-                    else:
-                        return access_info.array_name
+                    result = access_info.array_name
 
             else:
                 subscript, = access_info.subscripts
@@ -224,9 +204,10 @@ class LoopyCCodeMapper(RecursiveMapper):
                             self.rec(subscript, PREC_NONE, 'i')),
                         enclosing_prec, PREC_CALL)
 
-                if vec_member:
-                    result += "."+vec_member
-
+            if access_info.vector_index is not None:
+                return self.kernel.target.add_vector_access(
+                    result, access_info.vector_index)
+            else:
                 return result
 
         else:
diff --git a/loopy/target/ispc/__init__.py b/loopy/target/ispc/__init__.py
index ffd07db13..e0c4b75a3 100644
--- a/loopy/target/ispc/__init__.py
+++ b/loopy/target/ispc/__init__.py
@@ -33,6 +33,74 @@ from pymbolic import var
 
 
 class ISPCTarget(CTarget):
+    # {{{ top-level codegen
+
+    def generate_code(self, kernel, codegen_state, impl_arg_info):
+        from cgen import (FunctionBody, FunctionDeclaration, Value, Module,
+                Block, Line, Statement as S)
+        from cgen.ispc import ISPCExport, ISPCTask
+        # Emit the kernel body as an ISPCTask plus an ISPCExport launch wrapper.
+        knl_body, implemented_domains = kernel.target.generate_body(
+                kernel, codegen_state)
+
+        inner_name = "lp_ispc_inner_"+kernel.name
+        arg_decls = [iai.cgen_declarator for iai in impl_arg_info]
+        knl_fbody = FunctionBody(
+                ISPCTask(
+                    FunctionDeclaration(
+                        Value("void", inner_name),
+                        arg_decls)),
+                knl_body)
+
+        # {{{ generate wrapper
+
+        wrapper_body = Block()
+
+        gsize, lsize = kernel.get_grid_sizes_as_exprs()
+        if len(lsize) > 1:
+            for i, ls_i in enumerate(lsize[1:]):
+                if ls_i != 1:
+                    raise LoopyError("local axis %d (0-based) "
+                            "has length > 1, which is unsupported "
+                            "by ISPC" % (i + 1))  # i counts from lsize[1]
+
+        from pymbolic.mapper.stringifier import PREC_COMPARISON, PREC_NONE
+        ccm = self.get_expression_to_code_mapper(codegen_state)
+        # NOTE(review): lsize[0] below assumes a local axis exists -- confirm.
+        wrapper_body.extend([
+                S("assert(programCount == %s)"
+                    % ccm(lsize[0], PREC_COMPARISON)),
+                S("launch[%s] %s(%s)"
+                    % (
+                        ", ".join(
+                            ccm(gs_i, PREC_NONE)
+                            for gs_i in gsize),
+                        inner_name,
+                        ", ".join(iai.name for iai in impl_arg_info)
+                        ))
+                ])
+
+        wrapper_fbody = FunctionBody(
+                ISPCExport(
+                    FunctionDeclaration(
+                        Value("void", kernel.name),
+                        arg_decls)),  # same declarators as the inner task
+                wrapper_body)
+
+        # }}}
+
+        mod = Module([
+            knl_fbody,
+            Line(),
+            wrapper_fbody,
+            ])
+
+        return str(mod), implemented_domains
+
+    # }}}
+
+    # {{{ code generation guts
+
     def get_global_axis_expr(self, axis):
         return var("taskIndex%d" % axis)
 
@@ -42,6 +110,9 @@ class ISPCTarget(CTarget):
         else:
             raise LoopyError("ISPC only supports one local axis")
 
+    def add_vector_access(self, access_str, index):
+        return "(%s)[%d]" % (access_str, index)
+
     def emit_barrier(self, kind, comment):
         from loopy.codegen import GeneratedInstruction
         from cgen import Comment, Statement
@@ -61,25 +132,37 @@ class ISPCTarget(CTarget):
         else:
             raise LoopyError("unknown barrier kind")
 
+    def wrap_temporary_decl(self, decl, is_local):
+        from cgen.ispc import ISPCUniform, ISPCVarying
+        if is_local:
+            return ISPCUniform(decl)
+        else:
+            return ISPCVarying(decl)
+
     def get_global_arg_decl(self, name, shape, dtype, is_written):
         from loopy.codegen import POD  # uses the correct complex type
         from cgen import Const
-        from cgen.ispc import ISPCUniformPointer
+        from cgen.ispc import ISPCUniformPointer, ISPCUniform
 
         arg_decl = ISPCUniformPointer(POD(self, dtype, name))
 
         if not is_written:
             arg_decl = Const(arg_decl)
 
+        arg_decl = ISPCUniform(arg_decl)
+
         return arg_decl
 
+    def get_value_arg_decl(self, name, shape, dtype, is_written):
+        result = super(ISPCTarget, self).get_value_arg_decl(
+                name, shape, dtype, is_written)
+
+        from cgen.ispc import ISPCUniform
+        return ISPCUniform(result)
+
     # }}}
 
-# TODO: Fix argument wrapping (value,
-# TODO: Fix local variable wrapping
-# TODO: Fix local variable alloc
-# TODO: Top-level foreach
 # TODO: Generate launch code
-# TODO: Vector types
+# TODO: Vector types (element access: done)
 
 # vim: foldmethod=marker
diff --git a/loopy/target/opencl/__init__.py b/loopy/target/opencl/__init__.py
index 4a39e5245..4a9b0f31d 100644
--- a/loopy/target/opencl/__init__.py
+++ b/loopy/target/opencl/__init__.py
@@ -172,23 +172,6 @@ def opencl_preamble_generator(kernel, seen_dtypes, seen_functions):
             #endif
             """)
 
-    c_funcs = set(func.c_name for func in seen_functions)
-    if "int_floor_div" in c_funcs:
-        yield ("05_int_floor_div", """
-            #define int_floor_div(a,b) \
-              (( (a) - \
-                 ( ( (a)<0 ) != ( (b)<0 )) \
-                  *( (b) + ( (b)<0 ) - ( (b)>=0 ) )) \
-               / (b) )
-            """)
-
-    if "int_floor_div_pos_b" in c_funcs:
-        yield ("05_int_floor_div_pos_b", """
-            #define int_floor_div_pos_b(a,b) ( \
-                ( (a) - ( ((a)<0) ? ((b)-1) : 0 )  ) / (b) \
-                )
-            """)
-
 # }}}
 
 
@@ -290,6 +273,10 @@ class OpenCLTarget(CTarget):
     def get_local_axis_expr(self, axis):
         return var("lid")(axis)
 
+    def add_vector_access(self, access_str, index):
+        # The 'int' avoids an 'L' suffix for long ints.
+        return "(%s).s%s" % (access_str, hex(int(index))[2:])
+
     def emit_barrier(self, kind, comment):
         """
         :arg kind: ``"local"`` or ``"global"``
@@ -309,6 +296,13 @@ class OpenCLTarget(CTarget):
         else:
             raise LoopyError("unknown barrier kind")
 
+    def wrap_temporary_decl(self, decl, is_local):
+        if is_local:
+            from cgen.opencl import CLLocal
+            return CLLocal(decl)
+        else:
+            return decl
+
     def get_global_arg_decl(self, name, shape, dtype, is_written):
         from cgen.opencl import CLGlobal
 
@@ -324,7 +318,7 @@ class OpenCLTarget(CTarget):
         from cgen.opencl import CLImage
         return CLImage(self.num_target_axes(), mode, name)
 
-    def get_arg_decl(self, name, shape, dtype, is_written):
+    def get_constant_arg_decl(self, name, shape, dtype, is_written):
         from loopy.codegen import POD  # uses the correct complex type
         from cgen import RestrictPointer, Const
         from cgen.opencl import CLConstant
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 81c0dd553..3e14ed2d8 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2320,14 +2320,12 @@ def test_ispc_backend():
             "{ [i]: 0<=i<n }",
             "out[i] = 2*a[i]",
             [
-                # Tests that comma'd arguments interoperate with
-                # argument guessing.
                 lp.GlobalArg("out,a", np.float32, shape=lp.auto),
                 "..."
                 ],
             target=ISPCTarget())
 
-    knl = lp.split_iname(knl, "i", 128, inner_tag="l.0")
+    knl = lp.split_iname(knl, "i", 8, inner_tag="l.0")
     knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp")
     knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])
 
-- 
GitLab