diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index 948c419c7989174d0824c342981f88b6fa4b8e6b..0b509fad8ec2d3ae6a21d5a228e0fd578cb0ab4c 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -79,17 +79,7 @@ def generate_code_for_sched_index(kernel, sched_index, codegen_state): return func(kernel, sched_index, codegen_state) elif isinstance(sched_item, Barrier): - from loopy.codegen import GeneratedInstruction - from cgen import Statement as S # noqa - - if sched_item.comment: - comment = " /* %s */" % sched_item.comment - else: - comment = "" - - return GeneratedInstruction( - ast=S("barrier(CLK_LOCAL_MEM_FENCE)%s" % comment), - implemented_domain=None) + return kernel.target.emit_barrier(sched_item.kind, sched_item.comment) elif isinstance(sched_item, RunInstruction): insn = kernel.id_to_insn[sched_item.insn_id] diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 936366030a239822d77c0265f698edcbd56ef695..eb5c00d299eff02aad714e324fca92d8d3bdbffe 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -244,12 +244,10 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, tag = kernel.iname_to_tag.get(iname) assert isinstance(tag, UniqueTag) - from pymbolic import var - if isinstance(tag, LocalIndexTag): - hw_axis_expr = var("lid")(tag.axis) + hw_axis_expr = kernel.target.get_local_axis_expr(tag.axis) elif isinstance(tag, GroupIndexTag): - hw_axis_expr = var("gid")(tag.axis) + hw_axis_expr = kernel.target.get_global_axis_expr(tag.axis) else: raise RuntimeError("unexpected hw tag type") diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index a777f458452fe012c8f22226d4a756a03b3e8403..ccc2e378f834a7d77ce9ab431be6349fb594109c 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -221,17 +221,8 @@ class GlobalArg(ArrayBase, KernelArgument): max_target_axes = 1 def get_arg_decl(self, target, name_suffix, shape, dtype, is_written): - from loopy.codegen import POD # uses the correct 
complex type - from cgen import RestrictPointer, Const - from cgen.opencl import CLGlobal - - arg_decl = RestrictPointer( - POD(target, dtype, self.name + name_suffix)) - - if not is_written: - arg_decl = Const(arg_decl) - - return CLGlobal(arg_decl) + return target.get_global_arg_decl(self.name + name_suffix, shape, + dtype, is_written) class ConstantArg(ArrayBase, KernelArgument): @@ -239,17 +230,8 @@ class ConstantArg(ArrayBase, KernelArgument): max_target_axes = 1 def get_arg_decl(self, target, name_suffix, shape, dtype, is_written): - from loopy.codegen import POD # uses the correct complex type - from cgen import RestrictPointer, Const - from cgen.opencl import CLConstant - - arg_decl = RestrictPointer( - POD(dtype, self.name + name_suffix)) - - if not is_written: - arg_decl = Const(arg_decl) - - return CLConstant(arg_decl) + return target.get_constant_arg_decl(self.name + name_suffix, shape, + dtype, is_written) class ImageArg(ArrayBase, KernelArgument): @@ -261,13 +243,8 @@ class ImageArg(ArrayBase, KernelArgument): return len(self.dim_tags) def get_arg_decl(self, target, name_suffix, shape, dtype, is_written): - if is_written: - mode = "w" - else: - mode = "r" - - from cgen.opencl import CLImage - return CLImage(self.num_target_axes(), mode, self.name+name_suffix) + return target.get_image_arg_decl(self.name + name_suffix, shape, + dtype, is_written) class ValueArg(KernelArgument): diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index d36c39044cb4448d59e5dae78f5cca13afb06d11..4c54570aa8d153c73954124a38e79980e84688bf 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -52,11 +52,7 @@ class TargetBase(object): # }}} - def preprocess(self, kernel): - return kernel - - def pre_codegen_check(self, kernel): - pass + # {{{ library def function_manglers(self): return [] @@ -67,10 +63,24 @@ class TargetBase(object): def preamble_generators(self): return [] - def get_or_register_dtype(self, names, dtype=None): - raise 
# {{{ code generation guts

# These are the per-target hooks of TargetBase that the generic code
# generator and the kernel argument classes call into. Every one of them
# must be overridden by a concrete target; the base versions only raise.

def get_global_axis_expr(self, axis):
    """Return the expression for the group index along hardware axis *axis*.

    Used when setting up hardware-parallel loops for inames carrying a
    group index tag.
    """
    raise NotImplementedError()

def get_local_axis_expr(self, axis):
    """Return the expression for the local index along hardware axis *axis*.

    Used when setting up hardware-parallel loops for inames carrying a
    local index tag.
    """
    raise NotImplementedError()

def emit_barrier(self, kind, comment):
    """
    :arg kind: ``"local"`` or ``"global"``
    :arg comment: the comment attached to the barrier in the schedule;
        may be empty.
    :return: a :class:`loopy.codegen.GeneratedInstruction`.
    """
    raise NotImplementedError()

def get_global_arg_decl(self, name, shape, dtype, is_written):
    """Return the declaration of the global array argument *name*.

    Called by :meth:`GlobalArg.get_arg_decl`.
    """
    raise NotImplementedError()

def get_constant_arg_decl(self, name, shape, dtype, is_written):
    """Return the declaration of the constant array argument *name*.

    Called by :meth:`ConstantArg.get_arg_decl`; declared here so that a
    target lacking constant-memory support fails with
    :exc:`NotImplementedError` like the other hooks, rather than with
    an :exc:`AttributeError`.
    """
    raise NotImplementedError()

def get_image_arg_decl(self, name, shape, dtype, is_written):
    """Return the declaration of the image argument *name*.

    Called by :meth:`ImageArg.get_arg_decl`.
    """
    raise NotImplementedError()

# }}}
+++ b/loopy/target/ispc/__init__.py @@ -0,0 +1,85 @@ +"""Target for Intel ISPC.""" + +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2015 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
class ISPCTarget(CTarget):
    """Target generating code for Intel ISPC.

    Derives from :class:`CTarget` and overrides the hooks for hardware
    axis expressions, barriers and global argument declarations.
    """

    # {{{ code generation guts

    def get_global_axis_expr(self, axis):
        """Return the variable ``taskIndex<axis>`` as the group index."""
        return var("taskIndex%d" % axis)

    def get_local_axis_expr(self, axis):
        """Return ``programIndex`` as the local index for axis 0.

        :raises LoopyError: for any other *axis*.
        """
        if axis == 0:
            return var("programIndex")
        else:
            raise LoopyError("ISPC only supports one local axis")

    def emit_barrier(self, kind, comment):
        """
        :arg kind: ``"local"`` or ``"global"``
        :arg comment: text describing the barrier; may be *None* or empty,
            in which case no comment text is emitted.
        :return: a :class:`loopy.codegen.GeneratedInstruction`.
        :raises LoopyError: if *kind* is neither ``"local"`` nor
            ``"global"``.
        """
        from loopy.codegen import GeneratedInstruction
        from cgen import Comment, Statement

        # Barrier comments are optional, so a missing one must neither
        # abort code generation nor show up as the text "None".
        if kind == "local":
            # A local barrier only leaves a comment in the generated code.
            text = "local barrier"
            if comment:
                text += ": %s" % comment

            return GeneratedInstruction(
                    ast=Comment(text),
                    implemented_domain=None)

        elif kind == "global":
            # No ';' in the text: Statement is handed the bare statement,
            # exactly as the OpenCL target does for its barrier call.
            text = "sync"
            if comment:
                text += " /* %s */" % comment

            return GeneratedInstruction(
                    ast=Statement(text),
                    implemented_domain=None)

        else:
            raise LoopyError("unknown barrier kind")

    def get_global_arg_decl(self, name, shape, dtype, is_written):
        """Return a uniform-pointer declaration for the array *name*.

        The declaration is wrapped in ``Const`` unless *is_written* is
        true. *shape* is accepted for interface compatibility and is not
        used.
        """
        from loopy.codegen import POD  # uses the correct complex type
        from cgen import Const
        from cgen.ispc import ISPCUniformPointer

        arg_decl = ISPCUniformPointer(POD(self, dtype, name))

        if not is_written:
            arg_decl = Const(arg_decl)

        return arg_decl

    # }}}
# {{{ code generation guts

# Code-generation hooks of OpenCLTarget (methods of the class).

def get_global_axis_expr(self, axis):
    """Return the call expression ``gid(axis)`` as the group index."""
    return var("gid")(axis)

def get_local_axis_expr(self, axis):
    """Return the call expression ``lid(axis)`` as the local index."""
    return var("lid")(axis)

def emit_barrier(self, kind, comment):
    """
    :arg kind: ``"local"`` or ``"global"``
    :arg comment: text appended to the barrier as a C comment; may be
        *None* or empty.
    :return: a :class:`loopy.codegen.GeneratedInstruction`.
    :raises LoopyError: for ``"global"`` barriers and for unknown kinds.
    """
    if kind == "local":
        # Without this normalization a missing comment would be
        # formatted into the statement as the text "None". The leading
        # space keeps the comment separated from the call.
        if comment:
            comment_suffix = " /* %s */" % comment
        else:
            comment_suffix = ""

        from loopy.codegen import GeneratedInstruction
        from cgen import Statement
        return GeneratedInstruction(
                ast=Statement(
                    "barrier(CLK_LOCAL_MEM_FENCE)%s" % comment_suffix),
                implemented_domain=None)
    elif kind == "global":
        raise LoopyError("OpenCL does not have global barriers")
    else:
        raise LoopyError("unknown barrier kind")

def get_global_arg_decl(self, name, shape, dtype, is_written):
    """Return the base-class declaration for *name* wrapped in ``CLGlobal``."""
    from cgen.opencl import CLGlobal

    return CLGlobal(super(OpenCLTarget, self).get_global_arg_decl(
        name, shape, dtype, is_written))

def get_image_arg_decl(self, name, shape, dtype, is_written):
    """Return a ``CLImage`` declaration for the image argument *name*.

    The access mode is ``"w"`` if *is_written*, otherwise ``"r"``.
    """
    mode = "w" if is_written else "r"

    from cgen.opencl import CLImage
    # The target has no num_target_axes(); that method lives on ImageArg.
    # NOTE(review): assumes *shape* has one entry per image axis --
    # confirm against ImageArg.num_target_axes().
    return CLImage(len(shape), mode, name)

def get_constant_arg_decl(self, name, shape, dtype, is_written):
    """Return a ``CLConstant`` restrict-pointer declaration for *name*.

    This is the hook :meth:`ConstantArg.get_arg_decl` calls. The pointer
    is wrapped in ``Const`` unless *is_written* is true. *shape* is
    accepted for interface compatibility and is not used.
    """
    from loopy.codegen import POD  # uses the correct complex type
    from cgen import RestrictPointer, Const
    from cgen.opencl import CLConstant

    # POD takes the target first, as in CTarget.get_global_arg_decl.
    arg_decl = RestrictPointer(POD(self, dtype, name))

    if not is_written:
        arg_decl = Const(arg_decl)

    return CLConstant(arg_decl)

# Earlier name of the constant-argument hook, kept as an alias so that
# any existing reference to it keeps working.
get_arg_decl = get_constant_arg_decl

# }}}