diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 225f7e7fec30d38159a93df6ec4df35a45c21bc6..2e136d7bb5264238f024775b4b9b9d31479bb914 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -499,7 +499,6 @@ def generate_code(kernel, device=None): from loopy.kernel.data import ValueArg from loopy.kernel.array import ArrayBase - from cgen import Const impl_arg_info = [] @@ -516,7 +515,7 @@ def generate_code(kernel, device=None): target=kernel.target, name=arg.name, dtype=arg.dtype, - cgen_declarator=Const(POD(kernel.target, arg.dtype, arg.name)), + cgen_declarator=arg.get_arg_decl(kernel.target), arg_class=ValueArg)) else: diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index eb5c00d299eff02aad714e324fca92d8d3bdbffe..74292ce794ace23de486856d08d39304d820d822 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -423,7 +423,7 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): from cgen import Comment result.append(Comment(cmt)) - from cgen import Initializer, POD, Const, Line, For + from cgen import Initializer, POD, Const, Line from loopy.symbolic import aff_to_expr if (static_ubound - static_lbound).plain_is_zero(): @@ -436,16 +436,10 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): ])) else: - from loopy.codegen import wrap_in - - result.append(wrap_in(For, - "%s %s = %s" - % (kernel.target.dtype_to_typename(kernel.index_dtype), - loop_iname, ecm(aff_to_expr(static_lbound), PREC_NONE, "i")), - "%s <= %s" % ( - loop_iname, ecm(aff_to_expr(static_ubound), PREC_NONE, "i")), - "++%s" % loop_iname, - inner)) + result.append( + kernel.target.emit_sequential_loop( + codegen_state, loop_iname, kernel.index_dtype, + static_lbound, static_ubound, inner)) return gen_code_block(result) diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index ccc2e378f834a7d77ce9ab431be6349fb594109c..c95ca0e9144e54c704e8aaa1444aba818bdbe27b 100644 --- 
a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -278,6 +278,10 @@ class ValueArg(KernelArgument): key_builder.rec(key_hash, self.name) key_builder.rec(key_hash, self.dtype) + def get_arg_decl(self, target): + return target.get_value_arg_decl(self.name, (), + self.dtype, False) + # }}} diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 4c54570aa8d153c73954124a38e79980e84688bf..17534bf6dee1c18fbc831c4a7904c635491b3757 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -97,12 +97,18 @@ class TargetBase(object): # {{{ code generation guts + def get_expression_to_code_mapper(self, codegen_state): + raise NotImplementedError() + def get_global_axis_expr(self, axis): raise NotImplementedError() def get_local_axis_expr(self, axis): raise NotImplementedError() + def add_vector_access(self, access_str, index): + raise NotImplementedError() + def emit_barrier(self, kind, comment): """ :arg kind: ``"local"`` or ``"global"`` diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index e98c2af0055ac6c9535765c52bf57fdc951084c5..0cf17992251778ee971298e736bd1080f6f36f13 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -32,7 +32,32 @@ from loopy.target import TargetBase from pytools import memoize_method +# {{{ preamble generator + +def _preamble_generator(kernel, seen_dtypes, seen_functions): + c_funcs = set(func.c_name for func in seen_functions) + if "int_floor_div" in c_funcs: + yield ("05_int_floor_div", """ + #define int_floor_div(a,b) \ + (( (a) - \ + ( ( (a)<0 ) != ( (b)<0 )) \ + *( (b) + ( (b)<0 ) - ( (b)>=0 ) )) \ + / (b) ) + """) + + if "int_floor_div_pos_b" in c_funcs: + yield ("05_int_floor_div_pos_b", """ + #define int_floor_div_pos_b(a,b) ( \ + ( (a) - ( ((a)<0) ? 
((b)-1) : 0 ) ) / (b) \ + ) + """) + +# }}} + + class CTarget(TargetBase): + # {{{ types + @memoize_method def get_dtype_registry(self): from loopy.target.c.compyte.dtypes import ( @@ -49,14 +74,24 @@ class CTarget(TargetBase): raise KeyError() def get_or_register_dtype(self, names, dtype=None): + # These kind of shouldn't be here. return self.get_dtype_registry().get_or_register_dtype(names, dtype) def dtype_to_typename(self, dtype): + # These kind of shouldn't be here. return self.get_dtype_registry().dtype_to_ctype(dtype) - def get_expression_to_code_mapper(self, codegen_state): - from loopy.target.c.codegen.expression import LoopyCCodeMapper - return LoopyCCodeMapper(codegen_state) + # }}} + + # {{{ library + + def preamble_generators(self): + return ( + super(CTarget, self).preamble_generators() + [ + _preamble_generator, + ]) + + # }}} # {{{ code generation @@ -95,7 +130,6 @@ class CTarget(TargetBase): from cgen import ArrayOf, Pointer, Initializer, AlignedAttribute from loopy.codegen import POD # uses the correct complex type - from cgen.opencl import CLLocal class ConstRestrictPointer(Pointer): def get_decl_pair(self): @@ -115,10 +149,8 @@ class CTarget(TargetBase): temp_var_decl = ArrayOf(temp_var_decl, " * ".join(str(s) for s in idi.shape)) - if tv.is_local: - temp_var_decl = CLLocal(temp_var_decl) - - temp_decls.append(temp_var_decl) + temp_decls.append( + self.wrap_temporary_decl(temp_var_decl, tv.is_local)) else: offset = 0 @@ -141,9 +173,9 @@ class CTarget(TargetBase): cast_decl = POD(self, idi.dtype, "") temp_var_decl = POD(self, idi.dtype, idi.name) - if tv.is_local: - cast_decl = CLLocal(cast_decl) - temp_var_decl = CLLocal(temp_var_decl) + cast_decl = self.wrap_temporary_decl(cast_decl, tv.is_local) + temp_var_decl = self.wrap_temporary_decl( + temp_var_decl, tv.is_local) # The 'restrict' part of this is a complete lie--of course # all these temporaries are aliased. 
But we're promising to @@ -170,9 +202,8 @@ class CTarget(TargetBase): for bs_name, bs_sizes in sorted(six.iteritems(base_storage_sizes)): bs_var_decl = POD(self, np.int8, bs_name) - if base_storage_to_is_local[bs_name]: - bs_var_decl = CLLocal(bs_var_decl) - + bs_var_decl = self.wrap_temporary_decl( + bs_var_decl, base_storage_to_is_local[bs_name]) bs_var_decl = ArrayOf(bs_var_decl, max(bs_sizes)) alignment = max(base_storage_to_align_bytes[bs_name]) @@ -197,18 +228,57 @@ class CTarget(TargetBase): return body, gen_code.implemented_domains + # }}} + + # {{{ code generation guts + + def get_expression_to_code_mapper(self, codegen_state): + from loopy.target.c.codegen.expression import LoopyCCodeMapper + return LoopyCCodeMapper(codegen_state) + + def wrap_temporary_decl(self, decl, is_local): + return decl + + def get_value_arg_decl(self, name, shape, dtype, is_written): + assert shape == () + + from loopy.codegen import POD # uses the correct complex type + result = POD(self, dtype, name) + if not is_written: + from cgen import Const + result = Const(result) + return result + def get_global_arg_decl(self, name, shape, dtype, is_written): from loopy.codegen import POD # uses the correct complex type from cgen import RestrictPointer, Const - arg_decl = RestrictPointer( - POD(self, dtype, name)) + arg_decl = RestrictPointer(POD(self, dtype, name)) if not is_written: arg_decl = Const(arg_decl) return arg_decl + def emit_sequential_loop(self, codegen_state, iname, iname_dtype, + static_lbound, static_ubound, inner): + ecm = codegen_state.expression_to_code_mapper + + from loopy.symbolic import aff_to_expr + + from loopy.codegen import wrap_in + from pymbolic.mapper.stringifier import PREC_NONE + from cgen import For + + return wrap_in(For, + "%s %s = %s" + % (self.dtype_to_typename(iname_dtype), + iname, ecm(aff_to_expr(static_lbound), PREC_NONE, "i")), + "%s <= %s" % ( + iname, ecm(aff_to_expr(static_ubound), PREC_NONE, "i")), + "++%s" % iname, + inner) + # }}} # vim: 
foldmethod=marker diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 112e7e5b95b3354092a6ced076a9a03140c87105..97bec6e59764426b882be2a6ce0625fc5945f179 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -39,14 +39,6 @@ from loopy.diagnostic import LoopyError from loopy.tools import is_integer -def get_opencl_vec_member(idx): - if idx is None: - return idx - - # The 'int' avoids an 'L' suffix for long ints. - return "s%s" % hex(int(idx))[2:] - - # {{{ C code mapper class LoopyCCodeMapper(RecursiveMapper): @@ -176,8 +168,6 @@ class LoopyCCodeMapper(RecursiveMapper): lambda expr: evaluate(expr, self.codegen_state.var_subst_map), self.codegen_state.vectorization_info) - vec_member = get_opencl_vec_member(access_info.vector_index) - from loopy.kernel.data import ImageArg, GlobalArg, TemporaryVariable if isinstance(ary, ImageArg): @@ -200,21 +190,11 @@ class LoopyCCodeMapper(RecursiveMapper): if len(access_info.subscripts) == 0: if isinstance(ary, GlobalArg): # unsubscripted global args are pointers - if vec_member is not None: - return "%s->%s" % ( - access_info.array_name, - vec_member) - else: - return "*" + access_info.array_name + result = "*" + access_info.array_name else: # unsubscripted temp vars are scalars - if vec_member is not None: - return "%s.%s" % ( - access_info.array_name, - vec_member) - else: - return access_info.array_name + result = access_info.array_name else: subscript, = access_info.subscripts @@ -224,9 +204,10 @@ class LoopyCCodeMapper(RecursiveMapper): self.rec(subscript, PREC_NONE, 'i')), enclosing_prec, PREC_CALL) - if vec_member: - result += "."+vec_member - + if access_info.vector_index is not None: + return self.kernel.target.add_vector_access( + result, access_info.vector_index) + else: return result else: diff --git a/loopy/target/ispc/__init__.py b/loopy/target/ispc/__init__.py index 
ffd07db13ff405c2650db47c5a46026d847fe812..e0c4b75a3b5aacac8848a3669b475a8197a1168a 100644 --- a/loopy/target/ispc/__init__.py +++ b/loopy/target/ispc/__init__.py @@ -33,6 +33,74 @@ from pymbolic import var class ISPCTarget(CTarget): + # {{{ top-level codegen + + def generate_code(self, kernel, codegen_state, impl_arg_info): + from cgen import (FunctionBody, FunctionDeclaration, Value, Module, + Block, Line, Statement as S) + from cgen.ispc import ISPCExport, ISPCTask + + knl_body, implemented_domains = kernel.target.generate_body( + kernel, codegen_state) + + inner_name = "lp_ispc_inner_"+kernel.name + arg_decls = [iai.cgen_declarator for iai in impl_arg_info] + knl_fbody = FunctionBody( + ISPCTask( + FunctionDeclaration( + Value("void", inner_name), + arg_decls)), + knl_body) + + # {{{ generate wrapper + + wrapper_body = Block() + + gsize, lsize = kernel.get_grid_sizes_as_exprs() + if len(lsize) > 1: + for i, ls_i in enumerate(lsize[1:]): + if ls_i != 1: + raise LoopyError("local axis %d (0-based) " + "has length > 1, which is unsupported " + "by ISPC" % (i + 1)) # i is relative to lsize[1:] + + from pymbolic.mapper.stringifier import PREC_COMPARISON, PREC_NONE + ccm = self.get_expression_to_code_mapper(codegen_state) + + wrapper_body.extend([ + S("assert(programCount == %s)" + % ccm(lsize[0], PREC_COMPARISON)), + S("launch[%s] %s(%s)" + % ( + ", ".join( + ccm(gs_i, PREC_NONE) + for gs_i in gsize), + inner_name, + ", ".join(iai.name for iai in impl_arg_info) + )) + ]) + + wrapper_fbody = FunctionBody( + ISPCExport( + FunctionDeclaration( + Value("void", kernel.name), + [iai.cgen_declarator for iai in impl_arg_info])), + wrapper_body) + + # }}} + + mod = Module([ + knl_fbody, + Line(), + wrapper_fbody, + ]) + + return str(mod), implemented_domains + + # }}} + + # {{{ code generation guts + def get_global_axis_expr(self, axis): return var("taskIndex%d" % axis) @@ -42,6 +110,9 @@ class ISPCTarget(CTarget): else: raise LoopyError("ISPC only supports one local axis") + def add_vector_access(self,
access_str, index): + return "(%s)[%d]" % (access_str, index) + def emit_barrier(self, kind, comment): from loopy.codegen import GeneratedInstruction from cgen import Comment, Statement @@ -61,25 +132,37 @@ class ISPCTarget(CTarget): else: raise LoopyError("unknown barrier kind") + def wrap_temporary_decl(self, decl, is_local): + from cgen.ispc import ISPCUniform, ISPCVarying + if is_local: + return ISPCUniform(decl) + else: + return ISPCVarying(decl) + def get_global_arg_decl(self, name, shape, dtype, is_written): from loopy.codegen import POD # uses the correct complex type from cgen import Const - from cgen.ispc import ISPCUniformPointer + from cgen.ispc import ISPCUniformPointer, ISPCUniform arg_decl = ISPCUniformPointer(POD(self, dtype, name)) if not is_written: arg_decl = Const(arg_decl) + arg_decl = ISPCUniform(arg_decl) + return arg_decl + def get_value_arg_decl(self, name, shape, dtype, is_written): + result = super(ISPCTarget, self).get_value_arg_decl( + name, shape, dtype, is_written) + + from cgen.ispc import ISPCUniform + return ISPCUniform(result) + # }}} -# TODO: Fix argument wrapping (value, -# TODO: Fix local variable wrapping -# TODO: Fix local variable alloc -# TODO: Top-level foreach # TODO: Generate launch code -# TODO: Vector types +# TODO: Vector types (element access: done) # vim: foldmethod=marker diff --git a/loopy/target/opencl/__init__.py b/loopy/target/opencl/__init__.py index 4a39e52453ab24beb5f627127435c0124d435803..4a9b0f31d9a8ffb57896fab773c474d6a5bca1bd 100644 --- a/loopy/target/opencl/__init__.py +++ b/loopy/target/opencl/__init__.py @@ -172,23 +172,6 @@ def opencl_preamble_generator(kernel, seen_dtypes, seen_functions): #endif """) - c_funcs = set(func.c_name for func in seen_functions) - if "int_floor_div" in c_funcs: - yield ("05_int_floor_div", """ - #define int_floor_div(a,b) \ - (( (a) - \ - ( ( (a)<0 ) != ( (b)<0 )) \ - *( (b) + ( (b)<0 ) - ( (b)>=0 ) )) \ - / (b) ) - """) - - if "int_floor_div_pos_b" in c_funcs: - yield 
("05_int_floor_div_pos_b", """ - #define int_floor_div_pos_b(a,b) ( \ - ( (a) - ( ((a)<0) ? ((b)-1) : 0 ) ) / (b) \ - ) - """) - # }}} @@ -290,6 +273,10 @@ class OpenCLTarget(CTarget): def get_local_axis_expr(self, axis): return var("lid")(axis) + def add_vector_access(self, access_str, index): + # The 'int' avoids an 'L' suffix for long ints. + return "(%s).s%s" % (access_str, hex(int(index))[2:]) + def emit_barrier(self, kind, comment): """ :arg kind: ``"local"`` or ``"global"`` @@ -309,6 +296,13 @@ class OpenCLTarget(CTarget): else: raise LoopyError("unknown barrier kind") + def wrap_temporary_decl(self, decl, is_local): + if is_local: + from cgen.opencl import CLLocal + return CLLocal(decl) + else: + return decl + def get_global_arg_decl(self, name, shape, dtype, is_written): from cgen.opencl import CLGlobal @@ -324,7 +318,7 @@ class OpenCLTarget(CTarget): from cgen.opencl import CLImage return CLImage(self.num_target_axes(), mode, name) - def get_arg_decl(self, name, shape, dtype, is_written): + def get_constant_arg_decl(self, name, shape, dtype, is_written): from loopy.codegen import POD # uses the correct complex type from cgen import RestrictPointer, Const from cgen.opencl import CLConstant diff --git a/test/test_loopy.py b/test/test_loopy.py index 81c0dd5535cc2d6a4337cddc5ba2c8b6a394ebc3..3e14ed2d88c2671e00421314b146466f639c8ea7 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2320,14 +2320,12 @@ def test_ispc_backend(): "{ [i]: 0<=i<n }", "out[i] = 2*a[i]", [ - # Tests that comma'd arguments interoperate with - # argument guessing. lp.GlobalArg("out,a", np.float32, shape=lp.auto), "..." ], target=ISPCTarget()) - knl = lp.split_iname(knl, "i", 128, inner_tag="l.0") + knl = lp.split_iname(knl, "i", 8, inner_tag="l.0") knl = lp.split_iname(knl, "i_outer", 4, outer_tag="g.0", inner_tag="ilp") knl = lp.add_prefetch(knl, "a", ["i_inner", "i_outer_inner"])