diff --git a/loopy/__init__.py b/loopy/__init__.py index 50660f60a82bc4bc9ecb1d5a7102ac0afb42e40c..d1e8698be23a9ea2edf733433d38a534589591f1 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -131,6 +131,8 @@ from loopy.target.cuda import CudaTarget from loopy.target.opencl import OpenCLTarget from loopy.target.pyopencl import PyOpenCLTarget from loopy.target.ispc import ISPCTarget +from loopy.target.numba import NumbaTarget, NumbaCudaTarget + __all__ = [ "TaggedVariable", "Reduction", "LinearSubscript", @@ -228,6 +230,7 @@ __all__ = [ "TargetBase", "CTarget", "CudaTarget", "OpenCLTarget", "PyOpenCLTarget", "ISPCTarget", + "NumbaTarget", "NumbaCudaTarget", "ASTBuilderBase", # {{{ from this file diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 71a1c4b3f5972b383af07ec1b07ad5ca3cba020b..e3bcea77b7083ac8e95e73dfa04a3f3cad40d5a3 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -502,23 +502,7 @@ def generate_code_v2(kernel): for prea_gen in preamble_generators: preambles.extend(prea_gen(preamble_info)) - seen_preamble_tags = set() - dedup_preambles = [] - - for tag, preamble in sorted(preambles, key=lambda tag_code: tag_code[0]): - if tag in seen_preamble_tags: - continue - - seen_preamble_tags.add(tag) - dedup_preambles.append(preamble) - - from loopy.tools import remove_common_indentation - preamble_codes = [ - remove_common_indentation(lines) + "\n" - for lines in dedup_preambles] - - codegen_result = codegen_result.copy( - device_preambles=preamble_codes) + codegen_result = codegen_result.copy(device_preambles=preambles) # }}} diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 82dbe09986b2567e9951b2446313a8ef5fde4f8c..4f8ff6117b9cc1709228ec91db5d8ea85f0fab22 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -26,6 +26,23 @@ import six from pytools import Record +def process_preambles(preambles): + seen_preamble_tags = set() + dedup_preambles = [] + + for tag, preamble in sorted(preambles, key=lambda tag_code: tag_code[0]): + if tag in seen_preamble_tags: + continue + + seen_preamble_tags.add(tag) + dedup_preambles.append(preamble) + + from loopy.tools import remove_common_indentation + return [ + remove_common_indentation(lines) + "\n" + for lines in dedup_preambles] + + # {{{ code generation result class GeneratedProgram(Record): @@ -65,6 +82,7 @@ class CodeGenerationResult(Record): .. automethod:: host_code .. automethod:: device_code + .. automethod:: all_code .. attribute:: implemented_data_info @@ -96,7 +114,7 @@ class CodeGenerationResult(Record): **kwargs) def host_code(self): - preamble_codes = getattr(self, "host_preambles", []) + preamble_codes = process_preambles(getattr(self, "host_preambles", [])) return ( "".join(preamble_codes) @@ -104,13 +122,27 @@ class CodeGenerationResult(Record): str(self.host_program.ast)) def device_code(self): - preamble_codes = getattr(self, "device_preambles", []) + preamble_codes = process_preambles(getattr(self, "device_preambles", [])) return ( "".join(preamble_codes) + "\n" + "\n\n".join(str(dp.ast) for dp in self.device_programs)) + def all_code(self): + preamble_codes = process_preambles( + getattr(self, "host_preambles", []) + + + getattr(self, "device_preambles", []) + ) + + return ( + "".join(preamble_codes) + + "\n" + + "\n\n".join(str(dp.ast) for dp in self.device_programs) + + "\n\n" + + str(self.host_program.ast)) + def current_program(self, codegen_state): if codegen_state.is_generating_device_code: if self.device_programs: diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index aa50f03cb63eaa93a2e807f90c8dd08cd3ab4f4d..51b5f5343b90ac644d9f1d222637bf428d833d6e 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -732,7 +732,7 @@ class ExpressionToCMapper(RecursiveMapper): raise LoopyError("plain C does not have group hw axes") def map_local_hw_index(self, expr, enclosing_prec, type_context): - raise LoopyError("plain C does not have group hw axes") + raise LoopyError("plain C does not have local hw axes") # }}} diff --git a/loopy/target/numba.py b/loopy/target/numba.py new file mode 100644 index 0000000000000000000000000000000000000000..95c1de08c9ef90bda6438d613e45e0515508573d --- /dev/null +++ b/loopy/target/numba.py @@ -0,0 +1,220 @@ +"""Python host AST builder for integration with PyOpenCL.""" + +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2016 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +from pytools import memoize_method + +from loopy.target.python import ExpressionToPythonMapper, PythonASTBuilderBase +from loopy.target import TargetBase, DummyHostASTBuilder + +from loopy.diagnostic import LoopyWarning + + +# {{{ base numba + +def _base_numba_preamble_generator(preamble_info): + yield ("06_numba_imports", """ + import numba as _lpy_numba + """) + + +class NumbaBaseASTBuilder(PythonASTBuilderBase): + def preamble_generators(self): + return ( + super(NumbaBaseASTBuilder, self).preamble_generators() + [ + _base_numba_preamble_generator + ]) + + def get_function_definition(self, codegen_state, codegen_result, + schedule_index, + function_decl, function_body): + + assert function_decl is None + + from genpy import Function + return Function( + codegen_result.current_program(codegen_state).name, + [idi.name for idi in codegen_state.implemented_data_info], + function_body, + decorators=self.get_python_function_decorators()) + + def get_python_function_decorators(self): + return () + + def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): + from pymbolic.mapper.stringifier import PREC_NONE + from genpy import Statement + + ecm = self.get_expression_to_code_mapper(codegen_state) + implemented_data_info = codegen_state.implemented_data_info + + return Statement( + "%s[%s, %s](%s)" % ( + name, + ecm(gsize, PREC_NONE), + ecm(lsize, PREC_NONE), + ", ".join(idi.name for idi in implemented_data_info) + )) + + +class NumbaJITASTBuilder(NumbaBaseASTBuilder): + def get_python_function_decorators(self): + return ("@_lpy_numba.jit",) + + +class NumbaTarget(TargetBase): + """A target for plain Python as understood by Numba, without any parallel extensions. + """ + + def __init__(self): + from warnings import warn + warn("The Numba targets are not yet feature-complete", + LoopyWarning, stacklevel=2) + + def split_kernel_at_global_barriers(self): + return False + + def get_host_ast_builder(self): + return DummyHostASTBuilder(self) + + def get_device_ast_builder(self): + return NumbaJITASTBuilder(self) + + # {{{ types + + @memoize_method + def get_dtype_registry(self): + from loopy.target.c import DTypeRegistryWrapper + from loopy.target.c.compyte.dtypes import ( + DTypeRegistry, fill_registry_with_c_types) + result = DTypeRegistry() + fill_registry_with_c_types(result, respect_windows=False, + include_bool=True) + return DTypeRegistryWrapper(result) + + def is_vector_dtype(self, dtype): + return False + + def get_vector_dtype(self, base, count): + raise KeyError() + + def get_or_register_dtype(self, names, dtype=None): + # These kind of shouldn't be here. + return self.get_dtype_registry().get_or_register_dtype(names, dtype) + + def dtype_to_typename(self, dtype): + # These kind of shouldn't be here. + return self.get_dtype_registry().dtype_to_ctype(dtype) + + # }}} + +# }}} + + +# {{{ numba.cuda + +class NumbaCudaExpressionToPythonMapper(ExpressionToPythonMapper): + _GRID_AXES = "xyz" + + def map_group_hw_index(self, expr, enclosing_prec): + return "_lpy_ncu.blockIdx.%s" % self._GRID_AXES[expr.axis] + + def map_local_hw_index(self, expr, enclosing_prec): + return "_lpy_ncu.threadIdx.%s" % self._GRID_AXES[expr.axis] + + +def _cuda_numba_preamble_generator(preamble_info): + yield ("06_import_numba_cuda", """ + import numba.cuda as _lpy_ncu + """) + + +class NumbaCudaASTBuilder(NumbaBaseASTBuilder): + def preamble_generators(self): + return ( + super(NumbaCudaASTBuilder, self).preamble_generators() + [ + _cuda_numba_preamble_generator + ]) + + def get_python_function_decorators(self): + return ("@_lpy_ncu.jit",) + + def get_expression_to_code_mapper(self, codegen_state): + return NumbaCudaExpressionToPythonMapper(codegen_state) + + +class NumbaCudaTarget(TargetBase): + """A target for plain Python, without any parallel extensions. + """ + + host_program_name_suffix = "" + device_program_name_suffix = "_inner" + + def __init__(self): + from warnings import warn + warn("The Numba target is not yet feature-complete", + LoopyWarning, stacklevel=2) + + def split_kernel_at_global_barriers(self): + return True + + def get_host_ast_builder(self): + return NumbaBaseASTBuilder(self) + + def get_device_ast_builder(self): + return NumbaCudaASTBuilder(self) + + # {{{ types + + @memoize_method + def get_dtype_registry(self): + from loopy.target.c import DTypeRegistryWrapper + from loopy.target.c.compyte.dtypes import ( + DTypeRegistry, fill_registry_with_c_types) + result = DTypeRegistry() + fill_registry_with_c_types(result, respect_windows=False, + include_bool=True) + return DTypeRegistryWrapper(result) + + def is_vector_dtype(self, dtype): + return False + + def get_vector_dtype(self, base, count): + raise KeyError() + + def get_or_register_dtype(self, names, dtype=None): + # These kind of shouldn't be here. + return self.get_dtype_registry().get_or_register_dtype(names, dtype) + + def dtype_to_typename(self, dtype): + # These kind of shouldn't be here. + return self.get_dtype_registry().dtype_to_ctype(dtype) + + # }}} + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/target/python.py b/loopy/target/python.py index 83e8df12459dc0b79d0789341dfc1213c008e084..f2a529c34bc4c56f69db1693058308769919e27a 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -24,11 +24,14 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ +import six +import numpy as np +from pymbolic.mapper import Mapper from pymbolic.mapper.stringifier import StringifyMapper from loopy.expression import TypeInferenceMapper from loopy.kernel.data import ValueArg -from loopy.diagnostic import LoopyError +from loopy.diagnostic import LoopyError # noqa from loopy.target import ASTBuilderBase @@ -43,6 +46,9 @@ class ExpressionToPythonMapper(StringifyMapper): type_inf_mapper = TypeInferenceMapper(self.kernel) self.type_inf_mapper = type_inf_mapper + def handle_unsupported_expression(self, victim, enclosing_prec): + return Mapper.handle_unsupported_expression(self, victim, enclosing_prec) + def rec(self, expr, prec, type_context=None, needed_dtype=None): return super(ExpressionToPythonMapper, self).rec(expr, prec) @@ -52,6 +58,12 @@ class ExpressionToPythonMapper(StringifyMapper): return repr(expr) def map_variable(self, expr, enclosing_prec): + if expr.name in self.codegen_state.var_subst_map: + # Unimplemented: annotate_inames + return str(self.rec( + self.codegen_state.var_subst_map[expr.name], + enclosing_prec)) + if expr.name in self.kernel.all_inames(): return super(ExpressionToPythonMapper, self).map_variable( expr, enclosing_prec) @@ -61,24 +73,147 @@ class ExpressionToPythonMapper(StringifyMapper): return super(ExpressionToPythonMapper, self).map_variable( expr, enclosing_prec) - raise LoopyError("may not refer to %s '%s' in host code" - % (type(var_descr).__name__, expr.name)) + return super(ExpressionToPythonMapper, self).map_variable( + expr, enclosing_prec) def map_subscript(self, expr, enclosing_prec): - raise LoopyError("may not subscript '%s' in host code" - % expr.aggregate.name) + return super(ExpressionToPythonMapper, self).map_subscript( + expr, enclosing_prec) + + def map_call(self, expr, enclosing_prec): + from pymbolic.primitives import Variable + from pymbolic.mapper.stringifier import PREC_NONE + + identifier = expr.function + + if identifier.name in ["indexof", "indexof_vec"]: + raise LoopyError( + "indexof, indexof_vec not yet supported in Python") + + if isinstance(identifier, Variable): + identifier = identifier.name + + par_dtypes = tuple(self.type_inf_mapper(par) for par in expr.parameters) + + str_parameters = None + + mangle_result = self.kernel.mangle_function( + identifier, par_dtypes, + ast_builder=self.codegen_state.ast_builder) + + if mangle_result is None: + raise RuntimeError("function '%s' unknown--" + "maybe you need to register a function mangler?" + % identifier) + + if len(mangle_result.result_dtypes) != 1: + raise LoopyError("functions with more or fewer than one return value " + "may not be used in an expression") + + str_parameters = [ + self.rec(par, PREC_NONE) + for par, par_dtype, tgt_dtype in zip( + expr.parameters, par_dtypes, mangle_result.arg_dtypes)] + + from loopy.codegen import SeenFunction + self.codegen_state.seen_functions.add( + SeenFunction(identifier, + mangle_result.target_name, + mangle_result.arg_dtypes or par_dtypes)) + + return "%s(%s)" % (mangle_result.target_name, ", ".join(str_parameters)) + + def map_group_hw_index(self, expr, enclosing_prec): + raise LoopyError("plain Python does not have group hw axes") + + def map_local_hw_index(self, expr, enclosing_prec): + raise LoopyError("plain Python does not have local hw axes") # }}} # {{{ ast builder +def _numpy_single_arg_function_mangler(kernel, name, arg_dtypes): + if (not isinstance(name, str) + or not hasattr(np, name) + or len(arg_dtypes) != 1): + return None + + arg_dtype, = arg_dtypes + + from loopy.kernel.data import CallMangleInfo + return CallMangleInfo( + target_name="_lpy_np."+name, + result_dtypes=(arg_dtype,), + arg_dtypes=arg_dtypes) + + +def _base_python_preamble_generator(preamble_info): + yield ("00_future", "from __future__ import division, print_function\n") + yield ("05_numpy_import", """ + import numpy as _lpy_np + """) + + class PythonASTBuilderBase(ASTBuilderBase): """A Python host AST builder for integration with PyOpenCL. """ # {{{ code generation guts + def function_manglers(self): + return ( + super(PythonASTBuilderBase, self).function_manglers() + [ + _numpy_single_arg_function_mangler, + ]) + + def preamble_generators(self): + return ( + super(PythonASTBuilderBase, self).preamble_generators() + [ + _base_python_preamble_generator + ]) + + def get_function_declaration(self, codegen_state, codegen_result, + schedule_index): + return None + + def get_function_definition(self, codegen_state, codegen_result, + schedule_index, + function_decl, function_body): + + assert function_decl is None + + from genpy import Function + return Function( + codegen_result.current_program(codegen_state).name, + [idi.name for idi in codegen_state.implemented_data_info], + function_body) + + def get_temporary_decls(self, codegen_state, schedule_index): + kernel = codegen_state.kernel + ecm = codegen_state.expression_to_code_mapper + + result = [] + + from pymbolic.mapper.stringifier import PREC_NONE + from genpy import Assign + + for tv in sorted( + six.itervalues(kernel.temporary_variables), + key=lambda tv: tv.name): + if tv.shape: + result.append( + Assign( + tv.name, + "_lpy_np.empty(%s, dtype=%s)" + % ( + ecm(tv.shape, PREC_NONE, "i"), + "_lpy_np."+tv.dtype.numpy_dtype.name + ))) + + return result + def get_expression_to_code_mapper(self, codegen_state): return ExpressionToPythonMapper(codegen_state) @@ -121,6 +256,10 @@ class PythonASTBuilderBase(ASTBuilderBase): from genpy import If return If(condition_str, ast) + def emit_assignment(self, codegen_state, lhs, rhs): + from genpy import Assign + return Assign(lhs, rhs) + # }}} # }}} diff --git a/test/test_target.py b/test/test_target.py index 67204d9c1fbc02624eebf230b87d33c114c21ef1..e8548ccca6732f2fac6ea84d98d5b06f1d472c6c 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -193,6 +193,37 @@ def test_clamp(ctx_factory): evt, (out,) = knl(queue, x=x, a=np.float32(12), b=np.float32(15)) +def test_numba_target(): + knl = lp.make_kernel( + "{[i,j,k]: 0<=i,j<M and 0<=k<N}", + "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))", + target=lp.NumbaTarget()) + + knl = lp.add_and_infer_dtypes(knl, {"X": np.float32}) + + print(lp.generate_code_v2(knl).device_code()) + + +def test_numba_cuda_target(): + knl = lp.make_kernel( + "{[i,j,k]: 0<=i,j<M and 0<=k<N}", + "D[i,j] = sqrt(sum(k, (X[i, k]-X[j, k])**2))", + target=lp.NumbaCudaTarget()) + + knl = lp.assume(knl, "M>0") + knl = lp.split_iname(knl, "i", 16, outer_tag='g.0') + knl = lp.split_iname(knl, "j", 128, inner_tag='l.0', slabs=(0, 1)) + knl = lp.add_prefetch(knl, "X[i,:]") + knl = lp.fix_parameters(knl, N=3) + knl = lp.set_loop_priority(knl, "i_inner,j_outer") + knl = lp.tag_inames(knl, "k:unr") + knl = lp.tag_array_axes(knl, "X", "N0,N1") + + knl = lp.add_and_infer_dtypes(knl, {"X": np.float32}) + + print(lp.generate_code_v2(knl).all_code()) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])