From 67669015c5d585c2c018e97f8f27c2187dde3ecb Mon Sep 17 00:00:00 2001 From: arghdos Date: Mon, 1 May 2017 12:11:53 -0400 Subject: [PATCH 01/65] first attempt at subclassed executors / wrappers --- loopy/execution.py | 707 ++++++++++++++++++++++++++- loopy/target/c/__init__.py | 10 +- loopy/target/c/c_execution.py | 265 +++++++++++ loopy/target/pyopencl_execution.py | 734 ++++++----------------------- 4 files changed, 1121 insertions(+), 595 deletions(-) create mode 100644 loopy/target/c/c_execution.py diff --git a/loopy/execution.py b/loopy/execution.py index 5680fdbfe..65968e663 100644 --- a/loopy/execution.py +++ b/loopy/execution.py @@ -27,6 +27,8 @@ import six import numpy as np from pytools import ImmutableRecord, memoize_method from loopy.diagnostic import LoopyError +from pytools.py_codegen import ( + Indentation, PythonFunctionGenerator) # {{{ object array argument packing @@ -111,7 +113,613 @@ class SeparateArrayPackingController(object): # }}} -# {{{ KernelExecutorBase +# {{{ ExecutionWrapperGeneratorBase + +class ExecutionWrapperGeneratorBase(object): + """ + A set of common methods for generating a wrapper + for execution of C-based languages + + """ + + def __init__(self, system_args=["_lpy_c_kernels"]): + self.system_args = system_args[:] + + def python_dtype_str(self, dtype): + if dtype.isbuiltin: + return "_lpy_np."+dtype.name + raise Exception('dtype: {} not recognized'.format(dtype)) + + # {{{ invoker generation + + # /!\ This code runs in a namespace controlled by the user. + # Prefix all auxiliary variables with "_lpy". + + # {{{ integer arg finding from shapes + + def generate_integer_arg_finding_from_shapes( + self, gen, kernel, implemented_data_info): + # a mapping from integer argument names to a list of tuples + # (arg_name, expression), where expression is a + # unary function of kernel.arg_dict[arg_name] + # returning the desired integer argument. + iarg_to_sources = {} + + from loopy.kernel.data import GlobalArg + from loopy.symbolic import DependencyMapper, StringifyMapper + from loopy.diagnostic import ParameterFinderWarning + dep_map = DependencyMapper() + + from pymbolic import var + for arg in implemented_data_info: + if arg.arg_class is GlobalArg: + sym_shape = var(arg.name).attr("shape") + for axis_nr, shape_i in enumerate(arg.shape): + if shape_i is None: + continue + + deps = dep_map(shape_i) + + if len(deps) == 1: + integer_arg_var, = deps + + if kernel.arg_dict[integer_arg_var.name].dtype.is_integral(): + from pymbolic.algorithm import solve_affine_equations_for + try: + # friggin' overkill :) + iarg_expr = solve_affine_equations_for( + [integer_arg_var.name], + [(shape_i, sym_shape.index(axis_nr))] + )[integer_arg_var] + except Exception as e: + #from traceback import print_exc + #print_exc() + + # went wrong? oh well + from warnings import warn + warn("Unable to generate code to automatically " + "find '%s' from the shape of '%s':\n%s" + % (integer_arg_var.name, arg.name, str(e)), + ParameterFinderWarning) + else: + iarg_to_sources.setdefault(integer_arg_var.name, [])\ + .append((arg.name, iarg_expr)) + + gen("# {{{ find integer arguments from shapes") + gen("") + + for iarg_name, sources in six.iteritems(iarg_to_sources): + gen("if %s is None:" % iarg_name) + with Indentation(gen): + if_stmt = "if" + for arg_name, value_expr in sources: + gen("%s %s is not None:" % (if_stmt, arg_name)) + with Indentation(gen): + gen("%s = %s" + % (iarg_name, StringifyMapper()(value_expr))) + + if_stmt = "elif" + + gen("") + + gen("# }}}") + gen("") + + # }}} + + # {{{ integer arg finding from offsets + + def generate_integer_arg_finding_from_offsets(self, gen, kernel, + implemented_data_info): + options = kernel.options + + gen("# {{{ find integer arguments from offsets") + gen("") + + for arg in implemented_data_info: + impl_array_name = arg.offset_for_name + if impl_array_name is not None: + gen("if %s is None:" % arg.name) + with Indentation(gen): + gen("if %s is None:" % impl_array_name) + with Indentation(gen): + gen("# Output variable, we'll be allocating " + "it, with zero offset.") + gen("%s = 0" % arg.name) + gen("else:") + with Indentation(gen): + if not options.no_numpy: + gen("_lpy_offset = getattr(%s, \"offset\", 0)" + % impl_array_name) + else: + gen("_lpy_offset = %s.offset" % impl_array_name) + + base_arg = kernel.impl_arg_to_arg[impl_array_name] + + if not options.skip_arg_checks: + gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)" + % (arg.name, base_arg.dtype.itemsize)) + + gen("assert _lpy_remdr == 0, \"Offset of array '%s' is " + "not divisible by its dtype itemsize\"" + % impl_array_name) + gen("del _lpy_remdr") + else: + gen("%s = _lpy_offset // %d" + % (arg.name, base_arg.dtype.itemsize)) + + if not options.skip_arg_checks: + gen("del _lpy_offset") + + gen("# }}}") + gen("") + + # }}} + + # {{{ integer arg finding from strides + + def generate_integer_arg_finding_from_strides( + self, gen, kernel, implemented_data_info): + options = kernel.options + + gen("# {{{ find integer arguments from strides") + gen("") + + for arg in implemented_data_info: + if arg.stride_for_name_and_axis is not None: + impl_array_name, stride_impl_axis = arg.stride_for_name_and_axis + + gen("if %s is None:" % arg.name) + with Indentation(gen): + if not options.skip_arg_checks: + gen("if %s is None:" % impl_array_name) + with Indentation(gen): + gen("raise RuntimeError(\"required stride '%s' for " + "argument '%s' not given or deducible from " + "passed array\")" + % (arg.name, impl_array_name)) + + base_arg = kernel.impl_arg_to_arg[impl_array_name] + + if not options.skip_arg_checks: + gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)" + % (arg.name, impl_array_name, stride_impl_axis, + base_arg.dtype.dtype.itemsize)) + + gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' " + " is not divisible by its dtype itemsize\"" + % (stride_impl_axis, impl_array_name)) + gen("del _lpy_remdr") + else: + gen("%s = _lpy_offset // %d" + % (arg.name, base_arg.dtype.itemsize)) + + gen("# }}}") + gen("") + + # }}} + + # {{{ check that value args are present + + def generate_value_arg_check( + self, gen, kernel, implemented_data_info): + if kernel.options.skip_arg_checks: + return + + from loopy.kernel.data import ValueArg + + gen("# {{{ check that value args are present") + gen("") + + for arg in implemented_data_info: + if not issubclass(arg.arg_class, ValueArg): + continue + + gen("if %s is None:" % arg.name) + with Indentation(gen): + gen("raise TypeError(\"value argument '%s' " + "was not given and could not be automatically " + "determined\")" % arg.name) + + gen("# }}}") + gen("") + + # }}} + + # {{{ handle non numpy arguements + + def handle_non_numpy_arg(self, gen, arg): + raise Exception('Non-numpy args are not allowed for C-execution') + + # }}} + + # {{{ handle allocation of unspecified arguements + + def handle_alloc(self, gen, arg, kernel_arg, strify, skip_arg_checks): + """ + Handle allocation of non-specified arguements for C-execution + """ + from pymbolic import var + + num_axes = len(arg.unvec_shape) + for i in range(num_axes): + gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i]))) + + sym_order = var('_lpy_order') + gen("%s = %s" % (strify(sym_order), arg.order)) + + sym_shape = tuple( + var("_lpy_shape_%d" % i) + for i in range(num_axes)) + + if not skip_arg_checks: + for i in range(num_axes): + gen("assert _lpy_shape_%d > 0, " + "\"'%s' has negative shape in axis %d\"" + % (i, arg.name, i)) + + gen("%(name)s = _lpy_np.empty(%(shape)s, " + "%(dtype)s, order=%(order)s)" + % dict( + name=arg.name, + shape=strify(sym_shape), + order=strify(sym_order), + dtype=self.python_dtype_str( + kernel_arg.dtype.numpy_dtype))) + + if not skip_arg_checks: + for i in range(num_axes): + gen("del _lpy_shape_%d" % i) + gen("del %s" % strify(sym_order)) + gen("") + + # }}} + + # {{{ arg setup + + def generate_arg_setup( + self, gen, kernel, implemented_data_info, options): + import loopy as lp + + from loopy.kernel.data import KernelArgument + from loopy.kernel.array import ArrayBase + from loopy.symbolic import StringifyMapper + from loopy.types import NumpyType + + gen("# {{{ set up array arguments") + gen("") + + if not options.no_numpy: + gen("_lpy_encountered_numpy = False") + gen("_lpy_encountered_dev = False") + gen("") + + args = [] + + strify = StringifyMapper() + + expect_no_more_arguments = False + + for arg_idx, arg in enumerate(implemented_data_info): + is_written = arg.base_name in kernel.get_written_variables() + kernel_arg = kernel.impl_arg_to_arg.get(arg.name) + + if not issubclass(arg.arg_class, KernelArgument): + expect_no_more_arguments = True + continue + + if expect_no_more_arguments: + raise LoopyError("Further arguments encountered after arg info " + "describing a global temporary variable") + + if not issubclass(arg.arg_class, ArrayBase): + args.append(arg.name) + continue + + gen("# {{{ process %s" % arg.name) + gen("") + + if not options.no_numpy: + self.handle_non_numpy_arg(gen, arg) + + if not options.skip_arg_checks and not is_written: + gen("if %s is None:" % arg.name) + with Indentation(gen): + gen("raise RuntimeError(\"input argument '%s' must " + "be supplied\")" % arg.name) + gen("") + + if (is_written + and arg.arg_class is lp.ImageArg + and not options.skip_arg_checks): + gen("if %s is None:" % arg.name) + with Indentation(gen): + gen("raise RuntimeError(\"written image '%s' must " + "be supplied\")" % arg.name) + gen("") + + if is_written and arg.shape is None and not options.skip_arg_checks: + gen("if %s is None:" % arg.name) + with Indentation(gen): + gen("raise RuntimeError(\"written argument '%s' has " + "unknown shape and must be supplied\")" % arg.name) + gen("") + + possibly_made_by_loopy = False + + # {{{ allocate written arrays, if needed + + if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \ + and arg.shape is not None: + + if not isinstance(arg.dtype, NumpyType): + raise LoopyError("do not know how to pass arg of type '%s'" + % arg.dtype) + + possibly_made_by_loopy = True + gen("_lpy_made_by_loopy = False") + gen("") + + gen("if %s is None:" % arg.name) + with Indentation(gen): + self.handle_alloc( + gen, arg, kernel_arg, strify, options.skip_arg_checks) + gen("_lpy_made_by_loopy = True") + gen("") + + # }}} + + # {{{ argument checking + + if arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \ + and not options.skip_arg_checks: + if possibly_made_by_loopy: + gen("if not _lpy_made_by_loopy:") + else: + gen("if True:") + + with Indentation(gen): + gen("if %s.dtype != %s:" + % (arg.name, self.python_dtype_str( + kernel_arg.dtype.numpy_dtype))) + with Indentation(gen): + gen("raise TypeError(\"dtype mismatch on argument '%s' " + "(got: %%s, expected: %s)\" %% %s.dtype)" + % (arg.name, arg.dtype, arg.name)) + + # {{{ generate shape checking code + + def strify_allowing_none(shape_axis): + if shape_axis is None: + return "None" + else: + return strify(shape_axis) + + def strify_tuple(t): + if len(t) == 0: + return "()" + else: + return "(%s,)" % ", ".join( + strify_allowing_none(sa) + for sa in t) + + shape_mismatch_msg = ( + "raise TypeError(\"shape mismatch on argument '%s' " + "(got: %%s, expected: %%s)\" " + "%% (%s.shape, %s))" + % (arg.name, arg.name, strify_tuple(arg.unvec_shape))) + + if kernel_arg.shape is None: + pass + + elif any(shape_axis is None for shape_axis in kernel_arg.shape): + gen("if len(%s.shape) != %s:" + % (arg.name, len(arg.unvec_shape))) + with Indentation(gen): + gen(shape_mismatch_msg) + + for i, shape_axis in enumerate(arg.unvec_shape): + if shape_axis is None: + continue + + gen("if %s.shape[%d] != %s:" + % (arg.name, i, strify(shape_axis))) + with Indentation(gen): + gen(shape_mismatch_msg) + + else: # not None, no Nones in tuple + gen("if %s.shape != %s:" + % (arg.name, strify(arg.unvec_shape))) + with Indentation(gen): + gen(shape_mismatch_msg) + + # }}} + + if arg.unvec_strides and kernel_arg.dim_tags: + itemsize = kernel_arg.dtype.numpy_dtype.itemsize + sym_strides = tuple( + itemsize*s_i for s_i in arg.unvec_strides) + gen("if %s.strides != %s:" + % (arg.name, strify(sym_strides))) + with Indentation(gen): + gen("raise TypeError(\"strides mismatch on " + "argument '%s' (got: %%s, expected: %%s)\" " + "%% (%s.strides, %s))" + % (arg.name, arg.name, strify(sym_strides))) + + if not arg.allows_offset: + gen("if %s.offset:" % arg.name) + with Indentation(gen): + gen("raise ValueError(\"Argument '%s' does not " + "allow arrays with offsets. Try passing " + "default_offset=loopy.auto to make_kernel()." + "\")" % arg.name) + gen("") + + # }}} + + if possibly_made_by_loopy and not options.skip_arg_checks: + gen("del _lpy_made_by_loopy") + gen("") + + if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]: + args.append("%s.base_data" % arg.name) + else: + args.append("%s" % arg.name) + + gen("") + + gen("# }}}") + gen("") + + gen("# }}}") + gen("") + + return args + + # }}} + + def target_specific_preamble(self, gen): + """ + Add default C-imports to preamble + """ + gen.add_to_preamble("import numpy as _lpy_np") + gen.add_to_preamble("import loopy.target.c_execution as _lpy_c") + + def intialize_system_args(self, gen): + """ + Override to intialize any default system args + """ + pass + + # {{{ generate invocation + + def generate_invocation(self, gen, kernel_name, args): + gen("for knl in _lpy_c_kernels:") + with Indentation(gen): + gen("{kernel_name}({args})" + .format( + kernel_name='knl.name', + args=", ".join(args))) + + # }}} + + # {{{ output + + def generate_output_handler( + self, gen, options, kernel, implemented_data_info): + + from loopy.kernel.data import KernelArgument + + if not options.no_numpy: + gen("if out_host is None and (_lpy_encountered_numpy " + "and not _lpy_encountered_dev):") + with Indentation(gen): + gen("out_host = True") + + gen("if out_host:") + with Indentation(gen): + gen("pass") # if no outputs (?!) + for arg in implemented_data_info: + if not issubclass(arg.arg_class, KernelArgument): + continue + + gen("") + + if options.return_dict: + gen("return None, {%s}" + % ", ".join("\"%s\": %s" % (arg.name, arg.name) + for arg in implemented_data_info + if issubclass(arg.arg_class, KernelArgument) + if arg.base_name in kernel.get_written_variables())) + else: + out_args = [arg + for arg in implemented_data_info + if issubclass(arg.arg_class, KernelArgument) + if arg.base_name in kernel.get_written_variables()] + if out_args: + gen("return None, (%s,)" + % ", ".join(arg.name for arg in out_args)) + else: + gen("return None, ()") + + # }}} + + def __call__(self, kernel, codegen_result): + """ + Generates the wrapping python invoker for this execution target + + :arg kernel: the loopy :class:`LoopKernel`(s) to be executued + :codegen_result: the loopy :class:`CodeGenerationResult` created + by code generation + + :returns: py_func, a python function that handles excution of this + kernel + """ + options = kernel.options + implemented_data_info = codegen_result.implemented_data_info + host_code = codegen_result.host_code() + + from loopy.kernel.data import KernelArgument + gen = PythonFunctionGenerator( + "invoke_%s_loopy_kernel" % kernel.name, + self.system_args + [ + "%s=None" % idi.name + for idi in implemented_data_info + if issubclass(idi.arg_class, KernelArgument) + ]) + + gen.add_to_preamble("from __future__ import division") + gen.add_to_preamble("") + self.target_specific_preamble(gen) + gen.add_to_preamble("") + gen.add_to_preamble(host_code) + gen.add_to_preamble("") + + self.intialize_system_args(gen) + + self.generate_integer_arg_finding_from_shapes( + gen, kernel, implemented_data_info) + self.generate_integer_arg_finding_from_offsets( + gen, kernel, implemented_data_info) + self.generate_integer_arg_finding_from_strides( + gen, kernel, implemented_data_info) + self.generate_value_arg_check( + gen, kernel, implemented_data_info) + + args = self.generate_arg_setup( + gen, kernel, implemented_data_info, options) + + self.generate_invocation(codegen_result.host_program.name, args) + + self.generate_output_handler(gen, options, kernel, implemented_data_info) + + if options.write_wrapper: + output = gen.get() + if options.highlight_wrapper: + output = get_highlighted_python_code(output) + + if options.write_wrapper is True: + print(output) + else: + with open(options.write_wrapper, "w") as outf: + outf.write(output) + + return gen.get_function() + + +# }}} + + +class _KernelInfo(ImmutableRecord): + pass + + +class _Kernels(object): + pass + + +# {{{ kernel executor class KernelExecutorBase(object): """An object connecting a kernel to a :class:`pyopencl.Context` @@ -121,7 +729,7 @@ class KernelExecutorBase(object): .. automethod:: __call__ """ - def __init__(self, kernel): + def __init__(self, kernel, invoker=ExecutionWrapperGeneratorBase()): """ :arg kernel: a loopy.LoopKernel """ @@ -137,6 +745,8 @@ class KernelExecutorBase(object): arg.dtype is None for arg in kernel.args) + self.invoker = invoker + @memoize_method def get_typed_and_scheduled_kernel(self, var_to_dtype_set): kernel = self.kernel @@ -195,6 +805,99 @@ class KernelExecutorBase(object): return frozenset(six.iteritems(arg_to_dtype)) + # {{{ debugging aids + + def get_highlighted_code(self, arg_to_dtype=None): + return get_highlighted_code( + self.get_code(arg_to_dtype)) + + def get_code(self, arg_to_dtype=None): + if arg_to_dtype is not None: + arg_to_dtype = frozenset(six.iteritems(arg_to_dtype)) + + kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype) + + from loopy.codegen import generate_code_v2 + code = generate_code_v2(kernel) + return code.device_code() + + # }}} + + # {{{ call and info generator + + @memoize_method + def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): + raise NotImplementedError() + + def __call__(self, queue, **kwargs): + """ + :arg allocator: a callable passed a byte count and returning + a :class:`pyopencl.Buffer`. A :class:`pyopencl` allocator + maybe. + :arg wait_for: A list of :class:`pyopencl.Event` instances + for which to wait. + :arg out_host: :class:`bool` + Decides whether output arguments (i.e. arguments + written by the kernel) are to be returned as + :mod:`numpy` arrays. *True* for yes, *False* for no. + + For the default value of *None*, if all (input) array + arguments are :mod:`numpy` arrays, defaults to + returning :mod:`numpy` arrays as well. + + :returns: ``(evt, output)`` where *evt* is a :class:`pyopencl.Event` + associated with the execution of the kernel, and + output is a tuple of output arguments (arguments that + are written as part of the kernel). The order is given + by the order of kernel arguments. If this order is unspecified + (such as when kernel arguments are inferred automatically), + enable :attr:`loopy.Options.return_dict` to make *output* a + :class:`dict` instead, with keys of argument names and values + of the returned arrays. + """ + + allocator = kwargs.pop("allocator", None) + wait_for = kwargs.pop("wait_for", None) + out_host = kwargs.pop("out_host", None) + + kwargs = self.packing_controller.unpack(kwargs) + + kernel_info = self.cl_kernel_info(self.arg_to_dtype_set(kwargs)) + + return kernel_info.invoker( + kernel_info.cl_kernels, queue, allocator, wait_for, + out_host, **kwargs) + + # }}} + +# }}} + +# {{{ code highlighers + + +def get_highlighted_python_code(text): + try: + from pygments import highlight + except ImportError: + return text + else: + from pygments.lexers import PythonLexer + from pygments.formatters import TerminalFormatter + + return highlight(text, PythonLexer(), TerminalFormatter()) + + +def get_highlighted_code(text): + try: + from pygments import highlight + except ImportError: + return text + else: + from pygments.lexers import CLexer + from pygments.formatters import TerminalFormatter + + return highlight(text, CLexer(), TerminalFormatter()) + # }}} # vim: foldmethod=marker diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index e4835a363..85d751260 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -260,8 +260,9 @@ class CTarget(TargetBase): hash_fields = TargetBase.hash_fields + ("fortran_abi",) comparison_fields = TargetBase.comparison_fields + ("fortran_abi",) - def __init__(self, fortran_abi=False): + def __init__(self, fortran_abi=False, compiler=None): self.fortran_abi = fortran_abi + self.compiler = compiler super(CTarget, self).__init__() def split_kernel_at_global_barriers(self): @@ -298,6 +299,13 @@ class CTarget(TargetBase): # These kind of shouldn't be here. return self.get_dtype_registry().dtype_to_ctype(dtype) + def get_kernel_executor_cache_key(self, *args, **kwargs): + return self.compiler + + def get_kernel_executor(self, knl, *args, **kwargs): + from loopy.target.c import CKernelExecutor + return CKernelExecutor(knl, self.compiler) + # }}} diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py new file mode 100644 index 000000000..230777529 --- /dev/null +++ b/loopy/target/c/c_execution.py @@ -0,0 +1,265 @@ +from __future__ import division, with_statement, absolute_import + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import tempfile +import cgen +import os +import subprocess + +from loopy.target.c import CTarget, generate_header +from loopy.codegen import generate_code +from loopy.execution import (KernelExecutorBase, _Kernels, + _KernelInfo, ExecutionWrapperGeneratorBase) +from pytools import memoize_method +import weakref + +import ctypes + +import logging +logger = logging.getLogger(__name__) + +""" +The compiler module handles invocation of compilers to generate a shared lib +which can be loaded via ctypes. +""" + + +class CCompiler(object): + """ + Wraps a C compiler to build and load shared libraries. + Defaults to gcc + """ + + source_suffix = 'c' + default_exe = 'gcc' + default_compile_flags = '-std=c99 -g -O3 -fPIC'.split() + default_link_flags = '-shared'.split() + + def __init__(self, cc=None, + cflags=None, + ldflags=None): + self.exe = cc if cc else self.default_exe + self.cflags = cflags or self.default_compile_flags[:] + self.ldflags = ldflags or self.default_link_flags[:] + self.tempdir = tempfile.TemporaryDirectory() + + def _tempname(self, name): + """Build temporary filename path in tempdir.""" + return os.path.join(self.tempdir.name, name) + + def _call(self, args, **kwargs): + """Invoke compiler with arguments.""" + cwd = self.tempdir.name + args_ = [self.exe] + args + logger.debug(args_) + subprocess.check_call(args_, cwd=cwd, **kwargs) + + def build(self, code): + """Compile code, build and load shared library.""" + logger.debug(code) + c_fname = self._tempname('code.' + self.source_suffix) + obj_fname = self._tempname('code.o') + dll_fname = self._tempname('code.so') + with open(c_fname, 'w') as fd: + fd.write(code) + self._call(self.compile_args(c_fname)) + self._call(self.link_args(obj_fname, dll_fname)) + return ctypes.CDLL(dll_fname) + + def compile_args(self, c_fname): + "Construct args for compile command." + return self.cflags + ['-c', c_fname] + + def link_args(self, obj_fname, dll_fname): + "Construct args for link command." + return self.ldflags + ['-shared', obj_fname, '-o', dll_fname] + + +class CppCompiler(CCompiler): + """Subclass of Compiler to invoke a C++ compiler. + Defaults to g++""" + source_suffix = 'cpp' + default_exe = 'g++' + default_compile_flags = '-g -O3'.split() + + +class CompiledKernel(object): + """ + A CompiledKernel wraps a loopy kernel, compiling it and loading the + result as a shared library, and provides access to the kernel as a + ctypes function object, wrapped by the __call__ method, which attempts + to automatically map argument types. + """ + + def __init__(self, knl, comp=None): + assert isinstance(knl.target, CTarget) + self.knl = knl + self.code, _ = generate_code(knl) + self.comp = comp or CCompiler() + self.dll = self.comp.build(self.code) + self.func_decl, = generate_header(knl) + self._arg_info = [] + # TODO knl.args[:].dtype is sufficient + self._visit_func_decl(self.func_decl) + self.name = self.knl.name + restype = self.func_decl.subdecl.typename + if restype == 'void': + self.restype = None + else: + raise ValueError('Unhandled restype %r' % (restype, )) + self._fn = getattr(self.dll, self.name) + self._fn.restype = self.restype + self._fn.argtypes = [ctype for name, ctype in self._arg_info] + self._prepared_call_cache = weakref.WeakKeyDictionary() + + def __call__(self, **kwargs): + """Execute kernel with given args mapped to ctypes equivalents.""" + args_ = [] + for knl_arg, arg_t in zip(self.knl.args, self._fn.argtypes): + arg = kwargs[knl_arg.name] + if hasattr(arg, 'ctypes'): + if arg.size == 0: + # TODO eliminate unused arguments from kernel + arg_ = arg_t(0.0) + else: + arg_ = arg.ctypes.data_as(arg_t) + else: + arg_ = arg_t(arg) + args_.append(arg_) + self._fn(*args_) + + def _append_arg(self, name, dtype, pointer=False): + """Append arg info to current argument list.""" + self._arg_info.append(( + name, + self._dtype_to_ctype(dtype, pointer=pointer) + )) + + def _visit_const(self, node): + """Visit const arg of kernel.""" + if isinstance(node.subdecl, cgen.RestrictPointer): + self._visit_pointer(node.subdecl) + else: + pod = node.subdecl # type: cgen.POD + self._append_arg(pod.name, pod.dtype) + + def _visit_pointer(self, node): + "Visit pointer argument of kernel." + pod = node.subdecl # type: cgen.POD + self._append_arg(pod.name, pod.dtype, pointer=True) + + def _visit_func_decl(self, func_decl): + """Visit nodes of function declaration of kernel.""" + for i, arg in enumerate(func_decl.arg_decls): + if isinstance(arg, cgen.Const): + self._visit_const(arg) + elif isinstance(arg, cgen.RestrictPointer): + self._visit_pointer(arg) + else: + raise ValueError('unhandled type for arg %r' % (arg, )) + + def _dtype_to_ctype(self, dtype, pointer=False): + """Map NumPy dtype to equivalent ctypes type.""" + target = self.knl.target # type: CTarget + registry = target.get_dtype_registry().wrapped_registry + typename = registry.dtype_to_ctype(dtype) + typename = {'unsigned': 'uint'}.get(typename, typename) + basetype = getattr(ctypes, 'c_' + typename) + if pointer: + return ctypes.POINTER(basetype) + return basetype + + +class CKernelExecutor(KernelExecutorBase): + """An object connecting a kernel to a :class:`CompiledKernel` + for execution. + + .. automethod:: __init__ + .. automethod:: __call__ + """ + + def __init__(self, kernel, compiler=None): + """ + :arg kernel: may be a loopy.LoopKernel, a generator returning kernels + (a warning will be issued if more than one is returned). If the + kernel has not yet been loop-scheduled, that is done, too, with no + specific arguments. + """ + + self.compiler = compiler if compiler else CCompiler() + super(CKernelExecutor, self).__init__(kernel) + + @memoize_method + def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): + kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) + + from loopy.codegen import generate_code_v2 + codegen_result = generate_code_v2(kernel) + + dev_code = codegen_result.device_code() + + if self.kernel.options.write_cl: + output = dev_code + if self.kernel.options.highlight_cl: + output = self.get_highlighted_code(output) + + if self.kernel.options.write_cl is True: + print(output) + else: + with open(self.kernel.options.write_cl, "w") as outf: + outf.write(output) + + if self.kernel.options.edit_cl: + from pytools import invoke_editor + dev_code = invoke_editor(dev_code, "code.cl") + + c_kernels = _Kernels() + for dp in codegen_result.device_programs: + setattr(c_kernels, dp.name, CompiledKernel(dp, self.compiler)) + + return _KernelInfo( + kernel=kernel, + c_kernels=c_kernels, + implemented_data_info=codegen_result.implemented_data_info, + invoker=self.invoker(kernel, codegen_result)) + + # }}} + + def __call__(self, **kwargs): + """ + :returns: ``(None, output)`` the output is a tuple of output arguments + (arguments that are written as part of the kernel). The order is given + by the order of kernel arguments. If this order is unspecified + (such as when kernel arguments are inferred automatically), + enable :attr:`loopy.Options.return_dict` to make *output* a + :class:`dict` instead, with keys of argument names and values + of the returned arrays. + """ + + kwargs = self.packing_controller.unpack(kwargs) + + kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) + + return kernel_info.invoker( + kernel_info.c_kernels, **kwargs) diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index a8f47adb9..a2574bf8a 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -22,17 +22,12 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import six from six.moves import range, zip from pytools import ImmutableRecord, memoize_method -from loopy.diagnostic import ParameterFinderWarning -from pytools.py_codegen import ( - Indentation, PythonFunctionGenerator) -from loopy.diagnostic import LoopyError -from loopy.types import NumpyType -from loopy.execution import KernelExecutorBase - +from pytools.py_codegen import Indentation +from loopy.execution import (KernelExecutorBase, ExecutionWrapperGeneratorBase, + _KernelInfo, _Kernels) import logging logger = logging.getLogger(__name__) @@ -43,507 +38,123 @@ logger = logging.getLogger(__name__) # Prefix all auxiliary variables with "_lpy". -def python_dtype_str(dtype): - import pyopencl.tools as cl_tools - if dtype.isbuiltin: - return "_lpy_np."+dtype.name - else: - return ("_lpy_cl_tools.get_or_register_dtype(\"%s\")" - % cl_tools.dtype_to_ctype(dtype)) - - -# {{{ integer arg finding from shapes - -def generate_integer_arg_finding_from_shapes(gen, kernel, implemented_data_info): - # a mapping from integer argument names to a list of tuples - # (arg_name, expression), where expression is a - # unary function of kernel.arg_dict[arg_name] - # returning the desired integer argument. - iarg_to_sources = {} - - from loopy.kernel.data import GlobalArg - from loopy.symbolic import DependencyMapper, StringifyMapper - dep_map = DependencyMapper() - - from pymbolic import var - for arg in implemented_data_info: - if arg.arg_class is GlobalArg: - sym_shape = var(arg.name).attr("shape") - for axis_nr, shape_i in enumerate(arg.shape): - if shape_i is None: - continue - - deps = dep_map(shape_i) - - if len(deps) == 1: - integer_arg_var, = deps - - if kernel.arg_dict[integer_arg_var.name].dtype.is_integral(): - from pymbolic.algorithm import solve_affine_equations_for - try: - # friggin' overkill :) - iarg_expr = solve_affine_equations_for( - [integer_arg_var.name], - [(shape_i, sym_shape.index(axis_nr))] - )[integer_arg_var] - except Exception as e: - #from traceback import print_exc - #print_exc() - - # went wrong? oh well - from warnings import warn - warn("Unable to generate code to automatically " - "find '%s' from the shape of '%s':\n%s" - % (integer_arg_var.name, arg.name, str(e)), - ParameterFinderWarning) - else: - iarg_to_sources.setdefault(integer_arg_var.name, []) \ - .append((arg.name, iarg_expr)) - - gen("# {{{ find integer arguments from shapes") - gen("") - - for iarg_name, sources in six.iteritems(iarg_to_sources): - gen("if %s is None:" % iarg_name) - with Indentation(gen): - if_stmt = "if" - for arg_name, value_expr in sources: - gen("%s %s is not None:" % (if_stmt, arg_name)) - with Indentation(gen): - gen("%s = %s" - % (iarg_name, StringifyMapper()(value_expr))) - - if_stmt = "elif" - - gen("") - - gen("# }}}") - gen("") - -# }}} - - -# {{{ integer arg finding from offsets - -def generate_integer_arg_finding_from_offsets(gen, kernel, implemented_data_info): - options = kernel.options - - gen("# {{{ find integer arguments from offsets") - gen("") - - for arg in implemented_data_info: - impl_array_name = arg.offset_for_name - if impl_array_name is not None: - gen("if %s is None:" % arg.name) - with Indentation(gen): - gen("if %s is None:" % impl_array_name) - with Indentation(gen): - gen("# Output variable, we'll be allocating " - "it, with zero offset.") - gen("%s = 0" % arg.name) - gen("else:") - with Indentation(gen): - if not options.no_numpy: - gen("_lpy_offset = getattr(%s, \"offset\", 0)" - % impl_array_name) - else: - gen("_lpy_offset = %s.offset" % impl_array_name) - - base_arg = kernel.impl_arg_to_arg[impl_array_name] - - if not options.skip_arg_checks: - gen("%s, _lpy_remdr = divmod(_lpy_offset, %d)" - % (arg.name, base_arg.dtype.itemsize)) - - gen("assert _lpy_remdr == 0, \"Offset of array '%s' is " - "not divisible by its dtype itemsize\"" - % impl_array_name) - gen("del _lpy_remdr") - else: - gen("%s = _lpy_offset // %d" - % (arg.name, base_arg.dtype.itemsize)) - - if not options.skip_arg_checks: - gen("del _lpy_offset") - - gen("# }}}") - gen("") - -# }}} - - -# {{{ integer arg finding from strides - -def generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info): - options = kernel.options - - gen("# {{{ find integer arguments from strides") - gen("") - - for arg in implemented_data_info: - if arg.stride_for_name_and_axis is not None: - impl_array_name, stride_impl_axis = arg.stride_for_name_and_axis - - gen("if %s is None:" % arg.name) - with Indentation(gen): - if not options.skip_arg_checks: - gen("if %s is None:" % impl_array_name) - with Indentation(gen): - gen("raise RuntimeError(\"required stride '%s' for " - "argument '%s' not given or deducible from " - "passed array\")" - % (arg.name, impl_array_name)) - - base_arg = kernel.impl_arg_to_arg[impl_array_name] - - if not options.skip_arg_checks: - gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)" - % (arg.name, impl_array_name, stride_impl_axis, - base_arg.dtype.dtype.itemsize)) - - gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' is " - "not divisible by its dtype itemsize\"" - % (stride_impl_axis, impl_array_name)) - gen("del _lpy_remdr") - else: - gen("%s = _lpy_offset // %d" - % (arg.name, base_arg.dtype.itemsize)) - - gen("# }}}") - gen("") - -# }}} - - -# {{{ check that value args are present - -def generate_value_arg_check(gen, kernel, implemented_data_info): - if kernel.options.skip_arg_checks: - return +class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): + """ + Specialized form of the :class:`ExecutionWrapperGeneratorBase` for + pyopencl execution + """ - from loopy.kernel.data import ValueArg + def __init__(self): + system_args = [ + "_lpy_cl_kernels", "queue", "allocator=None", "wait_for=None", + # ignored if options.no_numpy + "out_host=None" + ] + super(PyOpenCLExecutionWrapperGenerator, self).__init__(system_args) - gen("# {{{ check that value args are present") - gen("") + def python_dtype_str(self, dtype): + import pyopencl.tools as cl_tools + if dtype.isbuiltin: + return "_lpy_np."+dtype.name + else: + return ("_lpy_cl_tools.get_or_register_dtype(\"%s\")" + % cl_tools.dtype_to_ctype(dtype)) - for arg in implemented_data_info: - if not issubclass(arg.arg_class, ValueArg): - continue + # {{{ handle non-numpy args - gen("if %s is None:" % arg.name) + def handle_non_numpy_arg(self, gen, arg): + gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name) with Indentation(gen): - gen("raise TypeError(\"value argument '%s' " - "was not given and could not be automatically " - "determined\")" % arg.name) - - gen("# }}}") - gen("") - -# }}} - - -# {{{ arg setup - -def generate_arg_setup(gen, kernel, implemented_data_info, options): - import loopy as lp - - from loopy.kernel.data import KernelArgument - from loopy.kernel.array import ArrayBase - from loopy.symbolic import StringifyMapper - from pymbolic import var - - gen("# {{{ set up array arguments") - gen("") - - if not options.no_numpy: - gen("_lpy_encountered_numpy = False") - gen("_lpy_encountered_dev = False") - gen("") - - args = [] - - strify = StringifyMapper() - - expect_no_more_arguments = False - - for arg_idx, arg in enumerate(implemented_data_info): - is_written = arg.base_name in kernel.get_written_variables() - kernel_arg = kernel.impl_arg_to_arg.get(arg.name) - - if not issubclass(arg.arg_class, KernelArgument): - expect_no_more_arguments = True - continue - - if expect_no_more_arguments: - raise LoopyError("Further arguments encountered after arg info " - "describing a global temporary variable") - - if not issubclass(arg.arg_class, ArrayBase): - args.append(arg.name) - continue + gen("# synchronous, nothing to worry about") + gen("%s = _lpy_cl_array.to_device(" + "queue, %s, allocator=allocator)" + % (arg.name, arg.name)) + gen("_lpy_encountered_numpy = True") + gen("elif %s is not None:" % arg.name) + with Indentation(gen): + gen("_lpy_encountered_dev = True") - gen("# {{{ process %s" % arg.name) gen("") - if not options.no_numpy: - gen("if isinstance(%s, _lpy_np.ndarray):" % arg.name) - with Indentation(gen): - gen("# synchronous, nothing to worry about") - gen("%s = _lpy_cl_array.to_device(" - "queue, %s, allocator=allocator)" - % (arg.name, arg.name)) - gen("_lpy_encountered_numpy = True") - gen("elif %s is not None:" % arg.name) - with Indentation(gen): - gen("_lpy_encountered_dev = True") - - gen("") - - if not options.skip_arg_checks and not is_written: - gen("if %s is None:" % arg.name) - with Indentation(gen): - gen("raise RuntimeError(\"input argument '%s' must " - "be supplied\")" % arg.name) - gen("") - - if (is_written - and arg.arg_class is lp.ImageArg - and not options.skip_arg_checks): - gen("if %s is None:" % arg.name) - with Indentation(gen): - gen("raise RuntimeError(\"written image '%s' must " - "be supplied\")" % arg.name) - gen("") - - if is_written and arg.shape is None and not options.skip_arg_checks: - gen("if %s is None:" % arg.name) - with Indentation(gen): - gen("raise RuntimeError(\"written argument '%s' has " - "unknown shape and must be supplied\")" % arg.name) - gen("") - - possibly_made_by_loopy = False - - # {{{ allocate written arrays, if needed + # {{{ handle allocation of unspecified arguements - if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \ - and arg.shape is not None: - - if not isinstance(arg.dtype, NumpyType): - raise LoopyError("do not know how to pass arg of type '%s'" - % arg.dtype) - - possibly_made_by_loopy = True - gen("_lpy_made_by_loopy = False") - gen("") - - gen("if %s is None:" % arg.name) - with Indentation(gen): - num_axes = len(arg.strides) - for i in range(num_axes): - gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i]))) - - itemsize = kernel_arg.dtype.numpy_dtype.itemsize - for i in range(num_axes): - gen("_lpy_strides_%d = %s" % (i, strify( - itemsize*arg.unvec_strides[i]))) - - if not options.skip_arg_checks: - for i in range(num_axes): - gen("assert _lpy_strides_%d > 0, " - "\"'%s' has negative stride in axis %d\"" - % (i, arg.name, i)) - - sym_strides = tuple( - var("_lpy_strides_%d" % i) - for i in range(num_axes)) - sym_shape = tuple( - var("_lpy_shape_%d" % i) - for i in range(num_axes)) - - alloc_size_expr = (sum(astrd*(alen-1) - for alen, astrd in zip(sym_shape, sym_strides)) - + itemsize) - - gen("_lpy_alloc_size = %s" % strify(alloc_size_expr)) - gen("%(name)s = _lpy_cl_array.Array(queue, %(shape)s, " - "%(dtype)s, strides=%(strides)s, " - "data=allocator(_lpy_alloc_size), allocator=allocator)" - % dict( - name=arg.name, - shape=strify(sym_shape), - strides=strify(sym_strides), - dtype=python_dtype_str(kernel_arg.dtype.numpy_dtype))) - - if not options.skip_arg_checks: - for i in range(num_axes): - gen("del _lpy_shape_%d" % i) - gen("del _lpy_strides_%d" % i) - gen("del _lpy_alloc_size") - gen("") - - gen("_lpy_made_by_loopy = True") - gen("") - - # }}} - - # {{{ argument checking - - if arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \ - and not options.skip_arg_checks: - if possibly_made_by_loopy: - gen("if not _lpy_made_by_loopy:") - else: - gen("if True:") - - with Indentation(gen): - gen("if %s.dtype != %s:" - % (arg.name, python_dtype_str(kernel_arg.dtype.numpy_dtype))) - with Indentation(gen): - gen("raise TypeError(\"dtype mismatch on argument '%s' " - "(got: %%s, expected: %s)\" %% %s.dtype)" - % (arg.name, arg.dtype, arg.name)) - - # {{{ generate shape checking code - - def strify_allowing_none(shape_axis): - if shape_axis is None: - return "None" - else: - return strify(shape_axis) - - def strify_tuple(t): - if len(t) == 0: - return "()" - else: - return "(%s,)" % ", ".join( - strify_allowing_none(sa) - for sa in t) - - shape_mismatch_msg = ( - "raise TypeError(\"shape mismatch on argument '%s' " - "(got: %%s, expected: %%s)\" " - "%% (%s.shape, %s))" - % (arg.name, arg.name, strify_tuple(arg.unvec_shape))) - - if kernel_arg.shape is None: - pass - - elif any(shape_axis is None for shape_axis in kernel_arg.shape): - gen("if len(%s.shape) != %s:" - % (arg.name, len(arg.unvec_shape))) - with Indentation(gen): - gen(shape_mismatch_msg) - - for i, shape_axis in enumerate(arg.unvec_shape): - if shape_axis is None: - continue - - gen("if %s.shape[%d] != %s:" - % (arg.name, i, strify(shape_axis))) - with Indentation(gen): - gen(shape_mismatch_msg) - - else: # not None, no Nones in tuple - gen("if %s.shape != %s:" - % (arg.name, strify(arg.unvec_shape))) - with Indentation(gen): - gen(shape_mismatch_msg) - - # }}} - - if arg.unvec_strides and kernel_arg.dim_tags: - itemsize = kernel_arg.dtype.numpy_dtype.itemsize - sym_strides = tuple( - itemsize*s_i for s_i in arg.unvec_strides) - gen("if %s.strides != %s:" - % (arg.name, strify(sym_strides))) - with Indentation(gen): - gen("raise TypeError(\"strides mismatch on " - "argument '%s' (got: %%s, expected: %%s)\" " - "%% (%s.strides, %s))" - % (arg.name, arg.name, strify(sym_strides))) - - if not arg.allows_offset: - gen("if %s.offset:" % arg.name) - with Indentation(gen): - gen("raise ValueError(\"Argument '%s' does not " - "allow arrays with offsets. Try passing " - "default_offset=loopy.auto to make_kernel()." - "\")" % arg.name) - gen("") - - # }}} - - if possibly_made_by_loopy and not options.skip_arg_checks: - gen("del _lpy_made_by_loopy") + def handle_alloc(self, gen, arg, kernel_arg, strify, skip_arg_checks): + """ + Handle allocation of non-specified arguements for pyopencl execution + """ + from pymbolic import var + + num_axes = len(arg.strides) + for i in range(num_axes): + gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i]))) + + itemsize = kernel_arg.dtype.numpy_dtype.itemsize + for i in range(num_axes): + gen("_lpy_strides_%d = %s" % (i, strify( + itemsize*arg.unvec_strides[i]))) + + if not skip_arg_checks: + for i in range(num_axes): + gen("assert _lpy_strides_%d > 0, " + "\"'%s' has negative stride in axis %d\"" + % (i, arg.name, i)) + + sym_strides = tuple( + var("_lpy_strides_%d" % i) + for i in range(num_axes)) + sym_shape = tuple( + var("_lpy_shape_%d" % i) + for i in range(num_axes)) + + alloc_size_expr = (sum(astrd*(alen-1) + for alen, astrd in zip(sym_shape, sym_strides)) + + itemsize) + + gen("_lpy_alloc_size = %s" % strify(alloc_size_expr)) + gen("%(name)s = _lpy_cl_array.Array(queue, %(shape)s, " + "%(dtype)s, strides=%(strides)s, " + "data=allocator(_lpy_alloc_size), allocator=allocator)" + % dict( + name=arg.name, + shape=strify(sym_shape), + strides=strify(sym_strides), + dtype=self.python_dtype_str(kernel_arg.dtype.numpy_dtype))) + + if not skip_arg_checks: + for i in range(num_axes): + gen("del _lpy_shape_%d" % i) + gen("del _lpy_strides_%d" % i) + gen("del _lpy_alloc_size") gen("") - if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]: - args.append("%s.base_data" % arg.name) - else: - args.append("%s" % arg.name) + # }}} - gen("") + def target_specific_preamble(self, gen): + """ + Add default pyopencl imports to preamble + """ + gen.add_to_preamble("import numpy as _lpy_np") + gen.add_to_preamble("import pyopencl as _lpy_cl") + gen.add_to_preamble("import pyopencl.array as _lpy_cl_array") + gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools") - gen("# }}}") + def initialize_system_args(self, gen): + """ + Initializes possibly empty system arguements + """ + gen("if allocator is None:") + with Indentation(gen): + gen("allocator = _lpy_cl_tools.DeferredAllocator(queue.context)") gen("") - gen("# }}}") - gen("") - - return args - -# }}} - - -def generate_invoker(kernel, codegen_result): - options = kernel.options - implemented_data_info = codegen_result.implemented_data_info - host_code = codegen_result.host_code() - - system_args = [ - "_lpy_cl_kernels", "queue", "allocator=None", "wait_for=None", - # ignored if options.no_numpy - "out_host=None" - ] - - from loopy.kernel.data import KernelArgument - gen = PythonFunctionGenerator( - "invoke_%s_loopy_kernel" % kernel.name, - system_args + [ - "%s=None" % idi.name - for idi in implemented_data_info - if issubclass(idi.arg_class, KernelArgument) - ]) - - gen.add_to_preamble("from __future__ import division") - gen.add_to_preamble("") - gen.add_to_preamble("import pyopencl as _lpy_cl") - gen.add_to_preamble("import pyopencl.array as _lpy_cl_array") - gen.add_to_preamble("import pyopencl.tools as _lpy_cl_tools") - gen.add_to_preamble("import numpy as _lpy_np") - gen.add_to_preamble("") - gen.add_to_preamble(host_code) - gen.add_to_preamble("") - - gen("if allocator is None:") - with Indentation(gen): - gen("allocator = _lpy_cl_tools.DeferredAllocator(queue.context)") - gen("") - - generate_integer_arg_finding_from_shapes(gen, kernel, implemented_data_info) - generate_integer_arg_finding_from_offsets(gen, kernel, implemented_data_info) - generate_integer_arg_finding_from_strides(gen, kernel, implemented_data_info) - generate_value_arg_check(gen, kernel, implemented_data_info) - - args = generate_arg_setup(gen, kernel, implemented_data_info, options) - # {{{ generate invocation - gen("_lpy_evt = {kernel_name}({args})" + def generate_invocation(self, gen, kernel_name, args): + gen("for knl in _lpy_cl_kernels:") + with Indentation(gen): + gen("_lpy_evt = {kernel_name}({args})" .format( - kernel_name=codegen_result.host_program.name, + kernel_name=kernel_name, args=", ".join( ["_lpy_cl_kernels", "queue"] + args @@ -551,72 +162,52 @@ def generate_invoker(kernel, codegen_result): # }}} - # {{{ output + # {{{ - if not options.no_numpy: - gen("if out_host is None and (_lpy_encountered_numpy " - "and not _lpy_encountered_dev):") - with Indentation(gen): - gen("out_host = True") + def generate_output_handler( + self, gen, options, kernel, implemented_data_info): - gen("if out_host:") - with Indentation(gen): - gen("pass") # if no outputs (?!) - for arg in implemented_data_info: - if not issubclass(arg.arg_class, KernelArgument): - continue + from loopy.kernel.data import KernelArgument - is_written = arg.base_name in kernel.get_written_variables() - if is_written: - gen("%s = %s.get(queue=queue)" % (arg.name, arg.name)) - - gen("") - - if options.return_dict: - gen("return _lpy_evt, {%s}" - % ", ".join("\"%s\": %s" % (arg.name, arg.name) - for arg in implemented_data_info - if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) - else: - out_args = [arg - for arg in implemented_data_info - if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables()] - if out_args: - gen("return _lpy_evt, (%s,)" - % ", ".join(arg.name for arg in out_args)) - else: - gen("return _lpy_evt, ()") + if not options.no_numpy: + gen("if out_host is None and (_lpy_encountered_numpy " + "and not _lpy_encountered_dev):") + with Indentation(gen): + gen("out_host = True") - # }}} + gen("if out_host:") + with Indentation(gen): + gen("pass") # if no outputs (?!) + for arg in implemented_data_info: + if not issubclass(arg.arg_class, KernelArgument): + continue - if options.write_wrapper: - output = gen.get() - if options.highlight_wrapper: - output = get_highlighted_python_code(output) + gen("") - if options.write_wrapper is True: - print(output) + if options.return_dict: + gen("return None, {%s}" + % ", ".join("\"%s\": %s" % (arg.name, arg.name) + for arg in implemented_data_info + if issubclass(arg.arg_class, KernelArgument) + if arg.base_name in kernel.get_written_variables())) else: - with open(options.write_wrapper, "w") as outf: - outf.write(output) - - return gen.get_function() + out_args = [arg + for arg in implemented_data_info + if issubclass(arg.arg_class, KernelArgument) + if arg.base_name in kernel.get_written_variables()] + if out_args: + gen("return None, (%s,)" + % ", ".join(arg.name for arg in out_args)) + else: + gen("return None, ()") + # }}} # }}} # {{{ kernel executor -class _CLKernelInfo(ImmutableRecord): - pass - - -class _CLKernels(object): - pass - class PyOpenCLKernelExecutor(KernelExecutorBase): """An object connecting a kernel to a :class:`pyopencl.Context` @@ -635,7 +226,8 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): specific arguments. """ - super(PyOpenCLKernelExecutor, self).__init__(kernel) + super(PyOpenCLKernelExecutor, self).__init__( + kernel, invoker=PyOpenCLExecutionWrapperGenerator()) self.context = context @@ -644,10 +236,11 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): self.kernel = kernel.copy(target=PyOpenCLTarget(context.devices[0])) @memoize_method - def cl_kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): + def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) from loopy.codegen import generate_code_v2 + from loopy.execution import get_highlighted_code codegen_result = generate_code_v2(kernel) dev_code = codegen_result.device_code() @@ -655,7 +248,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): if self.kernel.options.write_cl: output = dev_code if self.kernel.options.highlight_cl: - output = get_highlighted_cl_code(output) + output = get_highlighted_code(output) if self.kernel.options.write_cl is True: print(output) @@ -673,33 +266,15 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): cl.Program(self.context, dev_code) .build(options=kernel.options.cl_build_options)) - cl_kernels = _CLKernels() + cl_kernels = _Kernels() for dp in codegen_result.device_programs: setattr(cl_kernels, dp.name, getattr(cl_program, dp.name)) - return _CLKernelInfo( + return _KernelInfo( kernel=kernel, cl_kernels=cl_kernels, implemented_data_info=codegen_result.implemented_data_info, - invoker=generate_invoker(kernel, codegen_result)) - - # {{{ debugging aids - - def get_code(self, arg_to_dtype=None): - if arg_to_dtype is not None: - arg_to_dtype = frozenset(six.iteritems(arg_to_dtype)) - - kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype) - - from loopy.codegen import generate_code_v2 - code = generate_code_v2(kernel) - return code.device_code() - - def get_highlighted_code(self, arg_to_dtype=None): - return get_highlighted_cl_code( - self.get_code(arg_to_dtype)) - - # }}} + invoker=self.invoker(kernel, codegen_result)) def __call__(self, queue, **kwargs): """ @@ -742,29 +317,4 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): # }}} - -def get_highlighted_python_code(text): - try: - from pygments import highlight - except ImportError: - return text - else: - from pygments.lexers import PythonLexer - from pygments.formatters import TerminalFormatter - - return highlight(text, PythonLexer(), TerminalFormatter()) - - -def get_highlighted_cl_code(text): - try: - from pygments import highlight - except ImportError: - return text - else: - from pygments.lexers import CLexer - from pygments.formatters import TerminalFormatter - - return highlight(text, CLexer(), TerminalFormatter()) - - # vim: foldmethod=marker -- GitLab From cd9b8c8db145f37bc602ca1f9704cb48186e41d4 Mon Sep 17 00:00:00 2001 From: arghdos Date: Mon, 1 May 2017 14:56:54 -0400 Subject: [PATCH 02/65] fixes --- loopy/auto_test.py | 4 +-- loopy/check.py | 4 +-- loopy/compiled.py | 4 +-- loopy/execution.py | 51 ++++++++++++++++++++---------- loopy/target/c/__init__.py | 2 +- loopy/target/c/c_execution.py | 33 +++++++++++-------- loopy/target/pyopencl_execution.py | 4 +-- 7 files changed, 64 insertions(+), 38 deletions(-) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 6a4d55975..db29d5130 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -403,7 +403,7 @@ def auto_test_vs_ref( raise LoopyError("ref_knl and test_knl argument lists disagree at index " "%d (1-based)" % (i+1)) - from loopy.compiled import CompiledKernel, get_highlighted_cl_code + from loopy.compiled import CompiledKernel, get_highlighted_code if isinstance(op_count, (int, float)): warn("op_count should be a list", stacklevel=2) @@ -448,7 +448,7 @@ def auto_test_vs_ref( print(75*"-") print("Reference Code:") print(75*"-") - print(get_highlighted_cl_code(ref_compiled.get_code())) + print(get_highlighted_code(ref_compiled.get_code())) print(75*"-") ref_cl_kernel_info = ref_compiled.cl_kernel_info(frozenset()) diff --git a/loopy/check.py b/loopy/check.py index 6a1e3dc33..e8082c0ee 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -731,8 +731,8 @@ def check_implemented_domains(kernel, implemented_domains, code=None): print(79*"-") print("CODE:") print(79*"-") - from loopy.compiled import get_highlighted_cl_code - print(get_highlighted_cl_code(code)) + from loopy.compiled import get_highlighted_code + print(get_highlighted_code(code)) print(79*"-") raise LoopyError("sanity check failed--implemented and desired " diff --git a/loopy/compiled.py b/loopy/compiled.py index b3e4fe058..062ba60cc 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -24,8 +24,8 @@ THE SOFTWARE. from loopy.target.pyopencl_execution import ( # noqa - PyOpenCLKernelExecutor, - get_highlighted_cl_code) + PyOpenCLKernelExecutor) +from loopy.execution import get_highlighted_code # {{{ compatibility diff --git a/loopy/execution.py b/loopy/execution.py index 65968e663..ad33ae3e7 100644 --- a/loopy/execution.py +++ b/loopy/execution.py @@ -126,7 +126,10 @@ class ExecutionWrapperGeneratorBase(object): self.system_args = system_args[:] def python_dtype_str(self, dtype): - if dtype.isbuiltin: + # TODO: figure out why isbuiltin isn't working in test (requiring second + # line) + if dtype.isbuiltin or \ + np.dtype(str(dtype)).isbuiltin: return "_lpy_np."+dtype.name raise Exception('dtype: {} not recognized'.format(dtype)) @@ -329,7 +332,7 @@ class ExecutionWrapperGeneratorBase(object): # {{{ handle non numpy arguements def handle_non_numpy_arg(self, gen, arg): - raise Exception('Non-numpy args are not allowed for C-execution') + pass # }}} @@ -345,32 +348,45 @@ class ExecutionWrapperGeneratorBase(object): for i in range(num_axes): gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i]))) - sym_order = var('_lpy_order') - gen("%s = %s" % (strify(sym_order), arg.order)) - - sym_shape = tuple( - var("_lpy_shape_%d" % i) - for i in range(num_axes)) + itemsize = kernel_arg.dtype.numpy_dtype.itemsize + for i in range(num_axes): + gen("_lpy_strides_%d = %s" % (i, strify( + itemsize*arg.unvec_strides[i]))) if not skip_arg_checks: for i in range(num_axes): - gen("assert _lpy_shape_%d > 0, " - "\"'%s' has negative shape in axis %d\"" + gen("assert _lpy_strides_%d > 0, " + "\"'%s' has negative stride in axis %d\"" % (i, arg.name, i)) + sym_strides = tuple( + var("_lpy_strides_%d" % i) + for i in range(num_axes)) + + sym_shape = tuple( + var("_lpy_shape_%d" % i) + for i in range(num_axes)) + gen("%(name)s = _lpy_np.empty(%(shape)s, " - "%(dtype)s, order=%(order)s)" + "%(dtype)s)" % dict( name=arg.name, shape=strify(sym_shape), - order=strify(sym_order), dtype=self.python_dtype_str( kernel_arg.dtype.numpy_dtype))) + #check strides + gen("%(name)s = _lpy_strided(%(name)s, %(shape)s, " + "%(strides)s)" + % dict( + name=arg.name, + shape=strify(sym_shape), + strides=strify(sym_strides))) + if not skip_arg_checks: for i in range(num_axes): gen("del _lpy_shape_%d" % i) - gen("del %s" % strify(sym_order)) + gen("del _lpy_strides_%d" % i) gen("") # }}} @@ -548,7 +564,8 @@ class ExecutionWrapperGeneratorBase(object): % (arg.name, arg.name, strify(sym_strides))) if not arg.allows_offset: - gen("if %s.offset:" % arg.name) + gen("if hasattr(%s, 'offset') and %s.offset:" % ( + arg.name, arg.name)) with Indentation(gen): gen("raise ValueError(\"Argument '%s' does not " "allow arrays with offsets. Try passing " @@ -584,7 +601,8 @@ class ExecutionWrapperGeneratorBase(object): Add default C-imports to preamble """ gen.add_to_preamble("import numpy as _lpy_np") - gen.add_to_preamble("import loopy.target.c_execution as _lpy_c") + gen.add_to_preamble("from loopy.target.c.compyte.array" + " import as_strided as _lpy_strided") def intialize_system_args(self, gen): """ @@ -690,10 +708,11 @@ class ExecutionWrapperGeneratorBase(object): args = self.generate_arg_setup( gen, kernel, implemented_data_info, options) - self.generate_invocation(codegen_result.host_program.name, args) + self.generate_invocation(gen, codegen_result.host_program.name, args) self.generate_output_handler(gen, options, kernel, implemented_data_info) + import pdb; pdb.set_trace() if options.write_wrapper: output = gen.get() if options.highlight_wrapper: diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 85d751260..d2e72b84b 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -303,7 +303,7 @@ class CTarget(TargetBase): return self.compiler def get_kernel_executor(self, knl, *args, **kwargs): - from loopy.target.c import CKernelExecutor + from loopy.target.c.c_execution import CKernelExecutor return CKernelExecutor(knl, self.compiler) # }}} diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 230777529..618da0226 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -27,8 +27,6 @@ import cgen import os import subprocess -from loopy.target.c import CTarget, generate_header -from loopy.codegen import generate_code from loopy.execution import (KernelExecutorBase, _Kernels, _KernelInfo, ExecutionWrapperGeneratorBase) from pytools import memoize_method @@ -104,21 +102,28 @@ class CppCompiler(CCompiler): default_compile_flags = '-g -O3'.split() -class CompiledKernel(object): +class CompiledCKernel(object): """ - A CompiledKernel wraps a loopy kernel, compiling it and loading the + A CompiledCKernel wraps a loopy kernel, compiling it and loading the result as a shared library, and provides access to the kernel as a ctypes function object, wrapped by the __call__ method, which attempts to automatically map argument types. """ - def __init__(self, knl, comp=None): - assert isinstance(knl.target, CTarget) + def __init__(self, knl, target, comp=None): + from loopy.target.c import CTarget + assert isinstance(target, CTarget) + self.target = target self.knl = knl - self.code, _ = generate_code(knl) + # get code and build + self.code = str(knl.ast) self.comp = comp or CCompiler() self.dll = self.comp.build(self.code) - self.func_decl, = generate_header(knl) + # get the function declaration for interface with ctypes + from loopy.target.c import CFunctionDeclExtractor + self.func_decl = CFunctionDeclExtractor() + self.func_decl(knl.ast) + self.func_decl = self.func_decl.decls[0] self._arg_info = [] # TODO knl.args[:].dtype is sufficient self._visit_func_decl(self.func_decl) @@ -165,7 +170,7 @@ class CompiledKernel(object): self._append_arg(pod.name, pod.dtype) def _visit_pointer(self, node): - "Visit pointer argument of kernel." + """Visit pointer argument of kernel.""" pod = node.subdecl # type: cgen.POD self._append_arg(pod.name, pod.dtype, pointer=True) @@ -181,7 +186,7 @@ class CompiledKernel(object): def _dtype_to_ctype(self, dtype, pointer=False): """Map NumPy dtype to equivalent ctypes type.""" - target = self.knl.target # type: CTarget + target = self.target # type: CTarget registry = target.get_dtype_registry().wrapped_registry typename = registry.dtype_to_ctype(dtype) typename = {'unsigned': 'uint'}.get(typename, typename) @@ -236,7 +241,9 @@ class CKernelExecutor(KernelExecutorBase): c_kernels = _Kernels() for dp in codegen_result.device_programs: - setattr(c_kernels, dp.name, CompiledKernel(dp, self.compiler)) + setattr(c_kernels, dp.name, CompiledCKernel(dp, + self.kernel.target, + self.compiler)) return _KernelInfo( kernel=kernel, @@ -246,7 +253,7 @@ class CKernelExecutor(KernelExecutorBase): # }}} - def __call__(self, **kwargs): + def __call__(self, *args, **kwargs): """ :returns: ``(None, output)`` the output is a tuple of output arguments (arguments that are written as part of the kernel). The order is given @@ -262,4 +269,4 @@ class CKernelExecutor(KernelExecutorBase): kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) return kernel_info.invoker( - kernel_info.c_kernels, **kwargs) + kernel_info.c_kernels, *args, **kwargs) diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index a2574bf8a..021bc786f 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -24,7 +24,7 @@ THE SOFTWARE. from six.moves import range, zip -from pytools import ImmutableRecord, memoize_method +from pytools import memoize_method from pytools.py_codegen import Indentation from loopy.execution import (KernelExecutorBase, ExecutionWrapperGeneratorBase, _KernelInfo, _Kernels) @@ -309,7 +309,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): kwargs = self.packing_controller.unpack(kwargs) - kernel_info = self.cl_kernel_info(self.arg_to_dtype_set(kwargs)) + kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) return kernel_info.invoker( kernel_info.cl_kernels, queue, allocator, wait_for, -- GitLab From 98303ded5eb205e1d3fac68851b8ab357c72d2e4 Mon Sep 17 00:00:00 2001 From: arghdos Date: Mon, 1 May 2017 15:28:42 -0400 Subject: [PATCH 03/65] runs simple C kernels --- loopy/execution.py | 72 +++++------------------------- loopy/target/c/c_execution.py | 11 ++--- loopy/target/pyopencl_execution.py | 28 +++++++----- 3 files changed, 32 insertions(+), 79 deletions(-) diff --git a/loopy/execution.py b/loopy/execution.py index ad33ae3e7..ee2666745 100644 --- a/loopy/execution.py +++ b/loopy/execution.py @@ -391,6 +391,9 @@ class ExecutionWrapperGeneratorBase(object): # }}} + def get_arg_pass(self, arg): + return arg.name + # {{{ arg setup def generate_arg_setup( @@ -580,7 +583,7 @@ class ExecutionWrapperGeneratorBase(object): gen("") if arg.arg_class in [lp.GlobalArg, lp.ConstantArg]: - args.append("%s.base_data" % arg.name) + args.append(self.get_arg_pass(arg)) else: args.append("%s" % arg.name) @@ -604,7 +607,7 @@ class ExecutionWrapperGeneratorBase(object): gen.add_to_preamble("from loopy.target.c.compyte.array" " import as_strided as _lpy_strided") - def intialize_system_args(self, gen): + def initialize_system_args(self, gen): """ Override to intialize any default system args """ @@ -615,10 +618,8 @@ class ExecutionWrapperGeneratorBase(object): def generate_invocation(self, gen, kernel_name, args): gen("for knl in _lpy_c_kernels:") with Indentation(gen): - gen("{kernel_name}({args})" - .format( - kernel_name='knl.name', - args=", ".join(args))) + gen('knl({args})'.format( + args=", ".join(args))) # }}} @@ -629,21 +630,6 @@ class ExecutionWrapperGeneratorBase(object): from loopy.kernel.data import KernelArgument - if not options.no_numpy: - gen("if out_host is None and (_lpy_encountered_numpy " - "and not _lpy_encountered_dev):") - with Indentation(gen): - gen("out_host = True") - - gen("if out_host:") - with Indentation(gen): - gen("pass") # if no outputs (?!) - for arg in implemented_data_info: - if not issubclass(arg.arg_class, KernelArgument): - continue - - gen("") - if options.return_dict: gen("return None, {%s}" % ", ".join("\"%s\": %s" % (arg.name, arg.name) @@ -663,6 +649,9 @@ class ExecutionWrapperGeneratorBase(object): # }}} + def generate_host_code(self, gen, codegen_result): + pass + def __call__(self, kernel, codegen_result): """ Generates the wrapping python invoker for this execution target @@ -694,7 +683,7 @@ class ExecutionWrapperGeneratorBase(object): gen.add_to_preamble(host_code) gen.add_to_preamble("") - self.intialize_system_args(gen) + self.initialize_system_args(gen) self.generate_integer_arg_finding_from_shapes( gen, kernel, implemented_data_info) @@ -712,7 +701,6 @@ class ExecutionWrapperGeneratorBase(object): self.generate_output_handler(gen, options, kernel, implemented_data_info) - import pdb; pdb.set_trace() if options.write_wrapper: output = gen.get() if options.highlight_wrapper: @@ -849,43 +837,7 @@ class KernelExecutorBase(object): raise NotImplementedError() def __call__(self, queue, **kwargs): - """ - :arg allocator: a callable passed a byte count and returning - a :class:`pyopencl.Buffer`. A :class:`pyopencl` allocator - maybe. - :arg wait_for: A list of :class:`pyopencl.Event` instances - for which to wait. - :arg out_host: :class:`bool` - Decides whether output arguments (i.e. arguments - written by the kernel) are to be returned as - :mod:`numpy` arrays. *True* for yes, *False* for no. - - For the default value of *None*, if all (input) array - arguments are :mod:`numpy` arrays, defaults to - returning :mod:`numpy` arrays as well. - - :returns: ``(evt, output)`` where *evt* is a :class:`pyopencl.Event` - associated with the execution of the kernel, and - output is a tuple of output arguments (arguments that - are written as part of the kernel). The order is given - by the order of kernel arguments. If this order is unspecified - (such as when kernel arguments are inferred automatically), - enable :attr:`loopy.Options.return_dict` to make *output* a - :class:`dict` instead, with keys of argument names and values - of the returned arrays. - """ - - allocator = kwargs.pop("allocator", None) - wait_for = kwargs.pop("wait_for", None) - out_host = kwargs.pop("out_host", None) - - kwargs = self.packing_controller.unpack(kwargs) - - kernel_info = self.cl_kernel_info(self.arg_to_dtype_set(kwargs)) - - return kernel_info.invoker( - kernel_info.cl_kernels, queue, allocator, wait_for, - out_host, **kwargs) + raise NotImplementedError() # }}} diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 618da0226..d819f347a 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -138,11 +138,10 @@ class CompiledCKernel(object): self._fn.argtypes = [ctype for name, ctype in self._arg_info] self._prepared_call_cache = weakref.WeakKeyDictionary() - def __call__(self, **kwargs): + def __call__(self, *args): """Execute kernel with given args mapped to ctypes equivalents.""" args_ = [] - for knl_arg, arg_t in zip(self.knl.args, self._fn.argtypes): - arg = kwargs[knl_arg.name] + for arg, arg_t in zip(args, self._fn.argtypes): if hasattr(arg, 'ctypes'): if arg.size == 0: # TODO eliminate unused arguments from kernel @@ -239,11 +238,9 @@ class CKernelExecutor(KernelExecutorBase): from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.cl") - c_kernels = _Kernels() + c_kernels = [] for dp in codegen_result.device_programs: - setattr(c_kernels, dp.name, CompiledCKernel(dp, - self.kernel.target, - self.compiler)) + c_kernels.append(CompiledCKernel(dp, self.kernel.target, self.compiler)) return _KernelInfo( kernel=kernel, diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 021bc786f..e67c49c59 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -150,15 +150,13 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # {{{ generate invocation def generate_invocation(self, gen, kernel_name, args): - gen("for knl in _lpy_cl_kernels:") - with Indentation(gen): - gen("_lpy_evt = {kernel_name}({args})" - .format( - kernel_name=kernel_name, - args=", ".join( - ["_lpy_cl_kernels", "queue"] - + args - + ["wait_for=wait_for"]))) + gen("_lpy_evt = {kernel_name}({args})" + .format( + kernel_name=kernel_name, + args=", ".join( + ["_lpy_cl_kernels", "queue"] + + args + + ["wait_for=wait_for"]))) # }}} @@ -185,7 +183,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): gen("") if options.return_dict: - gen("return None, {%s}" + gen("return _lpy_evt, {%s}" % ", ".join("\"%s\": %s" % (arg.name, arg.name) for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) @@ -196,13 +194,19 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if issubclass(arg.arg_class, KernelArgument) if arg.base_name in kernel.get_written_variables()] if out_args: - gen("return None, (%s,)" + gen("return _lpy_evt, (%s,)" % ", ".join(arg.name for arg in out_args)) else: - gen("return None, ()") + gen("return _lpy_evt, ()") # }}} + def generate_host_code(self, gen, codegen_result): + gen.add_to_preamble(codegen_result.host_code()) + + def get_arg_pass(self, arg): + return "%s.base_data" % arg.name + # }}} -- GitLab From 47590d9f9a2280e067fa5283594a1b35436a2d35 Mon Sep 17 00:00:00 2001 From: arghdos Date: Mon, 1 May 2017 15:56:37 -0400 Subject: [PATCH 04/65] python2 fixes --- loopy/target/c/c_execution.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index d819f347a..a4562163a 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -60,15 +60,15 @@ class CCompiler(object): self.exe = cc if cc else self.default_exe self.cflags = cflags or self.default_compile_flags[:] self.ldflags = ldflags or self.default_link_flags[:] - self.tempdir = tempfile.TemporaryDirectory() + self.tempdir = tempfile.mkdtemp(prefix="tmp_loopy") def _tempname(self, name): """Build temporary filename path in tempdir.""" - return os.path.join(self.tempdir.name, name) + return os.path.join(self.tempdir, name) def _call(self, args, **kwargs): """Invoke compiler with arguments.""" - cwd = self.tempdir.name + cwd = self.tempdir args_ = [self.exe] + args logger.debug(args_) subprocess.check_call(args_, cwd=cwd, **kwargs) -- GitLab From 82d50df972ba4b03bfaae55117c492df85efdaa1 Mon Sep 17 00:00:00 2001 From: arghdos Date: Mon, 1 May 2017 16:07:39 -0400 Subject: [PATCH 05/65] add tests, fix outstanding cl_kernel_info calls --- loopy/auto_test.py | 8 ++-- loopy/target/c/c_execution.py | 3 +- test/test_c_execution.py | 90 +++++++++++++++++++++++++++++++++++ test/test_loopy.py | 2 +- 4 files changed, 96 insertions(+), 7 deletions(-) create mode 100644 test/test_c_execution.py diff --git a/loopy/auto_test.py b/loopy/auto_test.py index db29d5130..537e65fb3 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -451,12 +451,12 @@ def auto_test_vs_ref( print(get_highlighted_code(ref_compiled.get_code())) print(75*"-") - ref_cl_kernel_info = ref_compiled.cl_kernel_info(frozenset()) + ref_kernel_info = ref_compiled.kernel_info(frozenset()) try: ref_args, ref_arg_data = \ make_ref_args(ref_sched_kernel, - ref_cl_kernel_info.implemented_data_info, + ref_kernel_info.implemented_data_info, ref_queue, parameters) ref_args["out_host"] = False except cl.RuntimeError as e: @@ -541,10 +541,10 @@ def auto_test_vs_ref( compiled = CompiledKernel(ctx, kernel) if args is None: - cl_kernel_info = compiled.cl_kernel_info(frozenset()) + kernel_info = compiled.kernel_info(frozenset()) args = make_args(kernel, - cl_kernel_info.implemented_data_info, + kernel_info.implemented_data_info, queue, ref_arg_data, parameters) args["out_host"] = False diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index a4562163a..e6a1bd0d6 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -27,8 +27,7 @@ import cgen import os import subprocess -from loopy.execution import (KernelExecutorBase, _Kernels, - _KernelInfo, ExecutionWrapperGeneratorBase) +from loopy.execution import (KernelExecutorBase, _KernelInfo) from pytools import memoize_method import weakref diff --git a/test/test_c_execution.py b/test/test_c_execution.py new file mode 100644 index 000000000..914e2a144 --- /dev/null +++ b/test/test_c_execution.py @@ -0,0 +1,90 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + +import sys +import numpy as np +import loopy as lp +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa +import pytest + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + + +def test_c_target(): + from loopy.target.c import CTarget + + knl = lp.make_kernel( + "{ [i]: 0<=i Date: Mon, 1 May 2017 16:25:33 -0400 Subject: [PATCH 06/65] flake updates --- loopy/target/c/__init__.py | 2 +- test/test_c_execution.py | 4 ---- 2 files changed, 1 insertion(+), 5 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index d2e72b84b..6f82eadd7 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -300,7 +300,7 @@ class CTarget(TargetBase): return self.get_dtype_registry().dtype_to_ctype(dtype) def get_kernel_executor_cache_key(self, *args, **kwargs): - return self.compiler + return None # TODO: ??? def get_kernel_executor(self, knl, *args, **kwargs): from loopy.target.c.c_execution import CKernelExecutor diff --git a/test/test_c_execution.py b/test/test_c_execution.py index 914e2a144..6d7e2afef 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -22,12 +22,10 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. """ -import sys import numpy as np import loopy as lp import pyopencl.clmath # noqa import pyopencl.clrandom # noqa -import pytest import logging logger = logging.getLogger(__name__) @@ -71,7 +69,6 @@ def test_c_target_strides(): ], target=CTarget()) - # test with C-order knl = __get_kernel('C') a_np = np.reshape(np.arange(16 * 16, dtype=np.float32), (16, -1), @@ -80,7 +77,6 @@ def test_c_target_strides(): assert np.allclose(knl(a=a_np)[1], 2 * a_np) - # test with F-order knl = __get_kernel('F') a_np = np.reshape(np.arange(16 * 16, dtype=np.float32), (16, -1), -- GitLab From b473d4ba94c09403b157cdc993fb009256f4da74 Mon Sep 17 00:00:00 2001 From: arghdos Date: Tue, 2 May 2017 11:15:35 -0400 Subject: [PATCH 07/65] update imports --- loopy/check.py | 2 +- loopy/compiled.py | 1 - 2 files changed, 1 insertion(+), 2 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index e8082c0ee..41095743d 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -731,7 +731,7 @@ def check_implemented_domains(kernel, implemented_domains, code=None): print(79*"-") print("CODE:") print(79*"-") - from loopy.compiled import get_highlighted_code + from loopy.execution import get_highlighted_code print(get_highlighted_code(code)) print(79*"-") diff --git a/loopy/compiled.py b/loopy/compiled.py index 062ba60cc..613bca56f 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -25,7 +25,6 @@ THE SOFTWARE. from loopy.target.pyopencl_execution import ( # noqa PyOpenCLKernelExecutor) -from loopy.execution import get_highlighted_code # {{{ compatibility -- GitLab From 7c27894a8d3332876f4f86dc2c564341dd41557d Mon Sep 17 00:00:00 2001 From: arghdos Date: Tue, 2 May 2017 13:24:41 -0400 Subject: [PATCH 08/65] fix wrong import --- loopy/auto_test.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 537e65fb3..9fefa5e3e 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -403,7 +403,8 @@ def auto_test_vs_ref( raise LoopyError("ref_knl and test_knl argument lists disagree at index " "%d (1-based)" % (i+1)) - from loopy.compiled import CompiledKernel, get_highlighted_code + from loopy.compiled import CompiledKernel + from loopy.execution import get_highlighted_code if isinstance(op_count, (int, float)): warn("op_count should be a list", stacklevel=2) -- GitLab From 1e7ff424255137d0b84ff718a0a98d17880c8633 Mon Sep 17 00:00:00 2001 From: arghdos Date: Tue, 2 May 2017 13:57:34 -0400 Subject: [PATCH 09/65] add missing pyopencl fetches --- loopy/target/pyopencl_execution.py | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index e67c49c59..b6c58ce47 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -180,6 +180,10 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if not issubclass(arg.arg_class, KernelArgument): continue + is_written = arg.base_name in kernel.get_written_variables() + if is_written: + gen("%s = %s.get(queue=queue)" % (arg.name, arg.name)) + gen("") if options.return_dict: -- GitLab From 89291ac6dcd1e704c128b0254ab7ef2adc61a55f Mon Sep 17 00:00:00 2001 From: arghdos Date: Thu, 4 May 2017 14:13:48 -0400 Subject: [PATCH 10/65] fix editing --- loopy/target/c/c_execution.py | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index e6a1bd0d6..2a60ace79 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -109,13 +109,13 @@ class CompiledCKernel(object): to automatically map argument types. """ - def __init__(self, knl, target, comp=None): + def __init__(self, knl, dev_code, target, comp=None): from loopy.target.c import CTarget assert isinstance(target, CTarget) self.target = target self.knl = knl # get code and build - self.code = str(knl.ast) + self.code = dev_code self.comp = comp or CCompiler() self.dll = self.comp.build(self.code) # get the function declaration for interface with ctypes @@ -235,11 +235,12 @@ class CKernelExecutor(KernelExecutorBase): if self.kernel.options.edit_cl: from pytools import invoke_editor - dev_code = invoke_editor(dev_code, "code.cl") + dev_code = invoke_editor(dev_code, "code.c") c_kernels = [] for dp in codegen_result.device_programs: - c_kernels.append(CompiledCKernel(dp, self.kernel.target, self.compiler)) + c_kernels.append(CompiledCKernel(dp, dev_code, self.kernel.target, + self.compiler)) return _KernelInfo( kernel=kernel, -- GitLab From 9501672f47b284c6cfe7dec9847b32ac7cf8ba09 Mon Sep 17 00:00:00 2001 From: arghdos Date: Fri, 5 May 2017 10:59:00 -0400 Subject: [PATCH 11/65] add kwargs to catch any differing args between executors (e.g. out_host) --- loopy/execution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/execution.py b/loopy/execution.py index ee2666745..ea6236da8 100644 --- a/loopy/execution.py +++ b/loopy/execution.py @@ -674,7 +674,7 @@ class ExecutionWrapperGeneratorBase(object): "%s=None" % idi.name for idi in implemented_data_info if issubclass(idi.arg_class, KernelArgument) - ]) + ] + ['**kw_args']) gen.add_to_preamble("from __future__ import division") gen.add_to_preamble("") -- GitLab From 03d8c4c6f446161db56ddd34bbfef0f4f36f9aad Mon Sep 17 00:00:00 2001 From: arghdos Date: Fri, 5 May 2017 15:06:20 -0400 Subject: [PATCH 12/65] move C generator to c_execution --- loopy/execution.py | 96 +++-------------------- loopy/target/c/c_execution.py | 142 +++++++++++++++++++++++++++++++++- 2 files changed, 153 insertions(+), 85 deletions(-) diff --git a/loopy/execution.py b/loopy/execution.py index ee2666745..30cf6bef1 100644 --- a/loopy/execution.py +++ b/loopy/execution.py @@ -118,20 +118,15 @@ class SeparateArrayPackingController(object): class ExecutionWrapperGeneratorBase(object): """ A set of common methods for generating a wrapper - for execution of C-based languages + for execution """ - def __init__(self, system_args=["_lpy_c_kernels"]): + def __init__(self, system_args): self.system_args = system_args[:] def python_dtype_str(self, dtype): - # TODO: figure out why isbuiltin isn't working in test (requiring second - # line) - if dtype.isbuiltin or \ - np.dtype(str(dtype)).isbuiltin: - return "_lpy_np."+dtype.name - raise Exception('dtype: {} not recognized'.format(dtype)) + raise NotImplementedError() # {{{ invoker generation @@ -332,7 +327,7 @@ class ExecutionWrapperGeneratorBase(object): # {{{ handle non numpy arguements def handle_non_numpy_arg(self, gen, arg): - pass + raise NotImplementedError() # }}} @@ -342,57 +337,12 @@ class ExecutionWrapperGeneratorBase(object): """ Handle allocation of non-specified arguements for C-execution """ - from pymbolic import var - - num_axes = len(arg.unvec_shape) - for i in range(num_axes): - gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i]))) - - itemsize = kernel_arg.dtype.numpy_dtype.itemsize - for i in range(num_axes): - gen("_lpy_strides_%d = %s" % (i, strify( - itemsize*arg.unvec_strides[i]))) - - if not skip_arg_checks: - for i in range(num_axes): - gen("assert _lpy_strides_%d > 0, " - "\"'%s' has negative stride in axis %d\"" - % (i, arg.name, i)) - - sym_strides = tuple( - var("_lpy_strides_%d" % i) - for i in range(num_axes)) - - sym_shape = tuple( - var("_lpy_shape_%d" % i) - for i in range(num_axes)) - - gen("%(name)s = _lpy_np.empty(%(shape)s, " - "%(dtype)s)" - % dict( - name=arg.name, - shape=strify(sym_shape), - dtype=self.python_dtype_str( - kernel_arg.dtype.numpy_dtype))) - - #check strides - gen("%(name)s = _lpy_strided(%(name)s, %(shape)s, " - "%(strides)s)" - % dict( - name=arg.name, - shape=strify(sym_shape), - strides=strify(sym_strides))) - - if not skip_arg_checks: - for i in range(num_axes): - gen("del _lpy_shape_%d" % i) - gen("del _lpy_strides_%d" % i) - gen("") + raise NotImplementedError() # }}} def get_arg_pass(self, arg): - return arg.name + raise NotImplementedError() # {{{ arg setup @@ -601,25 +551,20 @@ class ExecutionWrapperGeneratorBase(object): def target_specific_preamble(self, gen): """ - Add default C-imports to preamble + Add target specific imports to preamble """ - gen.add_to_preamble("import numpy as _lpy_np") - gen.add_to_preamble("from loopy.target.c.compyte.array" - " import as_strided as _lpy_strided") + raise NotImplementedError() def initialize_system_args(self, gen): """ Override to intialize any default system args """ - pass + raise NotImplementedError() # {{{ generate invocation def generate_invocation(self, gen, kernel_name, args): - gen("for knl in _lpy_c_kernels:") - with Indentation(gen): - gen('knl({args})'.format( - args=", ".join(args))) + raise NotImplementedError() # }}} @@ -628,29 +573,12 @@ class ExecutionWrapperGeneratorBase(object): def generate_output_handler( self, gen, options, kernel, implemented_data_info): - from loopy.kernel.data import KernelArgument - - if options.return_dict: - gen("return None, {%s}" - % ", ".join("\"%s\": %s" % (arg.name, arg.name) - for arg in implemented_data_info - if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) - else: - out_args = [arg - for arg in implemented_data_info - if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables()] - if out_args: - gen("return None, (%s,)" - % ", ".join(arg.name for arg in out_args)) - else: - gen("return None, ()") + raise NotImplementedError() # }}} def generate_host_code(self, gen, codegen_result): - pass + raise NotImplementedError() def __call__(self, kernel, codegen_result): """ diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 2a60ace79..d129a37e0 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -27,15 +27,155 @@ import cgen import os import subprocess -from loopy.execution import (KernelExecutorBase, _KernelInfo) +from loopy.execution import (KernelExecutorBase, _KernelInfo, + ExecutionWrapperGeneratorBase) from pytools import memoize_method +from pytools.py_codegen import ( + Indentation, PythonFunctionGenerator) + import weakref import ctypes +import numpy as np import logging logger = logging.getLogger(__name__) + +class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): + """ + Specialized form of the :class:`ExecutionWrapperGeneratorBase` for + pyopencl execution + """ + + def __init__(self): + system_args = ["_lpy_c_kernels"] + super(ExecutionWrapperGeneratorBase, self).__init__(system_args) + + def python_dtype_str(self, dtype): + # TODO: figure out why isbuiltin isn't working in test (requiring second + # line) + if dtype.isbuiltin or \ + np.dtype(str(dtype)).isbuiltin: + return "_lpy_np."+dtype.name + raise Exception('dtype: {} not recognized'.format(dtype)) + + # {{{ handle non numpy arguements + + def handle_non_numpy_arg(self, gen, arg): + pass + + # }}} + + # {{{ handle allocation of unspecified arguements + + def handle_alloc(self, gen, arg, kernel_arg, strify, skip_arg_checks): + """ + Handle allocation of non-specified arguements for C-execution + """ + from pymbolic import var + + num_axes = len(arg.unvec_shape) + for i in range(num_axes): + gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i]))) + + itemsize = kernel_arg.dtype.numpy_dtype.itemsize + for i in range(num_axes): + gen("_lpy_strides_%d = %s" % (i, strify( + itemsize*arg.unvec_strides[i]))) + + if not skip_arg_checks: + for i in range(num_axes): + gen("assert _lpy_strides_%d > 0, " + "\"'%s' has negative stride in axis %d\"" + % (i, arg.name, i)) + + sym_strides = tuple( + var("_lpy_strides_%d" % i) + for i in range(num_axes)) + + sym_shape = tuple( + var("_lpy_shape_%d" % i) + for i in range(num_axes)) + + gen("%(name)s = _lpy_np.empty(%(shape)s, " + "%(dtype)s)" + % dict( + name=arg.name, + shape=strify(sym_shape), + dtype=self.python_dtype_str( + kernel_arg.dtype.numpy_dtype))) + + #check strides + gen("%(name)s = _lpy_strided(%(name)s, %(shape)s, " + "%(strides)s)" + % dict( + name=arg.name, + shape=strify(sym_shape), + strides=strify(sym_strides))) + + if not skip_arg_checks: + for i in range(num_axes): + gen("del _lpy_shape_%d" % i) + gen("del _lpy_strides_%d" % i) + gen("") + + # }}} + + def target_specific_preamble(self, gen): + """ + Add default C-imports to preamble + """ + gen.add_to_preamble("import numpy as _lpy_np") + + def initialize_system_args(self, gen): + """ + Initializes possibly empty system arguements + """ + pass + + # {{{ generate invocation + + def generate_invocation(self, gen, kernel_name, args): + gen("for knl in _lpy_c_kernels:") + with Indentation(gen): + gen('knl({args})'.format( + args=", ".join(args))) + # }}} + + # {{{ + + def generate_output_handler( + self, gen, options, kernel, implemented_data_info): + + from loopy.kernel.data import KernelArgument + + if options.return_dict: + gen("return None, {%s}" + % ", ".join("\"%s\": %s" % (arg.name, arg.name) + for arg in implemented_data_info + if issubclass(arg.arg_class, KernelArgument) + if arg.base_name in kernel.get_written_variables())) + else: + out_args = [arg + for arg in implemented_data_info + if issubclass(arg.arg_class, KernelArgument) + if arg.base_name in kernel.get_written_variables()] + if out_args: + gen("return None, (%s,)" + % ", ".join(arg.name for arg in out_args)) + else: + gen("return None, ()") + + # }}} + + def generate_host_code(self, gen, codegen_result): + pass + + def get_arg_pass(self, arg): + return arg.name + + """ The compiler module handles invocation of compilers to generate a shared lib which can be loaded via ctypes. -- GitLab From 853346ee0a1b13a14e8c316879f3cc8b70fb23a0 Mon Sep 17 00:00:00 2001 From: arghdos Date: Fri, 5 May 2017 15:08:34 -0400 Subject: [PATCH 13/65] fix fix super args --- loopy/execution.py | 2 +- loopy/target/c/c_execution.py | 8 ++++---- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/loopy/execution.py b/loopy/execution.py index 30cf6bef1..653568133 100644 --- a/loopy/execution.py +++ b/loopy/execution.py @@ -664,7 +664,7 @@ class KernelExecutorBase(object): .. automethod:: __call__ """ - def __init__(self, kernel, invoker=ExecutionWrapperGeneratorBase()): + def __init__(self, kernel, invoker): """ :arg kernel: a loopy.LoopKernel """ diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index d129a37e0..6e59125a1 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -30,8 +30,7 @@ import subprocess from loopy.execution import (KernelExecutorBase, _KernelInfo, ExecutionWrapperGeneratorBase) from pytools import memoize_method -from pytools.py_codegen import ( - Indentation, PythonFunctionGenerator) +from pytools.py_codegen import (Indentation) import weakref @@ -50,7 +49,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): def __init__(self): system_args = ["_lpy_c_kernels"] - super(ExecutionWrapperGeneratorBase, self).__init__(system_args) + super(CExecutionWrapperGenerator, self).__init__(system_args) def python_dtype_str(self, dtype): # TODO: figure out why isbuiltin isn't working in test (requiring second @@ -351,7 +350,8 @@ class CKernelExecutor(KernelExecutorBase): """ self.compiler = compiler if compiler else CCompiler() - super(CKernelExecutor, self).__init__(kernel) + super(CKernelExecutor, self).__init__(kernel, + CExecutionWrapperGenerator()) @memoize_method def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): -- GitLab From d15404bbcc2db81840240b5279bc67564228c1a8 Mon Sep 17 00:00:00 2001 From: arghdos Date: Fri, 5 May 2017 15:31:43 -0400 Subject: [PATCH 14/65] ditch _as_strided, as it's causing segfaults in my more complicated kernels --- loopy/target/c/c_execution.py | 26 +++++++++++++++++--------- 1 file changed, 17 insertions(+), 9 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 6e59125a1..0d0e3113a 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -97,23 +97,31 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): var("_lpy_shape_%d" % i) for i in range(num_axes)) + # find order of array + order = "'C'" + if num_axes > 1: + ldim = arg.unvec_strides[1] + if ldim == arg.unvec_shape[0]: + order = "'F'" + else: + order = "'C'" + gen("%(name)s = _lpy_np.empty(%(shape)s, " - "%(dtype)s)" + "%(dtype)s, order=%(order)s)" % dict( name=arg.name, shape=strify(sym_shape), dtype=self.python_dtype_str( - kernel_arg.dtype.numpy_dtype))) + kernel_arg.dtype.numpy_dtype), + order=order)) #check strides - gen("%(name)s = _lpy_strided(%(name)s, %(shape)s, " - "%(strides)s)" - % dict( - name=arg.name, - shape=strify(sym_shape), - strides=strify(sym_strides))) - if not skip_arg_checks: + gen("assert '%(strides)s == %(name)s.strides', " + "'Strides of loopy created array %(name)s, " + "do not match expected.'" % + dict(name=arg.name, + strides=strify(sym_strides))) for i in range(num_axes): gen("del _lpy_shape_%d" % i) gen("del _lpy_strides_%d" % i) -- GitLab From 65ce2df2c3cba287e9a79300bc80bb7b8ed84176 Mon Sep 17 00:00:00 2001 From: arghdos Date: Fri, 5 May 2017 15:32:06 -0400 Subject: [PATCH 15/65] add nonsquare test to better test the new stride / loopy arg creator --- test/test_c_execution.py | 46 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 46 insertions(+) diff --git a/test/test_c_execution.py b/test/test_c_execution.py index 6d7e2afef..8c990b89c 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -84,3 +84,49 @@ def test_c_target_strides(): assert np.allclose(knl(a=a_np)[1], 2 * a_np) + + +def test_c_target_strides_nonsquare(): + from loopy.target.c import CTarget + + def __get_kernel(order='C'): + indicies = ['i', 'j', 'k'] + sizes = tuple(np.random.randint(1, 11, size=len(indicies))) + # create domain strings + domain_template = '{{ [{iname}]: 0 <= {iname} < {size} }}' + domains = [] + for idx, size in zip(indicies, sizes): + domains.append(domain_template.format( + iname=idx, + size=size)) + statement = 'out[{indexed}] = 2 * a[{indexed}]'.format( + indexed=', '.join(indicies)) + return lp.make_kernel( + domains, + statement, + [ + lp.GlobalArg("out", np.float32, shape=sizes, order=order), + lp.GlobalArg("a", np.float32, shape=sizes, order=order), + "..." + ], + target=CTarget()) + + # test with C-order + knl = __get_kernel('C') + a_lp = next(x for x in knl.args if x.name == 'a') + a_np = np.reshape(np.arange(np.product(a_lp.shape), dtype=np.float32), + a_lp.shape, + order='C') + + assert np.allclose(knl(a=a_np)[1], + 2 * a_np) + + # test with F-order + knl = __get_kernel('F') + a_lp = next(x for x in knl.args if x.name == 'a') + a_np = np.reshape(np.arange(np.product(a_lp.shape), dtype=np.float32), + a_lp.shape, + order='F') + + assert np.allclose(knl(a=a_np)[1], + 2 * a_np) -- GitLab From d9d0d2d9a0b8fc751213ee746312d5e5bba52ddb Mon Sep 17 00:00:00 2001 From: arghdos Date: Fri, 5 May 2017 10:59:00 -0400 Subject: [PATCH 16/65] add kwargs to catch any differing args between executors (e.g. out_host) --- loopy/execution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/execution.py b/loopy/execution.py index 653568133..382255ace 100644 --- a/loopy/execution.py +++ b/loopy/execution.py @@ -602,7 +602,7 @@ class ExecutionWrapperGeneratorBase(object): "%s=None" % idi.name for idi in implemented_data_info if issubclass(idi.arg_class, KernelArgument) - ]) + ] + ['**kw_args']) gen.add_to_preamble("from __future__ import division") gen.add_to_preamble("") -- GitLab From 879b419481770ca60c413bcf4280e71e1474a68c Mon Sep 17 00:00:00 2001 From: arghdos Date: Fri, 5 May 2017 16:01:47 -0400 Subject: [PATCH 17/65] correct assertion --- loopy/target/c/c_execution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 0d0e3113a..501db3dfd 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -117,7 +117,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): #check strides if not skip_arg_checks: - gen("assert '%(strides)s == %(name)s.strides', " + gen("assert %(strides)s == %(name)s.strides, " "'Strides of loopy created array %(name)s, " "do not match expected.'" % dict(name=arg.name, -- GitLab From 9dbdbaeb72a520a5608317d67f978184c9cbdc42 Mon Sep 17 00:00:00 2001 From: arghdos Date: Wed, 12 Jul 2017 20:07:36 -0400 Subject: [PATCH 18/65] resolve isbuiltin change after hash --- loopy/target/c/c_execution.py | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 501db3dfd..fd06eb0e8 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -52,10 +52,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): super(CExecutionWrapperGenerator, self).__init__(system_args) def python_dtype_str(self, dtype): - # TODO: figure out why isbuiltin isn't working in test (requiring second - # line) - if dtype.isbuiltin or \ - np.dtype(str(dtype)).isbuiltin: + if np.dtype(str(dtype)).isbuiltin: return "_lpy_np."+dtype.name raise Exception('dtype: {} not recognized'.format(dtype)) -- GitLab From 00434a1c636032273c0a562ba2a3ab603b0ac2cf Mon Sep 17 00:00:00 2001 From: arghdos Date: Wed, 12 Jul 2017 20:18:55 -0400 Subject: [PATCH 19/65] move execution.py to target folder --- loopy/auto_test.py | 2 +- loopy/check.py | 2 +- loopy/target/c/c_execution.py | 2 +- loopy/{ => target}/execution.py | 0 loopy/target/pyopencl_execution.py | 4 ++-- test/test_c_execution.py | 2 -- 6 files changed, 5 insertions(+), 7 deletions(-) rename loopy/{ => target}/execution.py (100%) diff --git a/loopy/auto_test.py b/loopy/auto_test.py index 9fefa5e3e..0860caa5b 100644 --- a/loopy/auto_test.py +++ b/loopy/auto_test.py @@ -404,7 +404,7 @@ def auto_test_vs_ref( "%d (1-based)" % (i+1)) from loopy.compiled import CompiledKernel - from loopy.execution import get_highlighted_code + from loopy.target.execution import get_highlighted_code if isinstance(op_count, (int, float)): warn("op_count should be a list", stacklevel=2) diff --git a/loopy/check.py b/loopy/check.py index 41095743d..d1ba1ab1a 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -731,7 +731,7 @@ def check_implemented_domains(kernel, implemented_domains, code=None): print(79*"-") print("CODE:") print(79*"-") - from loopy.execution import get_highlighted_code + from loopy.target.execution import get_highlighted_code print(get_highlighted_code(code)) print(79*"-") diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index fd06eb0e8..3e479dc2d 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -27,7 +27,7 @@ import cgen import os import subprocess -from loopy.execution import (KernelExecutorBase, _KernelInfo, +from loopy.target.execution import (KernelExecutorBase, _KernelInfo, ExecutionWrapperGeneratorBase) from pytools import memoize_method from pytools.py_codegen import (Indentation) diff --git a/loopy/execution.py b/loopy/target/execution.py similarity index 100% rename from loopy/execution.py rename to loopy/target/execution.py diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index b6c58ce47..df2067bfa 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -26,7 +26,7 @@ from six.moves import range, zip from pytools import memoize_method from pytools.py_codegen import Indentation -from loopy.execution import (KernelExecutorBase, ExecutionWrapperGeneratorBase, +from loopy.target.execution import (KernelExecutorBase, ExecutionWrapperGeneratorBase, _KernelInfo, _Kernels) import logging logger = logging.getLogger(__name__) @@ -248,7 +248,7 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype_set) from loopy.codegen import generate_code_v2 - from loopy.execution import get_highlighted_code + from loopy.target.execution import get_highlighted_code codegen_result = generate_code_v2(kernel) dev_code = codegen_result.device_code() diff --git a/test/test_c_execution.py b/test/test_c_execution.py index 8c990b89c..5467e992e 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -24,8 +24,6 @@ THE SOFTWARE. import numpy as np import loopy as lp -import pyopencl.clmath # noqa -import pyopencl.clrandom # noqa import logging logger = logging.getLogger(__name__) -- GitLab From 88695b1a9294604fb3aa7fccf3cc9e6690065ec9 Mon Sep 17 00:00:00 2001 From: arghdos Date: Wed, 12 Jul 2017 20:38:08 -0400 Subject: [PATCH 20/65] update copyright --- loopy/target/c/c_execution.py | 2 +- loopy/target/execution.py | 2 +- test/test_c_execution.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 3e479dc2d..019b2a829 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -1,6 +1,6 @@ from __future__ import division, with_statement, absolute_import -__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" +__copyright__ = "Copyright (C) 2017 Nick Curtis" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 382255ace..ffa0aa2ab 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -1,6 +1,6 @@ from __future__ import division, with_statement, absolute_import -__copyright__ = "Copyright (C) 2012-16 Andreas Kloeckner" +__copyright__ = "Copyright (C) 2012-17 Andreas Kloeckner, Nick Curtis" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy diff --git a/test/test_c_execution.py b/test/test_c_execution.py index 5467e992e..6d3c42da7 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -1,6 +1,6 @@ from __future__ import division, absolute_import, print_function -__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" +__copyright__ = "Copyright (C) 2017 Nick Curtis" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy -- GitLab From 41e002fec91587933f703d75647178909df6e4a9 Mon Sep 17 00:00:00 2001 From: arghdos Date: Wed, 12 Jul 2017 20:42:44 -0400 Subject: [PATCH 21/65] unify highlighters --- loopy/target/execution.py | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index ffa0aa2ab..d89eb8ba8 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -774,28 +774,22 @@ class KernelExecutorBase(object): # {{{ code highlighers -def get_highlighted_python_code(text): +def get_highlighted_code(text, python=False): try: from pygments import highlight except ImportError: return text else: - from pygments.lexers import PythonLexer + from pygments.lexers import CLexer, PythonLexer from pygments.formatters import TerminalFormatter - return highlight(text, PythonLexer(), TerminalFormatter()) + return highlight(text, CLexer() if not python else PythonLexer(), + TerminalFormatter()) -def get_highlighted_code(text): - try: - from pygments import highlight - except ImportError: - return text - else: - from pygments.lexers import CLexer - from pygments.formatters import TerminalFormatter +def get_highlighted_python_code(text): + return get_highlighted_code(text, True) - return highlight(text, CLexer(), TerminalFormatter()) # }}} -- GitLab From 7140faafbdeeb9f7a8d33c797d49c102d0d584e3 Mon Sep 17 00:00:00 2001 From: arghdos Date: Wed, 12 Jul 2017 20:53:33 -0400 Subject: [PATCH 22/65] add ILP / UNR test --- test/test_c_execution.py | 44 ++++++++++++++++++++++++++++++++++++++++ 1 file changed, 44 insertions(+) diff --git a/test/test_c_execution.py b/test/test_c_execution.py index 6d3c42da7..576a61f6a 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -128,3 +128,47 @@ def test_c_target_strides_nonsquare(): assert np.allclose(knl(a=a_np)[1], 2 * a_np) + + +def test_c_optimizations(): + from loopy.target.c import CTarget + + def __get_kernel(order='C'): + indicies = ['i', 'j', 'k'] + sizes = tuple(np.random.randint(1, 11, size=len(indicies))) + # create domain strings + domain_template = '{{ [{iname}]: 0 <= {iname} < {size} }}' + domains = [] + for idx, size in zip(indicies, sizes): + domains.append(domain_template.format( + iname=idx, + size=size)) + statement = 'out[{indexed}] = 2 * a[{indexed}]'.format( + indexed=', '.join(indicies)) + return lp.make_kernel( + domains, + statement, + [ + lp.GlobalArg("out", np.float32, shape=sizes, order=order), + lp.GlobalArg("a", np.float32, shape=sizes, order=order), + "..." + ], + target=CTarget()), sizes + + # test with ILP + knl, sizes = __get_kernel('C') + lp.split_iname(knl, 'i', 4, inner_tag='ilp') + a_np = np.reshape(np.arange(np.product(sizes), dtype=np.float32), + sizes, + order='C') + + assert np.allclose(knl(a=a_np)[1], 2 * a_np) + + # test with unrolling + knl, sizes = __get_kernel('C') + lp.split_iname(knl, 'i', 4, inner_tag='unr') + a_np = np.reshape(np.arange(np.product(sizes), dtype=np.float32), + sizes, + order='C') + + assert np.allclose(knl(a=a_np)[1], 2 * a_np) -- GitLab From c5799ed5b8f980522823eca71a4b3072bd9f4795 Mon Sep 17 00:00:00 2001 From: arghdos Date: Wed, 12 Jul 2017 20:55:30 -0400 Subject: [PATCH 23/65] add ILP / UNR test --- test/test_c_execution.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_c_execution.py b/test/test_c_execution.py index 576a61f6a..bf8fce262 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -157,7 +157,7 @@ def test_c_optimizations(): # test with ILP knl, sizes = __get_kernel('C') - lp.split_iname(knl, 'i', 4, inner_tag='ilp') + knl = lp.split_iname(knl, 'i', 4, inner_tag='ilp') a_np = np.reshape(np.arange(np.product(sizes), dtype=np.float32), sizes, order='C') @@ -166,7 +166,7 @@ def test_c_optimizations(): # test with unrolling knl, sizes = __get_kernel('C') - lp.split_iname(knl, 'i', 4, inner_tag='unr') + knl = lp.split_iname(knl, 'i', 4, inner_tag='unr') a_np = np.reshape(np.arange(np.product(sizes), dtype=np.float32), sizes, order='C') -- GitLab From 1c95677d470caa89da4f705e447b840a87bfead5 Mon Sep 17 00:00:00 2001 From: arghdos Date: Thu, 3 Aug 2017 23:00:47 -0400 Subject: [PATCH 24/65] remove old execution.py --- loopy/execution.py | 239 --------------------------------------------- 1 file changed, 239 deletions(-) delete mode 100644 loopy/execution.py diff --git a/loopy/execution.py b/loopy/execution.py deleted file mode 100644 index 07e28f06d..000000000 --- a/loopy/execution.py +++ /dev/null @@ -1,239 +0,0 @@ -from __future__ import division, with_statement, absolute_import - -__copyright__ = "Copyright (C) 2012-16 Andreas Kloeckner" - -__license__ = """ -Permission is hereby granted, free of charge, to any person obtaining a copy -of this software and associated documentation files (the "Software"), to deal -in the Software without restriction, including without limitation the rights -to use, copy, modify, merge, publish, distribute, sublicense, and/or sell -copies of the Software, and to permit persons to whom the Software is -furnished to do so, subject to the following conditions: - -The above copyright notice and this permission notice shall be included in -all copies or substantial portions of the Software. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR -IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, -FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE -AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER -LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, -OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN -THE SOFTWARE. -""" - - -import six -import numpy as np -from pytools import ImmutableRecord, memoize_method -from loopy.diagnostic import LoopyError - -import logging -logger = logging.getLogger(__name__) - -from pytools.persistent_dict import PersistentDict -from loopy.tools import LoopyKeyBuilder -from loopy.version import DATA_MODEL_VERSION - - -# {{{ object array argument packing - -class _PackingInfo(ImmutableRecord): - """ - .. attribute:: name - .. attribute:: sep_shape - - .. attribute:: subscripts_and_names - - A list of type ``[(index, unpacked_name), ...]``. - """ - - -class SeparateArrayPackingController(object): - """For argument arrays with axes tagged to be implemented as separate - arrays, this class provides preprocessing of the incoming arguments so that - all sub-arrays may be passed in one object array (under the original, - un-split argument name) and are unpacked into separate arrays before being - passed to the kernel. - - It also repacks outgoing arrays of this type back into an object array. - """ - - def __init__(self, kernel): - # map from arg name - self.packing_info = {} - - from loopy.kernel.array import ArrayBase - for arg in kernel.args: - if not isinstance(arg, ArrayBase): - continue - - if arg.shape is None or arg.dim_tags is None: - continue - - subscripts_and_names = arg.subscripts_and_names() - - if subscripts_and_names is None: - continue - - self.packing_info[arg.name] = _PackingInfo( - name=arg.name, - sep_shape=arg.sep_shape(), - subscripts_and_names=subscripts_and_names, - is_written=arg.name in kernel.get_written_variables()) - - def unpack(self, kernel_kwargs): - if not self.packing_info: - return kernel_kwargs - - kernel_kwargs = kernel_kwargs.copy() - - for packing_info in six.itervalues(self.packing_info): - arg_name = packing_info.name - if packing_info.name in kernel_kwargs: - arg = kernel_kwargs[arg_name] - for index, unpacked_name in packing_info.subscripts_and_names: - assert unpacked_name not in kernel_kwargs - kernel_kwargs[unpacked_name] = arg[index] - del kernel_kwargs[arg_name] - - return kernel_kwargs - - def pack(self, outputs): - if not self.packing_info: - return outputs - - for packing_info in six.itervalues(self.packing_info): - if not packing_info.is_written: - continue - - result = outputs[packing_info.name] = \ - np.zeros(packing_info.sep_shape, dtype=np.object) - - for index, unpacked_name in packing_info.subscripts_and_names: - result[index] = outputs.pop(unpacked_name) - - return outputs - -# }}} - - -# {{{ KernelExecutorBase - -typed_and_scheduled_cache = PersistentDict( - "loopy-typed-and-scheduled-cache-v1-"+DATA_MODEL_VERSION, - key_builder=LoopyKeyBuilder()) - - -class KernelExecutorBase(object): - """An object connecting a kernel to a :class:`pyopencl.Context` - for execution. - - .. automethod:: __init__ - .. automethod:: __call__ - """ - - def __init__(self, kernel): - """ - :arg kernel: a loopy.LoopKernel - """ - - self.kernel = kernel - - self.packing_controller = SeparateArrayPackingController(kernel) - - self.output_names = tuple(arg.name for arg in self.kernel.args - if arg.name in self.kernel.get_written_variables()) - - self.has_runtime_typed_args = any( - arg.dtype is None - for arg in kernel.args) - - def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set): - from loopy.kernel.tools import add_dtypes - - kernel = self.kernel - - if arg_to_dtype_set: - var_to_dtype = {} - for var, dtype in arg_to_dtype_set: - try: - dest_name = kernel.impl_arg_to_arg[var].name - except KeyError: - dest_name = var - - try: - var_to_dtype[dest_name] = dtype - except KeyError: - raise LoopyError("cannot set type for '%s': " - "no known variable/argument with that name" - % var) - - kernel = add_dtypes(kernel, var_to_dtype) - - from loopy.type_inference import infer_unknown_types - kernel = infer_unknown_types(kernel, expect_completion=True) - - if kernel.schedule is None: - from loopy.preprocess import preprocess_kernel - kernel = preprocess_kernel(kernel) - - from loopy.schedule import get_one_scheduled_kernel - kernel = get_one_scheduled_kernel(kernel) - - return kernel - - @memoize_method - def get_typed_and_scheduled_kernel(self, arg_to_dtype_set): - from loopy import CACHING_ENABLED - - from loopy.preprocess import prepare_for_caching - # prepare_for_caching() gets run by preprocess, but the kernel at this - # stage is not guaranteed to be preprocessed. - cacheable_kernel = prepare_for_caching(self.kernel) - cache_key = (type(self).__name__, cacheable_kernel, arg_to_dtype_set) - - if CACHING_ENABLED: - try: - return typed_and_scheduled_cache[cache_key] - except KeyError: - pass - - logger.debug("%s: typed-and-scheduled cache miss" % self.kernel.name) - - kernel = self.get_typed_and_scheduled_kernel_uncached(arg_to_dtype_set) - - if CACHING_ENABLED: - typed_and_scheduled_cache[cache_key] = kernel - - return kernel - - def arg_to_dtype_set(self, kwargs): - if not self.has_runtime_typed_args: - return None - - from loopy.types import NumpyType - target = self.kernel.target - - impl_arg_to_arg = self.kernel.impl_arg_to_arg - arg_to_dtype = {} - for arg_name, val in six.iteritems(kwargs): - arg = impl_arg_to_arg.get(arg_name, None) - - if arg is None: - # offsets, strides and such - continue - - if arg.dtype is None and val is not None: - try: - dtype = val.dtype - except AttributeError: - pass - else: - arg_to_dtype[arg_name] = NumpyType(dtype, target) - - return frozenset(six.iteritems(arg_to_dtype)) - -# }}} - -# vim: foldmethod=marker -- GitLab From 317d031e5c26143e8e235c3b3b7e173ddbc2302c Mon Sep 17 00:00:00 2001 From: arghdos Date: Thu, 3 Aug 2017 23:01:03 -0400 Subject: [PATCH 25/65] use codepy for build system to enable caching --- loopy/target/c/c_execution.py | 72 +++++++++++++++++++---------------- 1 file changed, 39 insertions(+), 33 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 019b2a829..4630bd748 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -25,16 +25,17 @@ THE SOFTWARE. import tempfile import cgen import os -import subprocess from loopy.target.execution import (KernelExecutorBase, _KernelInfo, ExecutionWrapperGeneratorBase) from pytools import memoize_method from pytools.py_codegen import (Indentation) - +from codepy.toolchain import guess_toolchain +from codepy.jit import compile_from_string +import six import weakref - import ctypes + import numpy as np import logging @@ -197,44 +198,47 @@ class CCompiler(object): default_compile_flags = '-std=c99 -g -O3 -fPIC'.split() default_link_flags = '-shared'.split() - def __init__(self, cc=None, - cflags=None, - ldflags=None): - self.exe = cc if cc else self.default_exe - self.cflags = cflags or self.default_compile_flags[:] - self.ldflags = ldflags or self.default_link_flags[:] + def __init__(self, cc=default_exe, cflags=default_compile_flags, + ldflags=None, libraries=None, + include_dirs=[], library_dirs=[], defines=[]): + # try to get a default toolchain + self.toolchain = guess_toolchain() + # copy in all differing values + diff = {'cc': cc, + 'cflags': cflags, + 'ldflags': ldflags, + 'libraries': libraries, + 'include_dirs': include_dirs, + 'library_dirs': library_dirs, + 'defines': defines} + # filter empty and those equal to toolchain defaults + diff = {k: v for k, v in six.iteritems(diff) if v and + getattr(self.toolchain, k) != v} + self.toolchain = self.toolchain.copy(**diff) self.tempdir = tempfile.mkdtemp(prefix="tmp_loopy") def _tempname(self, name): """Build temporary filename path in tempdir.""" return os.path.join(self.tempdir, name) - def _call(self, args, **kwargs): - """Invoke compiler with arguments.""" - cwd = self.tempdir - args_ = [self.exe] + args - logger.debug(args_) - subprocess.check_call(args_, cwd=cwd, **kwargs) - - def build(self, code): + @memoize_method + def build(self, name, code, debug=False, wait_on_error=None, + debug_recompile=True): """Compile code, build and load shared library.""" logger.debug(code) c_fname = self._tempname('code.' + self.source_suffix) - obj_fname = self._tempname('code.o') - dll_fname = self._tempname('code.so') - with open(c_fname, 'w') as fd: - fd.write(code) - self._call(self.compile_args(c_fname)) - self._call(self.link_args(obj_fname, dll_fname)) - return ctypes.CDLL(dll_fname) - def compile_args(self, c_fname): - "Construct args for compile command." - return self.cflags + ['-c', c_fname] + # build object + checksum, mod_name, ext_file, recompiled = \ + compile_from_string(self.toolchain, name, code, c_fname, + self.tempdir, debug, wait_on_error, + debug_recompile, False) + + if not recompiled: + logger.debug('Kernel {} compiled from source'.format(name)) - def link_args(self, obj_fname, dll_fname): - "Construct args for link command." - return self.ldflags + ['-shared', obj_fname, '-o', dll_fname] + # and return compiled + return checksum, ctypes.CDLL(ext_file) class CppCompiler(CCompiler): @@ -261,7 +265,9 @@ class CompiledCKernel(object): # get code and build self.code = dev_code self.comp = comp or CCompiler() - self.dll = self.comp.build(self.code) + self.checksum, self.dll = self.comp.build( + self.knl.name, self.code) + # get the function declaration for interface with ctypes from loopy.target.c import CFunctionDeclExtractor self.func_decl = CFunctionDeclExtractor() @@ -384,8 +390,8 @@ class CKernelExecutor(KernelExecutorBase): c_kernels = [] for dp in codegen_result.device_programs: - c_kernels.append(CompiledCKernel(dp, dev_code, self.kernel.target, - self.compiler)) + c_kernels.append(CompiledCKernel(dp, dev_code, + self.kernel.target, self.compiler)) return _KernelInfo( kernel=kernel, -- GitLab From c616d96e504ef197b56ab05a28ee0f324436d9fd Mon Sep 17 00:00:00 2001 From: arghdos Date: Thu, 3 Aug 2017 23:11:18 -0400 Subject: [PATCH 26/65] add depend --- setup.py | 1 + 1 file changed, 1 insertion(+) diff --git a/setup.py b/setup.py index 67d943af3..85ab2ad7d 100644 --- a/setup.py +++ b/setup.py @@ -43,6 +43,7 @@ setup(name="loo.py", "cgen>=2016.1", "islpy>=2016.2", "six>=1.8.0", + "codepy>=2013.1.2", "colorama", "Mako", ], -- GitLab From 6737ed6cad01f0ae8d5a360fa3f620f83ec0e48e Mon Sep 17 00:00:00 2001 From: arghdos Date: Thu, 3 Aug 2017 23:14:40 -0400 Subject: [PATCH 27/65] add to requirements --- requirements.txt | 1 + 1 file changed, 1 insertion(+) diff --git a/requirements.txt b/requirements.txt index c4dbe7a6d..3ff69a123 100644 --- a/requirements.txt +++ b/requirements.txt @@ -4,6 +4,7 @@ git+https://github.com/inducer/cgen.git git+https://github.com/pyopencl/pyopencl.git git+https://github.com/inducer/pymbolic.git git+https://github.com/inducer/genpy.git +git+https://github.com/inducer/codepy.git hg+https://bitbucket.org/inducer/f2py -- GitLab From 7b471258c401640d7ab3499db6b4bbbfa4cb9395 Mon Sep 17 00:00:00 2001 From: arghdos Date: Tue, 8 Aug 2017 10:14:32 -0400 Subject: [PATCH 28/65] fix build --- loopy/target/c/c_execution.py | 162 ++++++++++++++++++++++------------ 1 file changed, 106 insertions(+), 56 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 4630bd748..33f96eb1c 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -27,7 +27,7 @@ import cgen import os from loopy.target.execution import (KernelExecutorBase, _KernelInfo, - ExecutionWrapperGeneratorBase) + ExecutionWrapperGeneratorBase) from pytools import memoize_method from pytools.py_codegen import (Indentation) from codepy.toolchain import guess_toolchain @@ -43,6 +43,7 @@ logger = logging.getLogger(__name__) class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): + """ Specialized form of the :class:`ExecutionWrapperGeneratorBase` for pyopencl execution @@ -84,16 +85,16 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if not skip_arg_checks: for i in range(num_axes): gen("assert _lpy_strides_%d > 0, " - "\"'%s' has negative stride in axis %d\"" - % (i, arg.name, i)) + "\"'%s' has negative stride in axis %d\"" + % (i, arg.name, i)) sym_strides = tuple( - var("_lpy_strides_%d" % i) - for i in range(num_axes)) + var("_lpy_strides_%d" % i) + for i in range(num_axes)) sym_shape = tuple( - var("_lpy_shape_%d" % i) - for i in range(num_axes)) + var("_lpy_shape_%d" % i) + for i in range(num_axes)) # find order of array order = "'C'" @@ -105,21 +106,21 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): order = "'C'" gen("%(name)s = _lpy_np.empty(%(shape)s, " - "%(dtype)s, order=%(order)s)" - % dict( - name=arg.name, - shape=strify(sym_shape), - dtype=self.python_dtype_str( - kernel_arg.dtype.numpy_dtype), - order=order)) - - #check strides + "%(dtype)s, order=%(order)s)" + % dict( + name=arg.name, + shape=strify(sym_shape), + dtype=self.python_dtype_str( + kernel_arg.dtype.numpy_dtype), + order=order)) + + # check strides if not skip_arg_checks: gen("assert %(strides)s == %(name)s.strides, " - "'Strides of loopy created array %(name)s, " - "do not match expected.'" % - dict(name=arg.name, - strides=strify(sym_strides))) + "'Strides of loopy created array %(name)s, " + "do not match expected.'" % + dict(name=arg.name, + strides=strify(sym_strides))) for i in range(num_axes): gen("del _lpy_shape_%d" % i) gen("del _lpy_strides_%d" % i) @@ -157,18 +158,18 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if options.return_dict: gen("return None, {%s}" - % ", ".join("\"%s\": %s" % (arg.name, arg.name) - for arg in implemented_data_info - if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) + % ", ".join("\"%s\": %s" % (arg.name, arg.name) + for arg in implemented_data_info + if issubclass(arg.arg_class, KernelArgument) + if arg.base_name in kernel.get_written_variables())) else: out_args = [arg - for arg in implemented_data_info + for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables()] + if arg.base_name in kernel.get_written_variables()] if out_args: gen("return None, (%s,)" - % ", ".join(arg.name for arg in out_args)) + % ", ".join(arg.name for arg in out_args)) else: gen("return None, ()") @@ -188,21 +189,22 @@ which can be loaded via ctypes. class CCompiler(object): + """ Wraps a C compiler to build and load shared libraries. Defaults to gcc """ - source_suffix = 'c' - default_exe = 'gcc' - default_compile_flags = '-std=c99 -g -O3 -fPIC'.split() - default_link_flags = '-shared'.split() - - def __init__(self, cc=default_exe, cflags=default_compile_flags, - ldflags=None, libraries=None, - include_dirs=[], library_dirs=[], defines=[]): + def __init__(self, toolchain=None, + cc='gcc', cflags='-std=c99 -g -O3 -fPIC'.split(), + ldflags='-shared'.split(), libraries=None, + include_dirs=[], library_dirs=[], defines=[], + source_suffix='c', requires_separate_linkage=False): # try to get a default toolchain - self.toolchain = guess_toolchain() + # or subclass supplied version if available + self.toolchain = guess_toolchain() if toolchain is None else toolchain + self.requires_separate_linkage = requires_separate_linkage + self.source_suffix = source_suffix # copy in all differing values diff = {'cc': cc, 'cflags': cflags, @@ -222,34 +224,82 @@ class CCompiler(object): return os.path.join(self.tempdir, name) @memoize_method - def build(self, name, code, debug=False, wait_on_error=None, - debug_recompile=True): - """Compile code, build and load shared library.""" + def _build_obj(self, name, code, debug=False, wait_on_error=None, + debug_recompile=True): + """Compile code, and build object file""" logger.debug(code) c_fname = self._tempname('code.' + self.source_suffix) # build object - checksum, mod_name, ext_file, recompiled = \ + obj_checksum, _, obj_file, recompiled = \ compile_from_string(self.toolchain, name, code, c_fname, self.tempdir, debug, wait_on_error, - debug_recompile, False) + debug_recompile, True) + if not recompiled: + logger.debug('Kernel {} compiled from source'.format(name)) + + return obj_checksum, obj_file + + @memoize_method + def _build_lib(self, name, obj_file, debug=False, wait_on_error=None, + debug_recompile=True): + """Build and load shared library from object file""" + + # read obj file into get "source" + with open(obj_file, 'rb') as file: + obj = file.read() + + from os.path import basename + obj_name = basename(obj_file) + # build object + so_checksum, _, so_file, recompiled = \ + compile_from_string(self.toolchain, name, obj, obj_name, + self.tempdir, debug, wait_on_error, + debug_recompile, object=False, + source_is_binary=True) if not recompiled: logger.debug('Kernel {} compiled from source'.format(name)) + return so_checksum, ctypes.CDLL(so_file) + + def build(self, name, code, debug=False, wait_on_error=None, + debug_recompile=True): + """Compile code, build and load shared library.""" + + # build object + _, obj_file = self._build_obj(name, code, debug=debug, + wait_on_error=wait_on_error, + debug_recompile=debug_recompile) + + # and create library + _, lib = self._build_lib(name, obj_file, debug=debug, + wait_on_error=wait_on_error, + debug_recompile=debug_recompile) + # and return compiled - return checksum, ctypes.CDLL(ext_file) + return lib class CppCompiler(CCompiler): + """Subclass of Compiler to invoke a C++ compiler. Defaults to g++""" - source_suffix = 'cpp' - default_exe = 'g++' - default_compile_flags = '-g -O3'.split() + + def __init__(self, *args, **kwargs): + defaults = {'cc': 'g++', + 'source_suffix': 'cpp', + 'cflags': '-g -O3'.split()} + + # update to use any user specified info + defaults.update(kwargs) + + # and create + super(CppCompiler, self).__init__(*args, **defaults) class CompiledCKernel(object): + """ A CompiledCKernel wraps a loopy kernel, compiling it and loading the result as a shared library, and provides access to the kernel as a @@ -265,8 +315,7 @@ class CompiledCKernel(object): # get code and build self.code = dev_code self.comp = comp or CCompiler() - self.checksum, self.dll = self.comp.build( - self.knl.name, self.code) + self.dll = self.comp.build(self.knl.name, self.code) # get the function declaration for interface with ctypes from loopy.target.c import CFunctionDeclExtractor @@ -345,6 +394,7 @@ class CompiledCKernel(object): class CKernelExecutor(KernelExecutorBase): + """An object connecting a kernel to a :class:`CompiledKernel` for execution. @@ -352,7 +402,8 @@ class CKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, kernel, compiler=None): + def __init__(self, kernel, invoker=CExecutionWrapperGenerator(), + compiler=None): """ :arg kernel: may be a loopy.LoopKernel, a generator returning kernels (a warning will be issued if more than one is returned). If the @@ -361,8 +412,7 @@ class CKernelExecutor(KernelExecutorBase): """ self.compiler = compiler if compiler else CCompiler() - super(CKernelExecutor, self).__init__(kernel, - CExecutionWrapperGenerator()) + super(CKernelExecutor, self).__init__(kernel, invoker=invoker) @memoize_method def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): @@ -391,13 +441,13 @@ class CKernelExecutor(KernelExecutorBase): c_kernels = [] for dp in codegen_result.device_programs: c_kernels.append(CompiledCKernel(dp, dev_code, - self.kernel.target, self.compiler)) + self.kernel.target, self.compiler)) return _KernelInfo( - kernel=kernel, - c_kernels=c_kernels, - implemented_data_info=codegen_result.implemented_data_info, - invoker=self.invoker(kernel, codegen_result)) + kernel=kernel, + c_kernels=c_kernels, + implemented_data_info=codegen_result.implemented_data_info, + invoker=self.invoker(kernel, codegen_result)) # }}} @@ -417,4 +467,4 @@ class CKernelExecutor(KernelExecutorBase): kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) return kernel_info.invoker( - kernel_info.c_kernels, *args, **kwargs) + kernel_info.c_kernels, *args, **kwargs) -- GitLab From 350cd59f32afe7c31774dab522a0377805e1b48b Mon Sep 17 00:00:00 2001 From: arghdos Date: Tue, 8 Aug 2017 10:14:52 -0400 Subject: [PATCH 29/65] pass compiler by keyword --- loopy/target/c/__init__.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 3b4fed215..0e1f0ff86 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -304,7 +304,7 @@ class CTarget(TargetBase): def get_kernel_executor(self, knl, *args, **kwargs): from loopy.target.c.c_execution import CKernelExecutor - return CKernelExecutor(knl, self.compiler) + return CKernelExecutor(knl, compiler=self.compiler) # }}} -- GitLab From 0d104bde27cbfa48136e90947c02a61c4a6e6a9b Mon Sep 17 00:00:00 2001 From: arghdos Date: Tue, 8 Aug 2017 10:38:05 -0400 Subject: [PATCH 30/65] fix for doc test --- doc/tutorial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 92ec799f7..6880db267 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -228,7 +228,7 @@ inspect that code, too, using :attr:`loopy.Options.write_wrapper`: >>> evt, (out,) = knl(queue, a=x_vec_host) from __future__ import division ... - def invoke_loopy_kernel_loopy_kernel(_lpy_cl_kernels, queue, allocator=None, wait_for=None, out_host=None, a=None, n=None, out=None): + def invoke_loopy_kernel_loopy_kernel(_lpy_cl_kernels, queue, allocator=None, wait_for=None, out_host=None, a=None, n=None, out=None, **kw_args): if allocator is None: allocator = _lpy_cl_tools.DeferredAllocator(queue.context) -- GitLab From 7801b4f4136213db9d47607bcad4f152f0e372b9 Mon Sep 17 00:00:00 2001 From: arghdos Date: Tue, 8 Aug 2017 11:17:11 -0400 Subject: [PATCH 31/65] revert to one stage build at the moment to complete c_execution WIP. will move back to enable ISPC later --- loopy/target/c/c_execution.py | 162 ++++++++++++---------------------- 1 file changed, 56 insertions(+), 106 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 33f96eb1c..4630bd748 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -27,7 +27,7 @@ import cgen import os from loopy.target.execution import (KernelExecutorBase, _KernelInfo, - ExecutionWrapperGeneratorBase) + ExecutionWrapperGeneratorBase) from pytools import memoize_method from pytools.py_codegen import (Indentation) from codepy.toolchain import guess_toolchain @@ -43,7 +43,6 @@ logger = logging.getLogger(__name__) class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): - """ Specialized form of the :class:`ExecutionWrapperGeneratorBase` for pyopencl execution @@ -85,16 +84,16 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if not skip_arg_checks: for i in range(num_axes): gen("assert _lpy_strides_%d > 0, " - "\"'%s' has negative stride in axis %d\"" - % (i, arg.name, i)) + "\"'%s' has negative stride in axis %d\"" + % (i, arg.name, i)) sym_strides = tuple( - var("_lpy_strides_%d" % i) - for i in range(num_axes)) + var("_lpy_strides_%d" % i) + for i in range(num_axes)) sym_shape = tuple( - var("_lpy_shape_%d" % i) - for i in range(num_axes)) + var("_lpy_shape_%d" % i) + for i in range(num_axes)) # find order of array order = "'C'" @@ -106,21 +105,21 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): order = "'C'" gen("%(name)s = _lpy_np.empty(%(shape)s, " - "%(dtype)s, order=%(order)s)" - % dict( - name=arg.name, - shape=strify(sym_shape), - dtype=self.python_dtype_str( - kernel_arg.dtype.numpy_dtype), - order=order)) - - # check strides + "%(dtype)s, order=%(order)s)" + % dict( + name=arg.name, + shape=strify(sym_shape), + dtype=self.python_dtype_str( + kernel_arg.dtype.numpy_dtype), + order=order)) + + #check strides if not skip_arg_checks: gen("assert %(strides)s == %(name)s.strides, " - "'Strides of loopy created array %(name)s, " - "do not match expected.'" % - dict(name=arg.name, - strides=strify(sym_strides))) + "'Strides of loopy created array %(name)s, " + "do not match expected.'" % + dict(name=arg.name, + strides=strify(sym_strides))) for i in range(num_axes): gen("del _lpy_shape_%d" % i) gen("del _lpy_strides_%d" % i) @@ -158,18 +157,18 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if options.return_dict: gen("return None, {%s}" - % ", ".join("\"%s\": %s" % (arg.name, arg.name) - for arg in implemented_data_info - if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) + % ", ".join("\"%s\": %s" % (arg.name, arg.name) + for arg in implemented_data_info + if issubclass(arg.arg_class, KernelArgument) + if arg.base_name in kernel.get_written_variables())) else: out_args = [arg - for arg in implemented_data_info + for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables()] + if arg.base_name in kernel.get_written_variables()] if out_args: gen("return None, (%s,)" - % ", ".join(arg.name for arg in out_args)) + % ", ".join(arg.name for arg in out_args)) else: gen("return None, ()") @@ -189,22 +188,21 @@ which can be loaded via ctypes. class CCompiler(object): - """ Wraps a C compiler to build and load shared libraries. Defaults to gcc """ - def __init__(self, toolchain=None, - cc='gcc', cflags='-std=c99 -g -O3 -fPIC'.split(), - ldflags='-shared'.split(), libraries=None, - include_dirs=[], library_dirs=[], defines=[], - source_suffix='c', requires_separate_linkage=False): + source_suffix = 'c' + default_exe = 'gcc' + default_compile_flags = '-std=c99 -g -O3 -fPIC'.split() + default_link_flags = '-shared'.split() + + def __init__(self, cc=default_exe, cflags=default_compile_flags, + ldflags=None, libraries=None, + include_dirs=[], library_dirs=[], defines=[]): # try to get a default toolchain - # or subclass supplied version if available - self.toolchain = guess_toolchain() if toolchain is None else toolchain - self.requires_separate_linkage = requires_separate_linkage - self.source_suffix = source_suffix + self.toolchain = guess_toolchain() # copy in all differing values diff = {'cc': cc, 'cflags': cflags, @@ -224,82 +222,34 @@ class CCompiler(object): return os.path.join(self.tempdir, name) @memoize_method - def _build_obj(self, name, code, debug=False, wait_on_error=None, - debug_recompile=True): - """Compile code, and build object file""" + def build(self, name, code, debug=False, wait_on_error=None, + debug_recompile=True): + """Compile code, build and load shared library.""" logger.debug(code) c_fname = self._tempname('code.' + self.source_suffix) # build object - obj_checksum, _, obj_file, recompiled = \ + checksum, mod_name, ext_file, recompiled = \ compile_from_string(self.toolchain, name, code, c_fname, self.tempdir, debug, wait_on_error, - debug_recompile, True) - if not recompiled: - logger.debug('Kernel {} compiled from source'.format(name)) - - return obj_checksum, obj_file - - @memoize_method - def _build_lib(self, name, obj_file, debug=False, wait_on_error=None, - debug_recompile=True): - """Build and load shared library from object file""" - - # read obj file into get "source" - with open(obj_file, 'rb') as file: - obj = file.read() - - from os.path import basename - obj_name = basename(obj_file) + debug_recompile, False) - # build object - so_checksum, _, so_file, recompiled = \ - compile_from_string(self.toolchain, name, obj, obj_name, - self.tempdir, debug, wait_on_error, - debug_recompile, object=False, - source_is_binary=True) if not recompiled: logger.debug('Kernel {} compiled from source'.format(name)) - return so_checksum, ctypes.CDLL(so_file) - - def build(self, name, code, debug=False, wait_on_error=None, - debug_recompile=True): - """Compile code, build and load shared library.""" - - # build object - _, obj_file = self._build_obj(name, code, debug=debug, - wait_on_error=wait_on_error, - debug_recompile=debug_recompile) - - # and create library - _, lib = self._build_lib(name, obj_file, debug=debug, - wait_on_error=wait_on_error, - debug_recompile=debug_recompile) - # and return compiled - return lib + return checksum, ctypes.CDLL(ext_file) class CppCompiler(CCompiler): - """Subclass of Compiler to invoke a C++ compiler. Defaults to g++""" - - def __init__(self, *args, **kwargs): - defaults = {'cc': 'g++', - 'source_suffix': 'cpp', - 'cflags': '-g -O3'.split()} - - # update to use any user specified info - defaults.update(kwargs) - - # and create - super(CppCompiler, self).__init__(*args, **defaults) + source_suffix = 'cpp' + default_exe = 'g++' + default_compile_flags = '-g -O3'.split() class CompiledCKernel(object): - """ A CompiledCKernel wraps a loopy kernel, compiling it and loading the result as a shared library, and provides access to the kernel as a @@ -315,7 +265,8 @@ class CompiledCKernel(object): # get code and build self.code = dev_code self.comp = comp or CCompiler() - self.dll = self.comp.build(self.knl.name, self.code) + self.checksum, self.dll = self.comp.build( + self.knl.name, self.code) # get the function declaration for interface with ctypes from loopy.target.c import CFunctionDeclExtractor @@ -394,7 +345,6 @@ class CompiledCKernel(object): class CKernelExecutor(KernelExecutorBase): - """An object connecting a kernel to a :class:`CompiledKernel` for execution. @@ -402,8 +352,7 @@ class CKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, kernel, invoker=CExecutionWrapperGenerator(), - compiler=None): + def __init__(self, kernel, compiler=None): """ :arg kernel: may be a loopy.LoopKernel, a generator returning kernels (a warning will be issued if more than one is returned). If the @@ -412,7 +361,8 @@ class CKernelExecutor(KernelExecutorBase): """ self.compiler = compiler if compiler else CCompiler() - super(CKernelExecutor, self).__init__(kernel, invoker=invoker) + super(CKernelExecutor, self).__init__(kernel, + CExecutionWrapperGenerator()) @memoize_method def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): @@ -441,13 +391,13 @@ class CKernelExecutor(KernelExecutorBase): c_kernels = [] for dp in codegen_result.device_programs: c_kernels.append(CompiledCKernel(dp, dev_code, - self.kernel.target, self.compiler)) + self.kernel.target, self.compiler)) return _KernelInfo( - kernel=kernel, - c_kernels=c_kernels, - implemented_data_info=codegen_result.implemented_data_info, - invoker=self.invoker(kernel, codegen_result)) + kernel=kernel, + c_kernels=c_kernels, + implemented_data_info=codegen_result.implemented_data_info, + invoker=self.invoker(kernel, codegen_result)) # }}} @@ -467,4 +417,4 @@ class CKernelExecutor(KernelExecutorBase): kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) return kernel_info.invoker( - kernel_info.c_kernels, *args, **kwargs) + kernel_info.c_kernels, *args, **kwargs) -- GitLab From f12777439fb31a229db5ba74392284bf09351f3d Mon Sep 17 00:00:00 2001 From: arghdos Date: Tue, 8 Aug 2017 11:29:33 -0400 Subject: [PATCH 32/65] fix for py2.6 --- loopy/target/c/c_execution.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 4630bd748..3b7329d36 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -212,8 +212,8 @@ class CCompiler(object): 'library_dirs': library_dirs, 'defines': defines} # filter empty and those equal to toolchain defaults - diff = {k: v for k, v in six.iteritems(diff) if v and - getattr(self.toolchain, k) != v} + diff = dict((k, v) for k, v in six.iteritems(diff) if v and + getattr(self.toolchain, k) != v) self.toolchain = self.toolchain.copy(**diff) self.tempdir = tempfile.mkdtemp(prefix="tmp_loopy") -- GitLab From 1221b88f5dbcf848034f349c9ff9346b85b9c0a9 Mon Sep 17 00:00:00 2001 From: arghdos Date: Tue, 8 Aug 2017 13:32:38 -0400 Subject: [PATCH 33/65] fix order detection bug when first and last axis had same dim --- loopy/target/c/c_execution.py | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 3b7329d36..110b7dbb6 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -96,13 +96,7 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): for i in range(num_axes)) # find order of array - order = "'C'" - if num_axes > 1: - ldim = arg.unvec_strides[1] - if ldim == arg.unvec_shape[0]: - order = "'F'" - else: - order = "'C'" + order = "'C'" if arg.unvec_strides[-1] == 1 else "'F'" gen("%(name)s = _lpy_np.empty(%(shape)s, " "%(dtype)s, order=%(order)s)" -- GitLab From 68379bc7dbb01e035444064bf849d80833b43ba1 Mon Sep 17 00:00:00 2001 From: arghdos Date: Wed, 9 Aug 2017 14:58:31 -0400 Subject: [PATCH 34/65] fix hostcode to use per-implementation --- loopy/target/execution.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index d89eb8ba8..40105662a 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -593,7 +593,6 @@ class ExecutionWrapperGeneratorBase(object): """ options = kernel.options implemented_data_info = codegen_result.implemented_data_info - host_code = codegen_result.host_code() from loopy.kernel.data import KernelArgument gen = PythonFunctionGenerator( @@ -608,7 +607,7 @@ class ExecutionWrapperGeneratorBase(object): gen.add_to_preamble("") self.target_specific_preamble(gen) gen.add_to_preamble("") - gen.add_to_preamble(host_code) + self.generate_host_code(gen, codegen_result) gen.add_to_preamble("") self.initialize_system_args(gen) -- GitLab From a3ebe4a6cad85ea2a5b0e0288731538e5b696e1c Mon Sep 17 00:00:00 2001 From: arghdos Date: Wed, 9 Aug 2017 15:21:36 -0400 Subject: [PATCH 35/65] fix naming --- loopy/target/execution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 40105662a..d5d8959df 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -601,7 +601,7 @@ class ExecutionWrapperGeneratorBase(object): "%s=None" % idi.name for idi in implemented_data_info if issubclass(idi.arg_class, KernelArgument) - ] + ['**kw_args']) + ] + ['**kwargs']) gen.add_to_preamble("from __future__ import division") gen.add_to_preamble("") -- GitLab From 2a5b02c57553a1f51dce6aa77507b0d1063aaa12 Mon Sep 17 00:00:00 2001 From: arghdos Date: Wed, 9 Aug 2017 15:25:30 -0400 Subject: [PATCH 36/65] fix docstring hanging out all on it's own --- loopy/target/c/c_execution.py | 10 ++-------- 1 file changed, 2 insertions(+), 8 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 110b7dbb6..a3e14f3c3 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -175,16 +175,10 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): return arg.name -""" -The compiler module handles invocation of compilers to generate a shared lib -which can be loaded via ctypes. -""" - - class CCompiler(object): """ - Wraps a C compiler to build and load shared libraries. - Defaults to gcc + The compiler module handles invocation of compilers to generate a shared lib + using codepy, which can subsequently be loaded via ctypes. """ source_suffix = 'c' -- GitLab From 6ebf390d12396278a0093283e7c5bd2bc9fd050d Mon Sep 17 00:00:00 2001 From: arghdos Date: Wed, 9 Aug 2017 15:36:03 -0400 Subject: [PATCH 37/65] fix docstring --- loopy/target/c/c_execution.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index a3e14f3c3..582d8665c 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -230,8 +230,7 @@ class CCompiler(object): class CppCompiler(CCompiler): - """Subclass of Compiler to invoke a C++ compiler. - Defaults to g++""" + """Subclass of CCompiler to invoke a C++ compiler.""" source_suffix = 'cpp' default_exe = 'g++' default_compile_flags = '-g -O3'.split() -- GitLab From 97b4841208a1cee5ba0fb982096c404504fa58b3 Mon Sep 17 00:00:00 2001 From: arghdos Date: Wed, 9 Aug 2017 15:39:32 -0400 Subject: [PATCH 38/65] fix naming --- loopy/target/c/c_execution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 582d8665c..7e1d02a9d 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -229,7 +229,7 @@ class CCompiler(object): return checksum, ctypes.CDLL(ext_file) -class CppCompiler(CCompiler): +class CPlusPlusCompiler(CCompiler): """Subclass of CCompiler to invoke a C++ compiler.""" source_suffix = 'cpp' default_exe = 'g++' -- GitLab From 23b431f5dd890ff0794e6f4ebf64646855f99faf Mon Sep 17 00:00:00 2001 From: arghdos Date: Wed, 9 Aug 2017 15:44:36 -0400 Subject: [PATCH 39/65] better docstring --- loopy/target/c/c_execution.py | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 7e1d02a9d..faef3cb94 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -179,6 +179,20 @@ class CCompiler(object): """ The compiler module handles invocation of compilers to generate a shared lib using codepy, which can subsequently be loaded via ctypes. + + The general strategy here is as follows: + + 1. A :class:`codepy.Toolchain` is guessed from distutils. + The user may override any flags obtained therein by passing in arguements + to cc, cflags, etc. + + 2. The kernel source is built into and object first, then made into a shared + library using :meth:`codepy.jit.compile_from_string`, which additionally + handles caching + + 3. The resulting shared library is turned into a :class:`ctypes.CDLL` + to enable calling by the invoker generated by, e.g., + :class:`CExecutionWrapperGenerator` """ source_suffix = 'c' -- GitLab From 16af629cbc08dc2d8b6aa33102661db9b78c24fb Mon Sep 17 00:00:00 2001 From: arghdos Date: Wed, 9 Aug 2017 16:01:41 -0400 Subject: [PATCH 40/65] fix for updated name --- doc/tutorial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 6880db267..229ed8474 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -228,7 +228,7 @@ inspect that code, too, using :attr:`loopy.Options.write_wrapper`: >>> evt, (out,) = knl(queue, a=x_vec_host) from __future__ import division ... - def invoke_loopy_kernel_loopy_kernel(_lpy_cl_kernels, queue, allocator=None, wait_for=None, out_host=None, a=None, n=None, out=None, **kw_args): + def invoke_loopy_kernel_loopy_kernel(_lpy_cl_kernels, queue, allocator=None, wait_for=None, out_host=None, a=None, n=None, out=None, **kwargs): if allocator is None: allocator = _lpy_cl_tools.DeferredAllocator(queue.context) -- GitLab From 5a2c14468ae201c8b4c40ada8c18f299f3cd6d4c Mon Sep 17 00:00:00 2001 From: arghdos Date: Wed, 9 Aug 2017 16:11:41 -0400 Subject: [PATCH 41/65] make specification of c / c++ defaults more sane --- loopy/target/c/c_execution.py | 26 +++++++++++++++----------- 1 file changed, 15 insertions(+), 11 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index faef3cb94..4f275e0a6 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -195,14 +195,10 @@ class CCompiler(object): :class:`CExecutionWrapperGenerator` """ - source_suffix = 'c' - default_exe = 'gcc' - default_compile_flags = '-std=c99 -g -O3 -fPIC'.split() - default_link_flags = '-shared'.split() - - def __init__(self, cc=default_exe, cflags=default_compile_flags, - ldflags=None, libraries=None, - include_dirs=[], library_dirs=[], defines=[]): + def __init__(self, cc='gcc', cflags='-std=c99 -g -O3'.split(), + ldflags=[], libraries=[], + include_dirs=[], library_dirs=[], defines=[], + source_suffix='c'): # try to get a default toolchain self.toolchain = guess_toolchain() # copy in all differing values @@ -218,6 +214,7 @@ class CCompiler(object): getattr(self.toolchain, k) != v) self.toolchain = self.toolchain.copy(**diff) self.tempdir = tempfile.mkdtemp(prefix="tmp_loopy") + self.source_suffix = source_suffix def _tempname(self, name): """Build temporary filename path in tempdir.""" @@ -245,9 +242,16 @@ class CCompiler(object): class CPlusPlusCompiler(CCompiler): """Subclass of CCompiler to invoke a C++ compiler.""" - source_suffix = 'cpp' - default_exe = 'g++' - default_compile_flags = '-g -O3'.split() + + def __init__(self, cc='g++', cflags='', + ldflags=[], libraries=[], + include_dirs=[], library_dirs=[], defines=[], + source_suffix='cpp'): + + super(CPlusPlusCompiler, self).__init__( + cc=cc, cflags=cflags, ldflags=ldflags, libraries=libraries, + include_dirs=include_dirs, library_dirs=library_dirs, + defines=defines, source_suffix=source_suffix) class CompiledCKernel(object): -- GitLab From a8dc3cd02e066dbed0d8d0d384d3a6139faf0297 Mon Sep 17 00:00:00 2001 From: arghdos Date: Fri, 1 Sep 2017 09:59:03 -0400 Subject: [PATCH 42/65] cherry-pick toolchain passing in from ISPC branch --- loopy/target/c/c_execution.py | 36 ++++++++++++++++++++--------------- 1 file changed, 21 insertions(+), 15 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 4f275e0a6..2177d112d 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -195,24 +195,30 @@ class CCompiler(object): :class:`CExecutionWrapperGenerator` """ - def __init__(self, cc='gcc', cflags='-std=c99 -g -O3'.split(), - ldflags=[], libraries=[], + def __init__(self, toolchain=None, + cc='gcc', cflags='-std=c99 -g -O3 -fPIC'.split(), + ldflags='-shared'.split(), libraries=[], include_dirs=[], library_dirs=[], defines=[], source_suffix='c'): # try to get a default toolchain - self.toolchain = guess_toolchain() - # copy in all differing values - diff = {'cc': cc, - 'cflags': cflags, - 'ldflags': ldflags, - 'libraries': libraries, - 'include_dirs': include_dirs, - 'library_dirs': library_dirs, - 'defines': defines} - # filter empty and those equal to toolchain defaults - diff = dict((k, v) for k, v in six.iteritems(diff) if v and - getattr(self.toolchain, k) != v) - self.toolchain = self.toolchain.copy(**diff) + # or subclass supplied version if available + self.toolchain = guess_toolchain() if toolchain is None else toolchain + self.source_suffix = source_suffix + if toolchain is None: + # copy in all differing values + diff = {'cc': cc, + 'cflags': cflags, + 'ldflags': ldflags, + 'libraries': libraries, + 'include_dirs': include_dirs, + 'library_dirs': library_dirs, + 'defines': defines} + # filter empty and those equal to toolchain defaults + diff = dict((k, v) for k, v in six.iteritems(diff) + if v and + not hasattr(self.toolchain, k) or + getattr(self.toolchain, k) != v) + self.toolchain = self.toolchain.copy(**diff) self.tempdir = tempfile.mkdtemp(prefix="tmp_loopy") self.source_suffix = source_suffix -- GitLab From 5070b2edc621b372a9946d32601e7b414fd2fa37 Mon Sep 17 00:00:00 2001 From: arghdos Date: Fri, 1 Sep 2017 10:00:28 -0400 Subject: [PATCH 43/65] remove debug symbols --- loopy/target/c/c_execution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 2177d112d..ab0beb895 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -196,7 +196,7 @@ class CCompiler(object): """ def __init__(self, toolchain=None, - cc='gcc', cflags='-std=c99 -g -O3 -fPIC'.split(), + cc='gcc', cflags='-std=c99 -O3 -fPIC'.split(), ldflags='-shared'.split(), libraries=[], include_dirs=[], library_dirs=[], defines=[], source_suffix='c'): -- GitLab From 668375b8dffaa82d2c98de2a25f47fa9bfffc229 Mon Sep 17 00:00:00 2001 From: arghdos Date: Fri, 1 Sep 2017 10:02:35 -0400 Subject: [PATCH 44/65] bump codepy version --- setup.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/setup.py b/setup.py index 85ab2ad7d..b8bc17d88 100644 --- a/setup.py +++ b/setup.py @@ -43,7 +43,7 @@ setup(name="loo.py", "cgen>=2016.1", "islpy>=2016.2", "six>=1.8.0", - "codepy>=2013.1.2", + "codepy>=2017.1", "colorama", "Mako", ], -- GitLab From 996b39fe1e60462ac5b29d8b50c588c10868d32d Mon Sep 17 00:00:00 2001 From: arghdos Date: Fri, 1 Sep 2017 10:17:41 -0400 Subject: [PATCH 45/65] remove unused kwargs --- loopy/target/execution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index d5d8959df..61788df2d 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -601,7 +601,7 @@ class ExecutionWrapperGeneratorBase(object): "%s=None" % idi.name for idi in implemented_data_info if issubclass(idi.arg_class, KernelArgument) - ] + ['**kwargs']) + ]) gen.add_to_preamble("from __future__ import division") gen.add_to_preamble("") -- GitLab From 4b92863947b2a92b729c385923a7a0ef6b06b2ae Mon Sep 17 00:00:00 2001 From: arghdos Date: Fri, 1 Sep 2017 10:37:23 -0400 Subject: [PATCH 46/65] fix doctest --- doc/tutorial.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 229ed8474..92ec799f7 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -228,7 +228,7 @@ inspect that code, too, using :attr:`loopy.Options.write_wrapper`: >>> evt, (out,) = knl(queue, a=x_vec_host) from __future__ import division ... - def invoke_loopy_kernel_loopy_kernel(_lpy_cl_kernels, queue, allocator=None, wait_for=None, out_host=None, a=None, n=None, out=None, **kwargs): + def invoke_loopy_kernel_loopy_kernel(_lpy_cl_kernels, queue, allocator=None, wait_for=None, out_host=None, a=None, n=None, out=None): if allocator is None: allocator = _lpy_cl_tools.DeferredAllocator(queue.context) -- GitLab From f850c1b470e9dce301912781c03e08da96678c10 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 27 Nov 2017 17:55:34 -0500 Subject: [PATCH 47/65] this wasn't picked up in the merge for whatever reason --- loopy/target/execution.py | 11 +++++------ 1 file changed, 5 insertions(+), 6 deletions(-) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 2909f16f5..4c9daf343 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -694,15 +694,14 @@ class KernelExecutorBase(object): self.invoker = invoker - @memoize_method - def get_typed_and_scheduled_kernel(self, var_to_dtype_set): - kernel = self.kernel - + def get_typed_and_scheduled_kernel_uncached(self, arg_to_dtype_set): from loopy.kernel.tools import add_dtypes - if var_to_dtype_set: + kernel = self.kernel + + if arg_to_dtype_set: var_to_dtype = {} - for var, dtype in var_to_dtype_set: + for var, dtype in arg_to_dtype_set: try: dest_name = kernel.impl_arg_to_arg[var].name except KeyError: -- GitLab From e01c640eede264bf70dab77e2f4a77a49db7c8d9 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 27 Nov 2017 17:55:50 -0500 Subject: [PATCH 48/65] start implementing host code for C --- loopy/target/c/__init__.py | 30 +++++++++++++++--------------- loopy/target/c/c_execution.py | 3 --- loopy/target/execution.py | 2 +- loopy/target/pyopencl_execution.py | 3 --- 4 files changed, 16 insertions(+), 22 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 423311cdb..5a2293c2e 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -422,25 +422,25 @@ class CASTBuilder(ASTBuilderBase): is_first_dev_prog = False break if is_first_dev_prog: - for tv in sorted( - six.itervalues(kernel.temporary_variables), - key=lambda tv: tv.name): + for tv in sorted( + six.itervalues(kernel.temporary_variables), + key=lambda tv: tv.name): - if tv.scope == temp_var_scope.GLOBAL and tv.initializer is not None: - assert tv.read_only + if tv.scope == temp_var_scope.GLOBAL and tv.initializer is not None: + assert tv.read_only - decl_info, = tv.decl_info(self.target, - index_dtype=kernel.index_dtype) - decl = self.wrap_global_constant( - self.get_temporary_decl( - codegen_state, schedule_index, tv, - decl_info)) + decl_info, = tv.decl_info(self.target, + index_dtype=kernel.index_dtype) + decl = self.wrap_global_constant( + self.get_temporary_decl( + codegen_state, schedule_index, tv, + decl_info)) - if tv.initializer is not None: - decl = Initializer(decl, generate_array_literal( - codegen_state, tv, tv.initializer)) + if tv.initializer is not None: + decl = Initializer(decl, generate_array_literal( + codegen_state, tv, tv.initializer)) - result.append(decl) + result.append(decl) fbody = FunctionBody(function_decl, function_body) if not result: diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index ab0beb895..ad2cf8ab1 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -168,9 +168,6 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # }}} - def generate_host_code(self, gen, codegen_result): - pass - def get_arg_pass(self, arg): return arg.name diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 4c9daf343..8162474d8 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -586,7 +586,7 @@ class ExecutionWrapperGeneratorBase(object): # }}} def generate_host_code(self, gen, codegen_result): - raise NotImplementedError() + gen.add_to_preamble(codegen_result.host_code()) def __call__(self, kernel, codegen_result): """ diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 0da502fba..d44984567 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -205,9 +205,6 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # }}} - def generate_host_code(self, gen, codegen_result): - gen.add_to_preamble(codegen_result.host_code()) - def get_arg_pass(self, arg): return "%s.base_data" % arg.name -- GitLab From 9ba5f9edf9d8370d505be574c90e78b2520b5ec5 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 27 Nov 2017 18:43:47 -0500 Subject: [PATCH 49/65] move C-host code into same file as device code --- loopy/target/c/__init__.py | 11 +++++++++-- loopy/target/c/c_execution.py | 16 +++++++++++----- loopy/target/execution.py | 2 +- loopy/target/pyopencl_execution.py | 3 +++ 4 files changed, 24 insertions(+), 8 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 5a2293c2e..260d41795 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -28,7 +28,7 @@ import six import numpy as np # noqa from loopy.kernel.data import CallMangleInfo -from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder +from loopy.target import TargetBase, ASTBuilderBase from loopy.diagnostic import LoopyError from cgen import Pointer, NestedDeclarator, Block from cgen.mapper import IdentityMapper as CASTIdentityMapperBase @@ -271,7 +271,7 @@ class CTarget(TargetBase): return False def get_host_ast_builder(self): - return DummyHostASTBuilder(self) + return CASTBuilder(self) def get_device_ast_builder(self): return CASTBuilder(self) @@ -483,6 +483,13 @@ class CASTBuilder(ASTBuilderBase): [self.idi_to_cgen_declarator(codegen_state.kernel, idi) for idi in codegen_state.implemented_data_info])) + def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): + from cgen import Block, Statement + implemented_data_info = codegen_state.implemented_data_info + arg_names = [iai.name for iai in implemented_data_info] + + return Block([Statement("%s(%s)" % (name, ", ".join(arg_names)))]) + def get_temporary_decls(self, codegen_state, schedule_index): from loopy.kernel.data import temp_var_scope diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index ad2cf8ab1..e57aa3678 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -168,6 +168,12 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # }}} + def generate_host_code(self, gen, codegen_result): + # "host" code for C is embedded in the same file as the "device" code + # this will enable a logical jumping off point for global barriers for + # OpenMP, etc. + pass + def get_arg_pass(self, arg): return arg.name @@ -380,6 +386,8 @@ class CKernelExecutor(KernelExecutorBase): codegen_result = generate_code_v2(kernel) dev_code = codegen_result.device_code() + host_code = codegen_result.host_code() + all_code = '\n'.join([dev_code, '', host_code]) if self.kernel.options.write_cl: output = dev_code @@ -396,14 +404,12 @@ class CKernelExecutor(KernelExecutorBase): from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.c") - c_kernels = [] - for dp in codegen_result.device_programs: - c_kernels.append(CompiledCKernel(dp, dev_code, - self.kernel.target, self.compiler)) + c_kernel = CompiledCKernel(codegen_result.host_program, all_code, + self.kernel.target, self.compiler) return _KernelInfo( kernel=kernel, - c_kernels=c_kernels, + c_kernels=[c_kernel], implemented_data_info=codegen_result.implemented_data_info, invoker=self.invoker(kernel, codegen_result)) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 8162474d8..00e6f5efd 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -586,7 +586,7 @@ class ExecutionWrapperGeneratorBase(object): # }}} def generate_host_code(self, gen, codegen_result): - gen.add_to_preamble(codegen_result.host_code()) + raise NotImplementedError def __call__(self, kernel, codegen_result): """ diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index d44984567..9a2b38f8c 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -205,6 +205,9 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # }}} + def generate_host_code(self, gen, codegen_result): + gen(codegen_result.host_code()) + def get_arg_pass(self, arg): return "%s.base_data" % arg.name -- GitLab From 6c03f9716171a5ca3ded79ee06b8a12fff4633d9 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 27 Nov 2017 18:51:42 -0500 Subject: [PATCH 50/65] have write_cl print both host and device --- loopy/target/c/c_execution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index e57aa3678..75879b05e 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -390,7 +390,7 @@ class CKernelExecutor(KernelExecutorBase): all_code = '\n'.join([dev_code, '', host_code]) if self.kernel.options.write_cl: - output = dev_code + output = all_code if self.kernel.options.highlight_cl: output = self.get_highlighted_code(output) -- GitLab From eaed7323d0d389887ec2796bb86425daa6a77f34 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 27 Nov 2017 18:58:25 -0500 Subject: [PATCH 51/65] consolidate code highlighting --- loopy/target/c/c_execution.py | 4 ++-- loopy/target/execution.py | 4 ---- 2 files changed, 2 insertions(+), 6 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 75879b05e..0cb6f155c 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -27,7 +27,7 @@ import cgen import os from loopy.target.execution import (KernelExecutorBase, _KernelInfo, - ExecutionWrapperGeneratorBase) + ExecutionWrapperGeneratorBase, get_highlighted_code) from pytools import memoize_method from pytools.py_codegen import (Indentation) from codepy.toolchain import guess_toolchain @@ -392,7 +392,7 @@ class CKernelExecutor(KernelExecutorBase): if self.kernel.options.write_cl: output = all_code if self.kernel.options.highlight_cl: - output = self.get_highlighted_code(output) + output = get_highlighted_code(output) if self.kernel.options.write_cl is True: print(output) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 00e6f5efd..4e86e814b 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -778,10 +778,6 @@ class KernelExecutorBase(object): # {{{ debugging aids - def get_highlighted_code(self, arg_to_dtype=None): - return get_highlighted_code( - self.get_code(arg_to_dtype)) - def get_code(self, arg_to_dtype=None): if arg_to_dtype is not None: arg_to_dtype = frozenset(six.iteritems(arg_to_dtype)) -- GitLab From fb9df514a38be4c71a68036ef45b096163d0b4f1 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 27 Nov 2017 19:10:32 -0500 Subject: [PATCH 52/65] turns out we need this, as other places call this on CompiledKernel --- loopy/target/c/c_execution.py | 2 +- loopy/target/execution.py | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 0cb6f155c..a3679d78c 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -392,7 +392,7 @@ class CKernelExecutor(KernelExecutorBase): if self.kernel.options.write_cl: output = all_code if self.kernel.options.highlight_cl: - output = get_highlighted_code(output) + output = get_highlighted_code(code=output) if self.kernel.options.write_cl is True: print(output) diff --git a/loopy/target/execution.py b/loopy/target/execution.py index 4e86e814b..4d6c0378c 100644 --- a/loopy/target/execution.py +++ b/loopy/target/execution.py @@ -778,6 +778,11 @@ class KernelExecutorBase(object): # {{{ debugging aids + def get_highlighted_code(self, arg_to_dtype=None, code=None): + if code is None: + code = self.get_code(arg_to_dtype) + return get_highlighted_code(code) + def get_code(self, arg_to_dtype=None): if arg_to_dtype is not None: arg_to_dtype = frozenset(six.iteritems(arg_to_dtype)) -- GitLab From 6312a063619af6f4542eda96bf991f17abd300c5 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 27 Nov 2017 19:14:25 -0500 Subject: [PATCH 53/65] flake fix --- loopy/target/c/__init__.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 260d41795..4b041750f 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -567,10 +567,10 @@ class CASTBuilder(ASTBuilderBase): if tv._base_storage_access_may_be_aliasing: ptrtype = _ConstPointer else: - # The 'restrict' part of this is a complete lie--of course - # all these temporaries are aliased. But we're promising to - # not use them to shovel data from one representation to the - # other. That counts, right? + # The 'restrict' part of this is a complete lie--of course + # all these temporaries are aliased. But we're promising to + # not use them to shovel data from one representation to the + # other. That counts, right? ptrtype = _ConstRestrictPointer cast_decl = ptrtype(cast_decl) -- GitLab From eb071a77ff4d68116726530372746f2ae9fe78f1 Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 27 Nov 2017 19:25:49 -0500 Subject: [PATCH 54/65] updated --- test/test_loopy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 7da06c919..084f9f045 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -567,7 +567,7 @@ def test_unknown_arg_shape(ctx_factory): assumptions="m<=%d and m>=1 and n mod %d = 0" % (bsize[0], bsize[0])) knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) - cl_kernel_info = CompiledKernel(ctx, knl).cl_kernel_info(frozenset()) # noqa + kernel_info = CompiledKernel(ctx, knl).kernel_info(frozenset()) # noqa # }}} -- GitLab From 5c0a52cc743c3cdd2ada63907caf6e3ac0ae89ba Mon Sep 17 00:00:00 2001 From: Nick Date: Mon, 27 Nov 2017 19:55:11 -0500 Subject: [PATCH 55/65] host code should be in preamble for doctest --- loopy/target/pyopencl_execution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index 9a2b38f8c..0da502fba 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -206,7 +206,7 @@ class PyOpenCLExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): # }}} def generate_host_code(self, gen, codegen_result): - gen(codegen_result.host_code()) + gen.add_to_preamble(codegen_result.host_code()) def get_arg_pass(self, arg): return "%s.base_data" % arg.name -- GitLab From 1eec586cedf8c73621095c1c54cf28c7d91e6451 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 30 Nov 2017 16:21:18 -0600 Subject: [PATCH 56/65] Fix test_reduction_with_conditional after C target tweaks --- test/test_reduction.py | 23 ++++++++++++----------- 1 file changed, 12 insertions(+), 11 deletions(-) diff --git a/test/test_reduction.py b/test/test_reduction.py index 0c37d2228..909a800b2 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -414,21 +414,22 @@ def test_parallel_multi_output_reduction(ctx_factory): def test_reduction_with_conditional(): - # Test whether realization of a reduction inherits predicates - # of the original instruction. Tested with the CTarget, because - # the PyOpenCL target will hoist the conditional into the host - # code in this minimal example. + # The purpose of the 'l' iname is to force the entire kernel (including the + # predicate) into device code. + knl = lp.make_kernel( - "{ [i] : 0<=i<42 }", + "{ [l,i] : 0<=l,i<42 }", """ - if n > 0 - <>b = sum(i, a[i]) + if l > 0 + b[l] = sum(i, l*a[i]) end """, - [lp.GlobalArg("a", dtype=np.float32, shape=(42,)), - lp.GlobalArg("n", dtype=np.float32, shape=())], - target=lp.CTarget()) - code = lp.generate_body(knl) + [lp.ValueArg("n", dtype=np.int32), "..."]) + + knl = lp.tag_inames(knl, "l:g.0") + knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) + code = lp.generate_code_v2(knl).device_code() + print(code) # Check that the if appears before the loop that realizes the reduction. assert code.index("if") < code.index("for") -- GitLab From 0bd1b36dbf8832c545bcb5edba0dcdb1c17260ee Mon Sep 17 00:00:00 2001 From: Nick Date: Fri, 1 Dec 2017 10:14:49 -0500 Subject: [PATCH 57/65] move compiler to subclass --- loopy/target/c/__init__.py | 23 ++++++++++++++++++++--- 1 file changed, 20 insertions(+), 3 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 4b041750f..84da373e0 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -262,9 +262,8 @@ class CTarget(TargetBase): hash_fields = TargetBase.hash_fields + ("fortran_abi",) comparison_fields = TargetBase.comparison_fields + ("fortran_abi",) - def __init__(self, fortran_abi=False, compiler=None): + def __init__(self, fortran_abi=False): self.fortran_abi = fortran_abi - self.compiler = compiler super(CTarget, self).__init__() def split_kernel_at_global_barriers(self): @@ -304,11 +303,29 @@ class CTarget(TargetBase): def get_kernel_executor_cache_key(self, *args, **kwargs): return None # TODO: ??? + def get_kernel_executor(self, knl, *args, **kwargs): + raise NotImplementedError() + + # }}} + + +# {{{ + +class ExecutableCTarget(CTarget): + """ + An executable CTarget that uses (by default) JIT compilation of C-code + """ + from .c_execution import CCompiler + + def __init__(self, compiler=CCompiler(), fortran_abi=False): + super(ExecutableCTarget, self).__init__(fortran_abi=fortran_abi) + self.compiler = compiler + def get_kernel_executor(self, knl, *args, **kwargs): from loopy.target.c.c_execution import CKernelExecutor return CKernelExecutor(knl, compiler=self.compiler) - # }}} +# }}} class _ConstRestrictPointer(Pointer): -- GitLab From 1d21be164a663eed7a2eafede9074a0a8988eeb7 Mon Sep 17 00:00:00 2001 From: Nick Date: Fri, 1 Dec 2017 10:19:17 -0500 Subject: [PATCH 58/65] fix default c++ flags --- loopy/target/c/c_execution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index a3679d78c..1f437f91d 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -252,7 +252,7 @@ class CCompiler(object): class CPlusPlusCompiler(CCompiler): """Subclass of CCompiler to invoke a C++ compiler.""" - def __init__(self, cc='g++', cflags='', + def __init__(self, cc='g++', cflags='-std=c++98 -O3 -fPIC'.split(), ldflags=[], libraries=[], include_dirs=[], library_dirs=[], defines=[], source_suffix='cpp'): -- GitLab From 12c26f33a0a1f25856cae506158b3222dfeb9fef Mon Sep 17 00:00:00 2001 From: Nick Date: Fri, 1 Dec 2017 10:33:13 -0500 Subject: [PATCH 59/65] update CTarget in test --- test/test_c_execution.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/test_c_execution.py b/test/test_c_execution.py index bf8fce262..4a561d27f 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -37,7 +37,7 @@ else: def test_c_target(): - from loopy.target.c import CTarget + from loopy.target.c import ExecutableCTarget knl = lp.make_kernel( "{ [i]: 0<=i Date: Fri, 1 Dec 2017 11:33:03 -0500 Subject: [PATCH 60/65] don't traverse CAST, instead extract dtypes from IDI in subobject --- loopy/target/c/c_execution.py | 121 ++++++++++++++++------------------ 1 file changed, 58 insertions(+), 63 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 1f437f91d..3ee85f72f 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -33,7 +33,6 @@ from pytools.py_codegen import (Indentation) from codepy.toolchain import guess_toolchain from codepy.jit import compile_from_string import six -import weakref import ctypes import numpy as np @@ -263,6 +262,54 @@ class CPlusPlusCompiler(CCompiler): defines=defines, source_suffix=source_suffix) +class IDIToCDLL(object): + """ + A utility class that extracts arguement and return type info from a + :class:`ImplementedDataInfo` in order to create a :class:`ctype.CDLL` + """ + def __init__(self, target): + self.target = target + self.registry = target.get_dtype_registry().wrapped_registry + + def __call__(self, knl, idi): + # grab return type from AST + from loopy.target.c import CFunctionDeclExtractor + func_decl = CFunctionDeclExtractor() + func_decl(knl.ast) + assert len(func_decl.decls) == 1, ( + "Can't extract multiple function declartions") + restype = func_decl.decls[0].subdecl.typename + if restype == 'void': + restype = None + else: + raise ValueError('Unhandled restype %r' % (restype, )) + + # next loopy through the implemented data info to get the arg data + arg_info = [] + for arg in idi: + # check if pointer + pointer = arg.shape + arg_info.append(self._dtype_to_ctype(arg.dtype, pointer)) + + return restype, arg_info + + def _append_arg(self, name, dtype, pointer=False): + """Append arg info to current argument list.""" + self._arg_info.append(( + name, + self._dtype_to_ctype(dtype, pointer=pointer) + )) + + def _dtype_to_ctype(self, dtype, pointer=False): + """Map NumPy dtype to equivalent ctypes type.""" + typename = self.registry.dtype_to_ctype(dtype) + typename = {'unsigned': 'uint'}.get(typename, typename) + basetype = getattr(ctypes, 'c_' + typename) + if pointer: + return ctypes.POINTER(basetype) + return basetype + + class CompiledCKernel(object): """ A CompiledCKernel wraps a loopy kernel, compiling it and loading the @@ -271,35 +318,23 @@ class CompiledCKernel(object): to automatically map argument types. """ - def __init__(self, knl, dev_code, target, comp=None): + def __init__(self, knl, idi, dev_code, target, comp=CCompiler()): from loopy.target.c import CTarget assert isinstance(target, CTarget) self.target = target - self.knl = knl + self.name = knl.name # get code and build self.code = dev_code - self.comp = comp or CCompiler() + self.comp = comp self.checksum, self.dll = self.comp.build( - self.knl.name, self.code) + self.name, self.code) # get the function declaration for interface with ctypes - from loopy.target.c import CFunctionDeclExtractor - self.func_decl = CFunctionDeclExtractor() - self.func_decl(knl.ast) - self.func_decl = self.func_decl.decls[0] - self._arg_info = [] - # TODO knl.args[:].dtype is sufficient - self._visit_func_decl(self.func_decl) - self.name = self.knl.name - restype = self.func_decl.subdecl.typename - if restype == 'void': - self.restype = None - else: - raise ValueError('Unhandled restype %r' % (restype, )) + func_decl = IDIToCDLL(self.target) + restype, arg_info = func_decl(knl, idi) self._fn = getattr(self.dll, self.name) - self._fn.restype = self.restype - self._fn.argtypes = [ctype for name, ctype in self._arg_info] - self._prepared_call_cache = weakref.WeakKeyDictionary() + self._fn.restype = restype + self._fn.argtypes = [ctype for ctype in arg_info] def __call__(self, *args): """Execute kernel with given args mapped to ctypes equivalents.""" @@ -316,47 +351,6 @@ class CompiledCKernel(object): args_.append(arg_) self._fn(*args_) - def _append_arg(self, name, dtype, pointer=False): - """Append arg info to current argument list.""" - self._arg_info.append(( - name, - self._dtype_to_ctype(dtype, pointer=pointer) - )) - - def _visit_const(self, node): - """Visit const arg of kernel.""" - if isinstance(node.subdecl, cgen.RestrictPointer): - self._visit_pointer(node.subdecl) - else: - pod = node.subdecl # type: cgen.POD - self._append_arg(pod.name, pod.dtype) - - def _visit_pointer(self, node): - """Visit pointer argument of kernel.""" - pod = node.subdecl # type: cgen.POD - self._append_arg(pod.name, pod.dtype, pointer=True) - - def _visit_func_decl(self, func_decl): - """Visit nodes of function declaration of kernel.""" - for i, arg in enumerate(func_decl.arg_decls): - if isinstance(arg, cgen.Const): - self._visit_const(arg) - elif isinstance(arg, cgen.RestrictPointer): - self._visit_pointer(arg) - else: - raise ValueError('unhandled type for arg %r' % (arg, )) - - def _dtype_to_ctype(self, dtype, pointer=False): - """Map NumPy dtype to equivalent ctypes type.""" - target = self.target # type: CTarget - registry = target.get_dtype_registry().wrapped_registry - typename = registry.dtype_to_ctype(dtype) - typename = {'unsigned': 'uint'}.get(typename, typename) - basetype = getattr(ctypes, 'c_' + typename) - if pointer: - return ctypes.POINTER(basetype) - return basetype - class CKernelExecutor(KernelExecutorBase): """An object connecting a kernel to a :class:`CompiledKernel` @@ -404,7 +398,8 @@ class CKernelExecutor(KernelExecutorBase): from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.c") - c_kernel = CompiledCKernel(codegen_result.host_program, all_code, + c_kernel = CompiledCKernel(codegen_result.host_program, + codegen_result.implemented_data_info, all_code, self.kernel.target, self.compiler) return _KernelInfo( -- GitLab From 42ae8e924465bd8cfb78bc0fc7b91759af262d94 Mon Sep 17 00:00:00 2001 From: Nick Date: Fri, 1 Dec 2017 11:33:19 -0500 Subject: [PATCH 61/65] add test to ensure we get non-pointer types right --- test/test_c_execution.py | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/test/test_c_execution.py b/test/test_c_execution.py index 4a561d27f..5cd1e44f6 100644 --- a/test/test_c_execution.py +++ b/test/test_c_execution.py @@ -172,3 +172,20 @@ def test_c_optimizations(): order='C') assert np.allclose(knl(a=a_np)[1], 2 * a_np) + + +def test_function_decl_extractor(): + # ensure that we can tell the difference between pointers, constants, etc. + # in execution + from loopy.target.c import ExecutableCTarget + + knl = lp.make_kernel('{[i]: 0 <= i < 10}', + """ + a[i] = b[i] + v + """, + [lp.GlobalArg('a', shape=(10,), dtype=np.int32), + lp.ConstantArg('b', shape=(10)), + lp.ValueArg('v', dtype=np.int32)], + target=ExecutableCTarget()) + + assert np.allclose(knl(b=np.arange(10), v=-1)[1], np.arange(10) - 1) -- GitLab From 4aaade07b5a6345f1315e33591151547f0f2b03e Mon Sep 17 00:00:00 2001 From: Nick Date: Fri, 1 Dec 2017 11:52:31 -0500 Subject: [PATCH 62/65] flake --- loopy/target/c/c_execution.py | 1 - 1 file changed, 1 deletion(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 3ee85f72f..d46360b08 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -23,7 +23,6 @@ THE SOFTWARE. """ import tempfile -import cgen import os from loopy.target.execution import (KernelExecutorBase, _KernelInfo, -- GitLab From 4ac795f1cff16e81b4e90c85b8410703b5978b6d Mon Sep 17 00:00:00 2001 From: Nick Date: Fri, 1 Dec 2017 11:57:00 -0500 Subject: [PATCH 63/65] no kernel call for base C-target --- loopy/target/c/__init__.py | 6 +----- loopy/target/c/c_execution.py | 10 ++++++---- 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 84da373e0..457d7f544 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -501,11 +501,7 @@ class CASTBuilder(ASTBuilderBase): for idi in codegen_state.implemented_data_info])) def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): - from cgen import Block, Statement - implemented_data_info = codegen_state.implemented_data_info - arg_names = [iai.name for iai in implemented_data_info] - - return Block([Statement("%s(%s)" % (name, ", ".join(arg_names)))]) + return None def get_temporary_decls(self, codegen_state, schedule_index): from loopy.kernel.data import temp_var_scope diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index d46360b08..ccfa616c6 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -397,13 +397,15 @@ class CKernelExecutor(KernelExecutorBase): from pytools import invoke_editor dev_code = invoke_editor(dev_code, "code.c") - c_kernel = CompiledCKernel(codegen_result.host_program, - codegen_result.implemented_data_info, all_code, - self.kernel.target, self.compiler) + c_kernels = [] + for dp in codegen_result.device_programs: + c_kernels.append(CompiledCKernel(dp, + codegen_result.implemented_data_info, all_code, self.kernel.target, + self.compiler)) return _KernelInfo( kernel=kernel, - c_kernels=[c_kernel], + c_kernels=c_kernels, implemented_data_info=codegen_result.implemented_data_info, invoker=self.invoker(kernel, codegen_result)) -- GitLab From 0ad04b1fc7af6f4ae0024443029a28f1d38a785b Mon Sep 17 00:00:00 2001 From: Nick Date: Fri, 1 Dec 2017 14:29:25 -0500 Subject: [PATCH 64/65] typo --- loopy/target/c/c_execution.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index ccfa616c6..f6cf142c6 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -276,7 +276,7 @@ class IDIToCDLL(object): func_decl = CFunctionDeclExtractor() func_decl(knl.ast) assert len(func_decl.decls) == 1, ( - "Can't extract multiple function declartions") + "Can't extract multiple function declarations") restype = func_decl.decls[0].subdecl.typename if restype == 'void': restype = None -- GitLab From dc0f4e51ce58dedfbaea12e35ac3798572950e1d Mon Sep 17 00:00:00 2001 From: Nick Date: Fri, 1 Dec 2017 14:45:37 -0500 Subject: [PATCH 65/65] kernels are void by definition --- loopy/target/c/c_execution.py | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index f6cf142c6..164d59b9e 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -271,26 +271,14 @@ class IDIToCDLL(object): self.registry = target.get_dtype_registry().wrapped_registry def __call__(self, knl, idi): - # grab return type from AST - from loopy.target.c import CFunctionDeclExtractor - func_decl = CFunctionDeclExtractor() - func_decl(knl.ast) - assert len(func_decl.decls) == 1, ( - "Can't extract multiple function declarations") - restype = func_decl.decls[0].subdecl.typename - if restype == 'void': - restype = None - else: - raise ValueError('Unhandled restype %r' % (restype, )) - - # next loopy through the implemented data info to get the arg data + # next loop through the implemented data info to get the arg data arg_info = [] for arg in idi: # check if pointer pointer = arg.shape arg_info.append(self._dtype_to_ctype(arg.dtype, pointer)) - return restype, arg_info + return arg_info def _append_arg(self, name, dtype, pointer=False): """Append arg info to current argument list.""" @@ -330,9 +318,10 @@ class CompiledCKernel(object): # get the function declaration for interface with ctypes func_decl = IDIToCDLL(self.target) - restype, arg_info = func_decl(knl, idi) + arg_info = func_decl(knl, idi) self._fn = getattr(self.dll, self.name) - self._fn.restype = restype + # kernels are void by defn. + self._fn.restype = None self._fn.argtypes = [ctype for ctype in arg_info] def __call__(self, *args): -- GitLab