diff --git a/loopy/target/c/c_execution.py b/loopy/target/c/c_execution.py index 4630bd7480236e7cf25270a285a183e34afd3718..33f96eb1c7fc782f327ff93481526e78b4850e36 100644 --- a/loopy/target/c/c_execution.py +++ b/loopy/target/c/c_execution.py @@ -27,7 +27,7 @@ import cgen import os from loopy.target.execution import (KernelExecutorBase, _KernelInfo, - ExecutionWrapperGeneratorBase) + ExecutionWrapperGeneratorBase) from pytools import memoize_method from pytools.py_codegen import (Indentation) from codepy.toolchain import guess_toolchain @@ -43,6 +43,7 @@ logger = logging.getLogger(__name__) class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): + """ Specialized form of the :class:`ExecutionWrapperGeneratorBase` for pyopencl execution @@ -84,16 +85,16 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if not skip_arg_checks: for i in range(num_axes): gen("assert _lpy_strides_%d > 0, " - "\"'%s' has negative stride in axis %d\"" - % (i, arg.name, i)) + "\"'%s' has negative stride in axis %d\"" + % (i, arg.name, i)) sym_strides = tuple( - var("_lpy_strides_%d" % i) - for i in range(num_axes)) + var("_lpy_strides_%d" % i) + for i in range(num_axes)) sym_shape = tuple( - var("_lpy_shape_%d" % i) - for i in range(num_axes)) + var("_lpy_shape_%d" % i) + for i in range(num_axes)) # find order of array order = "'C'" @@ -105,21 +106,21 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): order = "'C'" gen("%(name)s = _lpy_np.empty(%(shape)s, " - "%(dtype)s, order=%(order)s)" - % dict( - name=arg.name, - shape=strify(sym_shape), - dtype=self.python_dtype_str( - kernel_arg.dtype.numpy_dtype), - order=order)) - - #check strides + "%(dtype)s, order=%(order)s)" + % dict( + name=arg.name, + shape=strify(sym_shape), + dtype=self.python_dtype_str( + kernel_arg.dtype.numpy_dtype), + order=order)) + + # check strides if not skip_arg_checks: gen("assert %(strides)s == %(name)s.strides, " - "'Strides of loopy created array %(name)s, " - "do not match expected.'" % - dict(name=arg.name, - strides=strify(sym_strides))) + "'Strides of loopy created array %(name)s, " + "do not match expected.'" % + dict(name=arg.name, + strides=strify(sym_strides))) for i in range(num_axes): gen("del _lpy_shape_%d" % i) gen("del _lpy_strides_%d" % i) @@ -157,18 +158,18 @@ class CExecutionWrapperGenerator(ExecutionWrapperGeneratorBase): if options.return_dict: gen("return None, {%s}" - % ", ".join("\"%s\": %s" % (arg.name, arg.name) - for arg in implemented_data_info - if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables())) + % ", ".join("\"%s\": %s" % (arg.name, arg.name) + for arg in implemented_data_info + if issubclass(arg.arg_class, KernelArgument) + if arg.base_name in kernel.get_written_variables())) else: out_args = [arg - for arg in implemented_data_info + for arg in implemented_data_info if issubclass(arg.arg_class, KernelArgument) - if arg.base_name in kernel.get_written_variables()] + if arg.base_name in kernel.get_written_variables()] if out_args: gen("return None, (%s,)" - % ", ".join(arg.name for arg in out_args)) + % ", ".join(arg.name for arg in out_args)) else: gen("return None, ()") @@ -188,21 +189,22 @@ which can be loaded via ctypes. class CCompiler(object): + """ Wraps a C compiler to build and load shared libraries. Defaults to gcc """ - source_suffix = 'c' - default_exe = 'gcc' - default_compile_flags = '-std=c99 -g -O3 -fPIC'.split() - default_link_flags = '-shared'.split() - - def __init__(self, cc=default_exe, cflags=default_compile_flags, - ldflags=None, libraries=None, - include_dirs=[], library_dirs=[], defines=[]): + def __init__(self, toolchain=None, + cc='gcc', cflags='-std=c99 -g -O3 -fPIC'.split(), + ldflags='-shared'.split(), libraries=None, + include_dirs=[], library_dirs=[], defines=[], + source_suffix='c', requires_separate_linkage=False): # try to get a default toolchain - self.toolchain = guess_toolchain() + # or subclass supplied version if available + self.toolchain = guess_toolchain() if toolchain is None else toolchain + self.requires_separate_linkage = requires_separate_linkage + self.source_suffix = source_suffix # copy in all differing values diff = {'cc': cc, 'cflags': cflags, @@ -222,34 +224,82 @@ class CCompiler(object): return os.path.join(self.tempdir, name) @memoize_method - def build(self, name, code, debug=False, wait_on_error=None, - debug_recompile=True): - """Compile code, build and load shared library.""" + def _build_obj(self, name, code, debug=False, wait_on_error=None, + debug_recompile=True): + """Compile code, and build object file""" logger.debug(code) c_fname = self._tempname('code.' + self.source_suffix) # build object - checksum, mod_name, ext_file, recompiled = \ + obj_checksum, _, obj_file, recompiled = \ compile_from_string(self.toolchain, name, code, c_fname, self.tempdir, debug, wait_on_error, - debug_recompile, False) + debug_recompile, True) + if not recompiled: + logger.debug('Kernel {} compiled from source'.format(name)) + + return obj_checksum, obj_file + + @memoize_method + def _build_lib(self, name, obj_file, debug=False, wait_on_error=None, + debug_recompile=True): + """Build and load shared library from object file""" + + # read obj file into get "source" + with open(obj_file, 'rb') as file: + obj = file.read() + + from os.path import basename + obj_name = basename(obj_file) + # build object + so_checksum, _, so_file, recompiled = \ + compile_from_string(self.toolchain, name, obj, obj_name, + self.tempdir, debug, wait_on_error, + debug_recompile, object=False, + source_is_binary=True) if not recompiled: logger.debug('Kernel {} compiled from source'.format(name)) + return so_checksum, ctypes.CDLL(so_file) + + def build(self, name, code, debug=False, wait_on_error=None, + debug_recompile=True): + """Compile code, build and load shared library.""" + + # build object + _, obj_file = self._build_obj(name, code, debug=debug, + wait_on_error=wait_on_error, + debug_recompile=debug_recompile) + + # and create library + _, lib = self._build_lib(name, obj_file, debug=debug, + wait_on_error=wait_on_error, + debug_recompile=debug_recompile) + # and return compiled - return checksum, ctypes.CDLL(ext_file) + return lib class CppCompiler(CCompiler): + """Subclass of Compiler to invoke a C++ compiler. Defaults to g++""" - source_suffix = 'cpp' - default_exe = 'g++' - default_compile_flags = '-g -O3'.split() + + def __init__(self, *args, **kwargs): + defaults = {'cc': 'g++', + 'source_suffix': 'cpp', + 'cflags': '-g -O3'.split()} + + # update to use any user specified info + defaults.update(kwargs) + + # and create + super(CppCompiler, self).__init__(*args, **defaults) class CompiledCKernel(object): + """ A CompiledCKernel wraps a loopy kernel, compiling it and loading the result as a shared library, and provides access to the kernel as a @@ -265,8 +315,7 @@ class CompiledCKernel(object): # get code and build self.code = dev_code self.comp = comp or CCompiler() - self.checksum, self.dll = self.comp.build( - self.knl.name, self.code) + self.dll = self.comp.build(self.knl.name, self.code) # get the function declaration for interface with ctypes from loopy.target.c import CFunctionDeclExtractor @@ -345,6 +394,7 @@ class CompiledCKernel(object): class CKernelExecutor(KernelExecutorBase): + """An object connecting a kernel to a :class:`CompiledKernel` for execution. @@ -352,7 +402,8 @@ class CKernelExecutor(KernelExecutorBase): .. automethod:: __call__ """ - def __init__(self, kernel, compiler=None): + def __init__(self, kernel, invoker=CExecutionWrapperGenerator(), + compiler=None): """ :arg kernel: may be a loopy.LoopKernel, a generator returning kernels (a warning will be issued if more than one is returned). If the @@ -361,8 +412,7 @@ class CKernelExecutor(KernelExecutorBase): """ self.compiler = compiler if compiler else CCompiler() - super(CKernelExecutor, self).__init__(kernel, - CExecutionWrapperGenerator()) + super(CKernelExecutor, self).__init__(kernel, invoker=invoker) @memoize_method def kernel_info(self, arg_to_dtype_set=frozenset(), all_kwargs=None): @@ -391,13 +441,13 @@ class CKernelExecutor(KernelExecutorBase): c_kernels = [] for dp in codegen_result.device_programs: c_kernels.append(CompiledCKernel(dp, dev_code, - self.kernel.target, self.compiler)) + self.kernel.target, self.compiler)) return _KernelInfo( - kernel=kernel, - c_kernels=c_kernels, - implemented_data_info=codegen_result.implemented_data_info, - invoker=self.invoker(kernel, codegen_result)) + kernel=kernel, + c_kernels=c_kernels, + implemented_data_info=codegen_result.implemented_data_info, + invoker=self.invoker(kernel, codegen_result)) # }}} @@ -417,4 +467,4 @@ class CKernelExecutor(KernelExecutorBase): kernel_info = self.kernel_info(self.arg_to_dtype_set(kwargs)) return kernel_info.invoker( - kernel_info.c_kernels, *args, **kwargs) + kernel_info.c_kernels, *args, **kwargs)