diff --git a/doc/ref_transform.rst b/doc/ref_transform.rst index f16f8bfdbb26b716af27762d8502bff592496d7c..5609fc25328ed5b0a243eaa8d33c4d1aa6b3eddc 100644 --- a/doc/ref_transform.rst +++ b/doc/ref_transform.rst @@ -119,6 +119,8 @@ Finishing up .. autofunction:: generate_code_v2 +.. autofunction:: generate_header + Setting options --------------- diff --git a/loopy/__init__.py b/loopy/__init__.py index 9b6071105e31045c7267891f62a153a4da343b18..c12d7318c24e4ca1edb327f9710ae2182e5491b0 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -119,7 +119,7 @@ from loopy.statistics import (get_op_poly, sum_ops_to_dtypes, gather_access_footprints, gather_access_footprint_bytes) from loopy.codegen import ( PreambleInfo, - generate_code, generate_code_v2, generate_body, generate_header) + generate_code, generate_code_v2, generate_body) from loopy.codegen.result import ( GeneratedProgram, CodeGenerationResult) @@ -130,7 +130,7 @@ from loopy.frontend.fortran import (c_preprocess, parse_transformed_fortran, parse_fortran) from loopy.target import TargetBase, ASTBuilderBase -from loopy.target.c import CTarget +from loopy.target.c import CTarget, generate_header from loopy.target.cuda import CudaTarget from loopy.target.opencl import OpenCLTarget from loopy.target.pyopencl import PyOpenCLTarget @@ -238,7 +238,9 @@ __all__ = [ "LoopyError", "LoopyWarning", - "TargetBase", "CTarget", "CudaTarget", "OpenCLTarget", + "TargetBase", + "CTarget", "generate_header", + "CudaTarget", "OpenCLTarget", "PyOpenCLTarget", "ISPCTarget", "NumbaTarget", "NumbaCudaTarget", "ASTBuilderBase", diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index c42fd2339c866cf806f51b5451e85164232f9a66..ffd291d530898a817b7be66a790d5d57fb88a99d 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -550,21 +550,4 @@ def generate_body(kernel): # }}} -def generate_header(kernel): - try: - fde = kernel.target.get_device_decl_extractor() - except NotImplementedError: - from warnings import warn - warn('Header generation for non C-based languages are not implemented', - RuntimeWarning, stacklevel=2) - return None - - codegen_result = generate_code_v2(kernel) - - headers = [] - for dev_prg in codegen_result.device_programs: - headers.append(str(fde(dev_prg.decl_ast))) - - return '\n'.join(headers) - # vim: foldmethod=marker diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py index 29280ac2b901ddecc0843a8f8fe97a5981c7cdf8..04fab05afdc38a8843a566e0e6e6b10098d6415c 100644 --- a/loopy/codegen/result.py +++ b/loopy/codegen/result.py @@ -61,10 +61,6 @@ class GeneratedProgram(Record): Once generated, this captures the AST of the operative function body (including declaration of necessary temporaries), but not the overall function definition. - .. attribute:: decl_ast - Once generated, this captures the AST of the operative function - definition but not the function body or declaration of necessary - temporaries. """ @@ -314,7 +310,6 @@ def generate_host_or_device_program(codegen_state, schedule_index): codegen_state, cur_prog.copy( ast=ast_builder.process_ast(fdef_ast), - decl_ast=ast_builder.process_ast(fdecl_ast), body_ast=ast_builder.process_ast(body_ast))) return codegen_result diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index fc80fc3d9c0382fa17bc0ebaf61049949dc3e977..409b9badb639c500e70404e781036b2e39bf333f 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -138,13 +138,6 @@ class TargetBase(object): """ raise NotImplementedError() - def get_device_decl_extractor(self): - """ - :returns: a FunctionDeclExtractor (if implemented) that extracts a device function's - header definition - """ - raise NotImplementedError() - class ASTBuilderBase(object): """An interface for generating (host or device) ASTs. diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 513386a58d5b933c20dfc95879863d00e1f4dcb4..8b81efb312d996a2cd972a416ddf7d190e580d13 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -29,7 +29,7 @@ import six import numpy as np # noqa from loopy.target import TargetBase, ASTBuilderBase, DummyHostASTBuilder from loopy.diagnostic import LoopyError -from cgen import Pointer +from cgen import Pointer, NestedDeclarator from cgen.mapper import IdentityMapper as CASTIdentityMapperBase from pymbolic.mapper.stringifier import PREC_NONE from loopy.symbolic import IdentityMapper @@ -132,6 +132,10 @@ class POD(Declarator): mapper_method = "map_loopy_pod" + +class FunctionDeclarationWrapper(NestedDeclarator): + mapper_method = "map_function_decl_wrapper" + # }}} @@ -203,6 +207,10 @@ class CASTIdentityMapper(CASTIdentityMapperBase): def map_loopy_pod(self, node, *args, **kwargs): return type(node)(node.ast_builder, node.dtype, node.name) + def map_function_decl_wrapper(self, node, *args, **kwargs): + return FunctionDeclarationWrapper( + self.rec(node.subdecl, *args, **kwargs)) + class SubscriptSubsetCounter(IdentityMapper): def __init__(self, subset_counters): @@ -259,9 +267,6 @@ class CTarget(TargetBase): def get_device_ast_builder(self): return CASTBuilder(self) - def get_device_decl_extractor(self): - return CFunctionDeclExtractor() - # {{{ types @memoize_method @@ -381,10 +386,11 @@ class CASTBuilder(ASTBuilderBase): if self.target.fortran_abi: name += "_" - return FunctionDeclaration( - Value("void", name), - [self.idi_to_cgen_declarator(codegen_state.kernel, idi) - for idi in codegen_state.implemented_data_info]) + return FunctionDeclarationWrapper( + FunctionDeclaration( + Value("void", name), + [self.idi_to_cgen_declarator(codegen_state.kernel, idi) + for idi in codegen_state.implemented_data_info])) def get_temporary_decls(self, codegen_state, schedule_index): from loopy.kernel.data import temp_var_scope @@ -756,12 +762,46 @@ class CASTBuilder(ASTBuilderBase): sc(node) return node + +# {{{ header generation + class CFunctionDeclExtractor(CASTIdentityMapper): def __init__(self): self.decls = [] - def map_function_declaration(self, node): - self.decls.append(node) - return super(self.__class__, self).map_function_declaration(node) + def map_expression(self, expr): + return expr + + def map_function_decl_wrapper(self, node): + self.decls.append(node.subdecl) + return super(CFunctionDeclExtractor, self)\ + .map_function_decl_wrapper(node) + + +def generate_header(kernel, codegen_result=None): + """ + :arg kernel: a :class:`loopy.LoopKernel` + :arg codegen_result: an instance of :class:`loopy.CodeGenerationResult` + :returns: a list of AST nodes (which may have :func:`str` + called on them to produce a string) representing + function declarations for the generated device + functions. + """ + + if not isinstance(kernel.target, CTarget): + raise LoopyError( + 'Header generation for non C-based languages are not implemented') + + if codegen_result is None: + from loopy.codegen import generate_code_v2 + codegen_result = generate_code_v2(kernel) + + fde = CFunctionDeclExtractor() + for dev_prg in codegen_result.device_programs: + fde(dev_prg.ast) + + return fde.decls + +# }}} # vim: foldmethod=marker diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index d31718f15ed563bba0b602e6017536b72b6deed0..51ccc301290460c49af73691f2ebe812b8a6da9b 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -99,6 +99,7 @@ def _create_vector_types(): vec.types[np.dtype(base_type), count] = dtype vec.type_to_scalar_and_count[dtype] = np.dtype(base_type), count + _create_vector_types() @@ -232,6 +233,10 @@ class CUDACASTBuilder(CASTBuilder): fdecl = super(CUDACASTBuilder, self).get_function_declaration( codegen_state, codegen_result, schedule_index) + from loopy.target.c import FunctionDeclarationWrapper + assert isinstance(fdecl, FunctionDeclarationWrapper) + fdecl = fdecl.subdecl + from cgen.cuda import CudaGlobal, CudaLaunchBounds fdecl = CudaGlobal(fdecl) @@ -254,7 +259,7 @@ class CUDACASTBuilder(CASTBuilder): fdecl = CudaLaunchBounds(nthreads, fdecl) - return fdecl + return FunctionDeclarationWrapper(fdecl) def generate_code(self, kernel, codegen_state, impl_arg_info): code, implemented_domains = ( diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index 2c48fb902e746780599f039a44b2d0a5ea787b88..80a69bd00c99258b709ea18b2a716c339b888b02 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -237,16 +237,19 @@ class ISPCASTBuilder(CASTBuilder): arg_names, arg_decls = self._arg_names_and_decls(codegen_state) if codegen_state.is_generating_device_code: - return ISPCTask( + result = ISPCTask( FunctionDeclaration( Value("void", name), arg_decls)) else: - return ISPCExport( + result = ISPCExport( FunctionDeclaration( Value("void", name), arg_decls)) + from loopy.target.c import FunctionDeclarationWrapper + return FunctionDeclarationWrapper(result) + # }}} def get_kernel_call(self, codegen_state, name, gsize, lsize, extra_args): diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index f0436099c6127e6426b03df2c48342b6ee99c67f..31cf7c6b648ebf370a17d8beb2538b9748ddb30a 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -126,6 +126,7 @@ def _create_vector_types(): vec.types[np.dtype(base_type), count] = dtype vec.type_to_scalar_and_count[dtype] = np.dtype(base_type), count + _create_vector_types() @@ -400,6 +401,10 @@ class OpenCLCASTBuilder(CASTBuilder): fdecl = super(OpenCLCASTBuilder, self).get_function_declaration( codegen_state, codegen_result, schedule_index) + from loopy.target.c import FunctionDeclarationWrapper + assert isinstance(fdecl, FunctionDeclarationWrapper) + fdecl = fdecl.subdecl + from cgen.opencl import CLKernel, CLRequiredWorkGroupSize fdecl = CLKernel(fdecl) @@ -415,7 +420,7 @@ class OpenCLCASTBuilder(CASTBuilder): fdecl = CLRequiredWorkGroupSize(local_sizes, fdecl) - return fdecl + return FunctionDeclarationWrapper(fdecl) def generate_top_of_body(self, codegen_state): from loopy.kernel.data import ImageArg diff --git a/test/test_loopy.py b/test/test_loopy.py index 9f1b6ea8e93a62f147aaa67f70b563137766c404..69e0ea325a328d03bbd98cec2163f5c7981a4a78 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1586,9 +1586,8 @@ def test_temp_initializer(ctx_factory, src_order, tmp_order): assert np.array_equal(a, a2) -def test_header_extract(ctx_factory): - ctx = ctx_factory() +def test_header_extract(): knl = lp.make_kernel('{[k]: 0<=k<n}}', """ for k @@ -1601,19 +1600,22 @@ def test_header_extract(ctx_factory): knl = lp.fix_parameters(knl, n=200) #test C - cknl = knl - cknl.target = lp.CTarget() - assert lp.generate_header(cknl) == 'void loopy_kernel(float *restrict T);' + cknl = knl.copy(target=lp.CTarget()) + assert str(lp.generate_header(cknl)[0]) == ( + 'void loopy_kernel(float *__restrict__ T);') #test CUDA - cuknl = knl - cuknl.target = lp.CudaTarget() - assert lp.generate_header(cuknl) == 'extern "C" __global__ void __launch_bounds__(1) loopy_kernel(float *__restrict__ T);' + cuknl = knl.copy(target=lp.CudaTarget()) + assert str(lp.generate_header(cuknl)[0]) == ( + 'extern "C" __global__ void __launch_bounds__(1) ' + 'loopy_kernel(float *__restrict__ T);') #test OpenCL - oclknl = knl - oclknl.target = lp.PyOpenCLTarget() - assert lp.generate_header(oclknl) == '__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float *restrict T);' + oclknl = knl.copy(target=lp.PyOpenCLTarget()) + assert str(lp.generate_header(oclknl)[0]) == ( + '__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) ' + 'loopy_kernel(__global float *__restrict__ T);') + def test_scalars_with_base_storage(ctx_factory): """ Regression test for !50 """ @@ -1678,6 +1680,7 @@ def test_tight_loop_bounds_codegen(): assert for_loop in cgr.device_code() + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])