Skip to content
Snippets Groups Projects
Commit fa3d3d66 authored by Lucas Wilcox
Browse files

CUDA target: generate launch bounds, extern C

parent 405c789e
No related branches found
No related tags found
No related merge requests found
...@@ -138,6 +138,13 @@ def cuda_function_mangler(kernel, name, arg_dtypes): ...@@ -138,6 +138,13 @@ def cuda_function_mangler(kernel, name, arg_dtypes):
# {{{ target # {{{ target
class CudaTarget(CTarget): class CudaTarget(CTarget):
def __init__(self, extern_c=True):
"""
:arg extern_c: If *True*, declare kernels using "extern C" to
avoid name mangling.
"""
self.extern_c = extern_c
# {{{ library # {{{ library
def function_manglers(self): def function_manglers(self):
...@@ -173,8 +180,19 @@ class CudaTarget(CTarget): ...@@ -173,8 +180,19 @@ class CudaTarget(CTarget):
# {{{ top-level codegen # {{{ top-level codegen
def wrap_function_declaration(self, kernel, fdecl): def wrap_function_declaration(self, kernel, fdecl):
from cgen.cuda import CudaGlobal from cgen.cuda import CudaGlobal, CudaLaunchBounds
return CudaGlobal(fdecl) fdecl = CudaGlobal(fdecl)
if self.extern_c:
from cgen import Extern
fdecl = Extern("C", fdecl)
_, local_grid_size = kernel.get_grid_sizes_as_exprs()
from pytools import product
nthreads = product(local_grid_size)
return CudaLaunchBounds(nthreads, fdecl)
def generate_code(self, kernel, codegen_state, impl_arg_info): def generate_code(self, kernel, codegen_state, impl_arg_info):
code, implemented_domains = ( code, implemented_domains = (
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment