From fa3d3d66d6fdd23cc16c776d2a92b5788cd4c445 Mon Sep 17 00:00:00 2001
From: Lucas C Wilcox <lucas@swirlee.com>
Date: Thu, 7 Jan 2016 19:35:05 -0600
Subject: [PATCH] CUDA target: generate launch bounds, extern C

---
 loopy/target/cuda.py | 37 +++++++++++++++++++++++++++++++++++--
 1 file changed, 35 insertions(+), 2 deletions(-)

diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index efe755dca..992d5db85 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -138,6 +138,14 @@ def cuda_function_mangler(kernel, name, arg_dtypes):
 # {{{ target
 
 class CudaTarget(CTarget):
+    def __init__(self, extern_c=True):
+        """
+        :arg extern_c: If *True*, declare kernels using ``extern "C"`` to
+            avoid name mangling.
+        """
+        super(CudaTarget, self).__init__()
+        self.extern_c = extern_c
+
     # {{{ library
 
     def function_manglers(self):
@@ -173,8 +181,33 @@ class CudaTarget(CTarget):
     # {{{ top-level codegen
 
     def wrap_function_declaration(self, kernel, fdecl):
-        from cgen.cuda import CudaGlobal
-        return CudaGlobal(fdecl)
+        """Decorate *fdecl* so that it is emitted as a CUDA kernel.
+
+        The declaration is marked ``__global__``, wrapped in ``extern "C"``
+        if :attr:`extern_c` is set, and annotated with
+        ``__launch_bounds__`` when the number of threads per block is a
+        plain integer.
+        """
+        from cgen.cuda import CudaGlobal, CudaLaunchBounds
+        fdecl = CudaGlobal(fdecl)
+
+        if self.extern_c:
+            from cgen import Extern
+            fdecl = Extern("C", fdecl)
+
+        _, local_grid_size = kernel.get_grid_sizes_as_exprs()
+
+        from pytools import product
+        nthreads = product(local_grid_size)
+
+        # __launch_bounds__ requires a compile-time constant. If the block
+        # size is not a plain integer (e.g. it depends on kernel arguments),
+        # the annotation, which is optional, is left off.
+        from numbers import Integral
+        if isinstance(nthreads, Integral):
+            fdecl = CudaLaunchBounds(nthreads, fdecl)
+
+        return fdecl
 
     def generate_code(self, kernel, codegen_state, impl_arg_info):
         code, implemented_domains = (
-- 
GitLab