From fa3d3d66d6fdd23cc16c776d2a92b5788cd4c445 Mon Sep 17 00:00:00 2001
From: Lucas C Wilcox <lucas@swirlee.com>
Date: Thu, 7 Jan 2016 19:35:05 -0600
Subject: [PATCH] CUDA target: generate launch bounds, extern C

---
 loopy/target/cuda.py | 22 ++++++++++++++++++++--
 1 file changed, 20 insertions(+), 2 deletions(-)

diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index efe755dca..992d5db85 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -138,6 +138,18 @@ def cuda_function_mangler(kernel, name, arg_dtypes):
 # {{{ target
 
 class CudaTarget(CTarget):
+    def __init__(self, extern_c=True):
+        """Record the code generation options for this target.
+
+        :arg extern_c: If *True*, declare kernels using ``extern "C"`` to
+            avoid name mangling.
+        """
+        self.extern_c = extern_c
+
+        # Chain to the base class so overriding the constructor does not
+        # skip whatever initialization CTarget performs.
+        super(CudaTarget, self).__init__()
+
     # {{{ library
 
     def function_manglers(self):
@@ -173,8 +180,35 @@ class CudaTarget(CTarget):
     # {{{ top-level codegen
 
     def wrap_function_declaration(self, kernel, fdecl):
-        from cgen.cuda import CudaGlobal
-        return CudaGlobal(fdecl)
+        """Wrap *fdecl* in the CUDA-specific declaration decorators.
+
+        :arg kernel: the kernel being generated; its local grid size
+            determines the launch bound.
+        :arg fdecl: the function declaration to wrap.
+        :returns: *fdecl* wrapped in ``CudaGlobal``, then in
+            ``Extern("C", ...)`` if ``self.extern_c`` is set, then in
+            ``CudaLaunchBounds`` if the thread count is a plain integer.
+        """
+        from cgen.cuda import CudaGlobal, CudaLaunchBounds
+        fdecl = CudaGlobal(fdecl)
+
+        if self.extern_c:
+            from cgen import Extern
+            fdecl = Extern("C", fdecl)
+
+        _, local_grid_size = kernel.get_grid_sizes_as_exprs()
+
+        from pytools import product
+        nthreads = product(local_grid_size)
+
+        # The local sizes come back as expressions and need not be constant.
+        # A launch bound has to be a literal thread count, so only emit it
+        # when the product collapsed to a plain integer.
+        from numbers import Integral
+        if isinstance(nthreads, Integral):
+            fdecl = CudaLaunchBounds(nthreads, fdecl)
+
+        return fdecl
 
     def generate_code(self, kernel, codegen_state, impl_arg_info):
         code, implemented_domains = (
-- 
GitLab