diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index efe755dca025646a10921ded2c7eb8ce6b1052fc..992d5db858bafc91af1d97b36323443c666590e7 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -138,6 +138,13 @@ def cuda_function_mangler(kernel, name, arg_dtypes):
 # {{{ target
 
 class CudaTarget(CTarget):
+    def __init__(self, extern_c=True):
+        """
+        :arg extern_c: If *True*, declare kernels using "extern C" to
+            avoid name mangling.
+        """
+        self.extern_c = extern_c
+
     # {{{ library
 
     def function_manglers(self):
@@ -173,8 +180,19 @@ class CudaTarget(CTarget):
     # {{{ top-level codegen
 
     def wrap_function_declaration(self, kernel, fdecl):
-        from cgen.cuda import CudaGlobal
-        return CudaGlobal(fdecl)
+        from cgen.cuda import CudaGlobal, CudaLaunchBounds
+        fdecl = CudaGlobal(fdecl)
+
+        if self.extern_c:
+            from cgen import Extern
+            fdecl = Extern("C", fdecl)
+
+        _, local_grid_size = kernel.get_grid_sizes_as_exprs()
+
+        from pytools import product
+        nthreads = product(local_grid_size)
+
+        return CudaLaunchBounds(nthreads, fdecl)
 
     def generate_code(self, kernel, codegen_state, impl_arg_info):
         code, implemented_domains = (