diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 9a9aee76acc65b1b978d1c9171a463656e6cc6a2..4b3237515152965f1d7c0f7ddd271d25e3afb767 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -213,10 +213,16 @@ class CudaTarget(CTarget):
 
         _, local_grid_size = kernel.get_grid_sizes_as_exprs()
 
-        from pytools import product
-        nthreads = product(local_grid_size)
+        from loopy.symbolic import get_dependencies
+        if not get_dependencies(local_grid_size):
+            # The sizes can only be used as a static thread block size
+            # if they do not depend on any parameters.
+            from pytools import product
+            nthreads = product(local_grid_size)
 
-        return CudaLaunchBounds(nthreads, fdecl)
+            fdecl = CudaLaunchBounds(nthreads, fdecl)
+
+        return fdecl
 
     def generate_code(self, kernel, codegen_state, impl_arg_info):
         code, implemented_domains = (
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 7ef944d323264a267146ca364d3699a5f9dfa0d0..a009e93360016128f69c85fee2a555cd016a5ab1 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -239,9 +239,18 @@ class OpenCLTarget(CTarget):
 
     def wrap_function_declaration(self, kernel, fdecl):
         from cgen.opencl import CLKernel, CLRequiredWorkGroupSize
-        return CLRequiredWorkGroupSize(
-                kernel.get_grid_sizes_as_exprs()[1],
-                CLKernel(fdecl))
+        fdecl = CLKernel(fdecl)
+
+        _, local_sizes = kernel.get_grid_sizes_as_exprs()
+
+        from loopy.symbolic import get_dependencies
+        if not get_dependencies(local_sizes):
+            # The sizes can only be used as a static work-group size
+            # if they do not depend on any parameters.
+
+            fdecl = CLRequiredWorkGroupSize(local_sizes, fdecl)
+
+        return fdecl
 
     def generate_code(self, kernel, codegen_state, impl_arg_info):
         code, implemented_domains = (