diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 9a9aee76acc65b1b978d1c9171a463656e6cc6a2..4b3237515152965f1d7c0f7ddd271d25e3afb767 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -213,10 +213,16 @@ class CudaTarget(CTarget): _, local_grid_size = kernel.get_grid_sizes_as_exprs() - from pytools import product - nthreads = product(local_grid_size) + from loopy.symbolic import get_dependencies + if not get_dependencies(local_grid_size): + # Sizes can't have parameter dependencies if they are + # to be used in static thread block size. + from pytools import product + nthreads = product(local_grid_size) - return CudaLaunchBounds(nthreads, fdecl) + fdecl = CudaLaunchBounds(nthreads, fdecl) + + return fdecl def generate_code(self, kernel, codegen_state, impl_arg_info): code, implemented_domains = ( diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 7ef944d323264a267146ca364d3699a5f9dfa0d0..a009e93360016128f69c85fee2a555cd016a5ab1 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -239,9 +239,18 @@ class OpenCLTarget(CTarget): def wrap_function_declaration(self, kernel, fdecl): from cgen.opencl import CLKernel, CLRequiredWorkGroupSize - return CLRequiredWorkGroupSize( - kernel.get_grid_sizes_as_exprs()[1], - CLKernel(fdecl)) + fdecl = CLKernel(fdecl) + + _, local_sizes = kernel.get_grid_sizes_as_exprs() + + from loopy.symbolic import get_dependencies + if not get_dependencies(local_sizes): + # sizes can't have parameter dependencies if they are + # to be used in static WG size. + + fdecl = CLRequiredWorkGroupSize(local_sizes, fdecl) + + return fdecl def generate_code(self, kernel, codegen_state, impl_arg_info): code, implemented_domains = (