From 166ddd3abb233fe9afd0e0ec9a1acfc07b0082f9 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner
Date: Mon, 18 Jan 2016 13:30:22 -0600
Subject: [PATCH] Only generate static workgroup size info if workgroup sizes are actually static

---
Notes (review commentary; text between the three-dash line and the diffstat
is not part of the commit message or the patch):

  * cuda.py: the CudaTarget method shown in the first hunk (its "def" line
    lies outside the hunk) now wraps fdecl in CudaLaunchBounds only when
    get_dependencies(local_grid_size) is empty; otherwise fdecl is returned
    unwrapped.
  * opencl.py: OpenCLTarget.wrap_function_declaration always applies
    CLKernel and adds CLRequiredWorkGroupSize only when
    get_dependencies(local_sizes) is empty.
  * NOTE(review): the line breaks and indentation of this copy were
    reconstructed from a single-line rendering; verify whitespace against
    the blobs named on the "index" lines before applying.

 loopy/target/cuda.py   | 12 +++++++++---
 loopy/target/opencl.py | 15 ++++++++++++---
 2 files changed, 21 insertions(+), 6 deletions(-)

diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 9a9aee76a..4b3237515 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -213,10 +213,16 @@ class CudaTarget(CTarget):
 
         _, local_grid_size = kernel.get_grid_sizes_as_exprs()
 
-        from pytools import product
-        nthreads = product(local_grid_size)
+        from loopy.symbolic import get_dependencies
+        if not get_dependencies(local_grid_size):
+            # Sizes can't have parameter dependencies if they are
+            # to be used in static thread block size.
+            from pytools import product
+            nthreads = product(local_grid_size)
 
-        return CudaLaunchBounds(nthreads, fdecl)
+            fdecl = CudaLaunchBounds(nthreads, fdecl)
+
+        return fdecl
 
     def generate_code(self, kernel, codegen_state, impl_arg_info):
         code, implemented_domains = (
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 7ef944d32..a009e9336 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -239,9 +239,18 @@ class OpenCLTarget(CTarget):
 
     def wrap_function_declaration(self, kernel, fdecl):
         from cgen.opencl import CLKernel, CLRequiredWorkGroupSize
-        return CLRequiredWorkGroupSize(
-                kernel.get_grid_sizes_as_exprs()[1],
-                CLKernel(fdecl))
+        fdecl = CLKernel(fdecl)
+
+        _, local_sizes = kernel.get_grid_sizes_as_exprs()
+
+        from loopy.symbolic import get_dependencies
+        if not get_dependencies(local_sizes):
+            # sizes can't have parameter dependencies if they are
+            # to be used in static WG size.
+
+            fdecl = CLRequiredWorkGroupSize(local_sizes, fdecl)
+
+        return fdecl
 
     def generate_code(self, kernel, codegen_state, impl_arg_info):
         code, implemented_domains = (
-- 
GitLab