diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 46bf0aa2489bf7888a5f0d7df11c94da5e05c14d..98e12cca78139c82b63668bec60520a2c513c698 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -244,9 +244,9 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, assert isinstance(tag, UniqueTag) if isinstance(tag, LocalIndexTag): - hw_axis_expr = kernel.target.get_local_axis_expr(tag.axis) + hw_axis_expr = kernel.target.get_local_axis_expr(kernel, tag.axis) elif isinstance(tag, GroupIndexTag): - hw_axis_expr = kernel.target.get_global_axis_expr(tag.axis) + hw_axis_expr = kernel.target.get_global_axis_expr(kernel, tag.axis) else: raise RuntimeError("unexpected hw tag type") diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index ba83b07eeb7cb0d1d28237c1b9d66eba4c75b6fc..5b51808e005e5901fc30d54a8b8ee7758617117c 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -100,10 +100,10 @@ class TargetBase(object): def get_expression_to_code_mapper(self, codegen_state): raise NotImplementedError() - def get_global_axis_expr(self, axis): + def get_global_axis_expr(self, kernel, axis): raise NotImplementedError() - def get_local_axis_expr(self, axis): + def get_local_axis_expr(self, kernel, axis): raise NotImplementedError() def add_vector_access(self, access_str, index): diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 6f66e7b99acff43305130a46716956a659c5aedc..efe755dca025646a10921ded2c7eb8ce6b1052fc 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -200,11 +200,22 @@ class CudaTarget(CTarget): _GRID_AXES = "xyz" - def get_global_axis_expr(self, axis): - return var("blockIdx").attr(self._GRID_AXES[axis]) + @staticmethod + def _get_index_ctype(kernel): + if kernel.index_dtype == np.int32: + return "int32_t" + else: + return "int64_t" + + def get_global_axis_expr(self, kernel, axis): + return var("((%s) blockIdx.%s)" % ( + self._get_index_ctype(kernel), + self._GRID_AXES[axis])) - def get_local_axis_expr(self, axis): - return var("threadIdx").attr(self._GRID_AXES[axis]) + def get_local_axis_expr(self, kernel, axis): + return var("((%s) threadIdx.%s)" % ( + self._get_index_ctype(kernel), + self._GRID_AXES[axis])) _VEC_AXES = "xyzw" diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index e0c4b75a3b5aacac8848a3669b475a8197a1168a..cf11092df82c11c9b90a7911eda985b2e4963af9 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -101,10 +101,10 @@ class ISPCTarget(CTarget): # {{{ code generation guts - def get_global_axis_expr(self, axis): + def get_global_axis_expr(self, kernel, axis): return var("taskIndex%d" % axis) - def get_local_axis_expr(self, axis): + def get_local_axis_expr(self, kernel, axis): if axis == 0: return var("programIndex") else: diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index 4b88e65554ef4b23db59bed852ceec468f8868e3..cf130a0952ad3848da199ac5491e5f958f3787d2 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -267,10 +267,10 @@ class OpenCLTarget(CTarget): # {{{ code generation guts - def get_global_axis_expr(self, axis): + def get_global_axis_expr(self, kernel, axis): return var("gid")(axis) - def get_local_axis_expr(self, axis): + def get_local_axis_expr(self, kernel, axis): return var("lid")(axis) def add_vector_access(self, access_str, index):