diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index 98e12cca78139c82b63668bec60520a2c513c698..6d0b2ca60504060e677d1e3bb941b6c3ccbb3fc2 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -242,11 +242,13 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, tag = kernel.iname_to_tag.get(iname) + from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex + assert isinstance(tag, UniqueTag) - if isinstance(tag, LocalIndexTag): - hw_axis_expr = kernel.target.get_local_axis_expr(kernel, tag.axis) - elif isinstance(tag, GroupIndexTag): - hw_axis_expr = kernel.target.get_global_axis_expr(kernel, tag.axis) + if isinstance(tag, GroupIndexTag): + hw_axis_expr = GroupHardwareAxisIndex(tag.axis) + elif isinstance(tag, LocalIndexTag): + hw_axis_expr = LocalHardwareAxisIndex(tag.axis) else: raise RuntimeError("unexpected hw tag type") diff --git a/loopy/expression.py b/loopy/expression.py index e9f7d8410153f7e4a8ac3317e50aaa1b70baa483..62c2278be5b863105e45a8c0c6d517cdf2b92fd2 100644 --- a/loopy/expression.py +++ b/loopy/expression.py @@ -257,6 +257,12 @@ class TypeInferenceMapper(CombineMapper): map_logical_and = map_comparison map_logical_or = map_comparison + def map_group_hw_index(self, expr, *args): + return self.kernel.index_dtype + + def map_local_hw_index(self, expr, *args): + return self.kernel.index_dtype + def map_reduction(self, expr): return expr.operation.result_dtype( self.kernel.target, self.rec(expr.expr), expr.inames) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index b3dfce3d6415de67951dd445f3028d4d21c3d040..7adab80c68c38f900976eb1adcd90226f40a7d9b 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -69,6 +69,15 @@ import numpy as np # {{{ mappers with support for loopy-specific primitives class IdentityMapperMixin(object): + def map_group_hw_index(self, expr, *args): + return expr + + def map_local_hw_index(self, expr, *args): + return expr + + def map_loopy_function_identifier(self, expr, *args): + return expr + def map_reduction(self, expr, *args): return Reduction(expr.operation, expr.inames, self.rec(expr.expr, *args)) @@ -76,9 +85,6 @@ class IdentityMapperMixin(object): # leaf, doesn't change return expr - def map_loopy_function_identifier(self, expr, *args): - return expr - map_linear_subscript = IdentityMapperBase.map_subscript @@ -92,6 +98,12 @@ class PartialEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin): class WalkMapper(WalkMapperBase): + def map_group_hw_index(self, expr, *args): + self.visit(expr) + + def map_local_hw_index(self, expr, *args): + self.visit(expr) + def map_reduction(self, expr, *args): if not self.visit(expr): return @@ -127,6 +139,12 @@ class ConstantFoldingMapper(ConstantFoldingMapperBase, class StringifyMapper(StringifyMapperBase): + def map_group_hw_index(self, expr, enclosing_prec): + return "grp.%d" % expr.index + + def map_local_hw_index(self, expr, enclosing_prec): + return "loc.%d" % expr.index + def map_reduction(self, expr, prec): return "reduce(%s, [%s], %s)" % ( expr.operation, ", ".join(expr.inames), expr.expr) @@ -177,6 +195,12 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase): class DependencyMapper(DependencyMapperBase): + def map_group_hw_index(self, expr): + return set() + + def map_local_hw_index(self, expr): + return set() + def map_call(self, expr, *args): # Loopy does not have first-class functions. Do not descend # into 'function' attribute of Call. @@ -235,6 +259,27 @@ class SubstitutionRuleExpander(IdentityMapper): # {{{ loopy-specific primitives +class HardwareAxisIndex(Leaf): + def __init__(self, axis): + self.axis = axis + + def stringifier(self): + return StringifyMapper + + def __getinitargs__(self): + return (self.axis,) + + init_arg_names = ("axis",) + + +class GroupHardwareAxisIndex(HardwareAxisIndex): + mapper_method = "map_group_hw_index" + + +class LocalHardwareAxisIndex(HardwareAxisIndex): + mapper_method = "map_local_hw_index" + + class FunctionIdentifier(Leaf): """A base class for symbols representing functions.""" diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py index 5b51808e005e5901fc30d54a8b8ee7758617117c..b8c903fc12cccde4e764731580cc8cfc04151af3 100644 --- a/loopy/target/__init__.py +++ b/loopy/target/__init__.py @@ -100,12 +100,6 @@ class TargetBase(object): def get_expression_to_code_mapper(self, codegen_state): raise NotImplementedError() - def get_global_axis_expr(self, kernel, axis): - raise NotImplementedError() - - def get_local_axis_expr(self, kernel, axis): - raise NotImplementedError() - def add_vector_access(self, access_str, index): raise NotImplementedError() diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py index 97bec6e59764426b882be2a6ce0625fc5945f179..06151855804723ec6b4c98de42865406d863c377 100644 --- a/loopy/target/c/codegen/expression.py +++ b/loopy/target/c/codegen/expression.py @@ -670,6 +670,12 @@ class LoopyCCodeMapper(RecursiveMapper): # }}} + def map_group_hw_index(self, expr, enclosing_prec, type_context): + raise LoopyError("plain C does not have group hw axes") + + def map_local_hw_index(self, expr, enclosing_prec, type_context): + raise LoopyError("plain C does not have group hw axes") + # }}} # vim: fdm=marker diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 992d5db858bafc91af1d97b36323443c666590e7..9a9aee76acc65b1b978d1c9171a463656e6cc6a2 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -27,11 +27,10 @@ THE SOFTWARE. import numpy as np from loopy.target.c import CTarget +from loopy.target.c.codegen.expression import LoopyCCodeMapper from pytools import memoize_method from loopy.diagnostic import LoopyError -from pymbolic import var - # {{{ vector types @@ -135,6 +134,31 @@ def cuda_function_mangler(kernel, name, arg_dtypes): # }}} +# {{{ expression mapper + +class LoopyCudaCCodeMapper(LoopyCCodeMapper): + _GRID_AXES = "xyz" + + @staticmethod + def _get_index_ctype(kernel): + if kernel.index_dtype == np.int32: + return "int32_t" + else: + return "int64_t" + + def map_group_hw_index(self, expr, enclosing_prec, type_context): + return "((%s) blockIdx.%s)" % ( + self._get_index_ctype(self.kernel), + self._GRID_AXES[expr.axis]) + + def map_local_hw_index(self, expr, enclosing_prec, type_context): + return "((%s) threadIdx.%s)" % ( + self._get_index_ctype(self.kernel), + self._GRID_AXES[expr.axis]) + +# }}} + + # {{{ target class CudaTarget(CTarget): @@ -216,24 +240,8 @@ class CudaTarget(CTarget): # {{{ code generation guts - _GRID_AXES = "xyz" - - @staticmethod - def _get_index_ctype(kernel): - if kernel.index_dtype == np.int32: - return "int32_t" - else: - return "int64_t" - - def get_global_axis_expr(self, kernel, axis): - return var("((%s) blockIdx.%s)" % ( - self._get_index_ctype(kernel), - self._GRID_AXES[axis])) - - def get_local_axis_expr(self, kernel, axis): - return var("((%s) threadIdx.%s)" % ( - self._get_index_ctype(kernel), - self._GRID_AXES[axis])) + def get_expression_to_code_mapper(self, codegen_state): + return LoopyCudaCCodeMapper(codegen_state) _VEC_AXES = "xyzw" diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py index cf11092df82c11c9b90a7911eda985b2e4963af9..2d146e82a80a668582156df4232ee0f430a3f546 100644 --- a/loopy/target/ispc.py +++ b/loopy/target/ispc.py @@ -27,11 +27,27 @@ THE SOFTWARE. import numpy as np # noqa from loopy.target.c import CTarget +from loopy.target.c.codegen.expression import LoopyCCodeMapper from loopy.diagnostic import LoopyError from pymbolic import var +# {{{ expression mapper + +class LoopyISPCCodeMapper(LoopyCCodeMapper): + def map_group_hw_index(self, expr, enclosing_prec, type_context): + return "taskIndex%d" % expr.axis + + def map_local_hw_index(self, expr, enclosing_prec, type_context): + if expr.axis == 0: + return var("programIndex") + else: + raise LoopyError("ISPC only supports one local axis") + +# }}} + + class ISPCTarget(CTarget): # {{{ top-level codegen @@ -101,14 +117,8 @@ class ISPCTarget(CTarget): # {{{ code generation guts - def get_global_axis_expr(self, kernel, axis): - return var("taskIndex%d" % axis) - - def get_local_axis_expr(self, kernel, axis): - if axis == 0: - return var("programIndex") - else: - raise LoopyError("ISPC only supports one local axis") + def get_expression_to_code_mapper(self, codegen_state): + return LoopyISPCCodeMapper(codegen_state) def add_vector_access(self, access_str, index): return "(%s)[%d]" % (access_str, index) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index cf130a0952ad3848da199ac5491e5f958f3787d2..7ef944d323264a267146ca364d3699a5f9dfa0d0 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -27,11 +27,10 @@ THE SOFTWARE. import numpy as np from loopy.target.c import CTarget +from loopy.target.c.codegen.expression import LoopyCCodeMapper from pytools import memoize_method from loopy.diagnostic import LoopyError -from pymbolic import var - # {{{ vector types @@ -175,6 +174,18 @@ def opencl_preamble_generator(kernel, seen_dtypes, seen_functions): # }}} +# {{{ expression mapper + +class LoopyOpenCLCCodeMapper(LoopyCCodeMapper): + def map_group_hw_index(self, expr, enclosing_prec, type_context): + return "gid(%d)" % expr.axis + + def map_local_hw_index(self, expr, enclosing_prec, type_context): + return "lid(%d)" % expr.axis + +# }}} + + # {{{ target class OpenCLTarget(CTarget): @@ -267,11 +278,8 @@ class OpenCLTarget(CTarget): # {{{ code generation guts - def get_global_axis_expr(self, kernel, axis): - return var("gid")(axis) - - def get_local_axis_expr(self, kernel, axis): - return var("lid")(axis) + def get_expression_to_code_mapper(self, codegen_state): + return LoopyOpenCLCCodeMapper(codegen_state) def add_vector_access(self, access_str, index): # The 'int' avoids an 'L' suffix for long ints.