diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py
index 98e12cca78139c82b63668bec60520a2c513c698..6d0b2ca60504060e677d1e3bb941b6c3ccbb3fc2 100644
--- a/loopy/codegen/loop.py
+++ b/loopy/codegen/loop.py
@@ -242,11 +242,13 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state,
 
     tag = kernel.iname_to_tag.get(iname)
 
+    from loopy.symbolic import GroupHardwareAxisIndex, LocalHardwareAxisIndex
+
     assert isinstance(tag, UniqueTag)
-    if isinstance(tag, LocalIndexTag):
-        hw_axis_expr = kernel.target.get_local_axis_expr(kernel, tag.axis)
-    elif isinstance(tag, GroupIndexTag):
-        hw_axis_expr = kernel.target.get_global_axis_expr(kernel, tag.axis)
+    if isinstance(tag, GroupIndexTag):
+        hw_axis_expr = GroupHardwareAxisIndex(tag.axis)
+    elif isinstance(tag, LocalIndexTag):
+        hw_axis_expr = LocalHardwareAxisIndex(tag.axis)
     else:
         raise RuntimeError("unexpected hw tag type")
 
diff --git a/loopy/expression.py b/loopy/expression.py
index e9f7d8410153f7e4a8ac3317e50aaa1b70baa483..62c2278be5b863105e45a8c0c6d517cdf2b92fd2 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -257,6 +257,12 @@ class TypeInferenceMapper(CombineMapper):
     map_logical_and = map_comparison
     map_logical_or = map_comparison
 
+    def map_group_hw_index(self, expr, *args):
+        return self.kernel.index_dtype
+
+    def map_local_hw_index(self, expr, *args):
+        return self.kernel.index_dtype
+
     def map_reduction(self, expr):
         return expr.operation.result_dtype(
                 self.kernel.target, self.rec(expr.expr), expr.inames)
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index b3dfce3d6415de67951dd445f3028d4d21c3d040..7adab80c68c38f900976eb1adcd90226f40a7d9b 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -69,6 +69,15 @@ import numpy as np
 # {{{ mappers with support for loopy-specific primitives
 
 class IdentityMapperMixin(object):
+    def map_group_hw_index(self, expr, *args):
+        return expr
+
+    def map_local_hw_index(self, expr, *args):
+        return expr
+
+    def map_loopy_function_identifier(self, expr, *args):
+        return expr
+
     def map_reduction(self, expr, *args):
         return Reduction(expr.operation, expr.inames, self.rec(expr.expr, *args))
 
@@ -76,9 +85,6 @@ class IdentityMapperMixin(object):
         # leaf, doesn't change
         return expr
 
-    def map_loopy_function_identifier(self, expr, *args):
-        return expr
-
     map_linear_subscript = IdentityMapperBase.map_subscript
 
 
@@ -92,6 +98,12 @@ class PartialEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin):
 
 
 class WalkMapper(WalkMapperBase):
+    def map_group_hw_index(self, expr, *args):
+        self.visit(expr)
+
+    def map_local_hw_index(self, expr, *args):
+        self.visit(expr)
+
     def map_reduction(self, expr, *args):
         if not self.visit(expr):
             return
@@ -127,6 +139,12 @@ class ConstantFoldingMapper(ConstantFoldingMapperBase,
 
 
 class StringifyMapper(StringifyMapperBase):
+    def map_group_hw_index(self, expr, enclosing_prec):
+        return "grp.%d" % expr.index
+
+    def map_local_hw_index(self, expr, enclosing_prec):
+        return "loc.%d" % expr.index
+
     def map_reduction(self, expr, prec):
         return "reduce(%s, [%s], %s)" % (
                 expr.operation, ", ".join(expr.inames), expr.expr)
@@ -177,6 +195,12 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase):
 
 
 class DependencyMapper(DependencyMapperBase):
+    def map_group_hw_index(self, expr):
+        return set()
+
+    def map_local_hw_index(self, expr):
+        return set()
+
     def map_call(self, expr, *args):
         # Loopy does not have first-class functions. Do not descend
         # into 'function' attribute of Call.
@@ -235,6 +259,27 @@ class SubstitutionRuleExpander(IdentityMapper):
 
 # {{{ loopy-specific primitives
 
+class HardwareAxisIndex(Leaf):
+    def __init__(self, axis):
+        self.axis = axis
+
+    def stringifier(self):
+        return StringifyMapper
+
+    def __getinitargs__(self):
+        return (self.axis,)
+
+    init_arg_names = ("axis",)
+
+
+class GroupHardwareAxisIndex(HardwareAxisIndex):
+    mapper_method = "map_group_hw_index"
+
+
+class LocalHardwareAxisIndex(HardwareAxisIndex):
+    mapper_method = "map_local_hw_index"
+
+
 class FunctionIdentifier(Leaf):
     """A base class for symbols representing functions."""
 
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 5b51808e005e5901fc30d54a8b8ee7758617117c..b8c903fc12cccde4e764731580cc8cfc04151af3 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -100,12 +100,6 @@ class TargetBase(object):
     def get_expression_to_code_mapper(self, codegen_state):
         raise NotImplementedError()
 
-    def get_global_axis_expr(self, kernel, axis):
-        raise NotImplementedError()
-
-    def get_local_axis_expr(self, kernel, axis):
-        raise NotImplementedError()
-
     def add_vector_access(self, access_str, index):
         raise NotImplementedError()
 
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 97bec6e59764426b882be2a6ce0625fc5945f179..06151855804723ec6b4c98de42865406d863c377 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -670,6 +670,12 @@ class LoopyCCodeMapper(RecursiveMapper):
 
     # }}}
 
+    def map_group_hw_index(self, expr, enclosing_prec, type_context):
+        raise LoopyError("plain C does not have group hw axes")
+
+    def map_local_hw_index(self, expr, enclosing_prec, type_context):
+        raise LoopyError("plain C does not have group hw axes")
+
 # }}}
 
 # vim: fdm=marker
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 992d5db858bafc91af1d97b36323443c666590e7..9a9aee76acc65b1b978d1c9171a463656e6cc6a2 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -27,11 +27,10 @@ THE SOFTWARE.
 import numpy as np
 
 from loopy.target.c import CTarget
+from loopy.target.c.codegen.expression import LoopyCCodeMapper
 from pytools import memoize_method
 from loopy.diagnostic import LoopyError
 
-from pymbolic import var
-
 
 # {{{ vector types
 
@@ -135,6 +134,31 @@ def cuda_function_mangler(kernel, name, arg_dtypes):
 # }}}
 
 
+# {{{ expression mapper
+
+class LoopyCudaCCodeMapper(LoopyCCodeMapper):
+    _GRID_AXES = "xyz"
+
+    @staticmethod
+    def _get_index_ctype(kernel):
+        if kernel.index_dtype == np.int32:
+            return "int32_t"
+        else:
+            return "int64_t"
+
+    def map_group_hw_index(self, expr, enclosing_prec, type_context):
+        return "((%s) blockIdx.%s)" % (
+            self._get_index_ctype(self.kernel),
+            self._GRID_AXES[expr.axis])
+
+    def map_local_hw_index(self, expr, enclosing_prec, type_context):
+        return "((%s) threadIdx.%s)" % (
+            self._get_index_ctype(self.kernel),
+            self._GRID_AXES[expr.axis])
+
+# }}}
+
+
 # {{{ target
 
 class CudaTarget(CTarget):
@@ -216,24 +240,8 @@ class CudaTarget(CTarget):
 
     # {{{ code generation guts
 
-    _GRID_AXES = "xyz"
-
-    @staticmethod
-    def _get_index_ctype(kernel):
-        if kernel.index_dtype == np.int32:
-            return "int32_t"
-        else:
-            return "int64_t"
-
-    def get_global_axis_expr(self, kernel, axis):
-        return var("((%s) blockIdx.%s)" % (
-            self._get_index_ctype(kernel),
-            self._GRID_AXES[axis]))
-
-    def get_local_axis_expr(self, kernel, axis):
-        return var("((%s) threadIdx.%s)" % (
-            self._get_index_ctype(kernel),
-            self._GRID_AXES[axis]))
+    def get_expression_to_code_mapper(self, codegen_state):
+        return LoopyCudaCCodeMapper(codegen_state)
 
     _VEC_AXES = "xyzw"
 
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index cf11092df82c11c9b90a7911eda985b2e4963af9..2d146e82a80a668582156df4232ee0f430a3f546 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -27,11 +27,27 @@ THE SOFTWARE.
 
 import numpy as np  # noqa
 from loopy.target.c import CTarget
+from loopy.target.c.codegen.expression import LoopyCCodeMapper
 from loopy.diagnostic import LoopyError
 
 from pymbolic import var
 
 
+# {{{ expression mapper
+
+class LoopyISPCCodeMapper(LoopyCCodeMapper):
+    def map_group_hw_index(self, expr, enclosing_prec, type_context):
+        return "taskIndex%d" % expr.axis
+
+    def map_local_hw_index(self, expr, enclosing_prec, type_context):
+        if expr.axis == 0:
+            return var("programIndex")
+        else:
+            raise LoopyError("ISPC only supports one local axis")
+
+# }}}
+
+
 class ISPCTarget(CTarget):
     # {{{ top-level codegen
 
@@ -101,14 +117,8 @@ class ISPCTarget(CTarget):
 
     # {{{ code generation guts
 
-    def get_global_axis_expr(self, kernel, axis):
-        return var("taskIndex%d" % axis)
-
-    def get_local_axis_expr(self, kernel, axis):
-        if axis == 0:
-            return var("programIndex")
-        else:
-            raise LoopyError("ISPC only supports one local axis")
+    def get_expression_to_code_mapper(self, codegen_state):
+        return LoopyISPCCodeMapper(codegen_state)
 
     def add_vector_access(self, access_str, index):
         return "(%s)[%d]" % (access_str, index)
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index cf130a0952ad3848da199ac5491e5f958f3787d2..7ef944d323264a267146ca364d3699a5f9dfa0d0 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -27,11 +27,10 @@ THE SOFTWARE.
 import numpy as np
 
 from loopy.target.c import CTarget
+from loopy.target.c.codegen.expression import LoopyCCodeMapper
 from pytools import memoize_method
 from loopy.diagnostic import LoopyError
 
-from pymbolic import var
-
 
 # {{{ vector types
 
@@ -175,6 +174,18 @@ def opencl_preamble_generator(kernel, seen_dtypes, seen_functions):
 # }}}
 
 
+# {{{ expression mapper
+
+class LoopyOpenCLCCodeMapper(LoopyCCodeMapper):
+    def map_group_hw_index(self, expr, enclosing_prec, type_context):
+        return "gid(%d)" % expr.axis
+
+    def map_local_hw_index(self, expr, enclosing_prec, type_context):
+        return "lid(%d)" % expr.axis
+
+# }}}
+
+
 # {{{ target
 
 class OpenCLTarget(CTarget):
@@ -267,11 +278,8 @@ class OpenCLTarget(CTarget):
 
     # {{{ code generation guts
 
-    def get_global_axis_expr(self, kernel, axis):
-        return var("gid")(axis)
-
-    def get_local_axis_expr(self, kernel, axis):
-        return var("lid")(axis)
+    def get_expression_to_code_mapper(self, codegen_state):
+        return LoopyOpenCLCCodeMapper(codegen_state)
 
     def add_vector_access(self, access_str, index):
         # The 'int' avoids an 'L' suffix for long ints.