diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 058af59bcc59593189948a0fbd15c7d09349f05c..59df3af620050eef712881e955098af6678cea79 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -28,8 +28,6 @@ from loopy.diagnostic import LoopyError, warn
 from pytools import Record
 import islpy as isl
 
-import numpy as np
-
 from pytools.persistent_dict import PersistentDict
 from loopy.tools import LoopyKeyBuilder
 from loopy.version import DATA_MODEL_VERSION
@@ -205,6 +203,8 @@ class CodeGenerationState(object):
 
         set of :class:`SeenFunction` instances
 
+    .. attribute:: seen_atomic_dtypes
+
     .. attribute:: var_subst_map
 
     .. attribute:: allow_complex
@@ -215,17 +215,19 @@ class CodeGenerationState(object):
     """
 
     def __init__(self, kernel, implemented_domain, implemented_predicates,
-            seen_dtypes, seen_functions, var_subst_map,
+            seen_dtypes, seen_functions, seen_atomic_dtypes, var_subst_map,
             allow_complex,
-            vectorization_info=None):
+            vectorization_info=None, var_name_generator=None):
         self.kernel = kernel
         self.implemented_domain = implemented_domain
         self.implemented_predicates = implemented_predicates
         self.seen_dtypes = seen_dtypes
         self.seen_functions = seen_functions
+        self.seen_atomic_dtypes = seen_atomic_dtypes
         self.var_subst_map = var_subst_map.copy()
         self.allow_complex = allow_complex
         self.vectorization_info = vectorization_info
+        self.var_name_generator = var_name_generator
 
     # {{{ copy helpers
 
@@ -245,9 +247,11 @@ class CodeGenerationState(object):
                     implemented_predicates or self.implemented_predicates),
                 seen_dtypes=self.seen_dtypes,
                 seen_functions=self.seen_functions,
+                seen_atomic_dtypes=self.seen_atomic_dtypes,
                 var_subst_map=var_subst_map or self.var_subst_map,
                 allow_complex=self.allow_complex,
-                vectorization_info=vectorization_info)
+                vectorization_info=vectorization_info,
+                var_name_generator=self.var_name_generator)
 
     def copy_and_assign(self, name, value):
         """Make a copy of self with variable *name* fixed to *value*."""
@@ -347,7 +351,8 @@ class POD(Declarator):
     """
 
     def __init__(self, target, dtype, name):
-        dtype = np.dtype(dtype)
+        from loopy.types import LoopyType
+        assert isinstance(dtype, LoopyType)
 
         self.target = target
         self.ctype = target.dtype_to_typename(dtype)
@@ -528,6 +533,7 @@ def generate_code(kernel, device=None):
 
     seen_dtypes = set()
     seen_functions = set()
+    seen_atomic_dtypes = set()
 
     initial_implemented_domain = isl.BasicSet.from_params(kernel.assumptions)
     codegen_state = CodeGenerationState(
@@ -536,8 +542,10 @@ def generate_code(kernel, device=None):
             implemented_predicates=frozenset(),
             seen_dtypes=seen_dtypes,
             seen_functions=seen_functions,
+            seen_atomic_dtypes=seen_atomic_dtypes,
             var_subst_map={},
-            allow_complex=allow_complex)
+            allow_complex=allow_complex,
+            var_name_generator=kernel.get_var_name_generator())
 
     code_str, implemented_domains = kernel.target.generate_code(
             kernel, codegen_state, impl_arg_info)
@@ -555,10 +563,22 @@ def generate_code(kernel, device=None):
 
     preambles = kernel.preambles[:]
 
+    from pytools import Record
+
+    class PreambleInfo(Record):
+        pass
+
+    preamble_info = PreambleInfo(
+            kernel=kernel,
+            seen_dtypes=seen_dtypes,
+            seen_functions=seen_functions,
+            # a set of LoopyTypes (!)
+            seen_atomic_dtypes=seen_atomic_dtypes)
+
     preamble_generators = (kernel.preamble_generators
             + kernel.target.preamble_generators())
     for prea_gen in preamble_generators:
-        preambles.extend(prea_gen(kernel, seen_dtypes, seen_functions))
+        preambles.extend(prea_gen(preamble_info))
 
     seen_preamble_tags = set()
     dedup_preambles = []
diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index 871ed10a7b4c12aa799002ae5213d400a2ef961f..f4c48443f9062bf362e9a27681d2967b8f82807d 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -92,6 +92,8 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):
 
     from loopy.expression import dtype_to_type_context, VectorizabilityChecker
 
+    # {{{ vectorization handling
+
     if codegen_state.vectorization_info:
         if insn.atomicity:
             raise Unvectorizable("atomic operation")
@@ -111,18 +113,49 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):
         del lhs_is_vector
         del rhs_is_vector
 
-    expr = insn.expression
+    # }}}
 
     (assignee_var_name, assignee_indices), = insn.assignees_and_indices()
-    target_dtype = kernel.get_var_descriptor(assignee_var_name).dtype
+    lhs_dtype = kernel.get_var_descriptor(assignee_var_name).dtype
+
+    if insn.atomicity is not None:
+        lhs_atomicity = [
+                a for a in insn.atomicity if a.var_name == assignee_var_name]
+        assert len(lhs_atomicity) <= 1
+        if lhs_atomicity:
+            lhs_atomicity, = lhs_atomicity
+        else:
+            lhs_atomicity = None
+    else:
+        lhs_atomicity = None
+
+    from loopy.kernel.data import AtomicInit, AtomicUpdate
 
-    from cgen import Assign
     lhs_code = ecm(insn.assignee, prec=PREC_NONE, type_context=None)
-    result = Assign(
-            lhs_code,
-            ecm(expr, prec=PREC_NONE,
-                type_context=dtype_to_type_context(kernel.target, target_dtype),
-                needed_dtype=target_dtype))
+    rhs_type_context = dtype_to_type_context(kernel.target, lhs_dtype)
+    if lhs_atomicity is None:
+        from cgen import Assign
+        result = Assign(
+                lhs_code,
+                ecm(insn.expression, prec=PREC_NONE,
+                    type_context=rhs_type_context,
+                    needed_dtype=lhs_dtype))
+
+    elif isinstance(lhs_atomicity, AtomicInit):
+        raise NotImplementedError("atomic init")
+
+    elif isinstance(lhs_atomicity, AtomicUpdate):
+        codegen_state.seen_atomic_dtypes.add(lhs_dtype)
+        result = kernel.target.generate_atomic_update(
+                kernel, codegen_state, lhs_atomicity,
+                insn.assignee, insn.expression,
+                lhs_dtype, rhs_type_context)
+
+    else:
+        raise ValueError("unexpected lhs atomicity type: %s"
+                % type(lhs_atomicity).__name__)
+
+    # {{{ tracing
 
     if kernel.options.trace_assignments or kernel.options.trace_assignment_values:
         if codegen_state.vectorization_info and is_vector:
@@ -179,6 +212,8 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):
             # print first, execute later -> helps find segfaults
             result = Block([printf_insn, result])
 
+    # }}}
+
     return result
 
 
diff --git a/loopy/expression.py b/loopy/expression.py
index e550886b768dda5e3a42e37af6c5acb29225dc06..16c09b82821044f880e0f459af980da13526211b 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -76,12 +76,19 @@ class TypeInferenceMapper(CombineMapper):
     # /!\ Introduce caches with care--numpy.float32(x) and numpy.float64(x)
     # are Python-equal (for many common constants such as integers).
 
+    def with_assignments(self, names_to_vars):
+        new_ass = self.new_assignments.copy()  # copy: self stays unmodified
+        new_ass.update(names_to_vars)
+        return type(self)(self.kernel, new_ass)  # new mapper of the same class
+
     @staticmethod
     def combine(dtypes):
         # dtypes may just be a generator expr
         dtypes = list(dtypes)
 
-        from loopy.types import NumpyType
+        from loopy.types import LoopyType, NumpyType
+        assert all(isinstance(dtype, LoopyType) for dtype in dtypes)
+
         if not all(isinstance(dtype, NumpyType) for dtype in dtypes):
             from pytools import is_single_valued, single_valued
             if not is_single_valued(dtypes):
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index d03ce86b6a800d9ee5d96d94caa9053fd8b1d458..90dd27133c03d6dfde5cb49707b25980d28348f1 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -314,14 +314,14 @@ def reduction_function_mangler(kernel, func_id, arg_dtypes):
     return None
 
 
-def reduction_preamble_generator(kernel, seen_dtypes, seen_functions):
+def reduction_preamble_generator(preamble_info):
     from loopy.target.opencl import OpenCLTarget
 
-    for func in seen_functions:
+    for func in preamble_info.seen_functions:
         if isinstance(func.name, ArgExtFunction):
-            if not isinstance(kernel.target, OpenCLTarget):
+            if not isinstance(preamble_info.kernel.target, OpenCLTarget):
                 raise LoopyError("only OpenCL supported for now")
 
-            yield get_argext_preamble(kernel.target, func.name)
+            yield get_argext_preamble(preamble_info.kernel.target, func.name)
 
 # vim: fdm=marker
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index 85e58a809e51ff8067c9effb59240fb5125b46db..c8c2324e828a6c69696c4e6a828eb79c29d230e8 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -133,6 +133,10 @@ class TargetBase(object):
     def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written):
         raise NotImplementedError()
 
+    def generate_atomic_update(self, kernel, codegen_state, lhs_atomicity,
+            lhs_expr, rhs_expr, lhs_dtype, rhs_type_context=None):
+        raise NotImplementedError("atomic update")  # no generic implementation
+
     # }}}
 
 # vim: foldmethod=marker
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 5d6a856d14f57b277e6c083717346bbeb11af7b6..f9a5e4390d07231e3933127e262e86e58c4c5279 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -28,14 +28,50 @@ import six
 
 import numpy as np  # noqa
 from loopy.target import TargetBase
+from loopy.diagnostic import LoopyError
 
 from pytools import memoize_method
 
 
+# {{{ dtype registry wrapper
+
+class DTypeRegistryWrapper(object):
+    def __init__(self, wrapped_registry):
+        self.wrapped_registry = wrapped_registry
+
+    def get_or_register_dtype(self, names, dtype=None):
+        if dtype is not None:
+            from loopy.types import LoopyType, NumpyType
+            assert isinstance(dtype, LoopyType)
+            # NumpyType is unwrapped to its .dtype; other LoopyTypes are rejected
+            if isinstance(dtype, NumpyType):
+                return self.wrapped_registry.get_or_register_dtype(
+                        names, dtype.dtype)
+            else:
+                raise LoopyError(
+                        "unable to get or register type '%s'"
+                        % dtype)
+        else:
+            return self.wrapped_registry.get_or_register_dtype(names, dtype)
+
+    def dtype_to_ctype(self, dtype):
+        from loopy.types import LoopyType, NumpyType
+        assert isinstance(dtype, LoopyType)
+        # only NumpyType is forwarded; any other LoopyType raises LoopyError
+        if isinstance(dtype, NumpyType):
+            return self.wrapped_registry.dtype_to_ctype(dtype)
+        else:
+            raise LoopyError(
+                    "unable to convert type '%s' to C"
+                    % dtype)
+
+# }}}
+
+
 # {{{ preamble generator
 
-def _preamble_generator(kernel, seen_dtypes, seen_functions):
-    c_funcs = set(func.c_name for func in seen_functions)
+def _preamble_generator(preamble_info):
+    c_funcs = set(func.c_name for func in preamble_info.seen_functions)
     if "int_floor_div" in c_funcs:
         yield ("05_int_floor_div", """
             #define int_floor_div(a,b) \
@@ -75,7 +111,7 @@ class CTarget(TargetBase):
         result = DTypeRegistry()
         fill_registry_with_c_types(result, respect_windows=False,
                 include_bool=True)
-        return result
+        return DTypeRegistryWrapper(result)
 
     def is_vector_dtype(self, dtype):
         return False
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index ce6e039fa3f60feae72458308915f87babab2867..63a053c586362a10fa2eea70b83b2659646f62d0 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -1,7 +1,4 @@
-from __future__ import division
-from __future__ import absolute_import
-from six.moves import range
-from six.moves import zip
+from __future__ import division, absolute_import
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -26,6 +23,8 @@ THE SOFTWARE.
 """
 
 
+from six.moves import range, zip
+
 import numpy as np
 
 from pymbolic.mapper import RecursiveMapper
@@ -43,17 +42,24 @@ from loopy.types import LoopyType
 # {{{ C code mapper
 
 class LoopyCCodeMapper(RecursiveMapper):
-    def __init__(self, codegen_state, fortran_abi=False):
+    def __init__(self, codegen_state, fortran_abi=False, type_inf_mapper=None):
         self.kernel = codegen_state.kernel
         self.codegen_state = codegen_state
 
-        self.type_inf_mapper = TypeInferenceMapper(self.kernel)
+        if type_inf_mapper is None:
+            type_inf_mapper = TypeInferenceMapper(self.kernel)
+        self.type_inf_mapper = type_inf_mapper
+
         self.allow_complex = codegen_state.allow_complex
 
         self.fortran_abi = fortran_abi
 
     # {{{ helpers
 
+    def with_assignments(self, names_to_vars):  # returns a new mapper
+        type_inf_mapper = self.type_inf_mapper.with_assignments(names_to_vars)
+        return type(self)(self.codegen_state, self.fortran_abi, type_inf_mapper)
+
     def infer_type(self, expr):
         result = self.type_inf_mapper(expr)
         assert isinstance(result, LoopyType)
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 68be1ce3bf9d579592064e4707910802a70cdbc1..1e224ff1f87f37270802736fe441cfebff4c9a4a 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -31,6 +31,34 @@ from loopy.target.c.codegen.expression import LoopyCCodeMapper
 from pytools import memoize_method
 from loopy.diagnostic import LoopyError
 from loopy.types import NumpyType
+from loopy.target.c import DTypeRegistryWrapper
+
+
+# {{{ dtype registry wrappers
+
+class DTypeRegistryWrapperWithAtomics(DTypeRegistryWrapper):
+    """Registry wrapper that registers atomic types as their plain NumpyType."""
+    def get_or_register_dtype(self, names, dtype=None):
+        if dtype is not None:
+            from loopy.types import AtomicNumpyType, NumpyType
+            if isinstance(dtype, AtomicNumpyType):
+                dtype = NumpyType(dtype.dtype)
+
+        return super(DTypeRegistryWrapperWithAtomics, self).get_or_register_dtype(
+                names, dtype)
+
+
+class DTypeRegistryWrapperWithCL1Atomics(DTypeRegistryWrapperWithAtomics):
+    def dtype_to_ctype(self, dtype):
+        from loopy.types import AtomicNumpyType
+        # atomic types become the wrapped registry's C type, marked volatile
+        if isinstance(dtype, AtomicNumpyType):
+            return "volatile " + self.wrapped_registry.dtype_to_ctype(dtype)
+        else:
+            return super(DTypeRegistryWrapperWithCL1Atomics, self).dtype_to_ctype(
+                    dtype)
+
+# }}}
 
 
 # {{{ vector types
@@ -159,10 +187,10 @@ def opencl_symbol_mangler(kernel, name):
 
 # {{{ preamble generator
 
-def opencl_preamble_generator(kernel, seen_dtypes, seen_functions):
+def opencl_preamble_generator(preamble_info):
     has_double = False
 
-    for dtype in seen_dtypes:
+    for dtype in preamble_info.seen_dtypes:
         if dtype in [np.float64, np.complex128]:
             has_double = True
 
@@ -173,6 +201,17 @@ def opencl_preamble_generator(kernel, seen_dtypes, seen_functions):
             #endif
             """)
 
+    from loopy.types import AtomicNumpyType
+    seen_64_bit_atomics = any(
+            isinstance(dtype, AtomicNumpyType) and dtype.numpy_dtype.itemsize == 8
+            for dtype in preamble_info.seen_atomic_dtypes)
+
+    if seen_64_bit_atomics:
+        # FIXME: Should gate on "CL1" atomics style
+        yield ("00_enable_64bit_atomics", """
+            #pragma OPENCL EXTENSION cl_khr_int64_base_atomics : enable
+            """)
+
 # }}}
 
 
@@ -194,15 +233,18 @@ class OpenCLTarget(CTarget):
     """A target for the OpenCL C heterogeneous compute programming language.
     """
 
-    def __init__(self, atomics_flavor="cl2"):
+    def __init__(self, atomics_flavor="cl1"):
         """
-        :arg atomics_flavor: one of ``"cl2"`` (C11-style atomics from OpenCL 2.0),
+        :arg atomics_flavor: one of ``"cl2"`` (C11-style atomics from OpenCL 2.0),
             ``"cl1"`` (OpenCL 1.1 atomics, using bit-for-bit compare-and-swap
             for floating point), ``"cl1-exch"`` (OpenCL 1.1 atomics, using
-            double-exchange for floating point).
+            double-exchange for floating point--not yet supported).
         """
         super(OpenCLTarget, self).__init__()
 
+        if atomics_flavor not in ["cl1", "cl2"]:
+            raise ValueError("unsupported atomics flavor: %s" % atomics_flavor)
+
         self.atomics_flavor = atomics_flavor
 
     # {{{ library
@@ -241,7 +283,10 @@ class OpenCLTarget(CTarget):
 
         _register_vector_types(result)
 
-        return result
+        if self.atomics_flavor == "cl1":
+            return DTypeRegistryWrapperWithCL1Atomics(result)
+        else:
+            raise NotImplementedError("atomics flavor: %s" % self.atomics_flavor)
 
     def is_vector_dtype(self, dtype):
         return (isinstance(dtype, NumpyType)
@@ -364,6 +409,91 @@ class OpenCLTarget(CTarget):
 
         return CLConstant(arg_decl)
 
+    # {{{ code generation for atomic update
+
+    def generate_atomic_update(self, kernel, codegen_state, lhs_atomicity,
+            lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
+        from pymbolic.mapper.stringifier import PREC_NONE
+        # Emit a compare-and-swap retry loop that applies rhs_expr to lhs_expr.
+        # FIXME: Could detect operations, generate atomic_{add,...} when
+        # appropriate.
+
+        if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
+                np.int32, np.int64, np.float32, np.float64]:
+            from cgen import Block, DoWhile, Assign
+            from loopy.codegen import POD
+            old_val_var = codegen_state.var_name_generator("loopy_old_val")
+            new_val_var = codegen_state.var_name_generator("loopy_new_val")
+
+            from loopy.kernel.data import TemporaryVariable
+            ecm = codegen_state.expression_to_code_mapper.with_assignments(
+                    {
+                        old_val_var: TemporaryVariable(old_val_var, lhs_dtype),
+                        new_val_var: TemporaryVariable(new_val_var, lhs_dtype),
+                        })
+
+            lhs_expr_code = ecm(lhs_expr, prec=PREC_NONE, type_context=None)
+
+            from pymbolic.mapper.substitutor import make_subst_func
+            from pymbolic import var
+            from loopy.symbolic import SubstitutionMapper
+            # in the RHS, refer to the LHS via its snapshot in old_val_var
+            subst = SubstitutionMapper(
+                    make_subst_func({lhs_expr: var(old_val_var)}))
+            rhs_expr_code = ecm(subst(rhs_expr), prec=PREC_NONE,
+                    type_context=rhs_type_context,
+                    needed_dtype=lhs_dtype)
+            # pick the compare-exchange function name by operand width
+            if lhs_dtype.numpy_dtype.itemsize == 4:
+                func_name = "atomic_cmpxchg"
+            elif lhs_dtype.numpy_dtype.itemsize == 8:
+                func_name = "atom_cmpxchg"
+            else:
+                raise LoopyError("unexpected atomic size")
+
+            cast_str = ""
+            old_val = old_val_var
+            new_val = new_val_var
+            # floats are reinterpreted as same-size integers for the exchange
+            if lhs_dtype.numpy_dtype.kind == "f":
+                if lhs_dtype.numpy_dtype == np.float32:
+                    ctype = "int"
+                elif lhs_dtype.numpy_dtype == np.float64:
+                    ctype = "long"
+                else:
+                    assert False
+
+                old_val = "*(%s *) &" % ctype + old_val
+                new_val = "*(%s *) &" % ctype + new_val
+                cast_str = "(__global %s *) " % ctype
+
+            return Block([
+                POD(self, NumpyType(lhs_dtype.dtype), old_val_var),
+                POD(self, NumpyType(lhs_dtype.dtype), new_val_var),
+                DoWhile(
+                    "%(func_name)s("
+                    "%(cast_str)s&(%(lhs_expr)s), "
+                    "%(old_val)s, "
+                    "%(new_val)s"
+                    ") != %(old_val)s"
+                    % {
+                        "func_name": func_name,
+                        "cast_str": cast_str,
+                        "lhs_expr": lhs_expr_code,
+                        "old_val": old_val,
+                        "new_val": new_val,
+                        },
+                    Block([
+                        Assign(old_val_var, lhs_expr_code),
+                        Assign(new_val_var, rhs_expr_code),
+                        ])
+                    )
+                ])
+        else:
+            raise NotImplementedError("atomic update for '%s'" % lhs_dtype)
+
+    # }}}
+
     # }}}
 
 # }}}
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index e17f9515ad8528244ab585bb3826ae8a473c4b78..add5bd9e235289350cf53af5b3a9a07049df053f 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -207,12 +207,12 @@ def pyopencl_function_mangler(target, name, arg_dtypes):
 
 # {{{ preamble generator
 
-def pyopencl_preamble_generator(target, seen_dtypes, seen_functions):
+def pyopencl_preamble_generator(preamble_info):
     has_double = False
     has_complex = False
 
     from loopy.types import NumpyType
-    for dtype in seen_dtypes:
+    for dtype in preamble_info.seen_dtypes:
         if (isinstance(dtype, NumpyType)
                 and dtype.dtype in [np.float64, np.complex128]):
             has_double = True
@@ -290,9 +290,15 @@ class PyOpenCLTarget(OpenCLTarget):
         try:
             from pyopencl.compyte.dtypes import TYPE_REGISTRY
         except ImportError:
-            return _LegacyTypeRegistryStub()
+            result = _LegacyTypeRegistryStub()
         else:
-            return TYPE_REGISTRY
+            result = TYPE_REGISTRY
+
+        from loopy.target.opencl import DTypeRegistryWrapperWithCL1Atomics
+        if self.atomics_flavor == "cl1":
+            return DTypeRegistryWrapperWithCL1Atomics(result)
+        else:
+            raise NotImplementedError("atomics flavor: %s" % self.atomics_flavor)
 
     def is_vector_dtype(self, dtype):
         from pyopencl.array import vec
diff --git a/test/test_loopy.py b/test/test_loopy.py
index c7d7755e7a7b51d8085d57b7263b0249bd9520e1..5929e7aa724b18f808260c3d594c91a5fb8e8b7e 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2421,24 +2421,29 @@ def test_chunk_iname(ctx_factory):
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=130))
 
 
-def test_atomic(ctx_factory):
+@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])
+def test_atomic(ctx_factory, dtype):
     ctx = ctx_factory()
 
+    if (
+            np.dtype(dtype).itemsize == 8
+            and "cl_khr_int64_base_atomics" not in ctx.devices[0].extensions):
+        pytest.skip("64-bit atomics not supported on device")
+
     knl = lp.make_kernel(
             "{ [i]: 0<=i<n }",
             "out[i%20] = out[i%20] + 2*a[i] {atomic}",
             [
-                lp.GlobalArg("out", np.float32, shape=lp.auto,
-                    for_atomic=True),
-                lp.GlobalArg("a", np.float32, shape=lp.auto),
+                lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True),
+                lp.GlobalArg("a", dtype, shape=lp.auto),
                 "..."
                 ],
             assumptions="n>0")
 
     ref_knl = knl
-    knl = lp.chunk_iname(knl, "i", 3, inner_tag="l.0")
-    knl = lp.set_loop_priority(knl, "i_outer, i_inner")
-    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=130))
+    knl = lp.split_iname(knl, "i", 512)
+    knl = lp.split_iname(knl, "i_inner", 128, outer_tag="unr", inner_tag="g.0")
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000))
 
 
 if __name__ == "__main__":