diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index 33d40385b529f72e54da65238304e87bdb2cddab..088bedc9eeb77723483149dcc445d8aa84ddbbb7 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -262,6 +262,9 @@ These are usually key-value pairs. The following attributes are recognized:
   given instruction groups. See
   :class:`InstructionBase.conflicts_with_groups`.
 
+* ``atomic`` The update embodied by the assignment is carried out
+  atomically. See :attr:`Assignment.atomicity` for precise semantics.
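+
+  For example, the following instruction requests that the update of
+  ``out`` be carried out atomically::
+
+      out[i%20] = out[i%20] + 2*a[i] {atomic}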
+
 .. _expression-syntax:
 
 Expressions
@@ -286,6 +289,19 @@ C Block Instructions
 
 .. autoclass:: CInstruction
 
+Atomic Operations
+^^^^^^^^^^^^^^^^^
+
+.. autoclass:: memory_ordering
+
+.. autoclass:: memory_scope
+
+.. autoclass:: VarAtomicity
+
+.. autoclass:: AtomicInit
+
+.. autoclass:: AtomicUpdate
+
 .. }}}
 
 Data: Arguments and Temporaries
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 5e3ad508596a20cce8a084e4d70604cdafa47c79..14bce615a29ef20ef3b5d18b2083cf0cdd98e2f4 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -40,6 +40,7 @@ from loopy.kernel.data import (
         auto,
         KernelArgument,
         ValueArg, GlobalArg, ConstantArg, ImageArg,
+        memory_ordering, memory_scope, VarAtomicity, AtomicInit, AtomicUpdate,
         InstructionBase, Assignment, ExpressionInstruction, CInstruction,
         TemporaryVariable,
         SubstitutionRule)
@@ -125,6 +126,8 @@ __all__ = [
         "LoopKernel", "kernel_state",
 
         "KernelArgument",
+        "memory_ordering", "memory_scope", "VarAtomicity",
+        "AtomicInit", "AtomicUpdate",
         "ValueArg", "ScalarArg", "GlobalArg", "ArrayArg", "ConstantArg", "ImageArg",
         "TemporaryVariable",
         "SubstitutionRule",
diff --git a/loopy/check.py b/loopy/check.py
index 5a1c9e83b52e08f0874f61b500d853b925201731..5ae22cac5e5af8b727ae0ba577fc05a8e571b8a7 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -373,6 +373,37 @@ def pre_schedule_checks(kernel):
 
 # {{{ pre-code-generation checks
 
+def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel):
+    from loopy.kernel.data import ArrayBase, Assignment
+    from loopy.types import AtomicType
+    atomicity_candidates = (
+            set(v.name for v in six.itervalues(kernel.temporary_variables)
+                if isinstance(v.dtype, AtomicType))
+            |
+            set(v.name for v in kernel.args
+                if isinstance(v, ArrayBase)
+                and isinstance(v.dtype, AtomicType)))
+
+    for insn in kernel.instructions:
+        if not isinstance(insn, Assignment):
+            continue
+
+        atomic_accesses = set(a.var_name for a in insn.atomicity)
+        if not atomic_accesses <= atomicity_candidates:
+            raise LoopyError("atomic access in instruction '%s' to "
+                    "non-atomic variable(s) '%s'"
+                    % (insn.id,
+                        ",".join(atomic_accesses - atomicity_candidates)))
+
+        accessed_atomic_vars = insn.dependency_names() & atomicity_candidates
+        if not accessed_atomic_vars <= atomic_accesses:
+            raise LoopyError("atomic variable(s) '%s' in instruction '%s' "
+                    "used in non-atomic access"
+                    % (
+                        ",".join(accessed_atomic_vars - atomic_accesses),
+                        insn.id))
+
+
 def check_that_shapes_and_strides_are_arguments(kernel):
     from loopy.kernel.data import ValueArg
     from loopy.kernel.array import ArrayBase, FixedStrideArrayDimTag
@@ -383,7 +414,7 @@ def check_that_shapes_and_strides_are_arguments(kernel):
             arg.name
             for arg in kernel.args
             if isinstance(arg, ValueArg)
-            and arg.dtype.kind == "i")
+            and arg.dtype.is_integral())
 
     for arg in kernel.args:
         if isinstance(arg, ArrayBase):
@@ -417,6 +448,7 @@ def pre_codegen_checks(kernel):
     try:
         logger.info("pre-codegen check %s: start" % kernel.name)
 
+        check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel)
         kernel.target.pre_codegen_check(kernel)
         check_that_shapes_and_strides_are_arguments(kernel)
 
diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 91af52cde0000ce811172be53bf21e13ce5fde97..058af59bcc59593189948a0fbd15c7d09349f05c 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -425,11 +425,12 @@ class ImplementedDataInfo(Record):
             offset_for_name=None, stride_for_name_and_axis=None,
             allows_offset=None):
 
-        from loopy.tools import PicklableDtype
+        from loopy.types import LoopyType
+        assert isinstance(dtype, LoopyType)
 
         Record.__init__(self,
                 name=name,
-                picklable_dtype=PicklableDtype(dtype, target=target),
+                dtype=dtype,
                 cgen_declarator=cgen_declarator,
                 arg_class=arg_class,
                 base_name=base_name,
@@ -441,14 +442,6 @@ class ImplementedDataInfo(Record):
                 stride_for_name_and_axis=stride_for_name_and_axis,
                 allows_offset=allows_offset)
 
-    @property
-    def dtype(self):
-        from loopy.tools import PicklableDtype
-        if isinstance(self.picklable_dtype, PicklableDtype):
-            return self.picklable_dtype.dtype
-        else:
-            return self.picklable_dtype
-
 # }}}
 
 
@@ -528,7 +521,7 @@ def generate_code(kernel, device=None):
 
     allow_complex = False
     for var in kernel.args + list(six.itervalues(kernel.temporary_variables)):
-        if var.dtype.kind == "c":
+        if var.dtype.involves_complex():
             allow_complex = True
 
     # }}}
@@ -619,7 +612,7 @@ def generate_body(kernel):
 
     allow_complex = False
     for var in kernel.args + list(six.itervalues(kernel.temporary_variables)):
-        if var.dtype.kind == "c":
+        if var.dtype.involves_complex():
             allow_complex = True
 
     seen_dtypes = set()
diff --git a/loopy/compiled.py b/loopy/compiled.py
index 32536021dcfb3a1460f23af7e066aff9bf772f46..2125fd66347c99adb4014a60c34a6ce44bc05e95 100644
--- a/loopy/compiled.py
+++ b/loopy/compiled.py
@@ -1,8 +1,4 @@
-from __future__ import division, with_statement
-from __future__ import absolute_import
-import six
-from six.moves import range
-from six.moves import zip
+from __future__ import division, with_statement, absolute_import
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
 
@@ -26,6 +22,8 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
+import six
+from six.moves import range, zip
 
 import sys
 import numpy as np
@@ -34,6 +32,7 @@ from loopy.diagnostic import ParameterFinderWarning
 from pytools.py_codegen import (
         Indentation, PythonFunctionGenerator)
 from loopy.diagnostic import LoopyError
+from loopy.types import NumpyType
 
 import logging
 logger = logging.getLogger(__name__)
@@ -162,7 +161,7 @@ def generate_integer_arg_finding_from_shapes(gen, kernel, impl_arg_info, options
                 if len(deps) == 1:
                     integer_arg_var, = deps
 
-                    if kernel.arg_dict[integer_arg_var.name].dtype.kind == "i":
+                    if kernel.arg_dict[integer_arg_var.name].dtype.is_integral():
                         from pymbolic.algorithm import solve_affine_equations_for
                         try:
                             # friggin' overkill :)
@@ -279,7 +278,7 @@ def generate_integer_arg_finding_from_strides(gen, kernel, impl_arg_info, option
                     if not options.skip_arg_checks:
                         gen("%s, _lpy_remdr = divmod(%s.strides[%d], %d)"
                                 % (arg.name, impl_array_name, stride_impl_axis,
-                                    base_arg.dtype.itemsize))
+                                    base_arg.dtype.dtype.itemsize))
 
                         gen("assert _lpy_remdr == 0, \"Stride %d of array '%s' is "
                                 "not divisible by its dtype itemsize\""
@@ -360,16 +359,20 @@ def generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options):
                         "be supplied")
                 """.format(name=arg.name))
 
-        if sys.version_info < (2, 7) and arg.dtype.kind == "i":
+        if sys.version_info < (2, 7) and arg.dtype.is_integral():
             gen("# cast to long to avoid trouble with struct packing")
             gen("%s = long(%s)" % (arg.name, arg.name))
             gen("")
 
-        if arg.dtype.char == "V":
+        if arg.dtype.is_composite():
             gen("cl_kernel.set_arg(%d, %s)" % (cl_arg_idx, arg.name))
             cl_arg_idx += 1
 
-        elif arg.dtype.kind == "c":
+        elif arg.dtype.is_complex():
+            assert isinstance(arg.dtype, NumpyType)
+
+            dtype = arg.dtype
+
             if warn_about_arg_count_bug:
                 from warnings import warn
                 warn("{knl_name}: arguments include complex numbers, and "
@@ -378,15 +381,15 @@ def generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options):
                         "disabled".format(
                             knl_name=kernel.name))
 
-            if arg.dtype == np.complex64:
+            if dtype.numpy_dtype == np.complex64:
                 arg_char = "f"
-            elif arg.dtype == np.complex128:
+            elif dtype.numpy_dtype == np.complex128:
                 arg_char = "d"
             else:
-                raise TypeError("unexpected complex type: %s" % arg.dtype)
+                raise TypeError("unexpected complex type: %s" % dtype)
 
             if (work_around_arg_count_bug
-                    and arg.dtype == np.complex128
+                    and dtype.numpy_dtype == np.complex128
                     and fp_arg_count + 2 <= 8):
                 gen(
                         "buf = _lpy_pack('{arg_char}', {arg_var}.real)"
@@ -415,15 +418,19 @@ def generate_value_arg_setup(gen, kernel, cl_kernel, impl_arg_info, options):
 
             fp_arg_count += 2
 
-        else:
-            if arg.dtype.kind == "f":
+        elif isinstance(arg.dtype, NumpyType):
+            if arg.dtype.dtype.kind == "f":
                 fp_arg_count += 1
 
             gen("cl_kernel.set_arg(%d, _lpy_pack('%s', %s))"
-                    % (cl_arg_idx, arg.dtype.char, arg.name))
+                    % (cl_arg_idx, arg.dtype.dtype.char, arg.name))
 
             cl_arg_idx += 1
 
+        else:
+            raise LoopyError("do not know how to pass argument of type '%s'"
+                    % arg.dtype)
+
         gen("")
 
         gen("# }}}")
@@ -510,6 +517,10 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options,
         if is_written and arg.arg_class in [lp.GlobalArg, lp.ConstantArg] \
                 and arg.shape is not None:
 
+            if not isinstance(arg.dtype, NumpyType):
+                raise LoopyError("do not know how to pass arg of type '%s'"
+                        % arg.dtype)
+
             possibly_made_by_loopy = True
             gen("_lpy_made_by_loopy = False")
             gen("")
@@ -520,7 +531,7 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options,
                 for i in range(num_axes):
                     gen("_lpy_shape_%d = %s" % (i, strify(arg.unvec_shape[i])))
 
-                itemsize = kernel_arg.dtype.itemsize
+                itemsize = kernel_arg.dtype.numpy_dtype.itemsize
                 for i in range(num_axes):
                     gen("_lpy_strides_%d = %s" % (i, strify(
                         itemsize*arg.unvec_strides[i])))
@@ -550,7 +561,7 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options,
                             name=arg.name,
                             shape=strify(sym_shape),
                             strides=strify(sym_strides),
-                            dtype=python_dtype_str(arg.dtype)))
+                            dtype=python_dtype_str(kernel_arg.dtype.numpy_dtype)))
 
                 if not options.skip_arg_checks:
                     for i in range(num_axes):
@@ -575,7 +586,7 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options,
 
             with Indentation(gen):
                 gen("if %s.dtype != %s:"
-                        % (arg.name, python_dtype_str(kernel_arg.dtype)))
+                        % (arg.name, python_dtype_str(kernel_arg.dtype.numpy_dtype)))
                 with Indentation(gen):
                     gen("raise TypeError(\"dtype mismatch on argument '%s' "
                             "(got: %%s, expected: %s)\" %% %s.dtype)"
@@ -630,7 +641,7 @@ def generate_array_arg_setup(gen, kernel, impl_arg_info, options,
                 # }}}
 
                 if arg.unvec_strides and kernel_arg.dim_tags:
-                    itemsize = kernel_arg.dtype.itemsize
+                    itemsize = kernel_arg.dtype.numpy_dtype.itemsize
                     sym_strides = tuple(
                             itemsize*s_i for s_i in arg.unvec_strides)
                     gen("if %s.strides != %s:"
diff --git a/loopy/expression.py b/loopy/expression.py
index 62c2278be5b863105e45a8c0c6d517cdf2b92fd2..a363013ea0549827438124150a1e4c0c2af94c64 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -28,6 +28,7 @@ import numpy as np
 from pymbolic.mapper import CombineMapper, RecursiveMapper
 
 from loopy.tools import is_integer
+from loopy.types import NumpyType
 from loopy.codegen import Unvectorizable
 from loopy.diagnostic import (
         LoopyError,
@@ -41,13 +42,13 @@ from loopy.diagnostic import (
 # or None for 'no known context'.
 
 def dtype_to_type_context(target, dtype):
-    dtype = np.dtype(dtype)
+    from loopy.types import NumpyType
 
-    if dtype.kind == 'i':
+    if dtype.is_integral():
         return 'i'
-    if dtype in [np.float64, np.complex128]:
+    if isinstance(dtype, NumpyType) and dtype.dtype in [np.float64, np.complex128]:
         return 'd'
-    if dtype in [np.float32, np.complex64]:
+    if isinstance(dtype, NumpyType) and dtype.dtype in [np.float32, np.complex64]:
         return 'f'
     if target.is_vector_dtype(dtype):
         return dtype_to_type_context(target, dtype.fields["x"][0])
@@ -76,8 +77,21 @@ class TypeInferenceMapper(CombineMapper):
 
     @staticmethod
     def combine(dtypes):
+        # dtypes may just be a generator expr
         dtypes = list(dtypes)
 
+        from loopy.types import NumpyType
+        if not all(isinstance(dtype, NumpyType) for dtype in dtypes):
+            from pytools import is_single_valued, single_valued
+            if not is_single_valued(dtypes):
+                raise TypeInferenceFailure(
+                        "Nothing known about operations between '%s'"
+                        % ", ".join(str(dt) for dt in dtypes))
+
+            return single_valued(dtypes)
+
+        dtypes = [dtype.dtype for dtype in dtypes]
+
         result = dtypes.pop()
         while dtypes:
             other = dtypes.pop()
@@ -107,7 +121,7 @@ class TypeInferenceMapper(CombineMapper):
                             "nothing known about result of operation on "
                             "'%s' and '%s'" % (result, other))
 
-        return result
+        return NumpyType(result)
 
     def map_sum(self, expr):
         dtypes = []
@@ -120,7 +134,7 @@ class TypeInferenceMapper(CombineMapper):
                 dtypes.append(dtype)
 
         from pytools import all
-        if all(dtype.kind == "i" for dtype in dtypes):
+        if all(dtype.is_integral() for dtype in dtypes):
             dtypes.extend(small_integer_dtypes)
 
         return self.combine(dtypes)
@@ -131,9 +145,9 @@ class TypeInferenceMapper(CombineMapper):
         n_dtype = self.rec(expr.numerator)
         d_dtype = self.rec(expr.denominator)
 
-        if n_dtype.kind in "iu" and d_dtype.kind in "iu":
+        if n_dtype.is_integral() and d_dtype.is_integral():
             # both integers
-            return np.dtype(np.float64)
+            return NumpyType(np.dtype(np.float64))
 
         else:
             return self.combine([n_dtype, d_dtype])
@@ -143,20 +157,20 @@ class TypeInferenceMapper(CombineMapper):
             for tp in [np.int32, np.int64]:
                 iinfo = np.iinfo(tp)
                 if iinfo.min <= expr <= iinfo.max:
-                    return np.dtype(tp)
+                    return NumpyType(np.dtype(tp))
 
             else:
                 raise TypeInferenceFailure("integer constant '%s' too large" % expr)
 
         dt = np.asarray(expr).dtype
         if hasattr(expr, "dtype"):
-            return expr.dtype
+            return NumpyType(expr.dtype)
         elif isinstance(expr, np.number):
             # Numpy types are sized
-            return np.dtype(type(expr))
+            return NumpyType(np.dtype(type(expr)))
         elif dt.kind == "f":
             # deduce the smaller type by default
-            return np.dtype(np.float32)
+            return NumpyType(np.dtype(np.float32))
         elif dt.kind == "c":
             if np.complex64(expr) == np.complex128(expr):
                 # (COMPLEX_GUESS_LOGIC)
@@ -245,13 +259,13 @@ class TypeInferenceMapper(CombineMapper):
 
     def map_lookup(self, expr):
         agg_result = self.rec(expr.aggregate)
-        dtype, offset = agg_result.fields[expr.name]
-        return dtype
+        dtype, offset = agg_result.numpy_dtype.fields[expr.name]
+        return NumpyType(dtype)
 
     def map_comparison(self, expr):
         # "bool" is unusable because OpenCL's bool has indeterminate memory
         # format.
-        return np.dtype(np.int32)
+        return NumpyType(np.dtype(np.int32))
 
     map_logical_not = map_comparison
     map_logical_and = map_comparison
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index fb081a959523114ee5fc823eec59420f4adf3457..22a45fac996f6eaf1737fb28ab51ab9ac4f9ead7 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -254,10 +254,11 @@ class LoopKernel(RecordWithoutPickling):
 
         # }}}
 
-        index_dtype = np.dtype(index_dtype)
-        if index_dtype.kind != 'i':
+        from loopy.types import to_loopy_type
+        index_dtype = to_loopy_type(index_dtype).with_target(target)
+        if not index_dtype.is_integral():
             raise TypeError("index_dtype must be an integer")
-        if np.iinfo(index_dtype).min >= 0:
+        if np.iinfo(index_dtype.numpy_dtype).min >= 0:
             raise TypeError("index_dtype must be signed")
 
         if get_grid_sizes is not None:
@@ -1043,6 +1044,9 @@ class LoopKernel(RecordWithoutPickling):
                 options.append("priority=%d" % insn.priority)
             if insn.tags:
                 options.append("tags=%s" % ":".join(insn.tags))
+            if isinstance(insn, lp.Assignment) and insn.atomicity:
+                options.append("atomic=%s" % ":".join(
+                    str(a) for a in insn.atomicity))
             if insn.groups:
                 options.append("groups=%s" % ":".join(insn.groups))
             if insn.conflicts_with_groups:
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 6deb2a348b606d7d717b290e900d7bda6169fb4e..1ad83c0a718b60194cb5789ced7d0a9c7674de75 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -551,7 +551,8 @@ class ArrayBase(Record):
     allowed_extra_kwargs = []
 
     def __init__(self, name, dtype=None, shape=None, dim_tags=None, offset=0,
-            dim_names=None, strides=None, order=None, **kwargs):
+            dim_names=None, strides=None, order=None, for_atomic=False,
+            **kwargs):
         """
         All of the following are optional. Specify either strides or shape.
 
@@ -611,6 +612,9 @@ class ArrayBase(Record):
         :arg order: "F" or "C" for C (row major) or Fortran
             (column major). Defaults to the *default_order* argument
             passed to :func:`loopy.make_kernel`.
+        :arg for_atomic:
+            Whether the array is declared for atomic access and, if necessary,
+            is declared using an atomic-capable data type.
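+
+            For example::
+
+                lp.GlobalArg("out", np.float32, shape=lp.auto, for_atomic=True)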
         :arg offset: Offset from the beginning of the buffer to the point from
             which the strides are counted. May be one of
 
@@ -629,22 +633,9 @@ class ArrayBase(Record):
 
         import loopy as lp
 
-        if dtype is not None and dtype is not lp.auto:
-            from loopy.tools import PicklableDtype
-            if not isinstance(dtype, PicklableDtype):
-                picklable_dtype = PicklableDtype(dtype)
-            else:
-                picklable_dtype = dtype
-
-            if picklable_dtype.dtype == object:
-                raise TypeError("loopy does not directly support object arrays "
-                        "(object dtype encountered on array '%s') "
-                        "-- you may want to tag the relevant array axis as 'sep'"
-                        % name)
-        else:
-            picklable_dtype = dtype
-
-        del dtype
+        from loopy.types import to_loopy_type
+        dtype = to_loopy_type(dtype, allow_auto=True, allow_none=True,
+                for_atomic=for_atomic)
 
         strides_known = strides is not None and strides is not lp.auto
         shape_known = shape is not None and shape is not lp.auto
@@ -766,7 +757,7 @@ class ArrayBase(Record):
 
         Record.__init__(self,
                 name=name,
-                picklable_dtype=picklable_dtype,
+                dtype=dtype,
                 shape=shape,
                 dim_tags=dim_tags,
                 offset=offset,
@@ -781,7 +772,7 @@ class ArrayBase(Record):
         return (
                 type(self) == type(other)
                 and self.name == other.name
-                and self.picklable_dtype == other.picklable_dtype
+                and self.dtype == other.dtype
                 and istoee(self.shape, other.shape)
                 and self.dim_tags == other.dim_tags
                 and isee(self.offset, other.offset)
@@ -792,23 +783,6 @@ class ArrayBase(Record):
     def __ne__(self, other):
         return not self.__eq__(other)
 
-    @property
-    def dtype(self):
-        from loopy.tools import PicklableDtype
-        if isinstance(self.picklable_dtype, PicklableDtype):
-            return self.picklable_dtype.dtype
-        else:
-            return self.picklable_dtype
-
-    def get_copy_kwargs(self, **kwargs):
-        result = Record.get_copy_kwargs(self, **kwargs)
-        if "dtype" not in result:
-            result["dtype"] = self.dtype
-
-        del result["picklable_dtype"]
-
-        return result
-
     def stringify(self, include_typename):
         import loopy as lp
 
diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 830fe0efadb0425673a9c6ac4f62050d384dcd3f..aedc1edc650d570cdeec1ecca8dc08989678df65 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -30,6 +30,7 @@ from loopy.tools import intern_frozenset_of_ids
 from loopy.symbolic import IdentityMapper, WalkMapper
 from loopy.kernel.data import (
         InstructionBase, Assignment, SubstitutionRule)
+from loopy.diagnostic import LoopyError
 import islpy as isl
 from islpy import dim_type
 
@@ -162,6 +163,8 @@ def parse_insn(insn):
         and *inames_to_dup* is None or a list of tuples `(old, new)`.
     """
 
+    import loopy as lp
+
     insn_match = INSN_RE.match(insn)
     subst_match = SUBST_RE.match(insn)
     if insn_match is not None and subst_match is not None:
@@ -189,6 +192,18 @@ def parse_insn(insn):
                 "the following error occurred:" % groups["rhs"])
         raise
 
+    from pymbolic.primitives import Variable, Subscript, Call
+    if isinstance(lhs, Variable):
+        assignee_name = lhs.name
+    elif isinstance(lhs, Subscript):
+        assignee_name = lhs.aggregate.name
+    elif isinstance(lhs, Call):
+        assignee_name = None
+        assert subst_match is not None
+    else:
+        raise LoopyError("left hand side of assignment '%s' must "
+                "be variable or subscript" % lhs)
+
     if insn_match is not None:
         depends_on = None
         depends_on_is_final = False
@@ -201,6 +216,7 @@ def parse_insn(insn):
         forced_iname_deps = frozenset()
         predicates = frozenset()
         tags = ()
+        atomicity = ()
 
         if groups["options"] is not None:
             for option in groups["options"].split(","):
@@ -216,13 +232,13 @@ def parse_insn(insn):
                     opt_key = option[:equal_idx].strip()
                     opt_value = option[equal_idx+1:].strip()
 
-                if opt_key == "id":
+                if opt_key == "id" and opt_value is not None:
                     insn_id = intern(opt_value)
-                elif opt_key == "id_prefix":
+                elif opt_key == "id_prefix" and opt_value is not None:
                     insn_id = UniqueName(opt_value)
-                elif opt_key == "priority":
+                elif opt_key == "priority" and opt_value is not None:
                     priority = int(opt_value)
-                elif opt_key == "dup":
+                elif opt_key == "dup" and opt_value is not None:
                     for value in opt_value.split(":"):
                         arrow_idx = value.find("->")
                         if arrow_idx >= 0:
@@ -231,7 +247,7 @@ def parse_insn(insn):
                         else:
                             inames_to_dup.append((value, None))
 
-                elif opt_key == "dep":
+                elif opt_key == "dep" and opt_value is not None:
                     if opt_value.startswith("*"):
                         depends_on_is_final = True
                         opt_value = (opt_value[1:]).strip()
@@ -240,17 +256,17 @@ def parse_insn(insn):
                             intern(dep.strip()) for dep in opt_value.split(":")
                             if dep.strip())
 
-                elif opt_key == "groups":
+                elif opt_key == "groups" and opt_value is not None:
                     insn_groups = frozenset(
                             intern(grp.strip()) for grp in opt_value.split(":")
                             if grp.strip())
 
-                elif opt_key == "conflicts":
+                elif opt_key == "conflicts" and opt_value is not None:
                     conflicts_with_groups = frozenset(
                             intern(grp.strip()) for grp in opt_value.split(":")
                             if grp.strip())
 
-                elif opt_key == "inames":
+                elif opt_key == "inames" and opt_value is not None:
                     if opt_value.startswith("+"):
                         forced_iname_deps_is_final = False
                         opt_value = (opt_value[1:]).strip()
@@ -259,16 +275,32 @@ def parse_insn(insn):
 
                     forced_iname_deps = intern_frozenset_of_ids(opt_value.split(":"))
 
-                elif opt_key == "if":
+                elif opt_key == "if" and opt_value is not None:
                     predicates = intern_frozenset_of_ids(opt_value.split(":"))
 
-                elif opt_key == "tags":
+                elif opt_key == "tags" and opt_value is not None:
                     tags = tuple(
                             tag.strip() for tag in opt_value.split(":")
                             if tag.strip())
 
+                elif opt_key == "atomic":
+                    if opt_value is None:
+                        atomicity = atomicity + (
+                                lp.AtomicUpdate(assignee_name),)
+                    else:
+                        for v in opt_value.split(":"):
+                            if v == "init":
+                                atomicity = atomicity + (
+                                        lp.AtomicInit(assignee_name),)
+                            else:
+                                raise LoopyError("atomicity directive not "
+                                        "understood: %s"
+                                        % v)
+
                 else:
-                    raise ValueError("unrecognized instruction option '%s'"
+                    raise ValueError(
+                            "unrecognized instruction option '%s' "
+                            "(maybe a missing/extraneous =value?)"
                             % opt_key)
 
         if groups["temp_var_type"] is not None:
@@ -280,11 +312,6 @@ def parse_insn(insn):
         else:
             temp_var_type = None
 
-        from pymbolic.primitives import Variable, Subscript
-        if not isinstance(lhs, (Variable, Subscript)):
-            raise RuntimeError("left hand side of assignment '%s' must "
-                    "be variable or subscript" % lhs)
-
         return Assignment(
                     id=(
                         intern(insn_id)
@@ -298,6 +325,7 @@ def parse_insn(insn):
                     forced_iname_deps=forced_iname_deps,
                     assignee=lhs, expression=rhs,
                     temp_var_type=temp_var_type,
+                    atomicity=atomicity,
                     priority=priority,
                     predicates=predicates,
                     tags=tags), inames_to_dup
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 5b0cf57e50298febe0580da5fb0efe5756557527..f89c2b631b5928f6c6fc2ace61b8c3684d1e3b8b 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -26,7 +26,7 @@ THE SOFTWARE.
 
 
 from six.moves import intern
-import numpy as np
+import numpy as np  # noqa
 from pytools import Record, memoize_method
 from loopy.kernel.array import ArrayBase
 from loopy.diagnostic import LoopyError
@@ -191,32 +191,12 @@ class KernelArgument(Record):
         kwargs["name"] = intern(kwargs.pop("name"))
 
         dtype = kwargs.pop("dtype", None)
-
-        if isinstance(dtype, np.dtype):
-            from loopy.tools import PicklableDtype
-            kwargs["picklable_dtype"] = PicklableDtype(dtype)
-        else:
-            kwargs["picklable_dtype"] = dtype
+        from loopy.types import to_loopy_type
+        kwargs["dtype"] = to_loopy_type(
+                dtype, allow_auto=True, allow_none=True)
 
         Record.__init__(self, **kwargs)
 
-    def get_copy_kwargs(self, **kwargs):
-        result = Record.get_copy_kwargs(self, **kwargs)
-        if "dtype" not in result:
-            result["dtype"] = self.dtype
-
-        del result["picklable_dtype"]
-
-        return result
-
-    @property
-    def dtype(self):
-        from loopy.tools import PicklableDtype
-        if isinstance(self.picklable_dtype, PicklableDtype):
-            return self.picklable_dtype.dtype
-        else:
-            return self.picklable_dtype
-
 
 class GlobalArg(ArrayBase, KernelArgument):
     min_target_axes = 0
@@ -251,11 +231,9 @@ class ImageArg(ArrayBase, KernelArgument):
 
 class ValueArg(KernelArgument):
     def __init__(self, name, dtype=None, approximately=1000):
-        from loopy.tools import PicklableDtype
-        if dtype is not None and not isinstance(dtype, PicklableDtype):
-            dtype = np.dtype(dtype)
-
-        KernelArgument.__init__(self, name=name, dtype=dtype,
+        from loopy.types import to_loopy_type
+        KernelArgument.__init__(self, name=name,
+                dtype=to_loopy_type(dtype, allow_auto=True, allow_none=True),
                 approximately=approximately)
 
     def __str__(self):
@@ -660,6 +638,8 @@ class InstructionBase(Record):
             result.append("priority=%d" % self.priority)
         if self.tags:
             result.append("tags=%s" % ":".join(self.tags))
+        if self.atomicity:
+            result.append("atomic=%s" % ":".join(str(a) for a in self.atomicity))
 
         return result
 
@@ -739,6 +719,148 @@ def _get_assignee_and_index(expr):
         raise RuntimeError("invalid lvalue '%s'" % expr)
 
 
+class memory_ordering:
+    """Ordering of atomic operations, defined as in C11 and OpenCL.
+
+    .. attribute:: relaxed
+    .. attribute:: acquire
+    .. attribute:: release
+    .. attribute:: acq_rel
+    .. attribute:: seq_cst
+    """
+
+    relaxed = 0
+    acquire = 1
+    release = 2
+    acq_rel = 3
+    seq_cst = 4
+
+    @staticmethod
+    def to_string(v):
+        for i in dir(memory_ordering):
+            if i.startswith("_"):
+                continue
+
+            if getattr(memory_ordering, i) == v:
+                return i
+
+        raise ValueError("Unknown value of memory_ordering")
+
+
+class memory_scope:
+    """Scope of atomicity, defined as in OpenCL.
+
+    .. attribute:: auto
+
+        Scope matches the accessibility of the variable.
+
+    .. attribute:: work_item
+    .. attribute:: work_group
+    .. attribute:: device
+    .. attribute:: all_svm_devices
+    """
+
+    work_item = 0
+    work_group = 1
+    device = 2
+    all_svm_devices = 3
+
+    auto = -1
+
+    @staticmethod
+    def to_string(v):
+        for i in dir(memory_scope):
+            if i.startswith("_"):
+                continue
+
+            if getattr(memory_scope, i) == v:
+                return i
+
+        raise ValueError("Unknown value of memory_scope")
+
+
+class VarAtomicity(object):
+    """A base class for the description of how atomic access to :attr:`var_name`
+    shall proceed.
+
+    .. attribute:: var_name
+    """
+
+    def __init__(self, var_name):
+        self.var_name = var_name
+
+    def update_persistent_hash(self, key_hash, key_builder):
+        """Custom hash computation function for use with
+        :class:`pytools.persistent_dict.PersistentDict`.
+        """
+
+        key_builder.rec(key_hash, self.var_name)
+
+    def __eq__(self, other):
+        return (type(self) == type(other)
+                and self.var_name == other.var_name)
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+
+class AtomicInit(VarAtomicity):
+    """Describes initialization of an atomic variable. A subclass of
+    :class:`VarAtomicity`.
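+
+    In the instruction option syntax, this corresponds to the
+    ``atomic=init`` option, e.g. (for some atomic array ``out``)::
+
+        out[i] = 0 {atomic=init}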
+    """
+
+    def update_persistent_hash(self, key_hash, key_builder):
+        """Custom hash computation function for use with
+        :class:`pytools.persistent_dict.PersistentDict`.
+        """
+
+        super(AtomicInit, self).update_persistent_hash(key_hash, key_builder)
+        key_builder.rec(key_hash, "AtomicInit")
+
+    def __str__(self):
+        return "update[%s]%s/%s" % (
+                self.var_name,
+                memory_ordering.to_string(self.ordering),
+                memory_scope.to_string(self.scope))
+
+
+class AtomicUpdate(VarAtomicity):
+    """Properties of an atomic operation. A subclass of :class:`VarAtomicity`.
+
+    .. attribute:: ordering
+
+        One of the values from :class:`memory_ordering`
+
+    .. attribute:: scope
+
+        One of the values from :class:`memory_scope`
+    """
+
+    ordering = memory_ordering.seq_cst
+    scope = memory_scope.auto
+
+    def update_persistent_hash(self, key_hash, key_builder):
+        """Custom hash computation function for use with
+        :class:`pytools.persistent_dict.PersistentDict`.
+        """
+
+        super(AtomicUpdate, self).update_persistent_hash(key_hash, key_builder)
+        key_builder.rec(key_hash, "AtomicUpdate")
+        key_builder.rec(key_hash, self.ordering)
+        key_builder.rec(key_hash, self.scope)
+
+    def __eq__(self, other):
+        return (super(AtomicUpdate, self).__eq__(other)
+                and self.ordering == other.ordering
+                and self.scope == other.scope)
+
+    def __str__(self):
+        return "update[%s]%s/%s" % (
+                self.var_name,
+                memory_ordering.to_string(self.ordering),
+                memory_scope.to_string(self.scope))
+
+
 # {{{ assignment
 
 class Assignment(InstructionBase):
@@ -755,11 +877,45 @@ class Assignment(InstructionBase):
         if not *None*, a type that will be assigned to the new temporary variable
         created from the assignee
 
+    .. attribute:: atomicity
+
+        A tuple of instances of :class:`VarAtomicity`. Together, they describe
+        to what extent the assignment is to be carried out in a way that
+        involves atomic operations.
+
+        To describe an atomic update, any memory reads of *exact* occurrences
+        of the left-hand side expression of the assignment in the right-hand
+        side are treated, together with the "memory write" part of the
+        assignment, as part of a single atomic update.
+
+        .. note::
+
+            Exact identity of the LHS with RHS subexpressions is required for
+            an atomic update to be recognized. For example, the following
+            assignment will not be recognized as an atomic update::
+
+                z[i] = z[i+1-1] + a {atomic}
+
+        :mod:`loopy` may choose to evaluate the right-hand side *multiple
+        times* as part of a single assignment. It is up to the user to
+        ensure that this retains correct semantics.
+
+        For example, the following assignment::
+
+            z[i] = f(z[i]) + a {atomic}
+
+        may generate the following (pseudo-)code::
+
+            DO
+                READ ztemp_old = z[i]
+                EVALUATE ztemp_new = f(ztemp_old) + a
+            WHILE compare_and_swap(z[i], ztemp_new, ztemp_old) did not succeed
+
     .. automethod:: __init__
     """
 
     fields = InstructionBase.fields | \
-            set("assignee expression temp_var_type".split())
+            set("assignee expression temp_var_type atomicity".split())
 
     def __init__(self,
             assignee, expression,
@@ -771,7 +927,8 @@ class Assignment(InstructionBase):
             forced_iname_deps_is_final=None,
             forced_iname_deps=frozenset(),
             boostable=None, boostable_into=None, tags=None,
-            temp_var_type=None, priority=0, predicates=frozenset(),
+            temp_var_type=None, atomicity=(),
+            priority=0, predicates=frozenset(),
             insn_deps=None, insn_deps_is_final=None):
 
         InstructionBase.__init__(self,
@@ -807,6 +964,7 @@ class Assignment(InstructionBase):
         self.assignee = assignee
         self.expression = expression
         self.temp_var_type = temp_var_type
+        self.atomicity = atomicity
 
     # {{{ implement InstructionBase interface
 
diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py
index d1176bc4af1d99b62860364702549b9a1b6b7eda..4d92316bf62eea1f1645a731fd2556294cd3f175 100644
--- a/loopy/library/reduction.py
+++ b/loopy/library/reduction.py
@@ -28,6 +28,7 @@ import numpy as np
 
 from loopy.symbolic import FunctionIdentifier
 from loopy.diagnostic import LoopyError
+from loopy.types import NumpyType
 
 
 class ReductionOperation(object):
@@ -122,7 +123,7 @@ class ProductReductionOperation(ScalarReductionOperation):
 def get_le_neutral(dtype):
     """Return a number y that satisfies (x <= y) for all y."""
 
-    if dtype.kind == "f":
+    if dtype.numpy_dtype.kind == "f":
         # OpenCL 1.1, section 6.11.2
         return var("INFINITY")
     else:
@@ -153,17 +154,17 @@ ARGEXT_STRUCT_DTYPES = {}
 
 class _ArgExtremumReductionOperation(ReductionOperation):
     def prefix(self, dtype):
-        return "loopy_arg%s_%s" % (self.which, dtype.type.__name__)
+        return "loopy_arg%s_%s" % (self.which, dtype.numpy_dtype.type.__name__)
 
     def result_dtype(self, target, dtype, inames):
         try:
             return ARGEXT_STRUCT_DTYPES[dtype]
         except KeyError:
             struct_dtype = np.dtype([("value", dtype), ("index", np.int32)])
-            ARGEXT_STRUCT_DTYPES[dtype] = struct_dtype
+            ARGEXT_STRUCT_DTYPES[dtype] = NumpyType(struct_dtype, target)
 
             target.get_or_register_dtype(self.prefix(dtype)+"_result", struct_dtype)
-            return struct_dtype
+            return ARGEXT_STRUCT_DTYPES[dtype]
 
     def neutral_element(self, dtype, inames):
         return ArgExtFunction(self, dtype, "init", inames)()
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 4c75cfd250807c6959c1e5167465d34b029d762e..e30d3bcb3f441036bab6558bc2d8691031a1c2e4 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -43,7 +43,7 @@ def prepare_for_caching(kernel):
     new_args = []
 
     for arg in kernel.args:
-        dtype = arg.picklable_dtype
+        dtype = arg.dtype
         if dtype is not None and dtype is not lp.auto:
             dtype = dtype.with_target(kernel.target)
 
@@ -51,7 +51,7 @@ def prepare_for_caching(kernel):
 
     new_temporary_variables = {}
     for name, temp in six.iteritems(kernel.temporary_variables):
-        dtype = temp.picklable_dtype
+        dtype = temp.dtype
         if dtype is not None and dtype is not lp.auto:
             dtype = dtype.with_target(kernel.target)
 
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index d931ebf3373b7ab455787f5be7db7360cc042e66..6a596454018802c0a5a1e837272c106ffa7c55b3 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -37,6 +37,7 @@ from loopy.expression import dtype_to_type_context, TypeInferenceMapper
 
 from loopy.diagnostic import LoopyError
 from loopy.tools import is_integer
+from loopy.types import LoopyType
 
 
 # {{{ C code mapper
@@ -55,6 +56,8 @@ class LoopyCCodeMapper(RecursiveMapper):
 
     def infer_type(self, expr):
         result = self.type_inf_mapper(expr)
+        assert isinstance(result, LoopyType)
+
         self.codegen_state.seen_dtypes.add(result)
         return result
 
@@ -91,11 +94,11 @@ class LoopyCCodeMapper(RecursiveMapper):
 
         actual_type = self.infer_type(expr)
 
-        if (actual_type.kind == "c" and needed_dtype.kind == "c"
+        if (actual_type.is_complex() and needed_dtype.is_complex()
                 and actual_type != needed_dtype):
             result = RecursiveMapper.rec(self, expr, PREC_NONE, type_context)
             return "%s_cast(%s)" % (self.complex_type_name(needed_dtype), result)
-        elif actual_type.kind != "c" and needed_dtype.kind == "c":
+        elif not actual_type.is_complex() and needed_dtype.is_complex():
             result = RecursiveMapper.rec(self, expr, PREC_NONE, type_context)
             return "%s_fromreal(%s)" % (self.complex_type_name(needed_dtype), result)
         else:
@@ -459,9 +462,13 @@ class LoopyCCodeMapper(RecursiveMapper):
     # {{{ deal with complex-valued variables
 
     def complex_type_name(self, dtype):
-        if dtype == np.complex64:
+        from loopy.types import NumpyType
+        if not isinstance(dtype, NumpyType):
+            raise LoopyError("'%s' is not a complex type" % dtype)
+
+        if dtype.dtype == np.complex64:
             return "cfloat"
-        if dtype == np.complex128:
+        if dtype.dtype == np.complex128:
             return "cdouble"
         else:
             raise RuntimeError
@@ -484,17 +491,20 @@ class LoopyCCodeMapper(RecursiveMapper):
             return base_impl(expr, enclosing_prec, type_context)
 
         tgt_dtype = self.infer_type(expr)
-        is_complex = tgt_dtype.kind == 'c'
+        is_complex = tgt_dtype.is_complex()
 
         if not is_complex:
             return base_impl(expr, enclosing_prec, type_context)
         else:
             tgt_name = self.complex_type_name(tgt_dtype)
 
-            reals = [child for child in expr.children
-                    if 'c' != self.infer_type(child).kind]
-            complexes = [child for child in expr.children
-                    if 'c' == self.infer_type(child).kind]
+            reals = []
+            complexes = []
+            for child in expr.children:
+                if self.infer_type(child).is_complex():
+                    complexes.append(child)
+                else:
+                    reals.append(child)
 
             real_sum = self.join_rec(" + ", reals, PREC_SUM, type_context)
 
@@ -534,17 +544,20 @@ class LoopyCCodeMapper(RecursiveMapper):
             return base_impl(expr, enclosing_prec, type_context)
 
         tgt_dtype = self.infer_type(expr)
-        is_complex = 'c' == tgt_dtype.kind
+        is_complex = tgt_dtype.is_complex()
 
         if not is_complex:
             return base_impl(expr, enclosing_prec, type_context)
         else:
             tgt_name = self.complex_type_name(tgt_dtype)
 
-            reals = [child for child in expr.children
-                    if 'c' != self.infer_type(child).kind]
-            complexes = [child for child in expr.children
-                    if 'c' == self.infer_type(child).kind]
+            reals = []
+            complexes = []
+            for child in expr.children:
+                if self.infer_type(child).is_complex():
+                    complexes.append(child)
+                else:
+                    reals.append(child)
 
             real_prd = self.join_rec(" * ", reals, PREC_PRODUCT,
                     type_context)
@@ -574,7 +587,8 @@ class LoopyCCodeMapper(RecursiveMapper):
             # analogous to ^{-1}
             denom = self.rec(expr.denominator, PREC_POWER, type_context)
 
-            if n_dtype.kind not in "fc" and d_dtype.kind not in "fc":
+            if (n_dtype.kind not in "fc"
+                    and d_dtype.kind not in "fc"):
                 # must both be integers
                 if type_context == "f":
                     num = "((float) (%s))" % num
@@ -592,8 +606,8 @@ class LoopyCCodeMapper(RecursiveMapper):
                         denom),
                     enclosing_prec, PREC_PRODUCT)
 
-        n_dtype = self.infer_type(expr.numerator)
-        d_dtype = self.infer_type(expr.denominator)
+        n_dtype = self.infer_type(expr.numerator).numpy_dtype
+        d_dtype = self.infer_type(expr.denominator).numpy_dtype
 
         if not self.allow_complex:
             return base_impl(expr, enclosing_prec, type_context)
@@ -623,7 +637,7 @@ class LoopyCCodeMapper(RecursiveMapper):
 
     def map_remainder(self, expr, enclosing_prec, type_context):
         tgt_dtype = self.infer_type(expr)
-        if 'c' == tgt_dtype.kind:
+        if tgt_dtype.is_complex():
             raise RuntimeError("complex remainder not defined")
 
         return "(%s %% %s)" % (
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 55f8da4d608d7a1b3a1ca8ba960c498586f4d76d..2fa510b5f83887026697e032a8b463c5df8ed2d1 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -26,10 +26,12 @@ THE SOFTWARE.
 
 import numpy as np
 
+from pytools import memoize_method
+
 from loopy.target.c import CTarget
 from loopy.target.c.codegen.expression import LoopyCCodeMapper
-from pytools import memoize_method
 from loopy.diagnostic import LoopyError
+from loopy.types import NumpyType
 
 
 # {{{ vector types
@@ -141,10 +143,12 @@ class LoopyCudaCCodeMapper(LoopyCCodeMapper):
 
     @staticmethod
     def _get_index_ctype(kernel):
-        if kernel.index_dtype == np.int32:
+        if kernel.index_dtype.numpy_dtype == np.int32:
             return "int32_t"
-        else:
+        elif kernel.index_dtype.numpy_dtype == np.int64:
             return "int64_t"
+        else:
+            raise LoopyError("unexpected index type")
 
     def map_group_hw_index(self, expr, enclosing_prec, type_context):
         return "((%s) blockIdx.%s)" % (
@@ -198,10 +202,10 @@ class CudaTarget(CTarget):
         return result
 
     def is_vector_dtype(self, dtype):
-        return list(vec.types.values())
+        return dtype.numpy_dtype in list(vec.types.values())
 
     def vector_dtype(self, base, count):
-        return vec.types[base, count]
+        return NumpyType(vec.types[base.numpy_dtype, count])
 
     # }}}
 
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index b0b1c247ec78a29b75890b0e9f04448116b97c4e..217678e8218c7fe452cf118175d127866456069f 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -37,9 +37,9 @@ from pytools import memoize_method
 
 class LoopyISPCCodeMapper(LoopyCCodeMapper):
     def _get_index_ctype(self):
-        if self.kernel.index_dtype == np.int32:
+        if self.kernel.index_dtype.numpy_dtype == np.int32:
             return "int32"
-        elif self.kernel.index_dtype == np.int64:
+        elif self.kernel.index_dtype.numpy_dtype == np.int64:
             return "int64"
         else:
             raise ValueError("unexpected index_type")
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index cfdc8620bb4a383e4d48f004a2c682ba8e495a05..db53b7cb29224f8620bcc6466b3f42da5a52107f 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -30,6 +30,7 @@ from loopy.target.c import CTarget
 from loopy.target.c.codegen.expression import LoopyCCodeMapper
 from pytools import memoize_method
 from loopy.diagnostic import LoopyError
+from loopy.types import NumpyType
 
 
 # {{{ vector types
@@ -126,8 +127,8 @@ def opencl_function_mangler(kernel, name, arg_dtypes):
         return arg_dtypes[0], name
 
     if name == "dot":
-        scalar_dtype, offset, field_name = arg_dtypes[0].fields["s0"]
-        return scalar_dtype, name
+        scalar_dtype, offset, field_name = arg_dtypes[0].numpy_dtype.fields["s0"]
+        return NumpyType(scalar_dtype), name
 
     return None
 
@@ -192,6 +193,17 @@ class OpenCLTarget(CTarget):
     """A target for the OpenCL C heterogeneous compute programming language.
     """
 
+    def __init__(self, atomics_flavor="cl2"):
+        """
+        :arg atomics_flavor: one of ``"cl2"`` (C11-style atomics from OpenCL 2.0),
+            ``"cl1"`` (OpenCL 1.1 atomics, using bit-for-bit compare-and-swap
+            for floating point), ``"cl1-exch"`` (OpenCL 1.1 atomics, using
+            double-exchange for floating point).
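+
+            For example, a target for a device with only OpenCL 1.1-style
+            atomics might be constructed as
+            ``OpenCLTarget(atomics_flavor="cl1")``.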
+        """
+        super(OpenCLTarget, self).__init__()
+
+        self.atomics_flavor = atomics_flavor
+
     # {{{ library
 
     def function_manglers(self):
@@ -231,10 +243,10 @@ class OpenCLTarget(CTarget):
         return result
 
     def is_vector_dtype(self, dtype):
-        return list(vec.types.values())
+        return dtype.numpy_dtype in list(vec.types.values())
 
     def vector_dtype(self, base, count):
-        return vec.types[base, count]
+        return NumpyType(vec.types[base.numpy_dtype, count])
 
     # }}}
 
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 2947fdc6c5d17e919bdcc329f6243376b00a9969..e14a1dd9d5d74bdabc30e8f70382738d4db4bbbe 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -30,6 +30,7 @@ from six.moves import range
 import numpy as np
 
 from loopy.target.opencl import OpenCLTarget
+from loopy.types import NumpyType
 
 import logging
 logger = logging.getLogger(__name__)
@@ -181,10 +182,10 @@ def pyopencl_function_mangler(target, name, arg_dtypes):
     if len(arg_dtypes) == 1 and isinstance(name, str):
         arg_dtype, = arg_dtypes
 
-        if arg_dtype.kind == "c":
-            if arg_dtype == np.complex64:
+        if arg_dtype.is_complex():
+            if arg_dtype.numpy_dtype == np.complex64:
                 tpname = "cfloat"
-            elif arg_dtype == np.complex128:
+            elif arg_dtype.numpy_dtype == np.complex128:
                 tpname = "cdouble"
             else:
                 raise RuntimeError("unexpected complex type '%s'" % arg_dtype)
@@ -196,7 +197,10 @@ def pyopencl_function_mangler(target, name, arg_dtypes):
                 return arg_dtype, "%s_%s" % (tpname, name)
 
             if name in ["real", "imag", "abs"]:
-                return np.dtype(arg_dtype.type(0).real), "%s_%s" % (tpname, name)
+                return (
+                        NumpyType(
+                            np.dtype(arg_dtype.numpy_dtype.type(0).real)),
+                        "%s_%s" % (tpname, name))
 
     return None
 
@@ -207,10 +211,12 @@ def pyopencl_preamble_generator(target, seen_dtypes, seen_functions):
     has_double = False
     has_complex = False
 
+    from loopy.types import NumpyType
     for dtype in seen_dtypes:
-        if dtype in [np.float64, np.complex128]:
+        if (isinstance(dtype, NumpyType)
+                and dtype.dtype in [np.float64, np.complex128]):
             has_double = True
-        if dtype.kind == "c":
+        if dtype.involves_complex():
             has_complex = True
 
     if has_complex:
@@ -294,7 +300,7 @@ class PyOpenCLTarget(OpenCLTarget):
 
     def vector_dtype(self, base, count):
         from pyopencl.array import vec
-        return vec.types[base, count]
+        return NumpyType(vec.types[base.numpy_dtype, count])
 
     def alignment_requirement(self, type_decl):
         import struct
diff --git a/loopy/tools.py b/loopy/tools.py
index 777532e7af92bc62e6878f564c8c5545f4cb2c4a..ae370d5aaac9ff75f530e1d0951a2f904b686e42 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -1,5 +1,4 @@
-from __future__ import division
-from __future__ import absolute_import
+from __future__ import division, absolute_import
 import six
 
 __copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
@@ -35,7 +34,7 @@ from six.moves import intern
 
 if six.PY2:
     def is_integer(obj):
-        return isinstance(obj, (int, long, np.integer))
+        return isinstance(obj, (int, long, np.integer))  # noqa
 else:
     def is_integer(obj):
         return isinstance(obj, (int, np.integer))
@@ -114,70 +113,6 @@ class PymbolicExpressionHashWrapper(object):
 # }}}
 
 
-# {{{ picklable dtype
-
-class PicklableDtype(object):
-    """This object works around several issues with pickling :class:`numpy.dtype`
-    objects. It does so by serving as a picklable wrapper around the original
-    dtype.
-
-    The issues are the following
-
-    - :class:`numpy.dtype` objects for custom types in :mod:`loopy` are usually
-      registered in the target's dtype registry. This registration may
-      have been lost after unpickling. This container restores it implicitly,
-      as part of unpickling.
-
-    - There is a`numpy bug <https://github.com/numpy/numpy/issues/4317>`_
-      that prevents unpickled dtypes from hashing properly. This is solved
-      by retrieving the 'canonical' type from the dtype registry.
-    """
-
-    def __init__(self, dtype, target=None):
-        assert not isinstance(dtype, PicklableDtype)
-
-        if dtype is None:
-            raise TypeError("may not pass None to construct PicklableDtype")
-
-        self.target = target
-        self.dtype = np.dtype(dtype)
-
-    def __hash__(self):
-        return hash(self.dtype)
-
-    def __eq__(self, other):
-        return (
-                type(self) == type(other)
-                and self.dtype == other.dtype)
-
-    def __ne__(self, other):
-        return not self.__eq__(self, other)
-
-    def __getstate__(self):
-        if self.target is None:
-            raise RuntimeError("unable to pickle dtype: target not known")
-
-        c_name = self.target.dtype_to_typename(self.dtype)
-        return (self.target, c_name, self.dtype)
-
-    def __setstate__(self, state):
-        target, name, dtype = state
-        self.target = target
-        self.dtype = self.target.get_or_register_dtype([name], dtype)
-
-    def with_target(self, target):
-        if (self.target is not None
-                and target is not self.target):
-            raise RuntimeError("target already set to different value")
-
-        return PicklableDtype(self.dtype, target)
-
-    def assert_has_target(self):
-        assert self.target is not None
-
-# }}}
-
-
 # {{{ remove common indentation
 
 def remove_common_indentation(code, require_leading_newline=True,
diff --git a/loopy/types.py b/loopy/types.py
new file mode 100644
index 0000000000000000000000000000000000000000..4ed6aa0ffe6aab8ec9fbdde5721b13658d9059cb
--- /dev/null
+++ b/loopy/types.py
@@ -0,0 +1,212 @@
+from __future__ import division, absolute_import
+
+__copyright__ = "Copyright (C) 2012 Andreas Kloeckner"
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+import six  # noqa
+import numpy as np
+
+from loopy.diagnostic import LoopyError
+
+
+class LoopyType(object):
+    def with_target(self, target):
+        return self
+
+    def is_integral(self):
+        raise NotImplementedError()
+
+    def is_complex(self):
+        raise NotImplementedError()
+
+    def involves_complex(self):
+        raise NotImplementedError()
+
+    def is_composite(self):
+        raise NotImplementedError()
+
+    @property
+    def itemsize(self):
+        raise NotImplementedError()
+
+    @property
+    def numpy_dtype(self):
+        raise ValueError("'%s' is not a numpy type"
+                % str(self))
+
+
+class AtomicType(LoopyType):
+    pass
+
+
+# {{{ numpy-based dtype
+
+class NumpyType(LoopyType):
+    """This object works around several issues with pickling :class:`numpy.dtype`
+    objects. It does so by serving as a picklable wrapper around the original
+    dtype.
+
+    The issues are the following
+
+    - :class:`numpy.dtype` objects for custom types in :mod:`loopy` are usually
+      registered in the target's dtype registry. This registration may
+      have been lost after unpickling. This container restores it implicitly,
+      as part of unpickling.
+
+    - There is a `numpy bug <https://github.com/numpy/numpy/issues/4317>`_
+      that prevents unpickled dtypes from hashing properly. This is solved
+      by retrieving the 'canonical' type from the dtype registry.
+    """
+
+    def __init__(self, dtype, target=None):
+        assert not isinstance(dtype, NumpyType)
+
+        if dtype is None:
+            raise TypeError("may not pass None to construct NumpyType")
+
+        if dtype == object:
+            raise TypeError("loopy does not directly support object arrays")
+
+        self.target = target
+        self.dtype = np.dtype(dtype)
+
+    def __hash__(self):
+        return hash(self.dtype)
+
+    def __eq__(self, other):
+        return (
+                type(self) == type(other)
+                and self.dtype == other.dtype)
+
+    def __ne__(self, other):
+        return not self.__eq__(other)
+
+    def __getstate__(self):
+        if self.target is None:
+            raise RuntimeError("unable to pickle dtype: target not known")
+
+        c_name = self.target.dtype_to_typename(self.dtype)
+        return (self.target, c_name, self.dtype)
+
+    def __setstate__(self, state):
+        target, name, dtype = state
+        self.target = target
+        self.dtype = self.target.get_or_register_dtype([name], dtype)
+
+    def with_target(self, target):
+        if (self.target is not None
+                and target is not self.target):
+            raise RuntimeError("target already set to different value")
+
+        return type(self)(self.dtype, target)
+
+    def assert_has_target(self):
+        assert self.target is not None
+
+    def is_integral(self):
+        return self.dtype.kind in "iu"
+
+    def is_complex(self):
+        return self.dtype.kind == "c"
+
+    def involves_complex(self):
+        def dtype_involves_complex(dtype):
+            if dtype.kind == "c":
+                return True
+
+            if dtype.fields is None:
+                return False
+            else:
+                return any(
+                        dtype_involves_complex(f[0])
+                        for f in six.itervalues(dtype.fields))
+
+        return dtype_involves_complex(self.dtype)
+
+    def is_composite(self):
+        return self.dtype.kind == "V"
+
+    @property
+    def itemsize(self):
+        return self.dtype.itemsize
+
+    @property
+    def numpy_dtype(self):
+        return self.dtype
+
+    def __repr__(self):
+        return repr(self.dtype)
+
+# }}}
+
+
+# {{{ atomic dtype
+
+class AtomicNumpyType(NumpyType, AtomicType):
+    """A dtype wrapper that indicates that the described type should be capable
+    of atomic operations.
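+
+    For example, :func:`to_loopy_type` produces this type when *for_atomic*
+    is set::
+
+        atomic_f32 = to_loopy_type(np.float32, for_atomic=True)
+        assert isinstance(atomic_f32, AtomicNumpyType)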
+    """
+    def __hash__(self):
+        return 0xa7031c ^ hash(self.dtype)
+
+    def __repr__(self):
+        return "atomic:%s" % repr(self.dtype)
+
+# }}}
+
+
+def to_loopy_type(dtype, allow_none=False, allow_auto=False, for_atomic=False):
+    from loopy.kernel.data import auto
+    if allow_none and dtype is None:
+        return dtype
+    elif allow_auto and dtype is auto:
+        return dtype
+
+    numpy_dtype = None
+
+    if dtype is not None:
+        try:
+            numpy_dtype = np.dtype(dtype)
+        except Exception:
+            pass
+
+    if isinstance(dtype, LoopyType):
+        if for_atomic:
+            if isinstance(dtype, NumpyType):
+                return AtomicNumpyType(dtype.dtype)
+            elif not isinstance(dtype, AtomicType):
+                raise LoopyError("do not know how to convert '%s' to an atomic type"
+                        % dtype)
+
+        return dtype
+
+    elif numpy_dtype is not None:
+        if for_atomic:
+            return AtomicNumpyType(numpy_dtype)
+        else:
+            return NumpyType(numpy_dtype)
+
+    else:
+        raise TypeError("dtype must be a LoopyType, or convertible to one, "
+                "found '%s' instead" % type(dtype))
+
+# vim: foldmethod=marker
diff --git a/loopy/version.py b/loopy/version.py
index adc069663503b200bcdd1638c05ae0ffae5f14df..6cbbc59e17b1e6940a1c3247f1bf7b60d5a0ef61 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -32,4 +32,4 @@ except ImportError:
 else:
     _islpy_version = islpy.version.VERSION_TEXT
 
-DATA_MODEL_VERSION = "v19-islpy%s" % _islpy_version
+DATA_MODEL_VERSION = "v21-islpy%s" % _islpy_version
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 606eec7667ca3d76a215c3b487e8c93bc371c36e..c7d7755e7a7b51d8085d57b7263b0249bd9520e1 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2421,6 +2421,26 @@ def test_chunk_iname(ctx_factory):
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=130))
 
 
+def test_atomic(ctx_factory):
+    ctx = ctx_factory()
+
+    knl = lp.make_kernel(
+            "{ [i]: 0<=i<n }",
+            "out[i%20] = out[i%20] + 2*a[i] {atomic}",
+            [
+                lp.GlobalArg("out", np.float32, shape=lp.auto,
+                    for_atomic=True),
+                lp.GlobalArg("a", np.float32, shape=lp.auto),
+                "..."
+                ],
+            assumptions="n>0")
+
+    ref_knl = knl
+    knl = lp.chunk_iname(knl, "i", 3, inner_tag="l.0")
+    knl = lp.set_loop_priority(knl, "i_outer, i_inner")
+    lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=130))
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])