diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index dbd99e85016b00b3df4827ad7999e7b57e58af24..464bfc926ebfaab61cb0defd95b7e8264866f146 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -627,7 +627,7 @@ class AtomicInit(VarAtomicity):
                 memory_scope.to_string(self.scope))
 
 
-class AtomicUpdate(VarAtomicity):
+class OrderedAtomic(VarAtomicity):
     """Properties of an atomic operation. A subclass of :class:`VarAtomicity`.
 
     .. attribute:: ordering
@@ -647,22 +647,51 @@ class AtomicUpdate(VarAtomicity):
         :class:`pytools.persistent_dict.PersistentDict`.
         """
 
-        super(AtomicUpdate, self).update_persistent_hash(key_hash, key_builder)
-        key_builder.rec(key_hash, "AtomicUpdate")
+        super(OrderedAtomic, self).update_persistent_hash(key_hash, key_builder)
+        key_builder.rec(key_hash, str(self.__class__.__name__))
         key_builder.rec(key_hash, self.ordering)
         key_builder.rec(key_hash, self.scope)
 
     def __eq__(self, other):
-        return (super(AtomicUpdate, self).__eq__(other)
+        return (super(OrderedAtomic, self).__eq__(other)
                 and self.ordering == other.ordering
                 and self.scope == other.scope)
 
     def __str__(self):
-        return "update[%s]%s/%s" % (
+        return "%s[%s]%s/%s" % (
+                self.op_name,
                 self.var_name,
                 memory_ordering.to_string(self.ordering),
                 memory_scope.to_string(self.scope))
 
+
+class AtomicUpdate(OrderedAtomic):
+    """Properties of an atomic update. A subclass of :class:`OrderedAtomic`.
+
+    .. attribute:: ordering
+
+        One of the values from :class:`memory_ordering`
+
+    .. attribute:: scope
+
+        One of the values from :class:`memory_scope`
+    """
+    op_name = 'update'  # leading tag printed by OrderedAtomic.__str__
+
+
+class AtomicLoad(OrderedAtomic):
+    """Properties of an atomic load. A subclass of :class:`OrderedAtomic`.
+
+    .. attribute:: ordering
+
+        One of the values from :class:`memory_ordering`
+
+    .. attribute:: scope
+
+        One of the values from :class:`memory_scope`
+    """
+    op_name = 'load'  # leading tag printed by OrderedAtomic.__str__
+
 # }}}
 
 
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index ac7ac19887388649670154fcd36eba79ba3b4315..f2b5e7a87022e01bd51368cc3ef3cc60d507d958 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -2047,6 +2047,49 @@ def limit_boostability(kernel):
 # }}}
 
 
+# {{{ check for loads of atomic variables
+
+def check_atomic_loads(kernel):
+    """Return a copy of *kernel* in which every :class:`Assignment` carries an
+    :class:`AtomicLoad` for each atomic-typed variable it depends on (other
+    than its first assignee) that has no atomicity entry yet.
+    """
+
+    logger.debug("%s: check atomic loads" % kernel.name)
+    from loopy.types import AtomicType
+    from loopy.kernel.array import ArrayBase
+    from loopy.kernel.instruction import Assignment, AtomicLoad
+
+    # names of all temporaries and array arguments with an atomic dtype
+    atomicity_candidates = (
+            set(v.name for v in six.itervalues(kernel.temporary_variables)
+                if isinstance(v.dtype, AtomicType))
+            |
+            set(v.name for v in kernel.args
+                if isinstance(v, ArrayBase)
+                and isinstance(v.dtype, AtomicType)))
+
+    new_insns = []
+    for insn in kernel.instructions:
+        if isinstance(insn, Assignment):
+            # atomic variables this instruction touches besides its assignee
+            accessed_atomic_vars = (
+                    (insn.dependency_names() & atomicity_candidates)
+                    - set([insn.assignee_var_names()[0]]))
+            already_marked = set(a.var_name for a in insn.atomicity)
+            # sorted: keep the atomicity tuple order deterministic across runs
+            missed = sorted(accessed_atomic_vars - already_marked)
+            if missed:
+                insn = insn.copy(atomicity=insn.atomicity + tuple(
+                    AtomicLoad(x) for x in missed))
+
+        new_insns.append(insn)
+
+    return kernel.copy(instructions=new_insns)
+
+# }}}
+
+
 preprocess_cache = WriteOncePersistentDict(
         "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION,
         key_builder=LoopyKeyBuilder())
@@ -2132,6 +2175,10 @@ def preprocess_kernel(kernel, device=None):
     kernel = find_idempotence(kernel)
     kernel = limit_boostability(kernel)
 
+    # check for atomic loads, much easier to do here now that the dependencies
+    # have been established
+    kernel = check_atomic_loads(kernel)
+
     kernel = kernel.target.preprocess(kernel)
 
     logger.info("%s: preprocess done" % kernel.name)
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 80536d912e7d426feca598ac4b07cd16223b0307..9536fc711a2266a5fae10e83d3d8de8974fc66c5 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -787,7 +787,11 @@ class CASTBuilder(ASTBuilderBase):
                         needed_dtype=lhs_dtype))
 
         elif isinstance(lhs_atomicity, AtomicInit):
-            raise NotImplementedError("atomic init")
+            codegen_state.seen_atomic_dtypes.add(lhs_dtype)
+            return codegen_state.ast_builder.emit_atomic_init(
+                    codegen_state, lhs_atomicity, lhs_var,
+                    insn.assignee, insn.expression,
+                    lhs_dtype, rhs_type_context)
 
         elif isinstance(lhs_atomicity, AtomicUpdate):
             codegen_state.seen_atomic_dtypes.add(lhs_dtype)
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index d2fe4157fc1ff6f9eb7817bea7da8da7e31bbdc1..31e0569b92a9ddada8ad66c2e0a065c191cc61d3 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -501,6 +501,18 @@ class OpenCLCASTBuilder(CASTBuilder):
 
         return CLConstant(arg_decl)
 
+    # {{{ code generation for atomic init
+
+    def emit_atomic_init(self, codegen_state, lhs_atomicity, lhs_var,
+            lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
+        # For the CL1 flavor an atomic init needs no code of its own: forward
+        # every argument unchanged to emit_atomic_update and return its result.
+
+        return self.emit_atomic_update(codegen_state, lhs_atomicity, lhs_var,
+            lhs_expr, rhs_expr, lhs_dtype, rhs_type_context)
+
+    # }}}
+
     # {{{ code generation for atomic update
 
     def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var,
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 084f9f045bc45c34a33feb22eaacc05df8d6c02e..927f5a475f2cc11f2d74876676947765be40f010 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -1062,6 +1062,77 @@ def test_atomic(ctx_factory, dtype):
     lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000))
 
 
+@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])
+def test_atomic_load(ctx_factory, dtype):
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+    from loopy.kernel.data import temp_var_scope as scopes
+    n = 10  # value substituted for the domain parameter n below
+    vec_width = 4  # inner length used when splitting iname j below
+
+    if (
+            np.dtype(dtype).itemsize == 8
+            and "cl_khr_int64_base_atomics" not in ctx.devices[0].extensions):
+        pytest.skip("64-bit atomics not supported on device")
+
+    import pyopencl.version  # noqa
+    if (
+            cl.version.VERSION < (2015, 2)
+            and dtype == np.int64):
+        pytest.skip("int64 RNG not supported in PyOpenCL < 2015.2")
+
+    knl = lp.make_kernel(
+            "{ [i,j]: 0<=i,j<n}",
+            """
+            for j
+                <> upper = 0
+                <> lower = 0
+                temp = 0 {id=init, atomic}
+                for i
+                    upper = upper + i * a[i] {id=sum0}
+                    lower = lower - b[i] {id=sum1}
+                end
+                temp = temp + lower {id=temp_sum, dep=sum*:init, atomic,\
+                                           nosync=init}
+                ... lbarrier {id=lb2, dep=temp_sum}
+                out[j] = upper / temp {id=final, dep=lb2, atomic,\
+                                           nosync=init:temp_sum}
+            end
+            """,
+            [
+                lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True),
+                lp.GlobalArg("a", dtype, shape=lp.auto),
+                lp.GlobalArg("b", dtype, shape=lp.auto),
+                lp.TemporaryVariable('temp', dtype, for_atomic=True,
+                                     scope=scopes.LOCAL),
+                "..."
+                ],
+            silenced_warnings=["write_race(init)", "write_race(temp_sum)"])
+    knl = lp.fix_parameters(knl, n=n)
+    knl = lp.split_iname(knl, "j", vec_width, inner_tag="l.0")
+    _, out = knl(queue, a=np.arange(n, dtype=dtype), b=np.arange(n, dtype=dtype))
+    assert np.allclose(out, np.full_like(out, ((1 - 2 * n) / 3.0)))
+
+
+@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])
+def test_atomic_init(dtype):
+    vec_width = 4  # inner length used when splitting iname i below
+
+    knl = lp.make_kernel(
+            "{ [i]: 0<=i<100 }",
+            """
+            out[i%4] = 0 {id=init, atomic=init}
+            """,
+            [
+                lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True),
+                "..."
+                ],
+            silenced_warnings=["write_race(init)"])
+    knl = lp.split_iname(knl, 'i', vec_width, inner_tag='l.0')
+
+    print(lp.generate_code_v2(knl).device_code())
+
+
 def test_within_inames_and_reduction():
     # See https://github.com/inducer/loopy/issues/24