diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index f4c48443f9062bf362e9a27681d2967b8f82807d..323951a63a7f9c370bfd0a357ae585a6bd1b336b 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -116,7 +116,8 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):
     # }}}
 
     (assignee_var_name, assignee_indices), = insn.assignees_and_indices()
-    lhs_dtype = kernel.get_var_descriptor(assignee_var_name).dtype
+    lhs_var = kernel.get_var_descriptor(assignee_var_name)
+    lhs_dtype = lhs_var.dtype
 
     if insn.atomicity is not None:
         lhs_atomicity = [
@@ -147,7 +148,7 @@ def generate_expr_instruction_code(kernel, insn, codegen_state):
     elif isinstance(lhs_atomicity, AtomicUpdate):
         codegen_state.seen_atomic_dtypes.add(lhs_dtype)
         result = kernel.target.generate_atomic_update(
-                kernel, codegen_state, lhs_atomicity,
+                kernel, codegen_state, lhs_atomicity, lhs_var,
                 insn.assignee, insn.expression,
                 lhs_dtype, rhs_type_context)
 
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index c8c2324e828a6c69696c4e6a828eb79c29d230e8..42a9e5a3dc4c88cceadb3399b5399d609b877cfb 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -133,9 +133,9 @@ class TargetBase(object):
     def get_image_arg_decl(self, name, shape, num_target_axes, dtype, is_written):
         raise NotImplementedError()
 
-    def generate_atomic_update(self, kernel, codegen_state, lhs_atomicity,
-            lhs_expr, rhs_expr, lhs_dtype):
-        raise NotImplementedError("atomic update")
+    def generate_atomic_update(self, kernel, codegen_state, lhs_atomicity, lhs_var,
+            lhs_expr, rhs_expr, lhs_dtype, rhs_type_context=None):
+        raise NotImplementedError("atomic update in target %s" % type(self).__name__)
 
     # }}}
 
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 43c4ea7dc30f139261a3c8ea46e32860c265412f..70bce2331098860ada34c3fc4d865f3ba9207bc1 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -411,7 +411,7 @@ class OpenCLTarget(CTarget):
 
     # {{{ code generation for atomic update
 
-    def generate_atomic_update(self, kernel, codegen_state, lhs_atomicity,
+    def generate_atomic_update(self, kernel, codegen_state, lhs_atomicity, lhs_var,
             lhs_expr, rhs_expr, lhs_dtype, rhs_type_context):
         from pymbolic.mapper.stringifier import PREC_NONE
 
@@ -425,7 +425,7 @@ class OpenCLTarget(CTarget):
             old_val_var = codegen_state.var_name_generator("loopy_old_val")
             new_val_var = codegen_state.var_name_generator("loopy_new_val")
 
-            from loopy.kernel.data import TemporaryVariable
+            from loopy.kernel.data import TemporaryVariable, temp_var_scope
             ecm = codegen_state.expression_to_code_mapper.with_assignments(
                     {
                         old_val_var: TemporaryVariable(old_val_var, lhs_dtype),
@@ -463,9 +463,25 @@ class OpenCLTarget(CTarget):
                 else:
                     assert False
 
+                from loopy.kernel.data import TemporaryVariable, GlobalArg
+                if isinstance(lhs_var, GlobalArg):
+                    var_kind = "__global"
+                elif (
+                        isinstance(lhs_var, TemporaryVariable)
+                        and lhs_var.scope == temp_var_scope.LOCAL):
+                    var_kind = "__local"
+                elif (
+                        isinstance(lhs_var, TemporaryVariable)
+                        and lhs_var.scope == temp_var_scope.GLOBAL):
+                    var_kind = "__global"
+                else:
+                    raise LoopyError("unexpected kind of variable '%s' in "
+                            "atomic operation: '%s'"
+                            % (lhs_var.name, type(lhs_var).__name__))
+
                 old_val = "*(%s *) &" % ctype + old_val
                 new_val = "*(%s *) &" % ctype + new_val
-                cast_str = "(__global %s *) " % ctype
+                cast_str = "(%s %s *) " % (var_kind, ctype)
 
             return Block([
                 POD(self, NumpyType(lhs_dtype.dtype), old_val_var),