diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index dbd99e85016b00b3df4827ad7999e7b57e58af24..464bfc926ebfaab61cb0defd95b7e8264866f146 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -627,7 +627,7 @@ class AtomicInit(VarAtomicity): memory_scope.to_string(self.scope)) -class AtomicUpdate(VarAtomicity): +class OrderedAtomic(VarAtomicity): """Properties of an atomic operation. A subclass of :class:`VarAtomicity`. .. attribute:: ordering @@ -647,22 +647,51 @@ class AtomicUpdate(VarAtomicity): :class:`pytools.persistent_dict.PersistentDict`. """ - super(AtomicUpdate, self).update_persistent_hash(key_hash, key_builder) - key_builder.rec(key_hash, "AtomicUpdate") + super(OrderedAtomic, self).update_persistent_hash(key_hash, key_builder) + key_builder.rec(key_hash, str(self.__class__.__name__)) key_builder.rec(key_hash, self.ordering) key_builder.rec(key_hash, self.scope) def __eq__(self, other): - return (super(AtomicUpdate, self).__eq__(other) + return (super(OrderedAtomic, self).__eq__(other) and self.ordering == other.ordering and self.scope == other.scope) def __str__(self): - return "update[%s]%s/%s" % ( + return "%s[%s]%s/%s" % ( + self.op_name, self.var_name, memory_ordering.to_string(self.ordering), memory_scope.to_string(self.scope)) + +class AtomicUpdate(OrderedAtomic): + """Properties of an atomic update. A subclass of :class:`VarAtomicity`. + + .. attribute:: ordering + + One of the values from :class:`memory_ordering` + + .. attribute:: scope + + One of the values from :class:`memory_scope` + """ + op_name = 'update' + + +class AtomicLoad(OrderedAtomic): + """Properties of an atomic load. A subclass of :class:`VarAtomicity`. + + .. attribute:: ordering + + One of the values from :class:`memory_ordering` + + .. 
attribute:: scope + + One of the values from :class:`memory_scope` + """ + op_name = 'load' + # }}} diff --git a/loopy/preprocess.py b/loopy/preprocess.py index ac7ac19887388649670154fcd36eba79ba3b4315..f2b5e7a87022e01bd51368cc3ef3cc60d507d958 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2047,6 +2047,49 @@ def limit_boostability(kernel): # }}} +# {{{ check for loads of atomic variables + +def check_atomic_loads(kernel): + """For every assignment that reads atomic variables other than its assignee + without an atomicity entry, append an :class:`AtomicLoad` for each of them. + """ + + logger.debug("%s: check atomic loads" % kernel.name) + from loopy.types import AtomicType + from loopy.kernel.array import ArrayBase + from loopy.kernel.instruction import Assignment, AtomicLoad + + # find atomic variables + atomicity_candidates = ( + set(v.name for v in six.itervalues(kernel.temporary_variables) + if isinstance(v.dtype, AtomicType)) + | + set(v.name for v in kernel.args + if isinstance(v, ArrayBase) + and isinstance(v.dtype, AtomicType))) + + new_insns = [] + for insn in kernel.instructions: + if isinstance(insn, Assignment): + # look for atomic variables + atomic_accesses = set(a.var_name for a in insn.atomicity) + accessed_atomic_vars = (insn.dependency_names() & atomicity_candidates)\ + - set([insn.assignee_var_names()[0]]) + if not accessed_atomic_vars <= atomic_accesses: + #if we're missing some + missed = accessed_atomic_vars - atomic_accesses + for x in missed: + if set([x]) & atomicity_candidates: + insn = insn.copy( + atomicity=insn.atomicity + (AtomicLoad(x),)) + + new_insns.append(insn) + + return kernel.copy(instructions=new_insns) + +# }}} + + preprocess_cache = WriteOncePersistentDict( "loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -2132,6 +2175,10 @@ def preprocess_kernel(kernel, device=None): kernel = find_idempotence(kernel) kernel = limit_boostability(kernel) + # check for atomic loads, much easier to do here now that the dependencies + #
have been established + kernel = check_atomic_loads(kernel) + kernel = kernel.target.preprocess(kernel) logger.info("%s: preprocess done" % kernel.name) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index a2cfbb3600fe73cea3b1dae4e0d203b68aeaabe1..1dcb8c6e5f806f27a38bcd94f51d96bac528f271 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -783,7 +783,11 @@ class CASTBuilder(ASTBuilderBase): needed_dtype=lhs_dtype)) elif isinstance(lhs_atomicity, AtomicInit): - raise NotImplementedError("atomic init") + codegen_state.seen_atomic_dtypes.add(lhs_dtype) + return codegen_state.ast_builder.emit_atomic_init( + codegen_state, lhs_atomicity, lhs_var, + insn.assignee, insn.expression, + lhs_dtype, rhs_type_context) elif isinstance(lhs_atomicity, AtomicUpdate): codegen_state.seen_atomic_dtypes.add(lhs_dtype) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index d2fe4157fc1ff6f9eb7817bea7da8da7e31bbdc1..31e0569b92a9ddada8ad66c2e0a065c191cc61d3 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -501,6 +501,18 @@ class OpenCLCASTBuilder(CASTBuilder): return CLConstant(arg_decl) + # {{{ + + def emit_atomic_init(self, codegen_state, lhs_atomicity, lhs_var, + lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): + # for the CL1 flavor, this is as simple as a regular update with whatever + # the RHS value is... 
+ + return self.emit_atomic_update(codegen_state, lhs_atomicity, lhs_var, + lhs_expr, rhs_expr, lhs_dtype, rhs_type_context) + + # }}} + # {{{ code generation for atomic update def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, diff --git a/test/test_loopy.py b/test/test_loopy.py index 084f9f045bc45c34a33feb22eaacc05df8d6c02e..927f5a475f2cc11f2d74876676947765be40f010 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1062,6 +1062,77 @@ def test_atomic(ctx_factory, dtype): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000)) +@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) +def test_atomic_load(ctx_factory, dtype): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + from loopy.kernel.data import temp_var_scope as scopes + n = 10 + vec_width = 4 + + if ( + np.dtype(dtype).itemsize == 8 + and "cl_khr_int64_base_atomics" not in ctx.devices[0].extensions): + pytest.skip("64-bit atomics not supported on device") + + import pyopencl.version # noqa + if ( + cl.version.VERSION < (2015, 2) + and dtype == np.int64): + pytest.skip("int64 RNG not supported in PyOpenCL < 2015.2") + + knl = lp.make_kernel( + "{ [i,j]: 0<=i,j<n}", + """ + for j + <> upper = 0 + <> lower = 0 + temp = 0 {id=init, atomic} + for i + upper = upper + i * a[i] {id=sum0} + lower = lower - b[i] {id=sum1} + end + temp = temp + lower {id=temp_sum, dep=sum*:init, atomic,\ + nosync=init} + ... lbarrier {id=lb2, dep=temp_sum} + out[j] = upper / temp {id=final, dep=lb2, atomic,\ + nosync=init:temp_sum} + end + """, + [ + lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True), + lp.GlobalArg("a", dtype, shape=lp.auto), + lp.GlobalArg("b", dtype, shape=lp.auto), + lp.TemporaryVariable('temp', dtype, for_atomic=True, + scope=scopes.LOCAL), + "..."
+ ], + silenced_warnings=["write_race(init)", "write_race(temp_sum)"]) + knl = lp.fix_parameters(knl, n=n) + knl = lp.split_iname(knl, "j", vec_width, inner_tag="l.0") + _, out = knl(queue, a=np.arange(n, dtype=dtype), b=np.arange(n, dtype=dtype)) + assert np.allclose(out, np.full_like(out, ((1 - 2 * n) / 3.0))) + + +@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) +def test_atomic_init(dtype): + vec_width = 4 + + knl = lp.make_kernel( + "{ [i,j]: 0<=i<100 }", + """ + out[i%4] = 0 {id=init, atomic=init} + """, + [ + lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True), + "..." + ], + silenced_warnings=["write_race(init)"]) + knl = lp.split_iname(knl, 'i', vec_width, inner_tag='l.0') + + print(lp.generate_code_v2(knl).device_code()) + + def test_within_inames_and_reduction(): # See https://github.com/inducer/loopy/issues/24