From 1e257dfdc9c5793f886b59908061229d1bacdc32 Mon Sep 17 00:00:00 2001 From: arghdos Date: Thu, 17 Aug 2017 15:22:18 -0400 Subject: [PATCH 1/9] add ability to load atomics along the lines of c11 standard --- loopy/check.py | 1 + loopy/kernel/instruction.py | 35 +++++++++++++++++++++++++--- loopy/preprocess.py | 46 +++++++++++++++++++++++++++++++++++++ test/test_loopy.py | 34 +++++++++++++++++++++++++++ 4 files changed, 113 insertions(+), 3 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index e72f9e3e6..8d78e684b 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -493,6 +493,7 @@ def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel): accessed_atomic_vars = insn.dependency_names() & atomicity_candidates if not accessed_atomic_vars <= atomic_accesses: + missed = atomic_accesses - accessed_atomic_vars raise LoopyError("atomic variable(s) '%s' in instruction '%s' " "used in non-atomic access" % ( diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index d5c388af6..1d20ff71c 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -603,7 +603,7 @@ class AtomicInit(VarAtomicity): memory_scope.to_string(self.scope)) -class AtomicUpdate(VarAtomicity): +class OrderedAtomic(VarAtomicity): """Properties of an atomic operation. A subclass of :class:`VarAtomicity`. .. attribute:: ordering @@ -624,7 +624,7 @@ class AtomicUpdate(VarAtomicity): """ super(AtomicUpdate, self).update_persistent_hash(key_hash, key_builder) - key_builder.rec(key_hash, "AtomicUpdate") + key_builder.rec(key_hash, str(self.__class__.__name__)) key_builder.rec(key_hash, self.ordering) key_builder.rec(key_hash, self.scope) @@ -634,11 +634,40 @@ class AtomicUpdate(VarAtomicity): and self.scope == other.scope) def __str__(self): - return "update[%s]%s/%s" % ( + return "%s[%s]%s/%s" % ( + self.op_name, self.var_name, memory_ordering.to_string(self.ordering), memory_scope.to_string(self.scope)) + +class AtomicUpdate(VarAtomicity): + """Properties of an atomic update. A subclass of :class:`VarAtomicity`. + + .. attribute:: ordering + + One of the values from :class:`memory_ordering` + + .. attribute:: scope + + One of the values from :class:`memory_scope` + """ + op_name = 'update' + + +class AtomicLoad(VarAtomicity): + """Properties of an atomic load. A subclass of :class:`VarAtomicity`. + + .. attribute:: ordering + + One of the values from :class:`memory_ordering` + + .. attribute:: scope + + One of the values from :class:`memory_scope` + """ + op_name = 'load' + # }}} diff --git a/loopy/preprocess.py b/loopy/preprocess.py index ced1aaaa1..554bf6b24 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2020,6 +2020,48 @@ def limit_boostability(kernel): # }}} +# {{{ check for loads of atomic variables + +def check_atomic_loads(kernel): + """Find instances of AtomicInit or AtomicUpdate with use of other atomic + variables to update the atomicity + """ + + logger.debug("%s: check atomic loads" % kernel.name) + from loopy.types import AtomicType + from loopy.kernel.array import ArrayBase + from loopy.kernel.instruction import Assignment, AtomicLoad + + # find atomic variables + atomicity_candidates = ( + set(v.name for v in six.itervalues(kernel.temporary_variables) + if isinstance(v.dtype, AtomicType)) + | + set(v.name for v in kernel.args + if isinstance(v, ArrayBase) + and isinstance(v.dtype, AtomicType))) + + new_insns = [] + for insn in kernel.instructions: + if isinstance(insn, Assignment): + # look for atomic variables + atomic_accesses = set(a.var_name for a in insn.atomicity) + accessed_atomic_vars = insn.dependency_names() & atomicity_candidates + if not accessed_atomic_vars <= atomic_accesses: + #if we're missing some + missed = accessed_atomic_vars - atomic_accesses + for x in missed: + if set([x]) & atomicity_candidates: + insn = insn.copy( + atomicity=insn.atomicity + (AtomicLoad(x),)) + + new_insns.append(insn) + + return kernel.copy(instructions=new_insns) + +# }}} + + preprocess_cache = PersistentDict("loopy-preprocess-cache-v2-"+DATA_MODEL_VERSION, key_builder=LoopyKeyBuilder()) @@ -2104,6 +2146,10 @@ def preprocess_kernel(kernel, device=None): kernel = find_idempotence(kernel) kernel = limit_boostability(kernel) + # check for atomic loads, much easier to do here now that the dependencies + # have been established + kernel = check_atomic_loads(kernel) + kernel = kernel.target.preprocess(kernel) logger.info("%s: preprocess done" % kernel.name) diff --git a/test/test_loopy.py b/test/test_loopy.py index 3593019ad..f767197b0 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1011,6 +1011,40 @@ def test_atomic(ctx_factory, dtype): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000)) +def test_atomic_load(ctx_factory): + dtype = np.int32 + ctx = ctx_factory() + + if ( + np.dtype(dtype).itemsize == 8 + and "cl_khr_int64_base_atomics" not in ctx.devices[0].extensions): + pytest.skip("64-bit atomics not supported on device") + + import pyopencl.version # noqa + if ( + cl.version.VERSION < (2015, 2) + and dtype == np.int64): + pytest.skip("int64 RNG not supported in PyOpenCL < 2015.2") + + knl = lp.make_kernel( + "{ [i]: 0<=i Date: Thu, 17 Aug 2017 16:20:06 -0400 Subject: [PATCH 2/9] more fixes, doesn't seem worth it --- loopy/check.py | 1 - loopy/preprocess.py | 3 ++- test/test_loopy.py | 34 ++++++++++++++++++++++++---------- 3 files changed, 26 insertions(+), 12 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 8d78e684b..e72f9e3e6 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -493,7 +493,6 @@ def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel): accessed_atomic_vars = insn.dependency_names() & atomicity_candidates if not accessed_atomic_vars <= atomic_accesses: - missed = atomic_accesses - accessed_atomic_vars raise LoopyError("atomic variable(s) '%s' in instruction '%s' " "used in non-atomic access" % ( diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 554bf6b24..b67262e6b 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -2046,7 +2046,8 @@ def check_atomic_loads(kernel): if isinstance(insn, Assignment): # look for atomic variables atomic_accesses = set(a.var_name for a in insn.atomicity) - accessed_atomic_vars = insn.dependency_names() & atomicity_candidates + accessed_atomic_vars = (insn.dependency_names() & atomicity_candidates)\ + - set([insn.assignee_var_names()[0]]) if not accessed_atomic_vars <= atomic_accesses: #if we're missing some missed = accessed_atomic_vars - atomic_accesses diff --git a/test/test_loopy.py b/test/test_loopy.py index f767197b0..1bdad6b76 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1014,6 +1014,9 @@ def test_atomic(ctx_factory, dtype): def test_atomic_load(ctx_factory): dtype = np.int32 ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + from loopy.kernel.data import temp_var_scope as scopes + n = 100 if ( np.dtype(dtype).itemsize == 8 @@ -1027,22 +1030,33 @@ def test_atomic_load(ctx_factory): pytest.skip("int64 RNG not supported in PyOpenCL < 2015.2") knl = lp.make_kernel( - "{ [i]: 0<=i upper = 0 + <> lower = 0 + temp[0] = 0 {id=init, atomic} + for i + upper = upper + i * a[i] {id=sum0} + lower = lower - b[i] {id=sum1} + end + ... lbarrier {id=lb1, dep=sum1} + temp[0] = temp[0] + lower {id=temp_sum, dep=sum*:lb1:init, atomic} + ... lbarrier {id=lb2, dep=temp_sum} + out[j] = upper / temp[0] {dep=sum*:temp_sum:lb2, atomic} + end """, [ lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True), - lp.GlobalArg('temp', dtype, shape=lp.auto, for_atomic=True), + lp.GlobalArg("a", dtype, shape=lp.auto), + lp.GlobalArg("b", dtype, shape=lp.auto), + lp.TemporaryVariable('temp', dtype, for_atomic=True, + scope=scopes.GLOBAL, shape=(1,)), "..." - ], - assumptions="n>0") + ]) - ref_knl = knl - knl = lp.split_iname(knl, "i", 512) - knl = lp.split_iname(knl, "i_inner", 128, outer_tag="unr", inner_tag="g.0") - lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000)) + knl = lp.split_iname(knl, "j", 512, inner_tag="l.0") + _, out = knl(queue, a=np.arange(n), b=np.arange(n)) def test_within_inames_and_reduction(): -- GitLab From 52932e9e16dfce6a597c23ef8b7ec0bdaa00fc57 Mon Sep 17 00:00:00 2001 From: arghdos Date: Fri, 18 Aug 2017 10:23:12 -0400 Subject: [PATCH 3/9] fix atomic load test --- test/test_loopy.py | 14 +++++++++----- 1 file changed, 9 insertions(+), 5 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 1bdad6b76..0a3ce11f4 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1017,6 +1017,7 @@ def test_atomic_load(ctx_factory): queue = cl.CommandQueue(ctx) from loopy.kernel.data import temp_var_scope as scopes n = 100 + vec_width = 4 if ( np.dtype(dtype).itemsize == 8 @@ -1041,9 +1042,11 @@ def test_atomic_load(ctx_factory): lower = lower - b[i] {id=sum1} end ... lbarrier {id=lb1, dep=sum1} - temp[0] = temp[0] + lower {id=temp_sum, dep=sum*:lb1:init, atomic} + temp[0] = temp[0] + lower {id=temp_sum, dep=sum*:lb1:init, atomic,\ + nosync=init} ... lbarrier {id=lb2, dep=temp_sum} - out[j] = upper / temp[0] {dep=sum*:temp_sum:lb2, atomic} + out[j] = upper / temp[0] {id=final, dep=sum*:temp_sum:lb2, atomic,\ + nosync=init:temp_sum} end """, [ @@ -1051,12 +1054,13 @@ def test_atomic_load(ctx_factory): lp.GlobalArg("a", dtype, shape=lp.auto), lp.GlobalArg("b", dtype, shape=lp.auto), lp.TemporaryVariable('temp', dtype, for_atomic=True, - scope=scopes.GLOBAL, shape=(1,)), + scope=scopes.LOCAL, shape=(1,)), "..." ]) - knl = lp.split_iname(knl, "j", 512, inner_tag="l.0") - _, out = knl(queue, a=np.arange(n), b=np.arange(n)) + knl = lp.split_iname(knl, "j", vec_width, inner_tag="l.0") + _, out = knl(queue, a=np.arange(n, dtype=dtype), b=np.arange(n, dtype=dtype)) + assert np.allclose(out, np.full_like(out, (-(2 * n - 1) / (3 * vec_width)))) def test_within_inames_and_reduction(): -- GitLab From cfc85e74ca25ddf80f7d7c2d6c13decc8da38b14 Mon Sep 17 00:00:00 2001 From: arghdos Date: Fri, 18 Aug 2017 10:26:15 -0400 Subject: [PATCH 4/9] correct derive class --- loopy/kernel/instruction.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index 1d20ff71c..be9dd93fa 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -641,7 +641,7 @@ class OrderedAtomic(VarAtomicity): memory_scope.to_string(self.scope)) -class AtomicUpdate(VarAtomicity): +class AtomicUpdate(OrderedAtomic): """Properties of an atomic update. A subclass of :class:`VarAtomicity`. .. attribute:: ordering @@ -655,7 +655,7 @@ class AtomicUpdate(VarAtomicity): op_name = 'update' -class AtomicLoad(VarAtomicity): +class AtomicLoad(OrderedAtomic): """Properties of an atomic load. A subclass of :class:`VarAtomicity`. .. attribute:: ordering -- GitLab From e5d6e16bc1ac001ccce8ea44e75d9b1336d7b08d Mon Sep 17 00:00:00 2001 From: arghdos Date: Fri, 18 Aug 2017 10:45:27 -0400 Subject: [PATCH 5/9] not quite sure why this is failing, try ensuring the barrier is _after_ the temp init --- test/test_loopy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 0a3ce11f4..6fecef421 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1041,7 +1041,7 @@ def test_atomic_load(ctx_factory): upper = upper + i * a[i] {id=sum0} lower = lower - b[i] {id=sum1} end - ... lbarrier {id=lb1, dep=sum1} + ... lbarrier {id=lb1, dep=sum1:init} temp[0] = temp[0] + lower {id=temp_sum, dep=sum*:lb1:init, atomic,\ nosync=init} ... lbarrier {id=lb2, dep=temp_sum} -- GitLab From fa2c4ad613e4e9e0a62a96982ecdf2350b2aa4ac Mon Sep 17 00:00:00 2001 From: arghdos Date: Fri, 18 Aug 2017 11:58:44 -0400 Subject: [PATCH 6/9] fix recursive atomic hash loop, switch to float32 & add warning ignores --- loopy/kernel/instruction.py | 4 ++-- test/test_loopy.py | 7 ++++--- 2 files changed, 6 insertions(+), 5 deletions(-) diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index be9dd93fa..cce65a522 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -623,13 +623,13 @@ class OrderedAtomic(VarAtomicity): :class:`pytools.persistent_dict.PersistentDict`. """ - super(AtomicUpdate, self).update_persistent_hash(key_hash, key_builder) + super(OrderedAtomic, self).update_persistent_hash(key_hash, key_builder) key_builder.rec(key_hash, str(self.__class__.__name__)) key_builder.rec(key_hash, self.ordering) key_builder.rec(key_hash, self.scope) def __eq__(self, other): - return (super(AtomicUpdate, self).__eq__(other) + return (super(OrderedAtomic, self).__eq__(other) and self.ordering == other.ordering and self.scope == other.scope) diff --git a/test/test_loopy.py b/test/test_loopy.py index 6fecef421..54a304d40 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1054,13 +1054,14 @@ def test_atomic_load(ctx_factory): lp.GlobalArg("a", dtype, shape=lp.auto), lp.GlobalArg("b", dtype, shape=lp.auto), lp.TemporaryVariable('temp', dtype, for_atomic=True, - scope=scopes.LOCAL, shape=(1,)), + scope=scopes.LOCAL, shape=(vec_width,)), "..." - ]) + ], + silenced_warnings=["write_race(init)", "write_race(temp_sum)"]) knl = lp.split_iname(knl, "j", vec_width, inner_tag="l.0") _, out = knl(queue, a=np.arange(n, dtype=dtype), b=np.arange(n, dtype=dtype)) - assert np.allclose(out, np.full_like(out, (-(2 * n - 1) / (3 * vec_width)))) + assert np.allclose(out, np.full_like(out, (-(2 * n - 1) / float(3 * vec_width)))) def test_within_inames_and_reduction(): -- GitLab From 5a9b669e0c4f0fb882de96a1baa3c619cd77dc3d Mon Sep 17 00:00:00 2001 From: arghdos Date: Fri, 18 Aug 2017 14:06:33 -0400 Subject: [PATCH 7/9] float for exact comparison --- test/test_loopy.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 54a304d40..0aff90fd2 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1012,7 +1012,7 @@ def test_atomic(ctx_factory, dtype): def test_atomic_load(ctx_factory): - dtype = np.int32 + dtype = np.float32 ctx = ctx_factory() queue = cl.CommandQueue(ctx) from loopy.kernel.data import temp_var_scope as scopes -- GitLab From e68e85523c281dbac32a932c427993104dd30038 Mon Sep 17 00:00:00 2001 From: arghdos Date: Fri, 18 Aug 2017 14:41:26 -0400 Subject: [PATCH 8/9] add support for atomic_init in OpenCL --- loopy/target/c/__init__.py | 6 +++++- loopy/target/opencl.py | 12 ++++++++++++ test/test_loopy.py | 19 +++++++++++++++++++ 3 files changed, 36 insertions(+), 1 deletion(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index a2ad68250..47130c1f7 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -631,7 +631,11 @@ class CASTBuilder(ASTBuilderBase): needed_dtype=lhs_dtype)) elif isinstance(lhs_atomicity, AtomicInit): - raise NotImplementedError("atomic init") + codegen_state.seen_atomic_dtypes.add(lhs_dtype) + return codegen_state.ast_builder.emit_atomic_init( + codegen_state, lhs_atomicity, lhs_var, + insn.assignee, insn.expression, + lhs_dtype, rhs_type_context) elif isinstance(lhs_atomicity, AtomicUpdate): codegen_state.seen_atomic_dtypes.add(lhs_dtype) diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py index a5f7562c4..95299ef52 100644 --- a/loopy/target/opencl.py +++ b/loopy/target/opencl.py @@ -507,6 +507,18 @@ class OpenCLCASTBuilder(CASTBuilder): return CLConstant(arg_decl) + # {{{ + + def emit_atomic_init(self, codegen_state, lhs_atomicity, lhs_var, + lhs_expr, rhs_expr, lhs_dtype, rhs_type_context): + # for the CL1 flavor, this is as simple as a regular update with whatever + # the RHS value is... + + return self.emit_atomic_update(codegen_state, lhs_atomicity, lhs_var, + lhs_expr, rhs_expr, lhs_dtype, rhs_type_context) + + # }}} + # {{{ code generation for atomic update def emit_atomic_update(self, codegen_state, lhs_atomicity, lhs_var, diff --git a/test/test_loopy.py b/test/test_loopy.py index 0aff90fd2..f11230b1f 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1064,6 +1064,25 @@ def test_atomic_load(ctx_factory): assert np.allclose(out, np.full_like(out, (-(2 * n - 1) / float(3 * vec_width)))) +def test_atomic_init(): + dtype = np.float32 + vec_width = 4 + + knl = lp.make_kernel( + "{ [i,j]: 0<=i<100 }", + """ + out[i%4] = 0 {id=init, atomic=init} + """, + [ + lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True), + "..." + ], + silenced_warnings=["write_race(init)"]) + knl = lp.split_iname(knl, 'i', vec_width, inner_tag='l.0') + + print(lp.generate_code_v2(knl).device_code()) + + def test_within_inames_and_reduction(): # See https://github.com/inducer/loopy/issues/24 -- GitLab From be7223899e6d1b7cd63ebc96994b43ad6942f0c0 Mon Sep 17 00:00:00 2001 From: arghdos Date: Mon, 4 Dec 2017 12:02:54 -0500 Subject: [PATCH 9/9] cleanup --- test/test_loopy.py | 25 ++++++++++++------------- 1 file changed, 12 insertions(+), 13 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index 524ba45f2..927f5a475 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1062,12 +1062,12 @@ def test_atomic(ctx_factory, dtype): lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(n=10000)) -def test_atomic_load(ctx_factory): - dtype = np.float32 +@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) +def test_atomic_load(ctx_factory, dtype): ctx = ctx_factory() queue = cl.CommandQueue(ctx) from loopy.kernel.data import temp_var_scope as scopes - n = 100 + n = 10 vec_width = 4 if ( @@ -1082,21 +1082,20 @@ def test_atomic_load(ctx_factory): pytest.skip("int64 RNG not supported in PyOpenCL < 2015.2") knl = lp.make_kernel( - "{ [i,j]: 0<=i,j<100 }", + "{ [i,j]: 0<=i,j upper = 0 <> lower = 0 - temp[0] = 0 {id=init, atomic} + temp = 0 {id=init, atomic} for i upper = upper + i * a[i] {id=sum0} lower = lower - b[i] {id=sum1} end - ... lbarrier {id=lb1, dep=sum1:init} - temp[0] = temp[0] + lower {id=temp_sum, dep=sum*:lb1:init, atomic,\ + temp = temp + lower {id=temp_sum, dep=sum*:init, atomic,\ nosync=init} ... lbarrier {id=lb2, dep=temp_sum} - out[j] = upper / temp[0] {id=final, dep=sum*:temp_sum:lb2, atomic,\ + out[j] = upper / temp {id=final, dep=lb2, atomic,\ nosync=init:temp_sum} end """, @@ -1105,18 +1104,18 @@ def test_atomic_load(ctx_factory): lp.GlobalArg("a", dtype, shape=lp.auto), lp.GlobalArg("b", dtype, shape=lp.auto), lp.TemporaryVariable('temp', dtype, for_atomic=True, - scope=scopes.LOCAL, shape=(vec_width,)), + scope=scopes.LOCAL), "..." ], silenced_warnings=["write_race(init)", "write_race(temp_sum)"]) - + knl = lp.fix_parameters(knl, n=n) knl = lp.split_iname(knl, "j", vec_width, inner_tag="l.0") _, out = knl(queue, a=np.arange(n, dtype=dtype), b=np.arange(n, dtype=dtype)) - assert np.allclose(out, np.full_like(out, (-(2 * n - 1) / float(3 * vec_width)))) + assert np.allclose(out, np.full_like(out, ((1 - 2 * n) / 3.0))) -def test_atomic_init(): - dtype = np.float32 +@pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64]) +def test_atomic_init(dtype): vec_width = 4 knl = lp.make_kernel( -- GitLab