From 4c86036ee8e9dd915e745c56828ec1addb7aa536 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 1 Dec 2016 14:34:38 -0600 Subject: [PATCH 1/7] Add check for missing definitions (closes #10). The check is to make sure that a non-global temporary gets a definition within every subkernel that uses it. --- loopy/check.py | 24 ++++++++++++++++++++++++ loopy/diagnostic.py | 4 ++++ test/test_loopy.py | 16 ++++++++++++++++ 3 files changed, 44 insertions(+) diff --git a/loopy/check.py b/loopy/check.py index 2556bee7b..8e1709b71 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -502,6 +502,29 @@ def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel): # }}} +# {{{ check that temporaries are defined in subkernels where used + +def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): + from loopy.schedule.tools import InstructionQuery + from loopy.kernel.data import temp_var_scope + + insn_query = InstructionQuery(kernel) + + for subkernel in insn_query.subkernels(): + for temporary in ( + insn_query.temporaries_read_in_subkernel(subkernel) - + insn_query.temporaries_written_in_subkernel(subkernel)): + + if kernel.temporary_variables[temporary].scope in ( + temp_var_scope.PRIVATE, temp_var_scope.LOCAL): + from loopy.diagnostic import MissingDefinitionError + raise MissingDefinitionError("temporary variable '%s' gets used in " + "subkernel '%s' without a definition (maybe you forgot to call " + "loopy.save_and_reload_temporaries?)" % (temporary, subkernel)) + +# }}} + + # {{{ check that shapes and strides are arguments def check_that_shapes_and_strides_are_arguments(kernel): @@ -552,6 +575,7 @@ def pre_codegen_checks(kernel): check_for_unused_hw_axes_in_insns(kernel) check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) + check_that_temporaries_are_defined_in_subkernels_where_used(kernel) kernel.target.pre_codegen_check(kernel) check_that_shapes_and_strides_are_arguments(kernel) diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index 29996d6c7..e76279c27 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -95,6 +95,10 @@ class DependencyTypeInferenceFailure(TypeInferenceFailure): class MissingBarrierError(LoopyError): pass + +class MissingDefinitionError(LoopyError): + pass + # }}} diff --git a/test/test_loopy.py b/test/test_loopy.py index af4269047..81fe49cb4 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1316,6 +1316,22 @@ def test_save_local_multidim_array(ctx_factory, debug=False): save_and_reload_temporaries_test(queue, knl, 1, debug) +def test_missing_temporary_definition_detection(): + knl = lp.make_kernel( + "{ [i]: 0<=i<10 }", + """ + for i + <> t = 1 + ... gbarrier + out[i] = t + end + """, seq_dependencies=True) + + from loopy.diagnostic import MissingDefinitionError + with pytest.raises(MissingDefinitionError): + lp.generate_code_v2(knl) + + def test_global_temporary(ctx_factory): ctx = ctx_factory() -- GitLab From 5108ca72c6d908608ae7a54a5141bfc1b9ee000c Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 1 Dec 2016 16:45:54 -0600 Subject: [PATCH 2/7] Expand tutorial section for global barriers, re-word some things, and update to include the missing definition error. --- doc/tutorial.rst | 188 +++++++++++++++++++++++++++++++++++------------ 1 file changed, 139 insertions(+), 49 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 7c8ba2fc9..06da1f8b2 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1057,6 +1057,16 @@ earlier: Tagged prefetching ~~~~~~~~~~~~~~~~~~ +.. _global_temporaries: + +Temporaries in global memory +~~~~~~~~~~~~~~~~~~~~~~~~~~~~ + +:mod:`loopy` supports using temporaries with global storage duration. As with +local and private temporaries, the runtime allocates storage for global +temporaries when the kernel gets executed. The user must explicitly specify that +a temporary is global. To specify that a temporary is global, use +:func:`loopy.set_temporary_scope`. Substitution rules ~~~~~~~~~~~~~~~~~~ @@ -1074,9 +1084,11 @@ Synchronization .. {{{ -In OpenCL, writes are not generally guaranteed to be immediately visible to -other work items. In order to ensure that memory is consistent across work -items, some sort of synchronization operation is used. +When one work item executing with others writes to a memory location, OpenCL +does not guarantee that other work items will immediately be able to read the +memory location and get back the same thing that was written. In order to ensure +that memory is consistent across work items, some sort of synchronization +operation is used. :mod:`loopy` supports synchronization in the form of *barriers* or *atomic operations*. @@ -1085,28 +1097,37 @@ Barriers ~~~~~~~~ Prior to code generation, :mod:`loopy` performs a check to see that every memory -access is free of dependencies requiring a barrier. A memory access dependency -that exists across multiple work items requires a barrier if it involves at -least one write operation. +access is free of dependencies requiring a barrier. The following kinds of +memory access dependencies require a barrier when they involve more than one +work item: + +* read-after-write +* write-after-read +* write-after-write. :mod:`loopy` supports two kinds of barriers: * *Local barriers* ensure consistency of local memory accesses to items within - *the same* work group. As in OpenCL, all work items in the group are required - to wait until everyone has reached the barrier instruction before continuing. + *the same* work group. This synchronizes with all instructions in the work + group. + +* *Global barriers* ensure consistency of global memory accesses + across *all* work groups, i.e. it synchronizes with every work item + executing the kernel. Note that there is no exact equivalent for + this kind of barrier in OpenCL. [#global-barrier-note]_ -* *Global barriers* ensure consistency of *global* memory accesses across *all* - work groups. Note that there is no exact equivalent in OpenCL. All work items - across all work groups are required to wait until everyone has reached the - barrier instruction before continuing. +Once a work item has reached a barrier, it waits for everyone that it +synchronizes with to reach the barrier before continuing. This means that unless +all work items reach the same barrier, the kernel will hang during execution. By default, :mod:`loopy` inserts local barriers between two instructions when it detects that a dependency involving local memory may occur across work items. To -see this in action, take a look at the section on :ref:`local_temporaries`. +see this in action, take a look at the section on :ref:`local_temporaries`. -In contrast, :mod:`loopy` will *not* insert global barriers -automatically. Consider the following kernel, which attempts to rotate its input -to the right by 1: +In contrast, :mod:`loopy` will *not* insert global barriers automatically. +Global barriers require manual intervention along with some special +post-processing which we describe below. Consider the following kernel, which +attempts to rotate its input to the right by 1 in parallel: .. doctest:: @@ -1125,16 +1146,17 @@ to the right by 1: ... name="rotate_v1", ... assumptions="n mod 16 = 0") >>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0") + +Note the presence of the write-after-read dependency in global memory. Due to +this, :mod:`loopy` will complain that global barrier needs to be inserted: + >>> cgr = lp.generate_code_v2(knl) Traceback (most recent call last): ... MissingBarrierError: Dependency 'rotate depends on maketmp' (for variable 'arr') requires synchronization by a global barrier (add a 'no_sync_with' instruction option to state that no synchronization is needed) -Because of the write-after-read dependency in global memory, a global barrier -needs to be inserted. This can be accomplished with a ``... gbarrier`` -instruction. Note that :mod:`loopy` implements global barriers by splitting the -kernel into multiple device-side kernels, so that the resulting code will -contain more than one kernel. +The syntax for a global barrier instruction is ``... gbarrier``. This needs to +be added between the pair of offending instructions. .. doctest:: @@ -1154,38 +1176,91 @@ contain more than one kernel. ... name="rotate_v2", ... assumptions="n mod 16 = 0") >>> knl = lp.split_iname(knl, "i", 16, inner_tag="l.0", outer_tag="g.0") + +When we try to generate code for this, it will still not work. + >>> cgr = lp.generate_code_v2(knl) - >>> print(cgr.device_code()) - #define lid(N) ((int) get_local_id(N)) - #define gid(N) ((int) get_group_id(N)) - - __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n) - { - int tmp; - - tmp = arr[16 * gid(0) + lid(0)]; - } - - __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n) - { - int tmp; - - arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp; - } + Traceback (most recent call last): + ... + MissingDefinitionError: temporary variable 'tmp' gets used in subkernel 'rotate_v2_0' without a definition (maybe you forgot to call loopy.save_and_reload_temporaries?) -Note that we are not done yet. The problem is that while `tmp` is assigned in -the first kernel, the assignment of `tmp` is not saved for the second -kernel. :mod:`loopy` provides a function called -:func:`loopy.save_and_reload_temporaries` for the purpose of handling the -situation of saving and restoring temporary values across global barriers. In -order to use this function the kernel must be preprocessed and scheduled first, -the latter of which is handled by :func:`loopy.get_one_scheduled_kernel`. +To understand what is going on, you need to know that :mod:`loopy` implements +global barriers by splitting the kernel into multiple device-side kernels. The +splitting happens when the instruction schedule is generated. To see the +schedule, we must first must first call :func:`loopy.get_one_scheduled_kernel`: -.. doctest:: + >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + >>> print(knl) + --------------------------------------------------------------------------- + KERNEL: rotate_v2 + --------------------------------------------------------------------------- + ... + --------------------------------------------------------------------------- + SCHEDULE: + 0: CALL KERNEL rotate_v2(extra_args=[], extra_inames=[]) + 1: [maketmp] tmp <- arr[i_inner + i_outer*16] + 2: RETURN FROM KERNEL rotate_v2 + 3: ---BARRIER:global--- + 4: CALL KERNEL rotate_v2_0(extra_args=[], extra_inames=[]) + 5: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp + 6: RETURN FROM KERNEL rotate_v2_0 + --------------------------------------------------------------------------- + +As the error message suggests, taking a look at the generated instruction +schedule will show that while ``tmp`` is assigned in the first kernel, the +assignment to ``tmp`` is not seen by the second kernel. Because the temporary is +in private memory, it does not persist across calls to device kernels (the same +goes for local temporaries). + +:mod:`loopy` provides a function called +:func:`loopy.save_and_reload_temporaries` for the purpose of handling the +task of saving and restoring temporary values across global barriers. This +function adds instructions to the kernel without scheduling them. That means +that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to +put those instructions into the schedule. >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) >>> knl = lp.save_and_reload_temporaries(knl) - >>> knl = lp.get_one_scheduled_kernel(knl) + >>> knl = lp.get_one_scheduled_kernel(knl) # Schedule added instructions + >>> print(knl) + --------------------------------------------------------------------------- + KERNEL: rotate_v2 + --------------------------------------------------------------------------- + ... + --------------------------------------------------------------------------- + TEMPORARIES: + tmp: type: np:dtype('int32'), shape: () scope:private + tmp_save_slot: type: np:dtype('int32'), shape: (n // 16, 16), dim_tags: (N1:stride:16, N0:stride:1) scope:global + --------------------------------------------------------------------------- + ... + --------------------------------------------------------------------------- + SCHEDULE: + 0: CALL KERNEL rotate_v2(extra_args=['tmp_save_slot'], extra_inames=[]) + 1: [maketmp] tmp <- arr[i_inner + i_outer*16] + 2: [tmp.save] tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] <- tmp + 3: RETURN FROM KERNEL rotate_v2 + 4: ---BARRIER:global--- + 5: CALL KERNEL rotate_v2_0(extra_args=['tmp_save_slot'], extra_inames=[]) + 6: [tmp.reload] tmp <- tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] + 7: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp + 8: RETURN FROM KERNEL rotate_v2_0 + --------------------------------------------------------------------------- + +Here's an overview of what :func:`loopy.save_and_reload_temporaries` actually +does in more detail: + +1. :mod:`loopy` first uses liveness analysis to determine which temporary + variables' live ranges cross a global barrier. + +2. For each temporary, :mod:`loopy` creates a storage slot for the temporary in + global memory (see :ref:`global_temporaries`). + +3. :mod:`loopy` saves the temporary into its global storage slot whenever it + detects the temporary is live-out from a kernel, and reloads the temporary + from its global storage slot when it detects that it needs to do so. + +The kernel translates into two OpenCL kernels. + >>> cgr = lp.generate_code_v2(knl) >>> print(cgr.device_code()) #define lid(N) ((int) get_local_id(N)) @@ -1206,8 +1281,14 @@ the latter of which is handled by :func:`loopy.get_one_scheduled_kernel`. tmp = tmp_save_slot[16 * gid(0) + lid(0)]; arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp; } - >>> evt, (out,) = knl(queue, arr=cl.array.arange(queue, 16, dtype=np.int32), out_host=True) - >>> print(out) + +Executing the kernel does what we expect. + + >>> arr = cl.array.arange(queue, 16, dtype=np.int32) + >>> print(arr) + [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] + >>> evt, (out,) = knl(queue, arr=arr) + >>> print(arr) [15 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14] Atomic operations @@ -1228,6 +1309,9 @@ tagged, as in the following example:: ], assumptions="n>0") +.. [#global-barrier-note] In particular, this is *not* the same as a call to + ``barrier(CLK_GLOBAL_MEM_FENCE)``. + .. }}} .. _more-complicated-programs: @@ -1391,6 +1475,12 @@ considers making *a_fetch* ``local`` (in the OpenCL memory sense of the word) to make use of parallelism in prefetching, it discovers that a write race across the remaining axis of the workgroup would emerge. +Barriers +~~~~~~~~ + +:mod:`loopy` may infer the need for a barrier when it is not necessary. The +``no_sync_with`` instruction attribute can be used to resolve this. + TODO .. }}} -- GitLab From fc8184afb73f6b601c59fa483bb2b0b409cb496f Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 1 Dec 2016 16:47:26 -0600 Subject: [PATCH 3/7] Fix doctest warning. --- doc/tutorial.rst | 1 + 1 file changed, 1 insertion(+) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 06da1f8b2..5a1ad96eb 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -267,6 +267,7 @@ Additionally, for C-based languages, header definitions can be obtained via the :func:`loopy.generate_header`: .. doctest:: + >>> header = str(lp.generate_header(typed_knl)[0]) >>> print(header) __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out); -- GitLab From 8f8e8594d7bbcbdd278f95d1d930570a9307b40b Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 1 Dec 2016 17:10:01 -0600 Subject: [PATCH 4/7] Code generation: make it an error to enter code generation with unscheduled instructions (closes #9). --- loopy/check.py | 22 ++++++++++++++++++++++ loopy/diagnostic.py | 4 ++++ test/test_loopy.py | 17 +++++++++++++++++ 3 files changed, 43 insertions(+) diff --git a/loopy/check.py b/loopy/check.py index 8e1709b71..039061818 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -525,6 +525,27 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): # }}} +# {{{ check that all instructions are scheduled + +def check_that_all_insns_are_scheduled(kernel): + all_insns = set(insn.id for insn in kernel.instructions) + from loopy.schedule import sched_item_to_insn_id + scheduled_insns = set( + insn_id + for sched_item in kernel.schedule + for insn_id in sched_item_to_insn_id(sched_item)) + + assert scheduled_insns <= all_insns + + if scheduled_insns < all_insns: + from loopy.diagnostic import UnscheduledInstructionError + raise UnscheduledInstructionError( + "unscheduled instructions: '%s'" + % ', '.join(all_insns - scheduled_insns)) + +# }}} + + # {{{ check that shapes and strides are arguments def check_that_shapes_and_strides_are_arguments(kernel): @@ -576,6 +597,7 @@ def pre_codegen_checks(kernel): check_for_unused_hw_axes_in_insns(kernel) check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel) check_that_temporaries_are_defined_in_subkernels_where_used(kernel) + check_that_all_insns_are_scheduled(kernel) kernel.target.pre_codegen_check(kernel) check_that_shapes_and_strides_are_arguments(kernel) diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index e76279c27..15ab8a1ee 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -99,6 +99,10 @@ class MissingBarrierError(LoopyError): class MissingDefinitionError(LoopyError): pass + +class UnscheduledInstructionError(LoopyError): + pass + # }}} diff --git a/test/test_loopy.py b/test/test_loopy.py index 81fe49cb4..30408b26e 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1817,6 +1817,23 @@ def test_tight_loop_bounds_codegen(): assert for_loop in cgr.device_code() +def test_unscheduled_insn_detection(): + knl = lp.make_kernel( + "{ [i]: 0 <= i < 10 }", + """ + out[i] = i {id=insn1} + """, + "...") + + knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) + insn1, = lp.find_instructions(knl, "id:insn1") + knl.instructions.append(insn1.copy(id="insn2")) + + from loopy.diagnostic import UnscheduledInstructionError + with pytest.raises(UnscheduledInstructionError): + lp.generate_code(knl) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 7c5a5c90370714097f081657e592d0d9338af689 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Fri, 2 Dec 2016 15:55:21 -0600 Subject: [PATCH 5/7] [ci skip] Fix tutorial typos. --- doc/tutorial.rst | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 5a1ad96eb..d44e8f250 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1188,7 +1188,7 @@ When we try to generate code for this, it will still not work. To understand what is going on, you need to know that :mod:`loopy` implements global barriers by splitting the kernel into multiple device-side kernels. The splitting happens when the instruction schedule is generated. To see the -schedule, we must first must first call :func:`loopy.get_one_scheduled_kernel`: +schedule, we must first call :func:`loopy.get_one_scheduled_kernel`: >>> knl = lp.get_one_scheduled_kernel(lp.preprocess_kernel(knl)) >>> print(knl) @@ -1500,7 +1500,7 @@ be counted, which may facilitate performance prediction and optimization of a The functions used in the following examples may produce warnings. If you have already made the filterwarnings and catch_warnings calls used in the examples above, you may want to reset these before continuing. We will temporarily - supress warnings to keep the output clean: + suppress warnings to keep the output clean: .. doctest:: @@ -1667,7 +1667,7 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: f64 st e: 65536 :class:`loopy.ToCountMap` also makes it easy to determine the total amount -of data moved in bytes. Suppose we want to know the total abount of global +of data moved in bytes. Suppose we want to know the total amount of global memory data loaded and stored. We can produce a map with just this information using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: -- GitLab From 19f4deb6c935043e5aa3c8ac878ce0a97679943d Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 4 Dec 2016 16:23:08 -0600 Subject: [PATCH 6/7] Fix test failures. --- loopy/check.py | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/loopy/check.py b/loopy/check.py index 039061818..4e0382abe 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -515,6 +515,9 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): insn_query.temporaries_read_in_subkernel(subkernel) - insn_query.temporaries_written_in_subkernel(subkernel)): + if temporary.initializer is not None: + continue + if kernel.temporary_variables[temporary].scope in ( temp_var_scope.PRIVATE, temp_var_scope.LOCAL): from loopy.diagnostic import MissingDefinitionError @@ -528,20 +531,25 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): # {{{ check that all instructions are scheduled def check_that_all_insns_are_scheduled(kernel): - all_insns = set(insn.id for insn in kernel.instructions) + from loopy.kernel.instruction import NoOpInstruction + + all_schedulable_insns = set( + insn.id for insn in kernel.instructions + # nops are not schedulable + if not isinstance(insn, NoOpInstruction)) from loopy.schedule import sched_item_to_insn_id scheduled_insns = set( insn_id for sched_item in kernel.schedule for insn_id in sched_item_to_insn_id(sched_item)) - assert scheduled_insns <= all_insns + assert scheduled_insns <= all_schedulable_insns - if scheduled_insns < all_insns: + if scheduled_insns < all_schedulable_insns: from loopy.diagnostic import UnscheduledInstructionError raise UnscheduledInstructionError( "unscheduled instructions: '%s'" - % ', '.join(all_insns - scheduled_insns)) + % ', '.join(all_schedulable_insns - scheduled_insns)) # }}} -- GitLab From 2f66b8871a389d4129fae7b0d5fe5042b4e17837 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 4 Dec 2016 18:55:47 -0600 Subject: [PATCH 7/7] Access the temporary through kernel.temporary_variables --- loopy/check.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/check.py b/loopy/check.py index 4e0382abe..5921566e2 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -515,7 +515,7 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): insn_query.temporaries_read_in_subkernel(subkernel) - insn_query.temporaries_written_in_subkernel(subkernel)): - if temporary.initializer is not None: + if kernel.temporary_variables[temporary].initializer is not None: continue if kernel.temporary_variables[temporary].scope in ( -- GitLab