diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index 6fbf94306fea84ffe323c696228b41e380d8d857..89600102e09bb96173bb11db5c71d14dd3b2a206 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -93,6 +93,10 @@ class DependencyTypeInferenceFailure(TypeInferenceFailure): TypeInferenceFailure.__init__(self, message) self.symbol = symbol + +class MissingBarrierError(LoopyError): + pass + # }}} diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index cd6e463547128e68e8db86716441d6715cd795ab..921d2538a5068a8c309fe9f412aba588aa6243f2 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -1286,6 +1286,8 @@ class BarrierInstruction(_DataObliviousInstruction): ... gbarrier """ + fields = _DataObliviousInstruction.fields | set(["kind"]) + def __init__(self, id, depends_on=None, depends_on_is_final=None, groups=None, conflicts_with_groups=None, no_sync_with=None, diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index ae9ebacf5ca798c58f87234e72e5f64187f7afc6..7dd35e2227da9f763d2ac02b9440433c9d8521a9 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -87,8 +87,12 @@ class Barrier(ScheduleItem): .. attribute:: kind ``"local"`` or ``"global"`` + + .. 
attribute:: originating_insn_id """ - hash_fields = __slots__ = ["comment", "kind"] + + hash_fields = ["comment", "kind"] + __slots__ = hash_fields + ["originating_insn_id"] # }}} @@ -370,10 +374,25 @@ def format_insn(kernel, insn_id): insn = kernel.id_to_insn[insn_id] Fore = kernel.options._fore Style = kernel.options._style - return "[%s] %s%s%s <- %s%s%s" % ( + from loopy.kernel.instruction import ( + MultiAssignmentBase, NoOpInstruction, BarrierInstruction) + if isinstance(insn, MultiAssignmentBase): + return "[%s] %s%s%s <- %s%s%s" % ( format_insn_id(kernel, insn_id), Fore.CYAN, ", ".join(str(a) for a in insn.assignees), Style.RESET_ALL, Fore.MAGENTA, str(insn.expression), Style.RESET_ALL) + elif isinstance(insn, BarrierInstruction): + return "[%s] %s... %sbarrier%s" % ( + format_insn_id(kernel, insn_id), + Fore.MAGENTA, insn.kind[0], Style.RESET_ALL) + elif isinstance(insn, NoOpInstruction): + return "[%s] %s... nop%s" % ( + format_insn_id(kernel, insn_id), + Fore.MAGENTA, Style.RESET_ALL) + else: + return "[%s] %s%s%s" % ( + format_insn_id(kernel, insn_id), + Fore.CYAN, str(insn), Style.RESET_ALL) def dump_schedule(kernel, schedule): @@ -1059,7 +1078,29 @@ def filter_nops_from_schedule(kernel, schedule): # }}} -# {{{ barrier insertion +# {{{ convert barrier instructions to proper barriers + +def convert_barrier_instructions_to_barriers(kernel, schedule): + from loopy.kernel.instruction import BarrierInstruction + + result = [] + for sched_item in schedule: + if isinstance(sched_item, RunInstruction): + insn = kernel.id_to_insn[sched_item.insn_id] + if isinstance(insn, BarrierInstruction): + result.append(Barrier( + kind=insn.kind, + originating_insn_id=insn.id)) + continue + + result.append(sched_item) + + return result + +# }}} + + +# {{{ barrier insertion/verification class DependencyRecord(Record): """ @@ -1243,7 +1284,7 @@ def insn_ids_from_schedule(schedule): return result -def insert_barriers(kernel, schedule, reverse, kind, level=0): +def 
insert_barriers(kernel, schedule, reverse, kind, verify_only, level=0): """ :arg reverse: a :class:`bool`. For ``level > 0``, this function should be called twice, first with ``reverse=False`` to insert barriers for @@ -1259,6 +1300,8 @@ def insert_barriers(kernel, schedule, reverse, kind, level=0): :arg kind: "local" or "global". The :attr:`Barrier.kind` to be inserted. Generally, this function will be called once for each kind of barrier at the top level, where more global barriers should be inserted first. + :arg verify_only: do not insert barriers, only complain if they are + missing. :arg level: the current level of loop nesting, 0 for outermost. """ result = [] @@ -1331,6 +1374,7 @@ def insert_barriers(kernel, schedule, reverse, kind, level=0): subresult = insert_barriers( kernel, subresult, reverse=sub_reverse, kind=kind, + verify_only=verify_only, level=level+1) # {{{ find barriers in loop body @@ -1402,8 +1446,23 @@ def insert_barriers(kernel, schedule, reverse, kind, level=0): source=dep_src_insn_id, reverse=reverse, var_kind=kind) if dep: - issue_barrier(dep=dep) - break + if verify_only: + from loopy.diagnostic import MissingBarrierError + raise MissingBarrierError( + "Dependency '%s' (for variable '%s') " + "requires synchronization " + "by a %s barrier (add a 'no_sync_with' " + "instruction option to state that no " + "synchronization is needed)" + % ( + dep.dep_descr.format( + tgt=dep.target.id, src=dep.source.id), + dep.variable, + kind)) + + else: + issue_barrier(dep=dep) + break result.append(sched_item) candidates.add(sched_item.insn_id) @@ -1526,6 +1585,8 @@ def generate_loop_schedules(kernel, debug_args={}): debug.stop() gen_sched = filter_nops_from_schedule(kernel, gen_sched) + gen_sched = convert_barrier_instructions_to_barriers( + kernel, gen_sched) gsize, lsize = kernel.get_grid_size_upper_bounds() @@ -1534,12 +1595,12 @@ def generate_loop_schedules(kernel, debug_args={}): logger.info("%s: barrier insertion: global" % kernel.name) gen_sched 
= insert_barriers(kernel, gen_sched, - reverse=False, kind="global") + reverse=False, kind="global", verify_only=True) logger.info("%s: barrier insertion: local" % kernel.name) gen_sched = insert_barriers(kernel, gen_sched, - reverse=False, kind="local") + reverse=False, kind="local", verify_only=False) logger.info("%s: barrier insertion: done" % kernel.name) diff --git a/loopy/statistics.py b/loopy/statistics.py index 6fa4614b9711ce30c44333276970492979554c42..47abfe53a4bfe8598cd09425b5baa81f13525c37 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -32,7 +32,7 @@ from pytools import memoize_in from pymbolic.mapper import CombineMapper from functools import reduce from loopy.kernel.data import MultiAssignmentBase -from loopy.diagnostic import warn, LoopyError +from loopy.diagnostic import warn_with_kernel, LoopyError __doc__ = """ @@ -555,22 +555,19 @@ def count(kernel, set): if not (is_subset and is_superset): if is_subset: - from loopy.diagnostic import warn - warn(kernel, "count_overestimate", + warn_with_kernel(kernel, "count_overestimate", "Barvinok wrappers are not installed. " "Counting routines have overestimated the " "number of integer points in your loop " "domain.") elif is_superset: - from loopy.diagnostic import warn - warn(kernel, "count_underestimate", + warn_with_kernel(kernel, "count_underestimate", "Barvinok wrappers are not installed. " "Counting routines have underestimated the " "number of integer points in your loop " "domain.") else: - from loopy.diagnostic import warn - warn(kernel, "count_misestimate", + warn_with_kernel(kernel, "count_misestimate", "Barvinok wrappers are not installed. 
" "Counting routines have misestimated the " "number of integer points in your loop " @@ -901,7 +898,7 @@ def gather_access_footprints(kernel, ignore_uncountable=False): for insn in kernel.instructions: if not isinstance(insn, MultiAssignmentBase): - warn(kernel, "count_non_assignment", + warn_with_kernel(kernel, "count_non_assignment", "Non-assignment instruction encountered in " "gather_access_footprints, not counted") continue diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index b245c44d33b3a601855f67777601b1397dcd6efa..467bc8ee801a090641c7e7f8d7f2e7c12a921232 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -135,8 +135,7 @@ def check_sizes(kernel, device): from loopy.diagnostic import LoopyAdvisory, LoopyError if device is None: - from loopy.diagnostic import warn - warn(kernel, "no_device_in_pre_codegen_checks", + warn_with_kernel(kernel, "no_device_in_pre_codegen_checks", "No device parameter was passed to the PyOpenCLTarget. " "Perhaps you want to pass a device to benefit from " "additional checking.", LoopyAdvisory) diff --git a/test/test_loopy.py b/test/test_loopy.py index d407b63fb95ba9ebf2ebf1381c91d61ecfe393c3..f723f5fecf6cec1173e017cd822f28ecd68ea5d4 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1030,9 +1030,12 @@ def test_kernel_splitting(ctx_factory): knl = lp.make_kernel( "{ [i]: 0<=i t_private_scalar = a[k,i+1] - <> t_private_array[i % 2] = a[k,i+1] - c[k,i] = a[k,i+1] - out[k,i] = c[k,i] + t_private_scalar + t_private_array[i % 2] - """) + for i, k + ... gbarrier + <> t_private_scalar = a[k,i+1] + <> t_private_array[i % 2] = a[k,i+1] + c[k,i] = a[k,i+1] + ... 
gbarrier + out[k,i] = c[k,i] + t_private_scalar + t_private_array[i % 2] + end + """, seq_dependencies=True) knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) @@ -1147,10 +1158,14 @@ def test_kernel_splitting_with_loop_and_local_temporary(ctx_factory): knl = lp.make_kernel( "{ [i,k]: 0<=i t_local[i % 8,k] = i % 8 - c[k,i] = a[k,i+1] - out[k,i] = c[k,i] + t_local[i % 8,k] - """) + for i, k + ... gbarrier + <> t_local[i % 8,k] = i % 8 + c[k,i] = a[k,i+1] + ... gbarrier + out[k,i] = c[k,i] + t_local[i % 8,k] + end + """, seq_dependencies=True) knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) @@ -1187,9 +1202,12 @@ def test_global_temporary(ctx_factory): knl = lp.make_kernel( "{ [i]: 0<=i c[i] = a[i + 1] - out[i] = c[i] - """) + for i + <> c[i] = a[i + 1] + ... gbarrier + out[i] = c[i] + end + """, seq_dependencies=True) knl = lp.add_and_infer_dtypes(knl, {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32}) @@ -1376,7 +1394,7 @@ def test_sequential_dependencies(ctx_factory): lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(n=5)) -def test_special_instructions(ctx_factory): +def test_nop(ctx_factory): ctx = ctx_factory() knl = lp.make_kernel( @@ -1398,6 +1416,67 @@ def test_special_instructions(ctx_factory): lp.auto_test_vs_ref(knl, ctx, knl, parameters=dict(ntrips=5)) +def test_global_barrier(ctx_factory): + ctx = ctx_factory() + + knl = lp.make_kernel( + "{[i,itrip]: 0<=i z[i] = z[i+1] + z[i] {id=wr_z,dep=top} + <> v[i] = 11 {id=wr_v,dep=top} + ... gbarrier {dep=wr_z:wr_v,id=yoink} + z[i] = z[i] - z[i+1] + v[i] {id=iupd} + end + ... 
gbarrier {dep=iupd,id=postloop} + z[i] = z[i] - z[i+1] + v[i] {dep=postloop} + end + """) + + knl = lp.fix_parameters(knl, ntrips=3) + knl = lp.add_and_infer_dtypes(knl, {"z": np.float64}) + + ref_knl = knl + ref_knl = lp.set_temporary_scope(ref_knl, "z", "global") + ref_knl = lp.set_temporary_scope(ref_knl, "v", "global") + + knl = lp.split_iname(knl, "i", 256, outer_tag="g.0", inner_tag="l.0") + print(knl) + + knl = lp.preprocess_kernel(knl) + assert knl.temporary_variables["z"].scope == lp.temp_var_scope.GLOBAL + assert knl.temporary_variables["v"].scope == lp.temp_var_scope.GLOBAL + + print(knl) + + lp.auto_test_vs_ref(ref_knl, ctx, knl, parameters=dict(ntrips=5, n=10)) + + +def test_missing_global_barrier(): + knl = lp.make_kernel( + "{[i,itrip]: 0<=i z[i] = z[i] - z[i+1] {id=iupd,dep=yoink} + end + # This is where the barrier should be + z[i] = z[i] - z[i+1] + v[i] {dep=iupd} + end + """) + + knl = lp.set_temporary_scope(knl, "z", "global") + knl = lp.split_iname(knl, "i", 256, outer_tag="g.0") + knl = lp.preprocess_kernel(knl) + + from loopy.diagnostic import MissingBarrierError + with pytest.raises(MissingBarrierError): + lp.get_one_scheduled_kernel(knl) + + def test_index_cse(ctx_factory): knl = lp.make_kernel(["{[i,j,k,l,m]:0<=i,j,k,l,m