diff --git a/examples/python/global_barrier_removal.py b/examples/python/global_barrier_removal.py
new file mode 100644
index 0000000000000000000000000000000000000000..ecb45edbf0af51bdcbb1efcbbee8e18e0f3b500f
--- /dev/null
+++ b/examples/python/global_barrier_removal.py
@@ -0,0 +1,28 @@
+import numpy as np
+import loopy as lp
+import pyopencl as cl
+import pyopencl.array
+
+knl = lp.make_kernel(
+        "{ [i,k]: 0<=i<n and 0<=k<3 }",
+        """c[k,i] = a[k, i + 1]
+        out[k,i] = c[k,i]""",
+        ["..."])
+
+# transform
+knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
+from loopy.kernel.tools import add_dtypes
+knl = add_dtypes(knl, {"a": np.float32, "c": np.float32, "out": np.float32, "n": np.int32})
+
+# schedule
+from loopy.preprocess import preprocess_kernel
+knl = preprocess_kernel(knl)
+
+from loopy.schedule import get_one_scheduled_kernel
+knl = get_one_scheduled_kernel(knl)
+print(knl)
+
+# map schedule onto host or device
+from loopy.codegen.device_mapping import map_schedule_onto_host_or_device
+knl = map_schedule_onto_host_or_device(knl)
+print(knl)
diff --git a/loopy/codegen/device_mapping.py b/loopy/codegen/device_mapping.py
new file mode 100644
index 0000000000000000000000000000000000000000..d40b0c4d255eda0159c77fbf56889b0a55e5f347
--- /dev/null
+++ b/loopy/codegen/device_mapping.py
@@ -0,0 +1,177 @@
+"""Map the schedule of a kernel onto host-side and device-side parts."""
+
+from pytools import Record
+
+
+# NOTE: The four record types below are empty placeholders; nothing in this
+# module instantiates them yet.
+class HostForLoop(Record):
+    pass
+
+
+class HostConditional(Record):
+    pass
+
+
+class HostBlock(Record):
+    pass
+
+
+class HostInvokeKernel(Record):
+    pass
+
+
+def map_schedule_onto_host_or_device(kernel):
+    """Wrap the device-side parts of *kernel*'s schedule in kernel calls.
+
+    The schedule is cut at every global barrier. The barrier itself is
+    dropped, and each run of schedule items between two cuts is enclosed in
+    a :class:`loopy.schedule.CallKernel` /
+    :class:`loopy.schedule.ReturnFromKernel` pair carrying a freshly
+    generated kernel name. A loop whose body contains a global barrier is
+    emitted outside of any kernel call, and its body is cut recursively.
+    If the schedule contains no global barrier at all, the entire schedule
+    is enclosed in a single pair named ``kernel.name``.
+
+    :arg kernel: a kernel whose ``schedule`` attribute holds a list of
+        schedule items.
+    :returns: a copy of *kernel* with the rewritten schedule.
+    :raises ValueError: if the kernel has no schedule, if loop entries and
+        exits in the schedule do not match up, or if the schedule contains
+        an item type that this function does not handle.
+    """
+    from pytools import UniqueNameGenerator
+    kernel_name_gen = UniqueNameGenerator(forced_prefix=kernel.name)
+
+    from loopy.schedule import (
+            RunInstruction, EnterLoop, LeaveLoop, Barrier,
+            CallKernel, ReturnFromKernel)
+
+    # TODO: Assert more thoroughly that the kernel has been scheduled, etc.
+    schedule = kernel.schedule
+    if schedule is None:
+        raise ValueError("kernel '%s' has no schedule" % kernel.name)
+
+    # {{{ match loop starts to loop ends
+
+    # Map from the index of each EnterLoop to the index of its LeaveLoop.
+    loop_bounds = {}
+    active_loops = []
+    for idx, sched_item in enumerate(schedule):
+        if isinstance(sched_item, EnterLoop):
+            active_loops.append(idx)
+        elif isinstance(sched_item, LeaveLoop):
+            if not active_loops:
+                raise ValueError(
+                    "schedule item %d leaves a loop that was never entered"
+                    % idx)
+            loop_bounds[active_loops.pop()] = idx
+    if active_loops:
+        raise ValueError(
+            "schedule item %d enters a loop that is never left"
+            % active_loops[-1])
+    del active_loops
+
+    # }}}
+
+    # {{{ kernel call wrapper
+
+    def append_kernel_call(chunk, new_schedule):
+        """Append *chunk*, enclosed in a kernel call, to *new_schedule*.
+
+        Does nothing if *chunk* is empty, so that no empty kernels are
+        generated.
+        """
+        if not chunk:
+            return
+
+        # TODO: Do a better job of naming the kernel
+        new_kernel_name = kernel_name_gen()
+        # TODO: Infer kernel arguments
+        new_schedule.append(CallKernel(kernel_name=new_kernel_name))
+        # TODO: Load state into here
+        new_schedule.extend(chunk)
+        # TODO: Save state right here
+        new_schedule.append(ReturnFromKernel(kernel_name=new_kernel_name))
+
+    # }}}
+
+    # {{{ Inner mapper function
+
+    def inner_mapper(start_idx, end_idx, new_schedule):
+        """Map ``schedule[start_idx:end_idx + 1]`` into *new_schedule*.
+
+        :returns: *True* if the range contained a global barrier, in which
+            case everything appended to *new_schedule* is either enclosed in
+            a kernel call or is a loop around kernel calls. If *False*, the
+            items were appended unwrapped and the caller is responsible for
+            enclosing them.
+        """
+        # XXX: Doesn't do dependency analysis yet....
+        schedule_required_splitting = False
+
+        i = start_idx
+        current_chunk = []
+        while i <= end_idx:
+            sched_item = schedule[i]
+
+            if isinstance(sched_item, RunInstruction):
+                current_chunk.append(sched_item)
+                i += 1
+
+            elif isinstance(sched_item, EnterLoop):
+                loop_end = loop_bounds[i]
+                inner_schedule = []
+                loop_required_splitting = inner_mapper(
+                    i + 1, loop_end - 1, inner_schedule)
+
+                loop_items = (
+                    [sched_item] + inner_schedule + [schedule[loop_end]])
+                i = loop_end + 1
+
+                if loop_required_splitting:
+                    # A barrier inside the loop: close the pending kernel and
+                    # emit the loop itself outside of any kernel call.
+                    schedule_required_splitting = True
+                    append_kernel_call(current_chunk, new_schedule)
+                    new_schedule.extend(loop_items)
+                    current_chunk = []
+                else:
+                    current_chunk.extend(loop_items)
+
+            elif isinstance(sched_item, Barrier):
+                if sched_item.kind == "global":
+                    # Wrap the current chunk into a kernel call. The barrier
+                    # itself is not carried over into the new schedule.
+                    schedule_required_splitting = True
+                    append_kernel_call(current_chunk, new_schedule)
+                    current_chunk = []
+                else:
+                    current_chunk.append(sched_item)
+                i += 1
+
+            else:
+                raise ValueError(
+                    "unexpected schedule item of type '%s' at index %d"
+                    % (type(sched_item).__name__, i))
+
+        if schedule_required_splitting:
+            # Wrap remainder of schedule into a kernel call.
+            append_kernel_call(current_chunk, new_schedule)
+        else:
+            new_schedule.extend(current_chunk)
+
+        return schedule_required_splitting
+
+    # }}}
+
+    new_schedule = []
+    split_kernel = inner_mapper(0, len(schedule) - 1, new_schedule)
+    if not split_kernel:
+        # Wrap everything into a kernel call.
+        new_schedule = (
+            [CallKernel(kernel_name=kernel.name)] +
+            new_schedule +
+            [ReturnFromKernel(kernel_name=kernel.name)])
+    new_kernel = kernel.copy(schedule=new_schedule)
+    return new_kernel
diff --git a/loopy/schedule.py b/loopy/schedule.py
index 8094995d7972c2d203c0d3148419aaba252fbc77..7ac8778378869f7c9cf6cfbe8d4be145e9fad520 100644
--- a/loopy/schedule.py
+++ b/loopy/schedule.py
@@ -62,6 +62,14 @@ class RunInstruction(ScheduleItem):
     hash_fields = __slots__ = ["insn_id"]
 
 
+class CallKernel(ScheduleItem):
+    hash_fields = __slots__ = ["kernel_name"]
+
+
+class ReturnFromKernel(ScheduleItem):
+    hash_fields = __slots__ = ["kernel_name"]
+
+
 class Barrier(ScheduleItem):
     """
     .. attribute:: comment
@@ -363,6 +371,12 @@ def dump_schedule(kernel, schedule):
         elif isinstance(sched_item, LeaveLoop):
             indent = indent[:-4]
             lines.append(indent + "ENDLOOP %s" % sched_item.iname)
+        elif isinstance(sched_item, CallKernel):
+            lines.append(indent + "CALL KERNEL %s" % sched_item.kernel_name)
+            indent += "    "
+        elif isinstance(sched_item, ReturnFromKernel):
+            indent = indent[:-4]
+            lines.append(indent + "RETURN FROM KERNEL %s" % sched_item.kernel_name)
         elif isinstance(sched_item, RunInstruction):
             insn = kernel.id_to_insn[sched_item.insn_id]
             if isinstance(insn, MultiAssignmentBase):
@@ -1451,12 +1465,14 @@ def generate_loop_schedules(kernel, debug_args={}):
                 gen_sched = insert_barriers(kernel, gen_sched,
                         reverse=False, kind="global")
 
+                """
                 for sched_item in gen_sched:
                     if (
                             isinstance(sched_item, Barrier)
                             and sched_item.kind == "global"):
                         raise LoopyError("kernel requires a global barrier %s"
                                 % sched_item.comment)
+                """
 
                 logger.info("%s: barrier insertion: local" % kernel.name)
 