diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 942c7d56e01f9d037b0e2b601f88bc8b96dda151..b983312645062cd6a984201b9efda294fe0e2a74 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1228,18 +1228,18 @@ put those instructions into the schedule. --------------------------------------------------------------------------- TEMPORARIES: tmp: type: np:dtype('int32'), shape: () scope:private - tmp_save_slot: type: np:dtype('int32'), shape: (n // 16, 16), dim_tags: (N1:stride:16, N0:stride:1) scope:global + tmp__save_slot: type: np:dtype('int32'), shape: (n // 16, 16), dim_tags: (N1:stride:16, N0:stride:1) scope:global --------------------------------------------------------------------------- ... --------------------------------------------------------------------------- SCHEDULE: - 0: CALL KERNEL rotate_v2(extra_args=['tmp_save_slot'], extra_inames=[]) + 0: CALL KERNEL rotate_v2(extra_args=['tmp__save_slot'], extra_inames=[]) 1: [maketmp] tmp <- arr[i_inner + i_outer*16] - 2: [tmp.save] tmp_save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] <- tmp + 2: [tmp.save] tmp__save_slot[tmp_save_hw_dim_0_rotate_v2, tmp_save_hw_dim_1_rotate_v2] <- tmp 3: RETURN FROM KERNEL rotate_v2 4: ---BARRIER:global--- - 5: CALL KERNEL rotate_v2_0(extra_args=['tmp_save_slot'], extra_inames=[]) - 6: [tmp.reload] tmp <- tmp_save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] + 5: CALL KERNEL rotate_v2_0(extra_args=['tmp__save_slot'], extra_inames=[]) + 6: [tmp.reload] tmp <- tmp__save_slot[tmp_reload_hw_dim_0_rotate_v2_0, tmp_reload_hw_dim_1_rotate_v2_0] 7: [rotate] arr[((1 + i_inner + i_outer*16) % n)] <- tmp 8: RETURN FROM KERNEL rotate_v2_0 --------------------------------------------------------------------------- @@ -1264,19 +1264,19 @@ The kernel translates into two OpenCL kernels. 
#define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot) + __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp__save_slot) { int tmp; tmp = arr[16 * gid(0) + lid(0)]; - tmp_save_slot[16 * gid(0) + lid(0)] = tmp; + tmp__save_slot[16 * gid(0) + lid(0)] = tmp; } - __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot) + __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp__save_slot) { int tmp; - tmp = tmp_save_slot[16 * gid(0) + lid(0)]; + tmp = tmp__save_slot[16 * gid(0) + lid(0)]; arr[((1 + lid(0) + gid(0) * 16) % n)] = tmp; } diff --git a/loopy/__init__.py b/loopy/__init__.py index 6cbb3362ef91b27c3b7b1cf6a591f7f9a20c2f7a..206cf49ff71573fb0d33cfffbc029f7b54a0e9f8 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -75,7 +75,8 @@ from loopy.transform.instruction import ( set_instruction_priority, add_dependency, remove_instructions, replace_instruction_ids, - tag_instructions) + tag_instructions, + add_nosync) from loopy.transform.data import ( add_prefetch, change_arg_to_image, @@ -108,6 +109,8 @@ from loopy.transform.batch import to_batched from loopy.transform.parameter import assume, fix_parameters from loopy.transform.save import save_and_reload_temporaries +from loopy.transform.reduction import make_two_level_reduction + # }}} from loopy.type_inference import infer_unknown_types @@ -189,6 +192,7 @@ __all__ = [ "remove_instructions", "replace_instruction_ids", "tag_instructions", + "add_nosync", "extract_subst", "expand_subst", "assignment_to_subst", "find_rules_matching", 
"find_one_rule_matching", @@ -207,6 +211,8 @@ __all__ = [ "assume", "fix_parameters", + "make_two_level_reduction", + "save_and_reload_temporaries", # }}} diff --git a/loopy/check.py b/loopy/check.py index 6a1e3dc33a33b826ad54c42a549b35ad275d9fe5..68ca4a2b3082a1427393f6c6243a8a0e9cf4fd88 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -505,22 +505,22 @@ def check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel): # {{{ check that temporaries are defined in subkernels where used def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): - from loopy.schedule.tools import InstructionQuery from loopy.kernel.data import temp_var_scope - insn_query = InstructionQuery(kernel) - - for subkernel in insn_query.subkernels(): + for subkernel in kernel.subkernels: defined_base_storage = set() - for temporary in insn_query.temporaries_written_in_subkernel(subkernel): + from loopy.schedule.tools import ( + temporaries_written_in_subkernel, temporaries_read_in_subkernel) + + for temporary in temporaries_written_in_subkernel(kernel, subkernel): tval = kernel.temporary_variables[temporary] if tval.base_storage is not None: defined_base_storage.add(tval.base_storage) for temporary in ( - insn_query.temporaries_read_in_subkernel(subkernel) - - insn_query.temporaries_written_in_subkernel(subkernel)): + temporaries_read_in_subkernel(kernel, subkernel) - + temporaries_written_in_subkernel(kernel, subkernel)): tval = kernel.temporary_variables[temporary] if tval.initializer is not None: @@ -530,16 +530,17 @@ def check_that_temporaries_are_defined_in_subkernels_where_used(kernel): if tval.base_storage is not None: if tval.base_storage not in defined_base_storage: from loopy.diagnostic import MissingDefinitionError - raise MissingDefinitionError("temporary variable '%s' gets used " - "in subkernel '%s' and neither it nor its aliases have a " - "definition" % (temporary, subkernel)) + raise MissingDefinitionError("temporary variable '%s' gets " + "used in subkernel 
'%s' and neither it nor its " + "aliases have a definition" % (temporary, subkernel)) continue if tval.scope in (temp_var_scope.PRIVATE, temp_var_scope.LOCAL): from loopy.diagnostic import MissingDefinitionError - raise MissingDefinitionError("temporary variable '%s' gets used in " - "subkernel '%s' without a definition (maybe you forgot to call " - "loopy.save_and_reload_temporaries?)" % (temporary, subkernel)) + raise MissingDefinitionError("temporary variable '%s' gets used " + "in subkernel '%s' without a definition (maybe you forgot " + "to call loopy.save_and_reload_temporaries?)" + % (temporary, subkernel)) # }}} diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py index 15ab8a1ee13df440926e51e676223bc6a398df57..512e4ac8619f33856d0a8ed929de0b574f7da014 100644 --- a/loopy/diagnostic.py +++ b/loopy/diagnostic.py @@ -103,6 +103,10 @@ class MissingDefinitionError(LoopyError): class UnscheduledInstructionError(LoopyError): pass + +class ReductionIsNotTriangularError(LoopyError): + pass + # }}} diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index 602830de38e457c5ff4a55d7685dc346a7b4de35..0b3068a468ca1ff0e2ab97ae4a8b28dc63e7b8d4 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -29,6 +29,7 @@ from six.moves import range, zip from loopy.diagnostic import StaticValueFindingError +import contextlib import islpy as isl from islpy import dim_type @@ -60,6 +61,14 @@ def dump_space(ls): for dt in dim_type.names) +@contextlib.contextmanager +def no_stderr_output_from_isl(ctx): + prev_on_error = ctx.get_on_error() + ctx.set_on_error(isl.on_error.CONTINUE) + yield + ctx.set_on_error(prev_on_error) + + # {{{ make_slab def make_slab(space, iname, start, stop): diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 793d31791a3295ef1d7c03132f43489ab828f089..cc3414489635243a3f77fcb769572fa8773921cc 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -786,6 +786,8 @@ class LoopKernel(ImmutableRecordWithoutPickling): for 
var_name in insn.read_dependency_names() & admissible_vars: result.setdefault(var_name, set()).add(insn.id) + return result + @memoize_method def writer_map(self): """ @@ -823,6 +825,138 @@ class LoopKernel(ImmutableRecordWithoutPickling): return result + @property + @memoize_method + def global_barrier_order(self): + """Return a :class:`tuple` of the listing the ids of global barrier instructions + as they appear in order in the kernel. + + See also :class:`loopy.instruction.BarrierInstruction`. + """ + barriers = [] + visiting = set() + visited = set() + + unvisited = set(insn.id for insn in self.instructions) + + while unvisited: + stack = [unvisited.pop()] + + while stack: + top = stack[-1] + + if top in visiting: + visiting.remove(top) + + from loopy.kernel.instruction import BarrierInstruction + insn = self.id_to_insn[top] + if isinstance(insn, BarrierInstruction): + if insn.kind == "global": + barriers.append(top) + + if top in visited: + stack.pop() + continue + + visited.add(top) + visiting.add(top) + + for child in self.id_to_insn[top].depends_on: + # Check for no cycles. + assert child not in visiting + stack.append(child) + + # Ensure this is the only possible order. + for prev_barrier, barrier in zip(barriers, barriers[1:]): + if prev_barrier not in self.recursive_insn_dep_map()[barrier]: + raise LoopyError( + "Unordered global barriers detected: '%s', '%s'" + % (barrier, prev_barrier)) + + return tuple(barriers) + + @memoize_method + def find_most_recent_global_barrier(self, insn_id): + """Return the id of the latest occuring global barrier which the + given instruction (indirectly or directly) depends on, or *None* if this + instruction does not depend on a global barrier. + + The return value is guaranteed to be unique because global barriers are + totally ordered within the kernel. 
+ """ + + if len(self.global_barrier_order) == 0: + return None + + insn = self.id_to_insn[insn_id] + + if len(insn.depends_on) == 0: + return None + + def is_barrier(my_insn_id): + insn = self.id_to_insn[my_insn_id] + from loopy.kernel.instruction import BarrierInstruction + return isinstance(insn, BarrierInstruction) and insn.kind == "global" + + global_barrier_to_ordinal = dict( + (b, i) for i, b in enumerate(self.global_barrier_order)) + + def get_barrier_ordinal(barrier_id): + return (global_barrier_to_ordinal[barrier_id] + if barrier_id is not None + else -1) + + direct_barrier_dependencies = set( + dep for dep in insn.depends_on if is_barrier(dep)) + + if len(direct_barrier_dependencies) > 0: + return max(direct_barrier_dependencies, key=get_barrier_ordinal) + else: + return max((self.find_most_recent_global_barrier(dep) + for dep in insn.depends_on), + key=get_barrier_ordinal) + + @property + @memoize_method + def subkernels(self): + if self.state != kernel_state.SCHEDULED: + raise LoopyError("Kernel must be scheduled") + + from loopy.schedule import CallKernel + + return tuple(sched_item.kernel_name + for sched_item in self.schedule + if isinstance(sched_item, CallKernel)) + + @property + @memoize_method + def subkernel_to_insn_ids(self): + if self.state != kernel_state.SCHEDULED: + raise LoopyError("Kernel must be scheduled") + + from loopy.schedule import ( + sched_item_to_insn_id, CallKernel, ReturnFromKernel) + + subkernel = None + result = {} + + for sched_item in self.schedule: + if isinstance(sched_item, CallKernel): + subkernel = sched_item.kernel_name + result[subkernel] = set() + + if isinstance(sched_item, ReturnFromKernel): + subkernel = None + + if subkernel is not None: + for insn_id in sched_item_to_insn_id(sched_item): + result[subkernel].add(insn_id) + + for subkernel in result: + result[subkernel] = frozenset(result[subkernel]) + + return result + # }}} # {{{ argument wrangling @@ -968,7 +1102,8 @@ class 
LoopKernel(ImmutableRecordWithoutPickling): try: # insist block size is constant size = static_max_of_pw_aff(size, - constants_only=isinstance(tag, LocalIndexTag)) + constants_only=isinstance(tag, LocalIndexTag), + context=self.assumptions) except ValueError: pass diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py index fdd8f1d3764ec03ca40a8338dc512b8cd2ae38cf..fc10258619a82f1637ce738d5b9d6c8bec7a7a2b 100644 --- a/loopy/kernel/instruction.py +++ b/loopy/kernel/instruction.py @@ -658,7 +658,8 @@ class MultiAssignmentBase(InstructionBase): @memoize_method def reduction_inames(self): def map_reduction(expr, rec): - rec(expr.expr) + for sub_expr in expr.exprs: + rec(sub_expr) for iname in expr.inames: result.add(iname) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 539bfbed06572b07491c215770a0330963764d1d..d94136e4359009637b0c5ab89ba8009f3f810956 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1354,4 +1354,5 @@ def draw_dependencies_as_unicode_arrows( # }}} + # vim: foldmethod=marker diff --git a/loopy/library/reduction.py b/loopy/library/reduction.py index d24b61c12e43cd16431c9727d8fb057319475633..b6dbc4b43135926778eadce0af049cbb0542458a 100644 --- a/loopy/library/reduction.py +++ b/loopy/library/reduction.py @@ -36,15 +36,19 @@ class ReductionOperation(object): equality-comparable. 
""" - def result_dtypes(self, target, arg_dtype, inames): + def result_dtypes(self, target, *arg_dtypes): """ - :arg arg_dtype: may be None if not known + :arg arg_dtypes: may be None if not known :returns: None if not known, otherwise the returned type """ raise NotImplementedError - def neutral_element(self, dtype, inames): + @property + def arg_count(self): + raise NotImplementedError + + def neutral_element(self, *dtypes): raise NotImplementedError def __hash__(self): @@ -55,7 +59,7 @@ class ReductionOperation(object): # Force subclasses to override raise NotImplementedError - def __call__(self, dtype, operand1, operand2, inames): + def __call__(self, dtype, operand1, operand2): raise NotImplementedError def __ne__(self, other): @@ -87,7 +91,11 @@ class ScalarReductionOperation(ReductionOperation): """ self.forced_result_type = forced_result_type - def result_dtypes(self, kernel, arg_dtype, inames): + @property + def arg_count(self): + return 1 + + def result_dtypes(self, kernel, arg_dtype): if self.forced_result_type is not None: return (self.parse_result_type( kernel.target, self.forced_result_type),) @@ -114,18 +122,18 @@ class ScalarReductionOperation(ReductionOperation): class SumReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype, inames): + def neutral_element(self, dtype): return 0 - def __call__(self, dtype, operand1, operand2, inames): + def __call__(self, dtype, operand1, operand2): return operand1 + operand2 class ProductReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype, inames): + def neutral_element(self, dtype): return 1 - def __call__(self, dtype, operand1, operand2, inames): + def __call__(self, dtype, operand1, operand2): return operand1 * operand2 @@ -166,32 +174,144 @@ def get_ge_neutral(dtype): class MaxReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype, inames): + def neutral_element(self, dtype): return get_ge_neutral(dtype) - def __call__(self, dtype, 
operand1, operand2, inames): + def __call__(self, dtype, operand1, operand2): return var("max")(operand1, operand2) class MinReductionOperation(ScalarReductionOperation): - def neutral_element(self, dtype, inames): + def neutral_element(self, dtype): return get_le_neutral(dtype) - def __call__(self, dtype, operand1, operand2, inames): + def __call__(self, dtype, operand1, operand2): return var("min")(operand1, operand2) +# {{{ segmented reduction + +class _SegmentedScalarReductionOperation(ReductionOperation): + def __init__(self, **kwargs): + self.inner_reduction = self.base_reduction_class(**kwargs) + + @property + def arg_count(self): + return 2 + + def prefix(self, scalar_dtype, segment_flag_dtype): + return "loopy_segmented_%s_%s_%s" % (self.which, + scalar_dtype.numpy_dtype.type.__name__, + segment_flag_dtype.numpy_dtype.type.__name__) + + def neutral_element(self, scalar_dtype, segment_flag_dtype): + return SegmentedFunction(self, (scalar_dtype, segment_flag_dtype), "init")() + + def result_dtypes(self, kernel, scalar_dtype, segment_flag_dtype): + return (self.inner_reduction.result_dtypes(kernel, scalar_dtype) + + (segment_flag_dtype,)) + + def __str__(self): + return "segmented_" + self.which + + def __hash__(self): + return hash(type(self)) + + def __eq__(self, other): + return type(self) == type(other) + + def __call__(self, dtypes, operand1, operand2): + return SegmentedFunction(self, dtypes, "update")(*(operand1 + operand2)) + + +class SegmentedSumReductionOperation(_SegmentedScalarReductionOperation): + base_reduction_class = SumReductionOperation + which = "sum" + op = "((%s) + (%s))" + + +class SegmentedProductReductionOperation(_SegmentedScalarReductionOperation): + base_reduction_class = ProductReductionOperation + op = "((%s) * (%s))" + which = "product" + + +class SegmentedFunction(FunctionIdentifier): + init_arg_names = ("reduction_op", "dtypes", "name") + + def __init__(self, reduction_op, dtypes, name): + """ + :arg dtypes: A :class:`tuple` 
of `(scalar_dtype, segment_flag_dtype)` + """ + self.reduction_op = reduction_op + self.dtypes = dtypes + self.name = name + + @property + def scalar_dtype(self): + return self.dtypes[0] + + @property + def segment_flag_dtype(self): + return self.dtypes[1] + + def __getinitargs__(self): + return (self.reduction_op, self.dtypes, self.name) + + +def get_segmented_function_preamble(kernel, func_id): + op = func_id.reduction_op + prefix = op.prefix(func_id.scalar_dtype, func_id.segment_flag_dtype) + + from pymbolic.mapper.c_code import CCodeMapper + + c_code_mapper = CCodeMapper() + + return (prefix, """ + inline %(scalar_t)s %(prefix)s_init(%(segment_flag_t)s *segment_flag_out) + { + *segment_flag_out = 0; + return %(neutral)s; + } + + inline %(scalar_t)s %(prefix)s_update( + %(scalar_t)s op1, %(segment_flag_t)s segment_flag1, + %(scalar_t)s op2, %(segment_flag_t)s segment_flag2, + %(segment_flag_t)s *segment_flag_out) + { + *segment_flag_out = segment_flag2; + return segment_flag2 ? op2 : %(combined)s; + } + """ % dict( + scalar_t=kernel.target.dtype_to_typename(func_id.scalar_dtype), + prefix=prefix, + segment_flag_t=kernel.target.dtype_to_typename( + func_id.segment_flag_dtype), + neutral=c_code_mapper( + op.inner_reduction.neutral_element(func_id.scalar_dtype)), + combined=op.op % ("op1", "op2"), + )) + + +# }}} + + # {{{ argmin/argmax class _ArgExtremumReductionOperation(ReductionOperation): - def prefix(self, dtype): - return "loopy_arg%s_%s" % (self.which, dtype.numpy_dtype.type.__name__) + def prefix(self, scalar_dtype, index_dtype): + return "loopy_arg%s_%s_%s" % (self.which, + index_dtype.numpy_dtype.type.__name__, + scalar_dtype.numpy_dtype.type.__name__) - def result_dtypes(self, kernel, dtype, inames): - return (dtype, kernel.index_dtype) + def result_dtypes(self, kernel, scalar_dtype, index_dtype): + return (scalar_dtype, index_dtype) - def neutral_element(self, dtype, inames): - return ArgExtFunction(self, dtype, "init", inames)() + def 
neutral_element(self, scalar_dtype, index_dtype): + return ArgExtFunction(self, (scalar_dtype, index_dtype), "init")() + + def __str__(self): + return self.which def __hash__(self): return hash(type(self)) @@ -199,11 +319,12 @@ class _ArgExtremumReductionOperation(ReductionOperation): def __eq__(self, other): return type(self) == type(other) - def __call__(self, dtype, operand1, operand2, inames): - iname, = inames + @property + def arg_count(self): + return 2 - return ArgExtFunction(self, dtype, "update", inames)( - *(operand1 + (operand2, var(iname)))) + def __call__(self, dtypes, operand1, operand2): + return ArgExtFunction(self, dtypes, "update")(*(operand1 + operand2)) class ArgMaxReductionOperation(_ArgExtremumReductionOperation): @@ -219,21 +340,28 @@ class ArgMinReductionOperation(_ArgExtremumReductionOperation): class ArgExtFunction(FunctionIdentifier): - init_arg_names = ("reduction_op", "scalar_dtype", "name", "inames") + init_arg_names = ("reduction_op", "dtypes", "name") - def __init__(self, reduction_op, scalar_dtype, name, inames): + def __init__(self, reduction_op, dtypes, name): self.reduction_op = reduction_op - self.scalar_dtype = scalar_dtype + self.dtypes = dtypes self.name = name - self.inames = inames + + @property + def scalar_dtype(self): + return self.dtypes[0] + + @property + def index_dtype(self): + return self.dtypes[1] def __getinitargs__(self): - return (self.reduction_op, self.scalar_dtype, self.name, self.inames) + return (self.reduction_op, self.dtypes, self.name) def get_argext_preamble(kernel, func_id): op = func_id.reduction_op - prefix = op.prefix(func_id.scalar_dtype) + prefix = op.prefix(func_id.scalar_dtype, func_id.index_dtype) from pymbolic.mapper.c_code import CCodeMapper @@ -267,7 +395,7 @@ def get_argext_preamble(kernel, func_id): """ % dict( scalar_t=kernel.target.dtype_to_typename(func_id.scalar_dtype), prefix=prefix, - index_t=kernel.target.dtype_to_typename(kernel.index_dtype), + 
index_t=kernel.target.dtype_to_typename(func_id.index_dtype), neutral=c_code_mapper(neutral(func_id.scalar_dtype)), comp=op.update_comparison, )) @@ -284,6 +412,8 @@ _REDUCTION_OPS = { "min": MinReductionOperation, "argmax": ArgMaxReductionOperation, "argmin": ArgMinReductionOperation, + "segmented_sum": SegmentedSumReductionOperation, + "segmented_product": SegmentedProductReductionOperation, } _REDUCTION_OP_PARSERS = [ @@ -333,9 +463,10 @@ def reduction_function_mangler(kernel, func_id, arg_dtypes): from loopy.kernel.data import CallMangleInfo return CallMangleInfo( - target_name="%s_init" % op.prefix(func_id.scalar_dtype), + target_name="%s_init" % op.prefix( + func_id.scalar_dtype, func_id.index_dtype), result_dtypes=op.result_dtypes( - kernel, func_id.scalar_dtype, func_id.inames), + kernel, func_id.scalar_dtype, func_id.index_dtype), arg_dtypes=(), ) @@ -348,9 +479,10 @@ def reduction_function_mangler(kernel, func_id, arg_dtypes): from loopy.kernel.data import CallMangleInfo return CallMangleInfo( - target_name="%s_update" % op.prefix(func_id.scalar_dtype), + target_name="%s_update" % op.prefix( + func_id.scalar_dtype, func_id.index_dtype), result_dtypes=op.result_dtypes( - kernel, func_id.scalar_dtype, func_id.inames), + kernel, func_id.scalar_dtype, func_id.index_dtype), arg_dtypes=( func_id.scalar_dtype, kernel.index_dtype, @@ -358,6 +490,42 @@ def reduction_function_mangler(kernel, func_id, arg_dtypes): kernel.index_dtype), ) + elif isinstance(func_id, SegmentedFunction) and func_id.name == "init": + from loopy.target.opencl import OpenCLTarget + if not isinstance(kernel.target, OpenCLTarget): + raise LoopyError("only OpenCL supported for now") + + op = func_id.reduction_op + + from loopy.kernel.data import CallMangleInfo + return CallMangleInfo( + target_name="%s_init" % op.prefix( + func_id.scalar_dtype, func_id.segment_flag_dtype), + result_dtypes=op.result_dtypes( + kernel, func_id.scalar_dtype, func_id.segment_flag_dtype), + arg_dtypes=(), + ) + + 
elif isinstance(func_id, SegmentedFunction) and func_id.name == "update": + from loopy.target.opencl import OpenCLTarget + if not isinstance(kernel.target, OpenCLTarget): + raise LoopyError("only OpenCL supported for now") + + op = func_id.reduction_op + + from loopy.kernel.data import CallMangleInfo + return CallMangleInfo( + target_name="%s_update" % op.prefix( + func_id.scalar_dtype, func_id.segment_flag_dtype), + result_dtypes=op.result_dtypes( + kernel, func_id.scalar_dtype, func_id.segment_flag_dtype), + arg_dtypes=( + func_id.scalar_dtype, + func_id.segment_flag_dtype, + func_id.scalar_dtype, + func_id.segment_flag_dtype), + ) + return None @@ -371,4 +539,10 @@ def reduction_preamble_generator(preamble_info): yield get_argext_preamble(preamble_info.kernel, func.name) + elif isinstance(func.name, SegmentedFunction): + if not isinstance(preamble_info.kernel.target, OpenCLTarget): + raise LoopyError("only OpenCL supported for now") + + yield get_segmented_function_preamble(preamble_info.kernel, func.name) + # vim: fdm=marker diff --git a/loopy/preprocess.py b/loopy/preprocess.py index 2b6d97c38a12b47e5b4653297c18b24c40ed938b..c2aaf9cd1e7074a5e92a064096dd6042a78e8e52 100644 --- a/loopy/preprocess.py +++ b/loopy/preprocess.py @@ -30,6 +30,7 @@ from loopy.diagnostic import ( import islpy as isl +from pytools import memoize from pytools.persistent_dict import PersistentDict from loopy.tools import LoopyKeyBuilder @@ -97,7 +98,9 @@ def check_reduction_iname_uniqueness(kernel): iname_to_nonsimultaneous_reduction_count = {} def map_reduction(expr, rec): - rec(expr.expr) + for sub_expr in expr.exprs: + rec(sub_expr) + for iname in expr.inames: iname_to_reduction_count[iname] = ( iname_to_reduction_count.get(iname, 0) + 1) @@ -272,7 +275,663 @@ def find_temporary_scope(kernel): # {{{ rewrite reduction to imperative form -def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): + +# {{{ utils (not stateful) + +from collections import namedtuple + + 
+_InameClassification = namedtuple("_InameClassifiction", + "sequential, local_parallel, nonlocal_parallel") + + +def _classify_reduction_inames(kernel, inames): + sequential = [] + local_par = [] + nonlocal_par = [] + + from loopy.kernel.data import ( + LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag, + ParallelTag) + + for iname in inames: + iname_tag = kernel.iname_to_tag.get(iname) + + if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)): + # These are nominally parallel, but we can live with + # them as sequential. + sequential.append(iname) + + elif isinstance(iname_tag, LocalIndexTagBase): + local_par.append(iname) + + elif isinstance(iname_tag, (ParallelTag, VectorizeTag)): + nonlocal_par.append(iname) + + else: + sequential.append(iname) + + return _InameClassification( + tuple(sequential), tuple(local_par), tuple(nonlocal_par)) + + +def _add_params_to_domain(domain, param_names): + dim_type = isl.dim_type + nparams_orig = domain.dim(dim_type.param) + domain = domain.add_dims(dim_type.param, len(param_names)) + + for param_idx, param_name in enumerate(param_names): + domain = domain.set_dim_name( + dim_type.param, param_idx + nparams_orig, param_name) + + return domain + + +def _move_set_to_param_dims_except(domain, except_dims): + dim_type = isl.dim_type + + iname_idx = 0 + for iname in domain.get_var_names(dim_type.set): + if iname not in except_dims: + domain = domain.move_dims( + dim_type.param, 0, + dim_type.set, iname_idx, 1) + iname_idx -= 1 + iname_idx += 1 + + return domain + + +def _domain_depends_on_given_set_dims(domain, set_dim_names): + set_dim_names = frozenset(set_dim_names) + + return any( + set_dim_names & set(constr.get_coefficients_by_name()) + for constr in domain.get_constraints()) + + +def _check_reduction_is_triangular(kernel, expr, scan_param): + """Check whether the reduction within `expr` with scan parameters described by + the structure `scan_param` is triangular. 
This attempts to verify that the + domain for the scan and sweep inames is as follows: + + [params] -> { + [other inames..., scan_iname, sweep_iname]: + (sweep_min_value + <= sweep_iname + <= sweep_max_value) + and + (scan_min_value + <= scan_iname + <= stride * (sweep_iname - sweep_min_value) + scan_min_value) + and + (irrelevant constraints) + } + """ + + orig_domain = kernel.get_inames_domain( + frozenset((scan_param.sweep_iname, scan_param.scan_iname))) + + sweep_iname = scan_param.sweep_iname + scan_iname = scan_param.scan_iname + affs = isl.affs_from_space(orig_domain.space) + + sweep_lower_bound = isl.align_spaces( + scan_param.sweep_lower_bound, + affs[0], + across_dim_types=True) + + sweep_upper_bound = isl.align_spaces( + scan_param.sweep_upper_bound, + affs[0], + across_dim_types=True) + + scan_lower_bound = isl.align_spaces( + scan_param.scan_lower_bound, + affs[0], + across_dim_types=True) + + from itertools import product + + for (sweep_lb_domain, sweep_lb_aff), \ + (sweep_ub_domain, sweep_ub_aff), \ + (scan_lb_domain, scan_lb_aff) in \ + product(sweep_lower_bound.get_pieces(), + sweep_upper_bound.get_pieces(), + scan_lower_bound.get_pieces()): + + # Assumptions inherited from the domains of the pwaffs + assumptions = sweep_lb_domain & sweep_ub_domain & scan_lb_domain + + # Sweep iname constraints + hyp_domain = affs[sweep_iname].ge_set(sweep_lb_aff) + hyp_domain &= affs[sweep_iname].le_set(sweep_ub_aff) + + # Scan iname constraints + hyp_domain &= affs[scan_iname].ge_set(scan_lb_aff) + hyp_domain &= affs[scan_iname].le_set( + scan_param.stride * (affs[sweep_iname] - sweep_lb_aff) + + scan_lb_aff) + + hyp_domain, = (hyp_domain & assumptions).get_basic_sets() + test_domain, = (orig_domain & assumptions).get_basic_sets() + + hyp_gist_against_test = hyp_domain.gist(test_domain) + if _domain_depends_on_given_set_dims(hyp_gist_against_test, + (sweep_iname, scan_iname)): + return False, ( + "gist of hypothesis against test domain " + "has sweep or scan 
dependent constraints: '%s'" + % hyp_gist_against_test) + + test_gist_against_hyp = test_domain.gist(hyp_domain) + if _domain_depends_on_given_set_dims(test_gist_against_hyp, + (sweep_iname, scan_iname)): + return False, ( + "gist of test against hypothesis domain " + "has sweep or scan dependent constraint: '%s'" + % test_gist_against_hyp) + + return True, "ok" + + +_ScanCandidateParameters = namedtuple( + "_ScanCandidateParameters", + "sweep_iname, scan_iname, sweep_lower_bound, " + "sweep_upper_bound, scan_lower_bound, stride") + + +def _try_infer_scan_candidate_from_expr( + kernel, expr, within_inames, sweep_iname=None): + """Analyze `expr` and determine if it can be implemented as a scan. + """ + from loopy.symbolic import Reduction + assert isinstance(expr, Reduction) + + if len(expr.inames) != 1: + raise ValueError( + "Multiple inames in reduction: '%s'" % (", ".join(expr.inames),)) + + scan_iname, = expr.inames + + from loopy.kernel.tools import DomainChanger + dchg = DomainChanger(kernel, (scan_iname,)) + domain = dchg.get_original_domain() + + if sweep_iname is None: + try: + sweep_iname = _try_infer_sweep_iname( + domain, scan_iname, kernel.all_inames()) + except ValueError as v: + raise ValueError( + "Couldn't determine a sweep iname for the scan " + "expression '%s': %s" % (expr, v)) + + try: + sweep_lower_bound, sweep_upper_bound, scan_lower_bound = ( + _try_infer_scan_and_sweep_bounds( + kernel, scan_iname, sweep_iname, within_inames)) + except ValueError as v: + raise ValueError( + "Couldn't determine bounds for the scan with expression '%s' " + "(sweep iname: '%s', scan iname: '%s'): %s" + % (expr, sweep_iname, scan_iname, v)) + + try: + stride = _try_infer_scan_stride( + kernel, scan_iname, sweep_iname, sweep_lower_bound) + except ValueError as v: + raise ValueError( + "Couldn't determine a scan stride for the scan with expression '%s' " + "(sweep iname: '%s', scan iname: '%s'): %s" + % (expr, sweep_iname, scan_iname, v)) + + return 
_ScanCandidateParameters(sweep_iname, scan_iname, sweep_lower_bound, + sweep_upper_bound, scan_lower_bound, stride) + + +def _try_infer_sweep_iname(domain, scan_iname, candidate_inames): + """The sweep iname is the outer iname which guides the scan. + + E.g. for a domain of {[i,j]: 0<=i 1: + raise ValueError( + "More than one sweep iname candidate for scan iname '%s' found " + "(via constraint '%s')" % (scan_iname, constr)) + + next_candidate = candidate_vars.pop() + + if sweep_iname_candidate is None: + sweep_iname_candidate = next_candidate + defining_constraint = constr + else: + # Check next_candidate consistency + if sweep_iname_candidate != next_candidate: + raise ValueError( + "More than one sweep iname candidate for scan iname '%s' " + "found (via constraints '%s', '%s')" % + (scan_iname, defining_constraint, constr)) + + if sweep_iname_candidate is None: + raise ValueError( + "Couldn't find any sweep iname candidates for " + "scan iname '%s'" % scan_iname) + + return sweep_iname_candidate + + +def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname, within_inames): + domain = kernel.get_inames_domain(frozenset((sweep_iname, scan_iname))) + domain = _move_set_to_param_dims_except(domain, (sweep_iname, scan_iname)) + + var_dict = domain.get_var_dict() + sweep_idx = var_dict[sweep_iname][1] + scan_idx = var_dict[scan_iname][1] + + domain = domain.project_out_except( + within_inames | kernel.non_iname_variable_names(), (isl.dim_type.param,)) + + try: + from loopy.isl_helpers import no_stderr_output_from_isl + with no_stderr_output_from_isl(domain.get_ctx()): + sweep_lower_bound = domain.dim_min(sweep_idx) + sweep_upper_bound = domain.dim_max(sweep_idx) + scan_lower_bound = domain.dim_min(scan_idx) + except isl.Error as e: + raise ValueError("isl error: %s" % e) + + return (sweep_lower_bound, sweep_upper_bound, scan_lower_bound) + + +def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound): + """The stride is the number of 
steps the scan iname takes per iteration + of the sweep iname. This is allowed to be an integer constant. + + E.g. for a domain of {[i,j]: 0<=i 1: + raise ValueError("range in multiple pieces: %s" % scan_iname_range) + elif len(scan_iname_pieces) == 0: + raise ValueError("empty range found for iname '%s'" % scan_iname) + + scan_iname_constr, scan_iname_aff = scan_iname_pieces[0] + + if not scan_iname_constr.plain_is_universe(): + raise ValueError("found constraints: %s" % scan_iname_constr) + + if scan_iname_aff.dim(dim_type.div): + raise ValueError("aff has div: %s" % scan_iname_aff) + + coeffs = scan_iname_aff.get_coefficients_by_name(dim_type.param) + + if len(coeffs) == 0: + try: + scan_iname_aff.get_constant_val() + except: + raise ValueError("range for aff isn't constant: '%s'" % scan_iname_aff) + + # If this point is reached we're assuming the domain is of the form + # {[i,j]: i=0 and j=0}, so the stride is technically 1 - any value + # this function returns will be verified later by + # _check_reduction_is_triangular(). 
+ return 1 + + if sweep_iname not in coeffs: + raise ValueError("didn't find sweep iname in coeffs: %s" % sweep_iname) + + stride = coeffs[sweep_iname] + + if not stride.is_int(): + raise ValueError("stride not an integer: %s" % stride) + + if not stride.is_pos(): + raise ValueError("stride not positive: %s" % stride) + + return stride.to_python() + + +def _get_domain_with_iname_as_param(domain, iname): + dim_type = isl.dim_type + + if domain.find_dim_by_name(dim_type.param, iname) >= 0: + return domain + + iname_idx = domain.find_dim_by_name(dim_type.set, iname) + + assert iname_idx >= 0, (iname, domain) + + return domain.move_dims( + dim_type.param, domain.dim(dim_type.param), + dim_type.set, iname_idx, 1) + + +def _create_domain_for_sweep_tracking(orig_domain, + tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride): + dim_type = isl.dim_type + + subd = isl.BasicSet.universe(orig_domain.params().space) + + # Add tracking_iname and sweep iname. + + subd = _add_params_to_domain(subd, (sweep_iname, tracking_iname)) + + # Here we realize the domain: + # + # [..., i] -> { + # [j]: 0 <= j - l + # and + # j - l <= k * (i - m) + # and + # k * (i - m - 1) < j - l } + # where + # * i is the sweep iname + # * j is the tracking iname + # * k is the stride for the scan + # * l is the lower bound for the scan + # * m is the lower bound for the sweep iname + # + affs = isl.affs_from_space(subd.space) + + subd &= (affs[tracking_iname] - scan_min_value).ge_set(affs[0]) + subd &= (affs[tracking_iname] - scan_min_value)\ + .le_set(stride * (affs[sweep_iname] - sweep_min_value)) + subd &= (affs[tracking_iname] - scan_min_value)\ + .gt_set(stride * (affs[sweep_iname] - sweep_min_value - 1)) + + # Move tracking_iname into a set dim (NOT sweep iname). + subd = subd.move_dims( + dim_type.set, 0, + dim_type.param, subd.dim(dim_type.param) - 1, 1) + + # Simplify (maybe). 
+ orig_domain_with_sweep_param = ( + _get_domain_with_iname_as_param(orig_domain, sweep_iname)) + subd = subd.gist_params(orig_domain_with_sweep_param.params()) + + subd, = subd.get_basic_sets() + + return subd + + +def _strip_if_scalar(reference_exprs, expr): + if len(reference_exprs) == 1: + return expr[0] + else: + return expr + + +def _infer_arg_dtypes_and_reduction_dtypes(kernel, expr, unknown_types_ok): + arg_dtypes = [] + + from loopy.type_inference import TypeInferenceMapper + type_inf_mapper = TypeInferenceMapper(kernel) + import loopy as lp + + for sub_expr in expr.exprs: + try: + arg_dtype = type_inf_mapper(sub_expr) + except DependencyTypeInferenceFailure: + if unknown_types_ok: + arg_dtype = lp.auto + else: + raise LoopyError("failed to determine type of accumulator for " + "reduction sub-expression '%s'" % sub_expr) + else: + arg_dtype = arg_dtype.with_target(kernel.target) + + arg_dtypes.append(arg_dtype) + + reduction_dtypes = expr.operation.result_dtypes(kernel, *arg_dtypes) + reduction_dtypes = tuple( + dt.with_target(kernel.target) + if dt is not lp.auto else dt + for dt in reduction_dtypes) + + return tuple(arg_dtypes), reduction_dtypes + + +def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel): + """ + Multi assignment function calls are currently lowered into OpenCL so that + the function call:: + + a, b = segmented_sum(x, y, z, w) + + becomes:: + + a = segmented_sum_mangled(x, y, z, w, &b). + + For OpenCL, the scope of "b" is significant, and the preamble generation + currently assumes the scope is always private. This function forces that to + be the case by introducing temporary assignments into the kernel. 
+ """ + + insn_id_gen = kernel.get_instruction_id_generator() + var_name_gen = kernel.get_var_name_generator() + + new_or_updated_instructions = {} + new_temporaries = {} + + dep_map = dict( + (insn.id, insn.depends_on) for insn in kernel.instructions) + + inverse_dep_map = dict((insn.id, set()) for insn in kernel.instructions) + + import six + for insn_id, deps in six.iteritems(dep_map): + for dep in deps: + inverse_dep_map[dep].add(insn_id) + + del dep_map + + # {{{ utils + + def _add_to_no_sync_with(insn_id, new_no_sync_with_params): + insn = kernel.id_to_insn.get(insn_id) + insn = new_or_updated_instructions.get(insn_id, insn) + new_or_updated_instructions[insn_id] = ( + insn.copy( + no_sync_with=( + insn.no_sync_with | frozenset(new_no_sync_with_params)))) + + def _add_to_depends_on(insn_id, new_depends_on_params): + insn = kernel.id_to_insn.get(insn_id) + insn = new_or_updated_instructions.get(insn_id, insn) + new_or_updated_instructions[insn_id] = ( + insn.copy( + depends_on=insn.depends_on | frozenset(new_depends_on_params))) + + # }}} + + from loopy.kernel.instruction import CallInstruction + for insn in kernel.instructions: + if not isinstance(insn, CallInstruction): + continue + + if len(insn.assignees) <= 1: + continue + + assignees = insn.assignees + assignee_var_names = insn.assignee_var_names() + + new_assignees = [assignees[0]] + newly_added_assignments_ids = set() + needs_replacement = False + + last_added_insn_id = insn.id + + from loopy.kernel.data import temp_var_scope, TemporaryVariable + + FIRST_POINTER_ASSIGNEE_IDX = 1 # noqa + + for assignee_nr, assignee_var_name, assignee in zip( + range(FIRST_POINTER_ASSIGNEE_IDX, len(assignees)), + assignee_var_names[FIRST_POINTER_ASSIGNEE_IDX:], + assignees[FIRST_POINTER_ASSIGNEE_IDX:]): + + if ( + assignee_var_name in kernel.temporary_variables + and + (kernel.temporary_variables[assignee_var_name].scope + == temp_var_scope.PRIVATE)): + new_assignees.append(assignee) + continue + + needs_replacement = 
True + + # {{{ generate a new assignent instruction + + new_assignee_name = var_name_gen( + "{insn_id}_retval_{assignee_nr}" + .format(insn_id=insn.id, assignee_nr=assignee_nr)) + + new_assignment_id = insn_id_gen( + "{insn_id}_assign_retval_{assignee_nr}" + .format(insn_id=insn.id, assignee_nr=assignee_nr)) + + newly_added_assignments_ids.add(new_assignment_id) + + import loopy as lp + new_temporaries[new_assignee_name] = ( + TemporaryVariable( + name=new_assignee_name, + dtype=lp.auto, + scope=temp_var_scope.PRIVATE)) + + from pymbolic import var + new_assignee = var(new_assignee_name) + new_assignees.append(new_assignee) + + new_or_updated_instructions[new_assignment_id] = ( + make_assignment( + assignees=(assignee,), + expression=new_assignee, + id=new_assignment_id, + depends_on=frozenset([last_added_insn_id]), + depends_on_is_final=True, + no_sync_with=( + insn.no_sync_with | frozenset([(insn.id, "any")])), + predicates=insn.predicates, + within_inames=insn.within_inames)) + + last_added_insn_id = new_assignment_id + + # }}} + + if not needs_replacement: + continue + + # {{{ update originating instruction + + orig_insn = new_or_updated_instructions.get(insn.id, insn) + + new_or_updated_instructions[insn.id] = ( + orig_insn.copy(assignees=tuple(new_assignees))) + + _add_to_no_sync_with(insn.id, + [(id, "any") for id in newly_added_assignments_ids]) + + # }}} + + # {{{ squash spurious memory dependencies amongst new assignments + + for new_insn_id in newly_added_assignments_ids: + _add_to_no_sync_with(new_insn_id, + [(id, "any") + for id in newly_added_assignments_ids + if id != new_insn_id]) + + # }}} + + # {{{ update instructions that depend on the originating instruction + + for inverse_dep in inverse_dep_map[insn.id]: + _add_to_depends_on(inverse_dep, newly_added_assignments_ids) + + for insn_id, scope in ( + new_or_updated_instructions[inverse_dep].no_sync_with): + if insn_id == insn.id: + _add_to_no_sync_with( + inverse_dep, + [(id, scope) for id in 
newly_added_assignments_ids]) + + # }}} + + new_temporary_variables = kernel.temporary_variables.copy() + new_temporary_variables.update(new_temporaries) + + new_instructions = ( + list(new_or_updated_instructions.values()) + + list(insn + for insn in kernel.instructions + if insn.id not in new_or_updated_instructions)) + + return kernel.copy(temporary_variables=new_temporary_variables, + instructions=new_instructions) + + +def _insert_subdomain_into_domain_tree(kernel, domains, subdomain): + # Intersect with inames, because we could have captured some kernel params + # in here too... + dependent_inames = ( + frozenset(subdomain.get_var_names(isl.dim_type.param)) + & kernel.all_inames()) + idx, = kernel.get_leaf_domain_indices(dependent_inames) + domains.insert(idx + 1, subdomain) + +# }}} + + +def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True, + automagic_scans_ok=True, force_scan=False, + force_outer_iname_for_scan=None): """Rewrites reductions into their imperative form. With *insn_id_filter* specified, operate only on the instruction with an instruction id matching *insn_id_filter*. @@ -283,6 +942,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): If *insn_id_filter* is not given, all reductions in all instructions will be realized. + + If *automagic_scans_ok*, this function will attempt to rewrite triangular + reductions as scans automatically. + + If *force_scan* is *True*, this function will attempt to rewrite *all* + candidate reductions as scans and raise an error if this is not possible + (this is most useful combined with *insn_id_filter*). + + If *force_outer_iname_for_scan* is not *None*, this function will attempt + to realize candidate reductions as scans using the specified iname as the + outer (sweep) iname. 
""" logger.debug("%s: realize reduction" % kernel.name) @@ -295,29 +965,34 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): var_name_gen = kernel.get_var_name_generator() new_temporary_variables = kernel.temporary_variables.copy() - from loopy.type_inference import TypeInferenceMapper - type_inf_mapper = TypeInferenceMapper(kernel) + # Dummy inames to remove after scans have been realized + inames_to_remove = set() + + inames_added_for_scan = set() # {{{ sequential - def map_reduction_seq(expr, rec, nresults, arg_dtype, + def map_reduction_seq(expr, rec, nresults, arg_dtypes, reduction_dtypes): outer_insn_inames = temp_kernel.insn_inames(insn) - from pymbolic import var - acc_var_names = [ - var_name_gen("acc_"+"_".join(expr.inames)) - for i in range(nresults)] - acc_vars = tuple(var(n) for n in acc_var_names) + from loopy.kernel.data import temp_var_scope + acc_var_names = make_temporaries( + name_based_on="acc_"+"_".join(expr.inames), + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) - from loopy.kernel.data import TemporaryVariable, temp_var_scope + init_insn_depends_on = frozenset() - for name, dtype in zip(acc_var_names, reduction_dtypes): - new_temporary_variables[name] = TemporaryVariable( - name=name, - shape=(), - dtype=dtype, - scope=temp_var_scope.PRIVATE) + global_barrier = temp_kernel.find_most_recent_global_barrier(insn.id) + + if global_barrier is not None: + init_insn_depends_on |= frozenset([global_barrier]) + + from pymbolic import var + acc_vars = tuple(var(n) for n in acc_var_names) init_id = insn_id_gen( "%s_%s_init" % (insn.id, "_".join(expr.inames))) @@ -327,8 +1002,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): assignees=acc_vars, within_inames=outer_insn_inames - frozenset(expr.inames), within_inames_is_final=insn.within_inames_is_final, - depends_on=frozenset(), - expression=expr.operation.neutral_element(arg_dtype, expr.inames)) + 
depends_on=init_insn_depends_on, + expression=expr.operation.neutral_element(*arg_dtypes)) generated_insns.append(init_insn) @@ -343,9 +1018,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): id=update_id, assignees=acc_vars, expression=expr.operation( - arg_dtype, - acc_vars if len(acc_vars) > 1 else acc_vars[0], - expr.expr, expr.inames), + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + _strip_if_scalar(acc_vars, expr.exprs)), depends_on=frozenset([init_insn.id]) | insn.depends_on, within_inames=update_insn_iname_deps, within_inames_is_final=insn.within_inames_is_final) @@ -382,7 +1057,15 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): v[iname].lt_set(v[0] + size)).get_basic_sets() return bs - def map_reduction_local(expr, rec, nresults, arg_dtype, + def _make_slab_set_from_range(iname, lbound, ubound): + v = isl.make_zero_and_vars([iname]) + bs, = ( + v[iname].ge_set(v[0] + lbound) + & + v[iname].lt_set(v[0] + ubound)).get_basic_sets() + return bs + + def map_reduction_local(expr, rec, nresults, arg_dtypes, reduction_dtypes): red_iname, = expr.inames @@ -406,6 +1089,24 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): _get_int_iname_size(oiname) for oiname in outer_local_inames) + from loopy.kernel.data import temp_var_scope + + neutral_var_names = make_temporaries( + name_based_on="neutral_"+red_iname, + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) + + acc_var_names = make_temporaries( + name_based_on="acc_"+red_iname, + nvars=nresults, + shape=outer_local_iname_sizes + (size,), + dtypes=reduction_dtypes, + scope=temp_var_scope.LOCAL) + + acc_vars = tuple(var(n) for n in acc_var_names) + # {{{ add separate iname to carry out the reduction # Doing this sheds any odd conditionals that may be active @@ -417,32 +1118,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): # }}} - neutral_var_names = [ - 
var_name_gen("neutral_"+red_iname) - for i in range(nresults)] - acc_var_names = [ - var_name_gen("acc_"+red_iname) - for i in range(nresults)] - acc_vars = tuple(var(n) for n in acc_var_names) - - from loopy.kernel.data import TemporaryVariable, temp_var_scope - for name, dtype in zip(acc_var_names, reduction_dtypes): - new_temporary_variables[name] = TemporaryVariable( - name=name, - shape=outer_local_iname_sizes + (size,), - dtype=dtype, - scope=temp_var_scope.LOCAL) - for name, dtype in zip(neutral_var_names, reduction_dtypes): - new_temporary_variables[name] = TemporaryVariable( - name=name, - shape=(), - dtype=dtype, - scope=temp_var_scope.PRIVATE) - base_iname_deps = outer_insn_inames - frozenset(expr.inames) - neutral = expr.operation.neutral_element(arg_dtype, expr.inames) - + neutral = expr.operation.neutral_element(*arg_dtypes) init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname)) init_insn = make_assignment( id=init_id, @@ -455,12 +1133,6 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): depends_on=frozenset()) generated_insns.append(init_insn) - def _strip_if_scalar(c): - if len(acc_vars) == 1: - return c[0] - else: - return c - init_neutral_id = insn_id_gen("%s_%s_init_neutral" % (insn.id, red_iname)) init_neutral_insn = make_assignment( id=init_neutral_id, @@ -478,9 +1150,11 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): acc_var[outer_local_iname_vars + (var(red_iname),)] for acc_var in acc_vars), expression=expr.operation( - arg_dtype, - _strip_if_scalar(tuple(var(nvn) for nvn in neutral_var_names)), - expr.expr, expr.inames), + arg_dtypes, + _strip_if_scalar( + expr.exprs, + tuple(var(nvn) for nvn in neutral_var_names)), + _strip_if_scalar(expr.exprs, expr.exprs)), within_inames=( (outer_insn_inames - frozenset(expr.inames)) | frozenset([red_iname])), @@ -513,17 +1187,16 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): acc_var[outer_local_iname_vars + 
(var(stage_exec_iname),)] for acc_var in acc_vars), expression=expr.operation( - arg_dtype, - _strip_if_scalar(tuple( + arg_dtypes, + _strip_if_scalar(acc_vars, tuple( acc_var[ outer_local_iname_vars + (var(stage_exec_iname),)] for acc_var in acc_vars)), - _strip_if_scalar(tuple( + _strip_if_scalar(acc_vars, tuple( acc_var[ outer_local_iname_vars + ( var(stage_exec_iname) + new_size,)] - for acc_var in acc_vars)), - expr.inames), + for acc_var in acc_vars))), within_inames=( base_iname_deps | frozenset([stage_exec_iname])), within_inames_is_final=insn.within_inames_is_final, @@ -548,30 +1221,371 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): return [acc_var[outer_local_iname_vars + (0,)] for acc_var in acc_vars] # }}} - # {{{ seq/par dispatch + # {{{ utils (stateful) - def map_reduction(expr, rec, nresults=1): - # Only expand one level of reduction at a time, going from outermost to - # innermost. Otherwise we get the (iname + insn) dependencies wrong. + @memoize + def get_or_add_sweep_tracking_iname_and_domain( + scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, + tracking_iname): + domain = temp_kernel.get_inames_domain(frozenset((scan_iname, sweep_iname))) - try: - arg_dtype = type_inf_mapper(expr.expr) - except DependencyTypeInferenceFailure: - if unknown_types_ok: - arg_dtype = lp.auto + inames_added_for_scan.add(tracking_iname) + + new_domain = _create_domain_for_sweep_tracking(domain, + tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride) + + _insert_subdomain_into_domain_tree(temp_kernel, domains, new_domain) + + return tracking_iname + + def replace_var_within_expr(expr, from_var, to_var): + from pymbolic.mapper.substitutor import make_subst_func + + from loopy.symbolic import ( + SubstitutionRuleMappingContext, RuleAwareSubstitutionMapper) + + rule_mapping_context = SubstitutionRuleMappingContext( + temp_kernel.substitutions, var_name_gen) + + from pymbolic import var + mapper = 
RuleAwareSubstitutionMapper( + rule_mapping_context, + make_subst_func({from_var: var(to_var)}), + within=lambda *args: True) + + return mapper(expr, temp_kernel, None) + + def make_temporaries(name_based_on, nvars, shape, dtypes, scope): + var_names = [ + var_name_gen(name_based_on.format(index=i)) + for i in range(nvars)] + + from loopy.kernel.data import TemporaryVariable + + for name, dtype in zip(var_names, dtypes): + new_temporary_variables[name] = TemporaryVariable( + name=name, + shape=shape, + dtype=dtype, + scope=scope) + + return var_names + + # }}} + + # {{{ sequential scan + + def map_scan_seq(expr, rec, nresults, arg_dtypes, + reduction_dtypes, sweep_iname, scan_iname, sweep_min_value, + scan_min_value, stride): + outer_insn_inames = temp_kernel.insn_inames(insn) + inames_to_remove.add(scan_iname) + + track_iname = var_name_gen( + "{sweep_iname}__seq_scan" + .format(scan_iname=scan_iname, sweep_iname=sweep_iname)) + + get_or_add_sweep_tracking_iname_and_domain( + scan_iname, sweep_iname, sweep_min_value, scan_min_value, + stride, track_iname) + + from loopy.kernel.data import temp_var_scope + acc_var_names = make_temporaries( + name_based_on="acc_" + scan_iname, + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) + + from pymbolic import var + acc_vars = tuple(var(n) for n in acc_var_names) + + init_id = insn_id_gen( + "%s_%s_init" % (insn.id, "_".join(expr.inames))) + + init_insn_depends_on = frozenset() + + global_barrier = temp_kernel.find_most_recent_global_barrier(insn.id) + + if global_barrier is not None: + init_insn_depends_on |= frozenset([global_barrier]) + + init_insn = make_assignment( + id=init_id, + assignees=acc_vars, + within_inames=outer_insn_inames - frozenset( + (sweep_iname,) + expr.inames), + within_inames_is_final=insn.within_inames_is_final, + depends_on=init_insn_depends_on, + expression=expr.operation.neutral_element(*arg_dtypes)) + + generated_insns.append(init_insn) + + 
updated_inner_exprs = tuple( + replace_var_within_expr(sub_expr, scan_iname, track_iname) + for sub_expr in expr.exprs) + + update_id = insn_id_gen( + based_on="%s_%s_update" % (insn.id, "_".join(expr.inames))) + + update_insn_iname_deps = temp_kernel.insn_inames(insn) | set([track_iname]) + if insn.within_inames_is_final: + update_insn_iname_deps = insn.within_inames | set([track_iname]) + + scan_insn = make_assignment( + id=update_id, + assignees=acc_vars, + expression=expr.operation( + arg_dtypes, + _strip_if_scalar(acc_vars, acc_vars), + _strip_if_scalar(acc_vars, updated_inner_exprs)), + depends_on=frozenset([init_insn.id]) | insn.depends_on, + within_inames=update_insn_iname_deps, + no_sync_with=insn.no_sync_with, + within_inames_is_final=insn.within_inames_is_final) + + generated_insns.append(scan_insn) + + new_insn_add_depends_on.add(scan_insn.id) - reduction_dtypes = (lp.auto,)*nresults + if nresults == 1: + assert len(acc_vars) == 1 + return acc_vars[0] + else: + return acc_vars + + # }}} + + # {{{ local-parallel scan + + def map_scan_local(expr, rec, nresults, arg_dtypes, + reduction_dtypes, sweep_iname, scan_iname, + sweep_min_value, scan_min_value, stride): + + scan_size = _get_int_iname_size(sweep_iname) + + assert scan_size > 0 + + if scan_size == 1: + return map_reduction_seq( + expr, rec, nresults, arg_dtypes, reduction_dtypes) + + outer_insn_inames = temp_kernel.insn_inames(insn) + from loopy.kernel.data import LocalIndexTagBase + outer_local_inames = tuple( + oiname + for oiname in outer_insn_inames + if isinstance( + kernel.iname_to_tag.get(oiname), + LocalIndexTagBase) + and oiname != sweep_iname) + + from pymbolic import var + outer_local_iname_vars = tuple( + var(oiname) for oiname in outer_local_inames) + + outer_local_iname_sizes = tuple( + _get_int_iname_size(oiname) + for oiname in outer_local_inames) + + track_iname = var_name_gen( + "{sweep_iname}__pre_scan" + .format(scan_iname=scan_iname, sweep_iname=sweep_iname)) + + 
get_or_add_sweep_tracking_iname_and_domain( + scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride, + track_iname) + + # {{{ add separate iname to carry out the scan + + # Doing this sheds any odd conditionals that may be active + # on our scan_iname. + + base_exec_iname = var_name_gen(sweep_iname + "__scan") + domains.append(_make_slab_set(base_exec_iname, scan_size)) + new_iname_tags[base_exec_iname] = kernel.iname_to_tag[sweep_iname] + + # }}} + + from loopy.kernel.data import temp_var_scope + + """ + neutral_var_names = make_temporaries( + name_based_on="neutral_"+scan_iname, + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) + """ + + read_var_names = make_temporaries( + name_based_on="read_"+scan_iname+"_arg_{index}", + nvars=nresults, + shape=(), + dtypes=reduction_dtypes, + scope=temp_var_scope.PRIVATE) + + acc_var_names = make_temporaries( + name_based_on="acc_"+scan_iname, + nvars=nresults, + shape=outer_local_iname_sizes + (scan_size,), + dtypes=reduction_dtypes, + scope=temp_var_scope.LOCAL) + + acc_vars = tuple(var(n) for n in acc_var_names) + read_vars = tuple(var(n) for n in read_var_names) + #neutral_vars = tuple(var(n) for n in neutral_var_names) + + base_iname_deps = (outer_insn_inames + - frozenset(expr.inames) - frozenset([sweep_iname])) + + neutral = expr.operation.neutral_element(*arg_dtypes) + + init_insn_depends_on = insn.depends_on + + global_barrier = temp_kernel.find_most_recent_global_barrier(insn.id) + + if global_barrier is not None: + init_insn_depends_on |= frozenset([global_barrier]) + + init_id = insn_id_gen("%s_%s_init" % (insn.id, scan_iname)) + init_insn = make_assignment( + id=init_id, + assignees=tuple( + acc_var[outer_local_iname_vars + (var(base_exec_iname),)] + for acc_var in acc_vars), + expression=neutral, + within_inames=base_iname_deps | frozenset([base_exec_iname]), + within_inames_is_final=insn.within_inames_is_final, + depends_on=init_insn_depends_on) + 
generated_insns.append(init_insn) + + updated_inner_exprs = tuple( + replace_var_within_expr(sub_expr, scan_iname, track_iname) + for sub_expr in expr.exprs) + + from loopy.symbolic import Reduction + + from loopy.symbolic import pw_aff_to_expr + sweep_min_value_expr = pw_aff_to_expr(sweep_min_value) + + transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, scan_iname)) + transfer_insn = make_assignment( + id=transfer_id, + assignees=tuple( + acc_var[outer_local_iname_vars + + (var(sweep_iname) - sweep_min_value_expr,)] + for acc_var in acc_vars), + expression=Reduction( + operation=expr.operation, + inames=(track_iname,), + exprs=updated_inner_exprs, + allow_simultaneous=False, + ), + within_inames=outer_insn_inames - frozenset(expr.inames), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset([init_id]) | insn.depends_on, + no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with) + + generated_insns.append(transfer_insn) + + def _strip_if_scalar(c): + if len(acc_vars) == 1: + return c[0] else: - raise LoopyError("failed to determine type of accumulator for " - "reduction '%s'" % expr) + return c + + prev_id = transfer_id + + istage = 0 + cur_size = 1 + + while cur_size < scan_size: + stage_exec_iname = var_name_gen("%s__scan_s%d" % (sweep_iname, istage)) + domains.append( + _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size)) + new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[sweep_iname] + + for read_var, acc_var in zip(read_vars, acc_vars): + read_stage_id = insn_id_gen( + "scan_%s_read_stage_%d" % (scan_iname, istage)) + + read_stage_insn = make_assignment( + id=read_stage_id, + assignees=(read_var,), + expression=( + acc_var[ + outer_local_iname_vars + + (var(stage_exec_iname) - cur_size,)]), + within_inames=( + base_iname_deps | frozenset([stage_exec_iname])), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset([prev_id])) + + if cur_size == 1: + # Performance hack: don't add a 
barrier here with transfer_insn. + # XXX: If the lowering logic changes, this could be brittle. + read_stage_insn = read_stage_insn.copy( + no_sync_with=( + read_stage_insn.no_sync_with + | frozenset([(transfer_id, "any")]))) + + generated_insns.append(read_stage_insn) + prev_id = read_stage_id + + write_stage_id = insn_id_gen( + "scan_%s_write_stage_%d" % (scan_iname, istage)) + write_stage_insn = make_assignment( + id=write_stage_id, + assignees=tuple( + acc_var[outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars), + expression=expr.operation( + arg_dtypes, + _strip_if_scalar(tuple( + acc_var[ + outer_local_iname_vars + (var(stage_exec_iname),)] + for acc_var in acc_vars)), + _strip_if_scalar(read_vars) + ), + within_inames=( + base_iname_deps | frozenset([stage_exec_iname])), + within_inames_is_final=insn.within_inames_is_final, + depends_on=frozenset([prev_id]), + ) + + generated_insns.append(write_stage_insn) + prev_id = write_stage_id + + #cur_size = new_size + #bound = cur_size + cur_size *= 2 + istage += 1 + + new_insn_add_depends_on.add(prev_id) + new_insn_add_within_inames.add(sweep_iname) + + output_idx = var(sweep_iname) - sweep_min_value_expr + + if nresults == 1: + assert len(acc_vars) == 1 + return acc_vars[0][outer_local_iname_vars + (output_idx,)] else: - arg_dtype = arg_dtype.with_target(kernel.target) + return [acc_var[outer_local_iname_vars + (output_idx,)] + for acc_var in acc_vars] + + # }}} - reduction_dtypes = expr.operation.result_dtypes( - kernel, arg_dtype, expr.inames) - reduction_dtypes = tuple( - dt.with_target(kernel.target) for dt in reduction_dtypes) + # {{{ seq/par dispatch + + def map_reduction(expr, rec, nresults=1): + # Only expand one level of reduction at a time, going from outermost to + # innermost. Otherwise we get the (iname + insn) dependencies wrong. 
+ + arg_dtypes, reduction_dtypes = ( + _infer_arg_dtypes_and_reduction_dtypes( + temp_kernel, expr, unknown_types_ok)) outer_insn_inames = temp_kernel.insn_inames(insn) bad_inames = frozenset(expr.inames) & outer_insn_inames @@ -579,31 +1593,43 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): raise LoopyError("reduction used within loop(s) that it was " "supposed to reduce over: " + ", ".join(bad_inames)) - n_sequential = 0 - n_local_par = 0 + iname_classes = _classify_reduction_inames(temp_kernel, expr.inames) - from loopy.kernel.data import ( - LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag, - ParallelTag) - for iname in expr.inames: - iname_tag = kernel.iname_to_tag.get(iname) + n_sequential = len(iname_classes.sequential) + n_local_par = len(iname_classes.local_parallel) + n_nonlocal_par = len(iname_classes.nonlocal_parallel) + + really_force_scan = force_scan and ( + len(expr.inames) != 1 or expr.inames[0] not in inames_added_for_scan) + + def _error_if_force_scan_on(cls, msg): + if really_force_scan: + raise cls(msg) - if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)): - # These are nominally parallel, but we can live with - # them as sequential. - n_sequential += 1 + may_be_implemented_as_scan = False + if force_scan or automagic_scans_ok: + from loopy.diagnostic import ReductionIsNotTriangularError - elif isinstance(iname_tag, LocalIndexTagBase): - n_local_par += 1 + try: + # Try to determine scan candidate information (sweep iname, scan + # iname, etc). 
+ scan_param = _try_infer_scan_candidate_from_expr( + temp_kernel, expr, outer_insn_inames, + sweep_iname=force_outer_iname_for_scan) - elif isinstance(iname_tag, (ParallelTag, VectorizeTag)): - raise LoopyError("the only form of parallelism supported " - "by reductions is 'local'--found iname '%s' " - "tagged '%s'" - % (iname, type(iname_tag).__name__)) + except ValueError as v: + error = str(v) else: - n_sequential += 1 + # Ensures the reduction is triangular (somewhat expensive). + may_be_implemented_as_scan, error = ( + _check_reduction_is_triangular( + temp_kernel, expr, scan_param)) + + if not may_be_implemented_as_scan: + _error_if_force_scan_on(ReductionIsNotTriangularError, error) + + # {{{ sanity checks if n_local_par and n_sequential: raise LoopyError("Reduction over '%s' contains both parallel and " @@ -619,21 +1645,84 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): "before code generation." % ", ".join(expr.inames)) - if n_sequential: - assert n_local_par == 0 - return map_reduction_seq(expr, rec, nresults, arg_dtype, - reduction_dtypes) - elif n_local_par: - return map_reduction_local(expr, rec, nresults, arg_dtype, - reduction_dtypes) - else: + if n_nonlocal_par: + bad_inames = iname_classes.nonlocal_parallel + raise LoopyError("the only form of parallelism supported " + "by reductions is 'local'--found iname(s) '%s' " + "respectively tagged '%s'" + % (", ".join(bad_inames), + ", ".join(kernel.iname_to_tag[iname] + for iname in bad_inames))) + + if n_local_par == 0 and n_sequential == 0: from loopy.diagnostic import warn_with_kernel warn_with_kernel(kernel, "empty_reduction", "Empty reduction found (no inames to reduce over). " "Eliminating.") + # FIXME: return a neutral element. + return expr.expr + # }}} + + if may_be_implemented_as_scan: + assert force_scan or automagic_scans_ok + + # We require the "scan" iname to be tagged sequential. 
+ if n_sequential: + sweep_iname = scan_param.sweep_iname + sweep_class = _classify_reduction_inames(kernel, (sweep_iname,)) + + sequential = sweep_iname in sweep_class.sequential + parallel = sweep_iname in sweep_class.local_parallel + bad_parallel = sweep_iname in sweep_class.nonlocal_parallel + + if sweep_iname not in outer_insn_inames: + _error_if_force_scan_on(LoopyError, + "Sweep iname '%s' was detected, but is not an iname " + "for the instruction." % sweep_iname) + elif bad_parallel: + _error_if_force_scan_on(LoopyError, + "Sweep iname '%s' has an unsupported parallel tag '%s' " + "- the only parallelism allowed is 'local'." % + (sweep_iname, temp_kernel.iname_to_tag[sweep_iname])) + elif parallel: + return map_scan_local( + expr, rec, nresults, arg_dtypes, reduction_dtypes, + sweep_iname, scan_param.scan_iname, + scan_param.sweep_lower_bound, + scan_param.scan_lower_bound, + scan_param.stride) + elif sequential: + return map_scan_seq( + expr, rec, nresults, arg_dtypes, reduction_dtypes, + sweep_iname, scan_param.scan_iname, + scan_param.sweep_lower_bound, + scan_param.scan_lower_bound, + scan_param.stride) + + # fallthrough to reduction implementation + + else: + assert n_local_par > 0 + scan_iname, = expr.inames + _error_if_force_scan_on(LoopyError, + "Scan iname '%s' is parallel tagged: this is not allowed " + "(only the sweep iname should be tagged if parallelism " + "is desired)." % scan_iname) + + # fallthrough to reduction implementation + + if n_sequential: + assert n_local_par == 0 + return map_reduction_seq( + expr, rec, nresults, arg_dtypes, reduction_dtypes) + else: + assert n_local_par > 0 + return map_reduction_local( + expr, rec, nresults, arg_dtypes, reduction_dtypes) + # }}} from loopy.symbolic import ReductionCallbackMapper @@ -739,6 +1828,12 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): kernel = lp.tag_inames(kernel, new_iname_tags) + # TODO: remove unused inames... 
+ + kernel = ( + _hackily_ensure_multi_assignment_return_values_are_scoped_private( + kernel)) + return kernel # }}} diff --git a/loopy/schedule/tools.py b/loopy/schedule/tools.py index 5de677e72708be844a5276b3d40ace8b1dad9da0..e058fe30f1f6a9fc2746b4c542e88041f76818d3 100644 --- a/loopy/schedule/tools.py +++ b/loopy/schedule/tools.py @@ -23,10 +23,6 @@ THE SOFTWARE. """ from loopy.kernel.data import temp_var_scope -from loopy.schedule import (BeginBlockItem, CallKernel, EndBlockItem, - RunInstruction, Barrier) - -from pytools import memoize_method # {{{ block boundary finder @@ -37,6 +33,7 @@ def get_block_boundaries(schedule): :class:`loopy.schedule.BlockBeginItem`s to :class:`loopy.schedule.BlockEndItem`s and vice versa. """ + from loopy.schedule import (BeginBlockItem, EndBlockItem) block_bounds = {} active_blocks = [] for idx, sched_item in enumerate(schedule): @@ -51,109 +48,20 @@ def get_block_boundaries(schedule): # }}} -# {{{ instruction query utility - -class InstructionQuery(object): - - def __init__(self, kernel): - self.kernel = kernel - block_bounds = get_block_boundaries(kernel.schedule) - subkernel_slices = {} - from six import iteritems - for start, end in iteritems(block_bounds): - sched_item = kernel.schedule[start] - if isinstance(sched_item, CallKernel): - subkernel_slices[sched_item.kernel_name] = slice(start, end + 1) - self.subkernel_slices = subkernel_slices - - @memoize_method - def subkernels(self): - return frozenset(self.subkernel_slices.keys()) - - @memoize_method - def insns_reading_or_writing(self, var): - return frozenset(insn.id for insn in self.kernel.instructions - if var in insn.read_dependency_names() - or var in insn.assignee_var_names()) - - @memoize_method - def insns_in_subkernel(self, subkernel): - return frozenset(sched_item.insn_id for sched_item - in self.kernel.schedule[self.subkernel_slices[subkernel]] - if isinstance(sched_item, RunInstruction)) - - @memoize_method - def temporaries_read_in_subkernel(self, subkernel): 
- return frozenset( - var - for insn in self.insns_in_subkernel(subkernel) - for var in self.kernel.id_to_insn[insn].read_dependency_names() - if var in self.kernel.temporary_variables) - - @memoize_method - def temporaries_written_in_subkernel(self, subkernel): - return frozenset( - var - for insn in self.insns_in_subkernel(subkernel) - for var in self.kernel.id_to_insn[insn].assignee_var_names() - if var in self.kernel.temporary_variables) - - @memoize_method - def temporaries_read_or_written_in_subkernel(self, subkernel): - return ( - self.temporaries_read_in_subkernel(subkernel) | - self.temporaries_written_in_subkernel(subkernel)) - - @memoize_method - def inames_in_subkernel(self, subkernel): - subkernel_start = self.subkernel_slices[subkernel].start - return frozenset(self.kernel.schedule[subkernel_start].extra_inames) - - @memoize_method - def pre_and_post_barriers(self, subkernel): - subkernel_start = self.subkernel_slices[subkernel].start - subkernel_end = self.subkernel_slices[subkernel].stop - - def is_global_barrier(item): - return isinstance(item, Barrier) and item.kind == "global" - - try: - pre_barrier = next(item for item in - self.kernel.schedule[subkernel_start::-1] - if is_global_barrier(item)).originating_insn_id - except StopIteration: - pre_barrier = None - - try: - post_barrier = next(item for item in - self.kernel.schedule[subkernel_end:] - if is_global_barrier(item)).originating_insn_id - except StopIteration: - post_barrier = None - - return (pre_barrier, post_barrier) - - @memoize_method - def hw_inames(self, insn_id): - """ - Return the inames that insn runs in and that are tagged as hardware - parallel. 
- """ - from loopy.kernel.data import HardwareParallelTag - return set(iname for iname in self.kernel.insn_inames(insn_id) - if isinstance(self.kernel.iname_to_tag.get(iname), - HardwareParallelTag)) - - @memoize_method - def common_hw_inames(self, insn_ids): - """ - Return the common set of hardware parallel tagged inames among - the list of instructions. - """ - # Get the list of hardware inames in which the temporary is defined. - if len(insn_ids) == 0: - return set() - return set.intersection(*(self.hw_inames(id) for id in insn_ids)) +# {{{ subkernel tools + +def temporaries_read_in_subkernel(kernel, subkernel): + return frozenset(tv + for insn_id in kernel.subkernel_to_insn_ids[subkernel] + for tv in kernel.id_to_insn[insn_id].read_dependency_names() + if tv in kernel.temporary_variables) + + +def temporaries_written_in_subkernel(kernel, subkernel): + return frozenset(tv + for insn_id in kernel.subkernel_to_insn_ids[subkernel] + for tv in kernel.id_to_insn[insn_id].write_dependency_names() + if tv in kernel.temporary_variables) # }}} @@ -166,23 +74,27 @@ def add_extra_args_to_schedule(kernel): instructions in the schedule with global temporaries. 
""" new_schedule = [] - - insn_query = InstructionQuery(kernel) + from loopy.schedule import CallKernel for sched_item in kernel.schedule: if isinstance(sched_item, CallKernel): - subrange_temporaries = (insn_query - .temporaries_read_or_written_in_subkernel(sched_item.kernel_name)) + subkernel = sched_item.kernel_name + + used_temporaries = ( + temporaries_read_in_subkernel(kernel, subkernel) + | temporaries_written_in_subkernel(kernel, subkernel)) + more_args = set(tv - for tv in subrange_temporaries - if - kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL - and - kernel.temporary_variables[tv].initializer is None - and - tv not in sched_item.extra_args) + for tv in used_temporaries + if + kernel.temporary_variables[tv].scope == temp_var_scope.GLOBAL + and + kernel.temporary_variables[tv].initializer is None + and + tv not in sched_item.extra_args) + new_schedule.append(sched_item.copy( - extra_args=sched_item.extra_args + sorted(more_args))) + extra_args=sched_item.extra_args + sorted(more_args))) else: new_schedule.append(sched_item) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 50c891be476810887720c4e13c9659966b431f5d..bbf798e5e1d146050d210a04c99af6cb33057087 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -95,7 +95,8 @@ class IdentityMapperMixin(object): new_inames.append(new_sym_iname.name) return Reduction( - expr.operation, tuple(new_inames), self.rec(expr.expr, *args), + expr.operation, tuple(new_inames), + tuple(self.rec(e, *args) for e in expr.exprs), allow_simultaneous=expr.allow_simultaneous) def map_tagged_variable(self, expr, *args): @@ -144,7 +145,8 @@ class WalkMapper(WalkMapperBase): if not self.visit(expr): return - self.rec(expr.expr, *args) + for sub_expr in expr.exprs: + self.rec(sub_expr, *args) map_tagged_variable = WalkMapperBase.map_variable @@ -162,7 +164,7 @@ class CallbackMapper(CallbackMapperBase, IdentityMapper): class CombineMapper(CombineMapperBase): def map_reduction(self, expr): - return 
self.rec(expr.expr) + return self.combine(self.rec(sub_expr) for sub_expr in expr.exprs) map_linear_subscript = CombineMapperBase.map_subscript @@ -192,9 +194,11 @@ class StringifyMapper(StringifyMapperBase): return "loc.%d" % expr.index def map_reduction(self, expr, prec): + from pymbolic.mapper.stringifier import PREC_NONE return "%sreduce(%s, [%s], %s)" % ( "simul_" if expr.allow_simultaneous else "", - expr.operation, ", ".join(expr.inames), expr.expr) + expr.operation, ", ".join(expr.inames), + ", ".join(self.rec(e, PREC_NONE) for e in expr.exprs)) def map_tagged_variable(self, expr, prec): return "%s$%s" % (expr.name, expr.tag) @@ -225,7 +229,7 @@ class UnidirectionalUnifier(UnidirectionalUnifierBase): ): return [] - return self.rec(expr.expr, other.expr, unis) + return self.rec(expr.exprs, other.exprs, unis) def map_tagged_variable(self, expr, other, urecs): new_uni_record = self.unification_record_from_equation( @@ -258,7 +262,7 @@ class DependencyMapper(DependencyMapperBase): self.rec(child, *args) for child in expr.parameters) def map_reduction(self, expr): - return (self.rec(expr.expr) + return (self.combine(self.rec(sub_expr) for sub_expr in expr.exprs) - set(p.Variable(iname) for iname in expr.inames)) def map_tagged_variable(self, expr): @@ -440,10 +444,10 @@ class Reduction(p.Expression): a list of inames across which reduction on :attr:`expr` is being carried out. - .. attribute:: expr + .. attribute:: exprs - The expression (as a :class:`pymbolic.primitives.Expression`) - on which reduction is performed. + A :class:`tuple` of :class:`pymbolic.primitives.Expression`, + representing the expression(s) over which reduction is performed. .. attribute:: allow_simultaneous @@ -451,9 +455,9 @@ class Reduction(p.Expression): in precisely one reduction, to avoid mis-nesting errors. 
""" - init_arg_names = ("operation", "inames", "expr", "allow_simultaneous") + init_arg_names = ("operation", "inames", "exprs", "allow_simultaneous") - def __init__(self, operation, inames, expr, allow_simultaneous=False): + def __init__(self, operation, inames, exprs, allow_simultaneous=False): if isinstance(inames, str): inames = tuple(iname.strip() for iname in inames.split(",")) @@ -475,26 +479,28 @@ class Reduction(p.Expression): from loopy.library.reduction import parse_reduction_op operation = parse_reduction_op(operation) + if not isinstance(exprs, tuple): + exprs = (exprs,) + from loopy.library.reduction import ReductionOperation assert isinstance(operation, ReductionOperation) self.operation = operation self.inames = inames - self.expr = expr + self.exprs = exprs self.allow_simultaneous = allow_simultaneous def __getinitargs__(self): - return (self.operation, self.inames, self.expr, self.allow_simultaneous) + return (self.operation, self.inames, self.exprs, self.allow_simultaneous) def get_hash(self): - return hash((self.__class__, self.operation, self.inames, - self.expr)) + return hash((self.__class__, self.operation, self.inames, self.exprs)) def is_equal(self, other): return (other.__class__ == self.__class__ and other.operation == self.operation and other.inames == self.inames - and other.expr == self.expr) + and other.exprs == self.exprs) def stringifier(self): return StringifyMapper @@ -924,7 +930,7 @@ class FunctionToPrimitiveMapper(IdentityMapper): turns those into the actual pymbolic primitives used for that. 
""" - def _parse_reduction(self, operation, inames, red_expr, + def _parse_reduction(self, operation, inames, red_exprs, allow_simultaneous=False): if isinstance(inames, p.Variable): inames = (inames,) @@ -941,7 +947,7 @@ class FunctionToPrimitiveMapper(IdentityMapper): processed_inames.append(iname.name) - return Reduction(operation, tuple(processed_inames), red_expr, + return Reduction(operation, tuple(processed_inames), red_exprs, allow_simultaneous=allow_simultaneous) def map_call(self, expr): @@ -991,12 +997,17 @@ class FunctionToPrimitiveMapper(IdentityMapper): operation = parse_reduction_op(name) if operation: - if len(expr.parameters) != 2: + # arg_count counts arguments but not inames + if len(expr.parameters) != 1 + operation.arg_count: raise RuntimeError("invalid invocation of " - "reduction operation '%s'" % expr.function.name) + "reduction operation '%s': expected %d arguments, " + "got %d instead" % (expr.function.name, + 1 + operation.arg_count, + len(expr.parameters))) - inames, red_expr = expr.parameters - return self._parse_reduction(operation, inames, self.rec(red_expr)) + inames = expr.parameters[0] + red_exprs = tuple(self.rec(param) for param in expr.parameters[1:]) + return self._parse_reduction(operation, inames, red_exprs) else: return IdentityMapper.map_call(self, expr) @@ -1385,7 +1396,7 @@ class IndexVariableFinder(CombineMapper): return result def map_reduction(self, expr): - result = self.rec(expr.expr) + result = self.combine(self.rec(sub_expr) for sub_expr in expr.exprs) if not (expr.inames_set & result): raise RuntimeError("reduction '%s' does not depend on " diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 575311b11716f5a52e4713aa51922eb348c839d9..c6ff596b005bc782dbf33a1394e840c7f63ff2a5 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -683,7 +683,8 @@ def set_temporary_scope(kernel, temp_var_names, scope): # {{{ reduction_arg_to_subst_rule -def reduction_arg_to_subst_rule(knl, inames, 
insn_match=None, subst_rule_name=None): +def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=None, + arg_number=0): if isinstance(inames, str): inames = [s.strip() for s in inames.split(",")] @@ -693,12 +694,14 @@ def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=No var_name_gen = knl.get_var_name_generator() + # XXX def map_reduction(expr, rec, nresults=1): if frozenset(expr.inames) != inames_set: + assert len(expr.exprs) == 1 return type(expr)( operation=expr.operation, inames=expr.inames, - expr=rec(expr.expr), + exprs=(rec(expr.exprs[0]),), allow_simultaneous=expr.allow_simultaneous) if subst_rule_name is None: @@ -715,15 +718,24 @@ def reduction_arg_to_subst_rule(knl, inames, insn_match=None, subst_rule_name=No substs[my_subst_rule_name] = SubstitutionRule( name=my_subst_rule_name, arguments=tuple(inames), - expression=expr.expr) + expression=expr.exprs[arg_number]) from pymbolic import var iname_vars = [var(iname) for iname in inames] + new_exprs = [] + for sub_expr_number, sub_expr in enumerate(expr.exprs): + if sub_expr_number == arg_number: + new_exprs.append(var(my_subst_rule_name)(*iname_vars)) + else: + new_exprs.append(sub_expr) + + new_exprs = tuple(new_exprs) + return type(expr)( operation=expr.operation, inames=expr.inames, - expr=var(my_subst_rule_name)(*iname_vars), + exprs=new_exprs, allow_simultaneous=expr.allow_simultaneous) from loopy.symbolic import ReductionCallbackMapper diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py index c35b5064365293ac78cdd01af537c9d28bd67193..300faa638dd0434ac92507b3a26c39b8817ede83 100644 --- a/loopy/transform/iname.py +++ b/loopy/transform/iname.py @@ -145,7 +145,8 @@ class _InameSplitter(RuleAwareIdentityMapper): from loopy.symbolic import Reduction return Reduction(expr.operation, tuple(new_inames), - self.rec(expr.expr, expn_state), + tuple(self.rec(sub_expr, expn_state) + for sub_expr in expr.exprs), expr.allow_simultaneous) else: return 
super(_InameSplitter, self).map_reduction(expr, expn_state) @@ -1191,13 +1192,15 @@ class _ReductionSplitter(RuleAwareIdentityMapper): if self.direction == "in": return Reduction(expr.operation, tuple(leftover_inames), Reduction(expr.operation, tuple(self.inames), - self.rec(expr.expr, expn_state), + tuple(self.rec(sub_expr, expn_state) + for sub_expr in expr.exprs), expr.allow_simultaneous), expr.allow_simultaneous) elif self.direction == "out": return Reduction(expr.operation, tuple(self.inames), Reduction(expr.operation, tuple(leftover_inames), - self.rec(expr.expr, expn_state), + tuple(self.rec(sub_expr, expn_state) + for sub_expr in expr.exprs), expr.allow_simultaneous)) else: assert False @@ -1589,10 +1592,11 @@ class _ReductionInameUniquifier(RuleAwareIdentityMapper): from loopy.symbolic import Reduction return Reduction(expr.operation, tuple(new_inames), - self.rec( - SubstitutionMapper(make_subst_func(subst_dict))( - expr.expr), - expn_state), + tuple(self.rec( + SubstitutionMapper(make_subst_func(subst_dict))( + sub_expr), + expn_state) + for sub_expr in expr.exprs), expr.allow_simultaneous) else: return super(_ReductionInameUniquifier, self).map_reduction( diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 7c9c9688604179dce2aa7dcd6954d76a0df32cc7..410274f907b250bb81583281ebe503e8c8c80373 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -34,7 +34,6 @@ def find_instructions(kernel, insn_match): match = parse_match(insn_match) return [insn for insn in kernel.instructions if match(kernel, insn)] - # }}} @@ -171,6 +170,7 @@ def replace_instruction_ids(kernel, replacements): for insn in kernel.instructions: changed = False new_depends_on = [] + new_no_sync_with = [] for dep in insn.depends_on: if dep in replacements: @@ -179,8 +179,18 @@ def replace_instruction_ids(kernel, replacements): else: new_depends_on.append(dep) + for insn_id, scope in insn.no_sync_with: + if insn_id in replacements: + 
new_no_sync_with.extend( + (repl, scope) for repl in replacements[insn_id]) + changed = True + else: + new_no_sync_with.append((insn_id, scope)) + new_insns.append( - insn.copy(depends_on=frozenset(new_depends_on)) + insn.copy( + depends_on=frozenset(new_depends_on), + no_sync_with=frozenset(new_no_sync_with)) if changed else insn) return kernel.copy(instructions=new_insns) @@ -207,4 +217,77 @@ def tag_instructions(kernel, new_tag, within=None): # }}} +# {{{ add nosync + +def add_nosync(kernel, scope, source, sink, bidirectional=False, force=False): + """Add a *no_sync_with* directive between *source* and *sink*. + *no_sync_with* is only added if a (syntactic) dependency edge + is present or if the instruction pair is in a conflicting group + (this does not check for memory dependencies). + + :arg kernel: + :arg source: Either a single instruction id, or any instruction id + match understood by :func:`loopy.match.parse_match`. + :arg sink: Either a single instruction id, or any instruction id + match understood by :func:`loopy.match.parse_match`. + :arg scope: A string which is a valid *no_sync_with* scope. + :arg bidirectional: A :class:`bool`. If *True*, add a *no_sync_with* + to both the source and sink instructions, otherwise the directive + is only added to the sink instructions. + :arg force: A :class:`bool`. If *True*, will add a *no_sync_with* + even without the presence of a syntactic dependency edge/ + conflicting instruction group. 
+ + :return: The updated kernel + """ + + if isinstance(source, str) and source in kernel.id_to_insn: + sources = frozenset([source]) + else: + sources = frozenset( + source.id for source in find_instructions(kernel, source)) + + if isinstance(sink, str) and sink in kernel.id_to_insn: + sinks = frozenset([sink]) + else: + sinks = frozenset( + sink.id for sink in find_instructions(kernel, sink)) + + def insns_in_conflicting_groups(insn1_id, insn2_id): + insn1 = kernel.id_to_insn[insn1_id] + insn2 = kernel.id_to_insn[insn2_id] + return ( + bool(insn1.groups & insn2.conflicts_with_groups) + or + bool(insn2.groups & insn1.conflicts_with_groups)) + + from collections import defaultdict + nosync_to_add = defaultdict(set) + + for sink in sinks: + for source in sources: + + needs_nosync = force or ( + source in kernel.recursive_insn_dep_map()[sink] + or insns_in_conflicting_groups(source, sink)) + + if not needs_nosync: + continue + + nosync_to_add[sink].add((source, scope)) + if bidirectional: + nosync_to_add[source].add((sink, scope)) + + new_instructions = list(kernel.instructions) + + for i, insn in enumerate(new_instructions): + if insn.id in nosync_to_add: + new_instructions[i] = insn.copy(no_sync_with=insn.no_sync_with + | frozenset(nosync_to_add[insn.id])) + + return kernel.copy(instructions=new_instructions) + +# }}} + + # vim: foldmethod=marker diff --git a/loopy/transform/reduction.py b/loopy/transform/reduction.py new file mode 100644 index 0000000000000000000000000000000000000000..c63ac9d72a93994f7d6efd7545395611b1ec3279 --- /dev/null +++ b/loopy/transform/reduction.py @@ -0,0 +1,704 @@ +from __future__ import division, absolute_import + +__copyright__ = "Copyright (C) 2017 Matt Wala" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, 
merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +""" + + +import loopy as lp +import islpy as isl + + +import logging +logger = logging.getLogger(__name__) + + +__doc__ = """ +.. currentmodule:: loopy + +.. autofunction:: make_two_level_reduction +.. autofunction:: make_two_level_scan +""" + + +def make_two_level_reduction( + kernel, insn_id, inner_length, + nonlocal_storage_scope=None, + nonlocal_tag=None, + outer_tag=None, + inner_tag=None): + """ + Two level reduction, mediated through a "nonlocal" array. + + This turns a reduction of the form:: + + [...] result = reduce(i, f(i)) + + into:: + + i -> inner + inner_length * outer + + [..., nl] nonlocal[nl] = reduce(inner, f(nl, inner)) + [...] 
result = reduce(outer, nonlocal[outer]) + """ + + # {{{ sanity checks + + reduction = kernel.id_to_insn[insn_id].expression + reduction_iname, = reduction.inames + + # }}} + + # {{{ get stable names for everything + + var_name_gen = kernel.get_var_name_generator() + + format_kwargs = {"insn": insn_id, "iname": reduction_iname} + + nonlocal_storage_name = var_name_gen( + "{insn}_nonlocal".format(**format_kwargs)) + + inner_iname = var_name_gen( + "{iname}_inner".format(**format_kwargs)) + outer_iname = var_name_gen( + "{iname}_outer".format(**format_kwargs)) + nonlocal_iname = var_name_gen( + "{iname}_nonlocal".format(**format_kwargs)) + + inner_subst = var_name_gen( + "{insn}_inner_subst".format(**format_kwargs)) + + # }}} + + # First we split this iname. This results in (roughly) + # + # [...] result = reduce([outer, inner], f(outer, inner)) + # + # FIXME: within + + kernel = lp.split_iname(kernel, reduction_iname, inner_length, + outer_iname=outer_iname, inner_iname=inner_iname) + + # Next, we split the reduction inward and then extract a substitution + # rule for the reduction. This results in + # + # subst(outer) := reduce(inner, f(outer, inner)) + # [...] result = reduce([outer], subst(outer)) + # + # FIXME: within, insn_match... + + kernel = lp.split_reduction_inward(kernel, inner_iname) + from loopy.transform.data import reduction_arg_to_subst_rule + kernel = reduction_arg_to_subst_rule(kernel, outer_iname, + subst_rule_name=inner_subst) + + # Next, we precompute the inner iname into its own storage. + + # [...,nl] nonlocal[nl] = reduce(inner, f(nl, inner)) + # [...] 
result = reduce([outer], nonlocal[outer]) + + kernel = lp.precompute(kernel, inner_subst, + sweep_inames=[outer_iname], + precompute_inames=[nonlocal_iname], + temporary_name=nonlocal_storage_name, + temporary_scope=nonlocal_storage_scope) + + return kernel + + +def _update_instructions(kernel, id_to_new_insn, copy=True): + if not isinstance(id_to_new_insn, dict): + id_to_new_insn = dict((insn.id, insn) for insn in id_to_new_insn) + + new_instructions = ( + list(insn for insn in kernel.instructions + if insn.id not in id_to_new_insn) + + list(id_to_new_insn.values())) + + if copy: + kernel = kernel.copy() + + kernel.instructions = new_instructions + return kernel + + +def _make_slab_set(iname, size): + # FIXME: There is a very similar identically named function in + # preprocess. Refactor. + + if not isinstance(size, (isl.PwAff, isl.Aff)): + from loopy.symbolic import pwaff_from_expr + size = pwaff_from_expr( + isl.Space.params_alloc(isl.DEFAULT_CONTEXT, 0), size) + + base_space = size.get_domain_space() + + space = (base_space + .add_dims(isl.dim_type.set, 1) + .set_dim_name(isl.dim_type.set, base_space.dim(isl.dim_type.set), iname)) + + v = isl.affs_from_space(space) + + size = isl.align_spaces(size, v[0]) + + bs, = ( + v[0].le_set(v[iname]) + & + v[iname].lt_set(v[0] + size)).get_basic_sets() + + return bs + + +def _add_subdomain_to_kernel(kernel, subdomain): + domains = list(kernel.domains) + # Filter out value parameters. 
+ dep_inames = ( + frozenset(subdomain.get_var_names(isl.dim_type.param)) + & kernel.all_inames()) + + indices = kernel.get_leaf_domain_indices(dep_inames) + + if len(indices) == 0: + domains.append(subdomain) + elif len(indices) == 1: + idx, = indices + domains.insert(idx + 1, subdomain) + else: + print(indices) + raise ValueError("more than 1 leaf index") + + return kernel.copy(domains=domains) + + +def _add_scan_subdomain( + kernel, scan_iname, sweep_iname): + """ + Add the following domain to the kernel:: + + [sweep_iname] -> {[scan_iname] : 0 <= scan_iname <= sweep_iname } + """ + sp = ( + isl.Space.set_alloc(isl.DEFAULT_CONTEXT, 1, 1) + .set_dim_name(isl.dim_type.param, 0, sweep_iname) + .set_dim_name(isl.dim_type.set, 0, scan_iname)) + + affs = isl.affs_from_space(sp) + + subd, = ( + affs[scan_iname].le_set(affs[sweep_iname]) + & + affs[scan_iname].ge_set(affs[0])).get_basic_sets() + + return _add_subdomain_to_kernel(kernel, subd) + + +def _expand_subst_within_expression(kernel, expr): + from loopy.symbolic import ( + RuleAwareSubstitutionRuleExpander, SubstitutionRuleMappingContext) + + rule_mapping_context = SubstitutionRuleMappingContext( + kernel.substitutions, kernel.get_var_name_generator()) + submap = RuleAwareSubstitutionRuleExpander( + rule_mapping_context, + kernel.substitutions, + within=lambda *args: True + ) + return submap(expr, kernel, insn=None) + + +def _add_global_barrier(kernel, source, sink, barrier_id): + from loopy.kernel.instruction import BarrierInstruction + + sources = (source,) if isinstance(source, str) else source + sinks = (sink,) if isinstance(sink, str) else sink + + within_inames = kernel.id_to_insn[sources[0]].within_inames + from itertools import chain + for iname in chain(sources[1:], sinks): + within_inames &= kernel.id_to_insn[iname].within_inames + + barrier_insn = BarrierInstruction( + id=barrier_id, + depends_on=frozenset(sources), + within_inames=within_inames, + kind="global") + + sink_insns = 
(kernel.id_to_insn[sink] for sink in sinks) + updated_sinks = ( + sink.copy(depends_on=sink.depends_on | frozenset([barrier_id])) + for sink in sink_insns) + + kernel = _update_instructions( + kernel, chain([barrier_insn], updated_sinks), copy=True) + + return kernel + + +def _get_scan_level(sweep_iname): + SWEEP_RE = r".*__l(\d+)(?:_outer)?" # noqa + + import re + match_result = re.match(SWEEP_RE, sweep_iname) + + if match_result is None: + return 0 + + return int(match_result.group(1)) + + +def _get_base_iname(iname): + BASE_INAME_RE = r"(.*)__l\d+(?:_outer)?" # noqa + + import re + match_result = re.match(BASE_INAME_RE, iname) + + if match_result is None: + return iname + + return match_result.group(1) + + +def make_two_level_scan( + kernel, insn_id, + scan_iname, + sweep_iname, + inner_length, + local_storage_name=None, + local_storage_scope=None, + local_storage_axes=None, + nonlocal_storage_name=None, + nonlocal_storage_scope=None, + nonlocal_tag=None, + outer_local_tag=None, + inner_local_tag=None, + inner_tag=None, + outer_tag=None, + inner_iname=None, + outer_iname=None): + """ + Two level scan, mediated through a "local" and "nonlocal" array. + + This turns a scan of the form:: + + [...,i] result = reduce(j, f(j)) + + into:: + + [...,l',l''] + [...,nlinit] nonlocal[0] = 0 + [...,nlinit] nonlocal[nlinit+1] = local[nlinit,-1] + [...,nl] + [...,i',i''] result = nonlocal[i'] + local[i',i''] + """ + + # {{{ sanity checks + + # FIXME: More sanity checks... 
+
+    insn = kernel.id_to_insn[insn_id]
+    scan = insn.expression
+    assert scan.inames[0] == scan_iname
+    assert len(scan.inames) == 1
+
+    # }}}
+
+    # {{{ get stable names for everything
+
+    var_name_gen = kernel.get_var_name_generator()
+    insn_id_gen = kernel.get_instruction_id_generator()
+
+    level = _get_scan_level(sweep_iname)
+    base_scan_iname = _get_base_iname(scan_iname)
+    base_sweep_iname = _get_base_iname(sweep_iname)
+    base_insn_id = _get_base_iname(insn_id)
+
+    format_kwargs = {
+        "insn": base_insn_id,
+        "iname": base_scan_iname,
+        "sweep": base_sweep_iname,
+        "level": level,
+        "next_level": level + 1}
+
+    if inner_iname is None:
+        inner_iname = var_name_gen(
+            "{sweep}__l{level}".format(**format_kwargs))
+    else:
+        var_name_gen.add_name(inner_iname)
+
+    if outer_iname is None:
+        outer_iname = var_name_gen(
+            "{sweep}__l{level}_outer".format(**format_kwargs))
+    else:
+        var_name_gen.add_name(outer_iname)
+
+    """
+    nonlocal_init_head_outer_iname = var_name_gen(
+        "{sweep}__l{level}_nlhead_outer".format(**format_kwargs))
+
+    nonlocal_init_head_inner_iname = var_name_gen(
+        "{sweep}__l{level}_nlhead_inner".format(**format_kwargs))
+    """
+
+    nonlocal_init_tail_outer_iname = var_name_gen(
+        "{sweep}__l{level}_nltail_outer".format(**format_kwargs))
+
+    nonlocal_init_tail_inner_iname = var_name_gen(
+        "{sweep}__l{level}_nltail_inner".format(**format_kwargs))
+
+    nonlocal_iname = var_name_gen(
+        "{sweep}__l{level}_nonloc".format(**format_kwargs))
+
+    inner_local_iname = var_name_gen(
+        "{sweep}__l{next_level}".format(**format_kwargs))
+
+    inner_scan_iname = var_name_gen(
+        "{iname}__l{next_level}".format(**format_kwargs))
+
+    outer_local_iname = var_name_gen(
+        "{sweep}__l{next_level}_outer".format(**format_kwargs))
+
+    outer_scan_iname = var_name_gen(
+        "{iname}__l{level}".format(**format_kwargs))
+
+    subst_name = var_name_gen(
+        "{insn}_inner_subst".format(**format_kwargs))
+
+    local_subst_name = var_name_gen(
+        "{insn}_local_subst".format(**format_kwargs))
+
+    if
local_storage_name is None: + local_storage_name = var_name_gen( + "{insn}__l{next_level}".format(**format_kwargs)) + else: + var_name_gen.add_name(local_storage_name) + + if nonlocal_storage_name is None: + nonlocal_storage_name = var_name_gen( + "{insn}__l{level}_outer".format(**format_kwargs)) + else: + var_name_gen.add_name(nonlocal_storage_name) + + local_scan_insn_id = insn_id_gen( + "{insn}__l{next_level}".format(**format_kwargs)) + + nonlocal_scan_insn_id = insn_id_gen( + "{insn}__l{level}".format(**format_kwargs)) + + format_kwargs.update({"nonlocal": nonlocal_storage_name}) + + nonlocal_init_head_insn_id = insn_id_gen( + "{nonlocal}_init_head".format(**format_kwargs)) + + nonlocal_init_tail_insn_id = insn_id_gen( + "{nonlocal}_init_tail".format(**format_kwargs)) + + # }}} + + # {{{ utils + + if local_storage_axes is None: + local_storage_axes = (outer_iname, inner_iname) + + def pick_out_relevant_axes(full_indices, strip_scalar=False): + assert len(full_indices) == 2 + iname_to_index = dict(zip((outer_iname, inner_iname), full_indices)) + + result = [] + for iname in local_storage_axes: + result.append(iname_to_index[iname]) + + assert len(result) > 0 + + return (tuple(result) + if not (strip_scalar and len(result) == 1) + else result[0]) + + # }}} + + # {{{ prepare for two level scan + + # Turn the scan into a substitution rule, replace the original scan with a + # nop and delete the scan iname. + # + # (The presence of the scan iname seems to be making precompute very confused.) 
+ + from loopy.transform.data import reduction_arg_to_subst_rule + kernel = reduction_arg_to_subst_rule( + kernel, scan_iname, subst_rule_name=subst_name) + + kernel = _update_instructions(kernel, {insn_id: insn.copy(expression=0)}) + + """ + from loopy.kernel.instruction import NoOpInstruction + {insn_id: NoOpInstruction( + id=insn_id, + depends_on=insn.depends_on, + groups=insn.groups, + conflicts_with_groups=insn.groups, + no_sync_with=insn.no_sync_with, + within_inames_is_final=insn.within_inames_is_final, + within_inames=insn.within_inames, + priority=insn.priority, + boostable=insn.boostable, + boostable_into=insn.boostable_into, + predicates=insn.predicates, + tags=insn.tags)}, + copy=False) + """ + + kernel = lp.remove_unused_inames(kernel, inames=(scan_iname,)) + + # Make sure we got rid of everything + assert scan_iname not in kernel.all_inames() + + # }}} + + # {{{ implement local scan + + from pymbolic import var + + # FIXME: This can probably be done using split_reduction_inward() + # and will end up looking as less of a mess that way. 
+ + local_scan_expr = _expand_subst_within_expression(kernel, + var(subst_name)(var(outer_iname) * inner_length + + var(inner_scan_iname))) + + kernel = lp.split_iname(kernel, sweep_iname, inner_length, + inner_iname=inner_iname, outer_iname=outer_iname, + inner_tag=inner_tag, outer_tag=outer_tag) + + from loopy.kernel.data import SubstitutionRule + from loopy.symbolic import Reduction + + local_subst = SubstitutionRule( + name=local_subst_name, + arguments=(outer_iname, inner_iname), + expression=Reduction( + scan.operation, (inner_scan_iname,), local_scan_expr)) + + substitutions = kernel.substitutions.copy() + substitutions[local_subst_name] = local_subst + + kernel = kernel.copy(substitutions=substitutions) + + outer_local_iname = outer_iname + + all_precompute_inames = (outer_local_iname, inner_local_iname) + + precompute_inames = pick_out_relevant_axes(all_precompute_inames) + sweep_inames = pick_out_relevant_axes((outer_iname, inner_iname)) + + storage_axis_to_tag = { + outer_iname: outer_local_tag, + inner_iname: inner_local_tag, + outer_local_iname: outer_local_tag, + inner_local_iname: inner_local_tag} + + precompute_outer_inames = ( + frozenset(all_precompute_inames) - frozenset(precompute_inames)) + + insn = kernel.id_to_insn[insn_id] + within_inames = insn.within_inames - frozenset([outer_iname, inner_iname]) + + from pymbolic import var + + kernel = lp.precompute(kernel, + [var(local_subst_name)( + var(outer_iname), var(inner_iname))], + sweep_inames=sweep_inames, + precompute_inames=precompute_inames, + storage_axes=local_storage_axes, + storage_axis_to_tag=storage_axis_to_tag, + precompute_outer_inames=precompute_outer_inames | within_inames, + temporary_name=local_storage_name, + compute_insn_id=local_scan_insn_id) + + compute_insn_with_deps = kernel.id_to_insn[local_scan_insn_id] + compute_insn_with_deps = compute_insn_with_deps.copy( + depends_on=compute_insn_with_deps.depends_on | insn.depends_on) + + kernel = _update_instructions(kernel, 
(compute_insn_with_deps,)) + + kernel = _add_scan_subdomain(kernel, inner_scan_iname, inner_local_iname) + + # }}} + + # {{{ implement local to nonlocal information transfer + + from loopy.isl_helpers import static_max_of_pw_aff + from loopy.symbolic import pw_aff_to_expr + + local_storage_local_axis_len = ( + kernel.temporary_variables[local_storage_name].shape[-1]) + + nonlocal_storage_len_pw_aff = static_max_of_pw_aff( + kernel.get_iname_bounds(outer_iname).size, + constants_only=False) + + # FIXME: this shouldn't have to have an extra element. + nonlocal_storage_len = pw_aff_to_expr(1 + nonlocal_storage_len_pw_aff) + + nonlocal_tail_inner_subd = _make_slab_set(nonlocal_init_tail_inner_iname, 1) + kernel = _add_subdomain_to_kernel(kernel, nonlocal_tail_inner_subd) + nonlocal_tail_outer_subd = _make_slab_set( + nonlocal_init_tail_outer_iname, nonlocal_storage_len_pw_aff) + kernel = _add_subdomain_to_kernel(kernel, nonlocal_tail_outer_subd) + + """ + nonlocal_head_inner_subd = _make_slab_set(nonlocal_init_head_inner_iname, 1) + kernel = _add_subdomain_to_kernel(kernel, nonlocal_head_inner_subd) + nonlocal_head_outer_subd = _make_slab_set(nonlocal_init_head_outer_iname, 1) + kernel = _add_subdomain_to_kernel(kernel, nonlocal_head_outer_subd) + """ + + kernel = lp.tag_inames(kernel, { + #nonlocal_init_head_outer_iname: outer_local_tag, + #nonlocal_init_head_inner_iname: inner_local_tag, + nonlocal_init_tail_outer_iname: outer_local_tag, + nonlocal_init_tail_inner_iname: inner_local_tag}) + + if nonlocal_storage_name not in kernel.temporary_variables: + + from loopy.kernel.data import TemporaryVariable + new_temporary_variables = kernel.temporary_variables.copy() + + new_temporary_variables[nonlocal_storage_name] = ( + TemporaryVariable( + nonlocal_storage_name, + shape=(nonlocal_storage_len,), + scope=nonlocal_storage_scope, + base_indices=lp.auto, + dtype=lp.auto)) + + kernel = kernel.copy(temporary_variables=new_temporary_variables) + + from 
loopy.kernel.instruction import make_assignment + + # FIXME: neutral element... + nonlocal_init_head = make_assignment( + id=nonlocal_init_head_insn_id, + assignees=(var(nonlocal_storage_name)[0],), + expression=0, + within_inames=( + within_inames | frozenset([nonlocal_init_tail_outer_iname, + nonlocal_init_tail_inner_iname])), + no_sync_with=frozenset([(nonlocal_init_tail_insn_id, "any")]), + predicates=(var(nonlocal_init_tail_inner_iname).eq(0), + var(nonlocal_init_tail_outer_iname).eq(0)), + depends_on=frozenset([local_scan_insn_id])) + + nonlocal_init_tail = make_assignment( + id=nonlocal_init_tail_insn_id, + assignees=( + var(nonlocal_storage_name)[ + var(nonlocal_init_tail_outer_iname) + 1],), + expression=var(local_storage_name)[ + pick_out_relevant_axes( + (var(nonlocal_init_tail_outer_iname), + var(nonlocal_init_tail_inner_iname) + + local_storage_local_axis_len - 1), + strip_scalar=True)], + no_sync_with=frozenset([(nonlocal_init_head_insn_id, "any")]), + within_inames=( + within_inames | frozenset([nonlocal_init_tail_outer_iname, + nonlocal_init_tail_inner_iname])), + depends_on=frozenset([local_scan_insn_id])) + + kernel = _update_instructions( + kernel, (nonlocal_init_head, nonlocal_init_tail), copy=False) + + # The write race warnings are spurious - the inner iname is length + # 1, so there's really no write race at all here. 
+ kernel = kernel.copy( + silenced_warnings=kernel.silenced_warnings + + ["write_race(%s)" % nonlocal_init_tail_insn_id] + + ["write_race(%s)" % nonlocal_init_head_insn_id]) + + # }}} + + # {{{ implement nonlocal scan + + subd = _make_slab_set(nonlocal_iname, nonlocal_storage_len_pw_aff) + kernel = _add_subdomain_to_kernel(kernel, subd) + + if nonlocal_tag is not None: + kernel = lp.tag_inames(kernel, {nonlocal_iname: nonlocal_tag}) + + kernel = _add_scan_subdomain(kernel, outer_scan_iname, nonlocal_iname) + + nonlocal_scan = make_assignment( + id=nonlocal_scan_insn_id, + assignees=(var(nonlocal_storage_name)[var(nonlocal_iname)],), + expression=Reduction( + scan.operation, + (outer_scan_iname,), + var(nonlocal_storage_name)[var(outer_scan_iname)]), + within_inames=within_inames | frozenset([nonlocal_iname]), + depends_on=( + frozenset([nonlocal_init_tail_insn_id, nonlocal_init_head_insn_id]))) + + kernel = _update_instructions(kernel, (nonlocal_scan,), copy=False) + + if nonlocal_storage_scope == lp.temp_var_scope.GLOBAL: + barrier_id = insn_id_gen( + "{insn}_nonlocal_init_barrier".format(**format_kwargs)) + kernel = _add_global_barrier(kernel, + source=(nonlocal_init_tail_insn_id, nonlocal_init_head_insn_id), + sink=nonlocal_scan_insn_id, + barrier_id=barrier_id) + + # }}} + + # {{{ replace scan with local + nonlocal + + updated_depends_on = (insn.depends_on + | frozenset([nonlocal_scan_insn_id, local_scan_insn_id])) + + if nonlocal_storage_scope == lp.temp_var_scope.GLOBAL: + barrier_id = insn_id_gen( + "{insn}_nonlocal_scan_barrier".format(**format_kwargs)) + kernel = (_add_global_barrier(kernel, + source=nonlocal_scan_insn_id, sink=insn_id, barrier_id=barrier_id)) + updated_depends_on |= frozenset([barrier_id]) + + nonlocal_part = var(nonlocal_storage_name)[var(outer_iname)] + + local_part = var(local_storage_name)[ + pick_out_relevant_axes( + (var(outer_iname), var(inner_iname)), strip_scalar=True)] + + updated_insn = insn.copy( + no_sync_with=( + 
insn.no_sync_with + | frozenset([(nonlocal_scan_insn_id, "any")])), + depends_on=updated_depends_on, + # XXX: scan binary op + expression=nonlocal_part + local_part) + + kernel = _update_instructions(kernel, (updated_insn,), copy=False) + + # }}} + + return kernel + + +# vim: foldmethod=marker diff --git a/loopy/transform/save.py b/loopy/transform/save.py index 8afc1695a38a37baf165f6ec6ef6567e2012173b..fa98f478d7e09bd77a3dafcd0327c5d6dce737d0 100644 --- a/loopy/transform/save.py +++ b/loopy/transform/save.py @@ -32,7 +32,7 @@ from loopy.schedule import ( EnterLoop, LeaveLoop, RunInstruction, CallKernel, ReturnFromKernel, Barrier) -from loopy.schedule.tools import (get_block_boundaries, InstructionQuery) +from loopy.schedule.tools import get_block_boundaries import logging @@ -193,13 +193,9 @@ class TemporarySaver(object): The name of the new temporary. - .. attribute:: orig_temporary + .. attribute:: orig_temporary_name - The original temporary variable object. - - .. attribute:: hw_inames - - The common list of hw axes that define the original object. + The name of original temporary variable object. .. attribute:: hw_dims @@ -207,6 +203,10 @@ class TemporarySaver(object): of the promoted temporary value, corresponding to hardware dimensions + .. attribute:: hw_tags + + The tags for the inames associated with hw_dims + .. 
attribute:: non_hw_dims A list of expressions, to be added in front of the shape @@ -214,9 +214,11 @@ class TemporarySaver(object): non-hardware dimensions """ - @memoize_method - def as_variable(self): - temporary = self.orig_temporary + __slots__ = ["name", "orig_temporary_name", "hw_dims", "hw_tags", + "non_hw_dims"] + + def as_kernel_temporary(self, kernel): + temporary = kernel.temporary_variables[self.orig_temporary_name] from loopy.kernel.data import TemporaryVariable return TemporaryVariable( name=self.name, @@ -230,16 +232,172 @@ class TemporarySaver(object): def __init__(self, kernel): self.kernel = kernel - self.insn_query = InstructionQuery(kernel) self.var_name_gen = kernel.get_var_name_generator() self.insn_name_gen = kernel.get_instruction_id_generator() + # These fields keep track of updates to the kernel. self.insns_to_insert = [] self.insns_to_update = {} self.extra_args_to_add = {} self.updated_iname_to_tag = {} self.updated_temporary_variables = {} - self.saves_or_reloads_added = {} + + # temporary name -> save or reload insn ids + from collections import defaultdict + self.temporary_to_save_ids = defaultdict(set) + self.temporary_to_reload_ids = defaultdict(set) + self.subkernel_to_newly_added_insn_ids = defaultdict(set) + + # Maps names of base_storage to the name of the temporary + # representative chosen for saves/reloads + self.base_storage_to_representative = {} + + from loopy.kernel.data import ValueArg + import islpy as isl + self.new_subdomain = ( + isl.BasicSet.universe( + isl.Space.create_from_names( + isl.DEFAULT_CONTEXT, + set=[], + params=set( + arg.name for arg in kernel.args + if isinstance(arg, ValueArg))))) + + @property + @memoize_method + def subkernel_to_slice_indices(self): + result = {} + + for sched_item_idx, sched_item in enumerate(self.kernel.schedule): + if isinstance(sched_item, CallKernel): + start_idx = sched_item_idx + elif isinstance(sched_item, ReturnFromKernel): + result[sched_item.kernel_name] = (start_idx, 1 + 
sched_item_idx) + + return result + + @property + @memoize_method + def subkernel_to_surrounding_inames(self): + current_outer_inames = set() + within_subkernel = False + result = {} + + for sched_item_idx, sched_item in enumerate(self.kernel.schedule): + if isinstance(sched_item, CallKernel): + within_subkernel = True + result[sched_item.kernel_name] = frozenset(current_outer_inames) + elif isinstance(sched_item, ReturnFromKernel): + within_subkernel = False + elif isinstance(sched_item, EnterLoop): + if not within_subkernel: + current_outer_inames.add(sched_item.iname) + elif isinstance(sched_item, LeaveLoop): + current_outer_inames.discard(sched_item.iname) + + return result + + @memoize_method + def get_enclosing_global_barrier_pair(self, subkernel): + subkernel_start, subkernel_end = ( + self.subkernel_to_slice_indices[subkernel]) + + def is_global_barrier(item): + return isinstance(item, Barrier) and item.kind == "global" + + try: + pre_barrier = next(item for item in + self.kernel.schedule[subkernel_start::-1] + if is_global_barrier(item)).originating_insn_id + except StopIteration: + pre_barrier = None + + try: + post_barrier = next(item for item in + self.kernel.schedule[subkernel_end:] + if is_global_barrier(item)).originating_insn_id + except StopIteration: + post_barrier = None + + return (pre_barrier, post_barrier) + + def get_hw_axis_sizes_and_tags_for_save_slot(self, temporary): + """ + This is used for determining the amount of global storage needed for saving + and restoring the temporary across kernel calls, due to hardware + parallel inames (the inferred axes get prefixed to the number of + dimensions in the temporary). + + In the case of local temporaries, inames that are tagged + hw-local do not contribute to the global storage shape. 
+ """ + accessor_insn_ids = frozenset( + self.kernel.reader_map()[temporary.name] + | self.kernel.writer_map()[temporary.name]) + + group_tags = None + local_tags = None + + def _sortedtags(tags): + return sorted(tags, key=lambda tag: tag.axis) + + for insn_id in accessor_insn_ids: + insn = self.kernel.id_to_insn[insn_id] + + my_group_tags = [] + my_local_tags = [] + + for iname in insn.within_inames: + tag = self.kernel.iname_to_tag.get(iname) + + if tag is None: + continue + + from loopy.kernel.data import ( + GroupIndexTag, LocalIndexTag, ParallelTag) + + if isinstance(tag, GroupIndexTag): + my_group_tags.append(tag) + elif isinstance(tag, LocalIndexTag): + my_local_tags.append(tag) + elif isinstance(tag, ParallelTag): + raise ValueError( + "iname '%s' is tagged with '%s' - only " + "group and local tags are supported for " + "auto save/reload of temporaries" % + (iname, tag)) + + if group_tags is None: + group_tags = _sortedtags(my_group_tags) + local_tags = _sortedtags(my_local_tags) + group_tags_originating_insn_id = insn_id + + if ( + group_tags != _sortedtags(my_group_tags) + or local_tags != _sortedtags(my_local_tags)): + raise ValueError( + "inconsistent parallel tags across instructions that access " + "'%s' (specifically, instruction '%s' has tags '%s' but " + "instruction '%s' has tags '%s')" + % (temporary.name, + group_tags_originating_insn_id, group_tags + local_tags, + insn_id, my_group_tags + my_local_tags)) + + if group_tags is None: + assert local_tags is None + return (), () + + group_sizes, local_sizes = ( + self.kernel.get_grid_sizes_for_insn_ids_as_exprs(accessor_insn_ids)) + + if temporary.scope == lp.temp_var_scope.LOCAL: + # Elide local axes in the save slot for local temporaries. + del local_tags[:] + local_sizes = () + + # We set hw_dims to be arranged according to the order: + # g.0 < g.1 < ... < l.0 < l.1 < ... 
+ return (group_sizes + local_sizes), tuple(group_tags + local_tags) @memoize_method def auto_promote_temporary(self, temporary_name): @@ -255,52 +413,16 @@ class TemporarySaver(object): assert temporary.read_only return None - if temporary.base_storage is not None: - raise ValueError( - "Cannot promote temporaries with base_storage to global") - - # `hw_inames`: The set of hw-parallel tagged inames that this temporary - # is associated with. This is used for determining the shape of the - # global storage needed for saving and restoring the temporary across - # kernel calls. - # - # TODO: Make a policy decision about which dimensions to use. Currently, - # the code looks at each instruction that defines or uses the temporary, - # and takes the common set of hw-parallel tagged inames associated with - # these instructions. - # - # Furthermore, in the case of local temporaries, inames that are tagged - # hw-local do not contribute to the global storage shape. - hw_inames = self.insn_query.common_hw_inames( - self.insn_query.insns_reading_or_writing(temporary.name)) - - # We want hw_inames to be arranged according to the order: - # g.0 < g.1 < ... < l.0 < l.1 < ... - # Sorting lexicographically accomplishes this. - hw_inames = sorted(hw_inames, - key=lambda iname: str(self.kernel.iname_to_tag[iname])) - - # Calculate the sizes of the dimensions that get added in front for - # the global storage of the temporary. - hw_dims = [] - - backing_hw_inames = [] - - for iname in hw_inames: - tag = self.kernel.iname_to_tag[iname] - from loopy.kernel.data import LocalIndexTag - is_local_iname = isinstance(tag, LocalIndexTag) - if is_local_iname and temporary.scope == temp_var_scope.LOCAL: - # Restrict shape to that of group inames for locals. 
- continue - backing_hw_inames.append(iname) - from loopy.isl_helpers import static_max_of_pw_aff - from loopy.symbolic import aff_to_expr - hw_dims.append( - aff_to_expr( - static_max_of_pw_aff( - self.kernel.get_iname_bounds(iname).size, False))) + base_storage_conflict = ( + self.base_storage_to_representative.get( + temporary.base_storage, temporary) is not temporary) + + if base_storage_conflict: + raise NotImplementedError( + "tried to save/reload multiple temporaries with the " + "same base_storage; this is currently not supported") + hw_dims, hw_tags = self.get_hw_axis_sizes_and_tags_for_save_slot(temporary) non_hw_dims = temporary.shape if len(non_hw_dims) == 0 and len(hw_dims) == 0: @@ -308,11 +430,15 @@ class TemporarySaver(object): non_hw_dims = (1,) backing_temporary = self.PromotedTemporary( - name=self.var_name_gen(temporary.name + "_save_slot"), - orig_temporary=temporary, - hw_dims=tuple(hw_dims), - non_hw_dims=non_hw_dims, - hw_inames=backing_hw_inames) + name=self.var_name_gen(temporary.name + "__save_slot"), + orig_temporary_name=temporary.name, + hw_dims=hw_dims, + hw_tags=hw_tags, + non_hw_dims=non_hw_dims) + + if temporary.base_storage is not None: + self.base_storage_to_representative[temporary.base_storage] = ( + backing_temporary) return backing_temporary @@ -326,23 +452,16 @@ class TemporarySaver(object): if promoted_temporary is None: return - from loopy.kernel.tools import DomainChanger - dchg = DomainChanger( - self.kernel, - frozenset( - self.insn_query.inames_in_subkernel(subkernel) | - set(promoted_temporary.hw_inames))) - - domain, hw_inames, dim_inames, iname_to_tag = \ + new_subdomain, hw_inames, dim_inames, iname_to_tag = ( self.augment_domain_for_save_or_reload( - dchg.domain, promoted_temporary, mode, subkernel) + self.new_subdomain, promoted_temporary, mode, subkernel)) - self.kernel = dchg.get_kernel_with(domain) + self.new_subdomain = new_subdomain save_or_load_insn_id = self.insn_name_gen( 
"{name}.{mode}".format(name=temporary, mode=mode)) - def subscript_or_var(agg, subscript=()): + def add_subscript_if_subscript_nonempty(agg, subscript=()): from pymbolic.primitives import Subscript, Variable if len(subscript) == 0: return Variable(agg) @@ -351,20 +470,24 @@ class TemporarySaver(object): Variable(agg), tuple(map(Variable, subscript))) - dim_inames_trunc = dim_inames[:len(promoted_temporary.orig_temporary.shape)] + orig_temporary = ( + self.kernel.temporary_variables[ + promoted_temporary.orig_temporary_name]) + dim_inames_trunc = dim_inames[:len(orig_temporary.shape)] args = ( - subscript_or_var( - temporary, dim_inames_trunc), - subscript_or_var( - promoted_temporary.name, hw_inames + dim_inames)) + add_subscript_if_subscript_nonempty( + temporary, subscript=dim_inames_trunc), + add_subscript_if_subscript_nonempty( + promoted_temporary.name, subscript=hw_inames + dim_inames)) if mode == "save": args = reversed(args) - accessing_insns_in_subkernel = ( - self.insn_query.insns_reading_or_writing(temporary) & - self.insn_query.insns_in_subkernel(subkernel)) + accessing_insns_in_subkernel = (frozenset( + self.kernel.reader_map()[temporary] + | self.kernel.writer_map()[temporary]) + & self.kernel.subkernel_to_insn_ids[subkernel]) if mode == "save": depends_on = accessing_insns_in_subkernel @@ -373,7 +496,7 @@ class TemporarySaver(object): depends_on = frozenset() update_deps = accessing_insns_in_subkernel - pre_barrier, post_barrier = self.insn_query.pre_and_post_barriers(subkernel) + pre_barrier, post_barrier = self.get_enclosing_global_barrier_pair(subkernel) if pre_barrier is not None: depends_on |= set([pre_barrier]) @@ -387,16 +510,19 @@ class TemporarySaver(object): *args, id=save_or_load_insn_id, within_inames=( - self.insn_query.inames_in_subkernel(subkernel) | - frozenset(hw_inames + dim_inames)), + self.subkernel_to_surrounding_inames[subkernel] + | frozenset(hw_inames + dim_inames)), within_inames_is_final=True, depends_on=depends_on, 
boostable=False, boostable_into=frozenset()) - if temporary not in self.saves_or_reloads_added: - self.saves_or_reloads_added[temporary] = set() - self.saves_or_reloads_added[temporary].add(save_or_load_insn_id) + if mode == "save": + self.temporary_to_save_ids[temporary].add(save_or_load_insn_id) + else: + self.temporary_to_reload_ids[temporary].add(save_or_load_insn_id) + + self.subkernel_to_newly_added_insn_ids[subkernel].add(save_or_load_insn_id) self.insns_to_insert.append(save_or_load_insn) @@ -405,8 +531,8 @@ class TemporarySaver(object): self.insns_to_update[insn_id] = insn.copy( depends_on=insn.depends_on | frozenset([save_or_load_insn_id])) - self.updated_temporary_variables[promoted_temporary.name] = \ - promoted_temporary.as_variable() + self.updated_temporary_variables[promoted_temporary.name] = ( + promoted_temporary.as_kernel_temporary(self.kernel)) self.updated_iname_to_tag.update(iname_to_tag) @@ -416,15 +542,6 @@ class TemporarySaver(object): insns_to_insert = dict((insn.id, insn) for insn in self.insns_to_insert) - # Add global no_sync_with between any added reloads and saves - from six import iteritems - for temporary, added_insns in iteritems(self.saves_or_reloads_added): - for insn_id in added_insns: - insn = insns_to_insert[insn_id] - insns_to_insert[insn_id] = insn.copy( - no_sync_with=frozenset( - (added_insn, "global") for added_insn in added_insns)) - for orig_insn in self.kernel.instructions: if orig_insn.id in self.insns_to_update: new_instructions.append(self.insns_to_update[orig_insn.id]) @@ -436,12 +553,30 @@ class TemporarySaver(object): self.updated_iname_to_tag.update(self.kernel.iname_to_tag) self.updated_temporary_variables.update(self.kernel.temporary_variables) + new_domains = list(self.kernel.domains) + import islpy as isl + if self.new_subdomain.dim(isl.dim_type.set) > 0: + new_domains.append(self.new_subdomain) + kernel = self.kernel.copy( + domains=new_domains, instructions=new_instructions, 
iname_to_tag=self.updated_iname_to_tag, temporary_variables=self.updated_temporary_variables, overridden_get_grid_sizes_for_insn_ids=None) + # Add nosync directives to any saves or reloads that were added with a + # potential dependency chain. + for subkernel in self.kernel.subkernels: + relevant_insns = self.subkernel_to_newly_added_insn_ids[subkernel] + + from itertools import product + for temporary in self.temporary_to_reload_ids: + for source, sink in product( + relevant_insns & self.temporary_to_reload_ids[temporary], + relevant_insns & self.temporary_to_save_ids[temporary]): + kernel = lp.add_nosync(kernel, "global", source, sink) + from loopy.kernel.tools import assign_automatic_axes return assign_automatic_axes(kernel) @@ -456,22 +591,28 @@ class TemporarySaver(object): """ Add new axes to the domain corresponding to the dimensions of `promoted_temporary`. These axes will be used in the save/ - reload stage. + reload stage. These get prefixed onto the already existing axes. """ assert mode in ("save", "reload") import islpy as isl - orig_temporary = promoted_temporary.orig_temporary + orig_temporary = ( + self.kernel.temporary_variables[ + promoted_temporary.orig_temporary_name]) orig_dim = domain.dim(isl.dim_type.set) # Tags for newly added inames iname_to_tag = {} + from loopy.symbolic import aff_from_expr + # FIXME: Restrict size of new inames to access footprint. # Add dimension-dependent inames. dim_inames = [] - domain = domain.add(isl.dim_type.set, len(promoted_temporary.non_hw_dims)) + domain = domain.add(isl.dim_type.set, + len(promoted_temporary.non_hw_dims) + + len(promoted_temporary.hw_dims)) for dim_idx, dim_size in enumerate(promoted_temporary.non_hw_dims): new_iname = self.insn_name_gen("{name}_{mode}_axis_{dim}_{sk}". @@ -493,25 +634,31 @@ class TemporarySaver(object): # Add size information. 
aff = isl.affs_from_space(domain.space) domain &= aff[0].le_set(aff[new_iname]) - from loopy.symbolic import aff_from_expr domain &= aff[new_iname].lt_set(aff_from_expr(domain.space, dim_size)) - # FIXME: Use promoted_temporary.hw_inames - hw_inames = [] + dim_offset = orig_dim + len(promoted_temporary.non_hw_dims) - # Add hardware inames duplicates. - for t_idx, hw_iname in enumerate(promoted_temporary.hw_inames): + hw_inames = [] + # Add hardware dims. + for hw_iname_idx, (hw_tag, dim) in enumerate( + zip(promoted_temporary.hw_tags, promoted_temporary.hw_dims)): new_iname = self.insn_name_gen("{name}_{mode}_hw_dim_{dim}_{sk}". format(name=orig_temporary.name, mode=mode, - dim=t_idx, + dim=hw_iname_idx, sk=subkernel)) - hw_inames.append(new_iname) - iname_to_tag[new_iname] = self.kernel.iname_to_tag[hw_iname] + domain = domain.set_dim_name( + isl.dim_type.set, dim_offset + hw_iname_idx, new_iname) - from loopy.isl_helpers import duplicate_axes - domain = duplicate_axes( - domain, promoted_temporary.hw_inames, hw_inames) + aff = isl.affs_from_space(domain.space) + domain = (domain + & + aff[0].le_set(aff[new_iname]) + & + aff[new_iname].lt_set(aff_from_expr(domain.space, dim))) + + self.updated_iname_to_tag[new_iname] = hw_tag + hw_inames.append(new_iname) # The operations on the domain above return a Set object, but the # underlying domain should be expressible as a single BasicSet. 
@@ -551,7 +698,8 @@ def save_and_reload_temporaries(knl): liveness = LivenessAnalysis(knl) saver = TemporarySaver(knl) - insn_query = InstructionQuery(knl) + from loopy.schedule.tools import ( + temporaries_read_in_subkernel, temporaries_written_in_subkernel) for sched_idx, sched_item in enumerate(knl.schedule): @@ -562,9 +710,10 @@ def save_and_reload_temporaries(knl): # Kernel entry: nothing live interesting_temporaries = set() else: + subkernel = sched_item.kernel_name interesting_temporaries = ( - insn_query.temporaries_read_or_written_in_subkernel( - sched_item.kernel_name)) + temporaries_read_in_subkernel(knl, subkernel) + | temporaries_written_in_subkernel(knl, subkernel)) for temporary in liveness[sched_idx].live_out & interesting_temporaries: logger.info("reloading {0} at entry of {1}" @@ -576,9 +725,9 @@ def save_and_reload_temporaries(knl): # Kernel exit: nothing live interesting_temporaries = set() else: + subkernel = sched_item.kernel_name interesting_temporaries = ( - insn_query.temporaries_written_in_subkernel( - sched_item.kernel_name)) + temporaries_written_in_subkernel(knl, subkernel)) for temporary in liveness[sched_idx].live_in & interesting_temporaries: logger.info("saving {0} before return of {1}" diff --git a/loopy/type_inference.py b/loopy/type_inference.py index 4c1e423e93e104fecd0b49a2b1ef2b4a261e38e7..da7a5b52d227a1b9262ac5755d38c6e66c42dfe8 100644 --- a/loopy/type_inference.py +++ b/loopy/type_inference.py @@ -352,28 +352,30 @@ class TypeInferenceMapper(CombineMapper): return [self.kernel.index_dtype] def map_reduction(self, expr, return_tuple=False): - rec_result = self.rec(expr.expr) - - if rec_result: - rec_result, = rec_result - result = expr.operation.result_dtypes( - self.kernel, rec_result, expr.inames) - else: - result = expr.operation.result_dtypes( - self.kernel, None, expr.inames) + """ + :arg return_tuple: If *True*, treat the type of the reduction expression + as a tuple type. 
Otherwise, the number of expressions being reduced over + must equal 1, and the type of the first expression is returned. + """ + rec_results = tuple(self.rec(sub_expr) for sub_expr in expr.exprs) - if result is None: + if any(len(rec_result) == 0 for rec_result in rec_results): return [] if return_tuple: - return [result] + from itertools import product + return list( + expr.operation.result_dtypes(self.kernel, *product_element) + for product_element in product(*rec_results)) else: - if len(result) != 1 and not return_tuple: + if len(rec_results) != 1: raise LoopyError("reductions with more or fewer than one " "return value may only be used in direct assignments") - return [result[0]] + return list( + expr.operation.result_dtypes(self.kernel, rec_result)[0] + for rec_result in rec_results[0]) # }}} diff --git a/loopy/version.py b/loopy/version.py index 5c6ad47f8571ceb9100f4f7f8dece9d80a35d10c..77d0e21bdd2ef5383c5f874656c25fe1ede21a70 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v59-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v60-islpy%s" % _islpy_version diff --git a/test/test_loopy.py b/test/test_loopy.py index 851a7f0762fcec3ccbb55399e183f5fb51322ac1..19a00108449b3be6905ce10905a7f313a3194b7d 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1987,19 +1987,28 @@ def test_integer_reduction(ctx_factory): dtype=to_loopy_type(vtype), shape=lp.auto) - reductions = [('max', lambda x: x == np.max(var_int)), - ('min', lambda x: x == np.min(var_int)), - ('sum', lambda x: x == np.sum(var_int)), - ('product', lambda x: x == np.prod(var_int)), - ('argmax', lambda x: (x[0] == np.max(var_int) and - var_int[out[1]] == np.max(var_int))), - ('argmin', lambda x: (x[0] == np.min(var_int) and - var_int[out[1]] == np.min(var_int)))] - - for reduction, function in reductions: + from collections import namedtuple + ReductionTest = 
namedtuple('ReductionTest', 'kind, check, args') + + reductions = [ + ReductionTest('max', lambda x: x == np.max(var_int), args='var[k]'), + ReductionTest('min', lambda x: x == np.min(var_int), args='var[k]'), + ReductionTest('sum', lambda x: x == np.sum(var_int), args='var[k]'), + ReductionTest('product', lambda x: x == np.prod(var_int), args='var[k]'), + ReductionTest('argmax', + lambda x: ( + x[0] == np.max(var_int) and var_int[out[1]] == np.max(var_int)), + args='var[k], k'), + ReductionTest('argmin', + lambda x: ( + x[0] == np.min(var_int) and var_int[out[1]] == np.min(var_int)), + args='var[k], k') + ] + + for reduction, function, args in reductions: kstr = ("out" if 'arg' not in reduction else "out[0], out[1]") - kstr += ' = {0}(k, var[k])'.format(reduction) + kstr += ' = {0}(k, {1})'.format(reduction, args) knl = lp.make_kernel('{[k]: 0<=k z[i] = z[i+1] + z[i] {id=wr_z,dep=top} + <> v[i] = 11 {id=wr_v,dep=top} + ... gbarrier {dep=wr_z:wr_v,id=yoink} + z[i] = z[i] - z[i+1] + v[i] {id=iupd, dep=yoink} + end + ... nop {id=nop} + ... gbarrier {dep=iupd,id=postloop} + z[i] = z[i] - z[i+1] + v[i] {id=zzzv,dep=postloop} + end + """) + + assert knl.global_barrier_order == ("top", "yoink", "postloop") + + for insn, barrier in ( + ("nop", None), + ("top", None), + ("wr_z", "top"), + ("wr_v", "top"), + ("yoink", "top"), + ("postloop", "yoink"), + ("zzzv", "postloop")): + assert knl.find_most_recent_global_barrier(insn) == barrier + + +def test_global_barrier_error_if_unordered(): + # FIXME: Should be illegal to declare this + knl = lp.make_kernel("{[i]: 0 <= i < 10}", + """ + ... gbarrier + ... gbarrier + """) + + from loopy.diagnostic import LoopyError + with pytest.raises(LoopyError): + knl.global_barrier_order + + +def test_multi_base_storage_save_and_reload_not_supported(): + # FIXME: This ought to work, change the test when it does. + knl = lp.make_kernel("{[i]: 0<=i<10}", + """ + <>a[0] = 1 + <>b[0] = 2 + ... 
gbarrier + out = a[0] + b[0] + """, + seq_dependencies=True) + + knl = lp.alias_temporaries(knl, ("a", "b"), synchronize_for_exclusive_use=False) + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + + with pytest.raises(NotImplementedError): + lp.save_and_reload_temporaries(knl) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) diff --git a/test/test_reduction.py b/test/test_reduction.py index 5887df7a628c46fbf09539fdd48c08aaacd8e409..96d85beb6831bca7369223decfc6ff1863976d35 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -240,12 +240,38 @@ def test_global_parallel_reduction(ctx_factory, size): knl = lp.add_dependency( knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier") - lp.auto_test_vs_ref( ref_knl, ctx, knl, parameters={"n": size}, print_ref_code=True) +def test_global_parallel_reduction_2(): + knl = lp.make_kernel( + "{[i]: 0 <= i < n }", + """ + # Using z[0] instead of z works around a bug in ancient PyOpenCL. + z[0] = sum(i, i/13) {id=reduce} + """) + + gsize = 128 + knl = lp.make_two_level_reduction(knl, + "reduce", + inner_length=gsize * 20, + nonlocal_tag="g.0", + nonlocal_storage_scope=lp.temp_var_scope.GLOBAL, + outer_tag=None, + inner_tag=None) + + print(knl) + + knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0") + knl = lp.split_reduction_inward(knl, "i_inner_inner") + + knl = lp.realize_reduction(knl) + + print(knl) + + @pytest.mark.parametrize("size", [1000]) def test_global_mc_parallel_reduction(ctx_factory, size): ctx = ctx_factory() @@ -297,7 +323,7 @@ def test_argmax(ctx_factory): knl = lp.make_kernel( "{[i]: 0<=i<%d}" % n, """ - max_val, max_idx = argmax(i, fabs(a[i])) + max_val, max_idx = argmax(i, fabs(a[i]), i) """) knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) @@ -397,7 +423,7 @@ def test_parallel_multi_output_reduction(): knl = lp.make_kernel( "{[i]: 0<=i<128}", """ - max_val, max_indices = argmax(i, fabs(a[i])) + max_val, max_indices = argmax(i, 
fabs(a[i]), i) """) knl = lp.tag_inames(knl, dict(i="l.0")) knl = lp.realize_reduction(knl) diff --git a/test/test_scan.py b/test/test_scan.py new file mode 100644 index 0000000000000000000000000000000000000000..5c84d6e4d62f18693b1352d47bf78b0a01bb6508 --- /dev/null +++ b/test/test_scan.py @@ -0,0 +1,550 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = """ +Copyright (C) 2012 Andreas Kloeckner +Copyright (C) 2016, 2017 Matt Wala +""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import sys +import numpy as np +import loopy as lp +import pyopencl as cl +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa +import pytest +from pytools import memoize + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +__all__ = [ + "pytest_generate_tests", + "cl" # 'cl.create_some_context' + ] + + +# More things to test. +# - scan(a) + scan(b) +# - test for badly tagged inames +# - global parallel scan + +# TO DO: +# segmented(...) syntax + + +@pytest.mark.parametrize("n", [1, 2, 3, 16]) +@pytest.mark.parametrize("stride", [1, 2]) +def test_sequential_scan(ctx_factory, n, stride): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "[n] -> {[i,j]: 0<=i " + "{[i,j]: sweep_lbound<=i {[i]: 0<=i {[i,j,k]: 0<=k {[i]: 0<=i {[i]: 0 <= i < n}", + "[i] -> {[j]: 0 <= j <= i}", + "[i] -> {[k]: 0 <= k <= i}" + ], + """ + <>tmp[i] = sum(k, 1) + out[i] = sum(j, tmp[j]) + """) + + knl = lp.fix_parameters(knl, n=10) + knl = lp.tag_inames(knl, dict(i=i_tag, j=j_tag)) + + knl = lp.realize_reduction(knl, force_scan=True) + + print(knl) + + evt, (out,) = knl(queue) + + print(out) + + +def test_scan_not_triangular(): + knl = lp.make_kernel( + "{[i,j]: 0<=i<100 and 1<=j<=2*i}", + """ + a[i] = sum(j, j**2) + """ + ) + + with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError): + knl = lp.realize_reduction(knl, force_scan=True) + + +@pytest.mark.parametrize("n", [1, 2, 3, 16, 17]) +def test_local_parallel_scan(ctx_factory, n): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "[n] -> {[i,j]: 0<=i {[i,j]: 1<=i {[i,j]: 0<=i_ = segmented_sum(j, arr[j], segflag[j])", + [ + lp.GlobalArg("arr", np.float32, shape=("n",)), + lp.GlobalArg("segflag", np.int32, shape=("n",)), + "..." 
+ ]) + + knl = lp.fix_parameters(knl, n=n) + knl = lp.tag_inames(knl, dict(i=iname_tag)) + knl = lp.realize_reduction(knl, force_scan=True) + + (evt, (out,)) = knl(queue, arr=arr, segflag=segment_boundaries) + + class SegmentGrouper(object): + + def __init__(self): + self.seg_idx = 0 + self.idx = 0 + + def __call__(self, key): + if self.idx in segment_boundaries_indices: + self.seg_idx += 1 + self.idx += 1 + return self.seg_idx + + from itertools import groupby + + expected = [np.cumsum(list(group)) + for _, group in groupby(arr, SegmentGrouper())] + actual = [np.array(list(group)) + for _, group in groupby(out, SegmentGrouper())] + + assert len(expected) == len(actual) == len(segment_boundaries_indices) + assert [(e == a).all() for e, a in zip(expected, actual)] + + +# {{{ two and three level scan getters + +@memoize +def _get_two_level_scan_kernel(g_size): + knl = lp.make_kernel( + [ + "[n] -> {[i,j]: 0 <= i < n and 0 <= j <= i}", + ], + """ + out[i] = sum(j, a[j]) {id=insn} + """, + "...", + assumptions="n > 0") + + from loopy.transform.reduction import make_two_level_scan + knl = make_two_level_scan( + knl, "insn", inner_length=g_size, + scan_iname="j", + sweep_iname="i", + local_storage_axes=(("i__l0",)), + inner_iname="i__l0", + inner_tag="l.0", + outer_tag="g.0", + local_storage_scope=lp.temp_var_scope.LOCAL, + nonlocal_storage_scope=lp.temp_var_scope.GLOBAL, + inner_local_tag="l.0", + outer_local_tag="g.0") + + knl = lp.realize_reduction(knl, force_scan=True) + + knl = lp.add_nosync( + knl, + scope="global", + source="writes:acc_j__l0", + sink="reads:acc_j__l0") + + from loopy.transform.save import save_and_reload_temporaries + + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + knl = save_and_reload_temporaries(knl) + knl = lp.get_one_scheduled_kernel(knl) + + return knl + + +@memoize +def _get_three_level_scan_kernel(g_size, p_size): + knl = lp.make_kernel( + [ + "[n] -> {[i,j]: 0 <= i < n and 0 <= j <= i}", + ], + """ + out[i] = 
sum(j, a[j]) {id=insn} + """, + "...", + assumptions="n > 0") + + from loopy.transform.reduction import make_two_level_scan + knl = make_two_level_scan( + knl, "insn", inner_length=g_size, + scan_iname="j", + sweep_iname="i", + local_storage_axes=(("i__l0",)), + inner_iname="i__l0", + inner_tag=None, + outer_tag="g.0", + local_storage_scope=lp.temp_var_scope.LOCAL, + nonlocal_storage_scope=lp.temp_var_scope.GLOBAL, + inner_local_tag=None, + outer_local_tag="g.0") + + knl = make_two_level_scan( + knl, "insn__l1", inner_length=p_size, + scan_iname="j__l1", + sweep_iname="i__l1", + inner_tag="for", + outer_tag="l.0", + nonlocal_tag="l.0", + local_storage_scope=lp.temp_var_scope.LOCAL, + nonlocal_storage_scope=lp.temp_var_scope.LOCAL, + inner_local_tag="for", + outer_local_tag="l.0") + + knl = lp.tag_inames(knl, dict(i__l0="l.0", + i__l0_nltail_inner="l.0")) + + knl = lp.realize_reduction(knl, force_scan=True) + + knl = lp.add_nosync( + knl, + scope="global", + source="writes:acc_j__l0", + sink="reads:acc_j__l0") + + knl = lp.alias_temporaries(knl, + ("insn__l1", "insn__l2"), + synchronize_for_exclusive_use=False) + + from loopy.transform.save import save_and_reload_temporaries + + knl = lp.preprocess_kernel(knl) + knl = lp.get_one_scheduled_kernel(knl) + knl = save_and_reload_temporaries(knl) + knl = lp.get_one_scheduled_kernel(knl) + + return knl + +# }}} + + +# TODO: Test everything from the matrix +# (l.0, seq) x (l.0, seq) +@pytest.mark.parametrize("input_len", + (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 15, 16, 17, 32)) +@pytest.mark.parametrize("g_size", (16,)) +def test_two_level_scan(ctx_getter, input_len, g_size): + knl = _get_two_level_scan_kernel(g_size) + + import numpy as np + np.random.seed(0) + a = np.random.randint(low=0, high=100, size=input_len) + + c = ctx_getter() + q = cl.CommandQueue(c) + + _, (out,) = knl(q, a=a) + + assert (out == np.cumsum(a)).all() + + +@pytest.mark.parametrize("input_len", + (1, 2, 3, 4, 5, 6, 7, 8, 10, 11, 12, 13, 14, 
15, 16, 17, 32)) +@pytest.mark.parametrize("g_size", (16,)) +@pytest.mark.parametrize("p_size", (4,)) +def test_three_level_scan(ctx_getter, g_size, p_size, input_len): + knl = _get_three_level_scan_kernel(g_size, p_size) + + import numpy as np + np.random.seed(0) + a = np.random.randint(low=0, high=100, size=input_len) + + c = ctx_getter() + q = cl.CommandQueue(c) + + _, (out,) = knl(q, a=a) + + assert (out == np.cumsum(a)).all() + + +def test_scan_extra_constraints_on_domain(): + knl = lp.make_kernel( + "{[i,j,k]: 0<=i 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_transform.py b/test/test_transform.py index ac5a26f6a5683bf9e86055a2729ea5ee995dee1e..b5fcdf04c4781c5f370c911ceb7efcb4042f6b4e 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -402,6 +402,42 @@ def test_precompute_with_preexisting_inames_fail(): precompute_inames="ii,jj") +def test_add_nosync(): + orig_knl = lp.make_kernel("{[i]: 0<=i<10}", + """ + <>tmp[i] = 10 {id=insn1} + <>tmp2[i] = 10 {id=insn2} + + <>tmp3[2*i] = 0 {id=insn3} + <>tmp4 = 1 + tmp3[2*i] {id=insn4} + + <>tmp5[i] = 0 {id=insn5,groups=g1} + tmp5[i] = 1 {id=insn6,conflicts=g1} + """) + + orig_knl = lp.set_temporary_scope(orig_knl, "tmp3", "local") + orig_knl = lp.set_temporary_scope(orig_knl, "tmp5", "local") + + # No dependency present - don't add nosync + knl = lp.add_nosync(orig_knl, "any", "writes:tmp", "writes:tmp2") + assert frozenset() == knl.id_to_insn["insn2"].no_sync_with + + # Dependency present + knl = lp.add_nosync(orig_knl, "local", "writes:tmp3", "reads:tmp3") + assert frozenset() == knl.id_to_insn["insn3"].no_sync_with + assert frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + + # Bidirectional + knl = lp.add_nosync( + orig_knl, "local", "writes:tmp3", "reads:tmp3", bidirectional=True) + assert frozenset([("insn4", "local")]) == knl.id_to_insn["insn3"].no_sync_with + assert 
frozenset([("insn3", "local")]) == knl.id_to_insn["insn4"].no_sync_with + + # Groups + knl = lp.add_nosync(orig_knl, "local", "insn5", "insn6") + assert frozenset([("insn5", "local")]) == knl.id_to_insn["insn6"].no_sync_with + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1])