diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 5eaa12b8124f86cfaf08cf2e83c3382861d9e0f2..92ec799f7045cf63dc75d1386d8a51fd7d42954c 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1120,7 +1120,7 @@ all work items reach the same barrier, the kernel will hang during execution.
 
 By default, :mod:`loopy` inserts local barriers between two instructions when it
 detects that a dependency involving local memory may occur across work items. To
-see this in action, take a look at the section on :ref:`local_temporaries`. 
+see this in action, take a look at the section on :ref:`local_temporaries`.
 
 In contrast, :mod:`loopy` will *not* insert global barriers automatically.
 Global barriers require manual intervention along with some special
@@ -1308,7 +1308,7 @@ tagged, as in the following example::
         assumptions="n>0")
 
 .. [#global-barrier-note] In particular, this is *not* the same as a call to
-   ``barrier(CLK_GLOBAL_MEM_FENCE)``. 
+   ``barrier(CLK_GLOBAL_MEM_FENCE)``.
 
 .. }}}
 
@@ -1533,12 +1533,12 @@ information provided. Now we will count the operations:
 
     >>> op_map = lp.get_op_map(knl)
    >>> print(lp.stringify_stats_mapping(op_map))
-    Op(np:dtype('float32'), add) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float32'), div) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float32'), mul) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float64'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float64'), mul) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('int32'), add) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float32'), add) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float32'), div) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float32'), mul) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float64'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float64'), mul) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('int32'), add) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
 
 :func:`loopy.get_op_map` returns a :class:`loopy.ToCountMap` of **{**
@@ -1596,9 +1596,9 @@ together into keys containing only the specified fields:
 
     >>> op_map_dtype = op_map.group_by('dtype')
     >>> print(lp.stringify_stats_mapping(op_map_dtype))
-    Op(np:dtype('float32'), None) : [n, m, l] -> { 3 * n * m * l : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('float64'), None) : [n, m, l] -> { 2 * n * m : n > 0 and m > 0 and l > 0 }
-    Op(np:dtype('int32'), None) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    Op(np:dtype('float32'), None) : [m, l, n] -> { 3 * m * l * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('float64'), None) : [m, l, n] -> { 2 * m * n : m > 0 and l > 0 and n > 0 }
+    Op(np:dtype('int32'), None) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
 
     >>> f32op_count = op_map_dtype[lp.Op(dtype=np.float32)
     ...                            ].eval_with_dict(param_dict)
@@ -1619,12 +1619,12 @@ we'll continue using the kernel from the previous example:
 
     >>> mem_map = lp.get_mem_access_map(knl)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 2 * n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { m * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
 
 :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{**
@@ -1674,18 +1674,18 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
 
     >>> bytes_map = mem_map.to_bytes()
     >>> print(lp.stringify_stats_mapping(bytes_map))
-    MemAccess(global, np:dtype('float32'), 0, load, a) : [n, m, l] -> { 8 * n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 0, load, b) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float32'), 0, store, c) : [n, m, l] -> { 4 * n * m * l : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, g) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, load, h) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
-    MemAccess(global, np:dtype('float64'), 0, store, e) : [n, m, l] -> { 8 * n * m : n > 0 and m > 0 and l > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, a) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, load, b) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), 0, store, c) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, g) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, load, h) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), 0, store, e) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
 
     >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
     ...                                          ).group_by('direction')
     >>> print(lp.stringify_stats_mapping(global_ld_st_bytes))
-    MemAccess(None, None, None, load, None) : [n, m, l] -> { (16 * n * m + 12 * n * m * l) : n > 0 and m > 0 and l > 0 }
-    MemAccess(None, None, None, store, None) : [n, m, l] -> { (8 * n * m + 4 * n * m * l) : n > 0 and m > 0 and l > 0 }
+    MemAccess(None, None, None, load, None) : [m, l, n] -> { (16 * m + 12 * m * l) * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(None, None, None, store, None) : [m, l, n] -> { (8 * m + 4 * m * l) * n : m > 0 and l > 0 and n > 0 }
     <BLANKLINE>
 
     >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load')
     ...                             ].eval_with_dict(param_dict)
@@ -1712,12 +1712,12 @@ resulting :class:`islpy.PwQPolynomial` will be more complicated this time.
 
     ...                               outer_tag="l.1", inner_tag="l.0")
     >>> mem_map = lp.get_mem_access_map(knl_consec)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 1, load, a) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float32'), 1, load, b) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float32'), 1, store, c) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 1, load, g) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 1, load, h) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 1, store, e) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, load, a) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, load, b) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float32'), 1, store, c) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, load, g) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, load, h) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 1, store, e) : [m, l, n] -> { ... }
     <BLANKLINE>
 
 With this parallelization, consecutive threads will access consecutive array
@@ -1753,12 +1753,12 @@ switch the inner and outer tags in our parallelization of the kernel:
 
     ...                               outer_tag="l.0", inner_tag="l.1")
     >>> mem_map = lp.get_mem_access_map(knl_nonconsec)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), 128, load, a) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float32'), 128, load, b) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float32'), 128, store, c) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 128, load, g) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 128, load, h) : [n, m, l] -> { ... }
-    MemAccess(global, np:dtype('float64'), 128, store, e) : [n, m, l] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, load, a) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, load, b) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float32'), 128, store, c) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, load, g) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, load, h) : [m, l, n] -> { ... }
+    MemAccess(global, np:dtype('float64'), 128, store, e) : [m, l, n] -> { ... }
     <BLANKLINE>
 
 With this parallelization, consecutive threads will access *nonconsecutive*
diff --git a/loopy/auto_test.py b/loopy/auto_test.py
index 6a4d559758bc1d7ca52e9dc4da1b7e503e22cc29..56ed87176f891d362ac0555024ef0d8098cd843e 100644
--- a/loopy/auto_test.py
+++ b/loopy/auto_test.py
@@ -518,9 +518,13 @@ def auto_test_vs_ref(
         args = None
 
     from loopy.kernel import kernel_state
+    from loopy.target.pyopencl import PyOpenCLTarget
     if test_knl.state not in [
             kernel_state.PREPROCESSED,
             kernel_state.SCHEDULED]:
+        if isinstance(test_knl.target, PyOpenCLTarget):
+            test_knl = test_knl.copy(target=PyOpenCLTarget(ctx.devices[0]))
+
         test_knl = lp.preprocess_kernel(test_knl)
         if not test_knl.schedule:
diff --git a/loopy/check.py b/loopy/check.py
index 54ab043d6a38f36852920eb3008d26e28b5cedfb..e72f9e3e6c4db797220729a5f282d4944b31d6ac 100644
--- a/loopy/check.py
+++ b/loopy/check.py
@@ -354,7 +354,7 @@ def check_has_schedulable_iname_nesting(kernel):
 
 def pre_schedule_checks(kernel):
     try:
-        logger.info("%s: pre-schedule check: start" % kernel.name)
+        logger.debug("%s: pre-schedule check: start" % kernel.name)
 
         check_for_orphaned_user_hardware_axes(kernel)
         check_for_double_use_of_hw_axes(kernel)
@@ -367,7 +367,7 @@ def pre_schedule_checks(kernel):
         check_write_destinations(kernel)
         check_has_schedulable_iname_nesting(kernel)
 
-        logger.info("%s: pre-schedule check: done" % kernel.name)
+        logger.debug("%s: pre-schedule check: done" % kernel.name)
     except KeyboardInterrupt:
         raise
     except:
@@ -618,7 +618,7 @@ def check_that_shapes_and_strides_are_arguments(kernel):
 
 def pre_codegen_checks(kernel):
     try:
-        logger.info("pre-codegen check %s: start" % kernel.name)
+        logger.debug("pre-codegen check %s: start" % kernel.name)
 
         check_for_unused_hw_axes_in_insns(kernel)
         check_that_atomic_ops_are_used_exactly_on_atomic_arrays(kernel)
@@ -627,7 +627,7 @@ def pre_codegen_checks(kernel):
         kernel.target.pre_codegen_check(kernel)
         check_that_shapes_and_strides_are_arguments(kernel)
 
-        logger.info("pre-codegen check %s: done" % kernel.name)
+        logger.debug("pre-codegen check %s: done" % kernel.name)
     except:
         print(75*"=")
         print("failing kernel during pre-schedule check:")
diff --git a/loopy/diagnostic.py b/loopy/diagnostic.py
index 15ab8a1ee13df440926e51e676223bc6a398df57..512e4ac8619f33856d0a8ed929de0b574f7da014 100644
--- a/loopy/diagnostic.py
+++ b/loopy/diagnostic.py
@@ -103,6 +103,10 @@ class MissingDefinitionError(LoopyError):
 class UnscheduledInstructionError(LoopyError):
     pass
 
+
+class ReductionIsNotTriangularError(LoopyError):
+    pass
+
 # }}}
diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py
index 36fbb49f4bb77c959877fb0bd21e1de6fb49c74b..5f0884fd44ed5064f3f195d103b164f2163d1d19 100644
--- a/loopy/isl_helpers.py
+++ b/loopy/isl_helpers.py
@@ -598,37 +598,63 @@ def get_simple_strides(bset, key_by="name"):
     assert len(comp_div_set_pieces) == 1
     bset, = comp_div_set_pieces
 
-    lspace = bset.get_local_space()
-    for idiv in range(lspace.dim(dim_type.div)):
-        div = lspace.get_div(idiv)
+    def _get_indices_and_coeffs(obj, dts):
+        result = []
+        for dt in dts:
+            for dim_idx in range(obj.dim(dt)):
+                coeff_val = obj.get_coefficient_val(dt, dim_idx)
+                if not coeff_val.is_zero():
+                    result.append((dt, dim_idx, coeff_val))
 
-        # check for sub-divs
-        supported = True
-        for dim_idx in range(div.dim(dim_type.div)):
-            coeff_val = div.get_coefficient_val(dim_type.div, dim_idx)
-            if not coeff_val.is_zero():
-                # sub-divs not supported
-                supported = False
-                break
+        return result
+
+    for cns in bset.get_constraints():
+        if not cns.is_equality():
+            continue
+        aff = cns.get_aff()
 
-        if not supported:
+        # recognizes constraints of the form
+        # -i0 + 2*floor((i0)/2) == 0
+
+        if aff.dim(dim_type.div) != 1:
+            continue
+
+        idiv = 0
+        div = aff.get_div(idiv)
+
+        # check for sub-divs
+        if _get_indices_and_coeffs(div, [dim_type.div]):
+            # found one -> not supported
             continue
 
         denom = div.get_denominator_val().to_python()
 
-        inames_and_coeffs = []
-        for dt in [dim_type.param, dim_type.in_]:
-            for dim_idx in range(div.dim(dt)):
-                coeff_val = div.get_coefficient_val(dt, dim_idx) * denom
-                if not coeff_val.is_zero():
-                    inames_and_coeffs.append((dt, dim_idx, coeff_val))
+        # if the coefficient in front of the div is not the same as the denominator
+        if not aff.get_coefficient_val(dim_type.div, idiv).div(denom).is_one():
+            # not supported
+            continue
+
+        inames_and_coeffs = _get_indices_and_coeffs(
+                div, [dim_type.param, dim_type.in_])
 
         if len(inames_and_coeffs) != 1:
             continue
 
         (dt, dim_idx, coeff), = inames_and_coeffs
 
-        if coeff != 1:
+        if not (coeff * denom).is_one():
+            # not supported
+            continue
+
+        inames_and_coeffs = _get_indices_and_coeffs(
+                aff, [dim_type.param, dim_type.in_])
+
+        if len(inames_and_coeffs) != 1:
+            continue
+
+        (outer_dt, outer_dim_idx, outer_coeff), = inames_and_coeffs
+        if (not outer_coeff.neg().is_one()
+                or (outer_dt, outer_dim_idx) != (dt, dim_idx)):
             # not supported
             continue
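
The rewritten `get_simple_strides` above switches from walking the local space's
divs to pattern-matching equality constraints of the form
`-i + k*floor(i/k) == 0`. A rough sketch of the kind of input it is meant to
handle (illustrative only, not part of the patch; the exact result keys depend
on the `key_by` argument):

    import islpy as isl
    from loopy.isl_helpers import get_simple_strides

    # "i mod 2 = 0" is represented internally via a div as
    #     -i + 2*floor(i/2) == 0,
    # which is exactly the constraint shape recognized above.
    bset = isl.BasicSet("[n] -> { [i] : 0 <= i < n and i mod 2 = 0 }")

    # With key_by="name" (the default), one would expect {'i': 2}.
    print(get_simple_strides(bset))
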
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index dccaca2ec104a4749289f7cd89c491292f618e3d..622f5e49be1e40b4156113d92907fe8b1d9fb859 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -1111,7 +1111,8 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         return embedding
 
-    def stringify(self, what=None, with_dependencies=False):
+    def stringify(self, what=None, with_dependencies=False, use_separators=True,
+            show_labels=True):
         all_what = set([
             "name",
             "arguments",
@@ -1150,7 +1151,10 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         kernel = self
 
-        sep = 75*"-"
+        if use_separators:
+            sep = [75*"-"]
+        else:
+            sep = []
 
         def natorder(key):
             # Return natural ordering for strings, as opposed to dictionary order.
@@ -1167,44 +1171,50 @@ class LoopKernel(ImmutableRecordWithoutPickling):
             return sorted(seq, key=lambda y: natorder(key(y)))
 
         if "name" in what:
-            lines.append(sep)
+            lines.extend(sep)
             lines.append("KERNEL: " + kernel.name)
 
         if "arguments" in what:
-            lines.append(sep)
-            lines.append("ARGUMENTS:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("ARGUMENTS:")
             for arg_name in natsorted(kernel.arg_dict):
                 lines.append(str(kernel.arg_dict[arg_name]))
 
         if "domains" in what:
-            lines.append(sep)
-            lines.append("DOMAINS:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("DOMAINS:")
             for dom, parents in zip(kernel.domains,
                     kernel.all_parents_per_domain()):
                 lines.append(len(parents)*" " + str(dom))
 
         if "tags" in what:
-            lines.append(sep)
-            lines.append("INAME IMPLEMENTATION TAGS:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("INAME IMPLEMENTATION TAGS:")
             for iname in natsorted(kernel.all_inames()):
                 line = "%s: %s" % (iname, kernel.iname_to_tag.get(iname))
                 lines.append(line)
 
         if "variables" in what and kernel.temporary_variables:
-            lines.append(sep)
-            lines.append("TEMPORARIES:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("TEMPORARIES:")
             for tv in natsorted(six.itervalues(kernel.temporary_variables),
                     key=lambda tv: tv.name):
                 lines.append(str(tv))
 
         if "rules" in what and kernel.substitutions:
-            lines.append(sep)
-            lines.append("SUBSTIUTION RULES:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("SUBSTITUTION RULES:")
             for rule_name in natsorted(six.iterkeys(kernel.substitutions)):
                 lines.append(str(kernel.substitutions[rule_name]))
 
         if "instructions" in what:
-            lines.append(sep)
-            lines.append("INSTRUCTIONS:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("INSTRUCTIONS:")
             loop_list_width = 35
 
             # {{{ topological sort
@@ -1319,18 +1329,20 @@ class LoopKernel(ImmutableRecordWithoutPickling):
                 dep_lines.append("%s : %s" % (insn.id, ",".join(insn.depends_on)))
 
         if "Dependencies" in what and dep_lines:
-            lines.append(sep)
-            lines.append("DEPENDENCIES: "
-                    "(use loopy.show_dependency_graph to visualize)")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("DEPENDENCIES: "
+                        "(use loopy.show_dependency_graph to visualize)")
             lines.extend(dep_lines)
 
         if "schedule" in what and kernel.schedule is not None:
-            lines.append(sep)
-            lines.append("SCHEDULE:")
+            lines.extend(sep)
+            if show_labels:
+                lines.append("SCHEDULE:")
             from loopy.schedule import dump_schedule
             lines.append(dump_schedule(kernel, kernel.schedule))
 
-        lines.append(sep)
+        lines.extend(sep)
 
         return "\n".join(lines)
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 0d22dbb88ed99c7c92480d1d39b924cc2198cc3f..08268ca9f27623a6d17a195d3c04acb55e5ec68a 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -714,7 +714,7 @@ class Assignment(MultiAssignmentBase):
 
             z[i] = z[i+1-1] + a {atomic}
 
-    :mod:`loopy` may to evaluate the right-hand side *multiple times*
+    :mod:`loopy` may choose to evaluate the right-hand side *multiple times*
     as part of a single assignment. It is up to the user to ensure that this
     retains correct semantics.
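
The new `use_separators` and `show_labels` flags above let
`LoopKernel.stringify` produce plainer dumps. A minimal sketch of how they
might be used (the kernel here is illustrative, not part of the patch):

    import loopy as lp

    knl = lp.make_kernel(
            "{[i]: 0<=i<n}",
            "out[i] = 2*a[i]")

    # Default: 75-dash separators and section labels like "INSTRUCTIONS:".
    print(knl.stringify())

    # Terse: just the instruction text, no separators or labels.
    print(knl.stringify(what={"instructions"},
            use_separators=False, show_labels=False))
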
diff --git a/loopy/preprocess.py b/loopy/preprocess.py
index 38499cb91c9b9b677aa5b65ebe6d18d6f1983559..de7f2b593db6d376338aea40171a99fd18778b1e 100644
--- a/loopy/preprocess.py
+++ b/loopy/preprocess.py
@@ -276,7 +276,425 @@ def find_temporary_scope(kernel):
 
 # {{{ rewrite reduction to imperative form
 
-# {{{ reduction utils
+# {{{ utils (not stateful)
+
+from collections import namedtuple
+
+
+_InameClassification = namedtuple("_InameClassification",
+        "sequential, local_parallel, nonlocal_parallel")
+
+
+def _classify_reduction_inames(kernel, inames):
+    sequential = []
+    local_par = []
+    nonlocal_par = []
+
+    from loopy.kernel.data import (
+            LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
+            ParallelTag)
+
+    for iname in inames:
+        iname_tag = kernel.iname_to_tag.get(iname)
+
+        if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)):
+            # These are nominally parallel, but we can live with
+            # them as sequential.
+            sequential.append(iname)
+
+        elif isinstance(iname_tag, LocalIndexTagBase):
+            local_par.append(iname)
+
+        elif isinstance(iname_tag, (ParallelTag, VectorizeTag)):
+            nonlocal_par.append(iname)
+
+        else:
+            sequential.append(iname)
+
+    return _InameClassification(
+            tuple(sequential), tuple(local_par), tuple(nonlocal_par))
+
+
+def _add_params_to_domain(domain, param_names):
+    dim_type = isl.dim_type
+    nparams_orig = domain.dim(dim_type.param)
+    domain = domain.add_dims(dim_type.param, len(param_names))
+
+    for param_idx, param_name in enumerate(param_names):
+        domain = domain.set_dim_name(
+                dim_type.param, param_idx + nparams_orig, param_name)
+
+    return domain
+
+
+def _move_set_to_param_dims_except(domain, except_dims):
+    dim_type = isl.dim_type
+
+    iname_idx = 0
+    for iname in domain.get_var_names(dim_type.set):
+        if iname not in except_dims:
+            domain = domain.move_dims(
+                    dim_type.param, 0,
+                    dim_type.set, iname_idx, 1)
+            iname_idx -= 1
+        iname_idx += 1
+
+    return domain
+
+
+def _domain_depends_on_given_set_dims(domain, set_dim_names):
+    set_dim_names = frozenset(set_dim_names)
+
+    return any(
+            set_dim_names & set(constr.get_coefficients_by_name())
+            for constr in domain.get_constraints())
+
+
+def _check_reduction_is_triangular(kernel, expr, scan_param):
+    """Check whether the reduction within `expr` with scan parameters described by
+    the structure `scan_param` is triangular. This attempts to verify that the
+    domain for the scan and sweep inames is as follows:
+
+    [params] -> {
+        [other inames..., scan_iname, sweep_iname]:
+            (sweep_min_value
+                <= sweep_iname
+                <= sweep_max_value)
+            and
+            (scan_min_value
+                <= scan_iname
+                <= stride * (sweep_iname - sweep_min_value) + scan_min_value)
+            and
+            (irrelevant constraints)
+    }
+    """
+
+    orig_domain = kernel.get_inames_domain(
+            frozenset((scan_param.sweep_iname, scan_param.scan_iname)))
+
+    sweep_iname = scan_param.sweep_iname
+    scan_iname = scan_param.scan_iname
+    affs = isl.affs_from_space(orig_domain.space)
+
+    sweep_lower_bound = isl.align_spaces(
+            scan_param.sweep_lower_bound,
+            affs[0],
+            across_dim_types=True)
+
+    sweep_upper_bound = isl.align_spaces(
+            scan_param.sweep_upper_bound,
+            affs[0],
+            across_dim_types=True)
+
+    scan_lower_bound = isl.align_spaces(
+            scan_param.scan_lower_bound,
+            affs[0],
+            across_dim_types=True)
+
+    from itertools import product
+
+    for (sweep_lb_domain, sweep_lb_aff), \
+            (sweep_ub_domain, sweep_ub_aff), \
+            (scan_lb_domain, scan_lb_aff) in \
+            product(sweep_lower_bound.get_pieces(),
+                    sweep_upper_bound.get_pieces(),
+                    scan_lower_bound.get_pieces()):
+
+        # Assumptions inherited from the domains of the pwaffs
+        assumptions = sweep_lb_domain & sweep_ub_domain & scan_lb_domain
+
+        # Sweep iname constraints
+        hyp_domain = affs[sweep_iname].ge_set(sweep_lb_aff)
+        hyp_domain &= affs[sweep_iname].le_set(sweep_ub_aff)
+
+        # Scan iname constraints
+        hyp_domain &= affs[scan_iname].ge_set(scan_lb_aff)
+        hyp_domain &= affs[scan_iname].le_set(
+                scan_param.stride * (affs[sweep_iname] - sweep_lb_aff)
+                + scan_lb_aff)
+
+        hyp_domain, = (hyp_domain & assumptions).get_basic_sets()
+        test_domain, = (orig_domain & assumptions).get_basic_sets()
+
+        hyp_gist_against_test = hyp_domain.gist(test_domain)
+        if _domain_depends_on_given_set_dims(hyp_gist_against_test,
+                (sweep_iname, scan_iname)):
+            return False, (
+                    "gist of hypothesis against test domain "
+                    "has sweep or scan dependent constraints: '%s'"
+                    % hyp_gist_against_test)
+
+        test_gist_against_hyp = test_domain.gist(hyp_domain)
+        if _domain_depends_on_given_set_dims(test_gist_against_hyp,
+                (sweep_iname, scan_iname)):
+            return False, (
+                    "gist of test against hypothesis domain "
+                    "has sweep or scan dependent constraint: '%s'"
+                    % test_gist_against_hyp)
+
+    return True, "ok"
+
+
+_ScanCandidateParameters = namedtuple(
+        "_ScanCandidateParameters",
+        "sweep_iname, scan_iname, sweep_lower_bound, "
+        "sweep_upper_bound, scan_lower_bound, stride")
+
+
+def _try_infer_scan_candidate_from_expr(
+        kernel, expr, within_inames, sweep_iname=None):
+    """Analyze `expr` and determine if it can be implemented as a scan.
+    """
+    from loopy.symbolic import Reduction
+    assert isinstance(expr, Reduction)
+
+    if len(expr.inames) != 1:
+        raise ValueError(
+                "Multiple inames in reduction: '%s'" % (", ".join(expr.inames),))
+
+    scan_iname, = expr.inames
+
+    from loopy.kernel.tools import DomainChanger
+    dchg = DomainChanger(kernel, (scan_iname,))
+    domain = dchg.get_original_domain()
+
+    if sweep_iname is None:
+        try:
+            sweep_iname = _try_infer_sweep_iname(
+                    domain, scan_iname, kernel.all_inames())
+        except ValueError as v:
+            raise ValueError(
+                    "Couldn't determine a sweep iname for the scan "
+                    "expression '%s': %s" % (expr, v))
+
+    try:
+        sweep_lower_bound, sweep_upper_bound, scan_lower_bound = (
+                _try_infer_scan_and_sweep_bounds(
+                    kernel, scan_iname, sweep_iname, within_inames))
+    except ValueError as v:
+        raise ValueError(
+                "Couldn't determine bounds for the scan with expression '%s' "
+                "(sweep iname: '%s', scan iname: '%s'): %s"
+                % (expr, sweep_iname, scan_iname, v))
+
+    try:
+        stride = _try_infer_scan_stride(
+                kernel, scan_iname, sweep_iname, sweep_lower_bound)
+    except ValueError as v:
+        raise ValueError(
+                "Couldn't determine a scan stride for the scan with expression '%s' "
+                "(sweep iname: '%s', scan iname: '%s'): %s"
+                % (expr, sweep_iname, scan_iname, v))
+
+    return _ScanCandidateParameters(sweep_iname, scan_iname, sweep_lower_bound,
+            sweep_upper_bound, scan_lower_bound, stride)
+
+
+def _try_infer_sweep_iname(domain, scan_iname, candidate_inames):
+    """The sweep iname is the outer iname which guides the scan.
+
+    E.g. for a domain of {[i,j]: 0<=i<n and 0<=j<=i}, i is the sweep iname.
+    """
+    constrs = domain.get_constraints()
+    sweep_iname_candidate = None
+
+    for constr in constrs:
+        candidate_vars = set([
+                var for var in constr.get_var_dict()
+                if var in candidate_inames])
+
+        # Irrelevant constraint - skip
+        if scan_iname not in candidate_vars:
+            continue
+
+        # No additional inames - skip
+        if len(candidate_vars) == 1:
+            continue
+
+        candidate_vars.remove(scan_iname)
+
+        # Depends on more than one iname - error
+        if len(candidate_vars) > 1:
+            raise ValueError(
+                    "More than one sweep iname candidate for scan iname '%s' found "
+                    "(via constraint '%s')" % (scan_iname, constr))
+
+        next_candidate = candidate_vars.pop()
+
+        if sweep_iname_candidate is None:
+            sweep_iname_candidate = next_candidate
+            defining_constraint = constr
+        else:
+            # Check next_candidate consistency
+            if sweep_iname_candidate != next_candidate:
+                raise ValueError(
+                        "More than one sweep iname candidate for scan iname '%s' "
+                        "found (via constraints '%s', '%s')" %
+                        (scan_iname, defining_constraint, constr))
+
+    if sweep_iname_candidate is None:
+        raise ValueError(
+                "Couldn't find any sweep iname candidates for "
+                "scan iname '%s'" % scan_iname)
+
+    return sweep_iname_candidate
+
+
+def _try_infer_scan_and_sweep_bounds(kernel, scan_iname, sweep_iname, within_inames):
+    domain = kernel.get_inames_domain(frozenset((sweep_iname, scan_iname)))
+    domain = _move_set_to_param_dims_except(domain, (sweep_iname, scan_iname))
+
+    var_dict = domain.get_var_dict()
+    sweep_idx = var_dict[sweep_iname][1]
+    scan_idx = var_dict[scan_iname][1]
+
+    domain = domain.project_out_except(
+            within_inames | kernel.non_iname_variable_names(), (isl.dim_type.param,))
+
+    try:
+        with isl.SuppressedWarnings(domain.get_ctx()):
+            sweep_lower_bound = domain.dim_min(sweep_idx)
+            sweep_upper_bound = domain.dim_max(sweep_idx)
+            scan_lower_bound = domain.dim_min(scan_idx)
+    except isl.Error as e:
+        raise ValueError("isl error: %s" % e)
+
+    return (sweep_lower_bound, sweep_upper_bound, scan_lower_bound)
+
+
+def _try_infer_scan_stride(kernel, scan_iname, sweep_iname, sweep_lower_bound):
+    """The stride is the number of steps the scan iname takes per iteration
+    of the sweep iname. This is allowed to be an integer constant.
+
+    E.g. for a domain of {[i,j]: 0<=i<n and 0<=j<=6*i}, the stride is 6.
+    """
+    dim_type = isl.dim_type
+
+    domain = kernel.get_inames_domain(frozenset([sweep_iname, scan_iname]))
+    domain_with_sweep_param = _move_set_to_param_dims_except(domain, (scan_iname,))
+
+    domain_with_sweep_param = domain_with_sweep_param.project_out_except(
+            (sweep_iname, scan_iname), (dim_type.set, dim_type.param))
+
+    scan_iname_idx = domain_with_sweep_param.find_dim_by_name(
+            dim_type.set, scan_iname)
+
+    # Should be equal to k * sweep_iname, where k is the stride.
+
+    try:
+        with isl.SuppressedWarnings(domain_with_sweep_param.get_ctx()):
+            scan_iname_range = (
+                    domain_with_sweep_param.dim_max(scan_iname_idx)
+                    - domain_with_sweep_param.dim_min(scan_iname_idx)
+                    ).gist(domain_with_sweep_param.params())
+    except isl.Error as e:
+        raise ValueError("isl error: '%s'" % e)
+
+    scan_iname_pieces = scan_iname_range.get_pieces()
+
+    if len(scan_iname_pieces) > 1:
+        raise ValueError("range in multiple pieces: %s" % scan_iname_range)
+    elif len(scan_iname_pieces) == 0:
+        raise ValueError("empty range found for iname '%s'" % scan_iname)
+
+    scan_iname_constr, scan_iname_aff = scan_iname_pieces[0]
+
+    if not scan_iname_constr.plain_is_universe():
+        raise ValueError("found constraints: %s" % scan_iname_constr)
+
+    if scan_iname_aff.dim(dim_type.div):
+        raise ValueError("aff has div: %s" % scan_iname_aff)
+
+    coeffs = scan_iname_aff.get_coefficients_by_name(dim_type.param)
+
+    if len(coeffs) == 0:
+        try:
+            scan_iname_aff.get_constant_val()
+        except:
+            raise ValueError("range for aff isn't constant: '%s'" % scan_iname_aff)
+
+        # If this point is reached we're assuming the domain is of the form
+        # {[i,j]: i=0 and j=0}, so the stride is technically 1 - any value
+        # this function returns will be verified later by
+        # _check_reduction_is_triangular().
+        return 1
+
+    if sweep_iname not in coeffs:
+        raise ValueError("didn't find sweep iname in coeffs: %s" % sweep_iname)
+
+    stride = coeffs[sweep_iname]
+
+    if not stride.is_int():
+        raise ValueError("stride not an integer: %s" % stride)
+
+    if not stride.is_pos():
+        raise ValueError("stride not positive: %s" % stride)
+
+    return stride.to_python()
+
+
+def _get_domain_with_iname_as_param(domain, iname):
+    dim_type = isl.dim_type
+
+    if domain.find_dim_by_name(dim_type.param, iname) >= 0:
+        return domain
+
+    iname_idx = domain.find_dim_by_name(dim_type.set, iname)
+
+    assert iname_idx >= 0, (iname, domain)
+
+    return domain.move_dims(
+            dim_type.param, domain.dim(dim_type.param),
+            dim_type.set, iname_idx, 1)
+
+
+def _create_domain_for_sweep_tracking(orig_domain,
+        tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride):
+    dim_type = isl.dim_type
+
+    subd = isl.BasicSet.universe(orig_domain.params().space)
+
+    # Add tracking_iname and sweep iname.
+
+    subd = _add_params_to_domain(subd, (sweep_iname, tracking_iname))
+
+    # Here we realize the domain:
+    #
+    # [..., i] -> {
+    #     [j]: 0 <= j - l
+    #     and
+    #     j - l <= k * (i - m)
+    #     and
+    #     k * (i - m - 1) < j - l }
+    #
+    # where
+    #     * i is the sweep iname
+    #     * j is the tracking iname
+    #     * k is the stride for the scan
+    #     * l is the lower bound for the scan
+    #     * m is the lower bound for the sweep iname
+    #
+    affs = isl.affs_from_space(subd.space)
+
+    subd &= (affs[tracking_iname] - scan_min_value).ge_set(affs[0])
+    subd &= (affs[tracking_iname] - scan_min_value)\
+            .le_set(stride * (affs[sweep_iname] - sweep_min_value))
+    subd &= (affs[tracking_iname] - scan_min_value)\
+            .gt_set(stride * (affs[sweep_iname] - sweep_min_value - 1))
+
+    # Move tracking_iname into a set dim (NOT sweep iname).
+    subd = subd.move_dims(
+            dim_type.set, 0,
+            dim_type.param, subd.dim(dim_type.param) - 1, 1)
+
+    # Simplify (maybe).
+    orig_domain_with_sweep_param = (
+            _get_domain_with_iname_as_param(orig_domain, sweep_iname))
+    subd = subd.gist_params(orig_domain_with_sweep_param.params())
+
+    subd, = subd.get_basic_sets()
+
+    return subd
+
 
 def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
     """
@@ -462,10 +880,22 @@ def _hackily_ensure_multi_assignment_return_values_are_scoped_private(kernel):
     return kernel.copy(temporary_variables=new_temporary_variables,
                        instructions=new_instructions)
 
+
+def _insert_subdomain_into_domain_tree(kernel, domains, subdomain):
+    # Intersect with inames, because we could have captured some kernel params
+    # in here too...
+    dependent_inames = (
+            frozenset(subdomain.get_var_names(isl.dim_type.param))
+            & kernel.all_inames())
+    idx, = kernel.get_leaf_domain_indices(dependent_inames)
+    domains.insert(idx + 1, subdomain)
+
 # }}}
 
 
-def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
+def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True,
+        automagic_scans_ok=False, force_scan=False,
+        force_outer_iname_for_scan=None):
     """Rewrites reductions into their imperative form. With *insn_id_filter*
     specified, operate only on the instruction with an instruction id matching
     *insn_id_filter*.
@@ -476,6 +906,17 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
     If *insn_id_filter* is not given, all reductions in all instructions will
     be realized.
+
+    If *automagic_scans_ok*, this function will attempt to rewrite triangular
+    reductions as scans automatically.
+
+    If *force_scan* is *True*, this function will attempt to rewrite *all*
+    candidate reductions as scans and raise an error if this is not possible
+    (this is most useful combined with *insn_id_filter*).
+
+    If *force_outer_iname_for_scan* is not *None*, this function will attempt
+    to realize candidate reductions as scans using the specified iname as the
+    outer (sweep) iname.
     """
 
     logger.debug("%s: realize reduction" % kernel.name)
@@ -487,6 +928,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
     var_name_gen = kernel.get_var_name_generator()
     new_temporary_variables = kernel.temporary_variables.copy()
+    inames_added_for_scan = set()
+    inames_to_remove = set()
 
     # {{{ helpers
 
@@ -496,8 +939,44 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
         else:
             return val
 
+    def preprocess_scan_arguments(
+            insn, expr, nresults, scan_iname, track_iname,
+            newly_generated_insn_id_set):
+        """Does iname substitution within scan arguments and returns a set of values
+        suitable to be passed to the binary op. Returns a tuple."""
+
+        if nresults > 1:
+            inner_expr = expr
+
+            # In the case of a multi-argument scan, we need a name for each of
+            # the arguments in order to pass them to the binary op - so we expand
+            # items that are not "plain" tuples here.
+            if not isinstance(inner_expr, tuple):
+                get_args_insn_id = insn_id_gen(
+                        "%s_%s_get" % (insn.id, "_".join(expr.inames)))
+
+                inner_expr = expand_inner_reduction(
+                        id=get_args_insn_id,
+                        expr=inner_expr,
+                        nresults=nresults,
+                        depends_on=insn.depends_on,
+                        within_inames=insn.within_inames | expr.inames,
+                        within_inames_is_final=insn.within_inames_is_final)
+
+                newly_generated_insn_id_set.add(get_args_insn_id)
+
+            updated_inner_exprs = tuple(
+                    replace_var_within_expr(sub_expr, scan_iname, track_iname)
+                    for sub_expr in inner_expr)
+        else:
+            updated_inner_exprs = (
+                    replace_var_within_expr(expr, scan_iname, track_iname),)
+
+        return updated_inner_exprs
+
     def expand_inner_reduction(id, expr, nresults, depends_on, within_inames,
             within_inames_is_final):
+        # FIXME: use make_temporaries
         from pymbolic.primitives import Call
         from loopy.symbolic import Reduction
         assert isinstance(expr, (Call, Reduction))
@@ -537,20 +1016,23 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
             reduction_dtypes):
         outer_insn_inames = temp_kernel.insn_inames(insn)
 
-        from pymbolic import var
-        acc_var_names = [
-                var_name_gen("acc_"+"_".join(expr.inames))
-                for i in range(nresults)]
-        acc_vars = tuple(var(n) for n in acc_var_names)
+        from loopy.kernel.data import temp_var_scope
+        acc_var_names = make_temporaries(
+                name_based_on="acc_"+"_".join(expr.inames),
+                nvars=nresults,
+                shape=(),
+                dtypes=reduction_dtypes,
+                scope=temp_var_scope.PRIVATE)
 
-        from loopy.kernel.data import TemporaryVariable, temp_var_scope
+        init_insn_depends_on = frozenset()
 
-        for name, dtype in zip(acc_var_names, reduction_dtypes):
-            new_temporary_variables[name] = TemporaryVariable(
-                    name=name,
-                    shape=(),
-                    dtype=dtype,
-                    scope=temp_var_scope.PRIVATE)
+        global_barrier = lp.find_most_recent_global_barrier(temp_kernel, insn.id)
+
+        if global_barrier is not None:
+            init_insn_depends_on |= frozenset([global_barrier])
+
+        from pymbolic import var
+        acc_vars = tuple(var(n) for n in acc_var_names)
 
         init_id = insn_id_gen(
                 "%s_%s_init" % (insn.id, "_".join(expr.inames)))
@@ -560,7 +1042,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                 assignees=acc_vars,
                 within_inames=outer_insn_inames - frozenset(expr.inames),
                 within_inames_is_final=insn.within_inames_is_final,
-                depends_on=frozenset(),
+                depends_on=init_insn_depends_on,
                 expression=expr.operation.neutral_element(*arg_dtypes))
 
         generated_insns.append(init_insn)
@@ -574,17 +1056,20 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
         reduction_insn_depends_on = set([init_id])
 
+        # In the case of a multi-argument reduction, we need a name for each of
+        # the arguments in order to pass them to the binary op - so we expand
+        # items that are not "plain" tuples here.
         if nresults > 1 and not isinstance(expr.expr, tuple):
             get_args_insn_id = insn_id_gen(
                     "%s_%s_get" % (insn.id, "_".join(expr.inames)))
 
             reduction_expr = expand_inner_reduction(
-                id=get_args_insn_id,
-                expr=expr.expr,
-                nresults=nresults,
-                depends_on=insn.depends_on,
-                within_inames=update_insn_iname_deps,
-                within_inames_is_final=insn.within_inames_is_final)
+                    id=get_args_insn_id,
+                    expr=expr.expr,
+                    nresults=nresults,
+                    depends_on=insn.depends_on,
+                    within_inames=update_insn_iname_deps,
+                    within_inames_is_final=insn.within_inames_is_final)
 
             reduction_insn_depends_on.add(get_args_insn_id)
         else:
@@ -633,6 +1118,14 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                 v[iname].lt_set(v[0] + size)).get_basic_sets()
         return bs
 
+    def _make_slab_set_from_range(iname, lbound, ubound):
+        v = isl.make_zero_and_vars([iname])
+        bs, = (
+                v[iname].ge_set(v[0] + lbound)
+                &
+                v[iname].lt_set(v[0] + ubound)).get_basic_sets()
+        return bs
+
     def map_reduction_local(expr, rec, nresults, arg_dtypes,
             reduction_dtypes):
         red_iname, = expr.inames
@@ -657,6 +1150,24 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                 _get_int_iname_size(oiname)
                 for oiname in outer_local_inames)
 
+        from loopy.kernel.data import temp_var_scope
+
+        neutral_var_names = make_temporaries(
+                name_based_on="neutral_"+red_iname,
+                nvars=nresults,
+                shape=(),
+                dtypes=reduction_dtypes,
+                scope=temp_var_scope.PRIVATE)
+
+        acc_var_names = make_temporaries(
+                name_based_on="acc_"+red_iname,
+                nvars=nresults,
+                shape=outer_local_iname_sizes + (size,),
+                dtypes=reduction_dtypes,
+                scope=temp_var_scope.LOCAL)
+
+        acc_vars = tuple(var(n) for n in acc_var_names)
+
         # {{{ add separate iname to carry out the reduction
 
         # Doing this sheds any odd conditionals that may be active
@@ -668,32 +1179,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
         # }}}
 
-        neutral_var_names = [
-                var_name_gen("neutral_"+red_iname)
-                for i in range(nresults)]
-        acc_var_names = [
-                var_name_gen("acc_"+red_iname)
-                for i in range(nresults)]
-        acc_vars = tuple(var(n) for n in acc_var_names)
-
-        from loopy.kernel.data import TemporaryVariable, temp_var_scope
-        for name, dtype in zip(acc_var_names, reduction_dtypes):
-            new_temporary_variables[name] = TemporaryVariable(
-                    name=name,
-                    shape=outer_local_iname_sizes + (size,),
-                    dtype=dtype,
-                    scope=temp_var_scope.LOCAL)
-        for name, dtype in zip(neutral_var_names, reduction_dtypes):
-            new_temporary_variables[name] = TemporaryVariable(
-                    name=name,
-                    shape=(),
-                    dtype=dtype,
-                    scope=temp_var_scope.PRIVATE)
-
         base_iname_deps = outer_insn_inames - frozenset(expr.inames)
 
         neutral = expr.operation.neutral_element(*arg_dtypes)
-
         init_id = insn_id_gen("%s_%s_init" % (insn.id, red_iname))
         init_insn = make_assignment(
                 id=init_id,
@@ -718,6 +1206,9 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
         transfer_depends_on = set([init_neutral_id, init_id])
 
+        # In the case of a multi-argument reduction, we need a name for each of
+        # the arguments in order to pass them to the binary op - so we expand
+        # items that are not "plain" tuples here.
        if nresults > 1 and not isinstance(expr.expr, tuple):
             get_args_insn_id = insn_id_gen(
                     "%s_%s_get" % (insn.id, red_iname))
@@ -752,9 +1243,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                     (outer_insn_inames - frozenset(expr.inames))
                     | frozenset([red_iname])),
                 within_inames_is_final=insn.within_inames_is_final,
-                depends_on=frozenset(transfer_depends_on) | insn.depends_on,
-                no_sync_with=frozenset(
-                    [(insn_id, "any") for insn_id in transfer_depends_on]))
+                depends_on=frozenset([init_id, init_neutral_id]) | insn.depends_on,
+                no_sync_with=frozenset([(init_id, "any")]))
 
         generated_insns.append(transfer_insn)
 
         cur_size = 1
@@ -766,6 +1256,7 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
         istage = 0
         while cur_size > 1:
+
             new_size = cur_size // 2
             assert new_size * 2 == cur_size
 
@@ -814,6 +1305,351 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
         return [acc_var[outer_local_iname_vars + (0,)]
                 for acc_var in acc_vars]
 
     # }}}
 
+    # {{{ utils (stateful)
+
+    from pytools import memoize
+
+    @memoize
+    def get_or_add_sweep_tracking_iname_and_domain(
+            scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride,
+            tracking_iname):
+        domain = temp_kernel.get_inames_domain(frozenset((scan_iname, sweep_iname)))
+
+        inames_added_for_scan.add(tracking_iname)
+
+        new_domain = _create_domain_for_sweep_tracking(domain,
+                tracking_iname, sweep_iname, sweep_min_value, scan_min_value, stride)
+
+        _insert_subdomain_into_domain_tree(temp_kernel, domains, new_domain)
+
+        return tracking_iname
+
+    def replace_var_within_expr(expr, from_var, to_var):
+        from pymbolic.mapper.substitutor import make_subst_func
+
+        from loopy.symbolic import (
+            SubstitutionRuleMappingContext, RuleAwareSubstitutionMapper)
+
+        rule_mapping_context = SubstitutionRuleMappingContext(
+            temp_kernel.substitutions, var_name_gen)
+
+        from pymbolic import var
+        mapper = RuleAwareSubstitutionMapper(
+            rule_mapping_context,
+            make_subst_func({from_var: var(to_var)}),
+            within=lambda *args: True)
+
+        return mapper(expr, temp_kernel, None)
+
+    def make_temporaries(name_based_on, nvars, shape, dtypes, scope):
+        var_names = [
+                var_name_gen(name_based_on.format(index=i))
+                for i in range(nvars)]
+
+        from loopy.kernel.data import TemporaryVariable
+
+        for name, dtype in zip(var_names, dtypes):
+            new_temporary_variables[name] = TemporaryVariable(
+                    name=name,
+                    shape=shape,
+                    dtype=dtype,
+                    scope=scope)
+
+        return var_names
+
+    # }}}
+
+    # {{{ sequential scan
+
+    def map_scan_seq(expr, rec, nresults, arg_dtypes,
+            reduction_dtypes, sweep_iname, scan_iname, sweep_min_value,
+            scan_min_value, stride):
+        outer_insn_inames = temp_kernel.insn_inames(insn)
+        inames_to_remove.add(scan_iname)
+
+        track_iname = var_name_gen(
+                "{sweep_iname}__seq_scan"
+                .format(scan_iname=scan_iname, sweep_iname=sweep_iname))
+
+        get_or_add_sweep_tracking_iname_and_domain(
+                scan_iname, sweep_iname, sweep_min_value, scan_min_value,
+                stride, track_iname)
+
+        from loopy.kernel.data import temp_var_scope
+        acc_var_names = make_temporaries(
+                name_based_on="acc_" + scan_iname,
+                nvars=nresults,
+                shape=(),
+                dtypes=reduction_dtypes,
+                scope=temp_var_scope.PRIVATE)
+
+        from pymbolic import var
+        acc_vars = tuple(var(n) for n in acc_var_names)
+
+        init_id = insn_id_gen(
+                "%s_%s_init" % (insn.id, "_".join(expr.inames)))
+
+        init_insn_depends_on = frozenset()
+
+        global_barrier = lp.find_most_recent_global_barrier(temp_kernel, insn.id)
+
+        if global_barrier is not None:
+            init_insn_depends_on |= frozenset([global_barrier])
+
+        init_insn = make_assignment(
+                id=init_id,
+                assignees=acc_vars,
+                within_inames=outer_insn_inames - frozenset(
+                    (sweep_iname,) + expr.inames),
+                within_inames_is_final=insn.within_inames_is_final,
+                depends_on=init_insn_depends_on,
+                expression=expr.operation.neutral_element(*arg_dtypes))
+
+        generated_insns.append(init_insn)
+
+        update_insn_depends_on = set([init_insn.id]) | insn.depends_on
+
+        updated_inner_exprs = (
+                preprocess_scan_arguments(insn, expr.expr, nresults,
+                    scan_iname, track_iname, update_insn_depends_on))
+
+        update_id = insn_id_gen(
+                based_on="%s_%s_update" % (insn.id, "_".join(expr.inames)))
+
+        update_insn_iname_deps = temp_kernel.insn_inames(insn) | set([track_iname])
+        if insn.within_inames_is_final:
+            update_insn_iname_deps = insn.within_inames | set([track_iname])
+
+        scan_insn = make_assignment(
+                id=update_id,
+                assignees=acc_vars,
+                expression=expr.operation(
+                    arg_dtypes,
+                    _strip_if_scalar(acc_vars, acc_vars),
+                    _strip_if_scalar(acc_vars, updated_inner_exprs)),
+                depends_on=frozenset(update_insn_depends_on),
+                within_inames=update_insn_iname_deps,
+                no_sync_with=insn.no_sync_with,
+                within_inames_is_final=insn.within_inames_is_final)
+
+        generated_insns.append(scan_insn)
+
+        new_insn_add_depends_on.add(scan_insn.id)
+
+        if nresults == 1:
+            assert len(acc_vars) == 1
+            return acc_vars[0]
+        else:
+            return acc_vars
+
+    # }}}
+
+    # {{{ local-parallel scan
+
+    def map_scan_local(expr, rec, nresults, arg_dtypes,
+            reduction_dtypes, sweep_iname, scan_iname,
+            sweep_min_value, scan_min_value, stride):
+
+        scan_size = _get_int_iname_size(sweep_iname)
+
+        assert scan_size > 0
+
+        if scan_size == 1:
+            return map_reduction_seq(
+                    expr, rec, nresults, arg_dtypes, reduction_dtypes)
+
+        outer_insn_inames = temp_kernel.insn_inames(insn)
+
+        from loopy.kernel.data import LocalIndexTagBase
+        outer_local_inames = tuple(
+                oiname
+                for oiname in outer_insn_inames
+                if isinstance(
+                    kernel.iname_to_tag.get(oiname),
+                    LocalIndexTagBase)
+                and oiname != sweep_iname)
+
+        from pymbolic import var
+        outer_local_iname_vars = tuple(
+                var(oiname) for oiname in outer_local_inames)
+
+        outer_local_iname_sizes = tuple(
+                _get_int_iname_size(oiname)
+                for oiname in outer_local_inames)
+
+        track_iname = var_name_gen(
+                "{sweep_iname}__pre_scan"
+                .format(scan_iname=scan_iname, sweep_iname=sweep_iname))
+
+        get_or_add_sweep_tracking_iname_and_domain(
+                scan_iname, sweep_iname, sweep_min_value, scan_min_value, stride,
+                track_iname)
+
+        # {{{ add separate iname to carry out the scan
+
+        # Doing this sheds any odd conditionals that may be active
+        # on our scan_iname.
+
+        base_exec_iname = var_name_gen(sweep_iname + "__scan")
+        domains.append(_make_slab_set(base_exec_iname, scan_size))
+        new_iname_tags[base_exec_iname] = kernel.iname_to_tag[sweep_iname]
+
+        # }}}
+
+        from loopy.kernel.data import temp_var_scope
+
+        read_var_names = make_temporaries(
+                name_based_on="read_"+scan_iname+"_arg_{index}",
+                nvars=nresults,
+                shape=(),
+                dtypes=reduction_dtypes,
+                scope=temp_var_scope.PRIVATE)
+
+        acc_var_names = make_temporaries(
+                name_based_on="acc_"+scan_iname,
+                nvars=nresults,
+                shape=outer_local_iname_sizes + (scan_size,),
+                dtypes=reduction_dtypes,
+                scope=temp_var_scope.LOCAL)
+
+        acc_vars = tuple(var(n) for n in acc_var_names)
+        read_vars = tuple(var(n) for n in read_var_names)
+
+        base_iname_deps = (outer_insn_inames
+                - frozenset(expr.inames) - frozenset([sweep_iname]))
+
+        neutral = expr.operation.neutral_element(*arg_dtypes)
+
+        init_insn_depends_on = insn.depends_on
+
+        global_barrier = lp.find_most_recent_global_barrier(temp_kernel, insn.id)
+
+        if global_barrier is not None:
+            init_insn_depends_on |= frozenset([global_barrier])
+
+        init_id = insn_id_gen("%s_%s_init" % (insn.id, scan_iname))
+        init_insn = make_assignment(
+                id=init_id,
+                assignees=tuple(
+                    acc_var[outer_local_iname_vars + (var(base_exec_iname),)]
+                    for acc_var in acc_vars),
+                expression=neutral,
+                within_inames=base_iname_deps | frozenset([base_exec_iname]),
+                within_inames_is_final=insn.within_inames_is_final,
+                depends_on=init_insn_depends_on)
+        generated_insns.append(init_insn)
+
+        transfer_insn_depends_on = set([init_insn.id]) | insn.depends_on
+
+        updated_inner_exprs = (
+                preprocess_scan_arguments(insn, expr.expr, nresults,
+                    scan_iname, track_iname, transfer_insn_depends_on))
+
+        from loopy.symbolic import Reduction
+
+        from loopy.symbolic import pw_aff_to_expr
+        sweep_min_value_expr = pw_aff_to_expr(sweep_min_value)
+
+        transfer_id = insn_id_gen("%s_%s_transfer" % (insn.id, scan_iname))
+        transfer_insn = make_assignment(
+                id=transfer_id,
+                assignees=tuple(
+                    acc_var[outer_local_iname_vars
+                        + (var(sweep_iname) - sweep_min_value_expr,)]
+                    for acc_var in acc_vars),
+                expression=Reduction(
+                    operation=expr.operation,
+                    inames=(track_iname,),
+                    expr=_strip_if_scalar(acc_vars, updated_inner_exprs),
+                    allow_simultaneous=False,
+                    ),
+                within_inames=outer_insn_inames - frozenset(expr.inames),
+                within_inames_is_final=insn.within_inames_is_final,
+                depends_on=frozenset(transfer_insn_depends_on),
+                no_sync_with=frozenset([(init_id, "any")]) | insn.no_sync_with)
+
+        generated_insns.append(transfer_insn)
+
+        prev_id = transfer_id
+
+        istage = 0
+        cur_size = 1
+
+        while cur_size < scan_size:
+            stage_exec_iname = var_name_gen("%s__scan_s%d" % (sweep_iname, istage))
+            domains.append(
+                    _make_slab_set_from_range(stage_exec_iname, cur_size, scan_size))
+            new_iname_tags[stage_exec_iname] = kernel.iname_to_tag[sweep_iname]
+
+            for read_var, acc_var in zip(read_vars, acc_vars):
+                read_stage_id = insn_id_gen(
+                        "scan_%s_read_stage_%d" % (scan_iname, istage))
+
+                read_stage_insn = make_assignment(
+                        id=read_stage_id,
+                        assignees=(read_var,),
+                        expression=(
+                            acc_var[
+                                outer_local_iname_vars
+                                + (var(stage_exec_iname) - cur_size,)]),
+                        within_inames=(
+                            base_iname_deps | frozenset([stage_exec_iname])),
+                        within_inames_is_final=insn.within_inames_is_final,
+                        depends_on=frozenset([prev_id]))
+
+                if cur_size == 1:
+                    # Performance hack: don't add a barrier here with transfer_insn.
+                    # NOTE: This won't work if the way that local inames
+                    # are lowered changes.
+                    read_stage_insn = read_stage_insn.copy(
+                            no_sync_with=(
+                                read_stage_insn.no_sync_with
+                                | frozenset([(transfer_id, "any")])))
+
+                generated_insns.append(read_stage_insn)
+                prev_id = read_stage_id
+
+            write_stage_id = insn_id_gen(
+                    "scan_%s_write_stage_%d" % (scan_iname, istage))
+            write_stage_insn = make_assignment(
+                    id=write_stage_id,
+                    assignees=tuple(
+                        acc_var[outer_local_iname_vars + (var(stage_exec_iname),)]
+                        for acc_var in acc_vars),
+                    expression=expr.operation(
+                        arg_dtypes,
+                        _strip_if_scalar(acc_vars, read_vars),
+                        _strip_if_scalar(acc_vars, tuple(
+                            acc_var[
+                                outer_local_iname_vars + (var(stage_exec_iname),)]
+                            for acc_var in acc_vars))
+                        ),
+                    within_inames=(
+                        base_iname_deps | frozenset([stage_exec_iname])),
+                    within_inames_is_final=insn.within_inames_is_final,
+                    depends_on=frozenset([prev_id]),
+                    )
+
+            generated_insns.append(write_stage_insn)
+            prev_id = write_stage_id
+
+            cur_size *= 2
+            istage += 1
+
+        new_insn_add_depends_on.add(prev_id)
+        new_insn_add_within_inames.add(sweep_iname)
+
+        output_idx = var(sweep_iname) - sweep_min_value_expr
+
+        if nresults == 1:
+            assert len(acc_vars) == 1
+            return acc_vars[0][outer_local_iname_vars + (output_idx,)]
+        else:
+            return [acc_var[outer_local_iname_vars + (output_idx,)]
+                    for acc_var in acc_vars]
+
+    # }}}
+
     # {{{ seq/par dispatch
 
     def map_reduction(expr, rec, nresults=1):
@@ -832,31 +1668,43 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
             raise LoopyError("reduction used within loop(s) that it was "
                     "supposed to reduce over: " + ", ".join(bad_inames))
 
-        n_sequential = 0
-        n_local_par = 0
+        iname_classes = _classify_reduction_inames(temp_kernel, expr.inames)
 
-        from loopy.kernel.data import (
-                LocalIndexTagBase, UnrolledIlpTag, UnrollTag, VectorizeTag,
-                ParallelTag)
-        for iname in expr.inames:
-            iname_tag = kernel.iname_to_tag.get(iname)
+        n_sequential = len(iname_classes.sequential)
+        n_local_par = len(iname_classes.local_parallel)
+        n_nonlocal_par = len(iname_classes.nonlocal_parallel)
+
+        really_force_scan = force_scan and (
+                len(expr.inames) != 1 or expr.inames[0] not in inames_added_for_scan)
+
+        def _error_if_force_scan_on(cls, msg):
+            if really_force_scan:
+                raise cls(msg)
 
-            if isinstance(iname_tag, (UnrollTag, UnrolledIlpTag)):
-                # These are nominally parallel, but we can live with
-                # them as sequential.
-                n_sequential += 1
+        may_be_implemented_as_scan = False
+        if force_scan or automagic_scans_ok:
+            from loopy.diagnostic import ReductionIsNotTriangularError
 
-            elif isinstance(iname_tag, LocalIndexTagBase):
-                n_local_par += 1
+            try:
+                # Try to determine scan candidate information (sweep iname, scan
+                # iname, etc).
+                scan_param = _try_infer_scan_candidate_from_expr(
+                        temp_kernel, expr, outer_insn_inames,
+                        sweep_iname=force_outer_iname_for_scan)
 
-            elif isinstance(iname_tag, (ParallelTag, VectorizeTag)):
-                raise LoopyError("the only form of parallelism supported "
-                        "by reductions is 'local'--found iname '%s' "
-                        "tagged '%s'"
-                        % (iname, type(iname_tag).__name__))
+            except ValueError as v:
+                error = str(v)
 
             else:
-                n_sequential += 1
+                # Ensures the reduction is triangular (somewhat expensive).
+                may_be_implemented_as_scan, error = (
+                        _check_reduction_is_triangular(
+                            temp_kernel, expr, scan_param))
+
+            if not may_be_implemented_as_scan:
+                _error_if_force_scan_on(ReductionIsNotTriangularError, error)
+
+        # {{{ sanity checks
 
         if n_local_par and n_sequential:
             raise LoopyError("Reduction over '%s' contains both parallel and "
@@ -872,21 +1720,87 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
                     "before code generation."
                     % ", ".join(expr.inames))
 
-        if n_sequential:
-            assert n_local_par == 0
-            return map_reduction_seq(expr, rec, nresults, arg_dtypes,
-                    reduction_dtypes)
-        elif n_local_par:
-            return map_reduction_local(expr, rec, nresults, arg_dtypes,
-                    reduction_dtypes)
-        else:
+        if n_nonlocal_par:
+            bad_inames = iname_classes.nonlocal_parallel
+            raise LoopyError("the only form of parallelism supported "
+                    "by reductions is 'local'--found iname(s) '%s' "
+                    "respectively tagged '%s'"
+                    % (", ".join(bad_inames),
+                       ", ".join(kernel.iname_to_tag[iname]
+                                 for iname in bad_inames)))
+
+        if n_local_par == 0 and n_sequential == 0:
             from loopy.diagnostic import warn_with_kernel
             warn_with_kernel(kernel, "empty_reduction",
                     "Empty reduction found (no inames to reduce over). "
                     "Eliminating.")
 
+            # We're not supposed to reduce/sum at all. (Note how this is distinct
+            # from an empty reduction--there is an element here, just no inames
+            # to reduce over. It's rather similar to an array with () shape in
+            # numpy.)
+
             return expr.expr
 
+        # }}}
+
+        if may_be_implemented_as_scan:
+            assert force_scan or automagic_scans_ok
+
+            # We require the "scan" iname to be tagged sequential.
+            if n_sequential:
+                sweep_iname = scan_param.sweep_iname
+                sweep_class = _classify_reduction_inames(kernel, (sweep_iname,))
+
+                sequential = sweep_iname in sweep_class.sequential
+                parallel = sweep_iname in sweep_class.local_parallel
+                bad_parallel = sweep_iname in sweep_class.nonlocal_parallel
+
+                if sweep_iname not in outer_insn_inames:
+                    _error_if_force_scan_on(LoopyError,
+                            "Sweep iname '%s' was detected, but is not an iname "
+                            "for the instruction." % sweep_iname)
+                elif bad_parallel:
+                    _error_if_force_scan_on(LoopyError,
+                            "Sweep iname '%s' has an unsupported parallel tag '%s' "
+                            "- the only parallelism allowed is 'local'." %
+                            (sweep_iname, temp_kernel.iname_to_tag[sweep_iname]))
+                elif parallel:
+                    return map_scan_local(
+                            expr, rec, nresults, arg_dtypes, reduction_dtypes,
+                            sweep_iname, scan_param.scan_iname,
+                            scan_param.sweep_lower_bound,
+                            scan_param.scan_lower_bound,
+                            scan_param.stride)
+                elif sequential:
+                    return map_scan_seq(
+                            expr, rec, nresults, arg_dtypes, reduction_dtypes,
+                            sweep_iname, scan_param.scan_iname,
+                            scan_param.sweep_lower_bound,
+                            scan_param.scan_lower_bound,
+                            scan_param.stride)
+
+                # fallthrough to reduction implementation
+
+            else:
+                assert n_local_par > 0
+                scan_iname, = expr.inames
+                _error_if_force_scan_on(LoopyError,
+                        "Scan iname '%s' is parallel tagged: this is not allowed "
+                        "(only the sweep iname should be tagged if parallelism "
+                        "is desired)." % scan_iname)
+
+                # fallthrough to reduction implementation
+
+        if n_sequential:
+            assert n_local_par == 0
+            return map_reduction_seq(
+                    expr, rec, nresults, arg_dtypes, reduction_dtypes)
+        else:
+            assert n_local_par > 0
+            return map_reduction_local(
+                    expr, rec, nresults, arg_dtypes, reduction_dtypes)
+
     # }}}
 
     from loopy.symbolic import ReductionCallbackMapper
@@ -992,6 +1906,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True):
 
     kernel = lp.tag_inames(kernel, new_iname_tags)
 
+    # TODO: remove unused inames...
+
     kernel = (
             _hackily_ensure_multi_assignment_return_values_are_scoped_private(
                 kernel))
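
Taken together, the preprocess.py changes above let `realize_reduction` emit
scans for triangular reductions. A minimal usage sketch based on the new
docstring (the kernel here is illustrative, not part of the patch):

    import numpy as np
    import loopy as lp
    from loopy.preprocess import realize_reduction

    # Triangular domain: for each value of the sweep iname i, the scan
    # iname j ranges over 0 <= j <= i, so out[i] is a prefix sum of a.
    knl = lp.make_kernel(
            "{[i,j]: 0<=i<n and 0<=j<=i}",
            "out[i] = sum(j, a[j])")
    knl = lp.add_dtypes(knl, dict(a=np.float64))

    # force_scan=True insists on the scan rewrite and raises
    # ReductionIsNotTriangularError if the domain does not qualify;
    # automagic_scans_ok=True would instead fall back to the ordinary
    # reduction implementation when the scan rewrite is not possible.
    knl = realize_reduction(knl, force_scan=True)
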
% scan_iname) + + # fallthrough to reduction implementation + + if n_sequential: + assert n_local_par == 0 + return map_reduction_seq( + expr, rec, nresults, arg_dtypes, reduction_dtypes) + else: + assert n_local_par > 0 + return map_reduction_local( + expr, rec, nresults, arg_dtypes, reduction_dtypes) + # }}} from loopy.symbolic import ReductionCallbackMapper @@ -992,6 +1906,8 @@ def realize_reduction(kernel, insn_id_filter=None, unknown_types_ok=True): kernel = lp.tag_inames(kernel, new_iname_tags) + # TODO: remove unused inames... + kernel = ( _hackily_ensure_multi_assignment_return_values_are_scoped_private( kernel)) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index c078da2ec58dabbbf646bfcf593ea0138941cc85..57cf74b808ae1a7107e76a18a3876785ab8baabd 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -1908,14 +1908,14 @@ def generate_loop_schedules_inner(kernel, debug_args={}): if (gsize or lsize): if not kernel.options.disable_global_barriers: - logger.info("%s: barrier insertion: global" % kernel.name) + logger.debug("%s: barrier insertion: global" % kernel.name) gen_sched = insert_barriers(kernel, gen_sched, kind="global", verify_only=True) - logger.info("%s: barrier insertion: local" % kernel.name) + logger.debug("%s: barrier insertion: local" % kernel.name) gen_sched = insert_barriers(kernel, gen_sched, kind="local", verify_only=False) - logger.info("%s: barrier insertion: done" % kernel.name) + logger.debug("%s: barrier insertion: done" % kernel.name) new_kernel = kernel.copy( schedule=gen_sched, diff --git a/loopy/statistics.py b/loopy/statistics.py index cb15eb55498bcafe4ae537747e387e47ddbd8254..9b15ec471fb681698b85c1dd2f92376fbc731f00 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -38,6 +38,7 @@ __doc__ = """ .. currentmodule:: loopy +.. autoclass:: GuardedPwQPolynomial .. autoclass:: ToCountMap .. autoclass:: Op .. 
autoclass:: MemAccess @@ -52,6 +53,66 @@ __doc__ = """ """ +# {{{ GuardedPwQPolynomial + +class GuardedPwQPolynomial(object): + def __init__(self, pwqpolynomial, valid_domain): + self.pwqpolynomial = pwqpolynomial + self.valid_domain = valid_domain + + def __add__(self, other): + if isinstance(other, GuardedPwQPolynomial): + return GuardedPwQPolynomial( + self.pwqpolynomial + other.pwqpolynomial, + self.valid_domain & other.valid_domain) + else: + return GuardedPwQPolynomial( + self.pwqpolynomial + other, + self.valid_domain) + + __radd__ = __add__ + + def __mul__(self, other): + if isinstance(other, GuardedPwQPolynomial): + return GuardedPwQPolynomial( + self.pwqpolynomial * other.pwqpolynomial, + self.valid_domain & other.valid_domain) + else: + return GuardedPwQPolynomial( + self.pwqpolynomial * other, + self.valid_domain) + + __rmul__ = __mul__ + + def eval_with_dict(self, value_dict): + space = self.pwqpolynomial.space + pt = isl.Point.zero(space.params()) + + for i in range(space.dim(dim_type.param)): + par_name = space.get_dim_name(dim_type.param, i) + pt = pt.set_coordinate_val( + dim_type.param, i, value_dict[par_name]) + + if not (isl.Set.from_point(pt) <= self.valid_domain): + raise ValueError("evaluation point outside of domain of " + "definition of piecewise quasipolynomial") + + return self.pwqpolynomial.eval(pt).to_python() + + @staticmethod + def zero(): + p = isl.PwQPolynomial('{ 0 }') + return GuardedPwQPolynomial(p, isl.Set.universe(p.domain().space)) + + def __str__(self): + return str(self.pwqpolynomial) + + def __repr__(self): + return repr(self.pwqpolynomial) + +# }}} + + # {{{ ToCountMap class ToCountMap(object): @@ -66,7 +127,7 @@ class ToCountMap(object): """ - def __init__(self, init_dict=None, val_type=isl.PwQPolynomial): + def __init__(self, init_dict=None, val_type=GuardedPwQPolynomial): if init_dict is None: init_dict = {} self.count_map = init_dict @@ -87,7 +148,7 @@ class ToCountMap(object): return self def __mul__(self, other): - if isinstance(other, isl.PwQPolynomial): + if isinstance(other, GuardedPwQPolynomial): return ToCountMap(dict( (index, self.count_map[index]*other) for index in self.keys())) @@ -103,8 +164,8 @@ class ToCountMap(object): return self.count_map[index] except KeyError: #TODO what is the best way to handle this? - if self.val_type is isl.PwQPolynomial: - return isl.PwQPolynomial('{ 0 }') + if self.val_type is GuardedPwQPolynomial: + return GuardedPwQPolynomial.zero() else: return 0 @@ -132,12 +193,18 @@ class ToCountMap(object): def copy(self): return ToCountMap(dict(self.count_map), self.val_type) + def with_set_attributes(self, **kwargs): + return ToCountMap(dict( + (key.copy(**kwargs), val) + for key, val in six.iteritems(self.count_map)), + self.val_type) + def filter_by(self, **kwargs): """Remove items without specified key fields. - :parameter \*\*kwargs: Keyword arguments matching fields in the keys of - the :class:`ToCountMap`, each given a list of - allowable values for that key field. + :arg kwargs: Keyword arguments matching fields in the keys of + the :class:`ToCountMap`, each given a list of + allowable values for that key field. :return: A :class:`ToCountMap` containing the subset of the items in the original :class:`ToCountMap` that match the field values @@ -183,10 +250,10 @@ class ToCountMap(object): def filter_by_func(self, func): """Keep items that pass a test. - :parameter func: A function that takes a map key a parameter and - returns a :class:`bool`. 
+        :arg func: A function that takes a map key as a parameter and
+             returns a :class:`bool`.

         :return: A :class:`ToCountMap` containing the subset of the items in
                  the original :class:`ToCountMap` for which func(key) is true.

         Example usage::

@@ -218,7 +285,7 @@ class ToCountMap(object):
         """Group map items together, distinguishing by only the key
         fields passed in args.

-        :parameter \*args: Zero or more :class:`str` fields of map keys.
+        :arg args: Zero or more :class:`str` fields of map keys.

         :return: A :class:`ToCountMap` containing the same total counts
                  grouped together by new keys that only contain the fields
@@ -336,8 +403,8 @@ class ToCountMap(object):

         """

-        if self.val_type is isl.PwQPolynomial:
-            total = isl.PwQPolynomial('{ 0 }')
+        if self.val_type is GuardedPwQPolynomial:
+            total = GuardedPwQPolynomial.zero()
         else:
             total = 0

@@ -385,8 +452,10 @@ def stringify_stats_mapping(m):
     return result


+# {{{ Op descriptor
+
 class Op(object):
-    """An arithmetic operation.
+    """A descriptor for a type of arithmetic operation.

     .. attribute:: dtype

@@ -400,6 +469,8 @@ class Op(object):

     """

+    # FIXME: This could be done much more briefly by inheriting from Record.
+
     def __init__(self, dtype=None, name=None):
         self.name = name
         if dtype is None:
@@ -419,19 +490,15 @@ class Op(object):
         return hash(str(self))

     def __repr__(self):
-        if self.dtype is None:
-            dtype = 'None'
-        else:
-            dtype = str(self.dtype)
-        if self.name is None:
-            name = 'None'
-        else:
-            name = self.name
-        return "Op("+dtype+", "+name+")"
+        return "Op(%s, %s)" % (self.dtype, self.name)

+# }}}
+
+
+# {{{ MemAccess descriptor

 class MemAccess(object):
-    """A memory access.
+    """A descriptor for a type of memory access.

     .. attribute:: mtype

@@ -460,6 +527,8 @@ class MemAccess(object):

     """

+    # FIXME: This could be done much more briefly by inheriting from Record.
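+
+    # These descriptors serve as keys in the ToCountMap returned by
+    # get_mem_access_map; ToCountMap.with_set_attributes relies on the
+    # copy() method below to stamp a direction ('load' or 'store') onto
+    # already-counted accesses.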
+ def __init__(self, mtype=None, dtype=None, stride=None, direction=None, variable=None): self.mtype = mtype @@ -482,6 +551,16 @@ class MemAccess(object): raise NotImplementedError("MemAccess: variable must be None when " "mtype is 'local'") + def copy(self, mtype=None, dtype=None, stride=None, direction=None, + variable=None): + return MemAccess( + mtype=mtype if mtype is not None else self.mtype, + dtype=dtype if dtype is not None else self.dtype, + stride=stride if stride is not None else self.stride, + direction=direction if direction is not None else self.direction, + variable=variable if variable is not None else self.variable, + ) + def __eq__(self, other): return isinstance(other, MemAccess) and ( (self.mtype is None or other.mtype is None or @@ -522,11 +601,70 @@ class MemAccess(object): return "MemAccess(" + mtype + ", " + dtype + ", " + stride + ", " \ + direction + ", " + variable + ")" +# }}} -# {{{ ExpressionOpCounter -class ExpressionOpCounter(CombineMapper): +# {{{ counter base +class CounterBase(CombineMapper): + def __init__(self, knl): + self.knl = knl + from loopy.type_inference import TypeInferenceMapper + self.type_inf = TypeInferenceMapper(knl) + + def combine(self, values): + return sum(values) + + def map_constant(self, expr): + return ToCountMap() + + def map_call(self, expr): + return self.rec(expr.parameters) + + def map_sum(self, expr): + if expr.children: + return sum(self.rec(child) for child in expr.children) + else: + return ToCountMap() + + map_product = map_sum + + def map_comparison(self, expr): + return self.rec(expr.left)+self.rec(expr.right) + + def map_if(self, expr): + warn_with_kernel(self.knl, "summing_if_branches", + "%s counting sum of if-expression branches." + % type(self).__name__) + return self.rec(expr.condition) + self.rec(expr.then) \ + + self.rec(expr.else_) + + def map_if_positive(self, expr): + warn_with_kernel(self.knl, "summing_if_branches", + "%s counting sum of if-expression branches." 
+ % type(self).__name__) + return self.rec(expr.criterion) + self.rec(expr.then) \ + + self.rec(expr.else_) + + def map_common_subexpression(self, expr): + raise RuntimeError("%s encountered %s--not supposed to happen" + % (type(self).__name__, type(expr).__name__)) + + map_substitution = map_common_subexpression + map_derivative = map_common_subexpression + map_slice = map_common_subexpression + + # preprocessing should have removed these + def map_reduction(self, expr): + raise RuntimeError("%s encountered %s--not supposed to happen" + % (type(self).__name__, type(expr).__name__)) + +# }}} + + +# {{{ ExpressionOpCounter + +class ExpressionOpCounter(CounterBase): def __init__(self, knl): self.knl = knl from loopy.type_inference import TypeInferenceMapper @@ -641,106 +779,59 @@ class ExpressionOpCounter(CombineMapper): # }}} -# {{{ LocalSubscriptCounter +class MemAccessCounter(CounterBase): + pass -class LocalSubscriptCounter(CombineMapper): - def __init__(self, knl): - self.knl = knl - from loopy.type_inference import TypeInferenceMapper - self.type_inf = TypeInferenceMapper(knl) +# {{{ LocalMemAccessCounter - def combine(self, values): - return sum(values) - - def map_constant(self, expr): - return ToCountMap() - - map_tagged_variable = map_constant - map_variable = map_constant - - def map_call(self, expr): - return self.rec(expr.parameters) - - def map_subscript(self, expr): +class LocalMemAccessCounter(MemAccessCounter): + def count_var_access(self, dtype, name, subscript): sub_map = ToCountMap() - name = expr.aggregate.name # name of array if name in self.knl.temporary_variables: array = self.knl.temporary_variables[name] if array.is_local: - sub_map[MemAccess(mtype='local', dtype=self.type_inf(expr))] = 1 - return sub_map + self.rec(expr.index) - - def map_sum(self, expr): - if expr.children: - return sum(self.rec(child) for child in expr.children) - else: - return ToCountMap() - - map_product = map_sum + sub_map[MemAccess(mtype='local', dtype=dtype)] = 1 + return sub_map - def map_comparison(self, expr): - return self.rec(expr.left)+self.rec(expr.right) - - def map_if(self, expr): - warn_with_kernel(self.knl, "summing_if_branches_lsubs", - "LocalSubscriptCounter counting LMEM accesses as sum " - "of if-statement branches.") - return self.rec(expr.condition) + self.rec(expr.then) \ - + self.rec(expr.else_) - - def map_if_positive(self, expr): - warn_with_kernel(self.knl, "summing_ifpos_branches_lsubs", - "LocalSubscriptCounter counting LMEM accesses as sum " - "of if_pos-statement branches.") - return self.rec(expr.criterion) + self.rec(expr.then) \ - + self.rec(expr.else_) - - def map_common_subexpression(self, expr): - raise NotImplementedError("LocalSubscriptCounter encountered " - "common_subexpression, " - "map_common_subexpression not implemented.") - - def map_substitution(self, expr): - raise NotImplementedError("LocalSubscriptCounter encountered " - "substitution, " - "map_substitution not implemented.") + def map_variable(self, expr): + return self.count_var_access( + self.type_inf(expr), expr.name, None) - def map_derivative(self, expr): - raise NotImplementedError("LocalSubscriptCounter encountered " - "derivative, " - "map_derivative not implemented.") + map_tagged_variable = map_variable - def map_slice(self, expr): - raise NotImplementedError("LocalSubscriptCounter encountered slice, " - "map_slice not implemented.") + def map_subscript(self, expr): + return ( + self.count_var_access( + self.type_inf(expr), expr.aggregate.name, expr.index) + + self.rec(expr.index)) # 
}}}


-# {{{ GlobalSubscriptCounter

-class GlobalSubscriptCounter(CombineMapper):
+# {{{ GlobalMemAccessCounter

-    def __init__(self, knl):
-        self.knl = knl
-        from loopy.type_inference import TypeInferenceMapper
-        self.type_inf = TypeInferenceMapper(knl)
-
-    def combine(self, values):
-        return sum(values)
+class GlobalMemAccessCounter(MemAccessCounter):
+    def map_variable(self, expr):
+        name = expr.name

-    def map_constant(self, expr):
-        return ToCountMap()
+        if name in self.knl.arg_dict:
+            array = self.knl.arg_dict[name]
+        else:
+            # this is a temporary variable
+            return ToCountMap()

-    map_tagged_variable = map_constant
-    map_variable = map_constant
+        if not isinstance(array, lp.GlobalArg):
+            # this array is not in global memory
+            return ToCountMap()

-    def map_call(self, expr):
-        return self.rec(expr.parameters)
+        # a bare variable reference carries no index expression to
+        # recurse into (unlike map_subscript below)
+        return ToCountMap({MemAccess(mtype='global',
+                                     dtype=self.type_inf(expr), stride=0,
+                                     variable=name): 1})

     def map_subscript(self, expr):
-        name = expr.aggregate.name    # name of array
+        name = expr.aggregate.name

         if name in self.knl.arg_dict:
             array = self.knl.arg_dict[name]
@@ -827,47 +918,6 @@ class GlobalSubscriptCounter(CombineMapper):
                                      stride=total_stride, variable=name): 1}
                           ) + self.rec(expr.index)

-    def map_sum(self, expr):
-        if expr.children:
-            return sum(self.rec(child) for child in expr.children)
-        else:
-            return ToCountMap()
-
-    map_product = map_sum
-
-    def map_if(self, expr):
-        warn_with_kernel(self.knl, "summing_if_branches_gsubs",
-                         "GlobalSubscriptCounter counting GMEM accesses as "
-                         "sum of if-statement branches.")
-        return self.rec(expr.condition) + self.rec(expr.then) \
-               + self.rec(expr.else_)
-
-    def map_if_positive(self, expr):
-        warn_with_kernel(self.knl, "summing_ifpos_branches_gsubs",
-                         "GlobalSubscriptCounter counting GMEM accesses as "
-                         "sum of if_pos-statement branches.")
-        return self.rec(expr.criterion) + self.rec(expr.then) \
-               + self.rec(expr.else_)
-
-    def map_common_subexpression(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered "
-                                  "common_subexpression, "
-                                  "map_common_subexpression not implemented.")
-
-    def map_substitution(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered "
-                                  "substitution, "
-                                  "map_substitution not implemented.")
-
-    def map_derivative(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered "
-                                  "derivative, "
-                                  "map_derivative not implemented.")
-
-    def map_slice(self, expr):
-        raise NotImplementedError("GlobalSubscriptCounter encountered slice, "
-                                  "map_slice not implemented.")
-
 # }}}


@@ -940,9 +990,13 @@ class AccessFootprintGatherer(CombineMapper):

 # {{{ count

-def count(kernel, set):
+def add_assumptions_guard(kernel, pwqpolynomial):
+    return GuardedPwQPolynomial(pwqpolynomial, kernel.assumptions)
+
+
+def count(kernel, set, space=None):
     try:
-        return set.card()
+        return add_assumptions_guard(kernel, set.card())
     except AttributeError:
         pass

@@ -969,7 +1023,11 @@ def count(kernel, set):
         if stride is None:
             stride = 1

-        length = isl.PwQPolynomial.from_pw_aff(dmax - dmin + stride)
+        length_pwaff = dmax - dmin + stride
+        if space is not None:
+            length_pwaff = length_pwaff.align_params(space)
+
+        length = isl.PwQPolynomial.from_pw_aff(length_pwaff)
         length = length.scale_down_val(stride)

         if bset_count is None:
@@ -1029,46 +1087,102 @@ def count(kernel, set):
                     "number of integer points in your loop "
                     "domain.")

-    return count
+    return add_assumptions_guard(kernel, count)

-# }}}

+def get_unused_hw_axes_factor(knl, insn,
disregard_local_axes, space=None): + # FIXME: Multi-kernel support + gsize, lsize = knl.get_grid_size_upper_bounds() -# {{{ get_op_poly + g_used = set() + l_used = set() -def get_op_poly(knl, numpy_types=True): + from loopy.kernel.data import LocalIndexTag, GroupIndexTag + for iname in knl.insn_inames(insn): + tag = knl.iname_to_tag.get(iname) - """Count the number of operations in a loopy kernel. + if isinstance(tag, LocalIndexTag): + l_used.add(tag.axis) + elif isinstance(tag, GroupIndexTag): + g_used.add(tag.axis) - get_op_poly is deprecated. Use get_op_map instead. + def mult_grid_factor(used_axes, size): + result = 1 + for iaxis, size in enumerate(size): + if iaxis not in used_axes: + if not isinstance(size, int): + if space is not None: + size = size.align_params(space) + + size = isl.PwQPolynomial.from_pw_aff(size) + + result = result * size + + return result + + if disregard_local_axes: + result = mult_grid_factor(g_used, gsize) + else: + result = mult_grid_factor(g_used, gsize) * mult_grid_factor(l_used, lsize) + + return add_assumptions_guard(knl, result) - """ - warn_with_kernel(knl, "depricated_get_op_poly", - "get_op_poly is deprecated. Use get_op_map instead.") - return get_op_map(knl, numpy_types) + +def count_insn_runs(knl, insn, count_redundant_work, disregard_local_axes=False): + insn_inames = knl.insn_inames(insn) + + if disregard_local_axes: + from loopy.kernel.data import LocalIndexTag + insn_inames = [iname for iname in insn_inames if not + isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)] + + inames_domain = knl.get_inames_domain(insn_inames) + domain = (inames_domain.project_out_except( + insn_inames, [dim_type.set])) + + space = isl.Space.create_from_names(isl.DEFAULT_CONTEXT, + set=[], params=knl.outer_params()) + + c = count(knl, domain, space=space) + + if count_redundant_work: + unused_fac = get_unused_hw_axes_factor(knl, insn, + disregard_local_axes=disregard_local_axes, + space=space) + return c * unused_fac + else: + return c # }}} -def get_op_map(knl, numpy_types=True): +# {{{ get_op_map + +def get_op_map(knl, numpy_types=True, count_redundant_work=False): """Count the number of operations in a loopy kernel. - :parameter knl: A :class:`loopy.LoopKernel` whose operations are to be counted. + :arg knl: A :class:`loopy.LoopKernel` whose operations are to be counted. - :parameter numpy_types: A :class:`bool` specifying whether the types - in the returned mapping should be numpy types - instead of :class:`loopy.LoopyType`. + :arg numpy_types: A :class:`bool` specifying whether the types + in the returned mapping should be numpy types + instead of :class:`loopy.LoopyType`. + + :arg count_redundant_work: Based on usage of hardware axes or other + specifics, a kernel may perform work redundantly. This :class:`bool` + flag indicates whether this work should be included in the count. + (Likely desirable for performance modeling, but undesirable for + code optimization.) :return: A :class:`ToCountMap` of **{** :class:`Op` **:** - :class:`islpy.PwQPolynomial` **}**. + :class:`islpy.PwQPolynomial` **}**. - - The :class:`Op` specifies the characteristics of the arithmetic - operation. + - The :class:`Op` specifies the characteristics of the arithmetic + operation. - - The :class:`islpy.PwQPolynomial` holds the number of operations of - the kind specified in the key (in terms of the - :class:`loopy.LoopKernel` parameter *inames*). 
+ - The :class:`islpy.PwQPolynomial` holds the number of operations of + the kind specified in the key (in terms of the + :class:`loopy.LoopKernel` parameter *inames*). Example usage:: @@ -1090,14 +1204,10 @@ def get_op_map(knl, numpy_types=True): op_map = ToCountMap() op_counter = ExpressionOpCounter(knl) for insn in knl.instructions: - # how many times is this instruction executed? - # check domain size: - insn_inames = knl.insn_inames(insn) - inames_domain = knl.get_inames_domain(insn_inames) - domain = (inames_domain.project_out_except( - insn_inames, [dim_type.set])) ops = op_counter(insn.assignee) + op_counter(insn.expression) - op_map = op_map + ops*count(knl, domain) + op_map = op_map + ops*count_insn_runs( + knl, insn, + count_redundant_work=count_redundant_work) if numpy_types: op_map.count_map = dict((Op(dtype=op.dtype.numpy_dtype, name=op.name), @@ -1106,73 +1216,36 @@ def get_op_map(knl, numpy_types=True): return op_map - -#TODO test deprecated functions? -def get_lmem_access_poly(knl): - """Count the number of local memory accesses in a loopy kernel. - - get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the - result with the mtype=['local'] option. - - """ - warn_with_kernel(knl, "depricated_get_lmem_access_poly", - "get_lmem_access_poly is deprecated. Use " - "get_mem_access_map and filter the result with the " - "mtype=['local'] option.") - return get_mem_access_map(knl).filter_by(mtype=['local']) - - -def get_DRAM_access_poly(knl): - """Count the number of global memory accesses in a loopy kernel. - - get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the - result with the mtype=['global'] option. - - """ - warn_with_kernel(knl, "depricated_get_DRAM_access_poly", - "get_DRAM_access_poly is deprecated. Use " - "get_mem_access_map and filter the result with the " - "mtype=['global'] option.") - return get_mem_access_map(knl).filter_by(mtype=['global']) - - -# {{{ get_gmem_access_poly - -def get_gmem_access_poly(knl): - """Count the number of global memory accesses in a loopy kernel. - - get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the - result with the mtype=['global'] option. - - """ - warn_with_kernel(knl, "depricated_get_gmem_access_poly", - "get_DRAM_access_poly is deprecated. Use " - "get_mem_access_map and filter the result with the " - "mtype=['global'] option.") - return get_mem_access_map(knl).filter_by(mtype=['global']) - # }}} -def get_mem_access_map(knl, numpy_types=True): +# {{{ get_mem_access_map + +def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False): """Count the number of memory accesses in a loopy kernel. - :parameter knl: A :class:`loopy.LoopKernel` whose memory accesses are to be - counted. + :arg knl: A :class:`loopy.LoopKernel` whose memory accesses are to be + counted. - :parameter numpy_types: A :class:`bool` specifying whether the types - in the returned mapping should be numpy types - instead of :class:`loopy.LoopyType`. + :arg numpy_types: A :class:`bool` specifying whether the types + in the returned mapping should be numpy types + instead of :class:`loopy.LoopyType`. + + :arg count_redundant_work: Based on usage of hardware axes or other + specifics, a kernel may perform work redundantly. This :class:`bool` + flag indicates whether this work should be included in the count. + (Likely desirable for performance modeling, but undesirable for + code optimization.) :return: A :class:`ToCountMap` of **{** :class:`MemAccess` **:** - :class:`islpy.PwQPolynomial` **}**. 
+ :class:`islpy.PwQPolynomial` **}**. - - The :class:`MemAccess` specifies the characteristics of the - memory access. + - The :class:`MemAccess` specifies the characteristics of the + memory access. - - The :class:`islpy.PwQPolynomial` holds the number of memory - accesses with the characteristics specified in the key (in terms - of the :class:`loopy.LoopKernel` *inames*). + - The :class:`islpy.PwQPolynomial` holds the number of memory + accesses with the characteristics specified in the key (in terms + of the :class:`loopy.LoopKernel` *inames*). Example usage:: @@ -1217,102 +1290,74 @@ def get_mem_access_map(knl, numpy_types=True): cache_holder = CacheHolder() @memoize_in(cache_holder, "insn_count") - def get_insn_count(knl, insn_inames, uniform=False): - if uniform: - from loopy.kernel.data import LocalIndexTag - insn_inames = [iname for iname in insn_inames if not - isinstance(knl.iname_to_tag.get(iname), LocalIndexTag)] - inames_domain = knl.get_inames_domain(insn_inames) - domain = (inames_domain.project_out_except( - insn_inames, [dim_type.set])) - return count(knl, domain) + def get_insn_count(knl, insn_id, uniform=False): + insn = knl.id_to_insn[insn_id] + return count_insn_runs( + knl, insn, disregard_local_axes=uniform, + count_redundant_work=count_redundant_work) knl = infer_unknown_types(knl, expect_completion=True) knl = preprocess_kernel(knl) - subs_map = ToCountMap() - subs_counter_g = GlobalSubscriptCounter(knl) - subs_counter_l = LocalSubscriptCounter(knl) + access_map = ToCountMap() + access_counter_g = GlobalMemAccessCounter(knl) + access_counter_l = LocalMemAccessCounter(knl) for insn in knl.instructions: - # count subscripts - subs_expr = subs_counter_g(insn.expression) \ - + subs_counter_l(insn.expression) - - # distinguish loads and stores - for key in subs_expr.count_map: - subs_expr[MemAccess(mtype=key.mtype, dtype=key.dtype, - stride=key.stride, direction='load', - variable=key.variable) - ] = subs_expr.pop(key) - - subs_assignee_g = subs_counter_g(insn.assignee) - for key in subs_assignee_g.count_map: - subs_assignee_g[MemAccess(mtype=key.mtype, dtype=key.dtype, - stride=key.stride, - direction='store', - variable=key.variable) - ] = subs_assignee_g.pop(key) - # for now, don't count writes to local mem - - insn_inames = knl.insn_inames(insn) + access_expr = ( + access_counter_g(insn.expression) + + access_counter_l(insn.expression) + ).with_set_attributes(direction="load") + + access_assignee_g = access_counter_g(insn.assignee).with_set_attributes( + direction="store") + + # FIXME: (!!!!) 
for now, don't count writes to local mem # use count excluding local index tags for uniform accesses - for key in subs_expr.count_map: - map = ToCountMap({key: subs_expr[key]}) - if (key.mtype == 'global' and + for key, val in six.iteritems(access_expr.count_map): + is_uniform = (key.mtype == 'global' and isinstance(key.stride, int) and - key.stride == 0): - subs_map = subs_map \ - + map*get_insn_count(knl, insn_inames, True) - else: - subs_map = subs_map + map*get_insn_count(knl, insn_inames) - #currently not counting stride of local mem access - - for key in subs_assignee_g.count_map: - map = ToCountMap({key: subs_assignee_g[key]}) - if isinstance(key.stride, int) and key.stride == 0: - subs_map = subs_map \ - + map*get_insn_count(knl, insn_inames, True) - else: - subs_map = subs_map + map*get_insn_count(knl, insn_inames) + key.stride == 0) + access_map = ( + access_map + + ToCountMap({key: val}) + * get_insn_count(knl, insn.id, is_uniform)) + #currently not counting stride of local mem access + + for key, val in six.iteritems(access_assignee_g.count_map): + is_uniform = (key.mtype == 'global' and + isinstance(key.stride, int) and + key.stride == 0) + access_map = ( + access_map + + ToCountMap({key: val}) + * get_insn_count(knl, insn.id, is_uniform)) # for now, don't count writes to local mem if numpy_types: - subs_map.count_map = dict((MemAccess(mtype=mem_access.mtype, + # FIXME: Don't modify in-place + access_map.count_map = dict((MemAccess(mtype=mem_access.mtype, dtype=mem_access.dtype.numpy_dtype, stride=mem_access.stride, direction=mem_access.direction, variable=mem_access.variable), count) - for mem_access, count in six.iteritems(subs_map.count_map)) - - return subs_map - + for mem_access, count in six.iteritems(access_map.count_map)) -# {{{ get_synchronization_poly - -def get_synchronization_poly(knl): - """Count the number of synchronization events each thread encounters in a - loopy kernel. - - get_synchronization_poly is deprecated. Use get_synchronization_map instead. - - """ - warn_with_kernel(knl, "depricated_get_synchronization_poly", - "get_synchronization_poly is deprecated. Use " - "get_synchronization_map instead.") - return get_synchronization_map(knl) + return access_map # }}} +# {{{ get_synchronization_map + def get_synchronization_map(knl): """Count the number of synchronization events each thread encounters in a loopy kernel. - :parameter knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. + :arg knl: A :class:`loopy.LoopKernel` whose barriers are to be counted. :return: A dictionary mapping each type of synchronization event to a :class:`islpy.PwQPolynomial` holding the number of events per @@ -1379,9 +1424,10 @@ def get_synchronization_map(knl): raise LoopyError("unexpected schedule item: %s" % type(sched_item).__name__) - #return result.count_map #TODO is this change okay? return result +# }}} + # {{{ gather_access_footprints @@ -1477,4 +1523,74 @@ def gather_access_footprint_bytes(kernel, ignore_uncountable=False): # }}} + +# {{{ compat goop + +def get_lmem_access_poly(knl): + """Count the number of local memory accesses in a loopy kernel. + + get_lmem_access_poly is deprecated. Use get_mem_access_map and filter the + result with the mtype=['local'] option. + + """ + warn_with_kernel(knl, "deprecated_get_lmem_access_poly", + "get_lmem_access_poly is deprecated. 
Use " + "get_mem_access_map and filter the result with the " + "mtype=['local'] option.") + return get_mem_access_map(knl).filter_by(mtype=['local']) + + +def get_DRAM_access_poly(knl): + """Count the number of global memory accesses in a loopy kernel. + + get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the + result with the mtype=['global'] option. + + """ + warn_with_kernel(knl, "deprecated_get_DRAM_access_poly", + "get_DRAM_access_poly is deprecated. Use " + "get_mem_access_map and filter the result with the " + "mtype=['global'] option.") + return get_mem_access_map(knl).filter_by(mtype=['global']) + + +def get_gmem_access_poly(knl): + """Count the number of global memory accesses in a loopy kernel. + + get_DRAM_access_poly is deprecated. Use get_mem_access_map and filter the + result with the mtype=['global'] option. + + """ + warn_with_kernel(knl, "deprecated_get_gmem_access_poly", + "get_DRAM_access_poly is deprecated. Use " + "get_mem_access_map and filter the result with the " + "mtype=['global'] option.") + return get_mem_access_map(knl).filter_by(mtype=['global']) + + +def get_synchronization_poly(knl): + """Count the number of synchronization events each thread encounters in a + loopy kernel. + + get_synchronization_poly is deprecated. Use get_synchronization_map instead. + + """ + warn_with_kernel(knl, "deprecated_get_synchronization_poly", + "get_synchronization_poly is deprecated. Use " + "get_synchronization_map instead.") + return get_synchronization_map(knl) + + +def get_op_poly(knl, numpy_types=True): + """Count the number of operations in a loopy kernel. + + get_op_poly is deprecated. Use get_op_map instead. + + """ + warn_with_kernel(knl, "deprecated_get_op_poly", + "get_op_poly is deprecated. Use get_op_map instead.") + return get_op_map(knl, numpy_types) + +# }}} + # vim: foldmethod=marker diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 8f371085e0f1655651397c16873f10a95a799f79..f24b115fd5a35af94e4a6d437550bccf86b5bee0 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -335,6 +335,8 @@ class PyOpenCLTarget(OpenCLTarget): % dev_id) def preprocess(self, kernel): + if self.device is not None: + kernel = adjust_local_temp_var_storage(kernel, self.device) return kernel def pre_codegen_check(self, kernel): diff --git a/loopy/target/pyopencl_execution.py b/loopy/target/pyopencl_execution.py index a8f47adb991e331f8a473c4eb14b1ea634c7a3b1..2da25ba39ceef38a4af105913973226bd3773729 100644 --- a/loopy/target/pyopencl_execution.py +++ b/loopy/target/pyopencl_execution.py @@ -25,6 +25,7 @@ THE SOFTWARE. 
import six from six.moves import range, zip +import numpy as np from pytools import ImmutableRecord, memoize_method from loopy.diagnostic import ParameterFinderWarning from pytools.py_codegen import ( @@ -686,8 +687,17 @@ class PyOpenCLKernelExecutor(KernelExecutorBase): # {{{ debugging aids def get_code(self, arg_to_dtype=None): + def process_dtype(dtype): + if isinstance(dtype, type) and issubclass(dtype, np.generic): + dtype = np.dtype(dtype) + if isinstance(dtype, np.dtype): + dtype = NumpyType(dtype, self.kernel.target) + + return dtype + if arg_to_dtype is not None: - arg_to_dtype = frozenset(six.iteritems(arg_to_dtype)) + arg_to_dtype = frozenset( + (k, process_dtype(v)) for k, v in six.iteritems(arg_to_dtype)) kernel = self.get_typed_and_scheduled_kernel(arg_to_dtype) diff --git a/loopy/target/python.py b/loopy/target/python.py index 99ec42f44b49f546cda324dfdb3c6a5b001d2222..11951abcf17e94c0fdba51042e3060735215b423 100644 --- a/loopy/target/python.py +++ b/loopy/target/python.py @@ -133,15 +133,17 @@ class ExpressionToPythonMapper(StringifyMapper): def map_if(self, expr, enclosing_prec): # Synthesize PREC_IFTHENELSE, make sure it is in the right place in the # operator precedence hierarchy (right above "or"). - from pymbolic.mapper.stringifier import PREC_LOGICAL_OR, PREC_NONE + from pymbolic.mapper.stringifier import PREC_LOGICAL_OR PREC_IFTHENELSE = PREC_LOGICAL_OR - 1 # noqa return self.parenthesize_if_needed( "{then} if {cond} else {else_}".format( - then=self.rec(expr.then, PREC_IFTHENELSE), - cond=self.rec(expr.condition, PREC_IFTHENELSE), - else_=self.rec(expr.else_, PREC_IFTHENELSE)), - enclosing_prec, PREC_NONE) + # "1 if 0 if 1 else 2 else 3" is not valid Python. + # So force parens by using an artificially higher precedence. + then=self.rec(expr.then, PREC_LOGICAL_OR), + cond=self.rec(expr.condition, PREC_LOGICAL_OR), + else_=self.rec(expr.else_, PREC_LOGICAL_OR)), + enclosing_prec, PREC_IFTHENELSE) # }}} @@ -257,7 +259,7 @@ class PythonASTBuilderBase(ASTBuilderBase): lbound, ubound, inner): ecm = codegen_state.expression_to_code_mapper - from pymbolic.mapper.stringifier import PREC_NONE + from pymbolic.mapper.stringifier import PREC_NONE, PREC_SUM from genpy import For return For( @@ -265,7 +267,7 @@ class PythonASTBuilderBase(ASTBuilderBase): "range(%s, %s + 1)" % ( ecm(lbound, PREC_NONE, "i"), - ecm(ubound, PREC_NONE, "i"), + ecm(ubound, PREC_SUM, "i"), ), inner) diff --git a/loopy/version.py b/loopy/version.py index 8516ce006bde8b8616172a72a766ec86dfcd44f1..02244f55d0dbf207a4641c3ebf6cc33b536f0421 100644 --- a/loopy/version.py +++ b/loopy/version.py @@ -32,4 +32,4 @@ except ImportError: else: _islpy_version = islpy.version.VERSION_TEXT -DATA_MODEL_VERSION = "v63-islpy%s" % _islpy_version +DATA_MODEL_VERSION = "v64-islpy%s" % _islpy_version diff --git a/test/test_loopy.py b/test/test_loopy.py index ad5fd72b65b9946156d1067aabdb4ff510d6ec63..3ac857478bf4ac1d4cd6868f22896ca63de34f04 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2184,6 +2184,41 @@ def test_barrier_insertion_near_bottom_of_loop(): assert_barrier_between(knl, "ainit", "aupdate", ignore_barriers_in_levels=[1]) +def test_multi_argument_reduction_type_inference(): + from loopy.type_inference import TypeInferenceMapper + from loopy.library.reduction import SegmentedSumReductionOperation + from loopy.types import to_loopy_type + op = SegmentedSumReductionOperation() + + knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=j<i}", "") + + int32 = to_loopy_type(np.int32) + + expr = lp.symbolic.Reduction( + 
operation=op, + inames=("i",), + expr=lp.symbolic.Reduction( + operation=op, + inames="j", + expr=(1, 2), + allow_simultaneous=True), + allow_simultaneous=True) + + t_inf_mapper = TypeInferenceMapper(knl) + + assert ( + t_inf_mapper(expr, return_tuple=True, return_dtype_set=True) + == [(int32, int32)]) + + +def test_multi_argument_reduction_parsing(): + from loopy.symbolic import parse, Reduction + + assert isinstance( + parse("reduce(argmax, i, reduce(argmax, j, i, j))").expr, + Reduction) + + def test_global_barrier_order_finding(): knl = lp.make_kernel( "{[i,itrip]: 0<=i<n and 0<=itrip<ntrips}", @@ -2228,41 +2263,6 @@ def test_global_barrier_error_if_unordered(): lp.get_global_barrier_order(knl) -def test_multi_argument_reduction_type_inference(): - from loopy.type_inference import TypeInferenceMapper - from loopy.library.reduction import SegmentedSumReductionOperation - from loopy.types import to_loopy_type - op = SegmentedSumReductionOperation() - - knl = lp.make_kernel("{[i,j]: 0<=i<10 and 0<=j<i}", "") - - int32 = to_loopy_type(np.int32) - - expr = lp.symbolic.Reduction( - operation=op, - inames=("i",), - expr=lp.symbolic.Reduction( - operation=op, - inames="j", - expr=(1, 2), - allow_simultaneous=True), - allow_simultaneous=True) - - t_inf_mapper = TypeInferenceMapper(knl) - - assert ( - t_inf_mapper(expr, return_tuple=True, return_dtype_set=True) - == [(int32, int32)]) - - -def test_multi_argument_reduction_parsing(): - from loopy.symbolic import parse, Reduction - - assert isinstance( - parse("reduce(argmax, i, reduce(argmax, j, i, j))").expr, - Reduction) - - def test_struct_assignment(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) diff --git a/test/test_scan.py b/test/test_scan.py new file mode 100644 index 0000000000000000000000000000000000000000..08754819c9a156403aba689cb3e9c238144e7905 --- /dev/null +++ b/test/test_scan.py @@ -0,0 +1,432 @@ +from __future__ import division, absolute_import, print_function + +__copyright__ = """ +Copyright (C) 2012 Andreas Kloeckner +Copyright (C) 2016, 2017 Matt Wala +""" + +__license__ = """ +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+""" + +import sys +import numpy as np +import loopy as lp +import pyopencl as cl +import pyopencl.clmath # noqa +import pyopencl.clrandom # noqa +import pytest + +import logging +logger = logging.getLogger(__name__) + +try: + import faulthandler +except ImportError: + pass +else: + faulthandler.enable() + +from pyopencl.tools import pytest_generate_tests_for_pyopencl \ + as pytest_generate_tests + +__all__ = [ + "pytest_generate_tests", + "cl" # 'cl.create_some_context' + ] + + +# More things to test. +# - scan(a) + scan(b) +# - test for badly tagged inames + +@pytest.mark.parametrize("n", [1, 2, 3, 16]) +@pytest.mark.parametrize("stride", [1, 2]) +def test_sequential_scan(ctx_factory, n, stride): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "[n] -> {[i,j]: 0<=i<n and 0<=j<=%d*i}" % stride, + """ + a[i] = sum(j, j**2) + """ + ) + + knl = lp.fix_parameters(knl, n=n) + knl = lp.realize_reduction(knl, force_scan=True) + + evt, (a,) = knl(queue) + + assert (a.get() == np.cumsum(np.arange(stride*n)**2)[::stride]).all() + + +@pytest.mark.parametrize("sweep_lbound, scan_lbound", [ + (4, 0), + (3, 1), + (2, 2), + (1, 3), + (0, 4), + (5, -1), + ]) +def test_scan_with_different_lower_bound_from_sweep( + ctx_factory, sweep_lbound, scan_lbound): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "[n, sweep_lbound, scan_lbound] -> " + "{[i,j]: sweep_lbound<=i<n+sweep_lbound " + "and scan_lbound<=j<=2*(i-sweep_lbound)+scan_lbound}", + """ + out[i-sweep_lbound] = sum(j, j**2) + """ + ) + + n = 10 + + knl = lp.fix_parameters(knl, sweep_lbound=sweep_lbound, scan_lbound=scan_lbound) + knl = lp.realize_reduction(knl, force_scan=True) + evt, (out,) = knl(queue, n=n) + + assert (out.get() + == np.cumsum(np.arange(scan_lbound, 2*n+scan_lbound)**2)[::2]).all() + + +def test_automatic_scan_detection(): + knl = lp.make_kernel( + [ + "[n] -> {[i]: 0<=i<n}", + "{[j]: 0<=j<=2*i}" + ], + """ + a[i] = sum(j, j**2) + """ + ) + + cgr = lp.generate_code_v2(knl) + assert "scan" not in cgr.device_code() + + +def test_selective_scan_realization(): + pass + + +def test_force_outer_iname_for_scan(): + knl = lp.make_kernel( + "[n] -> {[i,j,k]: 0<=k<n and 0<=i<=k and 0<=j<=i}", + "out[i] = product(j, a[j]) {inames=i:k}") + + knl = lp.add_dtypes(knl, dict(a=np.float32)) + + # TODO: Maybe this deserves to work? 
+ with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError): + lp.realize_reduction(knl, force_scan=True) + + knl = lp.realize_reduction(knl, force_scan=True, force_outer_iname_for_scan="i") + + +def test_dependent_domain_scan(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + [ + "[n] -> {[i]: 0<=i<n}", + "{[j]: 0<=j<=2*i}" + ], + """ + a[i] = sum(j, j**2) {id=scan} + """ + ) + knl = lp.realize_reduction(knl, force_scan=True) + evt, (a,) = knl(queue, n=100) + + assert (a.get() == np.cumsum(np.arange(200)**2)[::2]).all() + + +@pytest.mark.parametrize("i_tag, j_tag", [ + ("for", "for") + ]) +def test_nested_scan(ctx_factory, i_tag, j_tag): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + [ + "[n] -> {[i]: 0 <= i < n}", + "[i] -> {[j]: 0 <= j <= i}", + "[i] -> {[k]: 0 <= k <= i}" + ], + """ + <>tmp[i] = sum(k, 1) + out[i] = sum(j, tmp[j]) + """) + + knl = lp.fix_parameters(knl, n=10) + knl = lp.tag_inames(knl, dict(i=i_tag, j=j_tag)) + + knl = lp.realize_reduction(knl, force_scan=True) + + print(knl) + + evt, (out,) = knl(queue) + + print(out) + + +def test_scan_not_triangular(): + knl = lp.make_kernel( + "{[i,j]: 0<=i<100 and 1<=j<=2*i}", + """ + a[i] = sum(j, j**2) + """ + ) + + with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError): + knl = lp.realize_reduction(knl, force_scan=True) + + +@pytest.mark.parametrize("n", [1, 2, 3, 16, 17]) +def test_local_parallel_scan(ctx_factory, n): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}", + """ + out[i] = sum(j, a[j]**2) + """, + "..." + ) + + knl = lp.fix_parameters(knl, n=n) + knl = lp.tag_inames(knl, dict(i="l.0")) + knl = lp.realize_reduction(knl, force_scan=True) + + knl = lp.realize_reduction(knl) + + knl = lp.add_dtypes(knl, dict(a=int)) + + print(knl) + + evt, (a,) = knl(queue, a=np.arange(n)) + assert (a == np.cumsum(np.arange(n)**2)).all() + + +def test_local_parallel_scan_with_nonzero_lower_bounds(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "[n] -> {[i,j]: 1<=i<n+1 and 0<=j<=i-1}", + """ + out[i-1] = sum(j, a[j]**2) + """, + "..." 
+ ) + + knl = lp.fix_parameters(knl, n=16) + knl = lp.tag_inames(knl, dict(i="l.0")) + knl = lp.realize_reduction(knl, force_scan=True) + knl = lp.realize_reduction(knl) + + knl = lp.add_dtypes(knl, dict(a=int)) + evt, (out,) = knl(queue, a=np.arange(1, 17)) + + assert (out == np.cumsum(np.arange(1, 17)**2)).all() + + +def test_scan_extra_constraints_on_domain(): + knl = lp.make_kernel( + "{[i,j,k]: 0<=i<n and 0<=j<=i and i=k}", + "out[i] = sum(j, a[j])") + + with pytest.raises(lp.diagnostic.ReductionIsNotTriangularError): + knl = lp.realize_reduction( + knl, force_scan=True, force_outer_iname_for_scan="i") + + +@pytest.mark.parametrize("sweep_iname_tag", ["for", "l.1"]) +def test_scan_with_outer_parallel_iname(ctx_factory, sweep_iname_tag): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + [ + "{[k]: 0<=k<=1}", + "[n] -> {[i,j]: 0<=i<n and 0<=j<=i}" + ], + "out[k,i] = k + sum(j, j**2)" + ) + + knl = lp.tag_inames(knl, dict(k="l.0", i=sweep_iname_tag)) + n = 10 + knl = lp.fix_parameters(knl, n=n) + knl = lp.realize_reduction(knl, force_scan=True) + + evt, (out,) = knl(queue) + + inner = np.cumsum(np.arange(n)**2) + + assert (out.get() == np.array([inner, 1 + inner])).all() + + +@pytest.mark.parametrize("dtype", [ + np.int32, np.int64, np.float32, np.float64]) +def test_scan_data_types(ctx_factory, dtype): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i,j]: 0<=i<n and 0<=j<=i }", + "res[i] = reduce(sum, j, a[j])", + assumptions="n>=1") + + a = np.random.randn(20).astype(dtype) + knl = lp.add_dtypes(knl, dict(a=dtype)) + knl = lp.realize_reduction(knl, force_scan=True) + evt, (res,) = knl(queue, a=a) + + assert np.allclose(res, np.cumsum(a)) + + +@pytest.mark.parametrize(("op_name", "np_op"), [ + ("sum", np.sum), + ("product", np.prod), + ("min", np.min), + ("max", np.max), + ]) +def test_scan_library(ctx_factory, op_name, np_op): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel( + "{[i,j]: 0<=i<n and 0<=j<=i }", + "res[i] = reduce(%s, j, a[j])" % op_name, + assumptions="n>=1") + + a = np.random.randn(20) + knl = lp.add_dtypes(knl, dict(a=np.float)) + knl = lp.realize_reduction(knl, force_scan=True) + evt, (res,) = knl(queue, a=a) + + assert np.allclose(res, np.array( + [np_op(a[:i+1]) for i in range(len(a))])) + + +def test_scan_unsupported_tags(): + pass + + +@pytest.mark.parametrize("i_tag", ["for", "l.0"]) +def test_argmax(ctx_factory, i_tag): + logging.basicConfig(level=logging.INFO) + + dtype = np.dtype(np.float32) + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + n = 128 + + knl = lp.make_kernel( + "{[i,j]: 0<=i<%d and 0<=j<=i}" % n, + """ + max_vals[i], max_indices[i] = argmax(j, fabs(a[j]), j) + """) + + knl = lp.tag_inames(knl, dict(i=i_tag)) + knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) + knl = lp.realize_reduction(knl, force_scan=True) + + a = np.random.randn(n).astype(dtype) + evt, (max_indices, max_vals) = knl(queue, a=a, out_host=True) + + assert (max_vals == [np.max(np.abs(a)[0:i+1]) for i in range(n)]).all() + assert (max_indices == [np.argmax(np.abs(a[0:i+1])) for i in range(n)]).all() + + +def check_segmented_scan_output(arr, segment_boundaries_indices, out): + class SegmentGrouper(object): + + def __init__(self): + self.seg_idx = 0 + self.idx = 0 + + def __call__(self, key): + if self.idx in segment_boundaries_indices: + self.seg_idx += 1 + self.idx += 1 + return self.seg_idx + + from itertools import groupby + + expected = [np.cumsum(list(group)) + for _, 
group in groupby(arr, SegmentGrouper())] + actual = [np.array(list(group)) + for _, group in groupby(out, SegmentGrouper())] + + assert len(expected) == len(actual) == len(segment_boundaries_indices) + assert [(e == a).all() for e, a in zip(expected, actual)] + + +@pytest.mark.parametrize("n, segment_boundaries_indices", [ + (1, (0,)), + (2, (0,)), + (2, (0, 1)), + (3, (0,)), + (3, (0, 1)), + (3, (0, 2)), + (3, (0, 1, 2)), + (16, (0, 4, 8, 12))]) +@pytest.mark.parametrize("iname_tag", ("for", "l.0")) +def test_segmented_scan(ctx_factory, n, segment_boundaries_indices, iname_tag): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + arr = np.ones(n, dtype=np.float32) + segment_boundaries = np.zeros(n, dtype=np.int32) + segment_boundaries[(segment_boundaries_indices,)] = 1 + + knl = lp.make_kernel( + "{[i,j]: 0<=i<n and 0<=j<=i}", + "out[i], <>_ = reduce(segmented(sum), j, arr[j], segflag[j])", + [ + lp.GlobalArg("arr", np.float32, shape=("n",)), + lp.GlobalArg("segflag", np.int32, shape=("n",)), + "..." + ]) + + knl = lp.fix_parameters(knl, n=n) + knl = lp.tag_inames(knl, dict(i=iname_tag)) + knl = lp.realize_reduction(knl, force_scan=True) + + (evt, (out,)) = knl(queue, arr=arr, segflag=segment_boundaries) + + check_segmented_scan_output(arr, segment_boundaries_indices, out) + + +if __name__ == "__main__": + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) + +# vim: foldmethod=marker diff --git a/test/test_statistics.py b/test/test_statistics.py index 5e363f13594ee8e4cf170faa232b0783cca9d018..a72b62af90050008f837e144f1f28d4a4de1c730 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -49,7 +49,7 @@ def test_op_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -74,7 +74,7 @@ def test_op_counter_reduction(): name="matmul_serial", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -100,7 +100,7 @@ def test_op_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -130,7 +130,7 @@ def test_op_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -166,7 +166,7 @@ def test_op_counter_bitwise(): a=np.int32, b=np.int32, g=np.int64, h=np.int64)) - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -205,7 +205,7 @@ def test_op_counter_triangular_domain(): else: expect_fallback = False - op_map = lp.get_op_map(knl)[lp.Op(np.float64, 'mul')] + op_map = lp.get_op_map(knl, count_redundant_work=True)[lp.Op(np.float64, 'mul')] value_dict = dict(m=13, n=200) flops = op_map.eval_with_dict(value_dict) @@ -229,7 +229,7 @@ def test_mem_access_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ 
-269,7 +269,7 @@ def test_mem_access_counter_reduction(): name="matmul", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -307,7 +307,7 @@ def test_mem_access_counter_logic(): name="logic", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(g=np.float32, h=np.float64)) - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -343,7 +343,7 @@ def test_mem_access_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -395,7 +395,7 @@ def test_mem_access_counter_bitwise(): a=np.int32, b=np.int32, g=np.int32, h=np.int32)) - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -437,11 +437,11 @@ def test_mem_access_counter_mixed(): knl = lp.add_and_infer_dtypes(knl, dict( a=np.float32, b=np.float32, g=np.float64, h=np.float64, x=np.float32)) - threads = 16 - knl = lp.split_iname(knl, "j", threads) + bsize = 16 + knl = lp.split_iname(knl, "j", bsize) knl = lp.tag_inames(knl, {"j_inner": "l.0", "j_outer": "g.0"}) - mem_map = lp.get_mem_access_map(knl) # noqa + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) # noqa n = 512 m = 256 l = 128 @@ -463,8 +463,8 @@ def test_mem_access_counter_mixed(): stride=Variable('m'), direction='load', variable='b') ].eval_with_dict(params) - assert f64uniform == 2*n*m - assert f32uniform == n*m*l/threads + assert f64uniform == 2*n*m*l/bsize + assert f32uniform == n*m*l/bsize assert f32nonconsec == 3*n*m*l f64uniform = mem_map[lp.MemAccess('global', np.float64, @@ -474,7 +474,7 @@ def test_mem_access_counter_mixed(): stride=Variable('m'), direction='store', variable='c') ].eval_with_dict(params) - assert f64uniform == n*m + assert f64uniform == n*m*l/bsize assert f32nonconsec == n*m*l @@ -494,7 +494,7 @@ def test_mem_access_counter_nonconsec(): knl = lp.split_iname(knl, "i", 16) knl = lp.tag_inames(knl, {"i_inner": "l.0", "i_outer": "g.0"}) - mem_map = lp.get_mem_access_map(knl) # noqa + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) # noqa n = 512 m = 256 l = 128 @@ -545,7 +545,7 @@ def test_mem_access_counter_consec(): a=np.float32, b=np.float32, g=np.float64, h=np.float64)) knl = lp.tag_inames(knl, {"k": "l.0", "i": "g.0", "j": "g.1"}) - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) n = 512 m = 256 l = 128 @@ -563,7 +563,7 @@ def test_mem_access_counter_consec(): f32consec += mem_map[lp.MemAccess('global', np.dtype(np.float32), stride=1, direction='load', variable='b') ].eval_with_dict(params) - assert f64consec == 2*n*m + assert f64consec == 2*n*m*l assert f32consec == 3*n*m*l f64consec = mem_map[lp.MemAccess('global', np.float64, @@ -572,7 +572,7 @@ def test_mem_access_counter_consec(): f32consec = mem_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c') ].eval_with_dict(params) - assert f64consec == n*m + assert f64consec == n*m*l assert f32consec == n*m*l @@ -628,6 +628,7 @@ def test_barrier_counter_barriers(): def test_all_counters_parallel_matmul(): + bsize = 16 knl = lp.make_kernel( "{[i,k,j]: 0<=i<n and 
0<=k<m and 0<=j<l}", [ @@ -635,9 +636,9 @@ def test_all_counters_parallel_matmul(): ], name="matmul", assumptions="n,m,l >= 1") knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32)) - knl = lp.split_iname(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") - knl = lp.split_iname(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") - knl = lp.split_iname(knl, "k", 16) + knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1") + knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0") + knl = lp.split_iname(knl, "k", bsize) knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"]) knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"]) @@ -649,9 +650,9 @@ def test_all_counters_parallel_matmul(): sync_map = lp.get_synchronization_map(knl) assert len(sync_map) == 2 assert sync_map["kernel_launch"].eval_with_dict(params) == 1 - assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/16 + assert sync_map["barrier_local"].eval_with_dict(params) == 2*m/bsize - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) f32mul = op_map[ lp.Op(np.float32, 'mul') ].eval_with_dict(params) @@ -667,16 +668,17 @@ def test_all_counters_parallel_matmul(): assert f32mul+f32add == n*m*l*2 - op_map = lp.get_mem_access_map(knl) + op_map = lp.get_mem_access_map(knl, count_redundant_work=True) - f32coal = op_map[lp.MemAccess('global', np.float32, + f32s1lb = op_map[lp.MemAccess('global', np.float32, stride=1, direction='load', variable='b') ].eval_with_dict(params) - f32coal += op_map[lp.MemAccess('global', np.float32, - stride=1, direction='load', variable='a') - ].eval_with_dict(params) + f32s1la = op_map[lp.MemAccess('global', np.float32, + stride=1, direction='load', variable='a') + ].eval_with_dict(params) - assert f32coal == n*m+m*l + assert f32s1lb == n*m*l/bsize + assert f32s1la == n*m*l/bsize f32coal = op_map[lp.MemAccess('global', np.float32, stride=1, direction='store', variable='c') @@ -684,7 +686,8 @@ def test_all_counters_parallel_matmul(): assert f32coal == n*l - local_mem_map = lp.get_mem_access_map(knl).filter_by(mtype=['local']) + local_mem_map = lp.get_mem_access_map(knl, + count_redundant_work=True).filter_by(mtype=['local']) local_mem_l = local_mem_map[lp.MemAccess('local', np.dtype(np.float32), direction='load') ].eval_with_dict(params) @@ -742,7 +745,7 @@ def test_summations_and_filters(): l = 128 params = {'n': n, 'm': m, 'l': l} - mem_map = lp.get_mem_access_map(knl) + mem_map = lp.get_mem_access_map(knl, count_redundant_work=True) loads_a = mem_map.filter_by(direction=['load'], variable=['a'] ).eval_and_sum(params) @@ -768,7 +771,7 @@ def test_summations_and_filters(): assert f32lall == 3*n*m*l assert f64lall == 2*n*m - op_map = lp.get_op_map(knl) + op_map = lp.get_op_map(knl, count_redundant_work=True) #for k, v in op_map.items(): # print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)
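+    # Note: the counts returned here are GuardedPwQPolynomials, so
+    # eval_with_dict will raise a ValueError if the parameter values
+    # passed in violate the kernel's stated assumptions.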