From 8925c5d4ead4ea6cf06ad83d2b44e21f01c2a276 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Wed, 12 Oct 2011 00:01:41 -0400 Subject: [PATCH] Clean up. --- MEMO | 4 +- loopy/codegen/bounds.py | 7 +- loopy/codegen/dispatch.py | 148 ------------ loopy/codegen/prefetch.py | 494 -------------------------------------- loopy/prefetch.py | 195 --------------- test/test_matmul.py | 1 - 6 files changed, 5 insertions(+), 844 deletions(-) delete mode 100644 loopy/codegen/prefetch.py delete mode 100644 loopy/prefetch.py diff --git a/MEMO b/MEMO index dd7220c4b..281e17845 100644 --- a/MEMO +++ b/MEMO @@ -47,8 +47,6 @@ Things to consider - Parallel dimension splitting/merging via tags -- Implement get_problems() - - FIXME: Deal with insns losing a seq iname dep in a CSE realization a <- cse(reduce(stuff)) @@ -81,6 +79,8 @@ Things to consider Dealt with ^^^^^^^^^^ +- Implement get_problems() + - CSE iname duplication might be unnecessary? (don't think so: It might be desired to do a full fetch before a mxm k loop even if that requires going iterative.) diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index 06bef83e7..84f130e2d 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -61,9 +61,6 @@ def get_bounds_constraints(set, iname, admissible_inames, allow_parameters): return lower, upper, equality def solve_constraint_for_bound(cns, iname): - from warnings import warn - warn("solve_constraint_for_bound deprecated?") - from loopy.symbolic import constraint_to_expr rhs, iname_coeff = constraint_to_expr(cns, except_name=iname) @@ -265,7 +262,7 @@ def get_defined_inames(kernel, sched_index, allow_tag_classes=()): # }}} -# {{{ +# {{{ pick_simple_constraint def pick_simple_constraint(constraints, iname): if len(constraints) == 0: @@ -281,6 +278,8 @@ def pick_simple_constraint(constraints, iname): (cns, count_flops(solve_constraint_for_bound(cns, iname)[1])) for cns in constraints) +# }}} + diff --git a/loopy/codegen/dispatch.py b/loopy/codegen/dispatch.py index ad6d4e523..77b2f718e 100644 --- a/loopy/codegen/dispatch.py +++ b/loopy/codegen/dispatch.py @@ -68,152 +68,4 @@ def build_loop_nest(kernel, sched_index, codegen_state): -def build_loop_nest_old(kernel, sched_index, codegen_state, no_conditional_check=False): - assert isinstance(exec_domain, ExecutionDomain) - - ccm = exec_domain.c_code_mapper - - from cgen import (POD, Initializer, Assign, Statement as S, - Line) - - from loopy.codegen.bounds import ( - generate_bounds_checks, - generate_bounds_checks_code, - get_valid_check_vars, - constraint_to_code) - - if not no_conditional_check: - # {{{ see if there are any applicable conditionals - - applicable_constraints = generate_bounds_checks( - kernel.domain, - get_valid_check_vars(kernel, sched_index, allow_ilp=False), - exec_domain.implemented_domain) - - if applicable_constraints: - import islpy as isl - exec_domain_restriction = isl.Set.universe(kernel.space) - for cns in applicable_constraints: - exec_domain_restriction = (exec_domain_restriction - .add_constraint(cns)) - - exec_domain = exec_domain.intersect(exec_domain_restriction) - - inner = build_loop_nest(kernel, sched_index, exec_domain, - no_conditional_check=True) - - from loopy.codegen import wrap_in_if - return wrap_in_if([ - constraint_to_code(ccm, cns) - for cns in applicable_constraints], - inner) - - # }}} - - if sched_index >= len(kernel.schedule): - # {{{ write innermost loop body - - from pymbolic.primitives import Subscript - - # FIXME revert to unroll if actual bounds checks are needed? - - valid_index_vars = get_valid_check_vars(kernel, sched_index, allow_ilp=True) - bounds_check_lists = [ - generate_bounds_checks_code(subd.c_code_mapper, kernel.domain, - valid_index_vars, subd.implemented_domain) - for subd in exec_domain.subdomains] - - result = [] - for lvalue, expr in kernel.instructions: - for i, subd in enumerate(exec_domain.subdomains): - assert isinstance(lvalue, Subscript) - name = lvalue.aggregate.name - - from loopy.codegen import wrap_in_if - result.append(wrap_in_if( - bounds_check_lists[i], - S("tmp_%s_%d += %s" - % (name, i, subd.c_code_mapper(expr))))) - - return gen_code_block(result) - - # }}} - - sched_item = kernel.schedule[sched_index] - - from loopy.schedule import ScheduledLoop, WriteOutput - from loopy.prefetch import LocalMemoryPrefetch, RegisterPrefetch - from loopy.codegen.bounds import wrap_in_bounds_checks - - if isinstance(sched_item, ScheduledLoop): - from loopy.codegen.loop import ( - generate_unroll_or_ilp_code, - generate_parallel_loop_dim_code, - generate_sequential_loop_dim_code) - from loopy.kernel import (TAG_UNROLL, TAG_ILP, - ParallelTagWithAxis) - - tag = kernel.iname_to_tag.get(sched_item.iname) - - if isinstance(tag, (TAG_UNROLL, TAG_ILP)): - func = generate_unroll_or_ilp_code - elif isinstance(tag, ParallelTagWithAxis): - func = generate_parallel_loop_dim_code - else: - func = generate_sequential_loop_dim_code - - return func(kernel, sched_index, exec_domain) - - elif isinstance(sched_item, WriteOutput): - result = ( - [Initializer(POD(kernel.arg_dict[lvalue.aggregate.name].dtype, - "tmp_%s_%d" % (lvalue.aggregate.name, i)), 0) - for i in range(len(exec_domain.subdomains)) - for lvalue, expr in kernel.instructions] - +[Line()] - +[build_loop_nest(kernel, sched_index+1, - exec_domain)] - +[Line()]) - - for i, subd in enumerate(exec_domain.subdomains): - for lvalue, expr in kernel.instructions: - assignment = Assign(subd.c_code_mapper(lvalue), "tmp_%s_%d" % ( - lvalue.aggregate.name, i)) - - wrapped_assign = wrap_in_bounds_checks( - subd.c_code_mapper, kernel.domain, - get_valid_check_vars(kernel, sched_index, allow_ilp=True), - subd.implemented_domain, assignment) - - result.append(wrapped_assign) - - return gen_code_block(result) - - elif isinstance(sched_item, LocalMemoryPrefetch): - from loopy.codegen.prefetch import generate_prefetch_code - return generate_prefetch_code(kernel, sched_index, - exec_domain) - - elif isinstance(sched_item, RegisterPrefetch): - raise NotImplementedError("reg prefetch") # FIXME - - agg_name = sched_item.subscript_expr.aggregate.name - return gen_code_block([ - wrap_in_bounds_checks(ccm, kernel, sched_index, implemented_domain, - Initializer(POD(kernel.arg_dict[agg_name].dtype, - sched_item.new_name), - "%s[%s]" - % (agg_name, - ccm(sched_item.subscript_expr.index)))), - - build_loop_nest(kernel, sched_index+1, exec_domain) - ]) - - else: - raise ValueError("invalid schedule item encountered") - - - - - # vim: foldmethod=marker diff --git a/loopy/codegen/prefetch.py b/loopy/codegen/prefetch.py deleted file mode 100644 index c35a11fcb..000000000 --- a/loopy/codegen/prefetch.py +++ /dev/null @@ -1,494 +0,0 @@ -from __future__ import division - -from pytools import Record -import pyopencl as cl -import pyopencl.characterize as cl_char -from loopy.codegen import wrap_in, gen_code_block -import islpy as isl -from islpy import dim_type -import numpy as np - - - - -# {{{ prefetch preprocessing - -def preprocess_prefetch(kernel): - """Assign names, dim storage lengths to prefetches. - """ - - all_pf_list = kernel.prefetch.values() - new_prefetch_dict = {} - lmem_size = cl_char.usable_local_mem_size(kernel.device) - - for i_pf, pf in enumerate(kernel.prefetch.itervalues()): - all_pf_nbytes = [opf.nbytes for opf in all_pf_list] - other_pf_sizes = sum(all_pf_nbytes[:i_pf]+all_pf_nbytes[i_pf+1:]) - - dim_storage_lengths = [stop-start for start, stop in - [pf.dim_bounds_by_iname[iname] for iname in pf.all_inames()]] - - # sizes of all dims except the last one, which we may change - # below to avoid bank conflicts - from pytools import product - other_dim_sizes = (pf.itemsize - * product(dim_storage_lengths[:-1])) - - if kernel.device.local_mem_type == cl.device_local_mem_type.GLOBAL: - # FIXME: could try to avoid cache associativity disasters - new_dsl = dim_storage_lengths - - elif kernel.device.local_mem_type == cl.device_local_mem_type.LOCAL: - min_mult = cl_char.local_memory_bank_count(kernel.device) - good_incr = None - new_dsl = dim_storage_lengths - min_why_not = None - - for increment in range(dim_storage_lengths[-1]//2): - - test_dsl = dim_storage_lengths[:] - test_dsl[-1] = test_dsl[-1] + increment - new_mult, why_not = cl_char.why_not_local_access_conflict_free( - kernel.device, pf.itemsize, - pf.dim_lengths(), test_dsl) - - # will choose smallest increment 'automatically' - if new_mult < min_mult: - new_lmem_use = other_pf_sizes + pf.itemsize*product(new_dsl) - if new_lmem_use < lmem_size: - new_dsl = test_dsl - min_mult = new_mult - min_why_not = why_not - good_incr = increment - - if min_mult != 1: - from warnings import warn - from loopy import LoopyAdvisory - warn("could not find a conflict-free mem layout " - "for prefetch of '%s' " - "(currently: %dx conflict, increment: %d, reason: %s)" - % (pf.input_vector, min_mult, good_incr, min_why_not), - LoopyAdvisory) - else: - from warnings import warn - warn("unknown type of local memory") - - new_dsl = dim_storage_lengths - - - new_pf = pf.copy(dim_storage_lengths=new_dsl, - name="prefetch_%s_%d" % (pf.input_vector, i_pf)) - new_prefetch_dict[pf.input_vector, pf.index_expr] = new_pf - all_pf_list[i_pf] = new_pf - - return kernel.copy(prefetch=new_prefetch_dict) - -# }}} - -# {{{ lmem prefetch code generation - -class FetchLoopNestData(Record): - pass - -def make_fetch_loop_nest(flnd, fetch_dim_idx, pf_dim_exprs, iname_subst_map, - implemented_domain): - pf = flnd.prefetch - ccm = flnd.c_code_mapper - no_pf_ccm = flnd.no_prefetch_c_code_mapper - kernel = flnd.kernel - - from pymbolic import var - from cgen import Assign, For, If - - from pymbolic.mapper.substitutor import substitute - if fetch_dim_idx >= len(pf.fetch_dims): - # done, return - from pymbolic.primitives import Variable, Subscript - - result = Assign( - pf.name + "".join("[%s]" % ccm(dexpr) - for dexpr in pf_dim_exprs), - no_pf_ccm( - Subscript( - Variable(pf.input_vector), - substitute(pf.index_expr, iname_subst_map)))) - - from pymbolic.mapper.dependency import DependencyMapper - check_vars = [v.name for v in DependencyMapper()(pf.index_expr)] - - from loopy.codegen.bounds import wrap_in_bounds_checks - return wrap_in_bounds_checks( - ccm.copy_and_assign_many(iname_subst_map), - pf.kernel.domain, - check_vars, implemented_domain, result) - - fetch_inames = pf.fetch_dims[fetch_dim_idx] - realiz_inames = flnd.realization_inames[fetch_dim_idx] - - fetch_iname_lengths = [stop-start - for start, stop in - [pf.dim_bounds_by_iname[iname] for iname in fetch_inames]] - - from pytools import product - dim_length = product(fetch_iname_lengths) - - idx_var_name = "loopy_prefetch_dim_idx_%d" % fetch_dim_idx - idx_var = var(idx_var_name) - - if realiz_inames is not None: - # {{{ parallel fetch - - # {{{ find strides per fetch iname - - fetch_iname_strides = [1] - for fil in fetch_iname_lengths[:0:-1]: - fetch_iname_strides.insert(0, - fetch_iname_strides[0]*fil) - - # }}} - - idx_var_expr_from_inames = sum(stride*var(iname) - for iname, stride in zip(fetch_inames, fetch_iname_strides)) - - # {{{ find expressions for each iname from idx_var - - pf_dim_exprs = pf_dim_exprs[:] - iname_subst_map = iname_subst_map.copy() - - for i, iname in enumerate(fetch_inames): - iname_lower, iname_upper = pf.dim_bounds_by_iname[iname] - iname_len = iname_upper-iname_lower - iname_val_base = (idx_var // fetch_iname_strides[i]) - if i != 0: - # the outermost iname is the 'largest', no need to - # 'modulo away' any larger ones - iname_val_base = iname_val_base % iname_len - - pf_dim_exprs.append(iname_val_base) - iname_subst_map[iname] = iname_val_base + iname_lower - - # }}} - - # {{{ build an implemented domain with an extra index variable - - from loopy.symbolic import eq_constraint_from_expr - idx_var_dim_idx = implemented_domain.get_space().size(dim_type.set) - impl_domain_with_index_var = implemented_domain.add_dims(dim_type.set, 1) - impl_domain_with_index_var = ( - impl_domain_with_index_var - .set_dim_name(dim_type.set, idx_var_dim_idx, idx_var_name)) - aug_space = impl_domain_with_index_var.get_space() - impl_domain_with_index_var = ( - impl_domain_with_index_var - .intersect( - isl.Set.universe(aug_space) - .add_constraint( - eq_constraint_from_expr( - aug_space, - idx_var_expr_from_inames - idx_var)))) - - # }}} - - realiz_bounds = [ - flnd.kernel.get_bounds(rn, (rn,), allow_parameters=False) - for rn in realiz_inames] - for realiz_start, realiz_stop, realiz_equality in realiz_bounds: - assert not realiz_equality - - realiz_lengths = [stop-start for start, stop, equality in realiz_bounds] - from pytools import product - total_realiz_size = product(realiz_lengths) - - result = [] - - cur_index = 0 - - while cur_index < dim_length: - pf_idx_expr = 0 - for realiz_iname, length in zip(realiz_inames, realiz_lengths): - tag = flnd.kernel.iname_to_tag[realiz_iname] - from loopy.kernel import TAG_LOCAL_IDX - assert isinstance(tag, TAG_LOCAL_IDX) - - pf_idx_expr = (pf_idx_expr*length - + var("(int) get_local_id(%d)" % tag.axis)) - - pf_idx_expr += cur_index - - from loopy.isl import make_slab - new_impl_domain = ( - impl_domain_with_index_var - .intersect( - make_slab( - impl_domain_with_index_var.get_space(), idx_var_name, - cur_index, - min(dim_length, cur_index+total_realiz_size))) - .project_out(dim_type.set, idx_var_dim_idx, 1)) - - inner = make_fetch_loop_nest(flnd, fetch_dim_idx+1, - pf_dim_exprs, iname_subst_map, - new_impl_domain) - - if cur_index+total_realiz_size > dim_length: - inner = wrap_in(If, - "%s < %s" % (idx_var_name, dim_length), - inner) - - from cgen import Initializer, Const, POD - inner = gen_code_block([ - Initializer(Const(POD(np.int32, idx_var_name)), - ccm(pf_idx_expr)), - inner], denest=True) - - result.append(inner) - - cur_index += total_realiz_size - - return gen_code_block(result) - - # }}} - else: - # {{{ sequential fetch - - if len(fetch_inames) > 1: - raise NotImplementedError("merged sequential fetches are not supported") - pf_iname, = fetch_inames - - lb_cns, ub_cns = pf.get_dim_bounds_constraints_by_iname(pf_iname) - - from loopy.isl import cast_constraint_to_space - loop_slab = (isl.Set.universe(flnd.kernel.space) - .add_constraints([cast_constraint_to_space(cns, kernel.space) - for cns in [lb_cns, ub_cns]])) - new_impl_domain = implemented_domain.intersect(loop_slab) - - iname_subst_map = iname_subst_map.copy() - iname_subst_map[pf_iname] = idx_var + pf.dim_bounds_by_iname[pf_iname][0] - inner = make_fetch_loop_nest(flnd, fetch_dim_idx+1, - pf_dim_exprs+[idx_var], iname_subst_map, - new_impl_domain) - - return wrap_in(For, - "int %s = 0" % idx_var_name, - "%s < %s" % (idx_var_name, ccm(dim_length)), - "++%s" % idx_var_name, - inner) - - # }}} - - -def generate_prefetch_code(kernel, sched_index, exec_domain): - implemented_domain = exec_domain.implemented_domain - - from cgen import Statement as S, Line, Comment - - ccm = exec_domain.c_code_mapper - - scheduled_pf = kernel.schedule[sched_index] - pf = kernel.prefetch[ - scheduled_pf.input_vector, scheduled_pf.index_expr] - - # Prefetch has a good amount of flexibility over what axes it - # uses to accomplish the prefetch. In particular, it can (and should!) - # use all work group dimensions. - - # {{{ determine which loop axes are used to realize the fetch - - # realization_dims is a list of lists of inames, to represent when two dims jointly - # make up one fetch axis - - realization_inames = [None] * len(pf.fetch_dims) - - # {{{ first, fix the user-specified fetch dims - - from loopy.kernel import TAG_LOCAL_IDX - knl_work_item_inames = kernel.ordered_inames_by_tag_type(TAG_LOCAL_IDX) - used_kernel_work_item_inames = [] - - for realization_dim_idx, loc_fetch_axis_list in \ - pf.loc_fetch_axes.iteritems(): - loc_fetch_inames = [knl_work_item_inames[axis] - for axis in loc_fetch_axis_list] - realization_inames[realization_dim_idx] = loc_fetch_inames - used_kernel_work_item_inames.extend(loc_fetch_inames) - - for inm in used_kernel_work_item_inames: - knl_work_item_inames.remove(inm) - - # }}} - - # {{{ next use the work group dimensions, least-stride dim first - - from loopy.kernel import ImageArg, ScalarArg - from loopy.symbolic import CoefficientCollector - - index_expr = pf.index_expr - if not isinstance(index_expr, tuple): - index_expr = (index_expr,) - - arg = kernel.arg_dict[pf.input_vector] - if isinstance(arg, ImageArg): - # arbitrary - ary_strides = (1, 1, 1)[:arg.dimensions] - else: - ary_strides = arg.strides - if ary_strides is None and len(index_expr) == 1: - ary_strides = (1,) - - iname_to_stride = {} - for iexpr_i, stride in zip(index_expr, ary_strides): - coeffs = CoefficientCollector()(iexpr_i) - for var_name, coeff in coeffs.iteritems(): - if var_name != 1: - new_stride = coeff*stride - old_stride = iname_to_stride.get(var_name, None) - if old_stride is None or new_stride < old_stride: - iname_to_stride[var_name] = new_stride - - approximate_arg_values = dict( - (arg.name, arg.approximately) - for arg in kernel.args - if isinstance(arg, ScalarArg)) - - def stride_key(fetch_dim_idx): - fetch_dim = pf.fetch_dims[fetch_dim_idx] - - from pymbolic import evaluate - key = min( - evaluate(iname_to_stride[iname], approximate_arg_values) - for iname in fetch_dim) - assert isinstance(key, int) - return key - - pf_fetch_dim_strides = sorted((dim_idx - for dim_idx in range(len(pf.fetch_dims)) - if realization_inames[dim_idx] is None), - key=stride_key) - - while knl_work_item_inames and pf_fetch_dim_strides: - # grab least-stride prefetch dim - least_stride_pf_fetch_dim_idx = pf_fetch_dim_strides.pop(0) - - # FIXME: It might be good to join multiple things together here - # for size reasons - realization_inames[least_stride_pf_fetch_dim_idx] \ - = [knl_work_item_inames.pop(0)] - - if knl_work_item_inames: - # FIXME - from warnings import warn - warn("There were leftover work group dimensions in prefetch " - "assignment. For now, this won't lead to wrong code, " - "but it will lead to unnecessary memory bandwidth use.") - - # }}} - - # }}} - - # {{{ generate fetch code - - from loopy.codegen.bounds import get_valid_check_vars - valid_index_vars = get_valid_check_vars(kernel, sched_index, - allow_ilp=True, - exclude_tag_classes=(TAG_LOCAL_IDX,)) - - from loopy.symbolic import LoopyCCodeMapper - flnd = FetchLoopNestData(prefetch=pf, - no_prefetch_c_code_mapper= - LoopyCCodeMapper(kernel, no_prefetch=True), - c_code_mapper=ccm, - realization_inames=realization_inames, - kernel=kernel, - valid_index_vars=valid_index_vars) - - fetch_block = make_fetch_loop_nest(flnd, 0, [], {}, implemented_domain) - - # }}} - - new_block = [] - - # {{{ generate comments explaining dimension mapping - - new_block.append(Comment("prefetch %s -- using dimension mapping:" % pf.input_vector)) - for iaxis, (fetch_dim, realiz_inames) in enumerate(zip(pf.fetch_dims, realization_inames)): - new_block.append(Comment(" fetch axis %d:" % iaxis)) - for iname in fetch_dim: - iname_lwr, iname_upr = pf.dim_bounds_by_iname[iname] - new_block.append(Comment(" %s [%d..%d)" % (iname, iname_lwr, iname_upr))) - new_block.append(Comment(" using:")) - - if realiz_inames is None: - new_block.append(Comment(" loop")) - else: - for realiz_iname in realiz_inames: - rd_iname_descr = "loop" - iname_lwr, iname_upr, iname_eq = flnd.kernel.get_bounds(realiz_iname, (realiz_iname,), - allow_parameters=False) - assert not iname_eq - - new_block.append(Comment(" %s (%s) [%s..%s)" - % (realiz_iname, kernel.iname_to_tag[realiz_iname], - iname_lwr, iname_upr))) - - new_block.append(Line()) - - # }}} - - # {{{ omit head sync primitive if possible - - head_sync_unneeded_because = None - - from loopy.prefetch import LocalMemoryPrefetch - if (sched_index-1 >= 0 - and isinstance(kernel.schedule[sched_index-1], LocalMemoryPrefetch)): - head_sync_unneeded_because = "next outer schedule item is a prefetch" - - from pytools import all - from loopy.kernel import ParallelTag - from loopy.schedule import ScheduledLoop - outer_tags = [ - kernel.iname_to_tag.get(sched_item.iname) - for sched_item in kernel.schedule[:sched_index] - if isinstance(sched_item, ScheduledLoop)] - - if not [tag - for tag in outer_tags - if not isinstance(tag, ParallelTag)]: - head_sync_unneeded_because = "no sequential axes nested around fetch" - - # generate (no) head sync code - if head_sync_unneeded_because is None: - new_block.append(S("barrier(CLK_LOCAL_MEM_FENCE)")) - else: - new_block.append(Comment("no sync needed: " + head_sync_unneeded_because)) - new_block.append(Line()) - - # }}} - - new_block.append(fetch_block) - - # {{{ omit tail sync primitive if possible - - tail_sync_unneeded_because = None - - if (sched_index+1 < len(kernel.schedule) - and isinstance(kernel.schedule[sched_index+1], LocalMemoryPrefetch)): - tail_sync_unneeded_because = "next inner schedule item is a prefetch" - - if tail_sync_unneeded_because is None: - new_block.append(S("barrier(CLK_LOCAL_MEM_FENCE)")) - else: - new_block.append(Line()) - new_block.append(Comment("no sync needed: " + tail_sync_unneeded_because)) - - # }}} - - from loopy.codegen.dispatch import build_loop_nest - new_block.extend([Line(), - build_loop_nest(kernel, sched_index+1, exec_domain)]) - - return gen_code_block(new_block) - -# }}} - -# vim: foldmethod=marker diff --git a/loopy/prefetch.py b/loopy/prefetch.py deleted file mode 100644 index 98aeed948..000000000 --- a/loopy/prefetch.py +++ /dev/null @@ -1,195 +0,0 @@ -from __future__ import division - -from pytools import Record, memoize_method -from islpy import dim_type - - - - -# {{{ register prefetches - -class RegisterPrefetch(Record): - __slots__ = ["subexprs", "new_names"] - -def insert_register_prefetches(kernel): - reg_pf = {} - - total_loop_count = len(kernel.all_inames()) - known_vars = set() - - unused_index_exprs = set() - from loopy.symbolic import AllSubscriptExpressionCollector - asec = AllSubscriptExpressionCollector() - - from pymbolic.mapper.dependency import DependencyMapper - - for tgt, expr in kernel.instructions: - unused_index_exprs |= asec(expr) - unused_index_exprs = [ - (iexpr, set(v.name for v in DependencyMapper()(iexpr.index))) - for iexpr in unused_index_exprs] - - schedule = kernel.schedule[:] - - from loopy.schedule import ScheduledLoop - - sched_index = 0 - loop_count = 0 - while sched_index < len(schedule): - sched_item = schedule[sched_index] - if isinstance(sched_item, ScheduledLoop): - known_vars.add(sched_item.iname) - loop_count += 1 - sched_index += 1 - - if loop_count < total_loop_count: - i = 0 - while i < len(unused_index_exprs): - iexpr, index_deps = unused_index_exprs[i] - if (index_deps <= known_vars - and (iexpr.aggregate.name, iexpr.index) - not in kernel.prefetch): - unused_index_exprs.pop(i) - new_name = "reg_prefetch_"+iexpr.aggregate.name+str(sched_index) - reg_pf[iexpr] = new_name - schedule.insert(sched_index, - RegisterPrefetch( - subexprs=[iexpr], new_names=[new_name])) - sched_index += 1 - else: - i += 1 - - return kernel.copy(schedule=schedule, register_prefetch=reg_pf) - -# }}} - -# {{{ local-mem prefetch-related - -class LocalMemoryPrefetch(Record): - """ - Attributes: - :ivar kernel: - :ivar input_vector: A string indicating the input vector variable name. - :ivar index_expr: An expression identifying the access which this prefetch - serves. - :ivar fetch_dims: A sequence of tuples of inames (i.e. loop dimensions) - identifying which part of the input vector, given the index_expr, should - be prefetched. Non-length-1 tuples indicate that these indices should - share a dimension in the prefetch array. - :ivar loc_fetch_axes: dictionary from integers 0..len(inames) to lists of - local index axes which should be used to realize that dimension of the - prefetch. The last dimension in this list is used as the fastest-changing - one. - :ivar name: the variable name used for the prefetch - :ivar dim_storage_lengths: a sequence of integers indicating the size of - the storage for each dimension. It may may differ from the size of the - actual loop dimensions to mitigate bank conflicts. - - The latter two values are only assigned during code generation. - """ - - @memoize_method - def all_inames(self): - """Order matters as this will be the order of indices into the - prefetch array. - """ - return [ - iname - for fetch_dim in self.fetch_dims - for iname in fetch_dim] - - @property - @memoize_method - def domain(self): - return (self.kernel.domain - .project_out_except(self.all_inames(), [dim_type.set]) - .compute_divs() - .remove_divs_of_dim_type(dim_type.set)) - - @property - @memoize_method - def index_map(self): - from loopy.isl import make_index_map - imap = make_index_map(self.kernel_domain, self.index_expr) - assert imap.is_bijective() - return imap - - @property - @memoize_method - def restricted_index_map(self): - return self.index_map.intersect_domain(self.domain) - - @memoize_method - def get_dim_bounds_constraints_by_iname(self, iname): - from loopy.codegen.bounds import get_bounds_constraints - lower, upper, equality = get_bounds_constraints( - self.domain, iname, (iname,), - allow_parameters=False) - - assert not equality - - lower, = lower - upper, = upper - return lower, upper - - @property - @memoize_method - def dim_bounds_by_iname(self): - from loopy.codegen.bounds import solve_constraint_for_bound - result = {} - for iname in self.all_inames(): - lower, upper = self.get_dim_bounds_constraints_by_iname(iname) - - lower_kind, lower_bound = solve_constraint_for_bound(lower, iname) - upper_kind, upper_bound = solve_constraint_for_bound(upper, iname) - - try: - lower_bound = int(lower_bound) - upper_bound = int(upper_bound) - except TypeError: - raise RuntimeError("loop bounds for prefetch must be known statically") - - result[iname] = (lower_bound, upper_bound) - - return result - - @property - def itemsize(self): - return self.kernel.arg_dict[self.input_vector].dtype.itemsize - - def dim_lengths(self): - result = [] - for fetch_dim in self.fetch_dims: - fd_result = 1 - for iname in fetch_dim: - start, stop = self.dim_bounds_by_iname[iname] - fd_result *= stop-start - result.append(fd_result) - return result - - @property - @memoize_method - def nbytes(self): - from pytools import product - return self.itemsize * product(self.dim_lengths()) - - @memoize_method - def free_variables(self): - from pymbolic.mapper.dependency import DependencyMapper - return set(var.name - for var in DependencyMapper()(self.index_expr) - ) - set(self.all_inames()) - set(self.kernel.scalar_loop_args) - - def hash(self): - return (hash(type(self)) ^ hash(self.input_vector) - ^ hash(self.index_expr)) - - def __eq__(self, other): - # in particular, dim_storage_lengths should not factor into equality - return (type(self) == type(other) - and self.input_vector == other.input_vector - and self.index_expr == other.index_expr) - -# }}} - -# vim: foldmethod=marker diff --git a/test/test_matmul.py b/test/test_matmul.py index 13a81e0ab..e342289be 100644 --- a/test/test_matmul.py +++ b/test/test_matmul.py @@ -244,7 +244,6 @@ def test_plain_matrix_mul_new_ui(ctx_factory): - def test_troublesome_premagma_fermi_matrix_mul(ctx_factory): dtype = np.float32 ctx = ctx_factory() -- GitLab