From a86fd84e85faa9f4687685507ee01edd3673e686 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sat, 24 Sep 2011 16:51:34 -0400
Subject: [PATCH] Towards insn-based loopy.

---
 MEMO                      |  83 +++++
 loopy/__init__.py         | 544 +++++++++++++++++++++++++++++--
 loopy/codegen/bounds.py   |  12 +-
 loopy/codegen/prefetch.py |   6 +-
 loopy/isl.py              |  65 +++-
 loopy/kernel.py           | 668 ++++++++++++++++++++------------------
 loopy/schedule.py         | 106 +++++-
 loopy/symbolic.py         | 283 +++++++++++++---
 loopy/tools.py            |  13 +
 test/test_isl.py          |  27 ++
 test/test_matmul.py       |  26 +-
 11 files changed, 1408 insertions(+), 425 deletions(-)
 create mode 100644 MEMO
 create mode 100644 loopy/tools.py
 create mode 100644 test/test_isl.py

diff --git a/MEMO b/MEMO
new file mode 100644
index 000000000..c08463c3f
--- /dev/null
+++ b/MEMO
@@ -0,0 +1,83 @@
+TODO list
+^^^^^^^^^
+
+Immediately:
+------------
+TODO: Imitate codegen bulk slab handling in bulk slab trials
+
+For writeup:
+------------
+TODO: Reimplement forced lengths
+TODO: Try, fix reg. prefetch (DG example) / CSEs
+      ILP and reg. prefetch interact!
+TODO: Custom reductions per red. axis
+TODO: Functions
+TODO: Common subexpressions
+TODO: Array common subexpressions (shared and private!)
+TODO: ILP arrays
+FIXME: support non-reductive dimensions (what did I mean here?)
+FIXME: write names should be assigned during scheduling
+FIXME: screwy lower bounds in ILP
+FIXME: Leading syncthreads elimination
+
+TODO: Divisibility
+TODO: Try, fix indirect addressing
+
+TODO: Implement GT200 matmul, Fermi matmul, DG
+TODO: DMA engine threads?
+TODO: Deal with equalities that crop up.
+TODO: Better user feedback.
+
+Later:
+------
+TODO: Try different kernels
+TODO: - Tricky: Convolution, Stencil
+TODO: Separate all-bulk from non-bulk kernels. (maybe?) (#ifdef?)
+TODO: implement efficient ceil_div? (as opposed to floor_div)
+TODO: why are corner cases inefficient?
+TODO: Use gists (why do disjoint sets arise?)
+TODO: variable shuffle detection
+
+Things to consider
+^^^^^^^^^^^^^^^^^^
+
+- implemented_domain may end up being smaller than requested in cse
+  evaluations--check that!
+
+- Instructions must agree on all iname tags except the parallel ones
+
+- Auto tag assignment depends on known work group size
+
+- Dependencies are pointwise for shared loop dimensions
+  and global over non-shared ones (between dependent and ancestor)
+
+- Parallel dimension splitting/merging via tags
+
+- Generalize reduction to be over multiple variables
+
+- Implement get_problems()
+
+
+Dealt with
+^^^^^^^^^^
+
+- Reduction needs to know a neutral element
+
+- Types of reduction variables?
+
+How to represent the schedule
+^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
+
+- Focus everything on instructions
+  - Each instruction can have its own interpretation of global/local ids.
+- Loop variables/splits and such are and remain global
+- What about grouped dimensions?
+- UniqueTag is the wrong idea! (not really--it's ok per-insn)
+
+Scheduling:
+- Find insns whose dependencies are satisfied
+- Find maximally shareable loop
+- Open that one
+- For that opened loop, check if an available insn can run
+  - If not, open another loop
+  - Else, schedule that instruction
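+
+A rough sketch of that loop in Python (deps_satisfied and
+most_shareable_loop are hypothetical helpers; none of this is
+implemented yet):
+
+    def schedule_kernel(kernel):
+        schedule = []
+        open_loops = []
+        unscheduled = list(kernel.instructions)
+        while unscheduled:
+            # insns whose forced/auto dependencies are already scheduled
+            ready = [insn for insn in unscheduled
+                    if deps_satisfied(insn, schedule)]
+            # of those, insns whose loops are all currently open
+            runnable = [insn for insn in ready
+                    if insn.all_inames() <= set(open_loops)]
+            if runnable:
+                insn = runnable.pop()
+                schedule.append(RunInstruction(id=insn.id))
+                unscheduled.remove(insn)
+            else:
+                open_loops.append(most_shareable_loop(ready, open_loops))
+        return schedule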
diff --git a/loopy/__init__.py b/loopy/__init__.py
index 03d0ddb50..7e0474c4d 100644
--- a/loopy/__init__.py
+++ b/loopy/__init__.py
@@ -1,4 +1,5 @@
 from __future__ import division
+import islpy as isl
 
 def register_mpz_with_pymbolic():
     from pymbolic.primitives import register_constant_class
@@ -8,42 +9,14 @@ def register_mpz_with_pymbolic():
 
 register_mpz_with_pymbolic()
 
+import islpy as isl
+from islpy import dim_type
+import numpy as np
 
-# Immediately:
-# ------------
-# TODO: Imitate codegen bulk slab handling in bulk slab trials
-
-# For writeup:
-# ------------
-# TODO: Try, fix reg. prefetch (DG example) / CSEs
-#   ILP and reg. prefetch interact!
-# TODO: Custom reductions per red. axis
-# TODO: Functions
-# TODO: Common subexpressions
-# TODO: Array common subexpressions (shared and private!)
-# TODO: ILP arrays
-# FIXME: support non-reductive dimensions (what did I mean here?)
-# FIXME: write names should be assigned during scheduling
-# FIXME: screwy lower bounds in ILP
-# FIXME: Leading syncthreads elimination
-
-# TODO: Divisibility
-# TODO: Try, fix indirect addressing
-
-# TODO: Implement GT200 matmul, Fermi matmul, DG
-# TODO: DMA engine threads?
-# TODO: Deal with equalities that crop up.
-# TODO: Better user feedback.
-
-# Later:
-# ------
-# TODO: Try different kernels
-# TODO: - Tricky: Convolution, Stencil
-# TODO: Separate all-bulk from non-bulk kernels. (maybe?) (#ifdef?)
-# TODO: implement efficient ceil_div? (as opposed to floor_div)
-# TODO: why are corner cases inefficient?
-# TODO: Use gists (why do disjoint sets arise?)
+# TODO list moved to MEMO in root @@ -62,16 +35,516 @@ from loopy.compiled import CompiledKernel, drive_timing_run # }}} -# {{{ high-level modifiers +# {{{ user-facing kernel manipulation functionality + +def split_dimension(kernel, name, inner_length, padded_length=None, + outer_name=None, inner_name=None, + outer_tag=None, inner_tag=None, + outer_slab_increments=(0, -1), no_slabs=None): + + if name not in kernel.all_inames(): + raise ValueError("cannot split loop for unknown variable '%s'" % name) + + if no_slabs: + outer_slab_increments = (0, 0) + + if padded_length is not None: + inner_tag = inner_tag.copy(forced_length=padded_length) + + if outer_name is None: + outer_name = name+"_outer" + if inner_name is None: + inner_name = name+"_inner" + + outer_var_nr = kernel.space.dim(dim_type.set) + inner_var_nr = kernel.space.dim(dim_type.set)+1 + + def process_set(s): + s = s.add_dims(dim_type.set, 2) + s.set_dim_name(dim_type.set, outer_var_nr, outer_name) + s.set_dim_name(dim_type.set, inner_var_nr, inner_name) + + from loopy.isl import make_slab + + space = s.get_space() + inner_constraint_set = ( + make_slab(space, inner_name, 0, inner_length) + # name = inner + length*outer + .add_constraint(isl.Constraint.eq_from_names( + space, {name:1, inner_name: -1, outer_name:-inner_length}))) + + name_dim_type, name_idx = space.get_var_dict()[name] + return (s + .intersect(inner_constraint_set) + .project_out(name_dim_type, name_idx, 1)) + + new_domain = process_set(kernel.domain) + new_assumptions = process_set(kernel.assumptions) + + from pymbolic import var + inner = var(inner_name) + outer = var(outer_name) + new_loop_index = inner + outer*inner_length + + # {{{ look for reduction loops to split, split seq. tags + + from loopy.symbolic import ReductionLoopSplitter + + rls = ReductionLoopSplitter(name, outer_name, inner_name) + new_insns = [] + for insn in kernel.instructions: + insn = insn.copy(expression=rls(insn.expression)) + old_iname_tag = insn.iname_to_tag.get(name) + + new_iname_to_tag = insn.iname_to_tag.copy() + tagged_ok = False + from loopy.kernel import SequentialTag + if isinstance(old_iname_tag, SequentialTag): + tagged_ok = True + del new_iname_to_tag[name] + new_iname_to_tag[outer_name] = old_iname_tag + new_iname_to_tag[inner_name] = old_iname_tag + + if name in insn.forced_iname_deps: + new_forced_iname_deps = insn.forced_iname_deps[:] + new_forced_iname_deps.remove(name) + new_forced_iname_deps.extend([outer_name, inner_name]) + else: + new_forced_iname_deps = insn.forced_iname_deps + + insn = insn.substitute(name, new_loop_index, tagged_ok=tagged_ok).copy( + iname_to_tag=new_iname_to_tag, + forced_iname_deps=new_forced_iname_deps + ) + + new_insns.append(insn) + + # }}} + + iname_slab_increments = kernel.iname_slab_increments.copy() + iname_slab_increments[outer_name] = outer_slab_increments + result = (kernel + .copy(domain=new_domain, + assumptions=new_assumptions, + iname_slab_increments=iname_slab_increments, + name_to_dim=None, + instructions=new_insns)) + + return tag_dimensions(result, {outer_name: outer_tag, inner_name: inner_tag}) + + + + +def tag_dimensions(kernel, iname_to_tag, insn_id=None): + from loopy.kernel import UniqueTag, ParallelTag, parse_tag + + iname_to_tag = dict((iname, parse_tag(tag)) + for iname, tag in iname_to_tag.iteritems()) + + new_insns = [] + for insn in kernel.instructions: + if insn_id is None or insn_id == insn.id: + new_iname_to_tag = insn.iname_to_tag.copy() + + existing_unique_tag_keys = set( + tag.key for tag in 
new_iname_to_tag.itervalues()
+                if isinstance(tag, UniqueTag))
+
+            for iname, tag in iname_to_tag.iteritems():
+                if iname not in insn.all_inames():
+                    continue
+
+                if isinstance(tag, ParallelTag) and iname in insn.sequential_inames():
+                    raise NotImplementedError("cannot parallelize reduction dimension (yet)")
+
+                new_iname_to_tag[iname] = tag
+
+                if isinstance(tag, UniqueTag):
+                    tag_key = tag.key
+                    if tag_key in existing_unique_tag_keys:
+                        raise RuntimeError("repeated unique tag key: %s" % tag_key)
+
+                    existing_unique_tag_keys.add(tag_key)
+
+            new_insns.append(
+                    insn.copy(iname_to_tag=new_iname_to_tag))
+        else:
+            new_insns.append(insn)
+
+    return kernel.copy(instructions=new_insns)
+
+
+
+
+def realize_cse(kernel, cse_tag, dtype, duplicate_inames=[], parallel_inames=None,
+        dup_iname_to_tag={}, new_inames=None):
+    """
+    :arg duplicate_inames:
+        also determines index order of temporary array
+    :arg parallel_inames: only a convenient interface for dup_iname_to_tag
+    """
+
+    dtype = np.dtype(dtype)
+
+    from pytools import any
+
+    # {{{ process parallel_inames and dup_iname_to_tag arguments
+
+    if parallel_inames is None:
+        # default to all-parallel
+        parallel_inames = duplicate_inames
+
+    dup_iname_to_tag = dup_iname_to_tag.copy()
+    from loopy.kernel import TAG_AUTO_WORK_ITEM_IDX
+    for piname in parallel_inames:
+        dup_iname_to_tag[piname] = TAG_AUTO_WORK_ITEM_IDX()
+
+    for diname in duplicate_inames:
+        dup_iname_to_tag.setdefault(diname, None)
+
+    if not set(dup_iname_to_tag.iterkeys()) <= set(duplicate_inames):
+        raise RuntimeError("parallelization/tag info for non-duplicated inames "
+                "may not be passed")
+
+    # here, all information is consolidated into dup_iname_to_tag
+
+    # }}}
+
+    # {{{ process new_inames argument, think of new inames for inames to be duplicated
+
+    if new_inames is None:
+        new_inames = [None] * len(duplicate_inames)
+
+    temp_new_inames = []
+    for old_iname, new_iname in zip(duplicate_inames, new_inames):
+        if new_iname is None:
+            new_iname = kernel.make_unique_var_name(old_iname)
+        temp_new_inames.append(new_iname)
+
+    new_inames = temp_new_inames
+
+    # }}}
+
+    target_var_name = kernel.make_unique_var_name(cse_tag)
+
+    cse_lookup_table = {}
+
+    cse_result_insns = []
+
+    def map_cse(expr, rec):
+        if expr.prefix != cse_tag:
+            return
+
+        # FIXME stencils and variable shuffle detection would happen here
+
+        try:
+            cse_replacement, dep_id = cse_lookup_table[expr]
+        except KeyError:
+            pass
+        else:
+            return cse_replacement
+
+        if cse_result_insns:
+            raise RuntimeError("CSE tag '%s' is not unique" % cse_tag)
+
+        # {{{ concoct new inner and outer expressions
+
+        from pymbolic import var
+        assignee = var(target_var_name)
+        new_outer_expr = assignee
+
+        if duplicate_inames:
+            assignee = assignee[tuple(
+                var(iname) for iname in new_inames
+                )]
+            new_outer_expr = new_outer_expr[tuple(
+                var(iname) for iname in duplicate_inames
+                )]
+
+        from pymbolic import substitute
+        new_inner_expr = substitute(rec(expr.child), dict(
+            (old_iname, var(new_iname))
+            for old_iname, new_iname in zip(duplicate_inames, new_inames)))
+
+        # }}}
+
+        # {{{ build the new instruction's iname_to_tag
 
-def split_dimension(knl, *args, **kwargs):
-    return knl.split_dimension(*args, **kwargs)
+        from loopy.symbolic import IndexVariableFinder
+        insn_iname_to_tag = {}
+        for old_iname, new_iname in zip(duplicate_inames, new_inames):
+            insn_iname_to_tag[new_iname] = dup_iname_to_tag[old_iname]
+
+        index_deps = (
+                IndexVariableFinder()(new_outer_expr)
+                | set(new_inames))
+
+        for iname in index_deps:
+            if iname not in insn_iname_to_tag:
+                # assume generating instruction's view on
+                # inames on which we don't have an opinion.
+
+                if iname in insn.iname_to_tag:
+                    insn_iname_to_tag[iname] = insn.iname_to_tag[iname]
+
+        # }}}
+
+        from loopy.kernel import Instruction
+        new_insn = Instruction(
+                id=kernel.make_unique_instruction_id(based_on=cse_tag),
+                assignee=assignee,
+                expression=new_inner_expr,
+                iname_to_tag=insn_iname_to_tag,
+                )
+
+        cse_result_insns.append(new_insn)
+
+        return new_outer_expr
+
+    from loopy.symbolic import CSECallbackMapper
+    cse_cb_mapper = CSECallbackMapper(map_cse)
+
+    new_insns = []
+    for insn in kernel.instructions:
+        was_empty = not bool(cse_result_insns)
+        new_expr = cse_cb_mapper(insn.expression)
+
+        if was_empty and cse_result_insns:
+            new_iname_to_tag = insn.iname_to_tag.copy()
+            new_iname_to_tag.update(dup_iname_to_tag)
+
+            new_insns.append(
+                    insn.copy(
+                        expression=new_expr,
+                        iname_to_tag=new_iname_to_tag))
+        else:
+            new_insns.append(insn)
+
+    new_insns.extend(cse_result_insns)
+
+    # {{{ build new domain, duplicating each constraint on duplicated inames
+
+    start_idx = kernel.domain.dim(dim_type.set)
+    new_domain = kernel.domain.insert_dims(
+            dim_type.set, start_idx,
+            len(duplicate_inames))
+    new_name_to_dim = kernel.name_to_dim.copy()
+    for i, iname in enumerate(new_inames):
+        new_idx = start_idx+i
+        new_domain = new_domain.set_dim_name(
+                dim_type.set, new_idx, iname)
+        new_name_to_dim[iname] = (dim_type.set, new_idx)
+
+    dup_iname_dims = [kernel.name_to_dim[iname]
+            for iname in duplicate_inames]
+    old_to_new = dict((old_iname, new_iname)
+            for old_iname, new_iname in zip(duplicate_inames, new_inames))
+
+    new_domain_bs, = new_domain.get_basic_sets()
+
+    for cns in new_domain_bs.get_constraints():
+        if any(cns.involves_dims(*dim+(1,)) for dim in dup_iname_dims):
+            assert not cns.is_div_constraint()
+            if cns.is_equality():
+                new_cns = cns.equality_alloc(new_domain.get_space())
+            else:
+                new_cns = cns.inequality_alloc(new_domain.get_space())
+
+            new_coeffs = {}
+            for key, val in cns.get_coefficients_by_name().iteritems():
+                if key in old_to_new:
+                    new_coeffs[old_to_new[key]] = val
+                else:
+                    new_coeffs[key] = val
+
+            new_cns = new_cns.set_coefficients_by_name(new_coeffs)
+            new_domain = new_domain.add_constraint(new_cns)
+
+    # }}}
+
+    # {{{ set up data for temp variable
+
+    temp_var_base_indices = []
+    temp_var_shape = []
+
+    for iname in new_inames:
+        lower_bound_pw_aff = new_domain.dim_min(new_name_to_dim[iname][1])
+        upper_bound_pw_aff = new_domain.dim_max(new_name_to_dim[iname][1])
+
+        from loopy.isl import static_max_of_pw_aff
+        from loopy.symbolic import pw_aff_to_expr
+
+        temp_var_shape.append(static_max_of_pw_aff(
+            upper_bound_pw_aff - lower_bound_pw_aff + 1))
+        temp_var_base_indices.append(pw_aff_to_expr(lower_bound_pw_aff))
+
+    from loopy.kernel import TemporaryVariable, ParallelTag
+    new_temporary_variables = kernel.temporary_variables + [
+            TemporaryVariable(
+                name=target_var_name,
+                dtype=dtype,
+                base_indices=temp_var_base_indices,
+                shape=temp_var_shape,
+                is_local=any(isinstance(tag, ParallelTag)
+                    for tag in dup_iname_to_tag.itervalues()))
+            ]
+
+    # }}}
+
+    return kernel.copy(
+            domain=new_domain,
+            instructions=new_insns,
+            temporary_variables=new_temporary_variables,
+            name_to_dim=new_name_to_dim)
+
+
+
+
+def realize_reduction(kernel, loop_iname, dtype, reduction_tag=None):
+    dtype = np.dtype(dtype)
+
+    new_insns = []
+    new_temporary_variables = kernel.temporary_variables[:]
+
+    def map_reduction(expr, rec):
+        sub_expr = rec(expr.expr)
+
+        if reduction_tag is not None and expr.tag != reduction_tag:
+            return
+
+        from pymbolic import var
+
+        from loopy.kernel import REGISTERED_REDUCTION_OPS
+        red_op = REGISTERED_REDUCTION_OPS[expr.operation]
+
+        target_var_name = kernel.make_unique_var_name("red_"+loop_iname,
+                extra_used_vars=set(tv.name for tv in new_temporary_variables))
+        target_var = var(target_var_name)
+
+        from loopy.kernel import Instruction
+
+        from loopy.kernel import TemporaryVariable
+        new_temporary_variables.append(
+                TemporaryVariable(
+                    name=target_var_name,
+                    dtype=dtype,
+                    shape=(),
+                    is_local=False))
+
+        init_iname_to_tag = insn.iname_to_tag.copy()
+
+        init_insn = Instruction(
+                id=kernel.make_unique_instruction_id(
+                    extra_used_ids=set(ni.id for ni in new_insns)),
+                assignee=target_var,
+                forced_iname_deps=list(insn.all_inames() - set([expr.iname])),
+                expression=red_op.neutral_element)
+
+        new_insns.append(init_insn)
+
+        from loopy.kernel import SequentialTag
+        reduction_insn = Instruction(
+                id=kernel.make_unique_instruction_id(
+                    extra_used_ids=set(ni.id for ni in new_insns)),
+                assignee=target_var,
+                expression=red_op(target_var, sub_expr),
+                forced_insn_deps=[init_insn.id],
+                use_auto_dependencies=False,
+                forced_iname_deps=list(insn.all_inames()),
+                iname_to_tag={expr.iname: SequentialTag()})
+
+        new_insns.append(reduction_insn)
+
+        new_insn_forced_insn_deps.append(reduction_insn.id)
+
+        return target_var
+
+    from loopy.symbolic import ReductionCallbackMapper
+    cb_mapper = ReductionCallbackMapper(map_reduction)
+
+    for insn in kernel.instructions:
+        new_insn_forced_insn_deps = []
+
+        new_insns.append(
+                insn.copy(
+                    expression=cb_mapper(insn.expression),
+                    forced_insn_deps=insn.forced_insn_deps
+                        + new_insn_forced_insn_deps,
+                    ))
+
+    return kernel.copy(
+            instructions=new_insns,
+            temporary_variables=new_temporary_variables)
+
+
+
+
+def get_problems(kernel, parameters, emit_warnings=True):
+    """
+    :return: *(max_severity, list of (severity, msg))*, where *severity*
+        ranges from 1-5. '5' means 'will certainly not run'.
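+
+    A hypothetical invocation (the parameter name and value are made up):
+
+        max_sev, msgs = get_problems(knl, {"n": 256})
+        if max_sev >= 5:
+            raise RuntimeError("\n".join(s for sev, s in msgs))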
+ """ + msgs = [] + + def msg(severity, s): + if emit_warnings: + from warnings import warn + from loopy import LoopyAdvisory + warn(s, LoopyAdvisory) + + msgs.append((severity, s)) + + glens = kernel.tag_type_lengths(TAG_GROUP_IDX, allow_parameters=True) + llens = kernel.tag_type_lengths(TAG_WORK_ITEM_IDX, allow_parameters=False) + + from pymbolic import evaluate + glens = evaluate(glens, parameters) + llens = evaluate(llens, parameters) + + if (max(len(glens), len(llens)) + > kernel.device.max_work_item_dimensions): + msg(5, "too many work item dimensions") + + for i in range(len(llens)): + if llens[i] > kernel.device.max_work_item_sizes[i]: + msg(5, "group axis %d too big" % i) + + from pytools import product + if product(llens) > kernel.device.max_work_group_size: + msg(5, "work group too big") + + from pyopencl.characterize import usable_local_mem_size + if kernel.local_mem_use() > usable_local_mem_size(kernel.device): + if kernel.device.local_mem_type == cl.device_local_mem_type.LOCAL: + msg(5, "using too much local memory") + else: + msg(4, "using more local memory than available--" + "possibly OK due to cache nature") + + const_arg_count = sum( + 1 for arg in kernel.args + if isinstance(arg, ArrayArg) and arg.constant_mem) + + if const_arg_count > kernel.device.max_constant_args: + msg(5, "too many constant arguments") + + max_severity = 0 + for sev, msg in msgs: + max_severity = max(sev, max_severity) + return max_severity, msgs + +# }}} + +# {{{ high-level modifiers def get_input_access_descriptors(kernel): """Return a dictionary mapping input vectors to a list of input access descriptor. An input access descriptor is a tuple (input_vec, index_expr). """ + 1/0 # broken + from loopy.symbolic import VariableIndexExpressionCollector from pytools import flatten @@ -94,6 +567,7 @@ def add_prefetch(kernel, input_access_descr, fetch_dims, loc_fetch_axes={}): :arg fetch_dims: loop dimensions indexing the input variable on which the prefetch is to be carried out. """ + 1/0 # broken if isinstance(input_access_descr, str): var_name = input_access_descr diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index 4c509d4c7..e2cc6f0f3 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -41,7 +41,7 @@ def get_bounds_constraints(set, iname, admissible_inames, allow_parameters): upper = [] equality = [] - space = bset.get_dim() + space = bset.get_space() var_dict = space.get_var_dict() iname_tp, iname_idx = var_dict[iname] @@ -64,6 +64,9 @@ def get_bounds_constraints(set, iname, admissible_inames, allow_parameters): return lower, upper, equality def solve_constraint_for_bound(cns, iname): + from warnings import warn + warn("deprecated") + from loopy.symbolic import constraint_to_expr rhs, iname_coeff = constraint_to_expr(cns, except_name=iname) @@ -94,6 +97,9 @@ def get_bounds(set, iname, admissible_inames, allow_parameters): as actual bounds. 
""" + from warnings import warn + warn("deprecated") + lower, upper, equality = get_bounds_constraints( set, iname, admissible_inames, allow_parameters) @@ -137,7 +143,7 @@ def constraint_to_code(ccm, cns): return "%s %s 0" % (ccm(constraint_to_expr(cns)), comp_op) def filter_necessary_constraints(implemented_domain, constraints): - space = implemented_domain.get_dim() + space = implemented_domain.get_space() return [cns for cns in constraints if not implemented_domain.is_subset( @@ -152,7 +158,7 @@ def generate_bounds_checks(domain, check_vars, implemented_domain): .coalesce() .get_basic_sets()) - space = domain.get_dim() + space = domain.get_space() cast_constraints = [] diff --git a/loopy/codegen/prefetch.py b/loopy/codegen/prefetch.py index defaaa81a..1b7206a48 100644 --- a/loopy/codegen/prefetch.py +++ b/loopy/codegen/prefetch.py @@ -172,12 +172,12 @@ def make_fetch_loop_nest(flnd, fetch_dim_idx, pf_dim_exprs, iname_subst_map, # {{{ build an implemented domain with an extra index variable from loopy.symbolic import eq_constraint_from_expr - idx_var_dim_idx = implemented_domain.get_dim().size(dim_type.set) + idx_var_dim_idx = implemented_domain.get_space().size(dim_type.set) impl_domain_with_index_var = implemented_domain.add_dims(dim_type.set, 1) impl_domain_with_index_var = ( impl_domain_with_index_var .set_dim_name(dim_type.set, idx_var_dim_idx, idx_var_name)) - aug_space = impl_domain_with_index_var.get_dim() + aug_space = impl_domain_with_index_var.get_space() impl_domain_with_index_var = ( impl_domain_with_index_var .intersect( @@ -220,7 +220,7 @@ def make_fetch_loop_nest(flnd, fetch_dim_idx, pf_dim_exprs, iname_subst_map, impl_domain_with_index_var .intersect( make_slab( - impl_domain_with_index_var.get_dim(), idx_var_name, + impl_domain_with_index_var.get_space(), idx_var_name, cur_index, min(dim_length, cur_index+total_realiz_size))) .project_out(dim_type.set, idx_var_dim_idx, 1)) diff --git a/loopy/isl.py b/loopy/isl.py index e31575e2d..bf7b5a064 100644 --- a/loopy/isl.py +++ b/loopy/isl.py @@ -1,3 +1,5 @@ +"""isl helpers""" + from __future__ import division import islpy as isl @@ -7,9 +9,9 @@ from islpy import dim_type -# {{{ isl helpers - def cast_constraint_to_space(cns, new_space, as_equality=None): + 1/0 # bad routine, shouldn't be used + if as_equality is None: as_equality = cns.is_equality() @@ -19,16 +21,30 @@ def cast_constraint_to_space(cns, new_space, as_equality=None): factory = isl.Constraint.ineq_from_names return factory(new_space, cns.get_coefficients_by_name()) -def block_shift_constraint(cns, iname, multiple, as_equality=None): - cns = copy_constraint(cns, as_equality=as_equality) + + + +def block_shift_constraint(cns, type, pos, multiple, as_equality=None): + if as_equality != cns.is_equality(): + if as_equality: + factory = isl.Constraint.equality_from_aff + else: + factory = isl.Constraint.inequality_from_aff + + cns = factory(cns.get_aff()) + cns = cns.set_constant(cns.get_constant() - + cns.get_coefficients_by_name()[iname]*multiple) + + cns.get_coefficient(type, pos)*multiple) return cns + + + + def negate_constraint(cns): assert not cns.is_equality() # FIXME hackety hack - my_set = (isl.BasicSet.universe(cns.get_dim()) + my_set = (isl.BasicSet.universe(cns.get_space()) .add_constraint(cns)) my_set = my_set.complement() @@ -39,9 +55,8 @@ def negate_constraint(cns): result, = results return result -def copy_constraint(cns, as_equality=None): - return cast_constraint_to_space(cns, cns.get_dim(), - as_equality=as_equality) + + def make_index_map(set, 
index_expr): from loopy.symbolic import eq_constraint_from_expr @@ -52,7 +67,7 @@ def make_index_map(set, index_expr): amap = isl.Map.from_domain(set).add_dims(dim_type.out, len(index_expr)) out_names = ["_ary_idx_%d" % i for i in range(len(index_expr))] - dim = amap.get_dim() + dim = amap.get_space() all_constraints = tuple( eq_constraint_from_expr(dim, iexpr_i) for iexpr_i in index_expr) @@ -66,6 +81,10 @@ def make_index_map(set, index_expr): return amap + + + + def make_slab(space, iname, start, stop): from loopy.symbolic import ineq_constraint_from_expr from pymbolic import var @@ -78,6 +97,30 @@ def make_slab(space, iname, start, stop): .add_constraint(ineq_constraint_from_expr( space, stop-1 - var_iname))) -# }}} + + + +def set_is_universe(set): + bs = set.get_basic_sets() + if len(bs) == 1: + return bs[0].is_universe() + else: + return isl.Set.universe_like(set).is_subset(set) + + + + +def static_max_of_pw_aff(pw_aff): + for set, aff in pw_aff.get_pieces(): + candidate_pw_aff = isl.PwAff.from_aff(aff) + + if set_is_universe(candidate_pw_aff.ge_set(pw_aff)): + return aff + + raise ValueError("a static maximum was not found for PwAff '%s'" + % pw_aff) + + + # vim: foldmethod=marker diff --git a/loopy/kernel.py b/loopy/kernel.py index 2838d09d0..da04a6545 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -4,14 +4,101 @@ from __future__ import division import numpy as np from pytools import Record, memoize_method -from pymbolic.mapper.dependency import DependencyMapper -import pyopencl as cl import islpy as isl from islpy import dim_type +from pymbolic import var + +# {{{ index tags + +class IndexTag(Record): + __slots__ = [] + + def __hash__(self): + raise RuntimeError("use .key to hash index tags") + + + + +class SequentialTag(IndexTag): + pass + +class ParallelTag(IndexTag): + pass + +class UniqueTag(IndexTag): + @property + def key(self): + return type(self) + +class ParallelTagWithAxis(ParallelTag, UniqueTag): + __slots__ = ["axis"] + + def __init__(self, axis): + Record.__init__(self, + axis=axis) + + @property + def key(self): + return (type(self), self.axis) + + def __str__(self): + return "%s.%d" % ( + self.print_name, self.axis) + +class TAG_GROUP_IDX(ParallelTagWithAxis): + print_name = "g" + +class TAG_WORK_ITEM_IDX(ParallelTagWithAxis): + print_name = "l" + +class TAG_AUTO_WORK_ITEM_IDX(ParallelTag): + def __str__(self): + return "l.auto" + +class TAG_ILP(ParallelTag): + def __str__(self): + return "ilp" + +class BaseUnrollTag(IndexTag): + pass + +class TAG_UNROLL_STATIC(BaseUnrollTag): + def __str__(self): + return "unr" + +class TAG_UNROLL_INCR(BaseUnrollTag): + def __str__(self): + return "unri" + +def parse_tag(tag): + if tag is None: + return tag + + if isinstance(tag, IndexTag): + return tag + + if not isinstance(tag, str): + raise ValueError("cannot parse tag: %s" % tag) + + if tag in ["unrs", "unr"]: + return TAG_UNROLL_STATIC() + elif tag == "unri": + return TAG_UNROLL_INCR() + elif tag == "ilp": + return TAG_ILP() + elif tag.startswith("g."): + return TAG_GROUP_IDX(int(tag[2:])) + elif tag.startswith("l."): + return TAG_WORK_ITEM_IDX(int(tag[2:])) + else: + raise ValueError("cannot parse tag: %s" % tag) + +# }}} + # {{{ arguments class ArrayArg: @@ -87,88 +174,225 @@ class ScalarArg: # }}} -# {{{ index tags +# {{{ temporary variable -class IndexTag(Record): - __slots__ = [] +class TemporaryVariable(Record): + """ + :ivar name: + :ivar dtype: + :ivar shape: + :ivar base_indices: + :ivar is_local: + """ - def __hash__(self): - raise RuntimeError("use .key to 
hash index tags") + def __init__(self, name, dtype, shape, is_local, base_indices=None): + if base_indices is None: + base_indices = (0,) * len(shape) + Record.__init__(self, name=name, dtype=dtype, shape=shape, is_local=is_local) + @property + def nbytes(self): + from pytools import product + return product(self.shape)*self.dtype.itemsize +# }}} -class ParallelTag(IndexTag): - pass +# {{{ instruction -class UniqueTag(IndexTag): - @property - def key(self): - return type(self) +class Instruction(Record): + #:ivar kernel: handle to the :class:`LoopKernel` of which this instruction + #is a part. (not yet) + """ + :ivar id: An (otherwise meaningless) identifier that is unique within + a :class:`LoopKernel`. + :ivar assignee: + :ivar expression: + :ivar use_auto_dependencies: + :ivar forced_iname_deps: a list of inames that are added to the list of iname + dependencies + :ivar forced_insn_deps: a list of ids of :class:`Instruction` instances that + *must* be executed before this one. Note that loop scheduling augments this + by adding dependencies on any writes to temporaries read by this instruction + *if* use_auto_dependencies is True. + :ivar iname_to_tag: a map from loop domain variables to subclasses + of :class:`IndexTag` + """ + def __init__(self, + id, assignee, expression, + use_auto_dependencies=True, + forced_iname_deps=[], forced_insn_deps=[], + iname_to_tag={}): -class ParallelTagWithAxis(ParallelTag, UniqueTag): - __slots__ = ["axis", "forced_length"] + # {{{ find and properly tag reduction inames + + reduction_inames = set() + + from loopy.symbolic import ReductionCallbackMapper + + def map_reduction(expr, rec): + rec(expr.expr) + reduction_inames.add(expr.iname) + + ReductionCallbackMapper(map_reduction)(expression) + + if reduction_inames: + iname_to_tag = iname_to_tag.copy() + + for iname in reduction_inames: + tag = iname_to_tag.get(iname) + if not (tag is None or isinstance(tag, SequentialTag)): + raise RuntimeError("inconsistency detected: " + "sequential/reduction iname '%s' was " + "tagged otherwise" % iname) + + iname_to_tag[iname] = SequentialTag() + + # }}} - def __init__(self, axis, forced_length=None): Record.__init__(self, - axis=axis, forced_length=forced_length) + id=id, assignee=assignee, expression=expression, + use_auto_dependencies=use_auto_dependencies, + forced_iname_deps=forced_iname_deps, + forced_insn_deps=forced_insn_deps, + iname_to_tag=dict( + (iname, parse_tag(tag)) + for iname, tag in iname_to_tag.iteritems())) - @property - def key(self): - return (type(self), self.axis) + unused_tags = set(self.iname_to_tag.iterkeys()) - self.all_inames() + if unused_tags: + raise RuntimeError("encountered tags for unused inames: " + + ", ".join(unused_tags)) - def __repr__(self): - if self.forced_length: - return "%s(%d, flen=%d)" % ( - self.print_name, self.axis, - self.forced_length) - else: - return "%s(%d)" % ( - self.print_name, self.axis) + @memoize_method + def all_inames(self): + from loopy.symbolic import VariableIndexExpressionCollector + idx_exprs = ( + VariableIndexExpressionCollector()(self.expression) + | VariableIndexExpressionCollector()(self.assignee)) + from pymbolic.mapper.dependency import DependencyMapper + index_vars = set() -class TAG_GROUP_IDX(ParallelTagWithAxis): - print_name = "GROUP_IDX" + from pymbolic.primitives import Variable + for idx_expr in idx_exprs: + for i in DependencyMapper()(idx_expr): + if isinstance(i, Variable): + index_vars.add(i.name) -class TAG_WORK_ITEM_IDX(ParallelTagWithAxis): - print_name = "WORK_ITEM_IDX" + 
-class TAG_ILP(ParallelTag):
-    def __repr__(self):
-        return "TAG_ILP"
+    @memoize_method
+    def sequential_inames(self):
+        result = set()
 
-class BaseUnrollTag(IndexTag):
-    pass
+        for iname, tag in self.iname_to_tag.iteritems():
+            if isinstance(tag, SequentialTag):
+                result.add(iname)
 
-class TAG_UNROLL_STATIC(BaseUnrollTag):
-    def __repr__(self):
-        return "TAG_UNROLL_STATIC"
+        return result
 
-class TAG_UNROLL_INCR(BaseUnrollTag):
-    def __repr__(self):
-        return "TAG_UNROLL_INCR"
+    def substitute(self, old_var, new_expr, tagged_ok=False):
+        from loopy.symbolic import SubstitutionMapper
 
-def parse_tag(tag):
-    if tag is None:
-        return tag
+        prev_tag = self.iname_to_tag.get(old_var)
+        if prev_tag is not None and not tagged_ok:
+            raise RuntimeError("cannot substitute already tagged variable '%s'"
+                    % old_var)
 
-    if isinstance(tag, IndexTag):
-        return tag
+        subst_map = {var(old_var): new_expr}
 
-    if not isinstance(tag, str):
-        raise ValueError("cannot parse tag: %s" % tag)
+        subst_mapper = SubstitutionMapper(subst_map.get)
+        return self.copy(
+                assignee=subst_mapper(self.assignee),
+                expression=subst_mapper(self.expression))
 
-    if tag in ["unrs", "unr"]:
-        return TAG_UNROLL_STATIC()
-    elif tag == "unri":
-        return TAG_UNROLL_INCR()
-    elif tag == "ilp":
-        return TAG_ILP()
-    elif tag.startswith("g."):
-        return TAG_GROUP_IDX(int(tag[2:]))
-    elif tag.startswith("l."):
-        return TAG_WORK_ITEM_IDX(int(tag[2:]))
-    else:
-        raise ValueError("cannot parse tag: %s" % tag)
+    def __str__(self):
+        loop_descrs = []
+        for iname in self.all_inames():
+            tag = self.iname_to_tag.get(iname)
+
+            if tag is None:
+                loop_descrs.append(iname)
+            else:
+                loop_descrs.append("%s: %s" % (iname, tag))
+
+        result = "%s: %s <- %s\n    [%s]" % (self.id,
+                self.assignee, self.expression, ", ".join(loop_descrs))
+
+        return result
+
+# }}}
+
+# {{{ reduction operations
+
+class ReductionOperation:
+    """
+    :ivar neutral_element:
+    """
+
+    def __call__(self, operand1, operand2):
+        raise NotImplementedError
+
+class SumReductionOperation:
+    neutral_element = 0
+
+    def __call__(self, operand1, operand2):
+        return operand1 + operand2
+
+class ProductReductionOperation:
+    neutral_element = 1
+
+    def __call__(self, operand1, operand2):
+        return operand1 * operand2
+
+class FloatingPointMaxOperation:
+    # OpenCL 1.1, section 6.11.2
+    neutral_element = -var("INFINITY")
+
+    def __call__(self, operand1, operand2):
+        from pymbolic.primitives import FunctionSymbol
+        return FunctionSymbol("max")(operand1, operand2)
+
+class FloatingPointMinOperation:
+    # OpenCL 1.1, section 6.11.2
+    neutral_element = var("INFINITY")
+
+    def __call__(self, operand1, operand2):
+        from pymbolic.primitives import FunctionSymbol
+        return FunctionSymbol("min")(operand1, operand2)
+
+
+
+
+REGISTERED_REDUCTION_OPS = {
+        "sum": SumReductionOperation(),
+        "product": ProductReductionOperation(),
+        "fp_max": FloatingPointMaxOperation(),
+        "fp_min": FloatingPointMinOperation(),
+        }
+
+
+
+
+def register_reduction(prefix, op):
+    """Register a new :class:`ReductionOperation`.
+
+    :arg prefix: the desired name of the operation
+    :return: the name actually assigned to the operation,
+        will start with *prefix*.
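+
+    For example, since "fp_max" is already taken above,
+    ``register_reduction("fp_max", my_op)`` (for some operation
+    object ``my_op``) would return "fp_max_0".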
+ """ + + from loopy.tools import generate_unique_possibilities + + for name in generate_unique_possibilities(prefix): + if name not in REGISTERED_REDUCTION_OPS: + REGISTERED_REDUCTION_OPS[name] = op + return name # }}} @@ -178,12 +402,9 @@ class LoopKernel(Record): """ :ivar device: :class:`pyopencl.Device` :ivar domain: :class:`islpy.BasicSet` - :ivar iname_to_tag: :ivar instructions: :ivar args: - :ivar prefetch: :ivar schedule: - :ivar register_prefetch: :ivar name: :ivar preamble: :ivar assumptions: the initial implemented_domain, captures assumptions @@ -191,21 +412,29 @@ class LoopKernel(Record): :ivar iname_slab_increments: a dictionary mapping inames to (lower_incr, upper_incr) tuples that will be separated out in the execution to generate 'bulk' slabs with fewer conditionals. + :ivar temporary_variables: + :ivar workgroup_size: + :ivar name_to_dim: A lookup table from inames to ISL-style + (dim_type, index) tuples """ - def __init__(self, device, domain, instructions, args=None, prefetch={}, schedule=None, - register_prefetch=None, name="loopy_kernel", - iname_to_tag={}, is_divisible=lambda dividend, divisor: False, + def __init__(self, device, domain, instructions, args=None, schedule=None, + name="loopy_kernel", preamble=None, assumptions=None, - iname_slab_increments={}): + iname_slab_increments={}, + temporary_variables=[], + workgroup_size=None, + name_to_dim=None): """ :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl. Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}" - :arg iname_to_tag: a map from loop domain variables to subclasses of :class:`IndexTag` """ def parse_if_necessary(insn): from pymbolic import parse + + if isinstance(insn, Instruction): + return insn if isinstance(insn, str): lhs, rhs = insn.split("=") elif isinstance(insn, tuple): @@ -213,22 +442,35 @@ class LoopKernel(Record): if isinstance(lhs, str): lhs = parse(lhs) + if isinstance(rhs, str): - from loopy.symbolic import CSERealizer - rhs = CSERealizer()(parse(rhs)) + from loopy.symbolic import FunctionToPrimitiveMapper + rhs = parse(rhs) + rhs = FunctionToPrimitiveMapper()(rhs) - return lhs, rhs + return Instruction( + id=self.make_unique_instruction_id(insns), + assignee=lhs, expression=rhs) if isinstance(domain, str): ctx = isl.Context() domain = isl.Set.read_from_str(ctx, domain, nparam=-1) - insns = [parse_if_necessary(insn) for insn in instructions] + if name_to_dim is None: + name_to_dim = domain.get_space().get_var_dict() + + insns = [] + for insn in instructions: + # must construct list one-by-one to facilitate unique id generation + insns.append(parse_if_necessary(insn)) + + if len(set(insn.id for insn in insns)) != len(insns): + raise RuntimeError("instruction ids do not appear to be unique") if assumptions is None: - assumptions = isl.Set.universe(domain.get_dim()) + assumptions = isl.Set.universe(domain.get_space()) elif isinstance(assumptions, str): - s = domain.get_dim() + s = domain.get_space() assumptions = isl.BasicSet.read_from_str(domain.get_ctx(), "[%s] -> {[%s]: %s}" % (",".join(s.get_name(dim_type.param, i) @@ -239,24 +481,50 @@ class LoopKernel(Record): nparam=-1) Record.__init__(self, - device=device, args=args, domain=domain, instructions=insns, - prefetch=prefetch, schedule=schedule, - register_prefetch=register_prefetch, name=name, - iname_to_tag=dict( - (iname, parse_tag(tag)) - for iname, tag in iname_to_tag.iteritems()), - is_divisible=is_divisible, + device=device, domain=domain, instructions=insns, + args=args, + schedule=schedule, + 
name=name, preamble=preamble, assumptions=assumptions, - iname_slab_increments=iname_slab_increments) + iname_slab_increments=iname_slab_increments, + temporary_variables=temporary_variables, + workgroup_size=workgroup_size, + name_to_dim=name_to_dim) + + def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=set()): + if insns is None: + insns = self.instructions + + used_ids = set(insn.id for insn in insns) | extra_used_ids + + from loopy.tools import generate_unique_possibilities + for id_str in generate_unique_possibilities(based_on): + if id_str not in used_ids: + return id_str + + def make_unique_var_name(self, based_on="var", extra_used_vars=set()): + used_vars = ( + set(lv.name for lv in self.temporary_variables) + | set(arg.name for arg in self.args) + | set(self.name_to_dim.keys()) + | extra_used_vars) + + from loopy.tools import generate_unique_possibilities + for var_name in generate_unique_possibilities(based_on): + if var_name not in used_vars: + return var_name - # FIXME: assert empty intersection of loop vars and args - # FIXME: assert that isl knows about loop parameters + @property + @memoize_method + def dim_to_name(self): + from pytools import reverse_dict + return reverse_dict(self.name_to_dim) @property @memoize_method def space(self): - return self.domain.get_dim() + return self.domain.get_space() @property @memoize_method @@ -287,22 +555,6 @@ class LoopKernel(Record): from islpy import dim_type return set(self.space.get_var_dict(dim_type.set).iterkeys()) - @memoize_method - def output_inames(self): - dm = DependencyMapper(include_subscripts=False) - - output_indices = set() - for lvalue, expr in self.instructions: - output_indices.update( - set(v.name for v in dm(lvalue)) - & self.all_inames()) - - return output_indices - set(arg.name for arg in self.args) - - @memoize_method - def reduction_inames(self): - return self.all_inames() - self.output_inames() - def inames_by_tag_type(self, tag_type): return [iname for iname in self.all_inames() if isinstance(self.iname_to_tag.get(iname), tag_type)] @@ -360,216 +612,18 @@ class LoopKernel(Record): return s def local_mem_use(self): - return sum(pf.nbytes for pf in self.prefetch.itervalues()) - - @memoize_method - def input_vectors(self): - dm = DependencyMapper(include_subscripts=False) - - input_vectors = set() - for lvalue, expr in self.instructions: - input_vectors.update( - set(v.name for v in dm(expr))) - - return input_vectors - self.all_inames() - set(self.scalar_loop_args) - - @memoize_method - def output_vectors(self): - dm = DependencyMapper(include_subscripts=False) - - output_vectors = set() - for lvalue, expr in self.instructions: - output_vectors.update( - set(v.name for v in dm(lvalue))) - - return output_vectors - self.all_inames() - self.scalar_loop_args - - def _subst_insns(self, old_var, new_expr): - from pymbolic.mapper.substitutor import substitute - - subst_map = {old_var: new_expr} - - return [(substitute(lvalue, subst_map), - substitute(expr, subst_map)) - for lvalue, expr in self.instructions] - - def is_prefetch_variable(self, varname): - if self.prefetch: - for pf in self.prefetch.itervalues(): - for pfdim in pf.dims: - if pfdim.name == varname: - return True - - return False - - def _subst_prefetch(self, old_var, new_expr): - # FIXME delete me - from pymbolic.mapper.substitutor import substitute - subst_map = {old_var: new_expr} - - result = {} - for pf in self.prefetch.itervalues(): - for pfdim in pf.dims: - if pfdim.name == old_var: - raise RuntimeError("can't 
substitute prefetch dimension %s" - % old_var) - - new_pf = pf.copy(index_expr=substitute(pf.index_expr, subst_map)) - result[pf.input_vector, new_pf.index_expr] = new_pf - - return result + return sum(lv.nbytes for lv in self.temporary_variables + if lv.is_local) def substitute(self, old_var, new_expr): - copy = self.copy(instructions=self._subst_insns(old_var, new_expr)) - if self.prefetch: - raise RuntimeError("cannot substitute-prefetches already generated") - #copy.prefetch = self._subst_prefetch(old_var, new_expr) - if self.schedule is not None: raise RuntimeError("cannot substitute-schedule already generated") - return copy - - def split_dimension(self, name, inner_length, padded_length=None, - outer_name=None, inner_name=None, - outer_tag=None, inner_tag=None, - outer_slab_increments=(0, -1), no_slabs=None): - - if name not in self.all_inames(): - raise ValueError("cannot split loop for unknown variable '%s'" % name) - - if no_slabs: - outer_slab_increments = (0, 0) - - outer_tag = parse_tag(outer_tag) - inner_tag = parse_tag(inner_tag) - - if self.iname_to_tag.get(name) is not None: - raise RuntimeError("cannot split tagged dimension '%s'" % name) - - # {{{ check for repeated unique tag keys - - new_tag_keys = set(tag.key - for tag in [outer_tag, inner_tag] - if tag is not None - if isinstance(tag, UniqueTag)) - - repeated_tag_keys = new_tag_keys & set( - tag.key for tag in self.iname_to_tag.itervalues() - if isinstance(tag, UniqueTag)) - - if repeated_tag_keys: - raise RuntimeError("repeated tag(s): %s" % repeated_tag_keys) - - # }}} - - if padded_length is not None: - inner_tag = inner_tag.copy(forced_length=padded_length) - - if outer_name is None: - outer_name = name+"_outer" - if inner_name is None: - inner_name = name+"_inner" - - new_iname_to_tag = self.iname_to_tag.copy() - if inner_tag is not None: - new_iname_to_tag[inner_name] = inner_tag - if outer_tag is not None: - new_iname_to_tag[outer_name] = outer_tag - - from islpy import dim_type - outer_var_nr = self.space.size(dim_type.set) - inner_var_nr = self.space.size(dim_type.set)+1 - - def process_set(s): - s = s.add_dims(dim_type.set, 2) - s.set_dim_name(dim_type.set, outer_var_nr, outer_name) - s.set_dim_name(dim_type.set, inner_var_nr, inner_name) - - from loopy.isl import make_slab - - space = s.get_dim() - inner_constraint_set = ( - make_slab(space, inner_name, 0, inner_length) - # name = inner + length*outer - .add_constraint(isl.Constraint.eq_from_names( - space, {name:1, inner_name: -1, outer_name:-inner_length}))) - - name_dim_type, name_idx = space.get_var_dict()[name] - return (s - .intersect(inner_constraint_set) - .project_out(name_dim_type, name_idx, 1)) - - new_domain = process_set(self.domain) - new_assumptions = process_set(self.assumptions) - - from pymbolic import var - inner = var(inner_name) - outer = var(outer_name) - new_loop_index = inner + outer*inner_length - - iname_slab_increments = self.iname_slab_increments.copy() - iname_slab_increments[outer_name] = outer_slab_increments - return (self - .substitute(name, new_loop_index) - .copy(domain=new_domain, iname_to_tag=new_iname_to_tag, - assumptions=new_assumptions, - iname_slab_increments=iname_slab_increments)) - - def get_problems(self, parameters, emit_warnings=True): - """ - :return: *(max_severity, list of (severity, msg))*, where *severity* ranges from 1-5. - '5' means 'will certainly not run'. 
- """ - msgs = [] - - def msg(severity, s): - if emit_warnings: - from warnings import warn - from loopy import LoopyAdvisory - warn(s, LoopyAdvisory) - - msgs.append((severity, s)) - - glens = self.tag_type_lengths(TAG_GROUP_IDX, allow_parameters=True) - llens = self.tag_type_lengths(TAG_WORK_ITEM_IDX, allow_parameters=False) - - from pymbolic import evaluate - glens = evaluate(glens, parameters) - llens = evaluate(llens, parameters) - - if (max(len(glens), len(llens)) - > self.device.max_work_item_dimensions): - msg(5, "too many work item dimensions") - - for i in range(len(llens)): - if llens[i] > self.device.max_work_item_sizes[i]: - msg(5, "group axis %d too big" % i) - - from pytools import product - if product(llens) > self.device.max_work_group_size: - msg(5, "work group too big") - - from pyopencl.characterize import usable_local_mem_size - if self.local_mem_use() > usable_local_mem_size(self.device): - if self.device.local_mem_type == cl.device_local_mem_type.LOCAL: - msg(5, "using too much local memory") - else: - msg(4, "using more local memory than available--" - "possibly OK due to cache nature") - - const_arg_count = sum( - 1 for arg in self.args - if isinstance(arg, ArrayArg) and arg.constant_mem) - - if const_arg_count > self.device.max_constant_args: - msg(5, "too many constant arguments") - - max_severity = 0 - for sev, msg in msgs: - max_severity = max(sev, max_severity) - return max_severity, msgs + return self.copy(instructions=[ + insn.substitute(old_var, new_expr) + for insn in self.instructions]) # }}} + # vim: foldmethod=marker diff --git a/loopy/schedule.py b/loopy/schedule.py index ee5cab82a..08729bbe5 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -7,17 +7,113 @@ from pytools import Record # {{{ schedule items -class ScheduledLoop(Record): +class EnterLoop(Record): __slots__ = ["iname"] -class WriteOutput(Record): - pass +class LeaveLoop(Record): + __slots__ = [] -# plus loopy.prefetch.{Register,LocalMemory}Prefetch +class RunInstruction(Record): + __slots__ = ["id"] + +class Barrier(Record): + __slots__ = [] # }}} -def generate_loop_schedules(kernel, hints=[]): + + + +def fix_grid_sizes(kernel): + from warnings import warn + warn("fix_grid_sizes is unimplemented") + return kernel + + + + +def generate_loop_dep_graph(kernel): + """ + :return: a dict mapping an iname to the ones that need to be entered + before it. 
+ """ + result = {} + + print "------------------------------------------------------" + for i, insn_a in enumerate(kernel.instructions): + print i, insn_a + print insn_a.all_inames() + + print "------------------------------------------------------" + all_inames = kernel.all_inames() + for i_a, insn_a in enumerate(kernel.instructions): + for i_b, insn_b in enumerate(kernel.instructions): + if i_a == i_b: + continue + + a = insn_a.all_inames() + b = insn_b.all_inames() + intersection = a & b + sym_difference = (a|b) - intersection + + print i_a, i_b, intersection, sym_difference + if a <= b or b <= a: + for sd in sym_difference: + result.setdefault(sd, set()).update(intersection) + + print "------------------------------------------------------" + return result + + + + +def generate_loop_schedules_internal(kernel, entered_loops=[]): + scheduled_insn_ids = set(sched_item.id for sched_item in kernel.schedule + if isinstance(sched_item, RunInstruction)) + + all_inames = kernel.all_inames() + + + + +def generate_loop_schedules(kernel): + # {{{ check that all CSEs and reductions are realized + + from loopy.symbolic import CSECallbackMapper, ReductionCallbackMapper + + def map_reduction(expr, rec): + raise RuntimeError("all reductions must be realized before scheduling") + + def map_cse(expr, rec): + raise RuntimeError("all CSEs must be realized before scheduling") + + for insn in kernel.instructions: + ReductionCallbackMapper(map_reduction)(insn.expression) + CSECallbackMapper(map_cse)(insn.expression) + + # }}} + + kernel = fix_grid_sizes(kernel) + + if 0: + loop_dep_graph = generate_loop_dep_graph(kernel) + for k, v in loop_dep_graph.iteritems(): + print "%s: %s" % (k, ",".join(v)) + 1/0 + + #grid_size, group_size = find_known_grid_and_group_sizes(kernel) + + #kernel = assign_grid_and_group_indices(kernel) + + for gen_knl in generate_loop_schedules_internal(kernel): + yield gen_knl + + + + + +def generate_loop_schedules_old(kernel, hints=[]): + # OLD! 
from loopy.kernel import TAG_GROUP_IDX, TAG_WORK_ITEM_IDX, TAG_ILP, ParallelTag prev_schedule = kernel.schedule diff --git a/loopy/symbolic.py b/loopy/symbolic.py index b928d4b31..69637268c 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -2,33 +2,157 @@ from __future__ import division -from pymbolic.mapper import CombineMapper, RecursiveMapper, IdentityMapper +from pymbolic.primitives import AlgebraicLeaf +from pymbolic.mapper import ( + CombineMapper as CombineMapperBase, + IdentityMapper as IdentityMapperBase, + RecursiveMapper) from pymbolic.mapper.c_code import CCodeMapper from pymbolic.mapper.stringifier import PREC_NONE +from pymbolic.mapper.substitutor import \ + SubstitutionMapper as SubstitutionMapperBase +from pymbolic.mapper.stringifier import \ + StringifyMapper as StringifyMapperBase import numpy as np import islpy as isl +from islpy import dim_type +# {{{ loopy-specific primitives -# {{{ subscript expression collector +class Reduction(AlgebraicLeaf): + def __init__(self, operation, iname, expr, tag=None): + self.operation = operation + self.iname = iname + self.expr = expr + self.tag = tag -class AllSubscriptExpressionCollector(CombineMapper): - def combine(self, values): - from pytools import flatten - return set(flatten(values)) + def __getinitargs__(self): + return (self.operation, self.iname, self.expr, self.tag) - def map_constant(self, expr): - return set() + def get_hash(self): + return hash((self.__class__, self.operation, self.iname, + self.expr, self.tag)) - def map_algebraic_leaf(self, expr): - return set() + def is_equal(self, other): + return (other.__class__ == self.__class__ + and other.operation == self.operation + and other.iname == self.iname + and other.expr == self.expr + and other.tag == self.tag) - def map_subscript(self, expr): + def stringifier(self): + return StringifyMapper + + mapper_method = intern("map_reduction") + +# }}} + +# {{{ mappers with support for loopy-specific primitives + +class IdentityMapperMixin(object): + def map_reduction(self, expr): + return Reduction(expr.operation, expr.iname, + self.rec(expr.expr), expr.tag) + +class IdentityMapper(IdentityMapperBase, IdentityMapperMixin): + pass + +class CombineMapper(CombineMapperBase): + def map_reduction(self, expr): + return self.rec(expr.expr) + +class SubstitutionMapper(SubstitutionMapperBase, IdentityMapperMixin): + pass + +class StringifyMapper(StringifyMapperBase): + def map_reduction(self, expr, prec): + return "reduce(%s, %s, %s, tag=%s)" % ( + expr.operation, expr.iname, expr.expr, expr.tag) + +# }}} + +# {{{ functions to primitives + +class FunctionToPrimitiveMapper(IdentityMapper): + """Looks for invocations of a function called 'cse' or 'reduce' and + turns those into the actual pymbolic primitives used for that. 
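+
+    For example, ``reduce(sum, k, a[i,k]*b[k,j], mytag)`` becomes
+    ``Reduction("sum", "k", a[i,k]*b[k,j], "mytag")``.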
+ """ + + def map_call(self, expr): from pymbolic.primitives import Variable - assert isinstance(expr.aggregate, Variable) + if isinstance(expr.function, Variable) and expr.function.name == "cse": + from pymbolic.primitives import CommonSubexpression + if len(expr.parameters) == 2: + if not isinstance(expr.parameters[1], Variable): + raise TypeError("second argument to cse() must be a symbol") + return CommonSubexpression( + expr.parameters[0], expr.parameters[1].name) + else: + raise TypeError("cse takes two arguments") + + elif isinstance(expr.function, Variable) and expr.function.name == "reduce": + if len(expr.parameters) == 3: + operation, iname, red_expr = expr.parameters + tag = None + elif len(expr.parameters) == 4: + operation, iname, red_expr, tag = expr.parameters + else: + raise TypeError("reduce takes three or four arguments") + + red_expr = self.rec(red_expr) + + if not isinstance(operation, Variable): + raise TypeError("operation argument to reduce() must be a symbol") + operation = operation.name + if not isinstance(iname, Variable): + raise TypeError("iname argument to reduce() must be a symbol") + iname = iname.name + if tag is not None: + if not isinstance(tag, Variable): + raise TypeError("tag argument to reduce() must be a symbol") + tag = tag.name + + return Reduction(operation, iname, red_expr, tag) + else: + return IdentityMapper.map_call(self, expr) - return set([expr]) +# }}} + +# {{{ reduction loop splitter + +class ReductionLoopSplitter(IdentityMapper): + def __init__(self, old_iname, outer_iname, inner_iname, do_warn=True): + self.old_iname = old_iname + self.outer_iname = outer_iname + self.inner_iname = inner_iname + self.do_warn = do_warn + + def map_reduction(self, expr): + if expr.iname == self.old_iname: + if self.do_warn: + from warnings import warn + from loopy import LoopyAdvisory + warn("The reduction loop for iname '%s' (with tag '%s') " + "was split into two nested reductions with separate " + "state variables. It might be advantageous to " + "'realize' the reduction first before splitting " + "this loop." 
+                        % (expr.iname, expr.tag), LoopyAdvisory)
+
+            if expr.tag is not None:
+                outer_tag = expr.tag + "_outer"
+                inner_tag = expr.tag + "_inner"
+            else:
+                outer_tag = None
+                inner_tag = None
+
+            return Reduction(expr.operation, self.outer_iname,
+                    Reduction(expr.operation, self.inner_iname,
+                        expr.expr, inner_tag),
+                    outer_tag)
+        else:
+            return IdentityMapper.map_reduction(self, expr)
 
 # }}}
 
@@ -93,7 +217,7 @@ class CoefficientCollector(RecursiveMapper):
 # {{{ variable index expression collector
 
 class VariableIndexExpressionCollector(CombineMapper):
-    def __init__(self, tgt_vector_name):
+    def __init__(self, tgt_vector_name=None):
         self.tgt_vector_name = tgt_vector_name
 
@@ -110,8 +234,8 @@
         from pymbolic.primitives import Variable
         assert isinstance(expr.aggregate, Variable)
 
-        if expr.aggregate.name == self.tgt_vector_name:
-            return set([expr.index])
+        if self.tgt_vector_name is None or expr.aggregate.name == self.tgt_vector_name:
+            return set([expr.index]) | self.rec(expr.index)
         else:
             return CombineMapper.map_subscript(self, expr)
 
@@ -246,62 +370,113 @@ class LoopyCCodeMapper(CCodeMapper):
 
 # }}}
 
-# {{{ expression <-> constraint conversion
+# {{{ aff -> expr conversion
+
+def aff_to_expr(aff):
+    from pymbolic import var
 
-def _constraint_from_expr(space, expr, constraint_factory):
-    from loopy.symbolic import CoefficientCollector
-    return constraint_factory(space,
-            CoefficientCollector()(expr))
+    result = int(aff.get_constant())
+    for dt in [dim_type.in_, dim_type.param]:
+        for i in xrange(aff.dim(dt)):
+            coeff = int(aff.get_coefficient(dt, i))
+            if coeff:
+                result += coeff*var(aff.get_dim_name(dt, i))
+
+    for i in xrange(aff.dim(dim_type.div)):
+        coeff = int(aff.get_coefficient(dim_type.div, i))
+        if coeff:
+            result += coeff*aff_to_expr(aff.get_div(i))
+
+    denom = aff.get_denominator()
+    if denom == 1:
+        return result
+    else:
+        return result // denom
+
+
+
+
+def pw_aff_to_expr(pw_aff):
+    pieces = pw_aff.get_pieces()
+
+    if len(pieces) != 1:
+        raise NotImplementedError("pw_aff_to_expr for multi-piece PwAff instances")
+
+    (set, aff), = pieces
+    return aff_to_expr(aff)
+
+def aff_from_expr(space, expr):
+    n = space.dim(dim_type.set)
+
+    zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(space))
+    context = {}
+    for name, (dt, pos) in space.get_var_dict().iteritems():
+        if dt == dim_type.set:
+            dt = dim_type.in_
+
+        context[name] = zero.set_coefficient(dt, pos, 1)
+
+    from pymbolic import evaluate
+    return evaluate(expr, context)
+
+# }}}
+
+# {{{ expression <-> constraint conversion
 
 def eq_constraint_from_expr(space, expr):
-    return _constraint_from_expr(
-            space, expr, isl.Constraint.eq_from_names)
+    return isl.Constraint.equality_from_aff(aff_from_expr(space, expr))
 
 def ineq_constraint_from_expr(space, expr):
-    return _constraint_from_expr(
-            space, expr, isl.Constraint.ineq_from_names)
+    return isl.Constraint.inequality_from_aff(aff_from_expr(space, expr))
 
 def constraint_to_expr(cns, except_name=None):
-    excepted_coeff = 0
-    result = 0
-    from pymbolic import var
-    for var_name, coeff in cns.get_coefficients_by_name().iteritems():
-        if isinstance(var_name, str):
-            if var_name == except_name:
-                excepted_coeff = int(coeff)
-            else:
-                result += int(coeff)*var(var_name)
-        else:
-            result += int(coeff)
+    return aff_to_expr(cns.get_aff())
 
-    if except_name is not None:
-        return result, excepted_coeff
-    else:
+
+# }}}
+
+# {{{ CSE callback mapper
+
+class CSECallbackMapper(IdentityMapper):
+    def __init__(self,
callback): + self.callback = callback + + def map_common_subexpression(self, expr): + result = self.callback(expr, self.rec) + if result is None: + return IdentityMapper.map_common_subexpression(self, expr) return result # }}} -# {{{ CSE "realizer" +# {{{ Reduction callback mapper -class CSERealizer(IdentityMapper): - """Looks for invocations of a function called 'cse' and turns those into - CommonSubexpression objects. - """ +class ReductionCallbackMapper(IdentityMapper): + def __init__(self, callback): + self.callback = callback - def map_call(self, expr): - from pymbolic import Variable - if isinstance(expr.function, Variable) and expr.function.name == "cse": - from pymbolic.primitives import CommonSubexpression - if len(expr.parameters) == 1: - return CommonSubexpression(expr.parameters[0]) - else: - raise TypeError("cse takes exactly one argument") - else: - return IdentityMapper.map_call(self, expr) + def map_reduction(self, expr): + result = self.callback(expr, self.rec) + if result is None: + return IdentityMapper.map_reduction(self, expr) + return result + +# }}} + +# {{{ index dependency finding + +class IndexVariableFinder(CombineMapper): + def combine(self, values): + import operator + return reduce(operator.or_, values, set()) + + def map_subscript(self, expr): + from pymbolic.mapper.dependency import DependencyMapper + return DependencyMapper()(expr) # }}} + # vim: foldmethod=marker diff --git a/loopy/tools.py b/loopy/tools.py new file mode 100644 index 000000000..95d4e4e41 --- /dev/null +++ b/loopy/tools.py @@ -0,0 +1,13 @@ +from __future__ import division + + + + +def generate_unique_possibilities(prefix): + yield prefix + + try_num = 0 + while True: + yield "%s_%d" % (prefix, try_num) + try_num += 1 + diff --git a/test/test_isl.py b/test/test_isl.py new file mode 100644 index 000000000..c6925355a --- /dev/null +++ b/test/test_isl.py @@ -0,0 +1,27 @@ +import islpy as isl + + + + +def test_aff_to_expr(): + s = isl.Space.create_from_names(isl.Context(), ["a", "b"]) + zero = isl.Aff.zero_on_domain(isl.LocalSpace.from_space(s)) + one = zero.set_constant(1) + a = zero.set_coefficient(isl.dim_type.in_, 0, 1) + b = zero.set_coefficient(isl.dim_type.in_, 1, 1) + + x = (5*a + 3*b) % 17 % 5 + print x + from loopy.symbolic import aff_to_expr + print aff_to_expr(x) + + + + +if __name__ == "__main__": + import sys + if len(sys.argv) > 1: + exec(sys.argv[1]) + else: + from py.test.cmdline import main + main([__file__]) diff --git a/test/test_matmul.py b/test/test_matmul.py index 0ea8b86f8..b58edc582 100644 --- a/test/test_matmul.py +++ b/test/test_matmul.py @@ -201,14 +201,15 @@ def test_plain_matrix_mul_new_ui(ctx_factory): n = get_suitable_size(ctx) knl = lp.LoopKernel(ctx.devices[0], - "{[i,j,k]: 0<=i,j,k<%d}" % n, + "[n] -> {[i,j,k]: 0<=i,j,k<n}", [ - "c[i, j] = reduce(sum, k, cse(a[i, k], 'lhsmat')*cse(b[k, j], 'rhsmat'))" + "c[i, j] = reduce(sum, k, cse(a[i, k], lhsmat)*cse(b[k, j], rhsmat))" ], [ lp.ArrayArg("a", dtype, shape=(n, n), order=order), lp.ArrayArg("b", dtype, shape=(n, n), order=order), lp.ArrayArg("c", dtype, shape=(n, n), order=order), + lp.ScalarArg("n", np.int32, approximately=n), ], name="matmul") @@ -216,13 +217,24 @@ def test_plain_matrix_mul_new_ui(ctx_factory): outer_tag="g.0", inner_tag="l.1", no_slabs=True) knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0", no_slabs=True) + for insn in knl.instructions: + print insn + print + knl = lp.realize_reduction(knl, "k", dtype) + for insn in knl.instructions: + print insn + print knl = 
lp.split_dimension(knl, "k", 16, no_slabs=True) - #knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner"]) - #knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner", ]) - assert knl.get_problems({})[0] <= 2 - kernel_gen = (lp.insert_register_prefetches(knl) - for knl in lp.generate_loop_schedules(knl)) + knl = lp.realize_cse(knl, "lhsmat", dtype, ["k_inner", "i_inner"]) + knl = lp.realize_cse(knl, "rhsmat", dtype, ["j_inner", "k_inner"]) + + for insn in knl.instructions: + print insn + + #assert lp.get_problems(knl, {})[0] <= 2 + + kernel_gen = lp.generate_loop_schedules(knl) a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) -- GitLab