diff --git a/examples/quadrature.py b/examples/quadrature.py index 9bf4e6f21f6b1ecd5b234dd31a0864dba372f09a..c39d6c99ab7461ffe8dfe63f08ded986f8b45baa 100644 --- a/examples/quadrature.py +++ b/examples/quadrature.py @@ -34,17 +34,14 @@ def build_mass_mat_maker(ctx_factory=cl.create_some_context): Nb = 3 Nv = 3 Nq = 3*3 - Nc = 1600 - from pymbolic import var - m, w, det_j, phi, c, i, j, q = [var(s) for s in "m w det_j phi c i j q".split()] knl = lp.LoopKernel(ctx.devices[0], "[ncells] -> {[c,i,j,q]: 0<=c<ncells and 0 <= i < %(Nv)s " "and 0<=j<%(Nb)s and 0<=q<%(Nq)s}" % dict( Nv=Nv, Nb=Nb, Nq=Nq), [ - (m[c, i, j], w[q]*det_j[c]*phi[i,q]*phi[j,q]) + "m[c,i,j] = w[q]*det_j[c]*phi[i,q]*phi[j,q]", ], [ lp.ArrayArg("m", dtype, shape=(Nc, Nv, Nb)), @@ -54,17 +51,16 @@ def build_mass_mat_maker(ctx_factory=cl.create_some_context): lp.ScalarArg("ncells", np.int32, approximately=1000), ], name="mass_mat", - iname_to_tag=dict(i="l.0", j="l.1") + iname_to_tag=dict(i="l.0", j="l.1"), + assumptions="ncells >= 1" ) - knl = lp.split_dimension(knl, "c", 8, outer_tag="g.0", inner_tag="l.2") - knl = lp.add_prefetch(knl, "det_j", ["c_inner"]) + knl = lp.split_dimension(knl, "c", 8, inner_tag="l.2", + outer_slab_increments=(0,0)) + knl = lp.split_dimension(knl, "c_outer", 8, outer_tag="g.0", + outer_slab_increments=(0,0)) # fix reg prefetch - # fix redundant slab generation - - # FIXME - #knl = lp.split_dimension(knl, "c", 8, inner_tag="l.2") - #knl = lp.split_dimension(knl, "c_outer", 8, outer_tag="g.0") + knl = lp.add_prefetch(knl, "det_j", ["c_inner"]) #ilp = 4 #knl = lp.split_dimension(knl, "i", 2, outer_tag="g.0", inner_tag="l.1") @@ -77,7 +73,8 @@ def build_mass_mat_maker(ctx_factory=cl.create_some_context): #knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"]) #assert knl.get_problems({})[0] <= 2 - kernel_gen = (lp.insert_register_prefetches(knl) + kernel_gen = (knl + #kernel_gen = (lp.insert_register_prefetches(knl) for knl in lp.generate_loop_schedules(knl)) if False: diff --git a/loopy/__init__.py b/loopy/__init__.py index 147efabf99ab20e57ee88c19cfc1bc06c9f96653..46bc404e5ee3230429e64db6e68aefb2d67973ac 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -21,24 +21,23 @@ register_mpz_with_pymbolic() # TODO: Custom reductions per red. axis # TODO: Functions # TODO: Common subexpressions -# TODO: Parse ops from string +# TODO: Array common subexpressions # FIXME: support non-reductive dimensions (what did I mean here?) # FIXME: write names should be assigned during scheduling +# FIXME: screwy lower bounds in ILP # TODO: Divisibility # TODO: Try, fix indirect addressing -# TODO: More user control for slab opt # TODO: Implement GT200 matmul, Fermi matmul, DG # TODO: DMA engine threads? -# TODO: Specify initial implemented domain. -# (to filter away unnecessary conditions on parameters) # TODO: Deal with equalities that crop up. +# TODO: Better user feedback. # Later: # ------ # TODO: Try different kernels -# TODO: - Tricky: Convolution, FD +# TODO: - Tricky: Convolution, Stencil # TODO: Separate all-bulk from non-bulk kernels. (maybe?) (#ifdef?) # TODO: implement efficient ceil_div? (as opposed to floor_div) # TODO: why are corner cases inefficient? diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index e8f10a1a8d741c0e875a55e8c6ccce59305260f7..6b9cb966fbd2aafbe94617e16edf6626e773def3 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -2,7 +2,6 @@ from __future__ import division from pytools import Record import numpy as np -from pymbolic.mapper.stringifier import PREC_NONE @@ -95,50 +94,56 @@ def add_comment(cmt, code): # {{{ main code generation entrypoint -class CodeGenerationState(Record): - __slots__ = ["c_code_mapper", "try_slab_partition"] +class ExecutionSubdomain(Record): + __slots__ = ["implemented_domain", "c_code_mapper"] + + def __init__(self, implemented_domain, c_code_mapper): + Record.__init__(self, + implemented_domain=implemented_domain, + c_code_mapper=c_code_mapper) + + def intersect(self, set): + return ExecutionSubdomain( + self.implemented_domain.intersect(set), + self.c_code_mapper) class ExecutionDomain(object): - def __init__(self, implemented_domain, assignments_and_impl_domains=None): + def __init__(self, implemented_domain, c_code_mapper, subdomains=None): """ :param implemented_domain: The entire implemented domain, i.e. all constraints that have been enforced so far. - :param assignments_and_impl_domains: a list of tuples - (assignments, implemented_domain), where *assignments* - is a list of :class:`cgen.Initializer` instances - and *implemented_domain* is the implemented domain to which - the situation produced by the assignments corresponds. + :param subdomains: a list of :class:`ExecutionSubdomain` + instances. - The point of this being is a list is the implementation of + The point of this being a list is the implementation of ILP, and each entry represents a 'fake-parallel' trip through the - ILP'd loop. + ILP'd loop, with the requisite implemented_domain + and a C code mapper that realizes the necessary assignments. + :param c_code_mapper: A C code mapper that does not take per-ILP + assignments into account. """ - if assignments_and_impl_domains is None: - assignments_and_impl_domains = [([], implemented_domain)] self.implemented_domain = implemented_domain - self.assignments_and_impl_domains = assignments_and_impl_domains - - def __len__(self): - return len(self.assignments_and_impl_domains) + if subdomains is None: + self.subdomains = [ + ExecutionSubdomain(implemented_domain, c_code_mapper)] + else: + self.subdomains = subdomains - def __iter__(self): - return iter(self.assignments_and_impl_domains) + self.c_code_mapper = c_code_mapper def intersect(self, set): return ExecutionDomain( self.implemented_domain.intersect(set), - [(assignments, implemented_domain.intersect(set)) - for assignments, implemented_domain - in self.assignments_and_impl_domains]) + self.c_code_mapper, + [subd.intersect(set) for subd in self.subdomains]) def get_the_one_domain(self): - assert len(self.assignments_and_impl_domains) == 1 + assert len(self.subdomains) == 1 return self.implemented_domain - def generate_code(kernel): from loopy.codegen.prefetch import preprocess_prefetch kernel = preprocess_prefetch(kernel) @@ -151,10 +156,7 @@ def generate_code(kernel): CLLocal, CLImage, CLConstant) from loopy.symbolic import LoopyCCodeMapper - my_ccm = LoopyCCodeMapper(kernel) - - def ccm(expr, prec=PREC_NONE): - return my_ccm(expr, prec) + ccm = LoopyCCodeMapper(kernel) # {{{ build top-level @@ -266,13 +268,10 @@ def generate_code(kernel): # }}} - cgs = CodeGenerationState(c_code_mapper=ccm, try_slab_partition=True) - from loopy.codegen.dispatch import build_loop_nest - import islpy as isl - gen_code = build_loop_nest(cgs, kernel, 0, - ExecutionDomain(isl.Set.universe(kernel.space))) + gen_code = build_loop_nest(kernel, 0, + ExecutionDomain( kernel.assumptions, c_code_mapper=ccm)) body.extend([Line(), gen_code.ast]) #print "# conditionals: %d" % gen_code.num_conditionals diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index e602e951e2994a63b4de060d6c14e96ef17f9814..4c509d4c729b6ab44fdbc980e038a7f2745a835b 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -2,7 +2,7 @@ from __future__ import division import islpy as isl from islpy import dim_type -from pymbolic.mapper.stringifier import PREC_NONE +import numpy as np @@ -192,45 +192,61 @@ def wrap_in_for_from_constraints(ccm, iname, constraint_bset, stmt): cfm = CommutativeConstantFoldingMapper() from loopy.symbolic import constraint_to_expr - from pytools import any - if any(cns.is_equality() for cns in constraints): - raise NotImplementedError("equality constraints for 'for' loops") - else: - start_exprs = [] - end_conds = [] - - for cns in constraints: - rhs, iname_coeff = constraint_to_expr(cns, except_name=iname) - - if iname_coeff == 0: - continue - elif iname_coeff < 0: - from pymbolic import var - rhs += iname_coeff*var(iname) - end_conds.append("%s >= 0" % - ccm(cfm(expand(rhs)))) - else: # iname_coeff > 0 - kind, bound = solve_constraint_for_bound(cns, iname) - assert kind == ">=" - start_exprs.append(bound) - - if len(start_exprs) > 1: - from pymbolic.primitives import Max - start_expr = Max(start_exprs) - elif len(start_exprs) == 1: - start_expr, = start_exprs - else: - raise RuntimeError("no starting value found for 'for' loop in '%s'" - % iname) + start_exprs = [] + end_conds = [] + equality_exprs = [] - from cgen import For - from loopy.codegen import wrap_in - return wrap_in(For, - "int %s = %s" % (iname, ccm(start_expr, PREC_NONE)), - " && ".join(end_conds), - "++%s" % iname, - stmt) + for cns in constraints: + rhs, iname_coeff = constraint_to_expr(cns, except_name=iname) + + if iname_coeff == 0: + continue + + if cns.is_equality(): + kind, bound = solve_constraint_for_bound(cns, iname) + assert kind == "==" + equality_exprs.append(bound) + elif iname_coeff < 0: + from pymbolic import var + rhs += iname_coeff*var(iname) + end_conds.append("%s >= 0" % + ccm(cfm(expand(rhs)))) + else: # iname_coeff > 0 + kind, bound = solve_constraint_for_bound(cns, iname) + assert kind == ">=" + start_exprs.append(bound) + + if equality_exprs: + assert len(equality_exprs) == 1 + + equality_expr, = equality_exprs + + from loopy.codegen import gen_code_block + from cgen import Initializer, POD, Const, Line + return gen_code_block([ + Initializer(Const(POD(np.int32, iname)), + ccm(equality_expr)), + Line(), + stmt, + ]) + else: + if len(start_exprs) > 1: + from pymbolic.primitives import Max + start_expr = Max(start_exprs) + elif len(start_exprs) == 1: + start_expr, = start_exprs + else: + raise RuntimeError("no starting value found for 'for' loop in '%s'" + % iname) + + from cgen import For + from loopy.codegen import wrap_in + return wrap_in(For, + "int %s = %s" % (iname, ccm(start_expr)), + " && ".join(end_conds), + "++%s" % iname, + stmt) # }}} diff --git a/loopy/codegen/dispatch.py b/loopy/codegen/dispatch.py index f13b501a5171a63bcec91cea9dbb50d2b28b596a..e1f688a051fd3c1456813f05965e6c84fe34191f 100644 --- a/loopy/codegen/dispatch.py +++ b/loopy/codegen/dispatch.py @@ -6,10 +6,10 @@ from loopy.codegen import ExecutionDomain, gen_code_block -def build_loop_nest(cgs, kernel, sched_index, exec_domain, no_conditional_check=False): +def build_loop_nest(kernel, sched_index, exec_domain, no_conditional_check=False): assert isinstance(exec_domain, ExecutionDomain) - ccm = cgs.c_code_mapper + ccm = exec_domain.c_code_mapper from cgen import (POD, Initializer, Assign, Statement as S, Line) @@ -37,7 +37,7 @@ def build_loop_nest(cgs, kernel, sched_index, exec_domain, no_conditional_check= exec_domain = exec_domain.intersect(exec_domain_restriction) - inner = build_loop_nest(cgs, kernel, sched_index, exec_domain, + inner = build_loop_nest(kernel, sched_index, exec_domain, no_conditional_check=True) from loopy.codegen import wrap_in_if @@ -57,30 +57,21 @@ def build_loop_nest(cgs, kernel, sched_index, exec_domain, no_conditional_check= valid_index_vars = get_valid_check_vars(kernel, sched_index, allow_ilp=True) bounds_check_lists = [ - generate_bounds_checks_code(ccm, kernel.domain, - valid_index_vars, impl_domain) - for assignments, impl_domain in - exec_domain] + generate_bounds_checks_code(subd.c_code_mapper, kernel.domain, + valid_index_vars, subd.implemented_domain) + for subd in exec_domain.subdomains] result = [] for lvalue, expr in kernel.instructions: - for i, (assignments, impl_domain) in \ - enumerate(exec_domain): - - my_block = [] - if assignments: - my_block.extend(assignments+[Line()]) - + for i, subd in enumerate(exec_domain.subdomains): assert isinstance(lvalue, Subscript) name = lvalue.aggregate.name from loopy.codegen import wrap_in_if - my_block.append( - wrap_in_if( + result.append(wrap_in_if( bounds_check_lists[i], S("tmp_%s_%d += %s" - % (name, i, ccm(expr))))) - result.append(gen_code_block(my_block)) + % (name, i, subd.c_code_mapper(expr))))) return gen_code_block(result) @@ -109,42 +100,36 @@ def build_loop_nest(cgs, kernel, sched_index, exec_domain, no_conditional_check= else: func = generate_sequential_loop_dim_code - return func(cgs, kernel, sched_index, exec_domain) + return func(kernel, sched_index, exec_domain) elif isinstance(sched_item, WriteOutput): result = ( [Initializer(POD(kernel.arg_dict[lvalue.aggregate.name].dtype, "tmp_%s_%d" % (lvalue.aggregate.name, i)), 0) - for i in range(len(exec_domain)) + for i in range(len(exec_domain.subdomains)) for lvalue, expr in kernel.instructions] +[Line()] - +[build_loop_nest(cgs, kernel, sched_index+1, + +[build_loop_nest(kernel, sched_index+1, exec_domain)] +[Line()]) - for i, (idx_assignments, impl_domain) in \ - enumerate(exec_domain): + for i, subd in enumerate(exec_domain.subdomains): for lvalue, expr in kernel.instructions: - assignment = Assign(ccm(lvalue), "tmp_%s_%d" % ( + assignment = Assign(subd.c_code_mapper(lvalue), "tmp_%s_%d" % ( lvalue.aggregate.name, i)) wrapped_assign = wrap_in_bounds_checks( - ccm, kernel.domain, + subd.c_code_mapper, kernel.domain, get_valid_check_vars(kernel, sched_index, allow_ilp=True), - impl_domain, assignment) - - cb = [] - if idx_assignments: - cb.extend(idx_assignments+[Line()]) - cb.append(wrapped_assign) + subd.implemented_domain, assignment) - result.append(gen_code_block(cb)) + result.append(wrapped_assign) return gen_code_block(result) elif isinstance(sched_item, LocalMemoryPrefetch): from loopy.codegen.prefetch import generate_prefetch_code - return generate_prefetch_code(cgs, kernel, sched_index, + return generate_prefetch_code(kernel, sched_index, exec_domain) elif isinstance(sched_item, RegisterPrefetch): @@ -159,7 +144,7 @@ def build_loop_nest(cgs, kernel, sched_index, exec_domain, no_conditional_check= % (agg_name, ccm(sched_item.subscript_expr.index)))), - build_loop_nest(cgs, kernel, sched_index+1, exec_domain) + build_loop_nest(kernel, sched_index+1, exec_domain) ]) else: diff --git a/loopy/codegen/loop_dim.py b/loopy/codegen/loop_dim.py index 7c799a84c3b5026ac12a4f5d6e4be5353aa1cf65..888682e5aa76169879c9bc0e1ea1d27b089eac27 100644 --- a/loopy/codegen/loop_dim.py +++ b/loopy/codegen/loop_dim.py @@ -32,86 +32,46 @@ def get_simple_loop_bounds(kernel, sched_index, iname, implemented_domain): # {{{ conditional-minimizing slab decomposition -def get_slab_decomposition(cgs, kernel, sched_index, exec_domain): +def get_slab_decomposition(kernel, sched_index, exec_domain): from loopy.isl import block_shift_constraint, negate_constraint - ccm = cgs.c_code_mapper + ccm = exec_domain.c_code_mapper space = kernel.space iname = kernel.schedule[sched_index].iname tag = kernel.iname_to_tag.get(iname) - # {{{ attempt slab partition to reduce conditional count - lb_cns_orig, ub_cns_orig = get_simple_loop_bounds(kernel, sched_index, iname, exec_domain.implemented_domain) - # jostle the constant in {lb,ub}_cns to see if we can get - # fewer conditionals in the bulk middle segment - - class TrialRecord(Record): - pass - - if (cgs.try_slab_partition - and "outer" in iname): - trial_cgs = cgs.copy(try_slab_partition=False) - trials = [] - - for lower_incr, upper_incr in [ (0,0), (0,-1), ]: - - lb_cns = block_shift_constraint(lb_cns_orig, iname, -lower_incr) - ub_cns = block_shift_constraint(ub_cns_orig, iname, -upper_incr) - - bulk_slab = (isl.Set.universe(kernel.space) - .add_constraint(lb_cns) - .add_constraint(ub_cns)) - bulk_exec_domain = exec_domain.intersect(bulk_slab) - inner = build_loop_nest(trial_cgs, kernel, sched_index+1, - bulk_exec_domain) - - trials.append((TrialRecord( - lower_incr=lower_incr, - upper_incr=upper_incr, - bulk_slab=bulk_slab), - (inner.num_conditionals, - # when all num_conditionals are equal, choose the - # one with the smallest bounds changes - abs(upper_incr)+abs(lower_incr)))) - - from pytools import argmin2 - chosen = argmin2(trials) - else: - bulk_slab = (isl.Set.universe(kernel.space) - .add_constraint(lb_cns_orig) - .add_constraint(ub_cns_orig)) - chosen = TrialRecord( - lower_incr=0, - upper_incr=0, - bulk_slab=bulk_slab) - - # }}} + lower_incr, upper_incr = kernel.iname_slab_increments.get(iname, (0, 0)) # {{{ build slabs slabs = [] - if chosen.lower_incr: + if lower_incr: slabs.append(("initial", isl.Set.universe(kernel.space) .add_constraint(lb_cns_orig) .add_constraint(ub_cns_orig) .add_constraint( negate_constraint( block_shift_constraint( - lb_cns_orig, iname, -chosen.lower_incr))))) + lb_cns_orig, iname, -lower_incr))))) - slabs.append(("bulk", chosen.bulk_slab)) + slabs.append(("bulk", + (isl.Set.universe(kernel.space) + .add_constraint( + block_shift_constraint(lb_cns_orig, iname, -lower_incr)) + .add_constraint( + block_shift_constraint(ub_cns_orig, iname, -upper_incr))))) - if chosen.upper_incr: + if upper_incr: slabs.append(("final", isl.Set.universe(kernel.space) .add_constraint(ub_cns_orig) .add_constraint(lb_cns_orig) .add_constraint( negate_constraint( block_shift_constraint( - ub_cns_orig, iname, -chosen.upper_incr))))) + ub_cns_orig, iname, -upper_incr))))) # }}} @@ -121,13 +81,13 @@ def get_slab_decomposition(cgs, kernel, sched_index, exec_domain): # {{{ unrolled/ILP loops -def generate_unroll_or_ilp_code(cgs, kernel, sched_index, exec_domain): +def generate_unroll_or_ilp_code(kernel, sched_index, exec_domain): from loopy.isl import block_shift_constraint from loopy.codegen.bounds import solve_constraint_for_bound from cgen import (POD, Assign, Line, Statement as S, Initializer, Const) - ccm = cgs.c_code_mapper + ccm = exec_domain.c_code_mapper space = kernel.space iname = kernel.schedule[sched_index].iname tag = kernel.iname_to_tag.get(iname) @@ -162,8 +122,7 @@ def generate_unroll_or_ilp_code(cgs, kernel, sched_index, exec_domain): for i, slab in generate_idx_eq_slabs(): new_exec_domain = exec_domain.intersect(slab) - inner = build_loop_nest(cgs, kernel, sched_index+1, - new_exec_domain) + inner = build_loop_nest(kernel, sched_index+1, new_exec_domain) if isinstance(tag, TAG_UNROLL_STATIC): result.extend([ @@ -175,24 +134,25 @@ def generate_unroll_or_ilp_code(cgs, kernel, sched_index, exec_domain): return gen_code_block(result) elif isinstance(tag, TAG_ILP): - new_aaid = [] - for assignments, implemented_domain in exec_domain: + new_subdomains = [] + for subd in exec_domain.subdomains: for i, single_slab in generate_idx_eq_slabs(): - assignments = assignments + [ - Initializer(Const(POD(np.int32, iname)), ccm(lower_bound+i))] - new_aaid.append((assignments, - implemented_domain.intersect(single_slab))) - - assignments = [] + from loopy.codegen import ExecutionSubdomain + new_subdomains.append( + ExecutionSubdomain( + subd.implemented_domain.intersect(single_slab), + subd.c_code_mapper.copy_and_assign( + iname, lower_bound+i))) overall_slab = (isl.Set.universe(kernel.space) .add_constraint(lower_cns) .add_constraint(upper_cns)) - return build_loop_nest(cgs, kernel, sched_index+1, + return build_loop_nest(kernel, sched_index+1, ExecutionDomain( exec_domain.implemented_domain.intersect(overall_slab), - new_aaid)) + exec_domain.c_code_mapper, + new_subdomains)) else: assert False, "not supposed to get here" @@ -200,16 +160,16 @@ def generate_unroll_or_ilp_code(cgs, kernel, sched_index, exec_domain): # {{{ parallel loop -def generate_parallel_loop_dim_code(cgs, kernel, sched_index, exec_domain): +def generate_parallel_loop_dim_code(kernel, sched_index, exec_domain): from loopy.isl import make_slab - ccm = cgs.c_code_mapper + ccm = exec_domain.c_code_mapper space = kernel.space iname = kernel.schedule[sched_index].iname tag = kernel.iname_to_tag.get(iname) lb_cns_orig, ub_cns_orig, slabs = get_slab_decomposition( - cgs, kernel, sched_index, exec_domain) + kernel, sched_index, exec_domain) # For a parallel loop dimension, the global loop bounds are # automatically obeyed--simply because no work items are launched @@ -242,8 +202,7 @@ def generate_parallel_loop_dim_code(cgs, kernel, sched_index, exec_domain): domain=kernel.domain.intersect(slab)) result.append( add_comment(cmt, - build_loop_nest(cgs, new_kernel, sched_index+1, - exec_domain))) + build_loop_nest(new_kernel, sched_index+1, exec_domain))) from loopy.codegen import gen_code_block return gen_code_block(result, is_alternatives=True) @@ -252,15 +211,15 @@ def generate_parallel_loop_dim_code(cgs, kernel, sched_index, exec_domain): # {{{ sequential loop -def generate_sequential_loop_dim_code(cgs, kernel, sched_index, exec_domain): +def generate_sequential_loop_dim_code(kernel, sched_index, exec_domain): - ccm = cgs.c_code_mapper + ccm = exec_domain.c_code_mapper space = kernel.space iname = kernel.schedule[sched_index].iname tag = kernel.iname_to_tag.get(iname) lb_cns_orig, ub_cns_orig, slabs = get_slab_decomposition( - cgs, kernel, sched_index, exec_domain) + kernel, sched_index, exec_domain) result = [] nums_of_conditionals = [] @@ -271,7 +230,7 @@ def generate_sequential_loop_dim_code(cgs, kernel, sched_index, exec_domain): cmt = None new_exec_domain = exec_domain.intersect(slab) - inner = build_loop_nest(cgs, kernel, sched_index+1, + inner = build_loop_nest(kernel, sched_index+1, new_exec_domain) from loopy.codegen.bounds import wrap_in_for_from_constraints diff --git a/loopy/codegen/prefetch.py b/loopy/codegen/prefetch.py index 22af66d33b52e22d4d877440a60eb3af2619da59..155927e4b3be4ae1ae6913fd574773e0209c7374 100644 --- a/loopy/codegen/prefetch.py +++ b/loopy/codegen/prefetch.py @@ -105,24 +105,21 @@ def make_fetch_loop_nest(flnd, fetch_dim_idx, pf_dim_exprs, iname_subst_map, # done, return from pymbolic.primitives import Variable, Subscript - from pymbolic.mapper.stringifier import PREC_NONE result = Assign( pf.name + "".join("[%s]" % ccm(dexpr) for dexpr in pf_dim_exprs), no_pf_ccm( Subscript( Variable(pf.input_vector), - substitute(pf.index_expr, iname_subst_map)), - PREC_NONE)) - - def my_ccm(expr): - return ccm(substitute(expr, iname_subst_map)) + substitute(pf.index_expr, iname_subst_map)))) from pymbolic.mapper.dependency import DependencyMapper check_vars = [v.name for v in DependencyMapper()(pf.index_expr)] from loopy.codegen.bounds import wrap_in_bounds_checks - return wrap_in_bounds_checks(my_ccm, pf.kernel.domain, + return wrap_in_bounds_checks( + ccm.copy_and_assign_many(iname_subst_map), + pf.kernel.domain, check_vars, implemented_domain, result) fetch_inames = pf.fetch_dims[fetch_dim_idx] @@ -280,12 +277,12 @@ def make_fetch_loop_nest(flnd, fetch_dim_idx, pf_dim_exprs, iname_subst_map, # }}} -def generate_prefetch_code(cgs, kernel, sched_index, exec_domain): +def generate_prefetch_code(kernel, sched_index, exec_domain): implemented_domain = exec_domain.implemented_domain from cgen import Statement as S, Line, Comment - ccm = cgs.c_code_mapper + ccm = exec_domain.c_code_mapper scheduled_pf = kernel.schedule[sched_index] pf = kernel.prefetch[ @@ -482,7 +479,7 @@ def generate_prefetch_code(cgs, kernel, sched_index, exec_domain): from loopy.codegen.dispatch import build_loop_nest new_block.extend([Line(), - build_loop_nest(cgs, kernel, sched_index+1, exec_domain)]) + build_loop_nest(kernel, sched_index+1, exec_domain)]) return gen_code_block(new_block) diff --git a/loopy/compiled.py b/loopy/compiled.py index bd72ecdaf2894bf61752181aa7f22dfc80e16077..a2b4210d6105f4c458f19766f8677b887619edb8 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -44,7 +44,7 @@ class CompiledKernel: from pymbolic import compile if size_args is None: - self.size_args = [arg.name for arg in kernel.args if isinstance(arg, ScalarArg)] + self.size_args = kernel.scalar_loop_args else: self.size_args = size_args diff --git a/loopy/kernel.py b/loopy/kernel.py index f1c8638382d0b7eab5ada3c3781f00f786b81197..30aa1c515addeb4b01cf075a0a19f2efc67b76a3 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -6,6 +6,8 @@ import numpy as np from pytools import Record, memoize_method from pymbolic.mapper.dependency import DependencyMapper import pyopencl as cl +import islpy as isl +from islpy import dim_type @@ -32,9 +34,17 @@ class ArrayArg: raise ValueError("can only specify one of shape and strides") if strides is not None: + if isinstance(strides, str): + from pymbolic import parse + strides = parse(strides) + strides = tuple(strides) if shape is not None: + if isinstance(shape, str): + from pymbolic import parse + shape = parse(shape) + from pyopencl.compyte.array import ( f_contiguous_strides, c_contiguous_strides) @@ -67,7 +77,7 @@ class ImageArg: class ScalarArg: - def __init__(self, name, dtype, approximately): + def __init__(self, name, dtype, approximately=None): self.name = name self.dtype = np.dtype(dtype) self.approximately = approximately @@ -176,34 +186,57 @@ class LoopKernel(Record): :ivar register_prefetch: :ivar name: :ivar preamble: + :ivar assumptions: the initial implemented_domain, captures assumptions + on the parameters. (an isl.Set) + :ivar iname_slab_increments: a dictionary mapping inames to (lower_incr, + upper_incr) tuples that will be separated out in the execution to generate + 'bulk' slabs with fewer conditionals. """ def __init__(self, device, domain, instructions, args=None, prefetch={}, schedule=None, register_prefetch=None, name="loopy_kernel", iname_to_tag={}, is_divisible=lambda dividend, divisor: False, - preamble=None): + preamble=None, assumptions=None, + iname_slab_increments={}): """ :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl. Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}" :arg iname_to_tag: a map from loop domain variables to subclasses of :class:`IndexTag` """ - from pymbolic import parse - def parse_if_necessary(v): - if isinstance(v, str): - return parse(v) - else: - return v + def parse_if_necessary(insn): + from pymbolic import parse + if isinstance(insn, str): + lhs, rhs = insn.split("=") + elif isinstance(insn, tuple): + lhs, rhs = insn + + if isinstance(lhs, str): + lhs = parse(lhs) + if isinstance(rhs, str): + from loopy.symbolic import CSERealizer + rhs = CSERealizer()(parse(rhs)) + + return lhs, rhs if isinstance(domain, str): - import islpy as isl ctx = isl.Context() domain = isl.Set.read_from_str(ctx, domain, nparam=-1) - insns = [ - (parse_if_necessary(lvalue), - parse_if_necessary(expr)) - for lvalue, expr in instructions] + insns = [parse_if_necessary(insn) for insn in instructions] + + if assumptions is None: + assumptions = isl.Set.universe(domain.get_dim()) + elif isinstance(assumptions, str): + s = domain.get_dim() + assumptions = isl.BasicSet.read_from_str(domain.get_ctx(), + "[%s] -> {[%s]: %s}" + % (",".join(s.get_name(dim_type.param, i) + for i in range(s.size(dim_type.param))), + ",".join(s.get_name(dim_type.set, i) + for i in range(s.size(dim_type.set))), + assumptions), + nparam=-1) Record.__init__(self, device=device, args=args, domain=domain, instructions=insns, @@ -213,7 +246,9 @@ class LoopKernel(Record): (iname, parse_tag(tag)) for iname, tag in iname_to_tag.iteritems()), is_divisible=is_divisible, - preamble=preamble) + preamble=preamble, + assumptions=assumptions, + iname_slab_increments=iname_slab_increments) # FIXME: assert empty intersection of loop vars and args # FIXME: assert that isl knows about loop parameters @@ -236,12 +271,16 @@ class LoopKernel(Record): def arg_dict(self): return dict((arg.name, arg) for arg in self.args) + @property @memoize_method - def scalar_args(self): + def scalar_loop_args(self): if self.args is None: - return set() + return [] else: - return set(arg.name for arg in self.args if isinstance(arg, ScalarArg)) + loop_arg_names = [self.space.get_name(dim_type.param, i) + for i in range(self.space.size(dim_type.param))] + return [arg.name for arg in self.args if isinstance(arg, ScalarArg) + if arg.name in loop_arg_names] @memoize_method def all_inames(self): @@ -332,7 +371,7 @@ class LoopKernel(Record): input_vectors.update( set(v.name for v in dm(expr))) - return input_vectors - self.all_inames() - self.scalar_args() + return input_vectors - self.all_inames() - set(self.scalar_loop_args) @memoize_method def output_vectors(self): @@ -343,7 +382,7 @@ class LoopKernel(Record): output_vectors.update( set(v.name for v in dm(lvalue))) - return output_vectors - self.all_inames() - self.scalar_args() + return output_vectors - self.all_inames() - self.scalar_loop_args def _subst_insns(self, old_var, new_expr): from pymbolic.mapper.substitutor import substitute @@ -393,7 +432,8 @@ class LoopKernel(Record): def split_dimension(self, name, inner_length, padded_length=None, outer_name=None, inner_name=None, - outer_tag=None, inner_tag=None): + outer_tag=None, inner_tag=None, + outer_slab_increments=(0, -1)): outer_tag = parse_tag(outer_tag) inner_tag = parse_tag(inner_tag) @@ -434,33 +474,41 @@ class LoopKernel(Record): from islpy import dim_type outer_var_nr = self.space.size(dim_type.set) inner_var_nr = self.space.size(dim_type.set)+1 - new_domain = self.domain.add_dims(dim_type.set, 2) - new_domain.set_dim_name(dim_type.set, outer_var_nr, outer_name) - new_domain.set_dim_name(dim_type.set, inner_var_nr, inner_name) - import islpy as isl - from loopy.isl import make_slab + def process_set(s): + s = s.add_dims(dim_type.set, 2) + s.set_dim_name(dim_type.set, outer_var_nr, outer_name) + s.set_dim_name(dim_type.set, inner_var_nr, inner_name) + + from loopy.isl import make_slab + + space = s.get_dim() + inner_constraint_set = ( + make_slab(space, inner_name, 0, inner_length) + # name = inner + length*outer + .add_constraint(isl.Constraint.eq_from_names( + space, {name:1, inner_name: -1, outer_name:-inner_length}))) - space = new_domain.get_dim() - inner_constraint_set = ( - make_slab(space, inner_name, 0, inner_length) - # name = inner + length*outer - .add_constraint(isl.Constraint.eq_from_names( - space, {name:1, inner_name: -1, outer_name:-inner_length}))) + name_dim_type, name_idx = space.get_var_dict()[name] + return (s + .intersect(inner_constraint_set) + .project_out(name_dim_type, name_idx, 1)) - name_dim_type, name_idx = space.get_var_dict()[name] - new_domain = (new_domain - .intersect(inner_constraint_set) - .project_out(name_dim_type, name_idx, 1)) + new_domain = process_set(self.domain) + new_assumptions = process_set(self.assumptions) from pymbolic import var inner = var(inner_name) outer = var(outer_name) new_loop_index = inner + outer*inner_length + iname_slab_increments = self.iname_slab_increments.copy() + iname_slab_increments[outer_name] = outer_slab_increments return (self .substitute(name, new_loop_index) - .copy(domain=new_domain, iname_to_tag=new_iname_to_tag)) + .copy(domain=new_domain, iname_to_tag=new_iname_to_tag, + assumptions=new_assumptions, + iname_slab_increments=iname_slab_increments)) def get_problems(self, parameters, emit_warnings=True): """ diff --git a/loopy/prefetch.py b/loopy/prefetch.py index 49330236cad088a428b5e2837512daf4d98481b4..98aeed94806fdf73b18dd1e20d5ab81c83bbca8e 100644 --- a/loopy/prefetch.py +++ b/loopy/prefetch.py @@ -9,7 +9,7 @@ from islpy import dim_type # {{{ register prefetches class RegisterPrefetch(Record): - __slots__ = ["subscript_expr", "new_name"] + __slots__ = ["subexprs", "new_names"] def insert_register_prefetches(kernel): reg_pf = {} @@ -54,7 +54,7 @@ def insert_register_prefetches(kernel): reg_pf[iexpr] = new_name schedule.insert(sched_index, RegisterPrefetch( - index_expr=iexpr, new_name=new_name)) + subexprs=[iexpr], new_names=[new_name])) sched_index += 1 else: i += 1 @@ -178,7 +178,7 @@ class LocalMemoryPrefetch(Record): from pymbolic.mapper.dependency import DependencyMapper return set(var.name for var in DependencyMapper()(self.index_expr) - ) - set(self.all_inames()) - self.kernel.scalar_args() + ) - set(self.all_inames()) - set(self.kernel.scalar_loop_args) def hash(self): return (hash(type(self)) ^ hash(self.input_vector) diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 2e2587f1dc184b508e049079b820ee337ca4b37a..b928d4b31bb280d68bf7d822b7e1bde2cd1b352d 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -2,7 +2,7 @@ from __future__ import division -from pymbolic.mapper import CombineMapper, RecursiveMapper +from pymbolic.mapper import CombineMapper, RecursiveMapper, IdentityMapper from pymbolic.mapper.c_code import CCodeMapper from pymbolic.mapper.stringifier import PREC_NONE import numpy as np @@ -120,7 +120,8 @@ class VariableIndexExpressionCollector(CombineMapper): # {{{ C code mapper class LoopyCCodeMapper(CCodeMapper): - def __init__(self, kernel, no_prefetch=False): + def __init__(self, kernel, no_prefetch=False, cse_name_list=[], + var_subst_map={}): def constant_mapper(c): if isinstance(c, float): # FIXME: type-variable @@ -128,11 +129,39 @@ class LoopyCCodeMapper(CCodeMapper): else: return repr(c) - CCodeMapper.__init__(self, constant_mapper=constant_mapper) + CCodeMapper.__init__(self, constant_mapper=constant_mapper, + cse_name_list=cse_name_list) self.kernel = kernel + self.var_subst_map = var_subst_map.copy() + self.no_prefetch = no_prefetch + def copy(self, var_subst_map=None, cse_name_list=None): + if var_subst_map is None: + var_subst_map = self.var_subst_map + if cse_name_list is None: + cse_name_list = self.cse_name_list + return LoopyCCodeMapper(self.kernel, no_prefetch=self.no_prefetch, + cse_name_list=cse_name_list, var_subst_map=var_subst_map) + + def copy_and_assign(self, name, value): + var_subst_map = self.var_subst_map.copy() + var_subst_map[name] = value + return self.copy(var_subst_map=var_subst_map) + + def copy_and_assign_many(self, assignments): + var_subst_map = self.var_subst_map.copy() + var_subst_map.update(assignments) + return self.copy(var_subst_map=var_subst_map) + + def map_variable(self, expr, prec): + if expr.name in self.var_subst_map: + return " /* %s */ %s" % ( + expr.name, self.rec(self.var_subst_map[expr.name], prec)) + else: + return CCodeMapper.map_variable(self, expr, prec) + def map_subscript(self, expr, enclosing_prec): from pymbolic.primitives import Variable if (not self.no_prefetch @@ -143,11 +172,11 @@ class LoopyCCodeMapper(CCodeMapper): except KeyError: pass else: - from pymbolic.mapper.stringifier import PREC_SUM + from pymbolic import var return pf.name+"".join( - "[%s - %s]" % (iname, self.rec( - pf.dim_bounds_by_iname[iname][0], - PREC_SUM)) + "[%s]" % self.rec( + var(iname) - pf.dim_bounds_by_iname[iname][0], + PREC_NONE) for iname in pf.all_inames()) if isinstance(expr.aggregate, Variable): @@ -252,6 +281,26 @@ def constraint_to_expr(cns, except_name=None): # }}} +# {{{ CSE "realizer" + +class CSERealizer(IdentityMapper): + """Looks for invocations of a function called 'cse' and turns those into + CommonSubexpression objects. + """ + + def map_call(self, expr): + from pymbolic import Variable + if isinstance(expr.function, Variable) and expr.function.name == "cse": + from pymbolic.primitives import CommonSubexpression + if len(expr.parameters) == 1: + return CommonSubexpression(expr.parameters[0]) + else: + raise TypeError("cse takes exactly one argument") + else: + return IdentityMapper.map_call(self, expr) + +# }}} + diff --git a/test/test_matmul.py b/test/test_matmul.py index 7aa72e646ac2aa800e390ee32245e103b25d24e4..c512640c42c83ebaac34166154afab461c30912f 100644 --- a/test/test_matmul.py +++ b/test/test_matmul.py @@ -2,6 +2,7 @@ import numpy as np import numpy.linalg as la import pyopencl as cl import pyopencl.array as cl_array +import pyopencl.clrandom as cl_random import loopy as lp from pyopencl.tools import pytest_generate_tests_for_pyopencl \ @@ -47,7 +48,12 @@ def check_error(refsol, sol): if not DO_CHECK: return - rel_err = la.norm(refsol-sol, "fro")/la.norm(refsol, "fro") + if sol.shape == 2: + norm_order = "fro" + else: + norm_order = 2 + + rel_err = la.norm(refsol-sol, norm_order)/la.norm(refsol, norm_order) if rel_err > 1e-5 or np.isinf(rel_err) or np.isnan(rel_err): if 1: import matplotlib.pyplot as pt @@ -86,22 +92,20 @@ def test_axpy(ctx_factory): queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) - n = get_suitable_size(ctx) - from pymbolic import var - x, y, z, n_sym, i = [var(s) for s in "xyzni"] - - n_approx = 10**6 + n = get_suitable_size(ctx)**3 knl = lp.LoopKernel(ctx.devices[0], "[n] -> {[i]: 0<=i<n}", [ - (z[i], x[i]+y[i]) # FIXME: Add scalars + "z[i] = a*x[i]+b*y[i]" ], [ - lp.ArrayArg("x", dtype, shape=(n,)), - lp.ArrayArg("y", dtype, shape=(n,)), - lp.ArrayArg("z", dtype, shape=(n,)), - lp.ScalarArg("n", np.int32, approximately=n_approx), + lp.ScalarArg("a", dtype), + lp.ArrayArg("x", dtype, shape="n,"), + lp.ScalarArg("b", dtype), + lp.ArrayArg("y", dtype, shape="n,"), + lp.ArrayArg("z", dtype, shape="n,"), + lp.ScalarArg("n", np.int32, approximately=n), ], name="matmul") @@ -109,27 +113,26 @@ def test_axpy(ctx_factory): block_size = 256 knl = lp.split_dimension(knl, "i", unroll*block_size, outer_tag="g.0") knl = lp.split_dimension(knl, "i_inner", block_size, outer_tag="unr", inner_tag="l.0") - assert knl.get_problems({"n": n_approx})[0] <= 2 + assert knl.get_problems({"n": n})[0] <= 2 kernel_gen = (lp.insert_register_prefetches(knl) for knl in lp.generate_loop_schedules(knl)) - #a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) - #b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) - #c = cl_array.empty_like(a) - #refsol = np.dot(a.get(), b.get()) + a = cl_random.rand(queue, n, dtype=dtype) + b = cl_random.rand(queue, n, dtype=dtype) + c = cl_array.empty_like(a) + refsol = (2*a+3*b).get() def launcher(kernel, gsize, lsize, check): - #evt = kernel(queue, gsize(), lsize(), a.data, b.data, c.data, - #g_times_l=True) + evt = kernel(queue, gsize(n), lsize(n), 2, a.data, 3, b.data, c.data, n, + g_times_l=True) - #if check: - #check_error(refsol, c.get()) + if check: + check_error(refsol, c.get()) - #return evt - pass + return evt - lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) + lp.drive_timing_run(kernel_gen, queue, launcher, 5*n) @@ -143,13 +146,11 @@ def test_plain_matrix_mul(ctx_factory): properties=cl.command_queue_properties.PROFILING_ENABLE) n = get_suitable_size(ctx) - from pymbolic import var - a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"] knl = lp.LoopKernel(ctx.devices[0], "{[i,j,k]: 0<=i,j,k<%d}" % n, [ - (c[i, j], a[i, k]*b[k, j]) + "c[i, j] = a[i, k]*b[k, j]" ], [ lp.ArrayArg("a", dtype, shape=(n, n), order=order), @@ -196,13 +197,11 @@ def test_image_matrix_mul(ctx_factory): properties=cl.command_queue_properties.PROFILING_ENABLE) n = get_suitable_size(ctx) - from pymbolic import var - a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"] knl = lp.LoopKernel(ctx.devices[0], "{[i,j,k]: 0<=i,j,k<%d}" % n, [ - (c[i, j], a[i, k]*b[k, j]) + "c[i, j] = a[i, k]*b[k, j]" ], [ lp.ImageArg("a", dtype, 2), @@ -251,13 +250,11 @@ def test_image_matrix_mul_ilp(ctx_factory): properties=cl.command_queue_properties.PROFILING_ENABLE) n = get_suitable_size(ctx) - from pymbolic import var - a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"] knl = lp.LoopKernel(ctx.devices[0], "{[i,j,k]: 0<=i,j,k<%d}" % n, [ - (c[i, j], a[i, k]*b[k, j]) + "c[i, j] = a[i, k]*b[k, j]" ], [ lp.ImageArg("a", dtype, 2), @@ -312,18 +309,16 @@ def test_fancy_matrix_mul(ctx_factory): order = "C" n = get_suitable_size(ctx) - from pymbolic import var - a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"] knl = lp.LoopKernel(ctx.devices[0], "[n] -> {[i,j,k]: 0<=i,j,k<n }", [ - (c[i, j], a[i, k]*b[k, j]) + "c[i, j] = a[i, k]*b[k, j]" ], [ - lp.ArrayArg("a", dtype, shape=(n_sym, n_sym), order=order), - lp.ArrayArg("b", dtype, shape=(n_sym, n_sym), order=order), - lp.ArrayArg("c", dtype, shape=(n_sym, n_sym), order=order), + lp.ArrayArg("a", dtype, shape="(n, n)", order=order), + lp.ArrayArg("b", dtype, shape="(n, n)", order=order), + lp.ArrayArg("c", dtype, shape="(n, n)", order=order), lp.ScalarArg("n", np.int32, approximately=1000), ], name="fancy_matmul")