diff --git a/examples/matrix-ops.py b/examples/matrix-ops.py index a8cd27539213d727af5eb3d3521613fa14a68303..fb585d58271a313292a1022300cd6ba178c7da2a 100644 --- a/examples/matrix-ops.py +++ b/examples/matrix-ops.py @@ -10,21 +10,46 @@ import loopy as lp FAST_OPTIONS = ["-cl-mad-enable", "-cl-fast-relaxed-math", "-cl-no-signed-zeros", "-cl-strict-aliasing"] -def make_well_conditioned_dev_matrix(queue, n, dtype=np.float32, order="C"): - return cl_array.to_device(queue, - np.asarray(np.random.randn(n, n) + 5*np.eye(n), - dtype=dtype, order=order)) +def make_well_conditioned_dev_matrix(queue, n, dtype=np.float32, order="C", ran_factor=1, od=0): + ary = np.asarray( + ran_factor*np.random.randn(n, n) + + 5*np.eye(n, k=od), + dtype=dtype, order=order) + + return cl_array.to_device(queue, ary) + + + + +DO_CHECK = True + +DEBUG_PREAMBLE = r""" + #pragma OPENCL EXTENSION cl_amd_printf: enable + #define IFDIAG if (i_outer*16+i_inner == j_outer*16+j_inner) + #define TST(S) IFDIAG if (i_outer*16+i_inner == 0) \ + printf("ko=%d ki=%d" #S "\n", k_outer, k_inner); + """ def check_error(refsol, sol): rel_err = la.norm(refsol-sol, "fro")/la.norm(refsol, "fro") - if rel_err > 1e-5: - import matplotlib.pyplot as pt - pt.imshow(refsol-sol) - pt.colorbar() - pt.show() + if DO_CHECK and rel_err > 1e-5: + if 0: + import matplotlib.pyplot as pt + pt.imshow(refsol-sol) + pt.colorbar() + pt.show() + elif 0: + print "---------------------------" + print "ACTUAL" + print "---------------------------" + print sol + print "---------------------------" + print "CORRECT" + print "---------------------------" + print refsol raise RuntimeError("check failed, rel err=%g" % rel_err) @@ -102,15 +127,21 @@ def image_matrix_mul(ctx_factory=cl.create_some_context): [ lp.ImageArg("a", dtype, 2), lp.ImageArg("b", dtype, 2), + #lp.ArrayArg("a", dtype, shape=(n, n), order=order), + #lp.ArrayArg("b", dtype, shape=(n, n), order=order), lp.ArrayArg("c", dtype, shape=(n, n), order=order), ], name="matmul") knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") - knl = lp.split_dimension(knl, "k", 16) + knl = lp.split_dimension(knl, "k", 32) + # slow + #knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) + #knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"]) + # fast knl = lp.add_prefetch(knl, 'a', ["k_inner", "i_inner"]) - knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner", ]) + knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner", ]) assert knl.get_invalid_reason() is None kernel_gen = (lp.insert_register_prefetches(knl) @@ -147,7 +178,7 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context): order = "F" - n = 16*100 + n = 16*10 from pymbolic import var a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"] @@ -165,7 +196,7 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context): knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") - knl = lp.split_dimension(knl, "k", 19) + knl = lp.split_dimension(knl, "k", 16) knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"]) assert knl.get_invalid_reason() is None @@ -173,8 +204,10 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context): kernel_gen = (lp.insert_register_prefetches(knl) for knl in lp.generate_loop_schedules(knl)) - a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) - b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order) + a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order, + ran_factor=0) + b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order, + ran_factor=0) c = cl_array.empty_like(a) refsol = np.dot(a.get(), b.get()) @@ -276,4 +309,4 @@ if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) else: - image_matrix_mul() + fancy_matrix_mul() diff --git a/loopy/__init__.py b/loopy/__init__.py index 47df9b0d00f2cebb6d9ffcf05085ce33c7f87ba4..0fc63dcfc0854b01a674c18de9ea40decaeb7f83 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -22,11 +22,9 @@ register_mpz_with_pymbolic() -# TODO: Wrong 19 -# TODO: Restrict on/off # TODO: Try, fix reg. prefetch -# TODO: 1D local arrays # TODO: Divisibility +# TODO: Reasoning about bank conflicts # TODO: Functions # TODO: Common subexpressions # TODO: Try different kernels @@ -204,29 +202,31 @@ def get_projected_bounds_constraints(set, iname): -def solve_constraint_for_equality_bound(cns, iname): +def solve_constraint_for_bound(cns, iname): rhs, iname_coeff = constraint_to_expr(cns, except_name=iname) if iname_coeff == 0: - raise RuntimeError("cannot solve for bound in constraint " - "not containing variable") - - assert iname_coeff > 0 or cns.is_equality() + raise ValueError("cannot solve constraint for '%s'--" + "constraint does not contain variable" + % iname) from pymbolic import expand - return expand(-rhs)//iname_coeff - -def solve_constraint_for_upper_bound(cns, iname): - rhs, iname_coeff = constraint_to_expr(cns, except_name=iname) + from pytools import div_ceil + from pymbolic import flatten + from pymbolic.mapper.constant_folder import CommutativeConstantFoldingMapper + cfm = CommutativeConstantFoldingMapper() - if iname_coeff == 0: - raise RuntimeError("cannot solve for bound in constraint " - "not containing variable") + if iname_coeff > 0 or cns.is_equality(): + if cns.is_equality(): + kind = "==" + else: + kind = ">=" - assert iname_coeff < 0 + return kind, cfm(flatten(div_ceil(expand(-rhs), iname_coeff))) + else: # iname_coeff < 0 + from pytools import div_ceil + return "<", cfm(flatten(div_ceil(rhs+1, -iname_coeff))) - from pytools import div_ceil - return div_ceil(rhs+1, -iname_coeff) def get_projected_bounds(set, iname): """Get an overapproximation of the loop bounds for the variable *iname*, @@ -235,21 +235,20 @@ def get_projected_bounds(set, iname): lb_cns, ub_cns = get_projected_bounds_constraints(set, iname) - from pymbolic.mapper.constant_folder import CommutativeConstantFoldingMapper - from pymbolic import flatten - cfm = CommutativeConstantFoldingMapper() - for cns in [lb_cns, ub_cns]: - rhs, iname_coeff = constraint_to_expr(cns, except_name=iname) + iname_tp, iname_idx = lb_cns.get_dim().get_var_dict()[iname] + iname_coeff = cns.get_coefficient(iname_tp, iname_idx) if iname_coeff == 0: - return - elif iname_coeff < 0: - from pytools import div_ceil - ub = cfm(flatten(div_ceil(rhs+1, -iname_coeff))) - else: # iname_coeff > 0 - from pymbolic import expand - lb = cfm(flatten(expand(-rhs)//iname_coeff)) + continue + + kind, bound = solve_constraint_for_bound(cns, iname) + if kind == "<": + ub = bound + elif kind == ">=": + lb = bound + else: + raise ValueError("unsupported constraint kind") return lb, ub @@ -623,10 +622,6 @@ class LoopKernel(Record): def tag_type_lengths(self, tag_cls): return [stop-start for start, stop in self.tag_type_bounds(tag_cls)] - def tag_type_count(self, tag_cls): - from pytools import product - return product(self.tag_type_lengths(tag_cls)) - def tag_or_iname_to_iname(self, s): try: tag = parse_tag(s) @@ -1191,6 +1186,11 @@ class LoopyCCodeMapper(CCodeMapper): return CCodeMapper.map_subscript(self, expr, enclosing_prec) + def map_floor_div(self, expr, prec): + return ("floor_int_div(%s, %s)" + % (self.rec(expr.numerator, PREC_NONE), + self.rec(expr.denominator, PREC_NONE))) + # }}} # {{{ prefetch code generation @@ -1501,8 +1501,11 @@ def generate_loop_dim_code(cgs, kernel, sched_index, lower_cns = cast_constraint_to_space(lower_cns, space) upper_cns = cast_constraint_to_space(upper_cns, space) - lower_bound = solve_constraint_for_equality_bound(lower_cns, iname) - upper_bound = solve_constraint_for_upper_bound(upper_cns, iname) + lower_kind, lower_bound = solve_constraint_for_bound(lower_cns, iname) + upper_kind, upper_bound = solve_constraint_for_bound(upper_cns, iname) + + assert lower_kind == ">=" + assert upper_kind == "<" from pymbolic import flatten from pymbolic.mapper.constant_folder import CommutativeConstantFoldingMapper @@ -1605,7 +1608,6 @@ def generate_loop_dim_code(cgs, kernel, sched_index, block_shift_constraint( ub_cns_orig, iname, -chosen.upper_incr))))) - if isinstance(tag, ImplicitlyParallelTag): # For a parallel loop dimension, the global loop bounds are # automatically obeyed--simply because no work items are launched @@ -1719,7 +1721,7 @@ def wrap_in_for_from_constraints(ccm, iname, constraint_bset, stmt): constraints = constraint_bset.get_constraints() - from pymbolic import flatten, expand + from pymbolic import expand from pymbolic.mapper.constant_folder import CommutativeConstantFoldingMapper cfm = CommutativeConstantFoldingMapper() @@ -1741,12 +1743,15 @@ def wrap_in_for_from_constraints(ccm, iname, constraint_bset, stmt): end_conds.append("%s >= 0" % ccm(cfm(expand(rhs)))) else: # iname_coeff > 0 - start_exprs.append( - ccm(cfm(flatten(expand(-rhs)//iname_coeff)))) + kind, bound = solve_constraint_for_bound(cns, iname) + assert kind == ">=" + start_exprs.append(bound) while len(start_exprs) >= 2: start_exprs.append( - "max(%s, %s)" % (start_exprs.pop(), start_exprs.pop())) + "max(%s, %s)" % ( + ccm(start_exprs.pop()), + ccm(start_exprs.pop()))) start_expr, = start_exprs # there has to be at least one @@ -1937,6 +1942,23 @@ def generate_code(kernel): if kernel.preamble is not None: mod.extend([LiteralLines(kernel.preamble), Line()]) + mod.extend([ + LiteralLines(""" + inline int floor_int_div(int a, int b) + { + if ((a<0) != (b<0)) + { + if (b<0) + return (-a+b+1)/-b; + else + return (a-b+1)/b; + } + else + return a/b; + } + """), + Line()]) + # {{{ symbolic names for group and local indices for what_cls, func in [ @@ -1968,7 +1990,7 @@ def generate_code(kernel): cgs = CodeGenerationState(c_code_mapper=ccm, try_slab_partition=True) gen_code = build_loop_nest(cgs, kernel, 0, isl.Set.universe(kernel.space)) body.extend([Line(), gen_code.ast]) - print "# conditionals: %d" % gen_code.num_conditionals + #print "# conditionals: %d" % gen_code.num_conditionals mod.append( FunctionBody( @@ -1987,29 +2009,6 @@ def generate_code(kernel): # }}} -# {{{ debugging - -def print_kernel_info(knl): - if hasattr(knl, "prefetch"): - print "PREFETCH", knl.local_mem_use() - for pf in knl.prefetch.itervalues(): - print " %s[%s]: %s" % (pf.input_vector, pf.index_expr, pf.dims) - print - - if hasattr(knl, "schedule"): - print "Scheduling: ---------------------" - for sched_item in knl.schedule: - print sched_item - print - - for ld in knl.dims: - print ld - print - for t, e in knl.instructions: - print "%s <- %s" % (t, e) - -# }}} - # {{{ high-level modifiers def split_dimension(knl, *args, **kwargs): @@ -2133,7 +2132,7 @@ class CompiledKernel: # driver ---------------------------------------------------------------------- def drive_timing_run(kernel_generator, queue, launch, flop_count=None, - options=[]): + options=[], print_code=True): def time_run(compiled_knl, warmup_rounds=2, timing_rounds=5): check = True @@ -2162,8 +2161,9 @@ def drive_timing_run(kernel_generator, queue, launch, flop_count=None, print "-----------------------------------------------" print "SOLUTION #%d" % soln_count print "-----------------------------------------------" - print compiled.code - print "-----------------------------------------------" + if print_code: + print compiled.code + print "-----------------------------------------------" elapsed = time_run(compiled)