From dea47d64d394ecd43d52f8cbd7162a0d41048a2e Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Fri, 22 Jul 2011 03:06:54 -0500 Subject: [PATCH] Be smarter about choosing conflict-free lmem layout. --- examples/matrix-ops.py | 20 +++--- loopy/__init__.py | 140 ++++++++++++++++++++++++++--------------- 2 files changed, 99 insertions(+), 61 deletions(-) diff --git a/examples/matrix-ops.py b/examples/matrix-ops.py index fcf0a078b..eb1ffb7bf 100644 --- a/examples/matrix-ops.py +++ b/examples/matrix-ops.py @@ -136,7 +136,7 @@ def image_matrix_mul(ctx_factory=cl.create_some_context): knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") knl = lp.split_dimension(knl, "k", 32) - # slow + # slow, but conflict-free #knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) #knl = lp.add_prefetch(knl, 'b', ["j_inner", "k_inner"]) # fast @@ -164,7 +164,8 @@ def image_matrix_mul(ctx_factory=cl.create_some_context): return evt lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3, - options=FAST_OPTIONS) + options=FAST_OPTIONS + ["-cl-nv-verbose"], + force_rebuild=True) @@ -176,7 +177,7 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context): queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) - order = "F" + order = "C" n = 16*40 from pymbolic import var @@ -188,15 +189,15 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context): (c[i, j], a[i, k]*b[k, j]) ], [ - lp.ArrayArg("a", dtype, shape=(n_sym, n_sym), order="F"), - lp.ArrayArg("b", dtype, shape=(n_sym, n_sym), order="F"), - lp.ArrayArg("c", dtype, shape=(n_sym, n_sym), order="F"), + lp.ArrayArg("a", dtype, shape=(n_sym, n_sym), order=order), + lp.ArrayArg("b", dtype, shape=(n_sym, n_sym), order=order), + lp.ArrayArg("c", dtype, shape=(n_sym, n_sym), order=order), lp.ScalarArg("n", np.int32, approximately=1000), ], name="fancy_matmul") knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") - knl = lp.split_dimension(knl, "k", 19) + knl = lp.split_dimension(knl, "k", 16) knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) knl = lp.add_prefetch(knl, 'b', ["k_inner", "j_inner"]) assert knl.get_invalid_reason() is None @@ -221,7 +222,8 @@ def fancy_matrix_mul(ctx_factory=cl.create_some_context): return evt lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3, - options=FAST_OPTIONS) + options=FAST_OPTIONS + ["-cl-nv-verbose"], + force_rebuild=True) @@ -309,4 +311,4 @@ if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) else: - fancy_matrix_mul() + image_matrix_mul() diff --git a/loopy/__init__.py b/loopy/__init__.py index 9d4276567..78ee835a2 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -5,9 +5,10 @@ from pytools import Record, memoize_method from pymbolic.mapper.dependency import DependencyMapper from pymbolic.mapper.c_code import CCodeMapper from pymbolic.mapper.stringifier import PREC_NONE -from pymbolic.mapper import IdentityMapper, CombineMapper, RecursiveMapper +from pymbolic.mapper import CombineMapper, RecursiveMapper import pyopencl as cl +import pyopencl.characterize as cl_char import islpy as isl from islpy import dim_type @@ -24,7 +25,6 @@ register_mpz_with_pymbolic() # TODO: Try, fix reg. prefetch # TODO: Divisibility -# TODO: Reasoning about bank conflicts # TODO: Functions # TODO: Common subexpressions # TODO: Try different kernels @@ -34,6 +34,7 @@ register_mpz_with_pymbolic() # TODO: User controllable switch for slab opt # TODO: User control over schedule # TODO: Condition hoisting +# TODO: Separate all-bulk from non-bulk kernels. # TODO: Custom reductions per red. axis @@ -143,7 +144,7 @@ def get_bounds_constraints(bset, iname, space=None, admissible_vars=None): iname_coeff = int(cns.get_coefficient(iname_tp, iname_idx)) if admissible_vars is not None: - if not (set(cns.get_coefficients_by_name().iterkeys()) + if not (set(cns.get_coefficients_by_name().iterkeys()) <= admissible_vars): continue @@ -1071,7 +1072,12 @@ def insert_register_prefetches(kernel): # {{{ code generation +# {{{ support code for AST wrapper objects + class GeneratedCode(Record): + """Objects of this type are wrapped around ASTs upon + return from generation calls to collect information about them. + """ __slots__ = ["ast", "num_conditionals"] def gen_code_block(elements): @@ -1117,6 +1123,8 @@ def wrap_with(cls, *args): return GeneratedCode(ast=ast, num_conditionals=num_conditionals) +# }}} + # {{{ C code mapper class LoopyCCodeMapper(CCodeMapper): @@ -1157,8 +1165,8 @@ class LoopyCCodeMapper(CCodeMapper): assert isinstance(expr.index, tuple) base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))" - % (arg.name, arg.dimensions, - ", ".join(self.rec(idx, PREC_NONE) + % (arg.name, arg.dimensions, + ", ".join(self.rec(idx, PREC_NONE) for idx in expr.index[::-1]))) if arg.dtype == np.float32: @@ -1191,11 +1199,11 @@ class LoopyCCodeMapper(CCodeMapper): def map_floor_div(self, expr, prec): if isinstance(expr.denominator, int) and expr.denominator > 0: - return ("int_floor_div_pos_b(%s, %s)" + return ("int_floor_div_pos_b(%s, %s)" % (self.rec(expr.numerator, PREC_NONE), expr.denominator)) else: - return ("int_floor_div(%s, %s)" + return ("int_floor_div(%s, %s)" % (self.rec(expr.numerator, PREC_NONE), self.rec(expr.denominator, PREC_NONE))) @@ -1557,7 +1565,7 @@ def generate_loop_dim_code(cgs, kernel, sched_index, class TrialRecord(Record): pass - if (cgs.try_slab_partition + if (cgs.try_slab_partition and "outer" in iname): trial_cgs = cgs.copy(try_slab_partition=False) trials = [] @@ -1681,7 +1689,7 @@ def get_valid_index_vars(kernel, sched_index, exclude_tags=()): def filter_necessary_constraints(implemented_domain, constraints): space = implemented_domain.get_dim() - return [cns + return [cns for cns in constraints if not implemented_domain.is_subset( isl.Set.universe(space) @@ -1772,7 +1780,7 @@ def wrap_in_for_from_constraints(ccm, iname, constraint_bset, stmt): # }}} -# {{{ codegen top-level dispatch +# {{{ loop nest build top-level dispatch def build_loop_nest(cgs, kernel, sched_index, implemented_domain): ccm = cgs.c_code_mapper @@ -1839,49 +1847,80 @@ def build_loop_nest(cgs, kernel, sched_index, implemented_domain): # }}} -# {{{ main code generation entrypoint - -class CodeGenerationState(Record): - __slots__ = ["c_code_mapper", "try_slab_partition"] - -def generate_code(kernel): - from cgen import (FunctionBody, FunctionDeclaration, - POD, Value, ArrayOf, Module, Block, - Define, Line, Const, LiteralLines, Initializer) - - from cgen.opencl import (CLKernel, CLGlobal, CLRequiredWorkGroupSize, - CLLocal, CLImage) +# {{{ prefetch preprocessing - # {{{ assign names, dim storage lengths to prefetches +def preprocess_prefetch(kernel): + """Assign names, dim storage lengths to prefetches. + """ all_pf_list = kernel.prefetch.values() - all_pf_nbytes = [opf.nbytes for opf in all_pf_list] + new_prefetch_dict = {} + lmem_size = cl_char.usable_local_mem_size(kernel.device) - new_prefetch = {} for i_pf, pf in enumerate(kernel.prefetch.itervalues()): - dim_storage_lengths = [stop-start for start, stop in pf.dim_bounds] - + all_pf_nbytes = [opf.nbytes for opf in all_pf_list] other_pf_sizes = sum(all_pf_nbytes[:i_pf]+all_pf_nbytes[i_pf+1:]) + shape = [stop-start for start, stop in pf.dim_bounds] + dim_storage_lengths = shape[:] + # sizes of all dims except the last one, which we may change # below to avoid bank conflicts from pytools import product other_dim_sizes = (pf.itemsize * product(dim_storage_lengths[:-1])) - from pyopencl.characterize import usable_local_mem_size - if (dim_storage_lengths[-1] % 2 == 0 - and other_pf_sizes+other_dim_sizes*(dim_storage_lengths[-1]+1) - < usable_local_mem_size(kernel.device)): - dim_storage_lengths[-1] += 1 + min_mult = cl_char.local_memory_bank_count(kernel.device) + good_incr = None + new_dsl = dim_storage_lengths + + for increment in range(dim_storage_lengths[-1]//2): + + test_dsl = dim_storage_lengths[:] + test_dsl[-1] = test_dsl[-1] + increment + new_mult, why_not = cl_char.why_not_local_access_conflict_free( + kernel.device, pf.itemsize, + shape, test_dsl) + + # will choose smallest increment 'automatically' + if new_mult < min_mult: + new_lmem_use = other_pf_sizes+pf.itemsize*product(new_dsl) + if new_lmem_use < lmem_size: + new_dsl = test_dsl + min_mult = new_mult + good_incr = increment + + if min_mult != 1: + from warnings import warn + warn("could not find a conflict-free mem layout " + "for prefetch of '%s' " + "(currently: %dx conflict, increment: %d)" + % (pf.input_vector, min_mult, good_incr), + LoopyAdvisory) + + new_pf = pf.copy(dim_storage_lengths=dim_storage_lengths, + name="prefetch_%s_%d" % (pf.input_vector, i_pf)) + new_prefetch_dict[pf.input_vector, pf.index_expr] = new_pf + all_pf_list[i_pf] = new_pf + + return kernel.copy(prefetch=new_prefetch_dict) - new_prefetch[pf.input_vector, pf.index_expr] = \ - pf.copy(dim_storage_lengths=dim_storage_lengths, - name="prefetch_%s_%d" % (pf.input_vector, i_pf)) +# }}} - kernel = kernel.copy(prefetch=new_prefetch) +# {{{ main code generation entrypoint - # }}} +class CodeGenerationState(Record): + __slots__ = ["c_code_mapper", "try_slab_partition"] + +def generate_code(kernel): + kernel = preprocess_prefetch(kernel) + + from cgen import (FunctionBody, FunctionDeclaration, + POD, Value, ArrayOf, Module, Block, + Define, Line, Const, LiteralLines, Initializer) + + from cgen.opencl import (CLKernel, CLGlobal, CLRequiredWorkGroupSize, + CLLocal, CLImage) my_ccm = LoopyCCodeMapper(kernel) @@ -2075,14 +2114,18 @@ def add_prefetch(kernel, input_access_descr, tags_or_inames, loc_fetch_axes={}): # }}} - - +# {{{ compiled kernel object class CompiledKernel: - def __init__(self, context, kernel, size_args=None, options=[]): + def __init__(self, context, kernel, size_args=None, options=[], + force_rebuild=False): self.kernel = kernel self.code = generate_code(kernel) + if force_rebuild: + from time import time + self.code = "/* %s */\n%s" % (time(), self.code) + #from pytools import invoke_editor #self.code = invoke_editor(self.code) @@ -2126,20 +2169,11 @@ class CompiledKernel: self.local_size_func = compile( lsize_expr, self.size_args) - - - -# {{{ speed measurement - - # }}} - - - -# driver ---------------------------------------------------------------------- +# {{{ timing driver def drive_timing_run(kernel_generator, queue, launch, flop_count=None, - options=[], print_code=True): + options=[], print_code=True, force_rebuild=False): def time_run(compiled_knl, warmup_rounds=2, timing_rounds=5): check = True @@ -2163,7 +2197,8 @@ def drive_timing_run(kernel_generator, queue, launch, flop_count=None, soln_count = 0 for kernel in kernel_generator: - compiled = CompiledKernel(queue.context, kernel, options=options) + compiled = CompiledKernel(queue.context, kernel, options=options, + force_rebuild=force_rebuild) print "-----------------------------------------------" print "SOLUTION #%d" % soln_count @@ -2184,6 +2219,7 @@ def drive_timing_run(kernel_generator, queue, launch, flop_count=None, print "%d solutions" % soln_count +# }}} -- GitLab