diff --git a/examples/matrix-ops.py b/examples/matrix-ops.py index 36b1f9c87b4adb08e40f2c2a41d69688fbbe11f9..0932b1ca15aa91386d5ca5fea6782336597ea9c5 100644 --- a/examples/matrix-ops.py +++ b/examples/matrix-ops.py @@ -8,44 +8,53 @@ import loopy as lp +def make_well_condition_dev_matrix(queue, n, dtype=np.float32): + return cl_array.to_device(queue, + np.random.randn(n, n).astype(dtype) + 5*np.eye(n, dtype=dtype)) + def main_matrix_mul(): - ctx = cl.create_some_context(answers=[2]) + #ctx = cl.create_some_context() + ctx = cl.create_some_context(answers=[1]) queue = cl.CommandQueue(ctx, properties=cl.command_queue_properties.PROFILING_ENABLE) - #n = 16*34 - n = 16*34 + n = 16*100 from pymbolic import var a, b, c, i, j, k = [var(s) for s in "abcijk"] - k = lp.make_loop_kernel(ctx.devices[0], [ + knl = lp.make_loop_kernel(ctx.devices[0], [ lp.LoopDimension("i", n), lp.LoopDimension("j", n), lp.LoopDimension("k", n), ], [ - (c[i+n*j], a[i+n*k]*b[k+n*j]) - ], - hints={ - lp.HINTS.PREFETCH: {"a": True, "b":True}, - lp.HINTS.MIN_GROUP_SIZE: 128, - lp.HINTS.MIN_GROUP_COUNT: 30, - }) + (c[i*n+j], a[i*n+k]*b[k*n+j]) + ]) + knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0", inner_tag="l.1") + knl = lp.split_dimension(knl, "j", 16, outer_tag="g.1", inner_tag="l.0") + knl = lp.split_dimension(knl, "k", 16) # 8! + knl = lp.add_prefetch_dims(knl, 'a', ["i_inner", "k_inner"]) + knl = lp.add_prefetch_dims(knl, 'b', ["k_inner", "j_inner"]) + assert knl.get_invalid_reason() is None - kernel_gen = lp.generate_all_kernels(k) + kernel_gen = (lp.insert_register_prefetches(knl) + for knl in lp.generate_loop_schedules(knl)) if 1: - a = clrandom.rand(queue, (n, n), dtype=np.float32) - b = clrandom.rand(queue, (n, n), dtype=np.float32) + a = make_well_condition_dev_matrix(queue, n) + b = make_well_condition_dev_matrix(queue, n) c = cl_array.empty_like(a) + refsol = np.dot(a.astype(np.float64).get(), b.astype(np.float64).get()) - def launcher(gsize, lsize, kernel): + def launcher(gsize, lsize, kernel, check): kernel(queue, gsize, lsize, a.data, b.data, c.data, g_times_l=True) - refsol = np.dot(a.astype(np.float64).get(), b.astype(np.float64).get()) - sol = c.astype(np.float64).get() - print la.norm(refsol-sol)/la.norm(refsol) + if check: + sol = c.astype(np.float64).get() + rel_err = la.norm(refsol-sol, "fro")/la.norm(refsol, "fro") + print rel_err + #assert rel_err < 1e-5, rel_err lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3) else: diff --git a/loopy/__init__.py b/loopy/__init__.py index 1b5ff79fa1cf77e09e37e2b2962a93329fe22cae..a7b3b6be546fff4e195ef5ca940b8b1db15fa818 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -5,30 +5,38 @@ from pytools import Record, memoize_method from pymbolic.mapper.dependency import DependencyMapper from pymbolic.mapper.c_code import CCodeMapper from pymbolic.mapper.stringifier import PREC_NONE -from pymbolic.mapper import CombineMapper +from pymbolic.mapper import CombineMapper, RecursiveMapper import pyopencl as cl -import pyopencl.array as cl_array -# TODO: Correctness checking +# TODO: Restrict, const +# TODO: Be smart about picking prefetch 'realization' dims # TODO: More freedom for data types of input and output vectors -# TODO: Try different kernels +# TODO: extra parameters # TODO: Non-multiple loop splits -# TODO: Fake "constant" parameter preservation +# TODO: Symbolic bounds +# TODO: Double precision pragma +# TODO: nD Texture access +# TODO: Required work group sizes + +# TODO: Try different kernels # TODO: Play with multi-d data layout 
(optionally?) +# TODO: Custom reductions per red. axis +# TODO: Fix indirect addressing +# TODO: Switch to sympy, serve multiple accesses with one prefetch +# TODO: Preambles +# TODO: Functions +# TODO: Common subexpressions +# TODO: Vectorize +# TODO: Unroll -LOCAL_SPARE_BYTES = 256 # allow for parameter space on Nv -class HINTS: - class PREFETCH: pass - class MIN_GROUP_SIZE: pass - class MIN_GROUP_COUNT: pass @@ -37,18 +45,9 @@ class HINTS: # {{{ index tags class IndexTag(object): - pass - -class GROUP_IDX_TAG(IndexTag): def __init__(self, axis=None): self.axis = axis - def __repr__(self): - if self.axis is None: - return "GROUP_IDX" - else: - return "GROUP_IDX(%d)" % self.axis - def __eq__(self, other): return (self.__class__ == other.__class__ and self.axis == other.axis) @@ -56,22 +55,45 @@ class GROUP_IDX_TAG(IndexTag): def __ne__(self, other): return not self.__eq__(other) -class WORK_ITEM_IDX_TAG(IndexTag): - def __init__(self, axis=None): - self.axis = axis + def __hash__(self): + return hash(type(self)) ^ hash(self.axis) + +class GROUP_IDX_TAG(IndexTag): + def __repr__(self): + if self.axis is None: + return "GROUP_IDX" + else: + return "GROUP_IDX(%d)" % self.axis + + +class WORK_ITEM_IDX_TAG(IndexTag): def __repr__(self): if self.axis is None: return "WORK_ITEM_IDX" else: return "WORK_ITEM_IDX(%d)" % self.axis - def __eq__(self, other): - return (self.__class__ == other.__class__ - and self.axis == other.axis) +def parse_tag(tag): + if tag is None: + return tag - def __ne__(self, other): - return not self.__eq__(other) + if isinstance(tag, IndexTag): + return tag + + if not isinstance(tag, str): + raise ValueError("cannot parse tag: %s" % tag) + + if tag.startswith("g."): + return GROUP_IDX_TAG(int(tag[2:])) + if tag.startswith("l."): + return WORK_ITEM_IDX_TAG(int(tag[2:])) + else: + raise ValueError("cannot parse tag: %s" % tag) + +# }}} + +# {{{ loop dim, loop domain, kernel class LoopDimension(Record): __slots__ = ["name", "length", "tag"] @@ -91,13 +113,17 @@ class LoopDimension(Record): else: return "LD(%r, %d)" % (self.name, self.length) -# }}} - -# {{{ loop domain, kernel class LoopDomain(Record): __slots__ = ["dims"] + def name_to_idx(self, name): + for i, dim in enumerate(self.dims): + if dim.name == name: + return i + else: + raise KeyError("invalid dimension name: %s" % name) + def name_to_idx(self, name): for i, dim in enumerate(self.dims): if dim.name == name: @@ -129,6 +155,9 @@ class LoopDomain(Record): return [dim for dim in self.dims if isinstance(dim.tag, tag_type)] + def local_mem_use(kernel): + return sum(pf.size() for pf in kernel.prefetch.itervalues()) + def ordered_dim_by_tag_type(self, tag_type): result = [] from itertools import count @@ -151,17 +180,6 @@ class LoopDomain(Record): + [new_dim] + self.dims[(idx+1):]) - def change_dim(self, idx, **kwargs): - return self.set_dim(idx, self.dims[idx].copy(**kwargs)) - - def move(self, from_idx, to_idx): - # BROKEN - new_dims = self.dims[:idx] + self.dims[(idx+1):] - if from_idx > to_idx: - to_idx -= 1 - new_dims.insert(to_idx, self.dims[from_idx]) - return self.copy(dims=new_dims) - @@ -172,20 +190,6 @@ class LoopKernel(LoopDomain): # - instructions # - prefetch # - schedule - # - hints (dictionary of category->value) - # - _used_hints (to tell user about ignored hints, subdict of the above) - - def get_hint(self, hint_category): - try: - return self._used_hints[hint_category] - except KeyError: - pass - except AttributeError: - self._used_hints = {} - - result = self.hints.get(hint_category) - 
self._used_hints[hint_category] = result - return result @memoize_method def all_indices(self): @@ -211,21 +215,41 @@ class LoopKernel(LoopDomain): def reduction_dimensions(self): return [dim for dim in self.dims if dim.name not in self.output_indices()] - def group_counts(self): + def group_dims(self): dims = self.ordered_dim_by_tag_type(GROUP_IDX_TAG) return tuple(dim.length for dim in dims) - def local_size(self): + def local_dims(self): dims = self.ordered_dim_by_tag_type(WORK_ITEM_IDX_TAG) return tuple(dim.length for dim in dims) def group_size(self): from pytools import product - return product(self.local_size()) + return product(self.local_dims()) def group_count(self): from pytools import product - return product(self.group_counts()) + return product(self.group_dims()) + + def parse_sloppy_dim_to_dim_idx(self, dim): + if isinstance(dim, str): + try: + tag = parse_tag(dim) + except ValueError: + pass + else: + return self.tag_to_idx(tag) + + return self.name_to_idx(dim) + + if isinstance(dim, LoopDimension): + return self.dims.index(dim) + + if isinstance(dim, int): + return dim + + def parse_sloppy_dim(self, dim): + return self.dims[self.parse_sloppy_dim_to_dim_idx(dim)] @memoize_method def input_vectors(self): @@ -299,6 +323,18 @@ class LoopKernel(LoopDomain): def split_dimension(self, idx, inner_length, outer_name=None, inner_name=None, outer_tag=None, inner_tag=None): + if isinstance(idx, str): + idx = self.name_to_idx(idx) + + outer_tag = parse_tag(outer_tag) + inner_tag = parse_tag(inner_tag) + + new_tags = set(tag for tag in [outer_tag, inner_tag] if tag is not None) + + for d in self.dims: + if d.tag in new_tags: + raise RuntimeError("repeated tag: %s" % d.tag) + dim = self.dims[idx] if outer_name is None: @@ -326,6 +362,28 @@ class LoopKernel(LoopDomain): ] + self.dims[(idx+1):]), tgt_expr + def get_invalid_reason(self): + gdims = self.group_dims() + ldims = self.local_dims() + if (max(len(gdims), len(ldims)) + > self.device.max_work_item_dimensions): + return "too many work item dimensions" + + for i in range(len(ldims)): + if ldims[i] > self.device.max_work_item_sizes[i]: + return "group axis %d too big" + + if self.group_size() > self.device.max_work_group_size: + return "work group too big" + + from pyopencl.characterize import usable_local_mem_size + if self.local_mem_use() > usable_local_mem_size(self.device): + return "using too much local memory" + + return None + + + @@ -348,128 +406,31 @@ def make_loop_kernel(dev, dims, insns, hints={}): # }}} -# {{{ dim->tag assignment - -def generate_work_item_index_assignment_numberings(kernel): - work_item_idx_dim_indices = kernel.indices_by_tag_type( - WORK_ITEM_IDX_TAG) - - if work_item_idx_dim_indices: - from pytools import generate_unique_permutations - - for perm in generate_unique_permutations( - tuple(range(len(work_item_idx_dim_indices)))): - - new_kernel = kernel - for dim_i, w_item_axis in zip( - work_item_idx_dim_indices, - perm): - new_kernel = new_kernel.change_dim( - dim_i, tag=WORK_ITEM_IDX_TAG(w_item_axis)) - - yield new_kernel - else: - # nothing assigned to work item indices? not interested. 
- pass - - - - -def generate_dim_assignments(kernel, idx=0, - no_work_item_indices=set()): - if idx >= len(kernel.dims): - for knl in generate_work_item_index_assignment_numberings( - kernel): - yield knl - return - - dim = kernel.dims[idx] - - assert dim.length >= 2 - - for knl in generate_dim_assignments(kernel, idx+1, - no_work_item_indices=no_work_item_indices): - yield knl - - from pymbolic import var - - block_idx_dim_count = len(kernel.dims_by_tag_type(GROUP_IDX_TAG)) - - if dim.name in kernel.output_indices() \ - and block_idx_dim_count < 2 : - for knl in generate_dim_assignments( - kernel.change_dim(idx, - tag=GROUP_IDX_TAG(block_idx_dim_count)), - idx+1, - no_work_item_indices=no_work_item_indices): - yield knl - - # try to assign to work item indices - work_item_idx_dims = kernel.dims_by_tag_type(WORK_ITEM_IDX_TAG) - work_item_idx_dims_count = len(work_item_idx_dims) - - from pytools import product - assigned_block_size = product(tid.length - for tid in work_item_idx_dims) - leftover_block_size = (kernel.device.max_work_group_size - // assigned_block_size) - - if (dim.name in kernel.output_indices() - and dim.name not in no_work_item_indices - and dim.tag is None - and work_item_idx_dims_count < 3 - and leftover_block_size > 1): - my_block_length = 1 - while my_block_length < dim.length: - my_block_length *= 2 - if my_block_length > dim.length: - my_block_length = dim.length - - if my_block_length > leftover_block_size: - break - - if dim.length % my_block_length != 0: - continue - - new_length = dim.length//my_block_length - - if new_length > 1: - outer_name = dim.name+"_outer" - inner_name = dim.name+"_inner" - - new_kernel, tgt_expr = kernel.split_dimension(idx, - outer_name=outer_name, - inner_length=my_block_length, - inner_name=inner_name, - inner_tag=WORK_ITEM_IDX_TAG(), - ) - for knl in generate_dim_assignments(new_kernel, idx, - no_work_item_indices=( - no_work_item_indices | set([outer_name]))): - yield knl - else: - for knl in generate_dim_assignments( - kernel.change_dim(idx, tag=WORK_ITEM_IDX_TAG()), - idx+1, - no_work_item_indices=no_work_item_indices): - yield knl - -# }}} - # {{{ local-mem prefetch-related -def total_prefetch_size(kernel): - return sum(pf.size() for pf in kernel.prefetch.itervalues()) - class PrefetchDescriptor(Record): - # possible attributes: - # - input_vector - # - index_expr - # - dims - # - dim_storage_lengths + """ + Attributes: + :ivar input_vector: A string indicating the input vector variable name. + :ivar index_expr: An expression identifying the access which this prefetch + serves. + :ivar dims: A sequence of loop dimensions identifying which part of the + input vector, given the index_expr, should be prefetched. + :ivar loc_fetch_axes: dictionary from integers 0..len(dims) to lists of + local index axes which should be used to realize that dimension of the + prefetch. The last dimension in this list is used as the fastest-changing + one. + :ivar name: the variable name used for the prefetch + :ivar dim_storage_lengths: a sequence of integers indicating the size of + the storage for each dimension. It may may differ from the size of the + actual loop dimensions to mitigate bank conflicts. + + The latter two values are only assigned during code generation. 
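+
+    For illustration (a sketch, not output of this commit): with the 16x16
+    splits from examples/matrix-ops.py above, the access ``a[i*n + k]``
+    would be served by a descriptor along the lines of::
+
+        PrefetchDescriptor(
+            input_vector="a",
+            index_expr=(i_inner + 16*i_outer)*n + k_inner + 16*k_outer,
+            dims=[LoopDimension("i_inner", 16), LoopDimension("k_inner", 16)],
+            loc_fetch_axes={})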
+ """ def size(self): from pytools import product + # FIXME: sizeof return 4*product(dim.length for dim in self.dims) @memoize_method @@ -478,86 +439,17 @@ class PrefetchDescriptor(Record): for var in DependencyMapper()(self.index_expr) ) - set(dim.name for dim in self.dims) + def hash(self): + return (hash(type(self)) ^ hash(self.input_vector) + ^ hash(self.index_expr)) + def __eq__(self, other): + # in particular, dim_storage_lengths should not factor into equality + return (type(self) == type(other) + and self.input_vector == other.input_vector + and self.index_expr == other.index_expr) -def with_added_prefetch_dim(kernel, ivec, iexpr, dim): - new_prefetch = kernel.prefetch.copy() - if (ivec, iexpr) in new_prefetch: - old_pf_descr = new_prefetch[ivec, iexpr] - new_prefetch[ivec, iexpr] = old_pf_descr.copy( - dims=old_pf_descr.dims + [dim]) - else: - new_prefetch[ivec, iexpr] = PrefetchDescriptor( - input_vector=ivec, - index_expr=iexpr, - dims=[dim]) - - new_kernel = kernel.copy(prefetch=new_prefetch) - - if total_prefetch_size(new_kernel) \ - <= kernel.device.local_mem_size - LOCAL_SPARE_BYTES: - return new_kernel - else: - return None - - - - -def generate_prefetch_sizes(kernel, ivec, iexpr, prefetch_dims): - if not prefetch_dims: - yield kernel - return - - dim = prefetch_dims[0] - - if (isinstance(dim.tag, WORK_ITEM_IDX_TAG) - or kernel.is_prefetch_variable(dim.name)): - new_kernel = with_added_prefetch_dim(kernel, ivec, iexpr, dim) - if new_kernel is not None: - for knl in generate_prefetch_sizes( - new_kernel, ivec, iexpr, prefetch_dims[1:]): - yield knl - else: - prefetch_length = 2 - while prefetch_length < dim.length: - if prefetch_length > dim.length: - prefetch_length = dim.length - - if dim.length % prefetch_length != 0: - prefetch_length *= 2 - continue - - outer_length = dim.length//prefetch_length - if outer_length > 1: - # split the dimension, then generate prefetch - inner_name = dim.name+"_prefetch" - - new_kernel, tgt_expr = kernel.split_dimension( - kernel.name_to_idx(dim.name), - inner_length=prefetch_length, - inner_name=inner_name) - - from pymbolic import var - from pymbolic.mapper.substitutor import substitute - - new_iexpr = substitute(iexpr, {var(dim.name): tgt_expr}) - new_kernel = with_added_prefetch_dim( - new_kernel, ivec, new_iexpr, - LoopDimension(inner_name, prefetch_length)) - - if new_kernel is not None: - for knl in generate_prefetch_sizes(new_kernel, - ivec, new_iexpr, prefetch_dims[1:]): - yield knl - else: - # prefetch the whole dimension - new_kernel = with_added_prefetch_dim(kernel, ivec, iexpr, dim) - if new_kernel is not None: - for knl in generate_prefetch_sizes( - new_kernel, ivec, iexpr, prefetch_dims[1:]): - yield knl - prefetch_length *= 2 @@ -588,127 +480,56 @@ class VariableIndexExpressionCollector(CombineMapper): -def generate_single_vector_kernel_prefetch_choices(kernel, ivec, - force_prefetch): - from pytools import flatten - index_exprs = set(flatten( - VariableIndexExpressionCollector(ivec)(expression) - for lvalue, expression in kernel.instructions - )) - - for index_expr in index_exprs: - dm = DependencyMapper() - - involved_dims = list(set(kernel.name_to_dim(idx.name) - for idx in dm(index_expr))) - - prefetch_dims = [dim - for dim in involved_dims - if isinstance(dim.tag, WORK_ITEM_IDX_TAG)] - uncertain_dims = [dim - for dim in involved_dims - if not isinstance(dim.tag, (WORK_ITEM_IDX_TAG, GROUP_IDX_TAG))] - - if force_prefetch == True: - choices = [[True]*len(uncertain_dims)] - elif force_prefetch == False: - choices = 
[[True]*len(uncertain_dims)] - else: - assert force_prefetch is None - from pytools import generate_nonnegative_integer_tuples_below as gnitt - choices = gnitt(2, len(uncertain_dims)) - - for flags in choices: - my_prefetch_dims = prefetch_dims + [ - udim for udim, flag in zip(uncertain_dims, flags) - if flag] - for knl in generate_prefetch_sizes(kernel, - ivec, index_expr, my_prefetch_dims): - yield knl - - - - - -def optimize_prefetch(kernel): - new_prefetch = {} - - pf_list = kernel.prefetch.values() - - from pytools import partition2, product - - for i_pf, pf in enumerate(pf_list): - # reorder to get local_id(0) to the bottom - work_item_pf_dims, non_work_item_pf_dims = partition2( - (isinstance(pfdim.tag, WORK_ITEM_IDX_TAG), pfdim) - for pfdim in pf.dims) - work_item_pf_dims.sort(key=lambda pfdim: pfdim.tag.axis) - - # try and avoid bank conflicts - pfdims = non_work_item_pf_dims+work_item_pf_dims[::-1] - dim_storage_lengths = [pfdim.length for pfdim in pfdims] - - if work_item_pf_dims and work_item_pf_dims[0].tag.axis == 0: - tx_dim = work_item_pf_dims[0] - - # see if we can afford the smem expense in avoiding bank conflicts - other_pf_sizes = sum(opf.size() - for opf in pf_list[:i_pf]+pf_list[i_pf+1:]) - other_dim_sizes = 4*product(odim.length - for odim in work_item_pf_dims[1:]+non_work_item_pf_dims) - - if (tx_dim.length % 2 == 0 - and other_pf_sizes+other_dim_sizes*(tx_dim.length+1) - < kernel.device.local_mem_size-LOCAL_SPARE_BYTES): - dim_storage_lengths[-1] += 1 - - new_prefetch[pf.input_vector, pf.index_expr] = \ - pf.copy(dims=pfdims, dim_storage_lengths=dim_storage_lengths) - - return kernel.copy(prefetch=new_prefetch) - - - - +class StrideCollector(RecursiveMapper): + def map_sum(self, expr): + stride_dicts = [self.rec(ch) for ch in expr.children] + result = {} + for stride_dict in stride_dicts: + for var, stride in stride_dict.iteritems(): + if var in result: + result[var] += stride + else: + result[var] = stride -def generate_kernel_prefetch_choices(kernel, ivecs, prefetch_hints): - if ivecs: - for knl in generate_single_vector_kernel_prefetch_choices( - kernel, ivecs[0], prefetch_hints.get(ivecs[0])): - for subknl in generate_kernel_prefetch_choices( - knl, ivecs[1:], prefetch_hints): - yield subknl - else: - yield kernel - + return result + def map_product(self, expr): + result = {} + for i, ch in enumerate(expr.children): + strides = self.rec(ch) + from pymbolic import flattened_product + prod_other_children = flattened_product( + expr.children[:i] + expr.children[(i+1):]) + + for var, stride in strides.iteritems(): + if var in result: + raise NotImplementedError( + "nonlinear index expression") + else: + result[var] = prod_other_children*stride + return result -def generate_all_prefetching_kernels(kernel): - prefetch_hints = kernel.get_hint(HINTS.PREFETCH) + def map_divide(self, expr): + num_strides = self.rec(expr.numerator) + denom_strides = self.rec(expr.denominator) - kernel = kernel.copy(prefetch={}) + if denom_strides: + raise NotImplementedError - from pytools import generate_nonnegative_integer_tuples_below as gnitt - ivecs = kernel.input_vectors() + return dict( + (var, stride/expr.denominator) + for var, stride in num_strides.iteritems()) - certain_yes_ivecs = [] - uncertain_ivecs = [] + def map_constant(self, expr): + return {} - for iv in ivecs: - hint = prefetch_hints.get(iv) - if hint == True: - certain_yes_ivecs.append(iv) - elif hint is None: - uncertain_ivecs.append(iv) + def map_variable(self, expr): + return {expr.name: 1} - for flags in gnitt(2, 
len(uncertain_ivecs)): - for knl in generate_kernel_prefetch_choices(kernel, - certain_yes_ivecs - + [ivec for flag, ivec in zip(flags, uncertain_ivecs) if flag], - prefetch_hints): - yield optimize_prefetch(knl) + def map_subscript(self, expr): + raise RuntimeError("cannot gather strides--indirect addressing in use") # }}} @@ -882,16 +703,16 @@ def insert_register_prefetches(kernel): # {{{ code generation class LoopyCCodeMapper(CCodeMapper): - def __init__(self, kernel, get_prefetch_name): + def __init__(self, kernel): def constant_mapper(c): if isinstance(c, float): + # FIXME: type-variable return "%sf" % repr(c) else: return repr(c) CCodeMapper.__init__(self, constant_mapper=constant_mapper) self.kernel = kernel - self.get_prefetch_name = get_prefetch_name def map_subscript(self, expr, enclosing_prec): from pymbolic.primitives import Variable @@ -902,7 +723,7 @@ class LoopyCCodeMapper(CCodeMapper): except KeyError: pass else: - return self.get_prefetch_name(pf)+"".join( + return pf.name+"".join( "[%s]" % dim.name for dim in pf.dims) return CCodeMapper.map_subscript(self, expr, enclosing_prec) @@ -920,48 +741,233 @@ class RegisterPrefetch(Record): -def make_fetch_index_expr(kernel, exclude): +def generate_prefetch_code(ccm, kernel, schedule, sched_index, inner): from pymbolic import var - expr = 0 - for dim in kernel.ordered_dim_by_tag_type(WORK_ITEM_IDX_TAG)[::-1]: - if dim in exclude: - continue - expr = expr*dim.length + var("get_local_id")(dim.tag.axis) + from cgen import (POD, Block, + Initializer, Assign, Statement as S, ArrayOf, + For, If, Line, Comment, MaybeUnused) + + from cgen.opencl import CLLocal + + # find surrounding schedule items + if sched_index > 0: + next_inner_sched_item = schedule[sched_index-1] + else: + next_inner_sched_item = None + + if sched_index+1 < len(schedule): + next_outer_sched_item = schedule[sched_index+1] + else: + next_outer_sched_item = None + + scheduled_pf = schedule[sched_index] + pf = kernel.prefetch[ + scheduled_pf.input_vector, scheduled_pf.index_expr] + + # figure out dimension types + from pytools import partition2 + work_item_pf_dims, non_work_item_pf_dims = partition2( + (isinstance(dim.tag, WORK_ITEM_IDX_TAG), dim) + for dim in pf.dims) + + # Prefetch has a good amount of flexibility over what axes it + # uses to accomplish the prefetch. In particular, it can (and should!) + # use all work group dimensions. 
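+    #
+    # For instance, with the matrix-ops.py example above (n = 16*100), the
+    # StrideCollector defined earlier gives, for an access a[i*n + k],
+    #
+    #     StrideCollector()(i*1600 + k)   # -> {"i": 1600, "k": 1}
+    #
+    # so "k" is the unit-stride axis; handing it to get_local_id(0) below
+    # keeps a work group's global memory reads contiguous.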
+ + # {{{ determine which dims are used to realize the fetch + + # realization_dims is a list of lists of dims, to represent when two dims jointly + # make up one fetch axis + + realization_dims = [None] * len(pf.dims) + + # {{{ first, fix the user-specified fetch dims + + knl_work_item_dims = sorted(kernel.dims_by_tag_type(WORK_ITEM_IDX_TAG), + key=lambda dim: dim.tag.axis) + + for realization_dim_idx, loc_fetch_axis_list in \ + getattr(pf, "loc_fetch_axes", {}).iteritems(): + realization_dims[realization_dim_idx] = [knl_work_item_dims.pop(axis) + for axis in loc_fetch_axis_list] + + # }}} + + # {{{ next use the work group dimensions, least-stride dim first + + strides = StrideCollector()(pf.index_expr) + + def stride_key(a): + idx, a_stride = a + # FIXME: a_stride could be symbolic + assert isinstance(a_stride, int) + return a_stride + + pf_dim_strides = sorted(((dim_idx, strides[dim.name]) + for dim_idx, dim in enumerate(pf.dims) + if realization_dims[dim_idx] is None), + key=stride_key) + + while knl_work_item_dims and pf_dim_strides: + # grab least-stride prefetch dim + least_stride_pf_dim_idx, _ = pf_dim_strides.pop(0) + + # FIXME: It might be good to join multiple things together here + # for size reasons + realization_dims[least_stride_pf_dim_idx] = [knl_work_item_dims.pop(0)] + + # }}} + + # }}} + + # {{{ generate fetch code + + def make_fetch_loop_nest(pf_dim_idx, pf_dim_exprs=[], pf_idx_subst_map={}): + # may mutate kernel for prefetch dim enlargement + + if pf_dim_idx >= len(pf.dims): + from pymbolic.mapper.substitutor import substitute + # done, return + return Assign( + pf.name + "".join("[%s]" % ccm(dexpr, PREC_NONE) + for dexpr in pf_dim_exprs), + "%s[%s]" + % (pf.input_vector, + substitute(pf.index_expr, pf_idx_subst_map)) + ) + + pf_dim = pf.dims[pf_dim_idx] + realiz_dim_list = realization_dims[pf_dim_idx] + + if realiz_dim_list is not None: + # parallel fetch + pf_dim_expr = 0 + + from pytools import product + total_realiz_size = product(rd.length for rd in realiz_dim_list) + + # TODO: Too big/too small? 
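+            # The loop below linearizes all realization axes into a single
+            # index, Horner-style. For hypothetical local-axis lengths [2, 8]
+            # realizing a 16-long prefetch dim, it builds
+            #
+            #     pf_dim_expr = get_local_id(a)*8 + get_local_id(b)
+            #
+            # which sweeps indices 0..15 exactly once per work group.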
+            assert pf_dim.length == total_realiz_size
+
+            for realiz_dim in realiz_dim_list:
+                assert isinstance(realiz_dim.tag, WORK_ITEM_IDX_TAG)
+
+                pf_dim_expr = (pf_dim_expr*realiz_dim.length
+                        + var("get_local_id(%d)" % realiz_dim.tag.axis))
+
+            pf_idx_subst_map = pf_idx_subst_map.copy()
+            pf_idx_subst_map[pf_dim.name] = pf_dim_expr
+            inner = make_fetch_loop_nest(pf_dim_idx+1,
+                    pf_dim_exprs+[pf_dim_expr], pf_idx_subst_map)
+            return inner
+        else:
+            # sequential fetch
+            pf_dim_var = "prefetch_dim_idx_%d" % pf_dim_idx
+            pf_dim_expr = var(pf_dim_var)
+
+            pf_idx_subst_map = pf_idx_subst_map.copy()
+            pf_idx_subst_map[pf_dim.name] = pf_dim_expr
+            inner = make_fetch_loop_nest(pf_dim_idx+1,
+                    pf_dim_exprs+[pf_dim_expr], pf_idx_subst_map)
+
+            return For(
+                    "int %s = 0" % pf_dim_var,
+                    "%s < %s" % (pf_dim_var, pf_dim.length),
+                    "++%s" % pf_dim_var,
+                    inner)
+
+
+    fetch_block = make_fetch_loop_nest(0)
+
+    # }}}
+
+    # {{{ build lmem array declarator
+
+    smem_pf_array = POD(numpy.float32, pf.name)
+    for l in pf.dim_storage_lengths:
+        smem_pf_array = ArrayOf(smem_pf_array, l)
+    smem_pf_array = CLLocal(smem_pf_array)
+
+    # }}}
+
+    new_block = Block([
+            Comment(("prefetch %s dim: " % pf.input_vector) + ", ".join(
+                "%s[%d]" % (pfdim.name, pfdim.length) for pfdim in pf.dims)),
+            Line(),
+            ])
+
+    # omit head sync primitive if we just came out of a prefetch
+    if not isinstance(next_outer_sched_item, PrefetchDescriptor):
+        new_block.append(S("barrier(CLK_LOCAL_MEM_FENCE)"))
+    else:
+        new_block.append(Comment("next outer schedule item is a prefetch: "
+            "no sync needed"))
+
+    new_block.extend([
+        Line(),
+        smem_pf_array,
+        fetch_block,
+        Line(),
+        ])
+
+    # omit tail sync primitive if we're headed into another prefetch
+    if not isinstance(next_inner_sched_item, PrefetchDescriptor):
+        new_block.append(S("barrier(CLK_LOCAL_MEM_FENCE)"))
+    else:
+        new_block.append(Comment("next inner schedule item is a prefetch: "
+            "no sync needed"))
+
+    new_block.extend([Line(),inner])
 
-    return expr
+    return new_block
 
 
 
 def generate_code(kernel):
     from cgen import FunctionBody, FunctionDeclaration, \
-            POD, Value, Pointer, Module, Block, \
-            Initializer, Assign, Statement, For, ArrayOf, \
-            Define, If, Line, Comment, MaybeUnused
+            POD, Value, RestrictPointer, Module, Block, \
+            Initializer, Assign, Statement, For, \
+            Define, Line, Const
 
-    from cgen.opencl import CLKernel, CLGlobal, CLLocal
+    from cgen.opencl import CLKernel, CLGlobal
 
     S = Statement
 
     from pymbolic.primitives import Subscript
-    from pymbolic import var
 
-    # {{{ prefetch name assignment
+    # {{{ assign names, dim storage lengths to prefetches
+
+    all_pf_list = kernel.prefetch.values()
+    all_pf_sizes = [opf.size() for opf in all_pf_list]
+
+    new_prefetch = {}
+    for i_pf, pf in enumerate(kernel.prefetch.itervalues()):
+        dim_storage_lengths = [pfdim.length for pfdim in pf.dims]
+
+        other_pf_sizes = sum(all_pf_sizes[:i_pf]+all_pf_sizes[i_pf+1:])
+        # FIXME: sizeof()
+
+        from pytools import product
+        other_dim_sizes = 4*product(odim.length for odim in pf.dims[:-1])
 
-    def get_prefetch_name(pf):
-        try:
-            return prefetch_names[pf]
-        except KeyError:
-            nm = "prefetch_%s_%d" % (pf.input_vector, len(prefetch_names))
-            prefetch_names[pf] = nm
-            return nm
+        from pyopencl.characterize import usable_local_mem_size
+        if (pf.dims[-1].length % 2 == 0
+                and other_pf_sizes+other_dim_sizes*(pf.dims[-1].length+1)
+                < usable_local_mem_size(kernel.device)):
+            dim_storage_lengths[-1] += 1
 
-    prefetch_names = {}
+        new_prefetch[pf.input_vector, pf.index_expr] = \
+                pf.copy(dims=pf.dims,
dim_storage_lengths=dim_storage_lengths, + name="prefetch_%s_%d" % (pf.input_vector, i_pf)) + + kernel = kernel.copy(prefetch=new_prefetch) # }}} - ccm = LoopyCCodeMapper(kernel, get_prefetch_name) + ccm = LoopyCCodeMapper(kernel) # {{{ write innermost loop body @@ -983,17 +989,6 @@ def generate_code(kernel): schedule = kernel.schedule[::-1] for sched_index, sched_item in enumerate(schedule): - # find surrounding schedule items - if sched_index > 0: - next_inner_sched_item = schedule[sched_index-1] - else: - next_inner_sched_item = None - - if sched_index+1 < len(schedule): - next_outer_sched_item = schedule[sched_index+1] - else: - next_outer_sched_item = None - # write code for loops if isinstance(sched_item, LoopDimension): dim = sched_item @@ -1017,124 +1012,8 @@ def generate_code(kernel): # write code for prefetches elif isinstance(sched_item, PrefetchDescriptor): - pf = sched_item - pf_name = get_prefetch_name(pf) - - # build smem array declarator - smem_pf_array = POD(numpy.float32, pf_name) - for l in pf.dim_storage_lengths: - smem_pf_array = ArrayOf(smem_pf_array, l) - smem_pf_array = CLLocal(smem_pf_array) - - # figure out dimensions - from pytools import partition2 - work_item_pf_dims, non_work_item_pf_dims = partition2( - (isinstance(dim.tag, WORK_ITEM_IDX_TAG), dim) - for dim in pf.dims) - - # start writing fetch code block - fetch_block = Block([ - Initializer( - MaybeUnused(POD(numpy.uint32, "fetch_idx")), - make_fetch_index_expr(kernel, work_item_pf_dims)) - ]) - - # Prefetch needs to take types of dimensions into account: - # - # - non-launch prefetch dimensions (1) - # - launch prefetch dimensions (2) - # - non-prefetch launch dimensions (3) - # - # Type (1) comprises a pool of fetches that must be efficiently - # fetched with the "extra parallelism" available through (3). 
- - from pytools import product - non_work_item_pf_size = product(dim.length for dim in non_work_item_pf_dims) - work_item_pf_size = product(dim.length for dim in work_item_pf_dims) - non_pf_work_item_size = product( - dim.length for dim in kernel.dims_by_tag_type(WORK_ITEM_IDX_TAG) - if dim not in work_item_pf_dims) - - # generate indexing substitution map and smem assignment indices - pf_indices = [] - pf_idx_subst_map = {} - prev_dim_sizes = 1 - for i, pf_dim in enumerate(pf.dims): - if isinstance(pf_dim.tag, WORK_ITEM_IDX_TAG): - dim_expr = "get_local_id(%d)" % pf_dim.tag.axis - pf_indices.append(dim_expr) - pf_idx_subst_map[pf_dim.name] = var(dim_expr) - else: - dim_expr = var("fetch_idx") / prev_dim_sizes - - if [pf_subdim - for pf_subdim in pf.dims[i+1:] - if isinstance(pf_subdim, WORK_ITEM_IDX_TAG)]: - dim_expr = dim_expr % pf_dim.length - - pf_indices.append(str(dim_expr)) - pf_idx_subst_map[pf_dim.name] = dim_expr - prev_dim_sizes *= pf_dim.length - - assert prev_dim_sizes == non_work_item_pf_size - - # generate smem assignment statement - from pymbolic.mapper.substitutor import substitute - pf_assignment = Assign( - pf_name + "".join("[%s]" % dexpr - for dexpr in pf_indices), - "%s[%s]" - % (pf.input_vector, - substitute(pf.index_expr, pf_idx_subst_map)) - ) - - # generate fetch loop - - # fetch_base and fetch_idx don't include work_item_pf_size - fetch_base = 0 - while fetch_base + non_pf_work_item_size <= non_work_item_pf_size: - fetch_block.append(pf_assignment) - if fetch_base + non_pf_work_item_size < non_work_item_pf_size: - fetch_block.append( - S("fetch_idx += %d" % non_pf_work_item_size)) - fetch_base += non_pf_work_item_size - if fetch_base < non_work_item_pf_size: - fetch_block.append( - If("fetch_idx < %d" % non_work_item_pf_size, - pf_assignment)) - - new_block = Block([ - Comment(("prefetch %s dim: " % pf.input_vector) + ", ".join( - "%s[%d]" % (pfdim.name, pfdim.length) for pfdim in pf.dims)), - Comment(" ... 
direct-work-item prefetch dims: " + ", ".join( - pfdim.name for pfdim in work_item_pf_dims)), - Line(), - ]) - - # omit head sync primitive if we just came out of a prefetch - if not isinstance(next_outer_sched_item, PrefetchDescriptor): - new_block.append(S("barrier(CLK_LOCAL_MEM_FENCE)")) - else: - new_block.append(Comment("next outer schedule item is a prefetch: " - "no sync needed")) - - new_block.extend([ - Line(), - smem_pf_array, - fetch_block, - Line(), - ]) - - # omit tail sync primitive if we're headed into another prefetch - if not isinstance(next_inner_sched_item, PrefetchDescriptor): - new_block.append(S("barrier(CLK_LOCAL_MEM_FENCE)")) - else: - new_block.append(Comment("next inner schedule item is a prefetch: " - "no sync needed")) - - new_block.extend([Line(),inner]) - - inner = new_block + inner = generate_prefetch_code( + ccm, kernel, schedule, sched_index, inner) elif isinstance(sched_item, RegisterPrefetch): inner = Block([ @@ -1170,9 +1049,9 @@ def generate_code(kernel): FunctionBody( CLKernel(FunctionDeclaration( Value("void", "loopy_kernel"), - [CLGlobal(Pointer(POD(numpy.float32, name))) + [CLGlobal(Const(RestrictPointer(POD(numpy.float32, name)))) for name in kernel.input_vectors()] - + [CLGlobal(Pointer(POD(numpy.float32, name))) + + [CLGlobal(RestrictPointer(POD(numpy.float32, name))) for name in kernel.output_vectors()])), Block([inner]))) @@ -1186,7 +1065,7 @@ def generate_code(kernel): def print_kernel_info(knl): if hasattr(knl, "prefetch"): - print "PREFETCH", total_prefetch_size(knl) + print "PREFETCH", knl.local_mem_use() for pf in knl.prefetch.itervalues(): print " %s[%s]: %s" % (pf.input_vector, pf.index_expr, pf.dims) print @@ -1205,6 +1084,65 @@ def print_kernel_info(knl): # }}} +# {{{ high-level modifiers + +def split_dimension(knl, *args, **kwargs): + knl, _ = knl.split_dimension(*args, **kwargs) + return knl + +def get_input_access_descriptors(kernel): + """Return a dictionary mapping input vectors to + a list of input access descriptor. An input access + descriptor is a tuple (input_vec, index_expr). + """ + from pytools import flatten + result = {} + for ivec in kernel.input_vectors(): + result[ivec] = [ + (ivec, iexpr) + for iexpr in flatten( + VariableIndexExpressionCollector(ivec)(expression) + for lvalue, expression in kernel.instructions + )] + + return result + +def add_prefetch_dims(kernel, input_access_descr, dims, loc_fetch_axes={}): + """ + :arg input_access_descr: see :func:`get_input_access_descriptors`. + May also be the name of the variable if there is only one + reference to that variable. 
:arg dims: loop dimensions that are used to carry out the prefetch
+    """
+
+    if isinstance(input_access_descr, str):
+        var_name = input_access_descr
+        var_iads = get_input_access_descriptors(kernel)[var_name]
+
+        if len(var_iads) != 1:
+            raise ValueError("input access descriptor for variable %s is "
+                    "not unique" % var_name)
+
+        input_access_descr, = var_iads
+
+    dims = [kernel.parse_sloppy_dim(dim) for dim in dims]
+    ivec, iexpr = input_access_descr
+
+    new_prefetch = getattr(kernel, "prefetch", {}).copy()
+    if input_access_descr in new_prefetch:
+        raise ValueError("a prefetch descriptor for the input access %s[%s] "
+                "already exists" % (ivec, iexpr))
+
+    new_prefetch[input_access_descr] = PrefetchDescriptor(
+            input_vector=ivec,
+            index_expr=iexpr,
+            dims=dims,
+            loc_fetch_axes=loc_fetch_axes)
+
+    return kernel.copy(prefetch=new_prefetch)
+
+# }}}
+
@@ -1223,17 +1161,20 @@ class CompiledKernel:
         self.cl_kernel = cl.Program(context, self.code).build().loopy_kernel
 
     def time_run(self, queue, launcher, warmup_rounds=2, timing_rounds=5):
+        check = True
         for i in range(warmup_rounds):
-            launcher(self.kernel.group_counts(), self.kernel.local_size(),
-                    self.cl_kernel)
+            launcher(self.kernel.group_dims(), self.kernel.local_dims(),
+                    self.cl_kernel, check=check)
+            check = False
+
         evt_start = cl.enqueue_marker(queue)
+
         for i in range(timing_rounds):
-            launcher(self.kernel.group_counts(), self.kernel.local_size(),
-                    self.cl_kernel)
+            launcher(self.kernel.group_dims(), self.kernel.local_dims(),
+                    self.cl_kernel, check=check)
 
         evt_end = cl.enqueue_marker(queue)
         evt_end.wait()
 
-        return 1e-9*(evt_end.profile.START-evt_start.profile.START)
+        return 1e-9*(evt_end.profile.START-evt_start.profile.START)/timing_rounds
 
 # }}}
 
@@ -1241,23 +1182,6 @@ class CompiledKernel:
 
 # driver ----------------------------------------------------------------------
 
-def generate_all_kernels(orig_kernel):
-    min_group_size = orig_kernel.get_hint(HINTS.MIN_GROUP_SIZE)
-    min_group_count = orig_kernel.get_hint(HINTS.MIN_GROUP_COUNT)
-
-    for knl in generate_dim_assignments(orig_kernel):
-        if min_group_size is not None and knl.group_size() < min_group_size:
-            continue
-        if min_group_count is not None and knl.group_count() < min_group_count:
-            continue
-
-        for pf_knl in generate_all_prefetching_kernels(knl):
-            for sch_knl in generate_loop_schedules(pf_knl):
-                yield insert_register_prefetches(sch_knl)
-
-
-
-
 def drive_timing_run(kernel_generator, queue, launch, flop_count=None):
     soln_count = 0
     for kernel in kernel_generator:
@@ -1285,16 +1209,4 @@ def drive_timing_run(kernel_generator, queue, launch, flop_count=None):
 
 
 
-def show_kernel_codes(kernel_generator):
-    soln_count = 0
-
-    for kernel in kernel_generator:
-        print "-----------------------------------------------"
-        print "SOLUTION #%d" % soln_count
-        print "-----------------------------------------------"
-        print generate_code(kernel)
-        soln_count += 1
-
-    print "%d solutions" % soln_count
-
 # vim: foldmethod=marker
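
A usage sketch for the new prefetch API (the kernel "knl", vector "u", and
index names are hypothetical; the functions are the ones added above). When an
input vector occurs in more than one subscript expression, the string shorthand
raises ValueError, and the access has to be picked out explicitly:

    import loopy as lp

    # one access descriptor per distinct subscript of "u"
    iads = lp.get_input_access_descriptors(knl)["u"]

    # prefetch only the first access, realized over two work-item-tagged dims
    knl = lp.add_prefetch_dims(knl, iads[0], ["i_inner", "j_inner"])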