From 221bbe2a251f835cd4897ee588b7a177de7737cb Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner <inform@tiker.net> Date: Mon, 8 Aug 2011 01:27:22 +0200 Subject: [PATCH] Split loopy into multiple, smaller files. --- examples/matrix-mul.py | 88 + loopy/__init__.py | 2364 +---------------- loopy/codegen/__init__.py | 271 ++ loopy/codegen/bounds.py | 116 + loopy/codegen/dispatch.py | 131 + loopy/codegen/loop_dim.py | 260 ++ loopy/codegen/prefetch.py | 372 +++ loopy/compiled.py | 122 + loopy/isl.py | 260 ++ loopy/kernel.py | 475 ++++ loopy/prefetch.py | 143 + loopy/schedule.py | 165 ++ loopy/symbolic.py | 204 ++ examples/matrix-ops.py => test/test_matmul.py | 0 14 files changed, 2646 insertions(+), 2325 deletions(-) create mode 100644 examples/matrix-mul.py create mode 100644 loopy/codegen/__init__.py create mode 100644 loopy/codegen/bounds.py create mode 100644 loopy/codegen/dispatch.py create mode 100644 loopy/codegen/loop_dim.py create mode 100644 loopy/codegen/prefetch.py create mode 100644 loopy/compiled.py create mode 100644 loopy/isl.py create mode 100644 loopy/kernel.py create mode 100644 loopy/prefetch.py create mode 100644 loopy/schedule.py create mode 100644 loopy/symbolic.py rename examples/matrix-ops.py => test/test_matmul.py (100%) diff --git a/examples/matrix-mul.py b/examples/matrix-mul.py new file mode 100644 index 000000000..78ebb088c --- /dev/null +++ b/examples/matrix-mul.py @@ -0,0 +1,88 @@ +import numpy as np +import pyopencl as cl +import pyopencl.array as cl_array +import loopy as lp + + + + +def make_well_conditioned_dev_matrix(queue, shape, dtype=np.float32, + order="C", ran_factor=1, id_factor=5, inc_factor=0, od=0): + if isinstance(shape, int): + shape = (shape, shape) + l = max(shape) + eye_ish = id_factor*np.eye(l, k=od) + if inc_factor: + eye_ish[np.arange(l), np.arange(l)] = inc_factor*np.arange(l) + ary = np.asarray( + ran_factor*np.random.randn(*shape) + + eye_ish[:shape[0], :shape[1]], + dtype=dtype, order=order) + + return 
cl_array.to_device(queue, ary) + + + + +def image_matrix_mul_ilp(ctx_factory=cl.create_some_context): + dtype = np.float32 + ctx = ctx_factory() + order = "C" + queue = cl.CommandQueue(ctx, + properties=cl.command_queue_properties.PROFILING_ENABLE) + + n = 16*10 + from pymbolic import var + a, b, c, i, j, k, n_sym = [var(s) for s in "abcijkn"] + + knl = lp.LoopKernel(ctx.devices[0], + "{[i,j,k]: 0<=i,j,k<%d}" % n, + [ + (c[i, j], a[i, k]*b[k, j]) + ], + [ + lp.ImageArg("a", dtype, 2), + lp.ImageArg("b", dtype, 2), + lp.ArrayArg("c", dtype, shape=(n, n), order=order), + ], + name="matmul") + + ilp = 4 + knl = lp.split_dimension(knl, "i", 2, outer_tag="g.0", inner_tag="l.1") + j_inner_split = 16 + knl = lp.split_dimension(knl, "j", ilp*j_inner_split, outer_tag="g.1") + knl = lp.split_dimension(knl, "j_inner", j_inner_split, outer_tag="ilp", inner_tag="l.0") + knl = lp.split_dimension(knl, "k", 2) + + knl = lp.add_prefetch(knl, 'a', ["i_inner", "k_inner"]) + knl = lp.add_prefetch(knl, 'b', ["j_inner_outer", "j_inner_inner", "k_inner"]) + inv_reason = knl.get_invalid_reason() + assert inv_reason is None, inv_reason + + kernel_gen = (lp.insert_register_prefetches(knl) + for knl in lp.generate_loop_schedules(knl)) + + a = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order, + ran_factor=1, id_factor=5) + b = make_well_conditioned_dev_matrix(queue, n, dtype=dtype, order=order, + ran_factor=1, id_factor=5, inc_factor=0) + c = cl_array.empty_like(a) + a_img = cl.image_from_array(ctx, a.get(), 1) + b_img = cl.image_from_array(ctx, b.get(), 1) + + def launcher(kernel, gsize, lsize, check): + evt = kernel(queue, gsize(), lsize(), a_img, b_img, c.data, + g_times_l=True) + + return evt + + from pyopencl.characterize import get_fast_inaccurate_build_options + lp.drive_timing_run(kernel_gen, queue, launcher, 2*n**3, + options=get_fast_inaccurate_build_options(ctx.devices[0]), + force_rebuild=True) + + + + +if __name__ == "__main__": + image_matrix_mul_ilp() diff 
--git a/loopy/__init__.py b/loopy/__init__.py index 4420afd44..cb497a4b9 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -1,2238 +1,56 @@ from __future__ import division -import numpy as np -from pytools import Record, memoize_method -from pymbolic.mapper.dependency import DependencyMapper -from pymbolic.mapper.c_code import CCodeMapper -from pymbolic.mapper.stringifier import PREC_NONE -from pymbolic.mapper import CombineMapper, RecursiveMapper - -import pyopencl as cl -import pyopencl.characterize as cl_char -import islpy as isl -from islpy import dim_type - -def register_mpz_with_pymbolic(): - from pymbolic.primitives import register_constant_class - import gmpy - mpz_type = type(gmpy.mpz(1)) - register_constant_class(mpz_type) - -register_mpz_with_pymbolic() - - - -# TODO: Constant memory -# TODO: Reuse of previously split dimensions for prefetch -# (Or general merging) -# TODO: ILP Unroll -# ILP dep chain: -# Prefetch needs value for k_outer -# k_outer is outermost reduction loop -# ILP must be outside of reduction loops -# Therfore, there are prefetches inside ILPs -# TODO: Debug 1 ILP -# FIXME: Random equality constraints -# TODO: Use increment for ILP? - -# TODO: Try, fix reg. prefetch (DG example) / CSEs -# TODO: Custom reductions per red. axis -# TODO: Functions -# TODO: Common subexpressions -# TODO: Parse ops from string -# FIXME: support non-reductive dimensions -# FIXME: write names should be assigned during scheduling - -# TODO: Condition hoisting -# TODO: Don't emit spurious barriers (no for scheduled before) -# TODO: Make code more readable - -# TODO: Split into multiple files. -# TODO: Divisibility -# TODO: Try different kernels -# TODO: - Tricky: Convolution, FD -# TODO: Try, fix indirect addressing -# TODO: User controllable switch for slab opt -# TODO: Separate all-bulk from non-bulk kernels. - - -# TODO: implement efficient div_ceil? -# TODO: why are corner cases inefficient? 
- - - - -class LoopyAdvisory(UserWarning): - pass - -# {{{ index tags - -class IndexTag(Record): - __slots__ = [] - - def __hash__(self): - raise RuntimeError("use .key to hash index tags") - - @property - def key(self): - return type(self) - - - -class ParallelTag(IndexTag): - pass - -class AxisParallelTag(ParallelTag): - __slots__ = ["axis", "forced_length"] - - def __init__(self, axis, forced_length=None): - Record.__init__(self, - axis=axis, forced_length=forced_length) - - @property - def key(self): - return (type(self), self.axis) - - def __repr__(self): - if self.forced_length: - return "%s(%d, flen=%d)" % ( - self.print_name, self.axis, - self.forced_length) - else: - return "%s(%d)" % ( - self.print_name, self.axis) - -class TAG_GROUP_IDX(AxisParallelTag): - print_name = "GROUP_IDX" - -class TAG_WORK_ITEM_IDX(AxisParallelTag): - print_name = "WORK_ITEM_IDX" - -class TAG_ILP(ParallelTag): - def __repr__(self): - return "TAG_ILP" - -class BaseUnrollTag(IndexTag): - pass - -class TAG_UNROLL_STATIC(BaseUnrollTag): - def __repr__(self): - return "TAG_UNROLL_STATIC" - -class TAG_UNROLL_INCR(BaseUnrollTag): - def __repr__(self): - return "TAG_UNROLL_INCR" - -def parse_tag(tag): - if tag is None: - return tag - - if isinstance(tag, IndexTag): - return tag - - if not isinstance(tag, str): - raise ValueError("cannot parse tag: %s" % tag) - - if tag in ["unrs", "unr"]: - return TAG_UNROLL_STATIC() - elif tag == "unri": - return TAG_UNROLL_INCR() - elif tag == "ilp": - return TAG_ILP() - elif tag.startswith("g."): - return TAG_GROUP_IDX(int(tag[2:])) - elif tag.startswith("l."): - return TAG_WORK_ITEM_IDX(int(tag[2:])) - else: - raise ValueError("cannot parse tag: %s" % tag) - -# }}} - -# {{{ isl helpers - -def get_bounds_constraints(bset, iname, space=None, admissible_vars=None): - if isinstance(bset, isl.Set): - bset, = bset.get_basic_sets() - - constraints = bset.get_constraints() - - if not isinstance(admissible_vars, set): - admissible_vars = set(admissible_vars) 
- - lower = [] - upper = [] - equality = [] - - if space is None: - space = bset.get_dim() - - var_dict = space.get_var_dict() - iname_tp, iname_idx = var_dict[iname] - - for cns in constraints: - iname_coeff = int(cns.get_coefficient(iname_tp, iname_idx)) - - if admissible_vars is not None: - if not (set(cns.get_coefficients_by_name().iterkeys()) - <= admissible_vars): - continue - - if iname_coeff == 0: - continue - - if cns.is_equality(): - equality.append(cns) - elif iname_coeff < 0: - upper.append(cns) - else: # iname_coeff > 0 - lower.append(cns) - - return lower, upper, equality - - -def get_projected_bounds_constraints(set, iname): - """Get an overapproximation of the loop bounds for the variable *iname*, - as constraints. - """ - - # project out every variable except iname - projected_domain = isl.project_out_except(set, [iname], [dim_type.set]) - - basic_sets = projected_domain.get_basic_sets() - - # FIXME perhaps use some form of hull here if there's more than one - # basic set? 
- bset, = basic_sets - - # Python-style, half-open bounds - upper_bounds = [] - lower_bounds = [] - bset = bset.remove_divs() - - bset_iname_dim_type, bset_iname_idx = bset.get_dim().get_var_dict()[iname] - - def examine_constraint(cns): - assert not cns.is_equality() - assert not cns.is_div_constraint() - - coeffs = cns.get_coefficients_by_name() - - iname_coeff = int(coeffs.get(iname, 0)) - if iname_coeff == 0: - return - elif iname_coeff < 0: - upper_bounds.append(cns) - else: # iname_coeff > 0: - lower_bounds.append(cns) - - bset.foreach_constraint(examine_constraint) - - lb, = lower_bounds - ub, = upper_bounds - - return lb, ub - - - - -def solve_constraint_for_bound(cns, iname): - rhs, iname_coeff = constraint_to_expr(cns, except_name=iname) - - if iname_coeff == 0: - raise ValueError("cannot solve constraint for '%s'--" - "constraint does not contain variable" - % iname) - - from pymbolic import expand - from pytools import div_ceil - from pymbolic import flatten - from pymbolic.mapper.constant_folder import CommutativeConstantFoldingMapper - cfm = CommutativeConstantFoldingMapper() - - if iname_coeff > 0 or cns.is_equality(): - if cns.is_equality(): - kind = "==" - else: - kind = ">=" - - return kind, cfm(flatten(div_ceil(expand(-rhs), iname_coeff))) - else: # iname_coeff < 0 - from pytools import div_ceil - return "<", cfm(flatten(div_ceil(rhs+1, -iname_coeff))) - - - - -def get_projected_bounds(set, iname): - """Get an overapproximation of the loop bounds for the variable *iname*, - as actual bounds. 
- """ - - lb_cns, ub_cns = get_projected_bounds_constraints(set, iname) - - for cns in [lb_cns, ub_cns]: - iname_tp, iname_idx = lb_cns.get_dim().get_var_dict()[iname] - iname_coeff = cns.get_coefficient(iname_tp, iname_idx) - - if iname_coeff == 0: - continue - - kind, bound = solve_constraint_for_bound(cns, iname) - if kind == "<": - ub = bound - elif kind == ">=": - lb = bound - else: - raise ValueError("unsupported constraint kind") - - return lb, ub - -def cast_constraint_to_space(cns, new_space, as_equality=None): - if as_equality is None: - as_equality = cns.is_equality() - - if as_equality: - factory = isl.Constraint.eq_from_names - else: - factory = isl.Constraint.ineq_from_names - return factory(new_space, cns.get_coefficients_by_name()) - -def block_shift_constraint(cns, iname, multiple, as_equality=None): - cns = copy_constraint(cns, as_equality=as_equality) - cns = cns.set_constant(cns.get_constant() - + cns.get_coefficients_by_name()[iname]*multiple) - return cns - -def negate_constraint(cns): - assert not cns.is_equality() - # FIXME hackety hack - my_set = (isl.BasicSet.universe(cns.get_dim()) - .add_constraint(cns)) - my_set = my_set.complement() - - results = [] - def examine_basic_set(s): - s.foreach_constraint(results.append) - my_set.foreach_basic_set(examine_basic_set) - result, = results - return result - -def copy_constraint(cns, as_equality=None): - return cast_constraint_to_space(cns, cns.get_dim(), - as_equality=as_equality) - -def get_dim_bounds(set, inames): - vars = set.get_dim().get_var_dict(dim_type.set).keys() - return [get_projected_bounds(set, v) for v in inames] - -def count_box_from_bounds(bounds): - from pytools import product - return product(stop-start for start, stop in bounds) - -def make_index_map(set, index_expr): - if not isinstance(index_expr, tuple): - index_expr = (index_expr,) - - amap = isl.Map.from_domain(set).add_dims(dim_type.out, len(index_expr)) - out_names = ["_ary_idx_%d" % i for i in range(len(index_expr))] - 
- dim = amap.get_dim() - all_constraints = tuple( - eq_constraint_from_expr(dim, iexpr_i) - for iexpr_i in index_expr) - - for i, out_name in enumerate(out_names): - amap = amap.set_dim_name(dim_type.out, i, out_name) - - for i, (out_name, constr) in enumerate(zip(out_names, all_constraints)): - constr.set_coefficients_by_name({out_name: -1}) - amap = amap.add_constraint(constr) - - return amap - -def make_slab(space, iname, start, stop): - from pymbolic import var - var_iname = var(iname) - return (isl.Set.universe(space) - # start <= inner - .add_constraint(ineq_constraint_from_expr( - space, var_iname -start)) - # inner < stop - .add_constraint(ineq_constraint_from_expr( - space, stop-1 - var_iname))) - -def constraint_to_expr(cns, except_name=None): - excepted_coeff = 0 - result = 0 - from pymbolic import var - for var_name, coeff in cns.get_coefficients_by_name().iteritems(): - if isinstance(var_name, str): - if var_name == except_name: - excepted_coeff = int(coeff) - else: - result += int(coeff)*var(var_name) - else: - result += int(coeff) - - if except_name is not None: - return result, excepted_coeff - else: - return result - -def constraint_to_code(ccm, cns): - if cns.is_equality(): - comp_op = "==" - else: - comp_op = ">=" - - return "%s %s 0" % (ccm(constraint_to_expr(cns)), comp_op) - - -# }}} - -# {{{ pymbolic mappers - -class CoefficientCollector(RecursiveMapper): - def map_sum(self, expr): - stride_dicts = [self.rec(ch) for ch in expr.children] - - result = {} - for stride_dict in stride_dicts: - for var, stride in stride_dict.iteritems(): - if var in result: - result[var] += stride - else: - result[var] = stride - - return result - - def map_product(self, expr): - result = {} - - children_coeffs = [self.rec(child) for child in expr.children] - - idx_of_child_with_vars = None - for i, child_coeffs in enumerate(children_coeffs): - for k in child_coeffs: - if isinstance(k, str): - if (idx_of_child_with_vars is not None - and idx_of_child_with_vars != 
i): - raise RuntimeError( - "nonlinear expression") - idx_of_child_with_vars = i - - other_coeffs = 1 - for i, child_coeffs in enumerate(children_coeffs): - if i != idx_of_child_with_vars: - assert len(child_coeffs) == 1 - other_coeffs *= child_coeffs[1] - - if idx_of_child_with_vars is None: - return {1: other_coeffs} - else: - return dict( - (var, other_coeffs*coeff) - for var, coeff in - children_coeffs[idx_of_child_with_vars].iteritems()) - - return result - - def map_constant(self, expr): - return {1: expr} - - def map_variable(self, expr): - return {expr.name: 1} - - def map_subscript(self, expr): - raise RuntimeError("cannot gather coefficients--indirect addressing in use") - - - - -def _constraint_from_expr(space, expr, constraint_factory): - return constraint_factory(space, - CoefficientCollector()(expr)) - -def eq_constraint_from_expr(space, expr): - return _constraint_from_expr( - space, expr, isl.Constraint.eq_from_names) - -def ineq_constraint_from_expr(space, expr): - return _constraint_from_expr( - space, expr, isl.Constraint.ineq_from_names) - -# }}} - -# {{{ arguments - -class ArrayArg: - def __init__(self, name, dtype, strides=None, shape=None, order="C", - offset=0): - """ - All of the following are optional. Specify either strides or shape. - - :arg strides: like numpy strides, but in multiples of - data type size - :arg shape: - :arg order: - :arg offset: Offset from the beginning of the vector from which - the strides are counted. 
- """ - self.name = name - self.dtype = np.dtype(dtype) - - if strides is not None and shape is not None: - raise ValueError("can only specify one of shape and strides") - - if strides is not None: - strides = tuple(strides) - - if shape is not None: - from pyopencl.compyte.array import ( - f_contiguous_strides, - c_contiguous_strides) - - if order == "F": - strides = f_contiguous_strides(1, shape) - elif order == "C": - strides = c_contiguous_strides(1, shape) - else: - raise ValueError("invalid order: %s" % order) - - self.strides = strides - self.offset = offset - - def __repr__(self): - return "<ArrayArg '%s' of type %s>" % (self.name, self.dtype) - - - -class ImageArg: - def __init__(self, name, dtype, dimensions): - self.name = name - self.dtype = np.dtype(dtype) - self.dimensions = dimensions - - def __repr__(self): - return "<ImageArg '%s' of type %s>" % (self.name, self.dtype) - - -class ScalarArg: - def __init__(self, name, dtype, approximately): - self.name = name - self.dtype = np.dtype(dtype) - self.approximately = approximately - - def __repr__(self): - return "<ScalarArg '%s' of type %s>" % (self.name, self.dtype) - -# }}} - -# {{{ loop kernel object - -class LoopKernel(Record): - """ - :ivar device: :class:`pyopencl.Device` - :ivar domain: :class:`islpy.BasicSet` - :ivar iname_to_tag: - :ivar instructions: - :ivar args: - :ivar prefetch: - :ivar schedule: - :ivar register_prefetch: - :ivar name: - :ivar preamble: - """ - - def __init__(self, device, domain, instructions, args=None, prefetch={}, schedule=None, - register_prefetch=None, name="loopy_kernel", - iname_to_tag={}, is_divisible=lambda dividend, divisor: False, - preamble=None): - """ - :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl. 
- Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}" - :arg iname_to_tag: a map from loop domain variables to subclasses of :class:`IndexTag` - """ - from pymbolic import parse - - def parse_if_necessary(v): - if isinstance(v, str): - return parse(v) - else: - return v - - if isinstance(domain, str): - ctx = isl.Context() - domain = isl.Set.read_from_str(ctx, domain, nparam=-1) - - insns = [ - (parse_if_necessary(lvalue), - parse_if_necessary(expr)) - for lvalue, expr in instructions] - - Record.__init__(self, - device=device, args=args, domain=domain, instructions=insns, - prefetch=prefetch, schedule=schedule, - register_prefetch=register_prefetch, name=name, - iname_to_tag=iname_to_tag, - is_divisible=is_divisible, - preamble=preamble) - - # FIXME: assert empyt intersection of loop vars and args - # FIXME: assert that isl knows about loop parameters - - @property - @memoize_method - def space(self): - return self.domain.get_dim() - - @property - @memoize_method - def tag_key_to_iname(self): - return dict( - (tag.key, iname) - for iname, tag in self.iname_to_tag.iteritems()) - - @property - @memoize_method - def arg_dict(self): - return dict((arg.name, arg) for arg in self.args) - - @memoize_method - def scalar_args(self): - if self.args is None: - return set() - else: - return set(arg.name for arg in self.args if isinstance(arg, ScalarArg)) - - @memoize_method - def all_inames(self): - return set(self.space.get_var_dict(dim_type.set).iterkeys()) - - @memoize_method - def output_inames(self): - dm = DependencyMapper(include_subscripts=False) - - output_indices = set() - for lvalue, expr in self.instructions: - output_indices.update( - set(v.name for v in dm(lvalue)) - & self.all_inames()) - - return output_indices - set(arg.name for arg in self.args) - - @memoize_method - def reduction_inames(self): - return self.all_inames() - self.output_inames() - - def inames_by_tag_type(self, tag_type): - return [iname for iname in self.all_inames() - if 
isinstance(self.iname_to_tag.get(iname), tag_type)] - - def ordered_inames_by_tag_type(self, tag_type): - result = [] - from itertools import count - for i in count(): - try: - dim = self.tag_key_to_iname[tag_type(i).key] - except KeyError: - return result - else: - result.append(dim) - - @memoize_method - def get_projected_bounds_constraints(self, iname): - """Get an overapproximation of the loop bounds for the variable *iname*.""" - - return get_projected_bounds_constraints(self.domain, iname) - - @memoize_method - def get_projected_bounds(self, iname): - """Get an overapproximation of the loop bounds for the variable *iname*.""" - - return get_projected_bounds(self.domain, iname) - - def tag_type_lengths(self, tag_cls): - def get_length(iname): - tag = self.iname_to_tag[iname] - if tag.forced_length is not None: - return tag.forced_length - - start, stop = self.get_projected_bounds(iname) - return stop-start - - return [get_length(iname) - for iname in self.ordered_inames_by_tag_type(tag_cls)] - - def tag_or_iname_to_iname(self, s): - try: - tag = parse_tag(s) - except ValueError: - pass - else: - return self.tag_key_to_iname[tag.key] - - if s not in self.all_inames(): - raise RuntimeError("invalid index name '%s'" % s) - - return s - - def local_mem_use(self): - return sum(pf.nbytes for pf in self.prefetch.itervalues()) - - @memoize_method - def input_vectors(self): - dm = DependencyMapper(include_subscripts=False) - - input_vectors = set() - for lvalue, expr in self.instructions: - input_vectors.update( - set(v.name for v in dm(expr))) - - return input_vectors - self.all_inames() - self.scalar_args() - - @memoize_method - def output_vectors(self): - dm = DependencyMapper(include_subscripts=False) - - output_vectors = set() - for lvalue, expr in self.instructions: - output_vectors.update( - set(v.name for v in dm(lvalue))) - - return output_vectors - self.all_inames() - self.scalar_args() - - def _subst_insns(self, old_var, new_expr): - from 
pymbolic.mapper.substitutor import substitute - - subst_map = {old_var: new_expr} - - return [(substitute(lvalue, subst_map), - substitute(expr, subst_map)) - for lvalue, expr in self.instructions] - - def is_prefetch_variable(self, varname): - if self.prefetch: - for pf in self.prefetch.itervalues(): - for pfdim in pf.dims: - if pfdim.name == varname: - return True - - return False - - def _subst_prefetch(self, old_var, new_expr): - from pymbolic.mapper.substitutor import substitute - subst_map = {old_var: new_expr} - - result = {} - for pf in self.prefetch.itervalues(): - for pfdim in pf.dims: - if pfdim.name == old_var: - raise RuntimeError("can't substitute prefetch dimension %s" - % old_var) - - new_pf = pf.copy(index_expr=substitute(pf.index_expr, subst_map)) - result[pf.input_vector, new_pf.index_expr] = new_pf - - return result - - def substitute(self, old_var, new_expr): - copy = self.copy(instructions=self._subst_insns(old_var, new_expr)) - if self.prefetch: - copy.prefetch = self._subst_prefetch(old_var, new_expr) - - if self.schedule is not None: - for sched_item in self.schedule: - if (isinstance(sched_item, ScheduledLoop) - and sched_item.iname == old_var): - raise RuntimeError("can't substitute already-scheduled variable: %s" - % old_var) - - return copy - - def split_dimension(self, name, inner_length, padded_length=None, - outer_name=None, inner_name=None, - outer_tag=None, inner_tag=None): - - outer_tag = parse_tag(outer_tag) - inner_tag = parse_tag(inner_tag) - - if self.iname_to_tag.get(name) is not None: - raise RuntimeError("cannot split tagged dimension '%s'" % name) - - # {{{ check for repeated tag keys - - new_tag_keys = set(tag.key - for tag in [outer_tag, inner_tag] - if tag is not None) - - repeated_tag_keys = new_tag_keys & set( - tag.key for tag in - self.iname_to_tag.itervalues()) - - if repeated_tag_keys: - raise RuntimeError("repeated tag(s): %s" % repeated_tag_keys) - - # }}} - - if padded_length is not None: - inner_tag = 
inner_tag.copy(forced_length=padded_length) - - if outer_name is None: - outer_name = name+"_outer" - if inner_name is None: - inner_name = name+"_inner" - - new_iname_to_tag = self.iname_to_tag.copy() - if inner_tag is not None: - new_iname_to_tag[inner_name] = inner_tag - if outer_tag is not None: - new_iname_to_tag[outer_name] = outer_tag - - outer_var_nr = self.space.size(dim_type.set) - inner_var_nr = self.space.size(dim_type.set)+1 - new_domain = self.domain.add_dims(dim_type.set, 2) - new_domain.set_dim_name(dim_type.set, outer_var_nr, outer_name) - new_domain.set_dim_name(dim_type.set, inner_var_nr, inner_name) - - space = new_domain.get_dim() - inner_constraint_set = ( - make_slab(space, inner_name, 0, inner_length) - # name = inner + length*outer - .add_constraint(isl.Constraint.eq_from_names( - space, {name:1, inner_name: -1, outer_name:-inner_length}))) - - name_dim_type, name_idx = space.get_var_dict()[name] - new_domain = (new_domain - .intersect(inner_constraint_set) - .project_out(name_dim_type, name_idx, 1)) - - from pymbolic import var - inner = var(inner_name) - outer = var(outer_name) - new_loop_index = inner + outer*inner_length - - return (self - .substitute(name, new_loop_index) - .copy(domain=new_domain, iname_to_tag=new_iname_to_tag)) - - def get_invalid_reason(self): - glens = self.tag_type_lengths(TAG_GROUP_IDX) - llens = self.tag_type_lengths(TAG_WORK_ITEM_IDX) - if (max(len(glens), len(llens)) - > self.device.max_work_item_dimensions): - return "too many work item dimensions" - - for i in range(len(llens)): - if llens[i] > self.device.max_work_item_sizes[i]: - return "group axis %d too big" - - from pytools import product - if product(llens) > self.device.max_work_group_size: - return "work group too big" - - from pyopencl.characterize import usable_local_mem_size - if self.local_mem_use() > usable_local_mem_size(self.device): - return "using too much local memory" - - return None - -# }}} - -# {{{ local-mem prefetch-related - -class 
PrefetchDescriptor(Record): - """ - Attributes: - :ivar kernel: - :ivar input_vector: A string indicating the input vector variable name. - :ivar index_expr: An expression identifying the access which this prefetch - serves. - :ivar inames: A sequence of inames (i.e. loop dimensions) identifying which - part of the input vector, given the index_expr, should be prefetched. - :ivar loc_fetch_axes: dictionary from integers 0..len(inames) to lists of - local index axes which should be used to realize that dimension of the - prefetch. The last dimension in this list is used as the fastest-changing - one. - :ivar name: the variable name used for the prefetch - :ivar dim_storage_lengths: a sequence of integers indicating the size of - the storage for each dimension. It may may differ from the size of the - actual loop dimensions to mitigate bank conflicts. - - The latter two values are only assigned during code generation. - """ - - @property - @memoize_method - def domain(self): - return (isl.project_out_except(self.kernel.domain, self.inames, [dim_type.set]) - .remove_divs()) - - @property - @memoize_method - def index_map(self): - imap = make_index_map(self.kernel_domain, self.index_expr) - assert imap.is_bijective() - return imap - - @property - @memoize_method - def restricted_index_map(self): - return self.index_map.intersect_domain(self.domain) - - @property - @memoize_method - def dim_bounds(self): - return get_dim_bounds(self.domain, self.inames) - - @property - def itemsize(self): - return self.kernel.arg_dict[self.input_vector].dtype.itemsize - @property - @memoize_method - def nbytes(self): - return self.itemsize * count_box_from_bounds(self.dim_bounds) - - @memoize_method - def free_variables(self): - return set(var.name - for var in DependencyMapper()(self.index_expr) - ) - set(self.inames) - self.kernel.scalar_args() - - def hash(self): - return (hash(type(self)) ^ hash(self.input_vector) - ^ hash(self.index_expr)) - - def __eq__(self, other): - # in 
particular, dim_storage_lengths should not factor into equality - return (type(self) == type(other) - and self.input_vector == other.input_vector - and self.index_expr == other.index_expr) - - - - - - -class VariableIndexExpressionCollector(CombineMapper): - def __init__(self, tgt_vector_name): - self.tgt_vector_name = tgt_vector_name - - def combine(self, values): - from pytools import flatten - return set(flatten(values)) - - def map_constant(self, expr): - return set() - - def map_algebraic_leaf(self, expr): - return set() - - def map_subscript(self, expr): - from pymbolic.primitives import Variable - assert isinstance(expr.aggregate, Variable) - - if expr.aggregate.name == self.tgt_vector_name: - return set([expr.index]) - else: - return CombineMapper.map_subscript(self, expr) - - - - -# }}} - -# {{{ loop scheduling - -# {{{ schedule items - -class ScheduledLoop(Record): - __slots__ = ["iname"] - -class WriteOutput(Record): - pass - -class RegisterPrefetch(Record): - __slots__ = ["subscript_expr", "new_name"] - -# }}} - -def generate_loop_schedules(kernel, hints=[]): - prev_schedule = kernel.schedule - if prev_schedule is None: - prev_schedule = [ - ScheduledLoop(iname=iname) - for iname in ( - kernel.ordered_inames_by_tag_type(TAG_GROUP_IDX) - + kernel.ordered_inames_by_tag_type(TAG_WORK_ITEM_IDX))] - - scheduled_inames = set(sch_item.iname - for sch_item in prev_schedule - if isinstance(sch_item, ScheduledLoop)) - - # have a schedulable prefetch? load, schedule it - had_usable_prefetch = False - locally_parallel_inames = set( - iname for iname in scheduled_inames - if isinstance(kernel.iname_to_tag.get(iname), - (TAG_ILP, TAG_WORK_ITEM_IDX))) - - for pf in kernel.prefetch.itervalues(): - # already scheduled? never mind then. - if pf in prev_schedule: - continue - - # a free variable not known yet? then we're not ready - if not pf.free_variables() <= scheduled_inames: - continue - - # a prefetch variable already scheduled, but not borrowable? 
- # (only work item index variables are borrowable) - - if set(pf.inames) & (scheduled_inames - locally_parallel_inames): - # dead end: we won't be able to schedule this prefetch - # in this branch. at least one of its loop dimensions - # was already scheduled, and that dimension is not - # borrowable. - - #print "UNSCHEDULABLE", kernel.schedule - return - - new_kernel = kernel.copy(schedule=prev_schedule+[pf]) - for knl in generate_loop_schedules(new_kernel): - had_usable_prefetch = True - yield knl - - if had_usable_prefetch: - # because we've already recursed - return - - # Build set of potentially schedulable variables - # Don't re-schedule already scheduled variables - schedulable = kernel.all_inames() - scheduled_inames - - # Schedule in the following order: - # - serial output inames - # - remaining parallel output inames (i.e. ILP) - # - output write - # - reduction - # Don't schedule reduction variables until all output - # variables are taken care of. Once they are, schedule - # output writing. - parallel_output_inames = set(oin for oin in kernel.output_inames() - if isinstance(kernel.iname_to_tag.get(oin), ParallelTag)) - - serial_output_inames = kernel.output_inames() - parallel_output_inames - - if schedulable & serial_output_inames: - schedulable = schedulable & serial_output_inames - - if schedulable & parallel_output_inames: - schedulable = schedulable & parallel_output_inames - - if kernel.output_inames() <= scheduled_inames: - if not any(isinstance(sch_item, WriteOutput) - for sch_item in prev_schedule): - kernel = kernel.copy( - schedule=prev_schedule + [WriteOutput()]) - prev_schedule = kernel.schedule - - # Don't schedule variables that are prefetch axes - # for not-yet-scheduled prefetches. 
- unsched_prefetch_axes = set(iname - for pf in kernel.prefetch.itervalues() - if pf not in prev_schedule - for iname in pf.inames - if not isinstance(kernel.iname_to_tag.get(iname), ParallelTag)) - schedulable -= unsched_prefetch_axes - - while hints and hints[0] in scheduled_inames: - hints = hints[1:] - - if hints and hints[0] in schedulable: - schedulable = set(hints[0]) - - if schedulable: - # have a schedulable variable? schedule a loop for it, recurse - for iname in schedulable: - new_kernel = kernel.copy(schedule=prev_schedule+[ScheduledLoop(iname=iname)]) - for knl in generate_loop_schedules(new_kernel, hints): - yield knl - else: - # all loop dimensions and prefetches scheduled? - # great! yield the finished product if it is complete - - if hints: - from warnings import warn - warn("leftover schedule hints: "+ (", ".join(hints)), - LoopyAdvisory) - - all_inames_scheduled = len(scheduled_inames) == len(kernel.all_inames()) - all_pf_scheduled = len(set(sch_item for sch_item in prev_schedule - if isinstance(sch_item, PrefetchDescriptor))) == len(kernel.prefetch) - output_scheduled = len(set(sch_item for sch_item in prev_schedule - if isinstance(sch_item, WriteOutput))) == 1 - - if all_inames_scheduled and all_pf_scheduled and output_scheduled: - yield kernel - -# }}} - -# {{{ register prefetches - -class AllIndexExpressionCollector(CombineMapper): - def combine(self, values): - from pytools import flatten - return set(flatten(values)) - - def map_constant(self, expr): - return set() - - def map_algebraic_leaf(self, expr): - return set() - - def map_subscript(self, expr): - from pymbolic.primitives import Variable - assert isinstance(expr.aggregate, Variable) - - return set([expr]) - - - - -def insert_register_prefetches(kernel): - reg_pf = {} - - total_loop_count = len(kernel.all_inames()) - known_vars = set() - - unused_index_exprs = set() - for tgt, expr in kernel.instructions: - unused_index_exprs |= AllIndexExpressionCollector()(expr) - 
unused_index_exprs = [ - (iexpr, set(v.name for v in DependencyMapper()(iexpr.index))) - for iexpr in unused_index_exprs] - - schedule = kernel.schedule[:] - - sched_index = 0 - loop_count = 0 - while sched_index < len(schedule): - sched_item = schedule[sched_index] - if isinstance(sched_item, ScheduledLoop): - known_vars.add(sched_item.iname) - loop_count += 1 - sched_index += 1 - - if loop_count < total_loop_count: - i = 0 - while i < len(unused_index_exprs): - iexpr, index_deps = unused_index_exprs[i] - if (index_deps <= known_vars - and (iexpr.aggregate.name, iexpr.index) - not in kernel.prefetch): - unused_index_exprs.pop(i) - new_name = "reg_prefetch_"+iexpr.aggregate.name+str(sched_index) - reg_pf[iexpr] = new_name - schedule.insert(sched_index, - RegisterPrefetch( - index_expr=iexpr, new_name=new_name)) - sched_index += 1 - else: - i += 1 - - return kernel.copy(schedule=schedule, register_prefetch=reg_pf) - -# }}} - -# {{{ code generation - -# {{{ support code for AST wrapper objects - -class GeneratedCode(Record): - """Objects of this type are wrapped around ASTs upon - return from generation calls to collect information about them. 
- """ - __slots__ = ["ast", "num_conditionals"] - -def gen_code_block(elements): - from cgen import Generable, Block - - num_conditionals = 0 - block_els = [] - for el in elements: - if isinstance(el, GeneratedCode): - num_conditionals = num_conditionals + el.num_conditionals - block_els.append(el.ast) - elif isinstance(el, Generable): - block_els.append(el) - else: - raise ValueError("unidentifiable object in block") - - if len(block_els) == 1: - ast, = block_els - else: - ast = Block(block_els) - return GeneratedCode(ast=ast, num_conditionals=num_conditionals) - -def wrap_in(cls, *args): - inner = args[-1] - args = args[:-1] - - from cgen import If, Generable - - if isinstance(inner, GeneratedCode): - num_conditionals = inner.num_conditionals - ast = inner.ast - elif isinstance(inner, Generable): - num_conditionals = 0 - ast = inner - - args = args + (ast,) - ast = cls(*args) - - if isinstance(ast, If): - import re - cond_joiner_re = re.compile(r"\|\||\&\&") - num_conditionals += len(cond_joiner_re.split(ast.condition)) - - return GeneratedCode(ast=ast, num_conditionals=num_conditionals) - -def wrap_in_if(condition_codelets, inner): - from cgen import If - - if condition_codelets: - return wrap_in(If, - "\n&& ".join(condition_codelets), - inner) - - return inner - -# }}} - -# {{{ C code mapper - -class LoopyCCodeMapper(CCodeMapper): - def __init__(self, kernel, no_prefetch=False): - def constant_mapper(c): - if isinstance(c, float): - # FIXME: type-variable - return "%sf" % repr(c) - else: - return repr(c) - - CCodeMapper.__init__(self, constant_mapper=constant_mapper) - self.kernel = kernel - - self.no_prefetch = no_prefetch - - def map_subscript(self, expr, enclosing_prec): - from pymbolic.primitives import Variable - if (not self.no_prefetch - and isinstance(expr.aggregate, Variable) - and expr.aggregate.name in self.kernel.input_vectors()): - try: - pf = self.kernel.prefetch[expr.aggregate.name, expr.index] - except KeyError: - pass - else: - from 
pymbolic.mapper.stringifier import PREC_SUM - return pf.name+"".join( - "[%s - %s]" % (iname, self.rec( - self.kernel.get_projected_bounds(iname)[0], - PREC_SUM)) - for iname in pf.inames) - - if isinstance(expr.aggregate, Variable): - arg = self.kernel.arg_dict[expr.aggregate.name] - - if isinstance(arg, ImageArg): - assert isinstance(expr.index, tuple) - - base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))" - % (arg.name, arg.dimensions, - ", ".join(self.rec(idx, PREC_NONE) - for idx in expr.index[::-1]))) - - if arg.dtype == np.float32: - return base_access+".x" - elif arg.dtype == np.float64: - return "as_double(%s.xy)" % base_access - else: - raise NotImplementedError( - "non-floating-point images not supported for now") - - else: - # ArrayArg - index_expr = expr.index - if isinstance(expr.index, tuple): - ary_strides = arg.strides - if ary_strides is None: - raise RuntimeError("tuple-indexed variable '%s' does not " - "have stride information" % expr.aggregate.name) - else: - ary_strides = (1,) - index_expr = (index_expr,) - - from pymbolic.primitives import Subscript - return CCodeMapper.map_subscript(self, - Subscript(expr.aggregate, arg.offset+sum( - stride*expr_i for stride, expr_i in zip( - ary_strides, index_expr))), enclosing_prec) - - return CCodeMapper.map_subscript(self, expr, enclosing_prec) - - def map_floor_div(self, expr, prec): - if isinstance(expr.denominator, int) and expr.denominator > 0: - return ("int_floor_div_pos_b(%s, %s)" - % (self.rec(expr.numerator, PREC_NONE), - expr.denominator)) - else: - return ("int_floor_div(%s, %s)" - % (self.rec(expr.numerator, PREC_NONE), - self.rec(expr.denominator, PREC_NONE))) - -# }}} - -# {{{ prefetch code generation - -class FetchLoopNestData(Record): - pass - -def make_fetch_loop_nest(flnd, pf_iname_idx, pf_dim_exprs, pf_idx_subst_map, - implemented_domain): - pf = flnd.prefetch - ccm = flnd.c_code_mapper - no_pf_ccm = flnd.no_prefetch_c_code_mapper - kernel = flnd.kernel - - from pymbolic 
import var - from cgen import Assign, For, If - - from pymbolic.mapper.substitutor import substitute - if pf_iname_idx >= len(pf.inames): - # done, return - from pymbolic.primitives import Variable, Subscript - - result = Assign( - pf.name + "".join("[%s]" % ccm(dexpr) - for dexpr in pf_dim_exprs), - no_pf_ccm( - Subscript( - Variable(pf.input_vector), - substitute(pf.index_expr, pf_idx_subst_map)), - PREC_NONE)) - - def my_ccm(expr): - return ccm(substitute(expr, pf_idx_subst_map)) - - check_vars = [v.name for v in DependencyMapper()(pf.index_expr)] - return wrap_in_bounds_checks(my_ccm, pf.kernel.domain, - check_vars, implemented_domain, result) - - pf_iname = pf.inames[pf_iname_idx] - realiz_inames = flnd.realization_inames[pf_iname_idx] - - start_index, stop_index = flnd.kernel.get_projected_bounds(pf_iname) - try: - start_index = int(start_index) - stop_index = int(stop_index) - except TypeError: - raise RuntimeError("loop bounds for prefetch must be " - "known statically at code gen time") - - dim_length = stop_index-start_index - - if realiz_inames is not None: - # {{{ parallel fetch - - realiz_bounds = [flnd.kernel.get_projected_bounds(rn) for rn in realiz_inames] - realiz_lengths = [stop-start for start, stop in realiz_bounds] - from pytools import product - total_realiz_size = product(realiz_lengths) - - result = [] - - cur_index = 0 - - while start_index+cur_index < stop_index: - pf_dim_expr = 0 - for realiz_iname, length in zip(realiz_inames, realiz_lengths): - tag = flnd.kernel.iname_to_tag[realiz_iname] - assert isinstance(tag, TAG_WORK_ITEM_IDX) - - pf_dim_expr = (pf_dim_expr*length - + var("(int) get_local_id(%d)" % tag.axis)) - - loop_slab = make_slab(pf.kernel.space, pf_iname, - start_index+cur_index, - min(stop_index, start_index+cur_index+total_realiz_size)) - new_impl_domain = implemented_domain.intersect(loop_slab) - - pf_dim_expr += cur_index - - pf_idx_subst_map = pf_idx_subst_map.copy() - pf_idx_subst_map[pf_iname] = pf_dim_expr + 
start_index - inner = make_fetch_loop_nest(flnd, pf_iname_idx+1, - pf_dim_exprs+[pf_dim_expr], pf_idx_subst_map, - new_impl_domain) - - if cur_index+total_realiz_size > dim_length: - inner = wrap_in(If, - "%s < %s" % (ccm(pf_dim_expr), stop_index), - inner) - - result.append(inner) - - cur_index += total_realiz_size - - return gen_code_block(result) - - # }}} - else: - # {{{ sequential fetch - - pf_dim_var = "prefetch_dim_idx_%d" % pf_iname_idx - pf_dim_expr = var(pf_dim_var) - - lb_cns, ub_cns = flnd.kernel.get_projected_bounds_constraints(pf_iname) - loop_slab = (isl.Set.universe(flnd.kernel.space) - .add_constraint(cast_constraint_to_space(lb_cns, kernel.space)) - .add_constraint(cast_constraint_to_space(ub_cns, kernel.space))) - new_impl_domain = implemented_domain.intersect(loop_slab) - - pf_idx_subst_map = pf_idx_subst_map.copy() - pf_idx_subst_map[pf_iname] = pf_dim_expr + start_index - inner = make_fetch_loop_nest(flnd, pf_iname_idx+1, - pf_dim_exprs+[pf_dim_expr], pf_idx_subst_map, - new_impl_domain) - - return wrap_in(For, - "int %s = 0" % pf_dim_var, - "%s < %s" % (pf_dim_var, ccm(dim_length)), - "++%s" % pf_dim_var, - inner) - - # }}} - - -def generate_prefetch_code(cgs, kernel, sched_index, exec_domain): - implemented_domain = exec_domain.implemented_domain - - from cgen import Statement as S, Line, Comment - - ccm = cgs.c_code_mapper - - # find surrounding schedule items - if sched_index-1 >= 0: - next_outer_sched_item = kernel.schedule[sched_index-1] - else: - next_outer_sched_item = None - - if sched_index+1 < len(kernel.schedule): - next_inner_sched_item = kernel.schedule[sched_index+1] - else: - next_inner_sched_item = None - - scheduled_pf = kernel.schedule[sched_index] - pf = kernel.prefetch[ - scheduled_pf.input_vector, scheduled_pf.index_expr] - - # Prefetch has a good amount of flexibility over what axes it - # uses to accomplish the prefetch. In particular, it can (and should!) - # use all work group dimensions. 
- - # {{{ determine which loop axes are used to realize the fetch - - # realization_dims is a list of lists of inames, to represent when two dims jointly - # make up one fetch axis - - realization_inames = [None] * len(pf.inames) - - # {{{ first, fix the user-specified fetch dims - - knl_work_item_inames = kernel.ordered_inames_by_tag_type(TAG_WORK_ITEM_IDX) - - for realization_dim_idx, loc_fetch_axis_list in \ - getattr(pf, "loc_fetch_axes", {}).iteritems(): - realization_inames[realization_dim_idx] = [knl_work_item_inames.pop(axis) - for axis in loc_fetch_axis_list] - - # }}} - - # {{{ next use the work group dimensions, least-stride dim first - - index_expr = pf.index_expr - if not isinstance(index_expr, tuple): - index_expr = (index_expr,) - - arg = kernel.arg_dict[pf.input_vector] - if isinstance(arg, ImageArg): - # arbitrary - ary_strides = (1, 1, 1)[:arg.dimensions] - else: - ary_strides = arg.strides - if ary_strides is None and len(index_expr) == 1: - ary_strides = (1,) - - iname_to_stride = {} - for iexpr_i, stride in zip(index_expr, ary_strides): - coeffs = CoefficientCollector()(iexpr_i) - for var_name, coeff in coeffs.iteritems(): - if var_name != 1: - new_stride = coeff*stride - old_stride = iname_to_stride.get(var_name, None) - if old_stride is None or new_stride < old_stride: - iname_to_stride[var_name] = new_stride - - approximate_arg_values = dict( - (arg.name, arg.approximately) - for arg in kernel.args - if isinstance(arg, ScalarArg)) - - def stride_key(iname): - iname_stride = iname_to_stride[iname] - - from pymbolic import evaluate - key = evaluate(iname_stride, approximate_arg_values) - assert isinstance(key, int) - return key - - pf_iname_strides = sorted((iname - for dim_idx, iname in enumerate(pf.inames) - if realization_inames[dim_idx] is None), - key=stride_key) - - while knl_work_item_inames and pf_iname_strides: - # grab least-stride prefetch dim - least_stride_pf_iname = pf_iname_strides.pop(0) - - # FIXME: It might be good to join 
multiple things together here - # for size reasons - realization_inames[pf.inames.index(least_stride_pf_iname)] \ - = [knl_work_item_inames.pop(0)] - - if knl_work_item_inames: - # FIXME - from warnings import warn - warn("There were leftover work group dimensions in prefetch " - "assignment. For now, this won't lead to wrong code, " - "but it will lead to unnecessary memory bandwidth use.") - - # }}} - - # }}} - - # {{{ generate fetch code - - valid_index_vars = get_valid_index_vars(kernel, sched_index, - exclude_tags=(TAG_WORK_ITEM_IDX,)) - flnd = FetchLoopNestData(prefetch=pf, - no_prefetch_c_code_mapper= - LoopyCCodeMapper(kernel, no_prefetch=True), - c_code_mapper=ccm, - realization_inames=realization_inames, - kernel=kernel, - valid_index_vars=valid_index_vars) - - fetch_block = make_fetch_loop_nest(flnd, 0, [], {}, implemented_domain) - - # }}} - - new_block = [ - Comment("prefetch %s[%s] using %s" % ( - pf.input_vector, - ", ".join(pf.inames), - ", ".join( - (" x ".join("%s(%s)" % (realiz_iname, kernel.iname_to_tag[realiz_iname]) - for realiz_iname in realiz_inames) - if realiz_inames is not None else "loop") - for realiz_inames in realization_inames))), - Line(), - ] - - # omit head sync primitive if we just came out of a prefetch - if not isinstance(next_outer_sched_item, PrefetchDescriptor): - new_block.append(S("barrier(CLK_LOCAL_MEM_FENCE)")) - else: - new_block.append(Comment("next outer schedule item is a prefetch: " - "no sync needed")) - - new_block.extend([ - fetch_block, - ]) - - # omit tail sync primitive if we're headed into another prefetch - if not isinstance(next_inner_sched_item, PrefetchDescriptor): - new_block.append(S("barrier(CLK_LOCAL_MEM_FENCE)")) - else: - new_block.append(Line()) - new_block.append(Comment("next inner schedule item is a prefetch: " - "no sync needed")) - - new_block.extend([Line(), - build_loop_nest(cgs, kernel, sched_index+1, exec_domain)]) - - return gen_code_block(new_block) - -# }}} - -# {{{ per-axis loop nest 
code generation - -class ExecutionDomain(object): - def __init__(self, implemented_domain, assignments_and_impl_domains=None): - """ - :param implemented_domain: The entire implemented domain, - i.e. all constraints that have been enforced so far. - :param assignments_and_impl_domains: a list of tuples - (assignments, implemented_domain), where *assignments* - is a list of :class:`cgen.Initializer` instances - and *implemented_domain* is the implemented domain to which - the situation produced by the assignments corresponds. - - The point of this being is a list is the implementation of - ILP, and each entry represents a 'fake-parallel' trip through the - ILP'd loop. - """ - if assignments_and_impl_domains is None: - assignments_and_impl_domains = [([], implemented_domain)] - self.implemented_domain = implemented_domain - self.assignments_and_impl_domains = assignments_and_impl_domains - - def __len__(self): - return len(self.assignments_and_impl_domains) - - def __iter__(self): - return iter(self.assignments_and_impl_domains) - - def intersect(self, set): - return ExecutionDomain( - self.implemented_domain.intersect(set), - [(assignments, implemented_domain.intersect(set)) - for assignments, implemented_domain - in self.assignments_and_impl_domains]) - - def get_the_one_domain(self): - assert len(self.assignments_and_impl_domains) == 1 - return self.implemented_domain - - - - -def generate_loop_dim_code(cgs, kernel, sched_index, - exec_domain): - from cgen import (Comment, add_comment, make_multiple_ifs, - POD, Assign, Line, Statement as S, Initializer, Const) - - ccm = cgs.c_code_mapper - - space = kernel.space - - iname = kernel.schedule[sched_index].iname - tag = kernel.iname_to_tag.get(iname) - - if isinstance(tag, (BaseUnrollTag, TAG_ILP)): - lower_cns, upper_cns = kernel.get_projected_bounds_constraints(iname) - lower_cns = cast_constraint_to_space(lower_cns, space) - upper_cns = cast_constraint_to_space(upper_cns, space) - - lower_kind, lower_bound = 
solve_constraint_for_bound(lower_cns, iname) - upper_kind, upper_bound = solve_constraint_for_bound(upper_cns, iname) - - assert lower_kind == ">=" - assert upper_kind == "<" - - from pymbolic import flatten - from pymbolic.mapper.constant_folder import CommutativeConstantFoldingMapper - cfm = CommutativeConstantFoldingMapper() - length = int(cfm(flatten(upper_bound-lower_bound))) - - def generate_idx_eq_slabs(): - for i in xrange(length): - yield (i, isl.Set.universe(kernel.space) - .add_constraint( - block_shift_constraint( - lower_cns, iname, -i, as_equality=True))) - - if isinstance(tag, BaseUnrollTag): - result = [POD(np.int32, iname), Line()] - - for i, slab in generate_idx_eq_slabs(): - new_exec_domain = exec_domain.intersect(slab) - inner = build_loop_nest(cgs, kernel, sched_index+1, - new_exec_domain) - - if isinstance(tag, TAG_UNROLL_STATIC): - result.extend([ - Assign(iname, ccm(lower_bound+i)), - Line(), inner]) - elif isinstance(tag, TAG_UNROLL_INCR): - result.append(S("++%s" % iname)) - - return gen_code_block(result) - - elif isinstance(tag, TAG_ILP): - new_aaid = [] - for assignments, implemented_domain in exec_domain: - for i, single_slab in generate_idx_eq_slabs(): - assignments = assignments + [ - Initializer(Const(POD(np.int32, iname)), ccm(lower_bound+i))] - new_aaid.append((assignments, - implemented_domain.intersect(single_slab))) - - assignments = [] - - overall_slab = (isl.Set.universe(kernel.space) - .add_constraint(lower_cns) - .add_constraint(upper_cns)) - - return build_loop_nest(cgs, kernel, sched_index+1, - ExecutionDomain( - exec_domain.implemented_domain.intersect(overall_slab), - new_aaid)) - else: - assert False, "not supposed to get here" - - lb_cns_orig, ub_cns_orig = kernel.get_projected_bounds_constraints(iname) - lb_cns_orig = cast_constraint_to_space(lb_cns_orig, space) - ub_cns_orig = cast_constraint_to_space(ub_cns_orig, space) - - # jostle the constant in {lb,ub}_cns to see if we can get - # fewer conditionals in the bulk 
middle segment - - class TrialRecord(Record): - pass - - if (cgs.try_slab_partition - and "outer" in iname): - trial_cgs = cgs.copy(try_slab_partition=False) - trials = [] - - for lower_incr, upper_incr in [ (0,0), (0,-1), ]: - - lb_cns = block_shift_constraint(lb_cns_orig, iname, -lower_incr) - ub_cns = block_shift_constraint(ub_cns_orig, iname, -upper_incr) - - bulk_slab = (isl.Set.universe(kernel.space) - .add_constraint(lb_cns) - .add_constraint(ub_cns)) - bulk_exec_domain = exec_domain.intersect(bulk_slab) - inner = build_loop_nest(trial_cgs, kernel, sched_index+1, - bulk_exec_domain) - - trials.append((TrialRecord( - lower_incr=lower_incr, - upper_incr=upper_incr, - bulk_slab=bulk_slab), - (inner.num_conditionals, - # when all num_conditionals are equal, choose the - # one with the smallest bounds changes - abs(upper_incr)+abs(lower_incr)))) - - from pytools import argmin2 - chosen = argmin2(trials) - else: - bulk_slab = (isl.Set.universe(kernel.space) - .add_constraint(lb_cns_orig) - .add_constraint(ub_cns_orig)) - chosen = TrialRecord( - lower_incr=0, - upper_incr=0, - bulk_slab=bulk_slab) - - slabs = [] - if chosen.lower_incr: - slabs.append(("initial", isl.Set.universe(kernel.space) - .add_constraint(lb_cns_orig) - .add_constraint(ub_cns_orig) - .add_constraint( - negate_constraint( - block_shift_constraint( - lb_cns_orig, iname, -chosen.lower_incr))))) - - slabs.append(("bulk", chosen.bulk_slab)) - - if chosen.upper_incr: - slabs.append(("final", isl.Set.universe(kernel.space) - .add_constraint(ub_cns_orig) - .add_constraint(lb_cns_orig) - .add_constraint( - negate_constraint( - block_shift_constraint( - ub_cns_orig, iname, -chosen.upper_incr))))) - - if isinstance(tag, AxisParallelTag): - # For a parallel loop dimension, the global loop bounds are - # automatically obeyed--simply because no work items are launched - # outside the requested grid. - # - # For a forced length, this is actually implemented - # by an if below. 
- - if tag.forced_length is None: - exec_domain = exec_domain.intersect( - isl.Set.universe(kernel.space) - .add_constraint(lb_cns_orig) - .add_constraint(ub_cns_orig)) - else: - impl_len = tag.forced_length - start, _ = kernel.get_projected_bounds(iname) - exec_domain = exec_domain.intersect( - make_slab(kernel.space, iname, start, start+impl_len)) - - result = [] - nums_of_conditionals = [] - - for slab_name, slab in slabs: - cmt = "%s slab for '%s'" % (slab_name, iname) - if len(slabs) == 1: - cmt = None - - new_exec_domain = exec_domain.intersect(slab) - inner = build_loop_nest(cgs, kernel, sched_index+1, - new_exec_domain) - - if tag is None: - # regular loop - if cmt is not None: - result.append(Comment(cmt)) - result.append( - wrap_in_for_from_constraints(ccm, iname, slab, inner)) - else: - # parallel loop - par_impl_domain = exec_domain.get_the_one_domain() - - nums_of_conditionals.append(inner.num_conditionals) - constraint_codelets = generate_bounds_checks(ccm, - slab, get_valid_index_vars(kernel, sched_index+1), - par_impl_domain) - result.append( - ("\n&& ".join(constraint_codelets), - add_comment(cmt, inner.ast))) - - if tag is None: - # regular or unrolled loop - return gen_code_block(result) - - elif isinstance(tag, AxisParallelTag): - # parallel loop - if tag.forced_length is None: - base = "last" - else: - base = None - return GeneratedCode( - ast=make_multiple_ifs(result, base=base), - num_conditionals=min(nums_of_conditionals)) - - else: - assert False, "we aren't supposed to get here" - -# }}} - -# {{{ bounds check generator - -def get_valid_index_vars(kernel, sched_index, exclude_tags=()): - """ - :param exclude_tags: a tuple of tag classes to exclude - """ - return [ - sched_item.iname - for sched_item in kernel.schedule[:sched_index] - if isinstance(sched_item, ScheduledLoop) - if not isinstance(kernel.iname_to_tag.get(sched_item.iname), exclude_tags)] - -def filter_necessary_constraints(implemented_domain, constraints): - space = 
implemented_domain.get_dim() - return [cns - for cns in constraints - if not implemented_domain.is_subset( - isl.Set.universe(space) - .add_constraint(cns))] - -def generate_bounds_checks(ccm, domain, check_vars, implemented_domain): - domain_bset, = domain.get_basic_sets() - - projected_domain_bset = isl.project_out_except( - domain_bset, check_vars, [dim_type.set]) - - space = domain.get_dim() - - cast_constraints = [] - - def examine_constraint(cns): - assert not cns.is_div_constraint() - cast_constraints.append( - cast_constraint_to_space(cns, space)) - - projected_domain_bset.foreach_constraint(examine_constraint) - - necessary_constraints = filter_necessary_constraints( - implemented_domain, cast_constraints) - - return [constraint_to_code(ccm, cns) for cns in necessary_constraints] - -def wrap_in_bounds_checks(ccm, domain, check_vars, implemented_domain, stmt): - return wrap_in_if( - generate_bounds_checks(ccm, domain, check_vars, - implemented_domain), - stmt) - -def wrap_in_for_from_constraints(ccm, iname, constraint_bset, stmt): - # FIXME add admissible vars - if isinstance(constraint_bset, isl.Set): - constraint_bset, = constraint_bset.get_basic_sets() - - constraints = constraint_bset.get_constraints() - - from pymbolic import expand - from pymbolic.mapper.constant_folder import CommutativeConstantFoldingMapper - cfm = CommutativeConstantFoldingMapper() - - from pytools import any - if any(cns.is_equality() for cns in constraints): - raise NotImplementedError("equality constraints for 'for' loops") - else: - start_exprs = [] - end_conds = [] - - for cns in constraints: - rhs, iname_coeff = constraint_to_expr(cns, except_name=iname) - - if iname_coeff == 0: - continue - elif iname_coeff < 0: - from pymbolic import var - rhs += iname_coeff*var(iname) - end_conds.append("%s >= 0" % - ccm(cfm(expand(rhs)))) - else: # iname_coeff > 0 - kind, bound = solve_constraint_for_bound(cns, iname) - assert kind == ">=" - start_exprs.append(bound) - - while 
len(start_exprs) >= 2: - start_exprs.append( - "max(%s, %s)" % ( - ccm(start_exprs.pop()), - ccm(start_exprs.pop()))) - - start_expr, = start_exprs # there has to be at least one - - from cgen import For - return wrap_in(For, - "int %s = %s" % (iname, start_expr), - " && ".join(end_conds), - "++%s" % iname, - stmt) - -# }}} - -# {{{ loop nest build top-level dispatch - -def build_loop_nest(cgs, kernel, sched_index, exec_domain): - assert isinstance(exec_domain, ExecutionDomain) - - ccm = cgs.c_code_mapper - - from cgen import (POD, Initializer, Assign, Statement as S, - Line, Block) - - if sched_index >= len(kernel.schedule): - # {{{ write innermost loop body - - from pymbolic.primitives import Subscript - - # FIXME revert to unroll if actual bounds checks are needed? - - valid_index_vars = get_valid_index_vars(kernel, sched_index) - bounds_check_lists = [ - generate_bounds_checks(ccm, kernel.domain, - valid_index_vars, impl_domain) - for assignments, impl_domain in - exec_domain] - - result = [] - for lvalue, expr in kernel.instructions: - for i, (assignments, impl_domain) in \ - enumerate(exec_domain): - - my_block = assignments+[Line()] - - assert isinstance(lvalue, Subscript) - name = lvalue.aggregate.name - my_block.append( - wrap_in_if( - bounds_check_lists[i], - S("tmp_%s_%d += %s" - % (name, i, ccm(expr))))) - result.append(gen_code_block(my_block)) - - return gen_code_block(result) - - # }}} - - sched_item = kernel.schedule[sched_index] - - if isinstance(sched_item, ScheduledLoop): - return generate_loop_dim_code(cgs, kernel, sched_index, - exec_domain) - - elif isinstance(sched_item, WriteOutput): - result = ( - [Initializer(POD(kernel.arg_dict[lvalue.aggregate.name].dtype, - "tmp_%s_%d" % (lvalue.aggregate.name, i)), 0) - for i in range(len(exec_domain)) - for lvalue, expr in kernel.instructions] - +[build_loop_nest(cgs, kernel, sched_index+1, - exec_domain)]) - - - for i, (idx_assignments, impl_domain) in \ - enumerate(exec_domain): - for lvalue, expr 
in kernel.instructions: - assignment = Assign(ccm(lvalue), "tmp_%s_%d" % ( - lvalue.aggregate.name, i)) - - wrapped_assign = wrap_in_bounds_checks( - ccm, kernel.domain, - get_valid_index_vars(kernel, sched_index), - impl_domain, assignment) - - result.append( - gen_code_block( - idx_assignments+[ Line(), wrapped_assign, Line()])) - - return gen_code_block(result) - - elif isinstance(sched_item, PrefetchDescriptor): - return generate_prefetch_code(cgs, kernel, sched_index, - exec_domain) - - elif isinstance(sched_item, RegisterPrefetch): - raise NotImplementedError("reg prefetch") # FIXME - - agg_name = sched_item.subscript_expr.aggregate.name - return gen_code_block([ - wrap_in_bounds_checks(ccm, kernel, sched_index, implemented_domain, - Initializer(POD(kernel.arg_dict[agg_name].dtype, - sched_item.new_name), - "%s[%s]" - % (agg_name, - ccm(sched_item.subscript_expr.index)))), - - build_loop_nest(cgs, kernel, sched_index+1, exec_domain) - ]) - - else: - raise ValueError("invalid schedule item encountered") - -# }}} - -# {{{ prefetch preprocessing - -def preprocess_prefetch(kernel): - """Assign names, dim storage lengths to prefetches. 
- """ - - all_pf_list = kernel.prefetch.values() - new_prefetch_dict = {} - lmem_size = cl_char.usable_local_mem_size(kernel.device) - - for i_pf, pf in enumerate(kernel.prefetch.itervalues()): - all_pf_nbytes = [opf.nbytes for opf in all_pf_list] - other_pf_sizes = sum(all_pf_nbytes[:i_pf]+all_pf_nbytes[i_pf+1:]) - - shape = [stop-start for start, stop in pf.dim_bounds] - dim_storage_lengths = shape[:] - - # sizes of all dims except the last one, which we may change - # below to avoid bank conflicts - from pytools import product - other_dim_sizes = (pf.itemsize - * product(dim_storage_lengths[:-1])) - - min_mult = cl_char.local_memory_bank_count(kernel.device) - good_incr = None - new_dsl = dim_storage_lengths - min_why_not = None - - for increment in range(dim_storage_lengths[-1]//2): - - test_dsl = dim_storage_lengths[:] - test_dsl[-1] = test_dsl[-1] + increment - new_mult, why_not = cl_char.why_not_local_access_conflict_free( - kernel.device, pf.itemsize, - shape, test_dsl) - - # will choose smallest increment 'automatically' - if new_mult < min_mult: - new_lmem_use = other_pf_sizes + pf.itemsize*product(new_dsl) - if new_lmem_use < lmem_size: - new_dsl = test_dsl - min_mult = new_mult - min_why_not = why_not - good_incr = increment - - if min_mult != 1: - from warnings import warn - warn("could not find a conflict-free mem layout " - "for prefetch of '%s' " - "(currently: %dx conflict, increment: %d, reason: %s)" - % (pf.input_vector, min_mult, good_incr, min_why_not), - LoopyAdvisory) - - new_pf = pf.copy(dim_storage_lengths=new_dsl, - name="prefetch_%s_%d" % (pf.input_vector, i_pf)) - new_prefetch_dict[pf.input_vector, pf.index_expr] = new_pf - all_pf_list[i_pf] = new_pf - - return kernel.copy(prefetch=new_prefetch_dict) - -# }}} - -# {{{ main code generation entrypoint - -class CodeGenerationState(Record): - __slots__ = ["c_code_mapper", "try_slab_partition"] - -def generate_code(kernel): - kernel = preprocess_prefetch(kernel) - - from cgen import 
(FunctionBody, FunctionDeclaration, - POD, Value, ArrayOf, Module, Block, - Define, Line, Const, LiteralLines, Initializer) - - from cgen.opencl import (CLKernel, CLGlobal, CLRequiredWorkGroupSize, - CLLocal, CLImage) - - my_ccm = LoopyCCodeMapper(kernel) - - def ccm(expr, prec=PREC_NONE): - return my_ccm(expr, prec) - - # {{{ build top-level - - mod = Module() - - body = Block() - - # {{{ examine arg list - - def restrict_ptr_if_not_nvidia(arg): - from cgen import Pointer, RestrictPointer - - if "nvidia" in kernel.device.platform.name.lower(): - return Pointer(arg) - else: - return RestrictPointer(arg) - - has_double = False - has_image = False - - args = [] - for arg in kernel.args: - if isinstance(arg, ArrayArg): - arg_decl = restrict_ptr_if_not_nvidia( - POD(arg.dtype, arg.name)) - if arg_decl.name in kernel.input_vectors(): - arg_decl = Const(arg_decl) - arg_decl = CLGlobal(arg_decl) - elif isinstance(arg, ImageArg): - if arg.name in kernel.input_vectors(): - mode = "r" - else: - mode = "w" - - arg_decl = CLImage(arg.dimensions, mode, arg.name) - - has_image = True - else: - arg_decl = Const(POD(arg.dtype, arg.name)) - - if arg.dtype in [np.float64, np.complex128]: - has_double = True - - args.append(arg_decl) - - if has_double: - mod.extend([ - Line("#pragma OPENCL EXTENSION cl_khr_fp64: enable"), - Line()]) - - if has_image: - body.append(Initializer(Const(Value("sampler_t", "loopy_sampler")), - "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP " - "| CLK_FILTER_NEAREST")) - - # }}} - - if kernel.preamble is not None: - mod.extend([LiteralLines(kernel.preamble), Line()]) +def register_mpz_with_pymbolic(): + from pymbolic.primitives import register_constant_class + import gmpy + mpz_type = type(gmpy.mpz(1)) + register_constant_class(mpz_type) - mod.extend([ - LiteralLines(r""" - #define int_floor_div(a,b) \ - (( (a) - \ - ( ( (a)<0 ) != ( (b)<0 )) \ - *( (b) + ( (b)<0 ) - ( (b)>=0 ) )) \ - / (b) ) +register_mpz_with_pymbolic() - #define 
int_floor_div_pos_b(a,b) ( \ - ( (a) - ( ((a)<0) ? ((b)-1) : 0 ) ) / (b) \ - ) - """), - Line()]) +# TODO: Constant memory +# TODO: Reuse of previously split dimensions for prefetch +# (Or general merging) - # {{{ symbolic names for group and local indices +# TODO: Try, fix reg. prefetch (DG example) / CSEs +# TODO: Custom reductions per red. axis +# TODO: Functions +# TODO: Common subexpressions +# TODO: Parse ops from string +# FIXME: support non-reductive dimensions +# FIXME: write names should be assigned during scheduling - for what_cls, func in [ - (TAG_GROUP_IDX, "get_group_id"), - (TAG_WORK_ITEM_IDX, "get_local_id")]: - for iname in kernel.ordered_inames_by_tag_type(what_cls): - start, stop = kernel.get_projected_bounds(iname) - mod.append(Define(iname, "(%s + (int) %s(%d)) /* [%s, %s) */" - % (ccm(start), - func, - kernel.iname_to_tag[iname].axis, - ccm(start), - ccm(stop)))) +# TODO: Condition hoisting +# TODO: Don't emit spurious barriers (no for scheduled before) +# TODO: Make code more readable - mod.append(Line()) +# TODO: Divisibility +# TODO: Try different kernels +# TODO: - Tricky: Convolution, FD +# TODO: Try, fix indirect addressing +# TODO: User controllable switch for slab opt +# TODO: Separate all-bulk from non-bulk kernels. - # }}} - # {{{ build lmem array declarators for prefetches +# TODO: implement efficient div_ceil? +# TODO: why are corner cases inefficient? 
- for pf in kernel.prefetch.itervalues(): - smem_pf_array = POD(kernel.arg_dict[pf.input_vector].dtype, pf.name) - for l in pf.dim_storage_lengths: - smem_pf_array = ArrayOf(smem_pf_array, l) - body.append(CLLocal(smem_pf_array)) - # }}} - cgs = CodeGenerationState(c_code_mapper=ccm, try_slab_partition=True) - gen_code = build_loop_nest(cgs, kernel, 0, - ExecutionDomain(isl.Set.universe(kernel.space))) - body.extend([Line(), gen_code.ast]) - #print "# conditionals: %d" % gen_code.num_conditionals - mod.append( - FunctionBody( - CLRequiredWorkGroupSize( - tuple(dim_length - for dim_length in kernel.tag_type_lengths(TAG_WORK_ITEM_IDX)), - CLKernel(FunctionDeclaration( - Value("void", kernel.name), args))), - body)) +class LoopyAdvisory(UserWarning): + pass - # }}} +# {{{ imported user interface - return str(mod) +from loopy.kernel import ScalarArg, ArrayArg, ImageArg -# }}} +from loopy.kernel import LoopKernel +from loopy.schedule import generate_loop_schedules +from loopy.prefetch import insert_register_prefetches +from loopy.compiled import CompiledKernel, drive_timing_run # }}} @@ -2246,6 +64,8 @@ def get_input_access_descriptors(kernel): a list of input access descriptor. An input access descriptor is a tuple (input_vec, index_expr). 
""" + from loopy.symbolic import VariableIndexExpressionCollector + from pytools import flatten result = {} for ivec in kernel.input_vectors(): @@ -2284,7 +104,8 @@ def add_prefetch(kernel, input_access_descr, tags_or_inames, loc_fetch_axes={}): raise ValueError("a prefetch descriptor for the input access %s[%s] " "already exists" % (ivec, iexpr)) - new_prefetch[input_access_descr] = PrefetchDescriptor( + from loopy.prefetch import LocalMemoryPrefetch + new_prefetch[input_access_descr] = LocalMemoryPrefetch( kernel=kernel, input_vector=ivec, index_expr=iexpr, @@ -2295,114 +116,7 @@ def add_prefetch(kernel, input_access_descr, tags_or_inames, loc_fetch_axes={}): # }}} -# {{{ compiled kernel object - -class CompiledKernel: - def __init__(self, context, kernel, size_args=None, options=[], - force_rebuild=False, edit_code=False): - self.kernel = kernel - self.code = generate_code(kernel) - - if force_rebuild: - from time import time - self.code = "/* %s */\n%s" % (time(), self.code) - - if edit_code: - from pytools import invoke_editor - self.code = invoke_editor(self.code) - - try: - self.cl_kernel = getattr( - cl.Program(context, self.code).build(options=options), - kernel.name) - except: - print "[Loopy] ----------------------------------------------------" - print "[Loopy] build failed, here's the source code:" - print "[Loopy] ----------------------------------------------------" - print self.code - print "[Loopy] ----------------------------------------------------" - print "[Loopy] end source code" - print "[Loopy] ----------------------------------------------------" - raise - - arg_types = [] - for arg in kernel.args: - if isinstance(arg, ScalarArg): - arg_types.append(arg.dtype) - else: - arg_types.append(None) - - self.cl_kernel.set_scalar_arg_dtypes(arg_types) - - from pymbolic import compile - if size_args is None: - self.size_args = [arg.name for arg in kernel.args if isinstance(arg, ScalarArg)] - else: - self.size_args = size_args - - gsize_expr = 
tuple(self.kernel.tag_type_lengths(TAG_GROUP_IDX)) - lsize_expr = tuple(self.kernel.tag_type_lengths(TAG_WORK_ITEM_IDX)) - - if not gsize_expr: gsize_expr = (1,) - if not lsize_expr: lsize_expr = (1,) - - self.global_size_func = compile( - gsize_expr, self.size_args) - self.local_size_func = compile( - lsize_expr, self.size_args) - -# }}} - -# {{{ timing driver -def drive_timing_run(kernel_generator, queue, launch, flop_count=None, - options=[], print_code=True, force_rebuild=False, - edit_code=False): - - def time_run(compiled_knl, warmup_rounds=2, timing_rounds=5): - check = True - for i in range(warmup_rounds): - launch(compiled_knl.cl_kernel, - compiled.global_size_func, compiled.local_size_func, - check=check) - check = False - - events = [] - for i in range(timing_rounds): - events.append( - launch(compiled_knl.cl_kernel, - compiled.global_size_func, compiled.local_size_func, - check=check)) - for evt in events: - evt.wait() - - return sum(1e-9*evt.profile.END-1e-9*evt.profile.START for evt in events)/timing_rounds - soln_count = 0 - for kernel in kernel_generator: - - compiled = CompiledKernel(queue.context, kernel, options=options, - force_rebuild=force_rebuild, edit_code=edit_code) - - print "-----------------------------------------------" - print "SOLUTION #%d" % soln_count - print "-----------------------------------------------" - if print_code: - print compiled.code - print "-----------------------------------------------" - - elapsed = time_run(compiled) - - print "time: %f" % elapsed - if flop_count is not None: - print "gflops/s: %f (#%d)" % ( - flop_count/elapsed/1e9, soln_count) - print "-----------------------------------------------" - - soln_count += 1 - - print "%d solutions" % soln_count - -# }}} diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py new file mode 100644 index 000000000..67af2676e --- /dev/null +++ b/loopy/codegen/__init__.py @@ -0,0 +1,271 @@ +from __future__ import division + +from pytools import Record +import 
numpy as np +from pymbolic.mapper.stringifier import PREC_NONE + + + + +# {{{ support code for AST wrapper objects + +class GeneratedCode(Record): + """Objects of this type are wrapped around ASTs upon + return from generation calls to collect information about them. + """ + __slots__ = ["ast", "num_conditionals"] + +def gen_code_block(elements): + from cgen import Generable, Block + + num_conditionals = 0 + block_els = [] + for el in elements: + if isinstance(el, GeneratedCode): + num_conditionals = num_conditionals + el.num_conditionals + block_els.append(el.ast) + elif isinstance(el, Generable): + block_els.append(el) + else: + raise ValueError("unidentifiable object in block") + + if len(block_els) == 1: + ast, = block_els + else: + ast = Block(block_els) + return GeneratedCode(ast=ast, num_conditionals=num_conditionals) + +def wrap_in(cls, *args): + inner = args[-1] + args = args[:-1] + + from cgen import If, Generable + + if isinstance(inner, GeneratedCode): + num_conditionals = inner.num_conditionals + ast = inner.ast + elif isinstance(inner, Generable): + num_conditionals = 0 + ast = inner + + args = args + (ast,) + ast = cls(*args) + + if isinstance(ast, If): + import re + cond_joiner_re = re.compile(r"\|\||\&\&") + num_conditionals += len(cond_joiner_re.split(ast.condition)) + + return GeneratedCode(ast=ast, num_conditionals=num_conditionals) + +def wrap_in_if(condition_codelets, inner): + from cgen import If + + if condition_codelets: + return wrap_in(If, + "\n&& ".join(condition_codelets), + inner) + + return inner + +# }}} + +# {{{ main code generation entrypoint + +class CodeGenerationState(Record): + __slots__ = ["c_code_mapper", "try_slab_partition"] + +class ExecutionDomain(object): + def __init__(self, implemented_domain, assignments_and_impl_domains=None): + """ + :param implemented_domain: The entire implemented domain, + i.e. all constraints that have been enforced so far. 
+ :param assignments_and_impl_domains: a list of tuples + (assignments, implemented_domain), where *assignments* + is a list of :class:`cgen.Initializer` instances + and *implemented_domain* is the implemented domain to which + the situation produced by the assignments corresponds. + + The point of this being is a list is the implementation of + ILP, and each entry represents a 'fake-parallel' trip through the + ILP'd loop. + """ + if assignments_and_impl_domains is None: + assignments_and_impl_domains = [([], implemented_domain)] + self.implemented_domain = implemented_domain + self.assignments_and_impl_domains = assignments_and_impl_domains + + def __len__(self): + return len(self.assignments_and_impl_domains) + + def __iter__(self): + return iter(self.assignments_and_impl_domains) + + def intersect(self, set): + return ExecutionDomain( + self.implemented_domain.intersect(set), + [(assignments, implemented_domain.intersect(set)) + for assignments, implemented_domain + in self.assignments_and_impl_domains]) + + def get_the_one_domain(self): + assert len(self.assignments_and_impl_domains) == 1 + return self.implemented_domain + + + + + +def generate_code(kernel): + from loopy.codegen.prefetch import preprocess_prefetch + kernel = preprocess_prefetch(kernel) + + from cgen import (FunctionBody, FunctionDeclaration, + POD, Value, ArrayOf, Module, Block, + Define, Line, Const, LiteralLines, Initializer) + + from cgen.opencl import (CLKernel, CLGlobal, CLRequiredWorkGroupSize, + CLLocal, CLImage) + + from loopy.symbolic import LoopyCCodeMapper + my_ccm = LoopyCCodeMapper(kernel) + + def ccm(expr, prec=PREC_NONE): + return my_ccm(expr, prec) + + # {{{ build top-level + + mod = Module() + + body = Block() + + # {{{ examine arg list + + def restrict_ptr_if_not_nvidia(arg): + from cgen import Pointer, RestrictPointer + + if "nvidia" in kernel.device.platform.name.lower(): + return Pointer(arg) + else: + return RestrictPointer(arg) + + has_double = False + has_image = False 
+ + from loopy.kernel import ArrayArg, ImageArg + + args = [] + for arg in kernel.args: + if isinstance(arg, ArrayArg): + arg_decl = restrict_ptr_if_not_nvidia( + POD(arg.dtype, arg.name)) + if arg_decl.name in kernel.input_vectors(): + arg_decl = Const(arg_decl) + arg_decl = CLGlobal(arg_decl) + elif isinstance(arg, ImageArg): + if arg.name in kernel.input_vectors(): + mode = "r" + else: + mode = "w" + + arg_decl = CLImage(arg.dimensions, mode, arg.name) + + has_image = True + else: + arg_decl = Const(POD(arg.dtype, arg.name)) + + if arg.dtype in [np.float64, np.complex128]: + has_double = True + + args.append(arg_decl) + + if has_double: + mod.extend([ + Line("#pragma OPENCL EXTENSION cl_khr_fp64: enable"), + Line()]) + + if has_image: + body.append(Initializer(Const(Value("sampler_t", "loopy_sampler")), + "CLK_NORMALIZED_COORDS_FALSE | CLK_ADDRESS_CLAMP " + "| CLK_FILTER_NEAREST")) + + # }}} + + if kernel.preamble is not None: + mod.extend([LiteralLines(kernel.preamble), Line()]) + + mod.extend([ + LiteralLines(r""" + #define int_floor_div(a,b) \ + (( (a) - \ + ( ( (a)<0 ) != ( (b)<0 )) \ + *( (b) + ( (b)<0 ) - ( (b)>=0 ) )) \ + / (b) ) + + + #define int_floor_div_pos_b(a,b) ( \ + ( (a) - ( ((a)<0) ? 
((b)-1) : 0 ) ) / (b) \ + ) + + """), + Line()]) + + # {{{ symbolic names for group and local indices + + from loopy.kernel import TAG_GROUP_IDX, TAG_WORK_ITEM_IDX + for what_cls, func in [ + (TAG_GROUP_IDX, "get_group_id"), + (TAG_WORK_ITEM_IDX, "get_local_id")]: + for iname in kernel.ordered_inames_by_tag_type(what_cls): + start, stop = kernel.get_projected_bounds(iname) + mod.append(Define(iname, "(%s + (int) %s(%d)) /* [%s, %s) */" + % (ccm(start), + func, + kernel.iname_to_tag[iname].axis, + ccm(start), + ccm(stop)))) + + mod.append(Line()) + + # }}} + + # {{{ build lmem array declarators for prefetches + + for pf in kernel.prefetch.itervalues(): + smem_pf_array = POD(kernel.arg_dict[pf.input_vector].dtype, pf.name) + for l in pf.dim_storage_lengths: + smem_pf_array = ArrayOf(smem_pf_array, l) + body.append(CLLocal(smem_pf_array)) + + # }}} + + cgs = CodeGenerationState(c_code_mapper=ccm, try_slab_partition=True) + + from loopy.codegen.dispatch import build_loop_nest + import islpy as isl + + gen_code = build_loop_nest(cgs, kernel, 0, + ExecutionDomain(isl.Set.universe(kernel.space))) + body.extend([Line(), gen_code.ast]) + #print "# conditionals: %d" % gen_code.num_conditionals + + from loopy.kernel import TAG_WORK_ITEM_IDX + mod.append( + FunctionBody( + CLRequiredWorkGroupSize( + tuple(dim_length + for dim_length in kernel.tag_type_lengths(TAG_WORK_ITEM_IDX)), + CLKernel(FunctionDeclaration( + Value("void", kernel.name), args))), + body)) + + # }}} + + return str(mod) + +# }}} + + + + +# vim: foldmethod=marker diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py new file mode 100644 index 000000000..b17f035f3 --- /dev/null +++ b/loopy/codegen/bounds.py @@ -0,0 +1,116 @@ +from __future__ import division + +import islpy as isl +from islpy import dim_type + + + + +# {{{ bounds check generator + +def constraint_to_code(ccm, cns): + if cns.is_equality(): + comp_op = "==" + else: + comp_op = ">=" + + from loopy.isl import constraint_to_expr + return "%s 
%s 0" % (ccm(constraint_to_expr(cns)), comp_op) + +def filter_necessary_constraints(implemented_domain, constraints): + space = implemented_domain.get_dim() + return [cns + for cns in constraints + if not implemented_domain.is_subset( + isl.Set.universe(space) + .add_constraint(cns))] + +def generate_bounds_checks(ccm, domain, check_vars, implemented_domain): + domain_bset, = domain.get_basic_sets() + + projected_domain_bset = isl.project_out_except( + domain_bset, check_vars, [dim_type.set]) + + space = domain.get_dim() + + cast_constraints = [] + + from loopy.isl import cast_constraint_to_space + + def examine_constraint(cns): + assert not cns.is_div_constraint() + cast_constraints.append( + cast_constraint_to_space(cns, space)) + + projected_domain_bset.foreach_constraint(examine_constraint) + + necessary_constraints = filter_necessary_constraints( + implemented_domain, cast_constraints) + + return [constraint_to_code(ccm, cns) for cns in necessary_constraints] + +def wrap_in_bounds_checks(ccm, domain, check_vars, implemented_domain, stmt): + from loopy.codegen import wrap_in_if + return wrap_in_if( + generate_bounds_checks(ccm, domain, check_vars, + implemented_domain), + stmt) + +def wrap_in_for_from_constraints(ccm, iname, constraint_bset, stmt): + # FIXME add admissible vars + if isinstance(constraint_bset, isl.Set): + constraint_bset, = constraint_bset.get_basic_sets() + + constraints = constraint_bset.get_constraints() + + from pymbolic import expand + from pymbolic.mapper.constant_folder import CommutativeConstantFoldingMapper + + cfm = CommutativeConstantFoldingMapper() + + from loopy.isl import constraint_to_expr, solve_constraint_for_bound + from pytools import any + + if any(cns.is_equality() for cns in constraints): + raise NotImplementedError("equality constraints for 'for' loops") + else: + start_exprs = [] + end_conds = [] + + for cns in constraints: + rhs, iname_coeff = constraint_to_expr(cns, except_name=iname) + + if iname_coeff == 0: + 
continue + elif iname_coeff < 0: + from pymbolic import var + rhs += iname_coeff*var(iname) + end_conds.append("%s >= 0" % + ccm(cfm(expand(rhs)))) + else: # iname_coeff > 0 + kind, bound = solve_constraint_for_bound(cns, iname) + assert kind == ">=" + start_exprs.append(bound) + + while len(start_exprs) >= 2: + start_exprs.append( + "max(%s, %s)" % ( + ccm(start_exprs.pop()), + ccm(start_exprs.pop()))) + + start_expr, = start_exprs # there has to be at least one + + from cgen import For + from loopy.codegen import wrap_in + return wrap_in(For, + "int %s = %s" % (iname, start_expr), + " && ".join(end_conds), + "++%s" % iname, + stmt) + +# }}} + + + + +# vim: foldmethod=marker diff --git a/loopy/codegen/dispatch.py b/loopy/codegen/dispatch.py new file mode 100644 index 000000000..dd6765c03 --- /dev/null +++ b/loopy/codegen/dispatch.py @@ -0,0 +1,131 @@ +"""Loop nest build top-level dispatch.""" +from __future__ import division + +from loopy.codegen import ExecutionDomain, gen_code_block + + + + +def build_loop_nest(cgs, kernel, sched_index, exec_domain): + assert isinstance(exec_domain, ExecutionDomain) + + ccm = cgs.c_code_mapper + + from cgen import (POD, Initializer, Assign, Statement as S, + Line) + + from loopy.schedule import get_valid_index_vars + from loopy.codegen.bounds import generate_bounds_checks + + if sched_index >= len(kernel.schedule): + # {{{ write innermost loop body + + from pymbolic.primitives import Subscript + + # FIXME revert to unroll if actual bounds checks are needed? 
+ + valid_index_vars = get_valid_index_vars(kernel, sched_index) + bounds_check_lists = [ + generate_bounds_checks(ccm, kernel.domain, + valid_index_vars, impl_domain) + for assignments, impl_domain in + exec_domain] + + result = [] + for lvalue, expr in kernel.instructions: + for i, (assignments, impl_domain) in \ + enumerate(exec_domain): + + my_block = assignments+[Line()] + + assert isinstance(lvalue, Subscript) + name = lvalue.aggregate.name + + from loopy.codegen import wrap_in_if + my_block.append( + wrap_in_if( + bounds_check_lists[i], + S("tmp_%s_%d += %s" + % (name, i, ccm(expr))))) + result.append(gen_code_block(my_block)) + + return gen_code_block(result) + + # }}} + + sched_item = kernel.schedule[sched_index] + + from loopy.schedule import ScheduledLoop, WriteOutput + from loopy.prefetch import LocalMemoryPrefetch, RegisterPrefetch + from loopy.codegen.bounds import wrap_in_bounds_checks + + if isinstance(sched_item, ScheduledLoop): + from loopy.codegen.loop_dim import ( + generate_unroll_or_ilp_code, + generate_non_unroll_loop_dim_code) + from loopy.kernel import BaseUnrollTag, TAG_ILP + + tag = kernel.iname_to_tag.get(sched_item.iname) + + if isinstance(tag, (BaseUnrollTag, TAG_ILP)): + func = generate_unroll_or_ilp_code + else: + func = generate_non_unroll_loop_dim_code + + return func(cgs, kernel, sched_index, exec_domain) + + elif isinstance(sched_item, WriteOutput): + result = ( + [Initializer(POD(kernel.arg_dict[lvalue.aggregate.name].dtype, + "tmp_%s_%d" % (lvalue.aggregate.name, i)), 0) + for i in range(len(exec_domain)) + for lvalue, expr in kernel.instructions] + +[build_loop_nest(cgs, kernel, sched_index+1, + exec_domain)]) + + + for i, (idx_assignments, impl_domain) in \ + enumerate(exec_domain): + for lvalue, expr in kernel.instructions: + assignment = Assign(ccm(lvalue), "tmp_%s_%d" % ( + lvalue.aggregate.name, i)) + + wrapped_assign = wrap_in_bounds_checks( + ccm, kernel.domain, + get_valid_index_vars(kernel, sched_index), + 
impl_domain, assignment) + + result.append( + gen_code_block( + idx_assignments+[ Line(), wrapped_assign, Line()])) + + return gen_code_block(result) + + elif isinstance(sched_item, LocalMemoryPrefetch): + from loopy.codegen.prefetch import generate_prefetch_code + return generate_prefetch_code(cgs, kernel, sched_index, + exec_domain) + + elif isinstance(sched_item, RegisterPrefetch): + raise NotImplementedError("reg prefetch") # FIXME + + agg_name = sched_item.subscript_expr.aggregate.name + return gen_code_block([ + wrap_in_bounds_checks(ccm, kernel, sched_index, implemented_domain, + Initializer(POD(kernel.arg_dict[agg_name].dtype, + sched_item.new_name), + "%s[%s]" + % (agg_name, + ccm(sched_item.subscript_expr.index)))), + + build_loop_nest(cgs, kernel, sched_index+1, exec_domain) + ]) + + else: + raise ValueError("invalid schedule item encountered") + + + + + +# vim: foldmethod=marker diff --git a/loopy/codegen/loop_dim.py b/loopy/codegen/loop_dim.py new file mode 100644 index 000000000..d6d9814b4 --- /dev/null +++ b/loopy/codegen/loop_dim.py @@ -0,0 +1,260 @@ +from __future__ import division + +import numpy as np +from loopy.codegen import ExecutionDomain, gen_code_block +from pytools import Record +import islpy as isl +from loopy.codegen.dispatch import build_loop_nest + + + + + +# {{{ generate code for unrolled/ILP loops + +def generate_unroll_or_ilp_code(cgs, kernel, sched_index, exec_domain): + from loopy.isl import ( + cast_constraint_to_space, solve_constraint_for_bound, + block_shift_constraint) + + from cgen import (POD, Assign, Line, Statement as S, Initializer, Const) + + ccm = cgs.c_code_mapper + space = kernel.space + iname = kernel.schedule[sched_index].iname + tag = kernel.iname_to_tag.get(iname) + + lower_cns, upper_cns = kernel.get_projected_bounds_constraints(iname) + lower_cns = cast_constraint_to_space(lower_cns, space) + upper_cns = cast_constraint_to_space(upper_cns, space) + + lower_kind, lower_bound = 
solve_constraint_for_bound(lower_cns, iname) + upper_kind, upper_bound = solve_constraint_for_bound(upper_cns, iname) + + assert lower_kind == ">=" + assert upper_kind == "<" + + from pymbolic import flatten + from pymbolic.mapper.constant_folder import CommutativeConstantFoldingMapper + cfm = CommutativeConstantFoldingMapper() + length = int(cfm(flatten(upper_bound-lower_bound))) + + def generate_idx_eq_slabs(): + for i in xrange(length): + yield (i, isl.Set.universe(kernel.space) + .add_constraint( + block_shift_constraint( + lower_cns, iname, -i, as_equality=True))) + + from loopy.kernel import BaseUnrollTag, TAG_ILP, TAG_UNROLL_STATIC, TAG_UNROLL_INCR + if isinstance(tag, BaseUnrollTag): + result = [POD(np.int32, iname), Line()] + + for i, slab in generate_idx_eq_slabs(): + new_exec_domain = exec_domain.intersect(slab) + inner = build_loop_nest(cgs, kernel, sched_index+1, + new_exec_domain) + + if isinstance(tag, TAG_UNROLL_STATIC): + result.extend([ + Assign(iname, ccm(lower_bound+i)), + Line(), inner]) + elif isinstance(tag, TAG_UNROLL_INCR): + result.append(S("++%s" % iname)) + + return gen_code_block(result) + + elif isinstance(tag, TAG_ILP): + new_aaid = [] + for assignments, implemented_domain in exec_domain: + for i, single_slab in generate_idx_eq_slabs(): + assignments = assignments + [ + Initializer(Const(POD(np.int32, iname)), ccm(lower_bound+i))] + new_aaid.append((assignments, + implemented_domain.intersect(single_slab))) + + assignments = [] + + overall_slab = (isl.Set.universe(kernel.space) + .add_constraint(lower_cns) + .add_constraint(upper_cns)) + + return build_loop_nest(cgs, kernel, sched_index+1, + ExecutionDomain( + exec_domain.implemented_domain.intersect(overall_slab), + new_aaid)) + else: + assert False, "not supposed to get here" + +# }}} + +# {{{ generate code for all other loops + +def generate_non_unroll_loop_dim_code(cgs, kernel, sched_index, exec_domain): + from loopy.isl import (cast_constraint_to_space, + block_shift_constraint, 
negate_constraint, make_slab) + + from cgen import (Comment, add_comment, make_multiple_ifs) + + ccm = cgs.c_code_mapper + space = kernel.space + iname = kernel.schedule[sched_index].iname + tag = kernel.iname_to_tag.get(iname) + + # {{{ attempt slab partition to reduce conditional count + + lb_cns_orig, ub_cns_orig = kernel.get_projected_bounds_constraints(iname) + lb_cns_orig = cast_constraint_to_space(lb_cns_orig, space) + ub_cns_orig = cast_constraint_to_space(ub_cns_orig, space) + + # jostle the constant in {lb,ub}_cns to see if we can get + # fewer conditionals in the bulk middle segment + + class TrialRecord(Record): + pass + + if (cgs.try_slab_partition + and "outer" in iname): + trial_cgs = cgs.copy(try_slab_partition=False) + trials = [] + + for lower_incr, upper_incr in [ (0,0), (0,-1), ]: + + lb_cns = block_shift_constraint(lb_cns_orig, iname, -lower_incr) + ub_cns = block_shift_constraint(ub_cns_orig, iname, -upper_incr) + + bulk_slab = (isl.Set.universe(kernel.space) + .add_constraint(lb_cns) + .add_constraint(ub_cns)) + bulk_exec_domain = exec_domain.intersect(bulk_slab) + inner = build_loop_nest(trial_cgs, kernel, sched_index+1, + bulk_exec_domain) + + trials.append((TrialRecord( + lower_incr=lower_incr, + upper_incr=upper_incr, + bulk_slab=bulk_slab), + (inner.num_conditionals, + # when all num_conditionals are equal, choose the + # one with the smallest bounds changes + abs(upper_incr)+abs(lower_incr)))) + + from pytools import argmin2 + chosen = argmin2(trials) + else: + bulk_slab = (isl.Set.universe(kernel.space) + .add_constraint(lb_cns_orig) + .add_constraint(ub_cns_orig)) + chosen = TrialRecord( + lower_incr=0, + upper_incr=0, + bulk_slab=bulk_slab) + + # }}} + + # {{{ build slabs + + slabs = [] + if chosen.lower_incr: + slabs.append(("initial", isl.Set.universe(kernel.space) + .add_constraint(lb_cns_orig) + .add_constraint(ub_cns_orig) + .add_constraint( + negate_constraint( + block_shift_constraint( + lb_cns_orig, iname, 
-chosen.lower_incr))))) + + slabs.append(("bulk", chosen.bulk_slab)) + + if chosen.upper_incr: + slabs.append(("final", isl.Set.universe(kernel.space) + .add_constraint(ub_cns_orig) + .add_constraint(lb_cns_orig) + .add_constraint( + negate_constraint( + block_shift_constraint( + ub_cns_orig, iname, -chosen.upper_incr))))) + + # }}} + + # {{{ generate code + + from loopy.kernel import AxisParallelTag + if isinstance(tag, AxisParallelTag): + # For a parallel loop dimension, the global loop bounds are + # automatically obeyed--simply because no work items are launched + # outside the requested grid. + # + # For a forced length, this is actually implemented + # by an if below. + + if tag.forced_length is None: + exec_domain = exec_domain.intersect( + isl.Set.universe(kernel.space) + .add_constraint(lb_cns_orig) + .add_constraint(ub_cns_orig)) + else: + impl_len = tag.forced_length + start, _ = kernel.get_projected_bounds(iname) + exec_domain = exec_domain.intersect( + make_slab(kernel.space, iname, start, start+impl_len)) + + result = [] + nums_of_conditionals = [] + + for slab_name, slab in slabs: + cmt = "%s slab for '%s'" % (slab_name, iname) + if len(slabs) == 1: + cmt = None + + new_exec_domain = exec_domain.intersect(slab) + inner = build_loop_nest(cgs, kernel, sched_index+1, + new_exec_domain) + + if tag is None: + from loopy.codegen.bounds import wrap_in_for_from_constraints + + # regular loop + if cmt is not None: + result.append(Comment(cmt)) + result.append( + wrap_in_for_from_constraints(ccm, iname, slab, inner)) + else: + # parallel loop + par_impl_domain = exec_domain.get_the_one_domain() + + from loopy.schedule import get_valid_index_vars + from loopy.codegen.bounds import generate_bounds_checks + + nums_of_conditionals.append(inner.num_conditionals) + constraint_codelets = generate_bounds_checks(ccm, + slab, get_valid_index_vars(kernel, sched_index+1), + par_impl_domain) + result.append( + ("\n&& ".join(constraint_codelets), + add_comment(cmt, 
inner.ast))) + + if tag is None: + # regular or unrolled loop + return gen_code_block(result) + + elif isinstance(tag, AxisParallelTag): + # parallel loop + if tag.forced_length is None: + base = "last" + else: + base = None + + from loopy.codegen import GeneratedCode + return GeneratedCode( + ast=make_multiple_ifs(result, base=base), + num_conditionals=min(nums_of_conditionals)) + + else: + assert False, "we aren't supposed to get here" + + # }}} + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/codegen/prefetch.py b/loopy/codegen/prefetch.py new file mode 100644 index 000000000..bec065c9a --- /dev/null +++ b/loopy/codegen/prefetch.py @@ -0,0 +1,372 @@ +from __future__ import division + +from pytools import Record +import pyopencl.characterize as cl_char +from loopy.codegen import wrap_in, gen_code_block + + + + +# {{{ prefetch preprocessing + +def preprocess_prefetch(kernel): + """Assign names, dim storage lengths to prefetches. + """ + + all_pf_list = kernel.prefetch.values() + new_prefetch_dict = {} + lmem_size = cl_char.usable_local_mem_size(kernel.device) + + for i_pf, pf in enumerate(kernel.prefetch.itervalues()): + all_pf_nbytes = [opf.nbytes for opf in all_pf_list] + other_pf_sizes = sum(all_pf_nbytes[:i_pf]+all_pf_nbytes[i_pf+1:]) + + shape = [stop-start for start, stop in pf.dim_bounds] + dim_storage_lengths = shape[:] + + # sizes of all dims except the last one, which we may change + # below to avoid bank conflicts + from pytools import product + other_dim_sizes = (pf.itemsize + * product(dim_storage_lengths[:-1])) + + min_mult = cl_char.local_memory_bank_count(kernel.device) + good_incr = None + new_dsl = dim_storage_lengths + min_why_not = None + + for increment in range(dim_storage_lengths[-1]//2): + + test_dsl = dim_storage_lengths[:] + test_dsl[-1] = test_dsl[-1] + increment + new_mult, why_not = cl_char.why_not_local_access_conflict_free( + kernel.device, pf.itemsize, + shape, test_dsl) + + # will choose smallest increment 'automatically' + 
if new_mult < min_mult: + new_lmem_use = other_pf_sizes + pf.itemsize*product(new_dsl) + if new_lmem_use < lmem_size: + new_dsl = test_dsl + min_mult = new_mult + min_why_not = why_not + good_incr = increment + + if min_mult != 1: + from warnings import warn + from loopy import LoopyAdvisory + warn("could not find a conflict-free mem layout " + "for prefetch of '%s' " + "(currently: %dx conflict, increment: %d, reason: %s)" + % (pf.input_vector, min_mult, good_incr, min_why_not), + LoopyAdvisory) + + new_pf = pf.copy(dim_storage_lengths=new_dsl, + name="prefetch_%s_%d" % (pf.input_vector, i_pf)) + new_prefetch_dict[pf.input_vector, pf.index_expr] = new_pf + all_pf_list[i_pf] = new_pf + + return kernel.copy(prefetch=new_prefetch_dict) + +# }}} + +# {{{ lmem prefetch code generation + +class FetchLoopNestData(Record): + pass + +def make_fetch_loop_nest(flnd, pf_iname_idx, pf_dim_exprs, pf_idx_subst_map, + implemented_domain): + pf = flnd.prefetch + ccm = flnd.c_code_mapper + no_pf_ccm = flnd.no_prefetch_c_code_mapper + kernel = flnd.kernel + + from pymbolic import var + from cgen import Assign, For, If + + from pymbolic.mapper.substitutor import substitute + if pf_iname_idx >= len(pf.inames): + # done, return + from pymbolic.primitives import Variable, Subscript + + from pymbolic.mapper.stringifier import PREC_NONE + result = Assign( + pf.name + "".join("[%s]" % ccm(dexpr) + for dexpr in pf_dim_exprs), + no_pf_ccm( + Subscript( + Variable(pf.input_vector), + substitute(pf.index_expr, pf_idx_subst_map)), + PREC_NONE)) + + def my_ccm(expr): + return ccm(substitute(expr, pf_idx_subst_map)) + + from pymbolic.mapper.dependency import DependencyMapper + check_vars = [v.name for v in DependencyMapper()(pf.index_expr)] + + from loopy.codegen.bounds import wrap_in_bounds_checks + return wrap_in_bounds_checks(my_ccm, pf.kernel.domain, + check_vars, implemented_domain, result) + + pf_iname = pf.inames[pf_iname_idx] + realiz_inames = flnd.realization_inames[pf_iname_idx] + + 
start_index, stop_index = flnd.kernel.get_projected_bounds(pf_iname) + try: + start_index = int(start_index) + stop_index = int(stop_index) + except TypeError: + raise RuntimeError("loop bounds for prefetch must be " + "known statically at code gen time") + + dim_length = stop_index-start_index + + if realiz_inames is not None: + # {{{ parallel fetch + + realiz_bounds = [flnd.kernel.get_projected_bounds(rn) for rn in realiz_inames] + realiz_lengths = [stop-start for start, stop in realiz_bounds] + from pytools import product + total_realiz_size = product(realiz_lengths) + + result = [] + + cur_index = 0 + + while start_index+cur_index < stop_index: + pf_dim_expr = 0 + for realiz_iname, length in zip(realiz_inames, realiz_lengths): + tag = flnd.kernel.iname_to_tag[realiz_iname] + from loopy.kernel import TAG_WORK_ITEM_IDX + assert isinstance(tag, TAG_WORK_ITEM_IDX) + + pf_dim_expr = (pf_dim_expr*length + + var("(int) get_local_id(%d)" % tag.axis)) + + from loopy.isl import make_slab + loop_slab = make_slab(pf.kernel.space, pf_iname, + start_index+cur_index, + min(stop_index, start_index+cur_index+total_realiz_size)) + new_impl_domain = implemented_domain.intersect(loop_slab) + + pf_dim_expr += cur_index + + pf_idx_subst_map = pf_idx_subst_map.copy() + pf_idx_subst_map[pf_iname] = pf_dim_expr + start_index + inner = make_fetch_loop_nest(flnd, pf_iname_idx+1, + pf_dim_exprs+[pf_dim_expr], pf_idx_subst_map, + new_impl_domain) + + if cur_index+total_realiz_size > dim_length: + inner = wrap_in(If, + "%s < %s" % (ccm(pf_dim_expr), stop_index), + inner) + + result.append(inner) + + cur_index += total_realiz_size + + return gen_code_block(result) + + # }}} + else: + # {{{ sequential fetch + + pf_dim_var = "prefetch_dim_idx_%d" % pf_iname_idx + pf_dim_expr = var(pf_dim_var) + + lb_cns, ub_cns = flnd.kernel.get_projected_bounds_constraints(pf_iname) + import islpy as isl + from loopy.isl import cast_constraint_to_space + loop_slab = (isl.Set.universe(flnd.kernel.space) + 
.add_constraint(cast_constraint_to_space(lb_cns, kernel.space)) + .add_constraint(cast_constraint_to_space(ub_cns, kernel.space))) + new_impl_domain = implemented_domain.intersect(loop_slab) + + pf_idx_subst_map = pf_idx_subst_map.copy() + pf_idx_subst_map[pf_iname] = pf_dim_expr + start_index + inner = make_fetch_loop_nest(flnd, pf_iname_idx+1, + pf_dim_exprs+[pf_dim_expr], pf_idx_subst_map, + new_impl_domain) + + return wrap_in(For, + "int %s = 0" % pf_dim_var, + "%s < %s" % (pf_dim_var, ccm(dim_length)), + "++%s" % pf_dim_var, + inner) + + # }}} + + +def generate_prefetch_code(cgs, kernel, sched_index, exec_domain): + implemented_domain = exec_domain.implemented_domain + + from cgen import Statement as S, Line, Comment + + ccm = cgs.c_code_mapper + + # find surrounding schedule items + if sched_index-1 >= 0: + next_outer_sched_item = kernel.schedule[sched_index-1] + else: + next_outer_sched_item = None + + if sched_index+1 < len(kernel.schedule): + next_inner_sched_item = kernel.schedule[sched_index+1] + else: + next_inner_sched_item = None + + scheduled_pf = kernel.schedule[sched_index] + pf = kernel.prefetch[ + scheduled_pf.input_vector, scheduled_pf.index_expr] + + # Prefetch has a good amount of flexibility over what axes it + # uses to accomplish the prefetch. In particular, it can (and should!) + # use all work group dimensions. 
+ + # {{{ determine which loop axes are used to realize the fetch + + # realization_dims is a list of lists of inames, to represent when two dims jointly + # make up one fetch axis + + realization_inames = [None] * len(pf.inames) + + # {{{ first, fix the user-specified fetch dims + + from loopy.kernel import TAG_WORK_ITEM_IDX + knl_work_item_inames = kernel.ordered_inames_by_tag_type(TAG_WORK_ITEM_IDX) + + for realization_dim_idx, loc_fetch_axis_list in \ + getattr(pf, "loc_fetch_axes", {}).iteritems(): + realization_inames[realization_dim_idx] = [knl_work_item_inames.pop(axis) + for axis in loc_fetch_axis_list] + + # }}} + + # {{{ next use the work group dimensions, least-stride dim first + + from loopy.kernel import ImageArg, ScalarArg + from loopy.symbolic import CoefficientCollector + + index_expr = pf.index_expr + if not isinstance(index_expr, tuple): + index_expr = (index_expr,) + + arg = kernel.arg_dict[pf.input_vector] + if isinstance(arg, ImageArg): + # arbitrary + ary_strides = (1, 1, 1)[:arg.dimensions] + else: + ary_strides = arg.strides + if ary_strides is None and len(index_expr) == 1: + ary_strides = (1,) + + iname_to_stride = {} + for iexpr_i, stride in zip(index_expr, ary_strides): + coeffs = CoefficientCollector()(iexpr_i) + for var_name, coeff in coeffs.iteritems(): + if var_name != 1: + new_stride = coeff*stride + old_stride = iname_to_stride.get(var_name, None) + if old_stride is None or new_stride < old_stride: + iname_to_stride[var_name] = new_stride + + approximate_arg_values = dict( + (arg.name, arg.approximately) + for arg in kernel.args + if isinstance(arg, ScalarArg)) + + def stride_key(iname): + iname_stride = iname_to_stride[iname] + + from pymbolic import evaluate + key = evaluate(iname_stride, approximate_arg_values) + assert isinstance(key, int) + return key + + pf_iname_strides = sorted((iname + for dim_idx, iname in enumerate(pf.inames) + if realization_inames[dim_idx] is None), + key=stride_key) + + while knl_work_item_inames and 
pf_iname_strides: + # grab least-stride prefetch dim + least_stride_pf_iname = pf_iname_strides.pop(0) + + # FIXME: It might be good to join multiple things together here + # for size reasons + realization_inames[pf.inames.index(least_stride_pf_iname)] \ + = [knl_work_item_inames.pop(0)] + + if knl_work_item_inames: + # FIXME + from warnings import warn + warn("There were leftover work group dimensions in prefetch " + "assignment. For now, this won't lead to wrong code, " + "but it will lead to unnecessary memory bandwidth use.") + + # }}} + + # }}} + + # {{{ generate fetch code + + from loopy.schedule import get_valid_index_vars + valid_index_vars = get_valid_index_vars(kernel, sched_index, + exclude_tags=(TAG_WORK_ITEM_IDX,)) + + from loopy.symbolic import LoopyCCodeMapper + flnd = FetchLoopNestData(prefetch=pf, + no_prefetch_c_code_mapper= + LoopyCCodeMapper(kernel, no_prefetch=True), + c_code_mapper=ccm, + realization_inames=realization_inames, + kernel=kernel, + valid_index_vars=valid_index_vars) + + fetch_block = make_fetch_loop_nest(flnd, 0, [], {}, implemented_domain) + + # }}} + + new_block = [ + Comment("prefetch %s[%s] using %s" % ( + pf.input_vector, + ", ".join(pf.inames), + ", ".join( + (" x ".join("%s(%s)" % (realiz_iname, kernel.iname_to_tag[realiz_iname]) + for realiz_iname in realiz_inames) + if realiz_inames is not None else "loop") + for realiz_inames in realization_inames))), + Line(), + ] + + # omit head sync primitive if we just came out of a prefetch + + from loopy.prefetch import LocalMemoryPrefetch + if not isinstance(next_outer_sched_item, LocalMemoryPrefetch): + new_block.append(S("barrier(CLK_LOCAL_MEM_FENCE)")) + else: + new_block.append(Comment("next outer schedule item is a prefetch: " + "no sync needed")) + + new_block.extend([ + fetch_block, + ]) + + # omit tail sync primitive if we're headed into another prefetch + if not isinstance(next_inner_sched_item, LocalMemoryPrefetch): + new_block.append(S("barrier(CLK_LOCAL_MEM_FENCE)")) 
+ else: + new_block.append(Line()) + new_block.append(Comment("next inner schedule item is a prefetch: " + "no sync needed")) + + from loopy.codegen.dispatch import build_loop_nest + new_block.extend([Line(), + build_loop_nest(cgs, kernel, sched_index+1, exec_domain)]) + + return gen_code_block(new_block) + +# }}} diff --git a/loopy/compiled.py b/loopy/compiled.py new file mode 100644 index 000000000..5a4422dda --- /dev/null +++ b/loopy/compiled.py @@ -0,0 +1,122 @@ +from __future__ import division + + + + +# {{{ compiled kernel object + +class CompiledKernel: + def __init__(self, context, kernel, size_args=None, options=[], + force_rebuild=False, edit_code=False): + self.kernel = kernel + from loopy.codegen import generate_code + self.code = generate_code(kernel) + + if force_rebuild: + from time import time + self.code = "/* %s */\n%s" % (time(), self.code) + + if edit_code: + from pytools import invoke_editor + self.code = invoke_editor(self.code) + + import pyopencl as cl + try: + self.cl_kernel = getattr( + cl.Program(context, self.code).build(options=options), + kernel.name) + except: + print "[Loopy] ----------------------------------------------------" + print "[Loopy] build failed, here's the source code:" + print "[Loopy] ----------------------------------------------------" + print self.code + print "[Loopy] ----------------------------------------------------" + print "[Loopy] end source code" + print "[Loopy] ----------------------------------------------------" + raise + + from loopy.kernel import ScalarArg + + arg_types = [] + for arg in kernel.args: + if isinstance(arg, ScalarArg): + arg_types.append(arg.dtype) + else: + arg_types.append(None) + + self.cl_kernel.set_scalar_arg_dtypes(arg_types) + + from pymbolic import compile + if size_args is None: + self.size_args = [arg.name for arg in kernel.args if isinstance(arg, ScalarArg)] + else: + self.size_args = size_args + + from loopy.kernel import TAG_GROUP_IDX, TAG_WORK_ITEM_IDX + gsize_expr = 
tuple(self.kernel.tag_type_lengths(TAG_GROUP_IDX))
+        lsize_expr = tuple(self.kernel.tag_type_lengths(TAG_WORK_ITEM_IDX))
+
+        if not gsize_expr: gsize_expr = (1,)
+        if not lsize_expr: lsize_expr = (1,)
+
+        self.global_size_func = compile(
+                gsize_expr, self.size_args)
+        self.local_size_func = compile(
+                lsize_expr, self.size_args)
+
+# }}}
+
+
+
+
+# {{{ timing driver
+
+def drive_timing_run(kernel_generator, queue, launch, flop_count=None,
+        options=[], print_code=True, force_rebuild=False,
+        edit_code=False):
+    """Compile and time each kernel yielded by *kernel_generator*,
+    printing elapsed time (and GFLOPS/s if *flop_count* is given)
+    for every generated schedule.
+    """
+
+    def time_run(compiled_knl, warmup_rounds=2, timing_rounds=5):
+        # NOTE: use the *parameter* compiled_knl throughout--the previous
+        # version read the enclosing loop variable 'compiled' for the size
+        # functions, which only worked by accident of late-bound closures.
+        check = True
+        for i in range(warmup_rounds):
+            launch(compiled_knl.cl_kernel,
+                    compiled_knl.global_size_func, compiled_knl.local_size_func,
+                    check=check)
+            check = False
+
+        events = []
+        for i in range(timing_rounds):
+            events.append(
+                    launch(compiled_knl.cl_kernel,
+                        compiled_knl.global_size_func, compiled_knl.local_size_func,
+                        check=check))
+        for evt in events:
+            evt.wait()
+
+        # average wall time per run, from OpenCL profiling info (nanoseconds)
+        return sum(1e-9*evt.profile.END-1e-9*evt.profile.START for evt in events)/timing_rounds
+
+    soln_count = 0
+    for kernel in kernel_generator:
+
+        compiled = CompiledKernel(queue.context, kernel, options=options,
+                force_rebuild=force_rebuild, edit_code=edit_code)
+
+        print "-----------------------------------------------"
+        print "SOLUTION #%d" % soln_count
+        print "-----------------------------------------------"
+        if print_code:
+            print compiled.code
+            print "-----------------------------------------------"
+
+        elapsed = time_run(compiled)
+
+        print "time: %f" % elapsed
+        if flop_count is not None:
+            print "gflops/s: %f (#%d)" % (
+                    flop_count/elapsed/1e9, soln_count)
+        print "-----------------------------------------------"
+
+        soln_count += 1
+
+    print "%d solutions" % soln_count
+
+# }}}
diff --git a/loopy/isl.py b/loopy/isl.py
new file mode 100644
index 000000000..c232e71c6
--- /dev/null
+++ b/loopy/isl.py
@@ -0,0 +1,260 @@
+from __future__ import division
+
+import islpy as isl
+from islpy import dim_type
+
+
+
+
+# {{{ 
expression -> constraint conversion
+
+def _constraint_from_expr(space, expr, constraint_factory):
+    from loopy.symbolic import CoefficientCollector
+    return constraint_factory(space,
+            CoefficientCollector()(expr))
+
+def eq_constraint_from_expr(space, expr):
+    return _constraint_from_expr(
+            space, expr, isl.Constraint.eq_from_names)
+
+def ineq_constraint_from_expr(space, expr):
+    return _constraint_from_expr(
+            space, expr, isl.Constraint.ineq_from_names)
+
+# }}}
+
+
+# {{{ isl helpers
+
+def get_bounds_constraints(bset, iname, space=None, admissible_vars=None):
+    """Partition the constraints of *bset* that involve *iname* into
+    (lower, upper, equality) lists of constraints. If *admissible_vars*
+    is not None, constraints mentioning variables outside that collection
+    are skipped.
+    """
+    if isinstance(bset, isl.Set):
+        bset, = bset.get_basic_sets()
+
+    constraints = bset.get_constraints()
+
+    # Only coerce to a set when a collection was actually passed:
+    # set(None) raises TypeError, and None means "no restriction"
+    # in the filter below.
+    if admissible_vars is not None and not isinstance(admissible_vars, set):
+        admissible_vars = set(admissible_vars)
+
+    lower = []
+    upper = []
+    equality = []
+
+    if space is None:
+        space = bset.get_dim()
+
+    var_dict = space.get_var_dict()
+    iname_tp, iname_idx = var_dict[iname]
+
+    for cns in constraints:
+        iname_coeff = int(cns.get_coefficient(iname_tp, iname_idx))
+
+        if admissible_vars is not None:
+            if not (set(cns.get_coefficients_by_name().iterkeys())
+                    <= admissible_vars):
+                continue
+
+        if iname_coeff == 0:
+            continue
+
+        if cns.is_equality():
+            equality.append(cns)
+        elif iname_coeff < 0:
+            upper.append(cns)
+        else: # iname_coeff > 0
+            lower.append(cns)
+
+    return lower, upper, equality
+
+
+def get_projected_bounds_constraints(set, iname):
+    """Get an overapproximation of the loop bounds for the variable *iname*,
+    as constraints.
+    """
+
+    # project out every variable except iname
+    projected_domain = isl.project_out_except(set, [iname], [dim_type.set])
+
+    basic_sets = projected_domain.get_basic_sets()
+
+    # FIXME perhaps use some form of hull here if there's more than one
+    # basic set?
+    
+ bset, = basic_sets + + # Python-style, half-open bounds + upper_bounds = [] + lower_bounds = [] + bset = bset.remove_divs() + + bset_iname_dim_type, bset_iname_idx = bset.get_dim().get_var_dict()[iname] + + def examine_constraint(cns): + assert not cns.is_equality() + assert not cns.is_div_constraint() + + coeffs = cns.get_coefficients_by_name() + + iname_coeff = int(coeffs.get(iname, 0)) + if iname_coeff == 0: + return + elif iname_coeff < 0: + upper_bounds.append(cns) + else: # iname_coeff > 0: + lower_bounds.append(cns) + + bset.foreach_constraint(examine_constraint) + + lb, = lower_bounds + ub, = upper_bounds + + return lb, ub + + + + +def solve_constraint_for_bound(cns, iname): + rhs, iname_coeff = constraint_to_expr(cns, except_name=iname) + + if iname_coeff == 0: + raise ValueError("cannot solve constraint for '%s'--" + "constraint does not contain variable" + % iname) + + from pymbolic import expand + from pytools import div_ceil + from pymbolic import flatten + from pymbolic.mapper.constant_folder import CommutativeConstantFoldingMapper + cfm = CommutativeConstantFoldingMapper() + + if iname_coeff > 0 or cns.is_equality(): + if cns.is_equality(): + kind = "==" + else: + kind = ">=" + + return kind, cfm(flatten(div_ceil(expand(-rhs), iname_coeff))) + else: # iname_coeff < 0 + from pytools import div_ceil + return "<", cfm(flatten(div_ceil(rhs+1, -iname_coeff))) + + + + +def get_projected_bounds(set, iname): + """Get an overapproximation of the loop bounds for the variable *iname*, + as actual bounds. 
+ """ + + lb_cns, ub_cns = get_projected_bounds_constraints(set, iname) + + for cns in [lb_cns, ub_cns]: + iname_tp, iname_idx = lb_cns.get_dim().get_var_dict()[iname] + iname_coeff = cns.get_coefficient(iname_tp, iname_idx) + + if iname_coeff == 0: + continue + + kind, bound = solve_constraint_for_bound(cns, iname) + if kind == "<": + ub = bound + elif kind == ">=": + lb = bound + else: + raise ValueError("unsupported constraint kind") + + return lb, ub + +def cast_constraint_to_space(cns, new_space, as_equality=None): + if as_equality is None: + as_equality = cns.is_equality() + + if as_equality: + factory = isl.Constraint.eq_from_names + else: + factory = isl.Constraint.ineq_from_names + return factory(new_space, cns.get_coefficients_by_name()) + +def block_shift_constraint(cns, iname, multiple, as_equality=None): + cns = copy_constraint(cns, as_equality=as_equality) + cns = cns.set_constant(cns.get_constant() + + cns.get_coefficients_by_name()[iname]*multiple) + return cns + +def negate_constraint(cns): + assert not cns.is_equality() + # FIXME hackety hack + my_set = (isl.BasicSet.universe(cns.get_dim()) + .add_constraint(cns)) + my_set = my_set.complement() + + results = [] + def examine_basic_set(s): + s.foreach_constraint(results.append) + my_set.foreach_basic_set(examine_basic_set) + result, = results + return result + +def copy_constraint(cns, as_equality=None): + return cast_constraint_to_space(cns, cns.get_dim(), + as_equality=as_equality) + +def get_dim_bounds(set, inames): + vars = set.get_dim().get_var_dict(dim_type.set).keys() + return [get_projected_bounds(set, v) for v in inames] + +def count_box_from_bounds(bounds): + from pytools import product + return product(stop-start for start, stop in bounds) + +def make_index_map(set, index_expr): + if not isinstance(index_expr, tuple): + index_expr = (index_expr,) + + amap = isl.Map.from_domain(set).add_dims(dim_type.out, len(index_expr)) + out_names = ["_ary_idx_%d" % i for i in range(len(index_expr))] + 
+ dim = amap.get_dim() + all_constraints = tuple( + eq_constraint_from_expr(dim, iexpr_i) + for iexpr_i in index_expr) + + for i, out_name in enumerate(out_names): + amap = amap.set_dim_name(dim_type.out, i, out_name) + + for i, (out_name, constr) in enumerate(zip(out_names, all_constraints)): + constr.set_coefficients_by_name({out_name: -1}) + amap = amap.add_constraint(constr) + + return amap + +def make_slab(space, iname, start, stop): + from pymbolic import var + var_iname = var(iname) + return (isl.Set.universe(space) + # start <= inner + .add_constraint(ineq_constraint_from_expr( + space, var_iname -start)) + # inner < stop + .add_constraint(ineq_constraint_from_expr( + space, stop-1 - var_iname))) + +def constraint_to_expr(cns, except_name=None): + excepted_coeff = 0 + result = 0 + from pymbolic import var + for var_name, coeff in cns.get_coefficients_by_name().iteritems(): + if isinstance(var_name, str): + if var_name == except_name: + excepted_coeff = int(coeff) + else: + result += int(coeff)*var(var_name) + else: + result += int(coeff) + + if except_name is not None: + return result, excepted_coeff + else: + return result + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/kernel.py b/loopy/kernel.py new file mode 100644 index 000000000..854c0e692 --- /dev/null +++ b/loopy/kernel.py @@ -0,0 +1,475 @@ +"""Elements of loopy's user-facing language.""" + +from __future__ import division + +import numpy as np +from pytools import Record, memoize_method + +from pymbolic.mapper.dependency import DependencyMapper + + + +# {{{ arguments + +class ArrayArg: + def __init__(self, name, dtype, strides=None, shape=None, order="C", + offset=0): + """ + All of the following are optional. Specify either strides or shape. + + :arg strides: like numpy strides, but in multiples of + data type size + :arg shape: + :arg order: + :arg offset: Offset from the beginning of the vector from which + the strides are counted. 
+ """ + self.name = name + self.dtype = np.dtype(dtype) + + if strides is not None and shape is not None: + raise ValueError("can only specify one of shape and strides") + + if strides is not None: + strides = tuple(strides) + + if shape is not None: + from pyopencl.compyte.array import ( + f_contiguous_strides, + c_contiguous_strides) + + if order == "F": + strides = f_contiguous_strides(1, shape) + elif order == "C": + strides = c_contiguous_strides(1, shape) + else: + raise ValueError("invalid order: %s" % order) + + self.strides = strides + self.offset = offset + + def __repr__(self): + return "<ArrayArg '%s' of type %s>" % (self.name, self.dtype) + + + +class ImageArg: + def __init__(self, name, dtype, dimensions): + self.name = name + self.dtype = np.dtype(dtype) + self.dimensions = dimensions + + def __repr__(self): + return "<ImageArg '%s' of type %s>" % (self.name, self.dtype) + + +class ScalarArg: + def __init__(self, name, dtype, approximately): + self.name = name + self.dtype = np.dtype(dtype) + self.approximately = approximately + + def __repr__(self): + return "<ScalarArg '%s' of type %s>" % (self.name, self.dtype) + +# }}} + +# {{{ index tags + +class IndexTag(Record): + __slots__ = [] + + def __hash__(self): + raise RuntimeError("use .key to hash index tags") + + @property + def key(self): + return type(self) + + + +class ParallelTag(IndexTag): + pass + +class AxisParallelTag(ParallelTag): + __slots__ = ["axis", "forced_length"] + + def __init__(self, axis, forced_length=None): + Record.__init__(self, + axis=axis, forced_length=forced_length) + + @property + def key(self): + return (type(self), self.axis) + + def __repr__(self): + if self.forced_length: + return "%s(%d, flen=%d)" % ( + self.print_name, self.axis, + self.forced_length) + else: + return "%s(%d)" % ( + self.print_name, self.axis) + +class TAG_GROUP_IDX(AxisParallelTag): + print_name = "GROUP_IDX" + +class TAG_WORK_ITEM_IDX(AxisParallelTag): + print_name = "WORK_ITEM_IDX" + +class 
TAG_ILP(ParallelTag): + def __repr__(self): + return "TAG_ILP" + +class BaseUnrollTag(IndexTag): + pass + +class TAG_UNROLL_STATIC(BaseUnrollTag): + def __repr__(self): + return "TAG_UNROLL_STATIC" + +class TAG_UNROLL_INCR(BaseUnrollTag): + def __repr__(self): + return "TAG_UNROLL_INCR" + +def parse_tag(tag): + if tag is None: + return tag + + if isinstance(tag, IndexTag): + return tag + + if not isinstance(tag, str): + raise ValueError("cannot parse tag: %s" % tag) + + if tag in ["unrs", "unr"]: + return TAG_UNROLL_STATIC() + elif tag == "unri": + return TAG_UNROLL_INCR() + elif tag == "ilp": + return TAG_ILP() + elif tag.startswith("g."): + return TAG_GROUP_IDX(int(tag[2:])) + elif tag.startswith("l."): + return TAG_WORK_ITEM_IDX(int(tag[2:])) + else: + raise ValueError("cannot parse tag: %s" % tag) + +# }}} + +# {{{ loop kernel object + +class LoopKernel(Record): + """ + :ivar device: :class:`pyopencl.Device` + :ivar domain: :class:`islpy.BasicSet` + :ivar iname_to_tag: + :ivar instructions: + :ivar args: + :ivar prefetch: + :ivar schedule: + :ivar register_prefetch: + :ivar name: + :ivar preamble: + """ + + def __init__(self, device, domain, instructions, args=None, prefetch={}, schedule=None, + register_prefetch=None, name="loopy_kernel", + iname_to_tag={}, is_divisible=lambda dividend, divisor: False, + preamble=None): + """ + :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl. 
+ Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}" + :arg iname_to_tag: a map from loop domain variables to subclasses of :class:`IndexTag` + """ + from pymbolic import parse + + def parse_if_necessary(v): + if isinstance(v, str): + return parse(v) + else: + return v + + if isinstance(domain, str): + import islpy as isl + ctx = isl.Context() + domain = isl.Set.read_from_str(ctx, domain, nparam=-1) + + insns = [ + (parse_if_necessary(lvalue), + parse_if_necessary(expr)) + for lvalue, expr in instructions] + + Record.__init__(self, + device=device, args=args, domain=domain, instructions=insns, + prefetch=prefetch, schedule=schedule, + register_prefetch=register_prefetch, name=name, + iname_to_tag=iname_to_tag, + is_divisible=is_divisible, + preamble=preamble) + + # FIXME: assert empyt intersection of loop vars and args + # FIXME: assert that isl knows about loop parameters + + @property + @memoize_method + def space(self): + return self.domain.get_dim() + + @property + @memoize_method + def tag_key_to_iname(self): + return dict( + (tag.key, iname) + for iname, tag in self.iname_to_tag.iteritems()) + + @property + @memoize_method + def arg_dict(self): + return dict((arg.name, arg) for arg in self.args) + + @memoize_method + def scalar_args(self): + if self.args is None: + return set() + else: + return set(arg.name for arg in self.args if isinstance(arg, ScalarArg)) + + @memoize_method + def all_inames(self): + from islpy import dim_type + return set(self.space.get_var_dict(dim_type.set).iterkeys()) + + @memoize_method + def output_inames(self): + dm = DependencyMapper(include_subscripts=False) + + output_indices = set() + for lvalue, expr in self.instructions: + output_indices.update( + set(v.name for v in dm(lvalue)) + & self.all_inames()) + + return output_indices - set(arg.name for arg in self.args) + + @memoize_method + def reduction_inames(self): + return self.all_inames() - self.output_inames() + + def inames_by_tag_type(self, tag_type): + return [iname for iname in 
self.all_inames() + if isinstance(self.iname_to_tag.get(iname), tag_type)] + + def ordered_inames_by_tag_type(self, tag_type): + result = [] + from itertools import count + for i in count(): + try: + dim = self.tag_key_to_iname[tag_type(i).key] + except KeyError: + return result + else: + result.append(dim) + + @memoize_method + def get_projected_bounds_constraints(self, iname): + """Get an overapproximation of the loop bounds for the variable *iname*.""" + + from loopy.isl import get_projected_bounds_constraints + return get_projected_bounds_constraints(self.domain, iname) + + @memoize_method + def get_projected_bounds(self, iname): + """Get an overapproximation of the loop bounds for the variable *iname*.""" + + from loopy.isl import get_projected_bounds + return get_projected_bounds(self.domain, iname) + + def tag_type_lengths(self, tag_cls): + def get_length(iname): + tag = self.iname_to_tag[iname] + if tag.forced_length is not None: + return tag.forced_length + + start, stop = self.get_projected_bounds(iname) + return stop-start + + return [get_length(iname) + for iname in self.ordered_inames_by_tag_type(tag_cls)] + + def tag_or_iname_to_iname(self, s): + try: + tag = parse_tag(s) + except ValueError: + pass + else: + return self.tag_key_to_iname[tag.key] + + if s not in self.all_inames(): + raise RuntimeError("invalid index name '%s'" % s) + + return s + + def local_mem_use(self): + return sum(pf.nbytes for pf in self.prefetch.itervalues()) + + @memoize_method + def input_vectors(self): + dm = DependencyMapper(include_subscripts=False) + + input_vectors = set() + for lvalue, expr in self.instructions: + input_vectors.update( + set(v.name for v in dm(expr))) + + return input_vectors - self.all_inames() - self.scalar_args() + + @memoize_method + def output_vectors(self): + dm = DependencyMapper(include_subscripts=False) + + output_vectors = set() + for lvalue, expr in self.instructions: + output_vectors.update( + set(v.name for v in dm(lvalue))) + + return 
output_vectors - self.all_inames() - self.scalar_args() + + def _subst_insns(self, old_var, new_expr): + from pymbolic.mapper.substitutor import substitute + + subst_map = {old_var: new_expr} + + return [(substitute(lvalue, subst_map), + substitute(expr, subst_map)) + for lvalue, expr in self.instructions] + + def is_prefetch_variable(self, varname): + if self.prefetch: + for pf in self.prefetch.itervalues(): + for pfdim in pf.dims: + if pfdim.name == varname: + return True + + return False + + def _subst_prefetch(self, old_var, new_expr): + from pymbolic.mapper.substitutor import substitute + subst_map = {old_var: new_expr} + + result = {} + for pf in self.prefetch.itervalues(): + for pfdim in pf.dims: + if pfdim.name == old_var: + raise RuntimeError("can't substitute prefetch dimension %s" + % old_var) + + new_pf = pf.copy(index_expr=substitute(pf.index_expr, subst_map)) + result[pf.input_vector, new_pf.index_expr] = new_pf + + return result + + def substitute(self, old_var, new_expr): + copy = self.copy(instructions=self._subst_insns(old_var, new_expr)) + if self.prefetch: + copy.prefetch = self._subst_prefetch(old_var, new_expr) + + if self.schedule is not None: + raise RuntimeError("cannot substitute-schedule already generated") + + return copy + + def split_dimension(self, name, inner_length, padded_length=None, + outer_name=None, inner_name=None, + outer_tag=None, inner_tag=None): + + outer_tag = parse_tag(outer_tag) + inner_tag = parse_tag(inner_tag) + + if self.iname_to_tag.get(name) is not None: + raise RuntimeError("cannot split tagged dimension '%s'" % name) + + # {{{ check for repeated tag keys + + new_tag_keys = set(tag.key + for tag in [outer_tag, inner_tag] + if tag is not None) + + repeated_tag_keys = new_tag_keys & set( + tag.key for tag in + self.iname_to_tag.itervalues()) + + if repeated_tag_keys: + raise RuntimeError("repeated tag(s): %s" % repeated_tag_keys) + + # }}} + + if padded_length is not None: + inner_tag = 
inner_tag.copy(forced_length=padded_length)
+
+        if outer_name is None:
+            outer_name = name+"_outer"
+        if inner_name is None:
+            inner_name = name+"_inner"
+
+        new_iname_to_tag = self.iname_to_tag.copy()
+        if inner_tag is not None:
+            new_iname_to_tag[inner_name] = inner_tag
+        if outer_tag is not None:
+            new_iname_to_tag[outer_name] = outer_tag
+
+        from islpy import dim_type
+        outer_var_nr = self.space.size(dim_type.set)
+        inner_var_nr = self.space.size(dim_type.set)+1
+        new_domain = self.domain.add_dims(dim_type.set, 2)
+        new_domain.set_dim_name(dim_type.set, outer_var_nr, outer_name)
+        new_domain.set_dim_name(dim_type.set, inner_var_nr, inner_name)
+
+        import islpy as isl
+        from loopy.isl import make_slab
+
+        space = new_domain.get_dim()
+        inner_constraint_set = (
+                make_slab(space, inner_name, 0, inner_length)
+                # name = inner + length*outer
+                .add_constraint(isl.Constraint.eq_from_names(
+                    space, {name:1, inner_name: -1, outer_name:-inner_length})))
+
+        name_dim_type, name_idx = space.get_var_dict()[name]
+        new_domain = (new_domain
+                .intersect(inner_constraint_set)
+                .project_out(name_dim_type, name_idx, 1))
+
+        from pymbolic import var
+        inner = var(inner_name)
+        outer = var(outer_name)
+        new_loop_index = inner + outer*inner_length
+
+        return (self
+                .substitute(name, new_loop_index)
+                .copy(domain=new_domain, iname_to_tag=new_iname_to_tag))
+
+    def get_invalid_reason(self):
+        """Return a string describing why this kernel cannot run on
+        self.device, or None if it appears valid."""
+        glens = self.tag_type_lengths(TAG_GROUP_IDX)
+        llens = self.tag_type_lengths(TAG_WORK_ITEM_IDX)
+        if (max(len(glens), len(llens))
+                > self.device.max_work_item_dimensions):
+            return "too many work item dimensions"
+
+        for i in range(len(llens)):
+            if llens[i] > self.device.max_work_item_sizes[i]:
+                # was missing the '% i' argument, returning the raw
+                # format string instead of the axis number
+                return "group axis %d too big" % i
+
+        from pytools import product
+        if product(llens) > self.device.max_work_group_size:
+            return "work group too big"
+
+        from pyopencl.characterize import usable_local_mem_size
+        if self.local_mem_use() > usable_local_mem_size(self.device):
+            return "using too much 
local memory" + + return None + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/prefetch.py b/loopy/prefetch.py new file mode 100644 index 000000000..9e2bef57b --- /dev/null +++ b/loopy/prefetch.py @@ -0,0 +1,143 @@ +from __future__ import division + +from pytools import Record, memoize_method +import islpy as isl +from islpy import dim_type + + + + +# {{{ register prefetches + +class RegisterPrefetch(Record): + __slots__ = ["subscript_expr", "new_name"] + +def insert_register_prefetches(kernel): + reg_pf = {} + + total_loop_count = len(kernel.all_inames()) + known_vars = set() + + unused_index_exprs = set() + from loopy.symbolic import AllSubscriptExpressionCollector + asec = AllSubscriptExpressionCollector() + + from pymbolic.mapper.dependency import DependencyMapper + + for tgt, expr in kernel.instructions: + unused_index_exprs |= asec(expr) + unused_index_exprs = [ + (iexpr, set(v.name for v in DependencyMapper()(iexpr.index))) + for iexpr in unused_index_exprs] + + schedule = kernel.schedule[:] + + from loopy.schedule import ScheduledLoop + + sched_index = 0 + loop_count = 0 + while sched_index < len(schedule): + sched_item = schedule[sched_index] + if isinstance(sched_item, ScheduledLoop): + known_vars.add(sched_item.iname) + loop_count += 1 + sched_index += 1 + + if loop_count < total_loop_count: + i = 0 + while i < len(unused_index_exprs): + iexpr, index_deps = unused_index_exprs[i] + if (index_deps <= known_vars + and (iexpr.aggregate.name, iexpr.index) + not in kernel.prefetch): + unused_index_exprs.pop(i) + new_name = "reg_prefetch_"+iexpr.aggregate.name+str(sched_index) + reg_pf[iexpr] = new_name + schedule.insert(sched_index, + RegisterPrefetch( + index_expr=iexpr, new_name=new_name)) + sched_index += 1 + else: + i += 1 + + return kernel.copy(schedule=schedule, register_prefetch=reg_pf) + +# }}} + +# {{{ local-mem prefetch-related + +class LocalMemoryPrefetch(Record): + """ + Attributes: + :ivar kernel: + :ivar input_vector: A string indicating 
the input vector variable name. + :ivar index_expr: An expression identifying the access which this prefetch + serves. + :ivar inames: A sequence of inames (i.e. loop dimensions) identifying which + part of the input vector, given the index_expr, should be prefetched. + :ivar loc_fetch_axes: dictionary from integers 0..len(inames) to lists of + local index axes which should be used to realize that dimension of the + prefetch. The last dimension in this list is used as the fastest-changing + one. + :ivar name: the variable name used for the prefetch + :ivar dim_storage_lengths: a sequence of integers indicating the size of + the storage for each dimension. It may may differ from the size of the + actual loop dimensions to mitigate bank conflicts. + + The latter two values are only assigned during code generation. + """ + + @property + @memoize_method + def domain(self): + return (isl.project_out_except(self.kernel.domain, self.inames, [dim_type.set]) + .remove_divs()) + + @property + @memoize_method + def index_map(self): + from loopy.isl import make_index_map + imap = make_index_map(self.kernel_domain, self.index_expr) + assert imap.is_bijective() + return imap + + @property + @memoize_method + def restricted_index_map(self): + return self.index_map.intersect_domain(self.domain) + + @property + @memoize_method + def dim_bounds(self): + from loopy.isl import get_dim_bounds + return get_dim_bounds(self.domain, self.inames) + + @property + def itemsize(self): + return self.kernel.arg_dict[self.input_vector].dtype.itemsize + @property + @memoize_method + def nbytes(self): + from loopy.isl import count_box_from_bounds + return self.itemsize * count_box_from_bounds(self.dim_bounds) + + @memoize_method + def free_variables(self): + from pymbolic.mapper.dependency import DependencyMapper + return set(var.name + for var in DependencyMapper()(self.index_expr) + ) - set(self.inames) - self.kernel.scalar_args() + + def hash(self): + return (hash(type(self)) ^ 
hash(self.input_vector) + ^ hash(self.index_expr)) + + def __eq__(self, other): + # in particular, dim_storage_lengths should not factor into equality + return (type(self) == type(other) + and self.input_vector == other.input_vector + and self.index_expr == other.index_expr) + +# }}} + +# vim: foldmethod=marker diff --git a/loopy/schedule.py b/loopy/schedule.py new file mode 100644 index 000000000..1966b390b --- /dev/null +++ b/loopy/schedule.py @@ -0,0 +1,165 @@ +from __future__ import division + +from pytools import Record + + + + +# {{{ loop scheduling + +# {{{ schedule items + +class ScheduledLoop(Record): + __slots__ = ["iname"] + +class WriteOutput(Record): + pass + +# plus loopy.prefetch.{Register,LocalMemory}Prefetch + +# }}} + +def generate_loop_schedules(kernel, hints=[]): + from loopy.kernel import TAG_GROUP_IDX, TAG_WORK_ITEM_IDX, TAG_ILP, ParallelTag + + prev_schedule = kernel.schedule + if prev_schedule is None: + prev_schedule = [ + ScheduledLoop(iname=iname) + for iname in ( + kernel.ordered_inames_by_tag_type(TAG_GROUP_IDX) + + kernel.ordered_inames_by_tag_type(TAG_WORK_ITEM_IDX))] + + scheduled_inames = set(sch_item.iname + for sch_item in prev_schedule + if isinstance(sch_item, ScheduledLoop)) + + # have a schedulable prefetch? load, schedule it + had_usable_prefetch = False + locally_parallel_inames = set( + iname for iname in scheduled_inames + if isinstance(kernel.iname_to_tag.get(iname), + (TAG_ILP, TAG_WORK_ITEM_IDX))) + + for pf in kernel.prefetch.itervalues(): + # already scheduled? never mind then. + if pf in prev_schedule: + continue + + # a free variable not known yet? then we're not ready + if not pf.free_variables() <= scheduled_inames: + continue + + # a prefetch variable already scheduled, but not borrowable? + # (only work item index variables are borrowable) + + if set(pf.inames) & (scheduled_inames - locally_parallel_inames): + # dead end: we won't be able to schedule this prefetch + # in this branch. 
at least one of its loop dimensions + # was already scheduled, and that dimension is not + # borrowable. + + #print "UNSCHEDULABLE", kernel.schedule + return + + new_kernel = kernel.copy(schedule=prev_schedule+[pf]) + for knl in generate_loop_schedules(new_kernel): + had_usable_prefetch = True + yield knl + + if had_usable_prefetch: + # because we've already recursed + return + + # Build set of potentially schedulable variables + # Don't re-schedule already scheduled variables + schedulable = kernel.all_inames() - scheduled_inames + + # Schedule in the following order: + # - serial output inames + # - remaining parallel output inames (i.e. ILP) + # - output write + # - reduction + # Don't schedule reduction variables until all output + # variables are taken care of. Once they are, schedule + # output writing. + parallel_output_inames = set(oin for oin in kernel.output_inames() + if isinstance(kernel.iname_to_tag.get(oin), ParallelTag)) + + serial_output_inames = kernel.output_inames() - parallel_output_inames + + if schedulable & serial_output_inames: + schedulable = schedulable & serial_output_inames + + if schedulable & parallel_output_inames: + schedulable = schedulable & parallel_output_inames + + if kernel.output_inames() <= scheduled_inames: + if not any(isinstance(sch_item, WriteOutput) + for sch_item in prev_schedule): + kernel = kernel.copy( + schedule=prev_schedule + [WriteOutput()]) + prev_schedule = kernel.schedule + + # Don't schedule variables that are prefetch axes + # for not-yet-scheduled prefetches. + unsched_prefetch_axes = set(iname + for pf in kernel.prefetch.itervalues() + if pf not in prev_schedule + for iname in pf.inames + if not isinstance(kernel.iname_to_tag.get(iname), ParallelTag)) + schedulable -= unsched_prefetch_axes + + while hints and hints[0] in scheduled_inames: + hints = hints[1:] + + if hints and hints[0] in schedulable: + schedulable = set(hints[0]) + + if schedulable: + # have a schedulable variable? 
schedule a loop for it, recurse + for iname in schedulable: + new_kernel = kernel.copy(schedule=prev_schedule+[ScheduledLoop(iname=iname)]) + for knl in generate_loop_schedules(new_kernel, hints): + yield knl + else: + # all loop dimensions and prefetches scheduled? + # great! yield the finished product if it is complete + + from loopy import LoopyAdvisory + + if hints: + from warnings import warn + warn("leftover schedule hints: "+ (", ".join(hints)), + LoopyAdvisory) + + all_inames_scheduled = len(scheduled_inames) == len(kernel.all_inames()) + + from loopy.prefetch import LocalMemoryPrefetch + all_pf_scheduled = len(set(sch_item for sch_item in prev_schedule + if isinstance(sch_item, LocalMemoryPrefetch))) == len(kernel.prefetch) + output_scheduled = len(set(sch_item for sch_item in prev_schedule + if isinstance(sch_item, WriteOutput))) == 1 + + if all_inames_scheduled and all_pf_scheduled and output_scheduled: + yield kernel + +# }}} + + + + +def get_valid_index_vars(kernel, sched_index, exclude_tags=()): + """ + :param exclude_tags: a tuple of tag classes to exclude + """ + return [ + sched_item.iname + for sched_item in kernel.schedule[:sched_index] + if isinstance(sched_item, ScheduledLoop) + if not isinstance(kernel.iname_to_tag.get(sched_item.iname), exclude_tags)] + + + + +# vim: foldmethod=marker diff --git a/loopy/symbolic.py b/loopy/symbolic.py new file mode 100644 index 000000000..d318316a7 --- /dev/null +++ b/loopy/symbolic.py @@ -0,0 +1,204 @@ +"""Pymbolic mappers for loopy.""" + +from __future__ import division + +from pymbolic.mapper import CombineMapper, RecursiveMapper +from pymbolic.mapper.c_code import CCodeMapper +from pymbolic.mapper.stringifier import PREC_NONE +import numpy as np + + + + +# {{{ subscript expression collector + +class AllSubscriptExpressionCollector(CombineMapper): + def combine(self, values): + from pytools import flatten + return set(flatten(values)) + + def map_constant(self, expr): + return set() + + def 
map_algebraic_leaf(self, expr): + return set() + + def map_subscript(self, expr): + from pymbolic.primitives import Variable + assert isinstance(expr.aggregate, Variable) + + return set([expr]) + +# }}} + +# {{{ coefficient collector + +class CoefficientCollector(RecursiveMapper): + def map_sum(self, expr): + stride_dicts = [self.rec(ch) for ch in expr.children] + + result = {} + for stride_dict in stride_dicts: + for var, stride in stride_dict.iteritems(): + if var in result: + result[var] += stride + else: + result[var] = stride + + return result + + def map_product(self, expr): + result = {} + + children_coeffs = [self.rec(child) for child in expr.children] + + idx_of_child_with_vars = None + for i, child_coeffs in enumerate(children_coeffs): + for k in child_coeffs: + if isinstance(k, str): + if (idx_of_child_with_vars is not None + and idx_of_child_with_vars != i): + raise RuntimeError( + "nonlinear expression") + idx_of_child_with_vars = i + + other_coeffs = 1 + for i, child_coeffs in enumerate(children_coeffs): + if i != idx_of_child_with_vars: + assert len(child_coeffs) == 1 + other_coeffs *= child_coeffs[1] + + if idx_of_child_with_vars is None: + return {1: other_coeffs} + else: + return dict( + (var, other_coeffs*coeff) + for var, coeff in + children_coeffs[idx_of_child_with_vars].iteritems()) + + return result + + def map_constant(self, expr): + return {1: expr} + + def map_variable(self, expr): + return {expr.name: 1} + + def map_subscript(self, expr): + raise RuntimeError("cannot gather coefficients--indirect addressing in use") + +# }}} + +# {{{ variable index expression collector + +class VariableIndexExpressionCollector(CombineMapper): + def __init__(self, tgt_vector_name): + self.tgt_vector_name = tgt_vector_name + + def combine(self, values): + from pytools import flatten + return set(flatten(values)) + + def map_constant(self, expr): + return set() + + def map_algebraic_leaf(self, expr): + return set() + + def map_subscript(self, expr): + from 
pymbolic.primitives import Variable + assert isinstance(expr.aggregate, Variable) + + if expr.aggregate.name == self.tgt_vector_name: + return set([expr.index]) + else: + return CombineMapper.map_subscript(self, expr) + +# }}} + +# {{{ C code mapper + +class LoopyCCodeMapper(CCodeMapper): + def __init__(self, kernel, no_prefetch=False): + def constant_mapper(c): + if isinstance(c, float): + # FIXME: type-variable + return "%sf" % repr(c) + else: + return repr(c) + + CCodeMapper.__init__(self, constant_mapper=constant_mapper) + self.kernel = kernel + + self.no_prefetch = no_prefetch + + def map_subscript(self, expr, enclosing_prec): + from pymbolic.primitives import Variable + if (not self.no_prefetch + and isinstance(expr.aggregate, Variable) + and expr.aggregate.name in self.kernel.input_vectors()): + try: + pf = self.kernel.prefetch[expr.aggregate.name, expr.index] + except KeyError: + pass + else: + from pymbolic.mapper.stringifier import PREC_SUM + return pf.name+"".join( + "[%s - %s]" % (iname, self.rec( + self.kernel.get_projected_bounds(iname)[0], + PREC_SUM)) + for iname in pf.inames) + + if isinstance(expr.aggregate, Variable): + arg = self.kernel.arg_dict[expr.aggregate.name] + + from loopy.kernel import ImageArg + if isinstance(arg, ImageArg): + assert isinstance(expr.index, tuple) + + base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))" + % (arg.name, arg.dimensions, + ", ".join(self.rec(idx, PREC_NONE) + for idx in expr.index[::-1]))) + + if arg.dtype == np.float32: + return base_access+".x" + elif arg.dtype == np.float64: + return "as_double(%s.xy)" % base_access + else: + raise NotImplementedError( + "non-floating-point images not supported for now") + + else: + # ArrayArg + index_expr = expr.index + if isinstance(expr.index, tuple): + ary_strides = arg.strides + if ary_strides is None: + raise RuntimeError("tuple-indexed variable '%s' does not " + "have stride information" % expr.aggregate.name) + else: + ary_strides = (1,) + index_expr = 
(index_expr,) + + from pymbolic.primitives import Subscript + return CCodeMapper.map_subscript(self, + Subscript(expr.aggregate, arg.offset+sum( + stride*expr_i for stride, expr_i in zip( + ary_strides, index_expr))), enclosing_prec) + + return CCodeMapper.map_subscript(self, expr, enclosing_prec) + + def map_floor_div(self, expr, prec): + if isinstance(expr.denominator, int) and expr.denominator > 0: + return ("int_floor_div_pos_b(%s, %s)" + % (self.rec(expr.numerator, PREC_NONE), + expr.denominator)) + else: + return ("int_floor_div(%s, %s)" + % (self.rec(expr.numerator, PREC_NONE), + self.rec(expr.denominator, PREC_NONE))) + +# }}} + +# vim: foldmethod=marker diff --git a/examples/matrix-ops.py b/test/test_matmul.py similarity index 100% rename from examples/matrix-ops.py rename to test/test_matmul.py -- GitLab