diff --git a/MEMO b/MEMO index dc138572753c81fdc334213c740cf2e78b9777b7..89b67986351d093a7e7dff01479ce72e1df781ae 100644 --- a/MEMO +++ b/MEMO @@ -41,11 +41,11 @@ Things to consider To-do ^^^^^ +- Clean up loopy.kernel. + - Group instructions by dependency/inames for scheduling, to increase sched. scalability -- Multi-domain - - Kernel splitting (via what variables get computed in a kernel) - What if no universally valid precompute base index expression is found? @@ -53,17 +53,10 @@ To-do - Add dependencies after the fact -- Expose iname-duplicate-and-rename as a primitive. - -- Allow parameters to be varying during run-time, substituting values - that depend on other inames? - - Fix all tests - Scalar insn priority -- What to do about constants in codegen? (...f suffix, complex types) - - If finding a maximum proves troublesome, move parameters into the domain - : (as in, Matlab full-slice) in prefetches @@ -73,6 +66,8 @@ To-do Future ideas ^^^^^^^^^^^^ +- Expose iname-duplicate-and-rename as a primitive. + - Array language - reg rolling @@ -120,6 +115,21 @@ Future ideas Dealt with ^^^^^^^^^^ +- What to do about constants in codegen? (...f suffix, complex types) + -> dealt with by type contexts + +- relating to Multi-Domain [DONE] + - Reenable codegen sanity check. [DONE] + + - Incorporate loop-bound-mediated iname dependencies into domain + parenthood. [DONE] + + - Make sure that variables that enter into loop bounds are only written + exactly once. [DONE] + + - Make sure that loop bound writes are scheduled before the relevant + loops. 
[DONE] + - add_prefetch tagging - nbody GPU diff --git a/loopy/__init__.py b/loopy/__init__.py index aac8bc67442cb2ee1964a03f773dc68168478e9d..0dc0fdf76e7ee38c6b47e97912df0fb1f3127396 100644 --- a/loopy/__init__.py +++ b/loopy/__init__.py @@ -79,10 +79,13 @@ def split_dimension(kernel, split_iname, inner_length, if inner_iname is None: inner_iname = split_iname+"_inner" - outer_var_nr = kernel.space.dim(dim_type.set) - inner_var_nr = kernel.space.dim(dim_type.set)+1 - def process_set(s): + if split_iname not in s.get_var_dict(): + return s + + outer_var_nr = s.dim(dim_type.set) + inner_var_nr = s.dim(dim_type.set)+1 + s = s.add_dims(dim_type.set, 2) s = s.set_dim_name(dim_type.set, outer_var_nr, outer_iname) s = s.set_dim_name(dim_type.set, inner_var_nr, inner_iname) @@ -102,7 +105,7 @@ def split_dimension(kernel, split_iname, inner_length, .eliminate(name_dim_type, name_idx, 1) .remove_dims(name_dim_type, name_idx, 1)) - new_domain = process_set(kernel.domain) + new_domains = [process_set(dom) for dom in kernel.domains] from pymbolic import var inner = var(inner_iname) @@ -125,9 +128,10 @@ def split_dimension(kernel, split_iname, inner_length, new_expr = subst_mapper(rls(insn.expression)) if split_iname in insn.forced_iname_deps: - new_forced_iname_deps = insn.forced_iname_deps.copy() - new_forced_iname_deps.remove(split_iname) - new_forced_iname_deps.update([outer_iname, inner_iname]) + new_forced_iname_deps = ( + (insn.forced_iname_deps.copy() + - frozenset([split_iname])) + | frozenset([outer_iname, inner_iname])) else: new_forced_iname_deps = insn.forced_iname_deps @@ -144,7 +148,7 @@ def split_dimension(kernel, split_iname, inner_length, iname_slab_increments[outer_iname] = slabs result = (kernel .map_expressions(subst_mapper, exclude_instructions=True) - .copy(domain=new_domain, + .copy(domains=new_domains, iname_slab_increments=iname_slab_increments, instructions=new_insns, applied_iname_rewrites=applied_iname_rewrites, diff --git a/loopy/check.py 
b/loopy/check.py index 148595636643ff0f78264539ab34d2e3843a1d7f..6c26e15109d6250211fdec96ebbe07d7ddf25689 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -1,4 +1,5 @@ from __future__ import division +from islpy import dim_type @@ -85,7 +86,7 @@ def check_for_inactive_iname_access(kernel): def check_for_write_races(kernel): from loopy.symbolic import DependencyMapper - from loopy.kernel import ParallelTag, GroupIndexTag, IlpBaseTag + from loopy.kernel import ParallelTag, GroupIndexTag depmap = DependencyMapper() for insn in kernel.instructions: @@ -173,6 +174,26 @@ def check_for_orphaned_user_hardware_axes(kernel): raise RuntimeError("user-requested local hardware axis %d " "has no iname mapped to it" % axis) +def check_for_data_dependent_parallel_bounds(kernel): + from loopy.kernel import ParallelTag + + for i, dom in enumerate(kernel.domains): + dom_inames = set(dom.get_var_names(dim_type.set)) + par_inames = set(iname + for iname in dom_inames + if isinstance(kernel.iname_to_tag.get(iname), ParallelTag)) + + if not par_inames: + continue + + parameters = set(dom.get_var_names(dim_type.param)) + for par in parameters: + if par in kernel.temporary_variables: + raise RuntimeError("Domain number %d has a data-dependent " + "parameter '%s' and contains parallel " + "inames '%s'. This is not allowed (for now)." 
+ % (i, par, ", ".join(par_inames))) + # }}} def run_automatic_checks(kernel): @@ -181,19 +202,14 @@ def run_automatic_checks(kernel): check_for_unused_hw_axes_in_insns(kernel) check_for_inactive_iname_access(kernel) check_for_write_races(kernel) - + check_for_data_dependent_parallel_bounds(kernel) # {{{ sanity-check for implemented domains of each instruction def check_implemented_domains(kernel, implemented_domains): from islpy import dim_type - parameter_inames = set( - kernel.domain.get_dim_name(dim_type.param, i) - for i in range(kernel.domain.dim(dim_type.param))) - - from islpy import align_spaces - assumptions = align_spaces(kernel.assumptions, kernel.domain) + from islpy import align_spaces, align_two for insn_id, idomains in implemented_domains.iteritems(): insn = kernel.id_to_insn[insn_id] @@ -203,17 +219,30 @@ def check_implemented_domains(kernel, implemented_domains): insn_impl_domain = idomains[0] for idomain in idomains[1:]: insn_impl_domain = insn_impl_domain | idomain + assumptions = align_spaces(kernel.assumptions, insn_impl_domain, + obj_bigger_ok=True) insn_impl_domain = ( (insn_impl_domain & assumptions) .project_out_except(kernel.insn_inames(insn), [dim_type.set])) - desired_domain = ((kernel.domain & assumptions) + insn_inames = kernel.insn_inames(insn) + insn_domain = kernel.get_inames_domain(insn_inames) + assumptions = align_spaces(kernel.assumptions, insn_domain, + obj_bigger_ok=True) + desired_domain = ((insn_domain & assumptions) .project_out_except(kernel.insn_inames(insn), [dim_type.set])) + insn_impl_domain, desired_domain = align_two( + insn_impl_domain, desired_domain) + if insn_impl_domain != desired_domain: i_minus_d = insn_impl_domain - desired_domain d_minus_i = desired_domain - insn_impl_domain + parameter_inames = set( + insn_domain.get_dim_name(dim_type.param, i) + for i in range(insn_domain.dim(dim_type.param))) + lines = [] for kind, diff_set in [ ("implemented, but not desired", i_minus_d), diff --git 
a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py index 371dd950d9f793dc4cee4473a34273de4618d9b8..e653feb93ff50331c62fb4a8d5138e03683a8aeb 100644 --- a/loopy/codegen/__init__.py +++ b/loopy/codegen/__init__.py @@ -122,14 +122,16 @@ class CodeGenerationState(object): self.c_code_mapper = c_code_mapper - def intersect(self, set): + def intersect(self, other): + new_impl, new_other = isl.align_two(self.implemented_domain, other) return CodeGenerationState( - self.implemented_domain & set, + new_impl & new_other, self.c_code_mapper) - def fix(self, iname, aff, space): + def fix(self, iname, aff): from loopy.isl_helpers import iname_rel_aff - iname_plus_lb_aff = iname_rel_aff(space, iname, "==", aff) + iname_plus_lb_aff = iname_rel_aff( + self.implemented_domain.get_space(), iname, "==", aff) from loopy.symbolic import pw_aff_to_expr cns = isl.Constraint.equality_from_aff(iname_plus_lb_aff) @@ -262,11 +264,12 @@ def generate_code(kernel, with_annotation=False, # }}} + from pyopencl.tools import dtype_to_ctype mod.extend([ LiteralLines(r""" - #define lid(N) ((int) get_local_id(N)) - #define gid(N) ((int) get_group_id(N)) - """), + #define lid(N) ((%(idx_ctype)s) get_local_id(N)) + #define gid(N) ((%(idx_ctype)s) get_group_id(N)) + """ % dict(idx_ctype=dtype_to_ctype(kernel.index_dtype))), Line()]) # {{{ build lmem array declarators for temporary variables @@ -289,8 +292,7 @@ def generate_code(kernel, with_annotation=False, # }}} - from islpy import align_spaces - initial_implemented_domain = align_spaces(kernel.assumptions, kernel.domain) + initial_implemented_domain = kernel.assumptions codegen_state = CodeGenerationState(initial_implemented_domain, c_code_mapper=ccm) from loopy.codegen.loop import set_up_hw_parallel_loops @@ -303,34 +305,14 @@ def generate_code(kernel, with_annotation=False, else: body.append(gen_code.ast) - from loopy.symbolic import pw_aff_to_expr mod.append( FunctionBody( CLRequiredWorkGroupSize( - tuple(pw_aff_to_expr(sz) for sz in 
kernel.get_grid_sizes()[1]), + kernel.get_grid_sizes_as_exprs()[1], CLKernel(FunctionDeclaration( Value("void", kernel.name), args))), body)) - gen_code_str = str(gen_code) - - from cgen import LiteralLines - if "int_floor_div" in gen_code_str: - mod.extend(LiteralLines(""" - #define int_floor_div(a,b) \ - (( (a) - \ - ( ( (a)<0 ) != ( (b)<0 )) \ - *( (b) + ( (b)<0 ) - ( (b)>=0 ) )) \ - / (b) ) - """)) - - if "int_floor_div_pos_b" in gen_code_str: - mod.extend(LiteralLines(""" - #define int_floor_div_pos_b(a,b) ( \ - ( (a) - ( ((a)<0) ? ((b)-1) : 0 ) ) / (b) \ - ) - """)) - from loopy.check import check_implemented_domains assert check_implemented_domains(kernel, gen_code.implemented_domains) diff --git a/loopy/codegen/bounds.py b/loopy/codegen/bounds.py index 88a64e7b270f4c1077f645710d5ba9a8111f4357..0a873eb291cb665b82ac09381320798292ecace9 100644 --- a/loopy/codegen/bounds.py +++ b/loopy/codegen/bounds.py @@ -2,7 +2,6 @@ from __future__ import division import islpy as isl from islpy import dim_type -import numpy as np @@ -22,6 +21,7 @@ def get_bounds_constraints(set, iname, admissible_inames, allow_parameters): elim_type.append(dim_type.param) set = set.eliminate_except(admissible_inames, elim_type) + set = set.compute_divs() basic_sets = set.get_basic_sets() if len(basic_sets) > 1: @@ -103,21 +103,40 @@ def constraint_to_code(ccm, cns): comp_op = ">=" from loopy.symbolic import constraint_to_expr - return "%s %s 0" % (ccm(constraint_to_expr(cns)), comp_op) + return "%s %s 0" % (ccm(constraint_to_expr(cns), 'i'), comp_op) def filter_necessary_constraints(implemented_domain, constraints): - space = implemented_domain.get_space() return [cns for cns in constraints if not implemented_domain.is_subset( - isl.Set.universe(space).add_constraint(cns))] + isl.align_spaces( + isl.BasicSet.universe(cns.get_space()).add_constraint(cns), + implemented_domain))] def generate_bounds_checks(domain, check_inames, implemented_domain): - """Will not overapproximate.""" - 
domain_bset, = (domain - .eliminate_except(check_inames, [dim_type.set]) - .coalesce() - .get_basic_sets()) + """Will not overapproximate if check_inames consists of all inames in the domain.""" + + if len(check_inames) == domain.dim(dim_type.set): + assert check_inames == frozenset(domain.get_var_names(dim_type.set)) + else: + domain = (domain + .eliminate_except(check_inames, [dim_type.set]) + .remove_divs()) + + if isinstance(domain, isl.Set): + bsets = domain.get_basic_sets() + if len(bsets) == 1: + domain_bset, = bsets + else: + domain = domain.coalesce() + bsets = domain.get_basic_sets() + if len(bsets) == 1: + raise RuntimeError("domain of inames '%s' projected onto '%s' " + "did not reduce to a single conjunction" + % (", ".join(domain.get_var_names(dim_type.set)), + check_inames)) + else: + domain_bset = domain domain_bset = domain_bset.remove_redundancies() @@ -129,8 +148,10 @@ def wrap_in_bounds_checks(ccm, domain, check_inames, implemented_domain, stmt): domain, check_inames, implemented_domain) - new_implemented_domain = implemented_domain & ( - isl.Set.universe(domain.get_space()).add_constraints(bounds_checks)) + bounds_check_set = isl.Set.universe(domain.get_space()).add_constraints(bounds_checks) + bounds_check_set, new_implemented_domain = isl.align_two( + bounds_check_set, implemented_domain) + new_implemented_domain = new_implemented_domain & bounds_check_set condition_codelets = [ constraint_to_code(ccm, cns) for cns in @@ -142,7 +163,8 @@ def wrap_in_bounds_checks(ccm, domain, check_inames, implemented_domain, stmt): return stmt, new_implemented_domain -def wrap_in_for_from_constraints(ccm, iname, constraint_bset, stmt): +def wrap_in_for_from_constraints(ccm, iname, constraint_bset, stmt, + index_dtype): # FIXME add admissible vars if isinstance(constraint_bset, isl.Set): constraint_bset, = constraint_bset.get_basic_sets() @@ -173,7 +195,7 @@ def wrap_in_for_from_constraints(ccm, iname, constraint_bset, stmt): from pymbolic import var rhs += 
iname_coeff*var(iname) end_conds.append("%s >= 0" % - ccm(cfm(rhs))) + ccm(cfm(rhs), 'i')) else: # iname_coeff > 0 kind, bound = solve_constraint_for_bound(cns, iname) assert kind == ">=" @@ -187,8 +209,8 @@ def wrap_in_for_from_constraints(ccm, iname, constraint_bset, stmt): from loopy.codegen import gen_code_block from cgen import Initializer, POD, Const, Line return gen_code_block([ - Initializer(Const(POD(np.int32, iname)), - ccm(equality_expr)), + Initializer(Const(POD(index_dtype, iname)), + ccm(equality_expr, 'i')), Line(), stmt, ]) @@ -205,7 +227,7 @@ def wrap_in_for_from_constraints(ccm, iname, constraint_bset, stmt): from cgen import For from loopy.codegen import wrap_in return wrap_in(For, - "int %s = %s" % (iname, ccm(start_expr)), + "int %s = %s" % (iname, ccm(start_expr, 'i')), " && ".join(end_conds), "++%s" % iname, stmt) diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py index a1a213add433ac3139a7f2566b7218dbfb1b5f95..79c86a18114cb61782559a098883f27762e7baaf 100644 --- a/loopy/codegen/control.py +++ b/loopy/codegen/control.py @@ -120,7 +120,9 @@ def build_loop_nest(kernel, sched_index, codegen_state): from loopy.schedule import (EnterLoop, LeaveLoop, RunInstruction, Barrier, gather_schedule_subloop) - # {{{ pass 1: pre-scan schedule for my schedule items' indices + # {{{ pass 1: pre-scan schedule for my schedule item's siblings' indices + + # i.e. go up to the next LeaveLoop, and skip over inner loops. 
my_sched_indices = [] @@ -146,7 +148,7 @@ def build_loop_nest(kernel, sched_index, codegen_state): # }}} - # {{{ pass 2: find admissible conditional inames for each schedule item + # {{{ pass 2: find admissible conditional inames for each sibling schedule item admissible_cond_inames = [ get_admissible_conditional_inames_for(kernel, sched_index) @@ -159,14 +161,20 @@ def build_loop_nest(kernel, sched_index, codegen_state): from pytools import memoize_method class BoundsCheckCache: - def __init__(self, domain, impl_domain): - self.domain = domain + def __init__(self, kernel, impl_domain): + self.kernel = kernel self.impl_domain = impl_domain @memoize_method def __call__(self, check_inames): + if not check_inames: + return [] + + domain = isl.align_spaces( + self.kernel.get_inames_domain(check_inames), + self.impl_domain, obj_bigger_ok=True) from loopy.codegen.bounds import generate_bounds_checks - return generate_bounds_checks(self.domain, + return generate_bounds_checks(domain, check_inames, self.impl_domain) def build_insn_group(sched_indices_and_cond_inames, codegen_state, done_group_lengths=set()): @@ -183,7 +191,7 @@ def build_loop_nest(kernel, sched_index, codegen_state): # Keep growing schedule item group as long as group fulfills minimum # size requirement. 
- bounds_check_cache = BoundsCheckCache(kernel.domain, codegen_state.implemented_domain) + bounds_check_cache = BoundsCheckCache(kernel, codegen_state.implemented_domain) current_iname_set = cond_inames @@ -226,14 +234,21 @@ def build_loop_nest(kernel, sched_index, codegen_state): # pick largest such group group_length, bounds_checks = max(found_hoists) - if bounds_checks: - check_set = isl.BasicSet.universe(kernel.space) - for cns in bounds_checks: - check_set = check_set.add_constraint(cns) + check_set = None + for cns in bounds_checks: + cns_set = (isl.BasicSet.universe(cns.get_space()) + .add_constraint(cns)) - new_codegen_state = codegen_state.intersect(check_set) - else: + if check_set is None: + check_set = cns_set + else: + check_set, cns_set = isl.align_two(check_set, cns_set) + check_set = check_set.intersect(cns_set) + + if check_set is None: new_codegen_state = codegen_state + else: + new_codegen_state = codegen_state.intersect(check_set) if group_length == 1: # group only contains starting schedule item diff --git a/loopy/codegen/expression.py b/loopy/codegen/expression.py index 22f9624d15fffb28ea109e7f64caca3753cc993d..a0f4305454792cf286c30d70bef6f4539baa144e 100644 --- a/loopy/codegen/expression.py +++ b/loopy/codegen/expression.py @@ -2,8 +2,9 @@ from __future__ import division import numpy as np -from pymbolic.mapper.c_code import CCodeMapper as CCodeMapper -from pymbolic.mapper.stringifier import PREC_NONE +from pymbolic.mapper import RecursiveMapper +from pymbolic.mapper.stringifier import (PREC_NONE, PREC_CALL, PREC_PRODUCT, + PREC_POWER) from pymbolic.mapper import CombineMapper # {{{ type inference @@ -86,7 +87,7 @@ class TypeInferenceMapper(CombineMapper): return tv.dtype if expr.name in self.kernel.all_inames(): - return np.dtype(np.int16) # don't force single-precision upcast + return self.kernel.index_dtype for mangler in self.kernel.symbol_manglers: result = mangler(expr.name) @@ -118,7 +119,29 @@ def perform_cast(ccm, expr, expr_dtype, 
target_dtype): # {{{ C code mapper -class LoopyCCodeMapper(CCodeMapper): +# type_context may be: +# - 'i' for integer - +# - 'f' for single-precision floating point +# - 'd' for double-precision floating point +# or None for 'no known context'. + +def dtype_to_type_context(dtype): + dtype = np.dtype(dtype) + + if dtype.kind == 'i': + return 'i' + if dtype in [np.float64, np.complex128]: + return 'd' + if dtype in [np.float32, np.complex64]: + return 'f' + from pyopencl.array import vec + if dtype in vec.types.values(): + return dtype_to_type_context(dtype.fields["x"][0]) + + return None + + +class LoopyCCodeMapper(RecursiveMapper): def __init__(self, kernel, seen_dtypes, seen_functions, var_subst_map={}, with_annotation=False, allow_complex=False): """ @@ -127,7 +150,6 @@ class LoopyCCodeMapper(CCodeMapper): functions that were encountered. """ - CCodeMapper.__init__(self) self.kernel = kernel self.seen_dtypes = seen_dtypes self.seen_functions = seen_functions @@ -138,6 +160,8 @@ class LoopyCCodeMapper(CCodeMapper): self.with_annotation = with_annotation self.var_subst_map = var_subst_map.copy() + # {{{ copy helpers + def copy(self, var_subst_map=None): if var_subst_map is None: var_subst_map = self.var_subst_map @@ -146,11 +170,6 @@ class LoopyCCodeMapper(CCodeMapper): with_annotation=self.with_annotation, allow_complex=self.allow_complex) - def infer_type(self, expr): - result = self.type_inf_mapper(expr) - self.seen_dtypes.add(result) - return result - def copy_and_assign(self, name, value): """Make a copy of self with variable *name* fixed to *value*.""" var_subst_map = self.var_subst_map.copy() @@ -164,18 +183,41 @@ class LoopyCCodeMapper(CCodeMapper): var_subst_map.update(assignments) return self.copy(var_subst_map=var_subst_map) - def map_common_subexpression(self, expr, prec): + # }}} + + # {{{ helpers + + def infer_type(self, expr): + result = self.type_inf_mapper(expr) + self.seen_dtypes.add(result) + return result + + def join_rec(self, joiner, iterable, 
prec, type_context): + f = joiner.join("%s" for i in iterable) + return f % tuple(self.rec(i, prec, type_context) for i in iterable) + + def parenthesize_if_needed(self, s, enclosing_prec, my_prec): + if enclosing_prec > my_prec: + return "(%s)" % s + else: + return s + + # }}} + + def map_common_subexpression(self, expr, prec, type_context): raise RuntimeError("common subexpression should have been eliminated upon " "entry to loopy") - def map_variable(self, expr, prec): + def map_variable(self, expr, enclosing_prec, type_context): if expr.name in self.var_subst_map: if self.with_annotation: return " /* %s */ %s" % ( expr.name, - self.rec(self.var_subst_map[expr.name], prec)) + self.rec(self.var_subst_map[expr.name], + enclosing_prec, type_context)) else: - return str(self.rec(self.var_subst_map[expr.name], prec)) + return str(self.rec(self.var_subst_map[expr.name], + enclosing_prec, type_context)) elif expr.name in self.kernel.arg_dict: arg = self.kernel.arg_dict[expr.name] from loopy.kernel import _ShapedArg @@ -188,15 +230,27 @@ class LoopyCCodeMapper(CCodeMapper): _, c_name = result return c_name - return CCodeMapper.map_variable(self, expr, prec) + return expr.name - def map_tagged_variable(self, expr, enclosing_prec): + def map_tagged_variable(self, expr, enclosing_prec, type_context): return expr.name - def map_subscript(self, expr, enclosing_prec): + def map_lookup(self, expr, enclosing_prec, type_context): + return self.parenthesize_if_needed( + "%s.%s" %(self.rec(expr.aggregate, PREC_CALL, type_context), expr.name), + enclosing_prec, PREC_CALL) + + def map_subscript(self, expr, enclosing_prec, type_context): + def base_impl(expr, enclosing_prec, type_context): + return self.parenthesize_if_needed( + "%s[%s]" % ( + self.rec(expr.aggregate, PREC_CALL, type_context), + self.rec(expr.index, PREC_NONE, 'i')), + enclosing_prec, PREC_CALL) + from pymbolic.primitives import Variable if not isinstance(expr.aggregate, Variable): - return 
CCodeMapper.map_subscript(self, expr, enclosing_prec) + return base_impl(expr, enclosing_prec, type_context) if expr.aggregate.name in self.kernel.arg_dict: arg = self.kernel.arg_dict[expr.aggregate.name] @@ -207,7 +261,7 @@ class LoopyCCodeMapper(CCodeMapper): base_access = ("read_imagef(%s, loopy_sampler, (float%d)(%s))" % (arg.name, arg.dimensions, - ", ".join(self.rec(idx, PREC_NONE) + ", ".join(self.rec(idx, PREC_NONE, 'i') for idx in expr.index[::-1]))) if arg.dtype == np.float32: @@ -239,10 +293,11 @@ class LoopyCCodeMapper(CCodeMapper): return "*" + expr.aggregate.name from pymbolic.primitives import Subscript - return CCodeMapper.map_subscript(self, + return base_impl( Subscript(expr.aggregate, arg.offset+sum( stride*expr_i for stride, expr_i in zip( - ary_strides, index_expr))), enclosing_prec) + ary_strides, index_expr))), + enclosing_prec, type_context) elif expr.aggregate.name in self.kernel.temporary_variables: @@ -252,53 +307,78 @@ class LoopyCCodeMapper(CCodeMapper): else: index = (expr.index,) - return (temp_var.name + "".join("[%s]" % self.rec(idx, PREC_NONE) + return (temp_var.name + "".join("[%s]" % self.rec(idx, PREC_NONE, 'i') for idx in index)) else: raise RuntimeError("nothing known about variable '%s'" % expr.aggregate.name) - def map_floor_div(self, expr, prec): + def map_floor_div(self, expr, enclosing_prec, type_context): + from loopy.symbolic import get_dependencies + iname_deps = get_dependencies(expr) & self.kernel.all_inames() + domain = self.kernel.get_inames_domain(iname_deps) + from loopy.isl_helpers import is_nonnegative - num_nonneg = is_nonnegative(expr.numerator, self.kernel.domain) - den_nonneg = is_nonnegative(expr.denominator, self.kernel.domain) + num_nonneg = is_nonnegative(expr.numerator, domain) + den_nonneg = is_nonnegative(expr.denominator, domain) + + def seen_func(name): + idt = self.kernel.index_dtype + self.seen_functions.add((name, name, (idt, idt))) if den_nonneg: if num_nonneg: - return 
CCodeMapper.map_floor_div(self, expr, prec) + return self.parenthesize_if_needed( + "%s / %s" % ( + self.rec(expr.numerator, PREC_PRODUCT, type_context), + # analogous to ^{-1} + self.rec(expr.denominator, PREC_POWER, type_context)), + enclosing_prec, PREC_PRODUCT) else: + seen_func("int_floor_div_pos_b") return ("int_floor_div_pos_b(%s, %s)" - % (self.rec(expr.numerator, PREC_NONE), - expr.denominator)) + % (self.rec(expr.numerator, PREC_NONE, 'i'), + self.rec(expr.denominator, PREC_NONE, 'i'))) else: + seen_func("int_floor_div") return ("int_floor_div(%s, %s)" - % (self.rec(expr.numerator, PREC_NONE), - self.rec(expr.denominator, PREC_NONE))) + % (self.rec(expr.numerator, PREC_NONE, 'i'), + self.rec(expr.denominator, PREC_NONE, 'i'))) - def map_min(self, expr, prec): + def map_min(self, expr, prec, type_context): what = type(expr).__name__.lower() children = expr.children[:] - result = self.rec(children.pop(), PREC_NONE) + result = self.rec(children.pop(), PREC_NONE, type_context) while children: result = "%s(%s, %s)" % (what, - self.rec(children.pop(), PREC_NONE), + self.rec(children.pop(), PREC_NONE, type_context), result) return result map_max = map_min - def map_constant(self, expr, enclosing_prec): + def map_constant(self, expr, enclosing_prec, type_context): if isinstance(expr, complex): - # FIXME: type-variable - return "(cdouble_t) (%s, %s)" % (repr(expr.real), repr(expr.imag)) + cast_type = "cdouble_t" + if type_context == "f": + cast_type = "cfloat_t" + + return "(%s) (%s, %s)" % (cast_type, repr(expr.real), repr(expr.imag)) else: - # FIXME: type-variable - return repr(float(expr)) + if type_context == "f": + return repr(float(expr))+"f" + elif type_context == "d": + return repr(float(expr)) + elif type_context == "i": + return str(int(expr)) + else: + raise RuntimeError("don't know how to generated code " + "for constant '%s'" % expr) - def map_call(self, expr, enclosing_prec): + def map_call(self, expr, enclosing_prec, type_context): from 
pymbolic.primitives import Variable from pymbolic.mapper.stringifier import PREC_NONE @@ -311,7 +391,7 @@ class LoopyCCodeMapper(CCodeMapper): par_dtypes = tuple(self.infer_type(par) for par in expr.parameters) - parameters = expr.parameters + str_parameters = None mangle_result = self.kernel.mangle_function(identifier, par_dtypes) if mangle_result is not None: @@ -320,23 +400,28 @@ class LoopyCCodeMapper(CCodeMapper): elif len(mangle_result) == 3: result_dtype, c_name, arg_tgt_dtypes = mangle_result - parameters = [ - perform_cast(self, par, par_dtype, tgt_dtype) + str_parameters = [ + self.rec( + perform_cast(self, par, par_dtype, tgt_dtype), + PREC_NONE, dtype_to_type_context(tgt_dtype)) for par, par_dtype, tgt_dtype in zip( - parameters, par_dtypes, arg_tgt_dtypes)] + expr.parameters, par_dtypes, arg_tgt_dtypes)] else: raise RuntimeError("result of function mangler " "for function '%s' not understood" % identifier) self.seen_functions.add((identifier, c_name, par_dtypes)) + if str_parameters is None: + str_parameters = [ + self.rec(par, PREC_NONE, type_context) + for par in expr.parameters] if c_name is None: raise RuntimeError("unable to find C name for function identifier '%s'" % identifier) - return self.format("%s(%s)", - c_name, self.join_rec(", ", parameters, PREC_NONE)) + return "%s(%s)" % (c_name, ", ".join(str_parameters)) # {{{ deal with complex-valued variables @@ -348,15 +433,22 @@ class LoopyCCodeMapper(CCodeMapper): else: raise RuntimeError - def map_sum(self, expr, enclosing_prec): + def map_sum(self, expr, enclosing_prec, type_context): + from pymbolic.mapper.stringifier import PREC_SUM + + def base_impl(expr, enclosing_prec, type_context): + return self.parenthesize_if_needed( + self.join_rec(" + ", expr.children, PREC_SUM, type_context), + enclosing_prec, PREC_SUM) + if not self.allow_complex: - return CCodeMapper.map_sum(self, expr, enclosing_prec) + return base_impl(expr, enclosing_prec, type_context) tgt_dtype = self.infer_type(expr) 
is_complex = tgt_dtype.kind == 'c' if not is_complex: - return CCodeMapper.map_sum(self, expr, enclosing_prec) + return base_impl(expr, enclosing_prec, type_context) else: tgt_name = self.complex_type_name(tgt_dtype) @@ -365,9 +457,8 @@ class LoopyCCodeMapper(CCodeMapper): complexes = [child for child in expr.children if 'c' == self.infer_type(child).kind] - from pymbolic.mapper.stringifier import PREC_SUM - real_sum = self.join_rec(" + ", reals, PREC_SUM) - complex_sum = self.join_rec(" + ", complexes, PREC_SUM) + real_sum = self.join_rec(" + ", reals, PREC_SUM, type_context) + complex_sum = self.join_rec(" + ", complexes, PREC_SUM, type_context) if real_sum: result = "%s_fromreal(%s) + %s" % (tgt_name, real_sum, complex_sum) @@ -376,15 +467,22 @@ class LoopyCCodeMapper(CCodeMapper): return self.parenthesize_if_needed(result, enclosing_prec, PREC_SUM) - def map_product(self, expr, enclosing_prec): + def map_product(self, expr, enclosing_prec, type_context): + def base_impl(expr, enclosing_prec, type_context): + # Spaces prevent '**z' (times dereference z), which + # is hard to read. 
+ return self.parenthesize_if_needed( + self.join_rec(" * ", expr.children, PREC_PRODUCT, type_context), + enclosing_prec, PREC_PRODUCT) + if not self.allow_complex: - return CCodeMapper.map_product(self, expr, enclosing_prec) + return base_impl(expr, enclosing_prec, type_context) tgt_dtype = self.infer_type(expr) is_complex = 'c' == tgt_dtype.kind if not is_complex: - return CCodeMapper.map_product(self, expr, enclosing_prec) + return base_impl(expr, enclosing_prec, type_context) else: tgt_name = self.complex_type_name(tgt_dtype) @@ -393,19 +491,18 @@ class LoopyCCodeMapper(CCodeMapper): complexes = [child for child in expr.children if 'c' == self.infer_type(child).kind] - from pymbolic.mapper.stringifier import PREC_PRODUCT - real_prd = self.join_rec("*", reals, PREC_PRODUCT) + real_prd = self.join_rec("*", reals, PREC_PRODUCT, type_context) if len(complexes) == 1: myprec = PREC_PRODUCT else: myprec = PREC_NONE - complex_prd = self.rec(complexes[0], myprec) + complex_prd = self.rec(complexes[0], myprec, type_context) for child in complexes[1:]: complex_prd = "%s_mul(%s, %s)" % ( tgt_name, complex_prd, - self.rec(child, PREC_NONE)) + self.rec(child, PREC_NONE, type_context)) if real_prd: # elementwise semantics are correct @@ -415,9 +512,19 @@ class LoopyCCodeMapper(CCodeMapper): return self.parenthesize_if_needed(result, enclosing_prec, PREC_PRODUCT) - def map_quotient(self, expr, enclosing_prec): + def map_quotient(self, expr, enclosing_prec, type_context): + def base_impl(expr, enclosing_prec, type_context): + return self.parenthesize_if_needed( + "%s / %s" % ( + # space is necessary--otherwise '/*' becomes + # start-of-comment in C. 
+ self.rec(expr.numerator, PREC_PRODUCT, type_context), + # analogous to ^{-1} + self.rec(expr.denominator, PREC_POWER, type_context)), + enclosing_prec, PREC_PRODUCT) + if not self.allow_complex: - return CCodeMapper.map_quotient(self, expr, enclosing_prec) + return base_impl(expr, enclosing_prec, type_context) n_complex = 'c' == self.infer_type(expr.numerator).kind d_complex = 'c' == self.infer_type(expr.denominator).kind @@ -425,36 +532,48 @@ class LoopyCCodeMapper(CCodeMapper): tgt_dtype = self.infer_type(expr) if not (n_complex or d_complex): - return CCodeMapper.map_quotient(self, expr, enclosing_prec) + return base_impl(expr, enclosing_prec, type_context) elif n_complex and not d_complex: # elementwise semnatics are correct - return CCodeMapper.map_quotient(self, expr, enclosing_prec) + return base_impl(expr, enclosing_prec, type_context) elif not n_complex and d_complex: return "%s_rdivide(%s, %s)" % ( self.complex_type_name(tgt_dtype), - self.rec(expr.numerator, PREC_NONE), - self.rec(expr.denominator, PREC_NONE)) + self.rec(expr.numerator, PREC_NONE, type_context), + self.rec(expr.denominator, PREC_NONE, type_context)) else: return "%s_divide(%s, %s)" % ( self.complex_type_name(tgt_dtype), - self.rec(expr.numerator, PREC_NONE), - self.rec(expr.denominator, PREC_NONE)) - - def map_remainder(self, expr, enclosing_prec): - if not self.allow_complex: - return CCodeMapper.map_remainder(self, expr, enclosing_prec) + self.rec(expr.numerator, PREC_NONE, type_context), + self.rec(expr.denominator, PREC_NONE, type_context)) + def map_remainder(self, expr, enclosing_prec, type_context): tgt_dtype = self.infer_type(expr) if 'c' == tgt_dtype.kind: raise RuntimeError("complex remainder not defined") - return CCodeMapper.map_remainder(self, expr, enclosing_prec) + return "(%s %% %s)" % ( + self.rec(expr.numerator, PREC_PRODUCT, type_context), + self.rec(expr.denominator, PREC_POWER, type_context)) # analogous to ^{-1} + + def map_power(self, expr, enclosing_prec, 
type_context): + def base_impl(expr, enclosing_prec, type_context): + from pymbolic.mapper.stringifier import PREC_NONE + from pymbolic.primitives import is_constant, is_zero + if is_constant(expr.exponent): + if is_zero(expr.exponent): + return "1" + elif is_zero(expr.exponent - 1): + return self.rec(expr.base, enclosing_prec, type_context) + elif is_zero(expr.exponent - 2): + return self.rec(expr.base*expr.base, enclosing_prec, type_context) + + return "pow(%s, %s)" % ( + self.rec(expr.base, PREC_NONE, type_context), + self.rec(expr.exponent, PREC_NONE, type_context)) - def map_power(self, expr, enclosing_prec): if not self.allow_complex: - return CCodeMapper.map_power(self, expr, enclosing_prec) - - from pymbolic.mapper.stringifier import PREC_NONE + return base_impl(expr, enclosing_prec, type_context) tgt_dtype = self.infer_type(expr) if 'c' == tgt_dtype.kind: @@ -462,7 +581,7 @@ class LoopyCCodeMapper(CCodeMapper): value = expr.base for i in range(expr.exponent-1): value = value * expr.base - return self.rec(value, enclosing_prec) + return self.rec(value, enclosing_prec, type_context) else: b_complex = 'c' == self.infer_type(expr.base).kind e_complex = 'c' == self.infer_type(expr.exponent).kind @@ -470,18 +589,22 @@ class LoopyCCodeMapper(CCodeMapper): if b_complex and not e_complex: return "%s_powr(%s, %s)" % ( self.complex_type_name(tgt_dtype), - self.rec(expr.base, PREC_NONE), - self.rec(expr.exponent, PREC_NONE)) + self.rec(expr.base, PREC_NONE, type_context), + self.rec(expr.exponent, PREC_NONE, type_context)) else: return "%s_pow(%s, %s)" % ( self.complex_type_name(tgt_dtype), - self.rec(expr.base, PREC_NONE), - self.rec(expr.exponent, PREC_NONE)) + self.rec(expr.base, PREC_NONE, type_context), + self.rec(expr.exponent, PREC_NONE, type_context)) - return CCodeMapper.map_power(self, expr, enclosing_prec) + return base_impl(self, expr, enclosing_prec, type_context) # }}} + def __call__(self, expr, type_context, prec=PREC_NONE): + from pymbolic.mapper 
import RecursiveMapper + return RecursiveMapper.__call__(self, expr, prec, type_context) + # }}} # vim: fdm=marker diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py index 46a29fc50ce27d0d5168c4c1c33d9724a338f6f7..80c8a79eafe61f341c5b5b84e7ae2ef54d5c11db 100644 --- a/loopy/codegen/instruction.py +++ b/loopy/codegen/instruction.py @@ -12,14 +12,22 @@ def generate_instruction_code(kernel, insn, codegen_state): expr = insn.expression from loopy.codegen.expression import perform_cast - expr = perform_cast(ccm, expr, expr_dtype=ccm.infer_type(expr), - target_dtype=kernel.get_var_descriptor(insn.get_assignee_var_name()).dtype) + target_dtype = kernel.get_var_descriptor(insn.get_assignee_var_name()).dtype + expr_dtype = ccm.infer_type(expr) + + expr = perform_cast(ccm, expr, + expr_dtype=expr_dtype, + target_dtype=target_dtype) from cgen import Assign - insn_code = Assign(ccm(insn.assignee), ccm(expr)) + from loopy.codegen.expression import dtype_to_type_context + insn_code = Assign( + ccm(insn.assignee, prec=None, type_context=None), + ccm(expr, prec=None, type_context=dtype_to_type_context(target_dtype))) from loopy.codegen.bounds import wrap_in_bounds_checks + insn_inames = kernel.insn_inames(insn) insn_code, impl_domain = wrap_in_bounds_checks( - ccm, kernel.domain, kernel.insn_inames(insn), + ccm, kernel.get_inames_domain(insn_inames), insn_inames, codegen_state.implemented_domain, insn_code) diff --git a/loopy/codegen/loop.py b/loopy/codegen/loop.py index d1ffea7a395722c477b4fca0d6771adde4924443..faeefa0250518a89b66c44d87d1ef099ea7f156b 100644 --- a/loopy/codegen/loop.py +++ b/loopy/codegen/loop.py @@ -8,10 +8,10 @@ from loopy.codegen.control import build_loop_nest -def get_simple_loop_bounds(kernel, sched_index, iname, implemented_domain): +def get_simple_loop_bounds(kernel, sched_index, iname, implemented_domain, iname_domain): from loopy.codegen.bounds import get_bounds_constraints, get_defined_inames lower_constraints_orig, 
upper_constraints_orig, equality_constraints_orig = \ - get_bounds_constraints(kernel.domain, iname, + get_bounds_constraints(iname_domain, iname, frozenset([iname]) | frozenset(get_defined_inames(kernel, sched_index+1)), allow_parameters=True) @@ -32,12 +32,17 @@ def get_simple_loop_bounds(kernel, sched_index, iname, implemented_domain): # {{{ conditional-minimizing slab decomposition def get_slab_decomposition(kernel, iname, sched_index, codegen_state): + iname_domain = kernel.get_inames_domain(iname) + + if iname_domain.is_empty(): + return () + lb_cns_orig, ub_cns_orig = get_simple_loop_bounds(kernel, sched_index, iname, - codegen_state.implemented_domain) + codegen_state.implemented_domain, iname_domain) - lower_incr, upper_incr = kernel.iname_slab_increments.get(iname, (0, 0)) + space = lb_cns_orig.space - iname_tp, iname_idx = kernel.iname_to_dim[iname] + lower_incr, upper_incr = kernel.iname_slab_increments.get(iname, (0, 0)) if lower_incr or upper_incr: bounds = kernel.get_iname_bounds(iname) @@ -60,9 +65,10 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state): from loopy.isl_helpers import iname_rel_aff + if lower_incr: assert lower_incr > 0 - lower_slab = ("initial", isl.BasicSet.universe(kernel.space) + lower_slab = ("initial", isl.BasicSet.universe(space) .add_constraint(lb_cns_orig) .add_constraint(ub_cns_orig) .add_constraint( @@ -78,16 +84,16 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state): if upper_incr: assert upper_incr > 0 - upper_slab = ("final", isl.BasicSet.universe(kernel.space) + upper_slab = ("final", isl.BasicSet.universe(space) .add_constraint(lb_cns_orig) .add_constraint(ub_cns_orig) .add_constraint( isl.Constraint.inequality_from_aff( - iname_rel_aff(kernel.space, + iname_rel_aff(space, iname, ">=", upper_bound_aff-upper_incr)))) upper_bulk_bound = ( isl.Constraint.inequality_from_aff( - iname_rel_aff(kernel.space, + iname_rel_aff(space, iname, "<", upper_bound_aff-upper_incr))) else: 
lower_slab = None @@ -98,7 +104,7 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state): slabs.append(lower_slab) slabs.append(( ("bulk", - (isl.BasicSet.universe(kernel.space) + (isl.BasicSet.universe(space) .add_constraint(lower_bulk_bound) .add_constraint(upper_bulk_bound))))) if upper_slab: @@ -108,7 +114,7 @@ def get_slab_decomposition(kernel, iname, sched_index, codegen_state): else: return [("bulk", - (isl.BasicSet.universe(kernel.space) + (isl.BasicSet.universe(space) .add_constraint(lb_cns_orig) .add_constraint(ub_cns_orig)))] @@ -138,7 +144,7 @@ def generate_unroll_loop(kernel, sched_index, codegen_state): for i in range(length): idx_aff = lower_bound_aff + i - new_codegen_state = codegen_state.fix(iname, idx_aff, kernel.space) + new_codegen_state = codegen_state.fix(iname, idx_aff) result.append( build_loop_nest(kernel, sched_index+1, new_codegen_state)) @@ -149,6 +155,14 @@ def generate_unroll_loop(kernel, sched_index, codegen_state): # }}} +def intersect_kernel_with_slab(kernel, slab, iname): + hdi = kernel.get_home_domain_index(iname) + home_domain = kernel.domains[hdi] + new_domains = kernel.domains[:] + new_domains[hdi] = home_domain & isl.align_spaces(slab, home_domain) + return kernel.copy(domains=new_domains) + + # {{{ hw-parallel loop def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left=None): @@ -189,12 +203,13 @@ def set_up_hw_parallel_loops(kernel, sched_index, codegen_state, hw_inames_left= result = [] bounds = kernel.get_iname_bounds(iname) + domain = kernel.get_inames_domain(iname) from loopy.isl_helpers import make_slab from loopy.isl_helpers import static_value_of_pw_aff lower_bound = static_value_of_pw_aff(bounds.lower_bound_pw_aff, constants_only=False) - slab = make_slab(kernel.space, iname, + slab = make_slab(domain.get_space(), iname, lower_bound, lower_bound+hw_axis_size) codegen_state = codegen_state.intersect(slab) @@ -216,9 +231,12 @@ def set_up_hw_parallel_loops(kernel, sched_index, 
codegen_state, hw_inames_left= if len(slabs) == 1: cmt = None - new_kernel = kernel.copy(domain=kernel.domain & slab) + # Have the conditional infrastructure generate the + # slabbin conditionals. + slabbed_kernel = intersect_kernel_with_slab(kernel, slab, iname) + inner = set_up_hw_parallel_loops( - new_kernel, sched_index, codegen_state, hw_inames_left) + slabbed_kernel, sched_index, codegen_state, hw_inames_left) result.append(add_comment(cmt, inner)) from loopy.codegen import gen_code_block @@ -242,7 +260,9 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): if len(slabs) == 1: cmt = None + # Conditionals for slab are generated below. new_codegen_state = codegen_state.intersect(slab) + inner = build_loop_nest(kernel, sched_index+1, new_codegen_state) @@ -252,7 +272,8 @@ def generate_sequential_loop_dim_code(kernel, sched_index, codegen_state): from cgen import Comment result.append(Comment(cmt)) result.append( - wrap_in_for_from_constraints(ccm, iname, slab, inner)) + wrap_in_for_from_constraints(ccm, iname, slab, inner, + kernel.index_dtype)) return gen_code_block(result) diff --git a/loopy/compiled.py b/loopy/compiled.py index 187c17050ee482ac2f0eeeac2f809a0d15f7d9bf..8cf223f471b1de357539547e4764ecbc5ad2bc77 100644 --- a/loopy/compiled.py +++ b/loopy/compiled.py @@ -157,7 +157,7 @@ class CompiledKernel: args = [] outputs = [] - encountered_non_numpy = False + encountered_numpy = False kwargs_copy = kwargs.copy() @@ -172,7 +172,7 @@ class CompiledKernel: # synchronous, so nothing to worry about val = cl_array.to_device(queue, val, allocator=allocator) elif val is not None: - encountered_non_numpy = True + encountered_numpy = True if val is None: if not is_written: @@ -209,7 +209,7 @@ class CompiledKernel: *args, g_times_l=True, wait_for=wait_for) - if out_host is None and not encountered_non_numpy: + if out_host is None and encountered_numpy: out_host = True if out_host: outputs = [o.get() for o in outputs] @@ -479,6 +479,12 @@ def 
auto_test_vs_ref(ref_knl, ctx, kernel_gen, op_count=[], op_label=[], paramet last_cpu_dev = None for pf in cl.get_platforms(): + if pf.name == "Portable OpenCL": + # That implementation [1] isn't quite good enough yet. + # [1] https://launchpad.net/pocl + # FIXME remove when no longer true. + continue + for dev in pf.get_devices(): last_dev = dev if dev.type == cl.device_type.CPU: diff --git a/loopy/creation.py b/loopy/creation.py index 28be2b09557f7ef285b7d2c5a9dc2010e3bd94eb..2f137c068efecd8f8db6c28e39cc1debb51258f8 100644 --- a/loopy/creation.py +++ b/loopy/creation.py @@ -5,7 +5,26 @@ from loopy.symbolic import IdentityMapper # {{{ sanity checking -def check_kernel(knl): +def check_for_duplicate_names(knl): + name_to_source = {} + + def add_name(name, source): + if name in name_to_source: + raise RuntimeError("invalid %s name '%s'--name already used as " + "%s" % (source, name, name_to_source[name])) + + name_to_source[name] = source + + for name in knl.all_inames(): + add_name(name, "iname") + for arg in knl.args: + add_name(arg.name, "argument") + for name in knl.temporary_variables: + add_name(name, "temporary") + for name in knl.substitutions: + add_name(name, "substitution") + +def check_for_nonexistent_iname_deps(knl): for insn in knl.instructions: if not set(insn.forced_iname_deps) <= knl.all_inames(): raise ValueError("In instruction '%s': " @@ -15,6 +34,36 @@ def check_kernel(knl): ",".join( set(insn.forced_iname_deps)-knl.all_inames()))) +def check_for_multiple_writes_to_loop_bounds(knl): + from islpy import dim_type + + domain_parameters = set() + for dom in knl.domains: + domain_parameters.update(dom.get_space().get_var_dict(dim_type.param)) + + temp_var_domain_parameters = domain_parameters & set( + knl.temporary_variables) + + wmap = knl.writer_map() + for tvpar in temp_var_domain_parameters: + par_writers = wmap[tvpar] + if len(par_writers) != 1: + raise RuntimeError("there must be exactly one write to data-dependent " + "domain parameter '%s' 
(found %d)" % (tvpar, len(par_writers))) + + +def check_written_variable_names(knl): + admissible_vars = ( + set(arg.name for arg in knl.args) + | set(knl.temporary_variables.iterkeys())) + + for insn in knl.instructions: + var_name = insn.get_assignee_var_name() + + if var_name not in admissible_vars: + raise RuntimeError("variable '%s' not declared or not " + "allowed for writing" % var_name) + # }}} # {{{ expand common subexpressions into assignments @@ -99,9 +148,7 @@ def create_temporaries(knl): new_temp_vars = knl.temporary_variables.copy() for insn in knl.instructions: - from loopy.kernel import ( - find_var_base_indices_and_shape_from_inames, - TemporaryVariable) + from loopy.kernel import TemporaryVariable if insn.temp_var_type is not None: assignee_name = insn.get_assignee_var_name() @@ -120,8 +167,15 @@ def create_temporaries(knl): assignee_indices.append(index_expr.name) base_indices, shape = \ - find_var_base_indices_and_shape_from_inames( - knl.domain, assignee_indices, knl.cache_manager) + knl.find_var_base_indices_and_shape_from_inames( + assignee_indices, knl.cache_manager) + + if assignee_name in new_temp_vars: + raise RuntimeError("cannot create temporary variable '%s'--" + "already exists" % assignee_name) + if assignee_name in knl.arg_dict: + raise RuntimeError("cannot create temporary variable '%s'--" + "already exists as argument" % assignee_name) new_temp_vars[assignee_name] = TemporaryVariable( name=assignee_name, @@ -187,7 +241,7 @@ def duplicate_reduction_inames(kernel): # }}} - new_domain = kernel.domain + new_domains = kernel.domains new_insns = [] new_iname_to_tag = kernel.iname_to_tag.copy() @@ -203,13 +257,13 @@ def duplicate_reduction_inames(kernel): from loopy.isl_helpers import duplicate_axes for old, new in zip(old_insn_inames, new_insn_inames): - new_domain = duplicate_axes(new_domain, [old], [new]) + new_domains = duplicate_axes(new_domains, [old], [new]) if old in kernel.iname_to_tag: new_iname_to_tag[new] = 
kernel.iname_to_tag[old] return kernel.copy( instructions=new_insns, - domain=new_domain, + domains=new_domains, iname_to_tag=new_iname_to_tag) # }}} @@ -218,7 +272,7 @@ def duplicate_reduction_inames(kernel): def duplicate_inames(knl): new_insns = [] - new_domain = knl.domain + new_domains = knl.domains new_iname_to_tag = knl.iname_to_tag.copy() newly_created_vars = set() @@ -256,7 +310,7 @@ def duplicate_inames(knl): newly_created_vars.update(new_inames) from loopy.isl_helpers import duplicate_axes - new_domain = duplicate_axes(new_domain, inames_to_duplicate, new_inames) + new_domains = duplicate_axes(new_domains, inames_to_duplicate, new_inames) from loopy.symbolic import SubstitutionMapper from pymbolic.mapper.substitutor import make_subst_func @@ -284,7 +338,7 @@ def duplicate_inames(knl): return knl.copy( instructions=new_insns, - domain=new_domain, + domains=new_domains, iname_to_tag=new_iname_to_tag) # }}} @@ -304,7 +358,7 @@ def make_kernel(*args, **kwargs): knl.iname_to_tag_requests).copy( iname_to_tag_requests=[]) - check_kernel(knl) + check_for_nonexistent_iname_deps(knl) knl = create_temporaries(knl) knl = duplicate_reduction_inames(knl) @@ -320,6 +374,16 @@ def make_kernel(*args, **kwargs): knl = expand_cses(knl) + # ------------------------------------------------------------------------- + # Ordering dependency: + # ------------------------------------------------------------------------- + # Must create temporary before checking for writes to temporary variables + # that are domain parameters. 
+ # ------------------------------------------------------------------------- + check_for_multiple_writes_to_loop_bounds(knl) + check_for_duplicate_names(knl) + check_written_variable_names(knl) + return knl # }}} diff --git a/loopy/cse.py b/loopy/cse.py index 624ed6b5e34526f50e4262639accc2b4ba1befee..67e41cb9c1c4c198beb972dd19b12aa4f4b74864 100644 --- a/loopy/cse.py +++ b/loopy/cse.py @@ -166,12 +166,22 @@ def build_global_storage_to_sweep_map(invocation_descriptors, # {{{ compute storage bounds -def compute_bounds(kernel, subst_name, stor2sweep, sweep_inames, +def find_var_base_indices_and_shape_from_inames( + domain, inames, cache_manager, context=None): + base_indices_and_sizes = [ + cache_manager.base_index_and_length(domain, iname, context) + for iname in inames] + return zip(*base_indices_and_sizes) + + + + +def compute_bounds(kernel, sweep_domain, subst_name, stor2sweep, sweep_inames, storage_axis_names): # move non-sweep inames into parameter space - dup_sweep_index = kernel.space.dim(dim_type.out) + dup_sweep_index = sweep_domain.get_space().dim(dim_type.out) # map_space: [stor_axes'] -> [domain](dup_sweep_index)[dup_sweep] sp = stor2sweep.get_space() @@ -187,7 +197,6 @@ def compute_bounds(kernel, subst_name, stor2sweep, sweep_inames, "sweep did not result in a bounded storage domain" % subst_name) - from loopy.kernel import find_var_base_indices_and_shape_from_inames return find_var_base_indices_and_shape_from_inames( storage_domain, [saxis+"'" for saxis in storage_axis_names], kernel.cache_manager, context=kernel.assumptions) @@ -198,7 +207,7 @@ def compute_bounds(kernel, subst_name, stor2sweep, sweep_inames, -def get_access_info(kernel, subst_name, +def get_access_info(kernel, sweep_domain, subst_name, storage_axis_names, storage_axis_sources, sweep_inames, invocation_descriptors): @@ -206,9 +215,9 @@ def get_access_info(kernel, subst_name, primed_sweep_inames = [psin+"'" for psin in sweep_inames] from loopy.isl_helpers import duplicate_axes - 
dup_sweep_index = kernel.space.dim(dim_type.out) + dup_sweep_index = sweep_domain.space.dim(dim_type.out) domain_dup_sweep = duplicate_axes( - kernel.domain, sweep_inames, + sweep_domain, sweep_inames, primed_sweep_inames) prime_sweep_inames = SubstitutionMapper(make_subst_func( @@ -221,7 +230,7 @@ def get_access_info(kernel, subst_name, storage_axis_names, storage_axis_sources, prime_sweep_inames) storage_base_indices, storage_shape = compute_bounds( - kernel, subst_name, stor2sweep, sweep_inames, + kernel, sweep_domain, subst_name, stor2sweep, sweep_inames, storage_axis_names) # compute augmented domain @@ -588,9 +597,20 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[], # }}} + if sweep_inames: + leaf_domain_index = kernel.get_leaf_domain_index(frozenset(sweep_inames)) + sweep_domain = kernel.domains[leaf_domain_index] + + for iname in sweep_inames: + if kernel.get_home_domain_index(iname) != leaf_domain_index: + raise RuntimeError("sweep iname '%s' is not 'at home' in the " + "sweep's leaf domain" % iname) + else: + sweep_domain = kernel.combine_domains(()) + (non1_storage_axis_names, new_domain, - storage_base_indices, non1_storage_base_indices, non1_storage_shape)= \ - get_access_info(kernel, subst_name, + storage_base_indices, non1_storage_base_indices, non1_storage_shape) = \ + get_access_info(kernel, sweep_domain, subst_name, storage_axis_names, storage_axis_sources, sweep_inames, invocation_descriptors) @@ -598,7 +618,7 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[], if len(new_domain.get_basic_sets()) > 1: hull_new_domain = new_domain.simple_hull() - if hull_new_domain <= new_domain: + if isl.Set.from_basic_set(hull_new_domain) <= new_domain: new_domain = hull_new_domain new_domain = new_domain.coalesce() @@ -793,8 +813,12 @@ def precompute(kernel, subst_use, dtype, sweep_inames=[], # }}} + new_domains = kernel.domains[:] + if sweep_inames: + new_domains[leaf_domain_index] = new_domain + return kernel.copy( - domain=new_domain, + 
domains=new_domains, instructions=new_insns, substitutions=new_substs, temporary_variables=new_temporary_variables, diff --git a/loopy/isl_helpers.py b/loopy/isl_helpers.py index a509b8a3058c3b375b1b6f787e522e0230217f4b..dab7648af2a4b407cfd693c447b49da9cbf85846 100644 --- a/loopy/isl_helpers.py +++ b/loopy/isl_helpers.py @@ -105,11 +105,10 @@ def dump_space(ls): def make_slab(space, iname, start, stop): zero = isl.Aff.zero_on_domain(space) - from islpy import align_spaces if isinstance(start, (isl.Aff, isl.PwAff)): - start = align_spaces(pw_aff_to_aff(start), zero) + start, zero = isl.align_two(pw_aff_to_aff(start), zero) if isinstance(stop, (isl.Aff, isl.PwAff)): - stop = align_spaces(pw_aff_to_aff(stop), zero) + stop, zero = isl.align_two(pw_aff_to_aff(stop), zero) if isinstance(start, int): start = zero + start if isinstance(stop, int): stop = zero + stop @@ -200,6 +199,11 @@ def static_value_of_pw_aff(pw_aff, constants_only, context=None): def duplicate_axes(isl_obj, duplicate_inames, new_inames): + if isinstance(isl_obj, list): + return [ + duplicate_axes(i, duplicate_inames, new_inames) + for i in isl_obj] + if not duplicate_inames: return isl_obj @@ -244,6 +248,7 @@ def duplicate_axes(isl_obj, duplicate_inames, new_inames): + def is_nonnegative(expr, over_set): space = over_set.get_space() from loopy.symbolic import aff_from_expr diff --git a/loopy/kernel.py b/loopy/kernel.py index 30509831529c6190d055a9df7bd061b2766e0c86..d8d78ece8ce38e8f921cfbe61af27c62b41dc104 100644 --- a/loopy/kernel.py +++ b/loopy/kernel.py @@ -115,14 +115,14 @@ def parse_tag(tag): # {{{ arguments class _ShapedArg(object): - def __init__(self, name, dtype, strides=None, shape=None, order="C", + def __init__(self, name, dtype, shape=None, strides=None, order="C", offset=0): """ All of the following are optional. Specify either strides or shape. 
+ :arg shape: :arg strides: like numpy strides, but in multiples of data type size - :arg shape: :arg order: :arg offset: Offset from the beginning of the vector from which the strides are counted. @@ -299,7 +299,7 @@ class Instruction(Record): """ def __init__(self, id, assignee, expression, - forced_iname_deps=set(), insn_deps=set(), boostable=None, + forced_iname_deps=frozenset(), insn_deps=set(), boostable=None, boostable_into=None, temp_var_type=None, duplicate_inames_and_tags=[]): @@ -309,7 +309,7 @@ class Instruction(Record): if isinstance(expression, str): assignee = parse(expression) - assert isinstance(forced_iname_deps, set) + assert isinstance(forced_iname_deps, frozenset) assert isinstance(insn_deps, set) Record.__init__(self, @@ -513,6 +513,24 @@ def default_preamble_generator(seen_dtypes, seen_functions): #include """) + c_funcs = set(c_name for name, c_name, arg_dtypes in seen_functions) + if "int_floor_div" in c_funcs: + yield ("05_int_floor_div", """ + #define int_floor_div(a,b) \ + (( (a) - \ + ( ( (a)<0 ) != ( (b)<0 )) \ + *( (b) + ( (b)<0 ) - ( (b)>=0 ) )) \ + / (b) ) + """) + + if "int_floor_div_pos_b" in c_funcs: + yield ("05_int_floor_div_pos_b", """ + #define int_floor_div_pos_b(a,b) ( \ + ( (a) - ( ((a)<0) ? ((b)-1) : 0 ) ) / (b) \ + ) + """) + + # }}} # {{{ loop kernel object @@ -525,11 +543,58 @@ def _generate_unique_possibilities(prefix): yield "%s_%d" % (prefix, try_num) try_num += 1 +_IDENTIFIER_RE = re.compile(r"\b([a-zA-Z_][a-zA-Z0-9_]*)\b") + +def _gather_identifiers(s): + return set(_IDENTIFIER_RE.findall(s)) + +def _parse_domains(ctx, args_and_vars, domains): + result = [] + available_parameters = args_and_vars.copy() + used_inames = set() + + for dom in domains: + if isinstance(dom, str): + if not dom.lstrip().startswith("["): + # i.e. 
if no parameters are already given + ids = _gather_identifiers(dom) + parameters = ids & available_parameters + dom = "[%s] -> %s" % (",".join(parameters), dom) + + try: + dom = isl.Set.read_from_str(ctx, dom) + except: + print "failed to parse domain '%s'" % dom + raise + else: + assert isinstance(dom, (isl.Set, isl.BasicSet)) + # assert dom.get_ctx() == ctx + + for i_iname in xrange(dom.dim(dim_type.set)): + iname = dom.get_dim_name(dim_type.set, i_iname) + + if iname is None: + raise RuntimeError("domain '%s' provided no iname at index " + "%d (redefined iname?)" % (dom, i_iname)) + + if iname in used_inames: + raise RuntimeError("domain '%s' redefines iname '%s' " + "that is part of a previous domain" % (dom, iname)) + + used_inames.add(iname) + available_parameters.add(iname) + + result.append(dom) + + return result + + + class LoopKernel(Record): """ :ivar device: :class:`pyopencl.Device` - :ivar domain: :class:`islpy.BasicSet` + :ivar domains: :class:`islpy.BasicSet` :ivar instructions: :ivar args: :ivar schedule: @@ -580,7 +645,7 @@ class LoopKernel(Record): :ivar iname_to_tag_requests: """ - def __init__(self, device, domain, instructions, args=None, schedule=None, + def __init__(self, device, domains, instructions, args=[], schedule=None, name="loopy_kernel", preambles=[], preamble_generators=[default_preamble_generator], @@ -602,7 +667,8 @@ class LoopKernel(Record): applied_iname_rewrites=[], cache_manager=None, iname_to_tag_requests=None, - lowest_priority_inames=[], breakable_inames=set()): + lowest_priority_inames=[], breakable_inames=set(), + index_dtype=np.int32): """ :arg domain: a :class:`islpy.BasicSet`, or a string parseable to a basic set by the isl. 
Example: "{[i,j]: 0<=i < 10 and 0<= j < 9}" @@ -614,12 +680,10 @@ class LoopKernel(Record): if cache_manager is None: cache_manager = SetOperationCacheManager() - if isinstance(domain, str): - ctx = isl.Context() - domain = isl.Set.read_from_str(ctx, domain) - iname_to_tag_requests = {} + # {{{ parse instructions + INAME_ENTRY_RE = re.compile( r"^\s*(?P\w+)\s*(?:\:\s*(?P[\w.]+))?\s*$") INSN_RE = re.compile( @@ -662,8 +726,6 @@ class LoopKernel(Record): return result - # {{{ instruction parser - def parse_insn(insn): insn_match = INSN_RE.match(insn) subst_match = SUBST_RE.match(insn) @@ -695,10 +757,10 @@ class LoopKernel(Record): if groups["iname_deps_and_tags"] is not None: inames_and_tags = parse_iname_and_tag_list( groups["iname_deps_and_tags"]) - forced_iname_deps = set(iname for iname, tag in inames_and_tags) + forced_iname_deps = frozenset(iname for iname, tag in inames_and_tags) iname_to_tag_requests.update(dict(inames_and_tags)) else: - forced_iname_deps = set() + forced_iname_deps = frozenset() if groups["duplicate_inames_and_tags"] is not None: duplicate_inames_and_tags = parse_iname_and_tag_list( @@ -720,9 +782,9 @@ class LoopKernel(Record): raise RuntimeError("left hand side of assignment '%s' must " "be variable or subscript" % lhs) - insns.append( + parsed_instructions.append( Instruction( - id=self.make_unique_instruction_id(insns, based_on=label), + id=self.make_unique_instruction_id(parsed_instructions, based_on=label), insn_deps=insn_deps, forced_iname_deps=forced_iname_deps, assignee=lhs, expression=rhs, @@ -756,8 +818,8 @@ class LoopKernel(Record): def parse_if_necessary(insn): if isinstance(insn, Instruction): if insn.id is None: - insn = insn.copy(id=self.make_unique_instruction_id(insns)) - insns.append(insn) + insn = insn.copy(id=self.make_unique_instruction_id(parsed_instructions)) + parsed_instructions.append(insn) return if not isinstance(insn, str): @@ -768,10 +830,7 @@ class LoopKernel(Record): for insn in expand_defines(insn, defines): 
parse_insn(insn) - - # }}} - - insns = [] + parsed_instructions = [] substitutions = substitutions.copy() @@ -779,23 +838,71 @@ class LoopKernel(Record): # must construct list one-by-one to facilitate unique id generation parse_if_necessary(insn) - if len(set(insn.id for insn in insns)) != len(insns): + if len(set(insn.id for insn in parsed_instructions)) != len(parsed_instructions): raise RuntimeError("instruction ids do not appear to be unique") + # }}} + + # Ordering dependency: + # Domain construction needs to know what temporary variables are + # available. That information can only be obtained once instructions + # are parsed. + + # {{{ construct domains + + if isinstance(domains, str): + domains = [domains] + + ctx = isl.Context() + scalar_arg_names = set(arg.name for arg in args if isinstance(arg, ScalarArg)) + var_names = ( + set(temporary_variables) + | set(insn.get_assignee_var_name() + for insn in parsed_instructions + if insn.temp_var_type is not None)) + domains = _parse_domains(ctx, scalar_arg_names | var_names, domains) + + # }}} + + # {{{ process assumptions + if assumptions is None: - assumptions_space = domain.get_space().params() + dom0_space = domains[0].get_space() + assumptions_space = isl.Space.params_alloc( + dom0_space.get_ctx(), dom0_space.dim(dim_type.param)) + for i in xrange(dom0_space.dim(dim_type.param)): + assumptions_space = assumptions_space.set_dim_name( + dim_type.param, i, dom0_space.get_dim_name(dim_type.param, i)) assumptions = isl.Set.universe(assumptions_space) elif isinstance(assumptions, str): - s = domain.get_space() - assumptions = isl.BasicSet.read_from_str(domain.get_ctx(), - "[%s] -> { : %s}" - % (",".join(s.get_dim_name(dim_type.param, i) - for i in range(s.dim(dim_type.param))), - assumptions)) + all_inames = set() + all_params = set() + for dom in domains: + all_inames.update(dom.get_var_names(dim_type.set)) + all_params.update(dom.get_var_names(dim_type.param)) + + domain_parameters = all_params-all_inames + + 
assumptions_set_str = "[%s] -> { : %s}" \ + % (",".join(s for s in domain_parameters), + assumptions) + assumptions = isl.BasicSet.read_from_str(domains[0].get_ctx(), + assumptions_set_str) + + assert assumptions.is_params() + + # }}} + + index_dtype = np.dtype(index_dtype) + if index_dtype.kind != 'i': + raise TypeError("index_dtype must be an integer") + if np.iinfo(index_dtype).min >= 0: + raise TypeError("index_dtype must be signed") Record.__init__(self, - device=device, domain=domain, instructions=insns, + device=device, domains=domains, + instructions=parsed_instructions, args=args, schedule=schedule, name=name, @@ -813,7 +920,8 @@ class LoopKernel(Record): breakable_inames=breakable_inames, applied_iname_rewrites=applied_iname_rewrites, function_manglers=function_manglers, - symbol_manglers=symbol_manglers) + symbol_manglers=symbol_manglers, + index_dtype=index_dtype) # {{{ function mangling @@ -831,6 +939,8 @@ class LoopKernel(Record): # }}} + # {{{ unique ids + def make_unique_instruction_id(self, insns=None, based_on="insn", extra_used_ids=set()): if insns is None: insns = self.instructions @@ -841,16 +951,225 @@ class LoopKernel(Record): if id_str not in used_ids: return id_str + # }}} + + # {{{ name listing + @memoize_method def all_inames(self): - from islpy import dim_type - return set(self.space.get_var_dict(dim_type.set).iterkeys()) + result = set() + for dom in self.domains: + result.update(dom.get_var_names(dim_type.set)) + return frozenset(result) @memoize_method def non_iname_variable_names(self): return (set(self.arg_dict.iterkeys()) | set(self.temporary_variables.iterkeys())) + # }}} + + # {{{ domain handling + + @memoize_method + def parents_per_domain(self): + """Return a list corresponding to self.domains (by index) + containing domain indices which are nested around this + domain. + + Each domains nest list walks from the leaves of the nesting + tree to the root. 
+ """ + + # The stack of iname sets records which inames are active + # as we step through the linear list of domains. It also + # determines the granularity of inames to be popped/decactivated + # if we ascend a level. + + iname_set_stack = [] + result = [] + + writer_map = self.writer_map() + + for dom in self.domains: + parameters = set(dom.get_var_names(dim_type.param)) + inames = set(dom.get_var_names(dim_type.set)) + + # This next domain may be nested inside the previous domain. + # Or it may not, in which case we need to figure out how many + # levels of parents we need to discard in order to find the + # true parent. + + discard_level_count = 0 + while discard_level_count < len(iname_set_stack): + # {{{ check for parenthood by loop bound iname + + last_inames = iname_set_stack[-1-discard_level_count] + if last_inames & parameters: + break + + # }}} + + # {{{ check for parenthood by written variable + + is_parent_by_variable = False + for par in parameters: + if par in self.temporary_variables: + writer_insns = writer_map[par] + + if len(writer_insns) > 1: + raise RuntimeError("loop bound '%s' " + "may only be written to once" % par) + + writer_insn, = writer_insns + writer_inames = self.insn_inames(writer_insn) + + if writer_inames & last_inames: + is_parent_by_variable = True + break + + if is_parent_by_variable: + break + + # }}} + + discard_level_count += 1 + + if discard_level_count: + iname_set_stack = iname_set_stack[:-discard_level_count] + + if result: + parent = len(result)-1 + else: + parent = None + + for i in range(discard_level_count): + assert parent is not None + parent = result[parent] + + # found this domain's parent + result.append(parent) + + if iname_set_stack: + parent_inames = iname_set_stack[-1] + else: + parent_inames = set() + iname_set_stack.append(parent_inames | inames) + + return result + + @memoize_method + def all_parents_per_domain(self): + """Return a list corresponding to self.domains (by index) + containing domain indices 
which are nested around this + domain. + + Each domains nest list walks from the leaves of the nesting + tree to the root. + """ + result = [] + + ppd = self.parents_per_domain() + for dom, parent in zip(self.domains, ppd): + # keep walking up tree to find *all* parents + dom_result = [] + while parent is not None: + dom_result.insert(0, parent) + parent = ppd[parent] + + result.append(dom_result) + + return result + + @memoize_method + def _get_home_domain_map(self): + return dict( + (iname, i_domain) + for i_domain, dom in enumerate(self.domains) + for iname in dom.get_var_names(dim_type.set)) + + def get_home_domain_index(self, iname): + return self._get_home_domain_map()[iname] + + @memoize_method + def combine_domains(self, domains): + """ + :arg domains: domain indices of domains to be combined. More 'dominant' + domains (those which get most say on the actual dim_type of an iname) + must be later in the order. + """ + assert isinstance(domains, tuple) # for caching + + if not domains: + return isl.BasicSet.universe(self.domains[0].get_space()) + + result = None + for dom_index in domains: + dom = self.domains[dom_index] + if result is None: + result = dom + else: + aligned_dom, aligned_result = isl.align_two( + dom, result, across_dim_types=True) + result = aligned_result & aligned_dom + + return result + + def get_inames_domain(self, inames): + if not inames: + return self.combine_domains(()) + + if isinstance(inames, str): + inames = frozenset([inames]) + if not isinstance(inames, frozenset): + inames = frozenset(inames) + + from warnings import warn + warn("get_inames_domain did not get a frozenset", stacklevel=2) + + return self._get_inames_domain_backend(inames) + + @memoize_method + def get_leaf_domain_index(self, inames): + """Find the leaf of the domain tree needed to cover all inames.""" + + hdm = self._get_home_domain_map() + ppd = self.all_parents_per_domain() + + domain_indices = set() + + leaf_domain_index = None + + for iname in inames: + 
home_domain_index = hdm[iname] + if home_domain_index in domain_indices: + # nothin' new + continue + + leaf_domain_index = home_domain_index + + all_parents = set(ppd[home_domain_index]) + if not domain_indices <= all_parents: + raise RuntimeError("iname set '%s' requires " + "branch in domain tree (when adding '%s')" + % (", ".join(inames), iname)) + + domain_indices.add(home_domain_index) + domain_indices.update(all_parents) + + return leaf_domain_index + + @memoize_method + def _get_inames_domain_backend(self, inames): + leaf_dom_idx = self.get_leaf_domain_index(inames) + + return self.combine_domains(tuple(sorted( + self.all_parents_per_domain()[leaf_dom_idx] + + [leaf_dom_idx] + ))) + + # }}} + @memoize_method def all_insn_inames(self): """Return a mapping from instruction ids to inames inside which @@ -929,15 +1248,8 @@ class LoopKernel(Record): """ result = {} - admissible_vars = ( - set(arg.name for arg in self.args) - | set(self.temporary_variables.iterkeys())) - for insn in self.instructions: var_name = insn.get_assignee_var_name() - - if var_name not in admissible_vars: - raise RuntimeError("variable '%s' not declared or not allowed for writing" % var_name) var_names = [var_name] for var_name in var_names: @@ -945,11 +1257,6 @@ class LoopKernel(Record): return result - @property - @memoize_method - def iname_to_dim(self): - return self.domain.get_space().get_var_dict() - @memoize_method def get_written_variables(self): return set( @@ -962,7 +1269,7 @@ class LoopKernel(Record): set(self.temporary_variables.iterkeys()) | set(self.substitutions.iterkeys()) | set(arg.name for arg in self.args) - | set(self.iname_to_dim.keys())) + | set(self.all_inames())) def make_unique_var_name(self, based_on="var", extra_used_vars=set()): used_vars = self.all_variable_names() | extra_used_vars @@ -989,11 +1296,6 @@ class LoopKernel(Record): def id_to_insn(self): return dict((insn.id, insn) for insn in self.instructions) - @property - @memoize_method - def space(self): - 
return self.domain.get_space() - @property @memoize_method def arg_dict(self): @@ -1005,38 +1307,54 @@ class LoopKernel(Record): if self.args is None: return [] else: - loop_arg_names = [self.space.get_dim_name(dim_type.param, i) - for i in range(self.space.dim(dim_type.param))] + from pytools import flatten + loop_arg_names = list(flatten(dom.get_var_names(dim_type.param) + for dom in self.domains)) return [arg.name for arg in self.args if isinstance(arg, ScalarArg) if arg.name in loop_arg_names] @memoize_method def get_iname_bounds(self, iname): - dom_intersect_assumptions = ( - isl.align_spaces(self.assumptions, self.domain) - & self.domain) + domain = self.get_inames_domain(frozenset([iname])) + d_var_dict = domain.get_var_dict() + + dom_intersect_assumptions = (isl.align_spaces( + self.assumptions, domain, obj_bigger_ok=True) + & domain) + lower_bound_pw_aff = ( self.cache_manager.dim_min( dom_intersect_assumptions, - self.iname_to_dim[iname][1]) + d_var_dict[iname][1]) .coalesce()) upper_bound_pw_aff = ( self.cache_manager.dim_max( dom_intersect_assumptions, - self.iname_to_dim[iname][1]) + d_var_dict[iname][1]) .coalesce()) class BoundsRecord(Record): pass size = (upper_bound_pw_aff - lower_bound_pw_aff + 1) - size = size.intersect_domain(self.assumptions) + size = size.gist(self.assumptions) return BoundsRecord( lower_bound_pw_aff=lower_bound_pw_aff, upper_bound_pw_aff=upper_bound_pw_aff, size=size) + def find_var_base_indices_and_shape_from_inames( + self, inames, cache_manager, context=None): + if not inames: + return [], [] + + base_indices_and_sizes = [ + cache_manager.base_index_and_length( + self.get_inames_domain(iname), iname, context) + for iname in inames] + return zip(*base_indices_and_sizes) + @memoize_method def get_constant_iname_length(self, iname): from loopy.isl_helpers import static_max_of_pw_aff @@ -1103,8 +1421,6 @@ class LoopKernel(Record): size_list = [] sorted_axes = sorted(size_dict.iterkeys()) - zero_aff = 
isl.Aff.zero_on_domain(self.space.params()) - while sorted_axes or forced_sizes: if sorted_axes: cur_axis = sorted_axes.pop(0) @@ -1113,16 +1429,14 @@ class LoopKernel(Record): if len(size_list) in forced_sizes: size_list.append( - isl.PwAff.from_aff( - zero_aff + forced_sizes.pop(len(size_list)))) + forced_sizes.pop(len(size_list))) continue assert cur_axis is not None - while cur_axis > len(size_list): + if cur_axis > len(size_list): raise RuntimeError("%s axis %d unused" % ( which, len(size_list))) - size_list.append(zero_aff + 1) size_list.append(size_dict[cur_axis]) @@ -1140,7 +1454,7 @@ class LoopKernel(Record): def tup_to_exprs(tup): from loopy.symbolic import pw_aff_to_expr - return tuple(pw_aff_to_expr(i) for i in tup) + return tuple(pw_aff_to_expr(i, int_ok=True) for i in tup) return tup_to_exprs(grid_size), tup_to_exprs(group_size) @@ -1161,8 +1475,12 @@ class LoopKernel(Record): always nested around them. """ result = {} + + # {{{ examine instructions + iname_to_insns = self.iname_to_insns() + # examine pairs of all inames--O(n**2), I know. 
for inner_iname in self.all_inames(): result[inner_iname] = set() for outer_iname in self.all_inames(): @@ -1172,6 +1490,20 @@ class LoopKernel(Record): if iname_to_insns[inner_iname] < iname_to_insns[outer_iname]: result[inner_iname].add(outer_iname) + # }}} + + # {{{ examine domains + + for i_dom, (dom, parent_indices) in enumerate( + zip(self.domains, self.all_parents_per_domain())): + for parent_index in parent_indices: + for iname in dom.get_var_names(dim_type.set): + parent = self.domains[parent_index] + for parent_iname in parent.get_var_names(dim_type.set): + result[iname].add(parent_iname) + + # }}} + return result def map_expressions(self, func, exclude_instructions=False): @@ -1197,8 +1529,9 @@ class LoopKernel(Record): lines.append("%s: %s" % (iname, self.iname_to_tag.get(iname))) lines.append(sep) - lines.append("DOMAIN:") - lines.append(str(self.domain)) + lines.append("DOMAINS:") + for dom, parents in zip(self.domains, self.all_parents_per_domain()): + lines.append(len(parents)*" " + str(dom)) if self.substitutions: lines.append(sep) @@ -1306,31 +1639,6 @@ def find_all_insn_inames(instructions, all_inames, -def find_var_base_indices_and_shape_from_inames( - domain, inames, cache_manager, context=None): - base_indices = [] - shape = [] - - iname_to_dim = domain.get_space().get_var_dict() - for iname in inames: - lower_bound_pw_aff = cache_manager.dim_min(domain, iname_to_dim[iname][1]) - upper_bound_pw_aff = cache_manager.dim_max(domain, iname_to_dim[iname][1]) - - from loopy.isl_helpers import static_max_of_pw_aff, static_value_of_pw_aff - from loopy.symbolic import pw_aff_to_expr - - shape.append(pw_aff_to_expr(static_max_of_pw_aff( - upper_bound_pw_aff - lower_bound_pw_aff + 1, constants_only=True, - context=context))) - base_indices.append(pw_aff_to_expr( - static_value_of_pw_aff(lower_bound_pw_aff, constants_only=False, - context=context))) - - return base_indices, shape - - - - def get_dot_dependency_graph(kernel, iname_cluster=False, 
iname_edge=True): lines = [] for insn in kernel.instructions: @@ -1377,7 +1685,22 @@ class SetOperationCacheManager: def dim_max(self, set, *args): return self.op(set, "dim_max", set.dim_max, args) + def base_index_and_length(self, set, iname, context=None): + iname_to_dim = set.space.get_var_dict() + lower_bound_pw_aff = self.dim_min(set, iname_to_dim[iname][1]) + upper_bound_pw_aff = self.dim_max(set, iname_to_dim[iname][1]) + + from loopy.isl_helpers import static_max_of_pw_aff, static_min_of_pw_aff + from loopy.symbolic import pw_aff_to_expr + + size = pw_aff_to_expr(static_max_of_pw_aff( + upper_bound_pw_aff - lower_bound_pw_aff + 1, constants_only=True, + context=context)) + base_index = pw_aff_to_expr( + static_min_of_pw_aff(lower_bound_pw_aff, constants_only=False, + context=context)) + return base_index, size diff --git a/loopy/reduction.py b/loopy/reduction.py index 8b687cfc811b01033db07aa1f73215a10fd2eb63..9a09b3c0793c431212cb42e55c6a0b9504dae0fd 100644 --- a/loopy/reduction.py +++ b/loopy/reduction.py @@ -93,8 +93,8 @@ class _ArgExtremumReductionOperation(ReductionOperation): struct_dtype = np.dtype([("value", dtype), ("index", np.int32)]) ARGEXT_STRUCT_DTYPES[dtype] = struct_dtype - from pyopencl.tools import register_dtype - register_dtype(struct_dtype, self.prefix(dtype)+"_result", alias_ok=True) + from pyopencl.tools import get_or_register_dtype + get_or_register_dtype(self.prefix(dtype)+"_result", struct_dtype) return struct_dtype def neutral_element(self, dtype, inames): diff --git a/loopy/schedule.py b/loopy/schedule.py index 686f6f8e73a970cd5684d1715802d3ba202210fc..a6bc240c4e3382d37573d98f246bce9b082c0f09 100644 --- a/loopy/schedule.py +++ b/loopy/schedule.py @@ -446,9 +446,41 @@ def generate_loop_schedules_internal(kernel, loop_priority, schedule=[], allow_b useful_loops = [] for iname in needed_inames: - if not kernel.loop_nest_map()[iname] <= active_inames_set | parallel_inames: + + # {{{ check if scheduling this iname now is 
allowed/plausible + + currently_accessible_inames = active_inames_set | parallel_inames + if not kernel.loop_nest_map()[iname] <= currently_accessible_inames: + continue + + iname_home_domain = kernel.domains[kernel.get_home_domain_index(iname)] + from islpy import dim_type + iname_home_domain_params = set(iname_home_domain.get_var_names(dim_type.param)) + + # The previous check should have ensured this is true, because + # Kernel.loop_nest_map takes the domain dependency graph into + # consideration. + assert (iname_home_domain_params & kernel.all_inames() + <= currently_accessible_inames) + + # Check if any parameters are temporary variables, and if so, if their + # writes have already been scheduled. + + data_dep_written = True + for domain_par in ( + iname_home_domain_params + & + set(kernel.temporary_variables)): + writer_insn, = kernel.writer_map()[domain_par] + if writer_insn not in scheduled_insn_ids: + data_dep_written = False + break + + if not data_dep_written: continue + # }}} + # {{{ determine if that gets us closer to being able to schedule an insn useful = False diff --git a/loopy/symbolic.py b/loopy/symbolic.py index c7376b4d652156bdf953bae3eaba84dcb63f5065..29b2512ece0f22665d3f5ab36a75fdf6117babf9 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -432,10 +432,12 @@ def aff_to_expr(aff, except_name=None, error_on_name=None): -def pw_aff_to_expr(pw_aff): +def pw_aff_to_expr(pw_aff, int_ok=False): if isinstance(pw_aff, int): - from warnings import warn - warn("expected PwAff, got int", stacklevel=2) + if not int_ok: + from warnings import warn + warn("expected PwAff, got int", stacklevel=2) + return pw_aff pieces = pw_aff.get_pieces() @@ -691,7 +693,7 @@ def get_dependencies(expr): from loopy.symbolic import DependencyMapper dep_mapper = DependencyMapper(composite_leaves=False) - return set(dep.name for dep in dep_mapper(expr)) + return frozenset(dep.name for dep in dep_mapper(expr)) diff --git a/test/test_linalg.py b/test/test_linalg.py index 
a8109433647fb328ab78bad44b20128aa90b9dda..fefdaac15ec87c4c65e89202c951d2b1dbee8f6d 100644 --- a/test/test_linalg.py +++ b/test/test_linalg.py @@ -117,10 +117,7 @@ def test_axpy(ctx_factory): lp.GlobalArg("z", dtype, shape="n,"), lp.ScalarArg("n", np.int32, approximately=n), ], - name="axpy", assumptions="n>=1", - preamble=""" - #include - """) + name="axpy", assumptions="n>=1") seq_knl = knl @@ -139,11 +136,13 @@ def test_axpy(ctx_factory): return knl for variant in [variant_cpu, variant_gpu]: + #for variant in [ variant_gpu]: kernel_gen = lp.generate_loop_schedules(variant(knl)) kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, - op_count=np.dtype(dtype).itemsize*n*3/1e9, op_label="GBytes", + op_count=[np.dtype(dtype).itemsize*n*3/1e9], + op_label=["GBytes"], parameters={"a": a, "b": b, "n": n}, check_result=check) @@ -180,7 +179,7 @@ def test_transpose(ctx_factory): kernel_gen = lp.check_kernels(kernel_gen, {}) lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, - op_count=dtype.itemsize*n**2*2/1e9, op_label="GByte", + op_count=[dtype.itemsize*n**2*2/1e9], op_label=["GByte"], parameters={}) @@ -627,7 +626,7 @@ def test_image_matrix_mul_ilp(ctx_factory): kernel_gen = lp.check_kernels(kernel_gen, dict(n=n)) lp.auto_test_vs_ref(seq_knl, ctx, kernel_gen, - op_count=2*n**3/1e9, op_label="GFlops", + op_count=[2*n**3/1e9], op_label=["GFlops"], parameters={}) diff --git a/test/test_loopy.py b/test/test_loopy.py index e964f60139f2813ea40e26692e3dd31a1e6a556f..2c447e9cffcad0dfb44176934363479075c852d9 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -92,14 +92,15 @@ def test_stencil(ctx_factory): knl = lp.make_kernel(ctx.devices[0], "{[i,j]: 0<= i,j < 32}", [ - "[i] z[i,j] = -2*a[i,j]" + "[i] z[i,j] = -2*a[i,j]" " + a[i,j-1]" " + a[i,j+1]" " + a[i-1,j]" " + a[i+1,j]" ], [ - lp.GlobalArg("a", np.float32, shape=(32,32,)) + lp.GlobalArg("a", np.float32, shape=(32,32,)), + lp.GlobalArg("z", np.float32, shape=(32,32,)) ]) @@ 
-173,7 +174,7 @@ def test_argmax(ctx_factory): a = np.random.randn(10000).astype(dtype) cknl = lp.CompiledKernel(ctx, knl) - evt, (max_idx, max_val) = cknl(queue, a=a) + evt, (max_idx, max_val) = cknl(queue, a=a, out_host=True) assert max_val == np.max(np.abs(a)) assert max_idx == np.where(np.abs(a)==max_val)[-1] @@ -224,8 +225,8 @@ def make_random_expression(var_values, size): return make_random_expression(var_values, size) / make_random_expression(var_values, size) -def generate_random_fuzz_examples(): - for i in xrange(20): +def generate_random_fuzz_examples(count): + for i in xrange(count): size = [0] var_values = {} expr = make_random_expression(var_values, size) @@ -235,8 +236,8 @@ def test_fuzz_code_generator(ctx_factory): ctx = ctx_factory() queue = cl.CommandQueue(ctx) - from expr_fuzz import get_fuzz_examples - for expr, var_values in generate_random_fuzz_examples(): + #from expr_fuzz import get_fuzz_examples + for expr, var_values in generate_random_fuzz_examples(20): #for expr, var_values in get_fuzz_examples(): from pymbolic import evaluate true_value = evaluate(expr, var_values) @@ -255,7 +256,7 @@ def test_fuzz_code_generator(ctx_factory): for name, val in var_values.iteritems() ]) ck = lp.CompiledKernel(ctx, knl) - evt, (lp_value,) = ck(queue, **var_values) + evt, (lp_value,) = ck(queue, out_host=True, **var_values) err = abs(true_value-lp_value)/abs(true_value) if abs(err) > 1e-10: print "---------------------------------------------------------------------" @@ -278,6 +279,245 @@ def test_fuzz_code_generator(ctx_factory): +def test_empty_reduction(ctx_factory): + dtype = np.dtype(np.float32) + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + knl = lp.make_kernel(ctx.devices[0], + [ + "{[i]: 0<=i<20}", + "{[j]: 0<=j<0}" + ], + [ + "a[i] = sum(j, j)", + ], + [ + lp.GlobalArg("a", dtype, (20,)), + ]) + cknl = lp.CompiledKernel(ctx, knl) + + evt, (a,) = cknl(queue) + + assert (a.get() == 0).all() + + + + + +def 
test_nested_dependent_reduction(ctx_factory):
+ dtype = np.dtype(np.int32)
+ ctx = ctx_factory()
+ queue = cl.CommandQueue(ctx)
+
+ knl = lp.make_kernel(ctx.devices[0],
+ [
+ "{[i]: 0<=i<n}",
+ "{[j]: 0<=j<i+sumlen}"
+ ],
+ [
+ "<> sumlen = l[i]",
+ "a[i] = sum(j, j)",
+ ],
+ [
+ lp.ScalarArg("n", np.int32),
+ lp.GlobalArg("a", dtype, ("n",)),
+ lp.GlobalArg("l", np.int32, ("n",)),
+ ])
+
+ cknl = lp.CompiledKernel(ctx, knl)
+
+ n = 330
+ l = np.arange(n, dtype=np.int32)
+ evt, (a,) = cknl(queue, l=l, n=n, out_host=True)
+
+ tgt_result = (2*l-1)*2*l/2
+ assert (a == tgt_result).all()
+
+
+
+
+
+
+def test_dependent_loop_bounds(ctx_factory):
+ dtype = np.dtype(np.float32)
+ ctx = ctx_factory()
+
+ knl = lp.make_kernel(ctx.devices[0],
+ [
+ "{[i]: 0<=i<n}",
+ "{[jj]: 0<=jj<row_len}",
+ ],
+ [
+ "<> row_len = a_rowstarts[i+1] - a_rowstarts[i]",
+ "ax[i] = sum(jj, a_values[a_rowstarts[i]+jj])",
+ ],
+ [
+ lp.GlobalArg("a_rowstarts", np.int32),
+ lp.GlobalArg("a_indices", np.int32),
+ lp.GlobalArg("a_values", dtype),
+ lp.GlobalArg("x", dtype),
+ lp.GlobalArg("ax", dtype),
+ lp.ScalarArg("n", np.int32),
+ ],
+ assumptions="n>=1 and row_len>=1")
+
+ cknl = lp.CompiledKernel(ctx, knl)
+ print "---------------------------------------------------"
+ cknl.print_code()
+ print "---------------------------------------------------"
+
+
+
+def test_dependent_loop_bounds_2(ctx_factory):
+ dtype = np.dtype(np.float32)
+ ctx = ctx_factory()
+
+ knl = lp.make_kernel(ctx.devices[0],
+ [
+ "{[i]: 0<=i<n}",
+ "{[jj]: 0<=jj<row_len}",
+ ],
+ [
+ "<> row_start = a_rowstarts[i]",
+ "<> row_len = a_rowstarts[i+1] - row_start",
+ "ax[i] = sum(jj, a_values[row_start+jj])",
+ ],
+ [
+ lp.GlobalArg("a_rowstarts", np.int32),
+ lp.GlobalArg("a_indices", np.int32),
+ lp.GlobalArg("a_values", dtype),
+ lp.GlobalArg("x", dtype),
+ lp.GlobalArg("ax", dtype),
+ lp.ScalarArg("n", np.int32),
+ ],
+ assumptions="n>=1 and row_len>=1")
+
+ knl = lp.split_dimension(knl, "i", 128, outer_tag="g.0",
+ inner_tag="l.0")
+ cknl = lp.CompiledKernel(ctx, knl)
+ print "---------------------------------------------------"
+ cknl.print_code()
print "---------------------------------------------------"
+
+
+
+
+def test_dependent_loop_bounds_3(ctx_factory):
+ # The point of this test is that it shows a dependency between
+ # domains that is exclusively mediated by the row_len temporary.
+ # It also makes sure that row_len gets read before any
+ # conditionals use it.
+
+ dtype = np.dtype(np.float32)
+ ctx = ctx_factory()
+
+ knl = lp.make_kernel(ctx.devices[0],
+ [
+ "{[i]: 0<=i<n}",
+ "{[jj]: 0<=jj<row_len}",
+ ],
+ [
+ "<> row_len = a_row_lengths[i]",
+ "a[i,jj] = 1",
+ ],
+ [
+ lp.GlobalArg("a_row_lengths", np.int32),
+ lp.GlobalArg("a", dtype, shape=("n,n"), order="C"),
+ lp.ScalarArg("n", np.int32),
+ ])
+
+ assert knl.parents_per_domain()[1] == 0
+
+ knl = lp.split_dimension(knl, "i", 128, outer_tag="g.0",
+ inner_tag="l.0")
+
+ cknl = lp.CompiledKernel(ctx, knl)
+ print "---------------------------------------------------"
+ cknl.print_code()
+ print "---------------------------------------------------"
+
+ knl_bad = lp.split_dimension(knl, "jj", 128, outer_tag="g.1",
+ inner_tag="l.1")
+
+ import pytest
+ with pytest.raises(RuntimeError):
+ list(lp.generate_loop_schedules(knl_bad))
+
+
+
+def test_independent_multi_domain(ctx_factory):
+ dtype = np.dtype(np.float32)
+ ctx = ctx_factory()
+ queue = cl.CommandQueue(ctx)
+
+ knl = lp.make_kernel(ctx.devices[0],
+ [
+ "{[i]: 0<=i<n}",
+ "{[j]: 0<=j<n}",
+ ],
+ [
+ "a[i] = 1",
+ "b[j] = 2",
+ ],
+ [
+ lp.GlobalArg("a", dtype, shape=("n"), order="C"),
+ lp.GlobalArg("b", dtype, shape=("n"), order="C"),
+ lp.ScalarArg("n", np.int32),
+ ])
+
+ knl = lp.split_dimension(knl, "i", 16, outer_tag="g.0",
+ inner_tag="l.0")
+ knl = lp.split_dimension(knl, "j", 16, outer_tag="g.0",
+ inner_tag="l.0")
+ assert knl.parents_per_domain() == 2*[None]
+
+ n = 50
+ cknl = lp.CompiledKernel(ctx, knl)
+ evt, (a, b) = cknl(queue, n=n, out_host=True)
+
+ assert a.shape == (50,)
+ assert b.shape == (50,)
+ assert (a == 1).all()
+ assert (b == 2).all()
+
+
+
+def test_bare_data_dependency(ctx_factory):
+ dtype = np.dtype(np.float32)
+ ctx = ctx_factory()
+ queue = cl.CommandQueue(ctx)
+
+ knl = lp.make_kernel(ctx.devices[0],
+ [
+ "[znirp] -> {[i]: 0<=i<znirp}",
+ ],
+ [
+ "<> znirp = n",
+ "a[i] = 1",
+ ],
+ [
+ lp.GlobalArg("a", dtype, shape=("n"), order="C"),
+ lp.ScalarArg("n", np.int32),
+ ])
+
+ cknl = lp.CompiledKernel(ctx, knl)
+ n = 20000
+ evt, (a,) = cknl(queue, n=n, out_host=True)
+
+ assert a.shape == (n,)
+ assert (a == 1).all()
+
+
+
 if __name__ == "__main__":
 import sys
 if len(sys.argv) > 1: