diff --git a/loopy/check.py b/loopy/check.py index e66af04d2fe4dfc2e1f5a99281783feecec2bee7..d46281fbf32eaeeee3d3e51ac3dc20b37b47f71c 100644 --- a/loopy/check.py +++ b/loopy/check.py @@ -384,6 +384,7 @@ class _AccessCheckMapper(WalkMapper): WalkMapper.map_subscript(self, expr, domain) from pymbolic.primitives import Variable + from pymbolic.mapper.evaluator import UnknownVariableError assert isinstance(expr.aggregate, Variable) shape = None @@ -420,8 +421,33 @@ class _AccessCheckMapper(WalkMapper): expr.aggregate.name, expr, len(subscript), len(shape))) + # apply predicates + access_range = domain + insn = self.kernel.id_to_insn[self.insn_id] + possible_warns = [] + if insn.predicates: + from loopy.symbolic import constraints_from_expr + for pred in insn.predicates: + if insn.within_inames & get_dependencies(pred): + with isl.SuppressedWarnings(domain.get_ctx()): + try: + constraints = constraints_from_expr( + domain.space, pred) + for constraint in constraints: + access_range = access_range.add_constraint( + constraint) + + except isl.Error: + # non-affine predicate - store for warning if we fail + # this check + possible_warns += [pred] + except UnknownVariableError: + # data dependent bounds + pass + try: - access_range = get_access_range(domain, subscript) + access_range = get_access_range(access_range, subscript, + self.kernel.assumptions) except UnableToDetermineAccessRange: # Likely: index was non-affine, nothing we can do. 
return @@ -439,6 +465,13 @@ class _AccessCheckMapper(WalkMapper): shape_domain = shape_domain.intersect(slab) if not access_range.is_subset(shape_domain): + if possible_warns: + import logging + logger = logging.getLogger(__name__) + logger.info("Predicates: ({}) are expressed in a " + "non-affine manner, and were not considered " + "for out-of-bounds array checking.".format( + ", ".join(str(x) for x in possible_warns))) raise LoopyError("'%s' in instruction '%s' " "accesses out-of-bounds array element (could not" " establish '%s' is a subset of '%s')." diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 3cdc0708d0d7c89e4307f64cf7ab04879892cec8..4097e36c251c000e46dc96fccb1faad1619c163c 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1413,6 +1413,66 @@ class PwAffEvaluationMapper(EvaluationMapperBase, IdentityMapperMixin): "for as-pwaff evaluation" % expr) +class ConditionalMapper(EvaluationMapperBase, IdentityMapperMixin): + def __init__(self, space, vars_to_zero): + self.pw_map = PwAffEvaluationMapper(space, vars_to_zero) + super(ConditionalMapper, self).__init__(self.pw_map.context.copy()) + + def map_logical_not(self, expr): + constraints = self.rec(expr.child) + out = [] + for constraint in constraints: + negated = constraint.get_aff().neg() + if constraint.is_equality(): + out.append(isl.Constraint.equality_from_aff(negated)) + else: + # since we're flipping a >= need to account for the ='s + val = int(str(constraint.get_constant_val())) + if val > 0: + val = 1 + elif val < 0: + val = -1 + out.append(isl.Constraint.inequality_from_aff(negated + val)) + return out + + def map_logical_and(self, expr): + from pymbolic.mapper.evaluator import UnknownVariableError + constraints = [] + for child in expr.children: + try: + constraints += [c for c in self.rec(child)] + except UnknownVariableError: + # the child contained data-dependent conditionals -> can't apply + pass + return constraints + + map_logical_or = map_logical_and + + def 
map_constant(self, expr): + return self.pw_map(expr) + + def map_comparison(self, expr): + left = self.rec(expr.left) + right = self.rec(expr.right) + _, aff = (left - right).get_pieces()[-1] + if expr.operator == "==": + return [isl.Constraint.equality_from_aff(aff)] + elif expr.operator == "!=": + # piecewise + return [isl.Constraint.inequality_from_aff(aff + 1), + isl.Constraint.inequality_from_aff(aff - 1)] + elif expr.operator == "<": + return [isl.Constraint.inequality_from_aff((aff + 1).neg())] + elif expr.operator == "<=": + return [isl.Constraint.inequality_from_aff((aff).neg())] + elif expr.operator == ">": + return [isl.Constraint.inequality_from_aff((aff - 1))] + elif expr.operator == ">=": + return [isl.Constraint.inequality_from_aff((aff))] + else: + raise ValueError("invalid comparison operator") + + def aff_from_expr(space, expr, vars_to_zero=None): if vars_to_zero is None: vars_to_zero = frozenset() @@ -1546,6 +1606,15 @@ def simplify_using_aff(kernel, expr): # }}} +# {{{ expression/set <-> constraints conversion + +def constraints_from_expr(space, expr): + with isl.SuppressedWarnings(space.get_ctx()): + return ConditionalMapper(space, vars_to_zero=[None])(expr) + +# }}} + + # {{{ qpolynomial_to_expr def _term_to_expr(space, term): diff --git a/loopy/transform/data.py b/loopy/transform/data.py index a50725d20d579109f6e061fba0a1f408a6e23e93..e876c3aa4fdf5cf67d4560634331cfb7c299ede5 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -146,6 +146,7 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, temporary_address_space=None, temporary_scope=None, footprint_subscripts=None, fetch_bounding_box=False, + stream_iname=None, # if not None, use streaming prefetch in precompute fetch_outer_inames=None): """Prefetch all accesses to the variable *var_name*, with all accesses being swept through *sweep_inames*. 
@@ -332,7 +333,8 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, temporary_name=temporary_name, temporary_address_space=temporary_address_space, temporary_scope=temporary_scope, - precompute_outer_inames=fetch_outer_inames) + precompute_outer_inames=fetch_outer_inames, + stream_iname=stream_iname) # {{{ remove inames that were temporarily added by slice sweeps diff --git a/loopy/transform/precompute.py b/loopy/transform/precompute.py index cefed807d73bd0a9064c170190a3ba19b2d5abf6..2943368cecb8278c1f58bf99eef5b1017a627a13 100644 --- a/loopy/transform/precompute.py +++ b/loopy/transform/precompute.py @@ -267,6 +267,7 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, fetch_bounding_box=False, temporary_address_space=None, compute_insn_id=None, + stream_iname=None, # if not None, use streaming prefetch in precompute **kwargs): """Precompute the expression described in the substitution rule determined by *subst_use* and store it in a temporary array. 
A precomputation needs two @@ -835,31 +836,228 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, # }}} - from loopy.kernel.data import Assignment - if compute_insn_id is None: - compute_insn_id = kernel.make_unique_instruction_id(based_on=c_subst_name) - - compute_insn = Assignment( - id=compute_insn_id, - assignee=assignee, - expression=compute_expression, - # within_inames determined below - ) - compute_dep_id = compute_insn_id - added_compute_insns = [compute_insn] - - if temporary_address_space == AddressSpace.GLOBAL: - barrier_insn_id = kernel.make_unique_instruction_id( - based_on=c_subst_name+"_barrier") + if stream_iname is not None: + from pymbolic import parse + stream_var = parse(stream_iname) + + # {{{ some utility functions + + def increment(expr, iname): + from pymbolic import parse, substitute + if isinstance(iname, str): + iname = parse(iname) + return substitute(expr, {iname: iname + 1}) + + def decrement(expr, iname): + from pymbolic import parse, substitute + if isinstance(iname, str): + iname = parse(iname) + return substitute(expr, {iname: iname - 1}) + + def rename(set, old, new): + var_dict = set.get_var_dict() + dt, dim_idx = var_dict[old] + return set.set_dim_name(dt, dim_idx, new) + + def add_iname(set, inames): + from pymbolic.primitives import Variable + if isinstance(inames, Variable): + inames = [inames.name] + elif isinstance(inames, str): + inames = [inames] + for iname in inames: + set = set.add_dims(isl.dim_type.out, + 1).set_dim_name(isl.dim_type.out, set.n_dim(), iname) + return set + + # }}} + + # primed storage inames + non1_pstorage_axis_names = [name+"'" for name in non1_storage_axis_names] + + # append "_g" to storage inames for corresponding global iname + global_storage_axis_dict = {} + for iname in non1_storage_axis_names: + global_storage_axis_dict[iname] = iname+"_g" + global_storage_axis_names = list(global_storage_axis_dict.values()) + + domain = kernel.domains[0] # ??? 
what should I do about this indexing + + storage_axis_subst_dict_1 = storage_axis_subst_dict + + storage_axis_subst_dict_0 = {} + for iname in storage_axis_subst_dict.keys(): + storage_axis_subst_dict_0[iname] = \ + decrement(storage_axis_subst_dict[iname], stream_var) + + domain = add_iname(domain, global_storage_axis_names) + + from loopy.symbolic import aff_from_expr + + # these were removed from loopy.symbolic + def eq_constraint_from_expr(space, expr): + return isl.Constraint.equality_from_aff(aff_from_expr(space, expr)) + + def ineq_constraint_from_expr(space, expr): + return isl.Constraint.inequality_from_aff(aff_from_expr(space, expr)) + + # constraints on relationship between storage, etc. inames and global inames + constraints_0 \ + = [eq_constraint_from_expr(domain.space, + parse(global_storage_axis_dict[si]) + - storage_axis_subst_dict_0[si]) for si in non1_storage_axis_names] + constraints_1 \ + = [eq_constraint_from_expr(domain.space, + parse(global_storage_axis_dict[si]) + - storage_axis_subst_dict_1[si]) for si in non1_storage_axis_names] + + domain_0 = domain.add_constraints(constraints_0) + domain_1 = domain.add_constraints(constraints_1) + + for si, psi in zip(non1_storage_axis_names, non1_pstorage_axis_names): + domain_1 = add_iname(domain_1, psi) + domain_0 = rename(domain_0, si, psi) + domain_0 = add_iname(domain_0, si) + domain_0, domain_1 = isl.align_two(domain_0, domain_1) + + overlap = domain_0 & domain_1 + + # ??? better way to ensure stream_iname is not on first iteration? 
+ # e.g., this (and other code) assumes stream_iname increments by positive 1 + dt, dim_idx = domain.get_var_dict()[stream_iname] + from loopy.symbolic import pw_aff_to_expr + stream_min = pw_aff_to_expr(domain.dim_min(dim_idx)) + overlap = overlap.add_constraint( + ineq_constraint_from_expr(overlap.space, stream_var - stream_min - 1)) + + from loopy.symbolic import basic_set_to_cond_expr + in_overlap = basic_set_to_cond_expr( + overlap.project_out_except( + non1_storage_axis_names+[stream_iname], [isl.dim_type.out])) + + fetch_var = var(temporary_name) + + stream_assignee = fetch_var[tuple(var(iname) + for iname in non1_storage_axis_names)] + + # {{{ obtain global indexing from constraints + + # ??? better way? + stream_replace_rules = overlap.project_out_except( + non1_storage_axis_names+non1_pstorage_axis_names, + [isl.dim_type.out]) + from loopy.symbolic import aff_to_expr + cns_exprs = [aff_to_expr(cns.get_aff()) + for cns in stream_replace_rules.get_constraints()] + + stream_subst_dict = {} + from pymbolic.algorithm import solve_affine_equations_for + stream_subst_dict = solve_affine_equations_for(non1_pstorage_axis_names, + [(0, cns) for cns in cns_exprs]) + + # }}} + + # {{{ create instructions + + from pymbolic import parse + fetch_var = parse(temporary_name) + stream_temp_expression = fetch_var[tuple([stream_subst_dict[Variable(psname)] + for psname in non1_pstorage_axis_names])] + + stream_fetch_expression = compute_expression + + var_name_gen = kernel.get_var_name_generator() + # ??? probably want to generate temporary name with var_name_gen? 
+ stream_temp_assignee = temporary_name+"_temp" + + copy_temp_insn_id = temporary_name+"_copy_temp_rule" + stream_temp_insn_id = temporary_name+"_stream_temp_rule" + fetch_temp_insn_id = temporary_name+"_fetch_temp_rule" + + from loopy.kernel.data import Assignment + stream_temp_insn = Assignment( + id=stream_temp_insn_id, + assignee=stream_temp_assignee, + expression=stream_temp_expression, + predicates=[in_overlap], + depends_on_is_final=True + # within_inames determined below + ) + + fetch_temp_insn = Assignment( + id=fetch_temp_insn_id, + assignee=stream_temp_assignee, + expression=stream_fetch_expression, + predicates=[in_overlap.not_()], + depends_on_is_final=True + # within_inames determined below + ) + + stream_barrier_insn_id = kernel.make_unique_instruction_id( + based_on=stream_temp_insn_id+"_barrier") from loopy.kernel.instruction import BarrierInstruction - barrier_insn = BarrierInstruction( - id=barrier_insn_id, - depends_on=frozenset([compute_insn_id]), - synchronization_kind="global", - mem_kind="global") - compute_dep_id = barrier_insn_id + stream_barrier_insn = BarrierInstruction( + id=stream_barrier_insn_id, + depends_on=frozenset([stream_temp_insn_id, fetch_temp_insn_id]), + depends_on_is_final=True, + synchronization_kind="local", + mem_kind="local") + + copy_temp_insn = Assignment( + id=copy_temp_insn_id, + assignee=stream_assignee, + expression=stream_temp_assignee, + depends_on_is_final=True, + depends_on=frozenset([stream_barrier_insn_id]) + # within_inames determined below + ) + + copy_barrier_insn_id = kernel.make_unique_instruction_id( + based_on=copy_temp_insn_id+"_barrier") + from loopy.kernel.instruction import BarrierInstruction + copy_barrier_insn = BarrierInstruction( + id=copy_barrier_insn_id, + depends_on=frozenset([copy_temp_insn_id]), + depends_on_is_final=True, + synchronization_kind="local", + mem_kind="local") + + added_compute_insns = [fetch_temp_insn, stream_temp_insn, + stream_barrier_insn, copy_temp_insn, 
copy_barrier_insn] - added_compute_insns.append(barrier_insn) + # }}} + + # ??? this is for compatibility with old (shared) precompute code + compute_insn_id = copy_barrier_insn_id + compute_dep_id = copy_barrier_insn_id + compute_insn = copy_barrier_insn + else: + from loopy.kernel.data import Assignment + if compute_insn_id is None: + compute_insn_id = kernel.make_unique_instruction_id( + based_on=c_subst_name) + + compute_insn = Assignment( + id=compute_insn_id, + assignee=assignee, + expression=compute_expression, + # within_inames determined below + ) + compute_dep_id = compute_insn_id + added_compute_insns = [compute_insn] + + if temporary_address_space == AddressSpace.GLOBAL: + barrier_insn_id = kernel.make_unique_instruction_id( + based_on=c_subst_name+"_barrier") + from loopy.kernel.instruction import BarrierInstruction + barrier_insn = BarrierInstruction( + id=barrier_insn_id, + depends_on=frozenset([compute_insn_id]), + synchronization_kind="global", + mem_kind="global") + compute_dep_id = barrier_insn_id + + added_compute_insns.append(barrier_insn) # }}} @@ -878,19 +1076,21 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, kernel = invr.map_kernel(kernel) kernel = kernel.copy( - instructions=added_compute_insns + kernel.instructions) + instructions=added_compute_insns + kernel.instructions) kernel = rule_mapping_context.finish_kernel(kernel) # }}} # {{{ add dependencies to compute insn - kernel = kernel.copy( - instructions=[ - insn.copy(depends_on=frozenset(invr.compute_insn_depends_on)) - if insn.id == compute_insn_id - else insn - for insn in kernel.instructions]) + # ??? 
removed this from streaming case for now - it adds a redundant barrier + if stream_iname is None: + kernel = kernel.copy( + instructions=[ + insn.copy(depends_on=frozenset(invr.compute_insn_depends_on)) + if insn.id == compute_insn_id + else insn + for insn in kernel.instructions]) # }}} @@ -944,7 +1144,9 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, kernel = kernel.copy( instructions=[ insn.copy(within_inames=precompute_outer_inames) - if insn.id == compute_insn_id + # ??? replaced the below - should work for normal precompute? + # if insn.id == compute_insn_id + if insn.id in [a.id for a in added_compute_insns] else insn for insn in kernel.instructions]) @@ -1025,6 +1227,17 @@ def precompute(kernel, subst_use, sweep_inames=[], within=None, new_temporary_variables[temporary_name] = temp_var + if stream_iname is not None: + temp_name = temporary_name+"_temp" + temp_fetch_variable = lp.TemporaryVariable( + name=temp_name, + dtype=dtype, + base_indices=()*len(new_temp_shape), + shape=tuple(), + address_space=lp.AddressSpace.PRIVATE) + + new_temporary_variables[temp_name] = temp_fetch_variable + kernel = kernel.copy( temporary_variables=new_temporary_variables) diff --git a/test/test_apps.py b/test/test_apps.py index 56f4127ac6be827afda8bd41b6e87ee6d5e774dc..290900d1af82756168ed3fbccf842ecdeccd190b 100644 --- a/test/test_apps.py +++ b/test/test_apps.py @@ -107,10 +107,24 @@ def test_convolution(ctx_factory): default_tag="l.auto") return knl + def variant_3(knl): + knl = lp.split_iname(knl, "im_x", 16, inner_tag="l.0") + knl = lp.split_iname(knl, "im_y", 16, inner_tag="l.1") + knl = lp.tag_inames(knl, dict(iimg="g.0")) + knl = lp.add_prefetch(knl, "f[ifeat,:,:,:]", default_tag="l.auto") + knl = lp.add_prefetch(knl, "img", "im_x_inner, im_y_inner, f_x, f_y, icolor", + stream_iname="im_x_outer", + default_tag=None, + fetch_outer_inames="im_x_outer,im_y_outer,iimg,ifeat") + knl = lp.tag_inames(knl, dict(img_dim_1="l.0", img_dim_2="l.1")) + 
knl.silenced_warnings = ["single_writer_after_creation"] + return knl + for variant in [ #variant_0, #variant_1, - variant_2 + variant_2, + variant_3 ]: lp.auto_test_vs_ref(ref_knl, ctx, variant(knl), parameters=dict( @@ -337,11 +351,10 @@ def test_stencil(ctx_factory): " + a_offset(i,j+1)" " + a_offset(i-1,j)" " + a_offset(i+1,j)" - ], - [ - lp.GlobalArg("a", np.float32, shape=(n+2, n+2,)), - lp.GlobalArg("z", np.float32, shape=(n+2, n+2,)) - ]) + ] + ) + + knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32)) ref_knl = knl @@ -360,9 +373,35 @@ def test_stencil(ctx_factory): knl = lp.prioritize_loops(knl, ["a_dim_0_outer", "a_dim_1_outer"]) return knl + # streaming, block covers output footprint + # default_tag="l.auto" gets rid of iname a_dim_1 + def variant_3(knl): + knl.silenced_warnings = ["single_writer_after_creation"] + knl = lp.split_iname(knl, "i", 4, outer_tag=None, inner_tag=None) + knl = lp.split_iname(knl, "j", 64, outer_tag="g.0", inner_tag="l.0") + knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"], + fetch_bounding_box=True, default_tag="l.auto", + stream_iname="i_outer") + knl = lp.tag_inames(knl, dict(a_dim_0="l.1", i_inner="l.1")) + return knl + + # streaming, block covers input footprint (i.e., includes halos) + def variant_4(knl): + knl.silenced_warnings = ["single_writer_after_creation"] + knl = lp.split_iname(knl, "i", 4, inner_tag=None) + knl = lp.split_iname(knl, "j", 64, inner_tag="l.0") + knl = lp.add_prefetch(knl, "a", ["i_inner", "j_inner"], + fetch_bounding_box=True, default_tag=None, + stream_iname="i_outer") + knl = lp.tag_inames(knl, dict(a_dim_0="l.1", i_inner="l.1", a_dim_1="l.0")) + knl = lp.tag_inames(knl, dict(i_outer=None, j_outer="g.0")) + return knl + for variant in [ #variant_1, variant_2, + variant_3, + variant_4, ]: lp.auto_test_vs_ref(ref_knl, ctx, variant(knl), print_ref_code=False, diff --git a/test/test_loopy.py b/test/test_loopy.py index 
2ac08f8c4893cb7a4dd48780a6bf450254b4fdfc..9ccb471a3d01983d992a96cb86d5b3fee1fe2773 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2742,6 +2742,107 @@ def test_dep_cycle_printing_and_error(): print(lp.generate_code(knl)[0]) +@pytest.mark.parametrize("op", [">", ">=", "<", "<=", "==", "!="]) +def test_conditional_access_range(ctx_factory, op): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + def get_condition(): + if op == ">": + return "not (i > 7)" + elif op == ">=": + return "not (i >= 8)" + elif op == "<": + return "i < 8" + elif op == "<=": + return "i <=7" + elif op == "==": + return " or ".join(["i == {}".format(i) for i in range(8)]) + elif op == "!=": + return " and ".join(["i != {}".format(i) for i in range(8, 10)]) + + condition = get_condition() + knl = lp.make_kernel( + "{[i]: 0 <= i < 10}", + """ + if {condition} + tmp[i] = tmp[i] + 1 + end + """.format(condition=condition), + [lp.GlobalArg("tmp", shape=(8,), dtype=np.int64)]) + + assert np.array_equal(knl(queue, tmp=np.arange(8))[1][0], np.arange(1, 9)) + + +def test_conditional_access_range_with_parameters(ctx_factory): + ctx = ctx_factory() + queue = cl.CommandQueue(ctx) + + # test that conditional on parameter works, otherwise the tmp[j, i] will show + # as OOB + knl = lp.make_kernel( + ["{[i]: 0 <= i < 10}", + "{[j]: 0 <= j < problem_size + 2}"], + """ + if i < 8 and j < problem_size + tmp[j, i] = tmp[j, i] + 1 + end + """, + [lp.GlobalArg("tmp", shape=("problem_size", 8,), dtype=np.int64), + lp.ValueArg("problem_size", dtype=np.int64)]) + + assert np.array_equal(knl(queue, tmp=np.arange(80).reshape((10, 8)), + problem_size=10)[1][0], np.arange(1, 81).reshape( + (10, 8))) + + # test a conditional that's only _half_ data-dependent to ensure the other + # half works + knl = lp.make_kernel( + ["{[i]: 0 <= i < 10}", + "{[j]: 0 <= j < problem_size}"], + """ + if i < 8 and (j + offset) < problem_size + tmp[j, i] = tmp[j, i] + 1 + end + """, + [lp.GlobalArg("tmp", 
shape=("problem_size", 8,), dtype=np.int64), + lp.ValueArg("problem_size", dtype=np.int64), + lp.ValueArg("offset", dtype=np.int64)]) + + assert np.array_equal(knl(queue, tmp=np.arange(80).reshape((10, 8)), + problem_size=10, + offset=0)[1][0], np.arange(1, 81).reshape( + (10, 8))) + + +def test_conditional_access_range_failure(ctx_factory): + # predicate doesn't actually limit access_range + knl = lp.make_kernel( + "{[i,j]: 0 <= i,j < 10}", + """ + if j < 8 + tmp[i] = tmp[i] + end + """, [lp.GlobalArg("tmp", shape=(8,), dtype=np.int32)]) + + from loopy.diagnostic import LoopyError + with pytest.raises(LoopyError): + lp.generate_code_v2(knl).device_code() + + # predicate non affine + knl = lp.make_kernel( + "{[i,j]: 0 <= i,j < 10}", + """ + if (i+3)*i < 15 + tmp[i] = tmp[i] + end + """, [lp.GlobalArg("tmp", shape=(2,), dtype=np.int32)]) + + from loopy.diagnostic import LoopyError + with pytest.raises(LoopyError): + lp.generate_code_v2(knl).device_code() + + def test_backwards_dep_printing_and_error(): knl = lp.make_kernel( "{[i]: 0<=i