diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 802cc7044bf73d51567e23a6eaac791982709d51..a64f1965e4a323e868f47bacaf1cb352ba70bf77 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -97,7 +97,7 @@ def c99_preamble_generator(preamble_info): yield("10_complex", "#include ") -def _preamble_generator(preamble_info): +def _preamble_generator(preamble_info, func_qualifier='inline'): integer_type_names = ["int8", "int16", "int32", "int64"] def_integer_types_macro = ("03_def_integer_types", r""" @@ -115,55 +115,55 @@ def _preamble_generator(preamble_info): function_defs = { "loopy_floor_div": r""" #define LOOPY_DEFINE_FLOOR_DIV(SUFFIX, TYPE) \ - inline TYPE loopy_floor_div_##SUFFIX(TYPE a, TYPE b) \ - { \ + {} TYPE loopy_floor_div_##SUFFIX(TYPE a, TYPE b) \ + {{ \ if ((a<0) != (b<0)) \ a = a - (b + (b<0) - (b>=0)); \ return a/b; \ - } + }} LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV) #undef LOOPY_DEFINE_FLOOR_DIV - """, + """.format(func_qualifier), "loopy_floor_div_pos_b": r""" #define LOOPY_DEFINE_FLOOR_DIV_POS_B(SUFFIX, TYPE) \ - inline TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \ - { \ + {} TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \ + {{ \ if (a<0) \ a = a - (b-1); \ return a/b; \ - } + }} LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV_POS_B) #undef LOOPY_DEFINE_FLOOR_DIV_POS_B - """, + """.format(func_qualifier), "loopy_mod": r""" #define LOOPY_DEFINE_MOD(SUFFIX, TYPE) \ - inline TYPE loopy_mod_##SUFFIX(TYPE a, TYPE b) \ - { \ + {} TYPE loopy_mod_##SUFFIX(TYPE a, TYPE b) \ + {{ \ TYPE result = a%b; \ if (result < 0 && b > 0) \ result += b; \ if (result > 0 && b < 0) \ result = result + b; \ return result; \ - } + }} LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_MOD) #undef LOOPY_DEFINE_MOD - """, + """ .format(func_qualifier), "loopy_mod_pos_b": r""" #define LOOPY_DEFINE_MOD_POS_B(SUFFIX, TYPE) \ - inline TYPE loopy_mod_pos_b_##SUFFIX(TYPE a, TYPE b) \ - { \ + {} TYPE 
loopy_mod_pos_b_##SUFFIX(TYPE a, TYPE b) \ + {{ \ TYPE result = a%b; \ if (result < 0) \ result += b; \ return result; \ - } + }} LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_MOD_POS_B) #undef LOOPY_DEFINE_MOD_POS_B - """, + """.format(func_qualifier), } c_funcs = set(func.c_name for func in preamble_info.seen_functions) @@ -483,6 +483,9 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): class CFamilyASTBuilder(ASTBuilderBase): + + preamble_function_qualifier = 'inline' + # {{{ library def function_manglers(self): @@ -500,7 +503,8 @@ class CFamilyASTBuilder(ASTBuilderBase): def preamble_generators(self): return ( super(CFamilyASTBuilder, self).preamble_generators() + [ - _preamble_generator, + lambda preamble_info: _preamble_generator(preamble_info, + self.preamble_function_qualifier), ]) # }}} diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 50fd1026f7bd15ce72915d0d5d5e60f6da4e264c..0078710593375241e1da166beadf71aa4ef41afc 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -217,6 +217,9 @@ class CudaTarget(CFamilyTarget): # {{{ ast builder class CUDACASTBuilder(CFamilyASTBuilder): + + preamble_function_qualifier = 'inline __device__' + # {{{ library def function_manglers(self): diff --git a/loopy/transform/data.py b/loopy/transform/data.py index 5356d49038a142945c781e58943eb86492d12b3f..d082d0bb27e01b6eb629e1677edf3315518228d4 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -150,7 +150,9 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, temporary_address_space=None, temporary_scope=None, footprint_subscripts=None, fetch_bounding_box=False, - fetch_outer_inames=None): + fetch_outer_inames=None, + prefetch_insn_id=None, + within=None): """Prefetch all accesses to the variable *var_name*, with all accesses being swept through *sweep_inames*. 
@@ -237,6 +239,13 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, :arg fetch_outer_inames: The inames within which the fetch instruction is nested. If *None*, make an educated guess. + :arg prefetch_insn_id: The ID of the instruction generated to perform the + prefetch. + + :arg within: a stack match as understood by + :func:`loopy.match.parse_stack_match` to select the instructions where + *var_name* is to be prefetched. + This function internally uses :func:`extract_subst` and :func:`precompute`. """ @@ -311,7 +320,8 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, # }}} from loopy.transform.subst import extract_subst - kernel = extract_subst(kernel, rule_name, uni_template, parameters) + kernel = extract_subst(kernel, rule_name, uni_template, parameters, + within=within) if isinstance(sweep_inames, str): sweep_inames = [s.strip() for s in sweep_inames.split(",")] @@ -336,7 +346,9 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, temporary_name=temporary_name, temporary_address_space=temporary_address_space, temporary_scope=temporary_scope, - precompute_outer_inames=fetch_outer_inames) + precompute_outer_inames=fetch_outer_inames, + compute_insn_id=prefetch_insn_id, + within=within) # {{{ remove inames that were temporarily added by slice sweeps diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index e6ecb4093ad24ceafe521c5379f4d2cd96ea6f52..7d1fae06d9fbfe6c4765322e548c8f4018fe1710 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -126,19 +126,46 @@ def add_dependency(kernel, insn_match, depends_on): def remove_instructions(kernel, insn_ids): """Return a new kernel with instructions in *insn_ids* removed. - Dependencies across (one, for now) deleted isntructions are propagated. - Behavior is undefined for now for chains of dependencies within the - set of deleted instructions.
+ Dependencies across deleted instructions are recursively propagated. This also updates *no_sync_with* for all instructions. + + :arg insn_ids: An instance of :class:`set` or :class:`str` as + understood by :func:`loopy.match.parse_match` or + :class:`loopy.match.MatchExpressionBase`. """ if not insn_ids: return kernel + from loopy.match import MatchExpressionBase + + if isinstance(insn_ids, str): + from loopy.match import parse_match + insn_ids = parse_match(insn_ids) + if isinstance(insn_ids, MatchExpressionBase): + within = insn_ids + + insn_ids = set([insn.id for insn in kernel.instructions if + within(kernel, insn)]) + assert isinstance(insn_ids, set) id_to_insn = kernel.id_to_insn + # for each insn_id to be removed get deps in terms of remaining + # insns + retargeted_deps = {} + for insn_id in insn_ids: + depends_on = id_to_insn[insn_id].depends_on + while depends_on & insn_ids: + new_deps = depends_on - insn_ids + for dep_id in depends_on & insn_ids: + new_deps = new_deps | id_to_insn[dep_id].depends_on + + depends_on = new_deps.copy() + + retargeted_deps[insn_id] = depends_on + new_insns = [] for insn in kernel.instructions: if insn.id in insn_ids: @@ -153,11 +180,12 @@ def remove_instructions(kernel, insn_ids): new_deps = depends_on - insn_ids - for dep_id in depends_on & insn_ids: - new_deps = new_deps | id_to_insn[dep_id].depends_on + for insn_id in (depends_on & insn_ids): + new_deps = new_deps | retargeted_deps[insn_id] - # update no_sync_with + assert not (new_deps & insn_ids) + # update no_sync_with new_no_sync_with = frozenset((insn_id, scope) for insn_id, scope in insn.no_sync_with if insn_id not in insn_ids) diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index 717a051930e938457dae0ee4441325b3e631d2d9..943b34ea4fb8aa5881b21cfacf6a879823a79fd5 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -43,12 +43,14 @@ class ExprDescriptor(ImmutableRecord): # {{{ extract_subst -def extract_subst(kernel, subst_name,
template, parameters=()): +def extract_subst(kernel, subst_name, template, parameters=(), within=None): """ :arg subst_name: The name of the substitution rule to be created. :arg template: Unification template expression. :arg parameters: An iterable of parameters used in *template*, or a comma-separated string of the same. + :arg within: An instance of :class:`loopy.match.MatchExpressionBase` or + :class:`str` as understood by :func:`loopy.match.parse_match`. All targeted subexpressions must match ('unify with') *template* The template may contain '*' wildcards that will have to match exactly across all @@ -63,6 +65,9 @@ def extract_subst(kernel, subst_name, template, parameters=()): parameters = tuple( s.strip() for s in parameters.split(",")) + from loopy.match import parse_match + within = parse_match(within) + var_name_gen = kernel.get_var_name_generator() # {{{ replace any wildcards in template with new variables @@ -112,8 +117,9 @@ def extract_subst(kernel, subst_name, template, parameters=()): dfmapper = CallbackMapper(gather_exprs, WalkMapper()) for insn in kernel.instructions: - dfmapper(insn.assignees) - dfmapper(insn.expression) + if within(kernel, insn): + dfmapper(insn.assignees) + dfmapper(insn.expression) for sr in six.itervalues(kernel.substitutions): dfmapper(sr.expression) @@ -170,9 +176,12 @@ def extract_subst(kernel, subst_name, template, parameters=()): raise ValueError("assignment LHS not understood") for insn in kernel.instructions: - new_insns.append( - insn.with_transformed_expressions( - cbmapper, assignee_f=transform_assignee)) + if within(kernel, insn): + new_insns.append( + insn.with_transformed_expressions( + cbmapper, assignee_f=transform_assignee)) + else: + new_insns.append(insn) from loopy.kernel.data import SubstitutionRule new_substs = { diff --git a/test/test_transform.py b/test/test_transform.py index e4ca2af0d657cea7769c9a573c14a79e8c197132..6ff3a120774c1e7e19ed9da241ff03fdea4bbb4b 100644 --- a/test/test_transform.py +++ 
b/test/test_transform.py @@ -570,6 +570,48 @@ def test_nested_substs_in_insns(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl) +def test_remove_instructions_with_recursive_deps(): + knl = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[i] = 0 {id=insn0} + a[i] = 2*b[i] {id=insn1} + c[i] = 2*b[i] {id=insn2} + y[i] = y[i] + x[i] {id=insn3} + """, seq_dependencies=True) + + knl = lp.remove_instructions(knl, set(['insn1', 'insn2'])) + + assert knl.id_to_insn['insn3'].depends_on == frozenset(['insn0']) + assert knl.id_to_insn['insn0'].depends_on == frozenset() + + +def test_prefetch_with_within(ctx_factory): + knl = lp.make_kernel( + "{[i, j, k]: 0<=i<100 and 0<=j,k<256}", + """ + f[j] = 3.14 * j {id=set_f} + ... gbarrier {id=insn_gbar} + y[i, k] = f[k] * x[i, k] {id=set_y} + """, [lp.GlobalArg('x', shape=lp.auto, dtype=float), '...'], + seq_dependencies=True) + + ref_knl = knl + + knl = lp.split_iname(knl, 'j', 32, inner_tag="l.0", outer_tag="g.0") + knl = lp.split_iname(knl, 'i', 32, inner_tag="l.0", outer_tag="g.0") + + knl = lp.add_prefetch(knl, 'f', prefetch_insn_id='f_prftch', within='id:set_y', + sweep_inames='k', + dim_arg_names='iprftch', + default_tag=None, + temporary_address_space=lp.AddressSpace.LOCAL) + knl = lp.add_dependency(knl, 'id:f_prftch', 'id:insn_gbar') + knl = lp.split_iname(knl, 'iprftch', 32, inner_tag="l.0") + + lp.auto_test_vs_ref(ref_knl, ctx_factory(), knl) + + def test_extract_subst_with_iname_deps_in_templ(ctx_factory): knl = lp.make_kernel( "{[i, j, k]: 0<=i<100 and 0<=j,k<5}",