From 253116d6c9a83f0168b88760321382a4dd88ec9e Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 20 Dec 2019 17:07:25 -0600 Subject: [PATCH 1/9] improvements to remove_instructions --- loopy/transform/instruction.py | 26 +++++++++++++++++++------- 1 file changed, 19 insertions(+), 7 deletions(-) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index e6ecb4093..da927d950 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -126,16 +126,24 @@ def add_dependency(kernel, insn_match, depends_on): def remove_instructions(kernel, insn_ids): """Return a new kernel with instructions in *insn_ids* removed. - Dependencies across (one, for now) deleted isntructions are propagated. - Behavior is undefined for now for chains of dependencies within the - set of deleted instructions. + Dependencies across deleted instructions are recursively propagated. This also updates *no_sync_with* for all instructions. + + :arg insn_ids: An instance of :class:`set` or :class:`str` as + understood by :func:`loopy.match.parse_match`.
""" if not insn_ids: return kernel + if isinstance(insn_ids, str): + from loopy.match import parse_match + within = parse_match(insn_ids) + + insn_ids = set([insn.id for insn in kernel.instructions if + within(kernel, insn)]) + assert isinstance(insn_ids, set) id_to_insn = kernel.id_to_insn @@ -151,10 +159,14 @@ def remove_instructions(kernel, insn_ids): else: depends_on = insn.depends_on - new_deps = depends_on - insn_ids + while depends_on & insn_ids: + new_deps = depends_on - insn_ids + for dep_id in depends_on & insn_ids: + new_deps = new_deps | id_to_insn[dep_id].depends_on + + depends_on = new_deps.copy() - for dep_id in depends_on & insn_ids: - new_deps = new_deps | id_to_insn[dep_id].depends_on + assert not (depends_on & insn_ids) # update no_sync_with @@ -163,7 +175,7 @@ def remove_instructions(kernel, insn_ids): if insn_id not in insn_ids) new_insns.append( - insn.copy(depends_on=new_deps, no_sync_with=new_no_sync_with)) + insn.copy(depends_on=depends_on, no_sync_with=new_no_sync_with)) return kernel.copy( instructions=new_insns) -- GitLab From 6f2f469f6e33a84a7be771cf5c94e6e3b5ba3c9b Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 20 Dec 2019 17:21:02 -0600 Subject: [PATCH 2/9] test remove instructions --- test/test_transform.py | 16 ++++++++++++++++ 1 file changed, 16 insertions(+) diff --git a/test/test_transform.py b/test/test_transform.py index cdc0c14b8..dc854fb0b 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -570,6 +570,22 @@ def test_nested_substs_in_insns(ctx_factory): lp.auto_test_vs_ref(ref_knl, ctx, knl) +def test_remove_instructions_with_recursive_deps(): + knl = lp.make_kernel( + "{[i]: 0<=i<10}", + """ + y[i] = 0 {id=insn0} + a[i] = 2*b[i] {id=insn1} + c[i] = 2*b[i] {id=insn2} + y[i] = y[i] + x[i] {id=insn3} + """, seq_dependencies=True) + + knl = lp.remove_instructions(knl, set(['insn1', 'insn2'])) + + assert knl.id_to_insn['insn3'].depends_on == frozenset(['insn0']) + assert 
knl.id_to_insn['insn0'].depends_on == frozenset() + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab From 6c1409c092ced4687f7a30618216f9f1204f0dc1 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 20 Dec 2019 17:21:58 -0600 Subject: [PATCH 3/9] correct the function qualifier for Cuda target device functions --- loopy/target/c/__init__.py | 24 ++++++++++++++---------- loopy/target/cuda.py | 3 +++ 2 files changed, 17 insertions(+), 10 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 6e3602eda..cb9465a9a 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -82,7 +82,7 @@ def c99_preamble_generator(preamble_info): yield("10_stdint", "#include <stdint.h>") -def _preamble_generator(preamble_info): +def _preamble_generator(preamble_info, func_qualifier='inline'): integer_type_names = ["int8", "int16", "int32", "int64"] def_integer_types_macro = ("03_def_integer_types", r""" @@ -100,7 +100,7 @@ def _preamble_generator(preamble_info): function_defs = { "loopy_floor_div": r""" #define LOOPY_DEFINE_FLOOR_DIV(SUFFIX, TYPE) \ - inline TYPE loopy_floor_div_##SUFFIX(TYPE a, TYPE b) \ + %s TYPE loopy_floor_div_##SUFFIX(TYPE a, TYPE b) \ { \ if ((a<0) != (b<0)) \ a = a - (b + (b<0) - (b>=0)); \ @@ -108,11 +108,11 @@ def _preamble_generator(preamble_info): } LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV) #undef LOOPY_DEFINE_FLOOR_DIV - """, + """ % func_qualifier, "loopy_floor_div_pos_b": r""" #define LOOPY_DEFINE_FLOOR_DIV_POS_B(SUFFIX, TYPE) \ - inline TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \ + %s TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \ { \ if (a<0) \ a = a - (b-1); \ @@ -120,11 +120,11 @@ def _preamble_generator(preamble_info): } LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV_POS_B) #undef LOOPY_DEFINE_FLOOR_DIV_POS_B - """, + """ % func_qualifier, "loopy_mod": r""" #define LOOPY_DEFINE_MOD(SUFFIX, TYPE) \ - inline TYPE loopy_mod_##SUFFIX(TYPE a, TYPE
b) \ + %s TYPE loopy_mod_##SUFFIX(TYPE a, TYPE b) \ { \ TYPE result = a%b; \ if (result < 0 && b > 0) \ @@ -135,11 +135,11 @@ def _preamble_generator(preamble_info): } LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_MOD) #undef LOOPY_DEFINE_MOD - """, + """ % func_qualifier, "loopy_mod_pos_b": r""" #define LOOPY_DEFINE_MOD_POS_B(SUFFIX, TYPE) \ - inline TYPE loopy_mod_pos_b_##SUFFIX(TYPE a, TYPE b) \ + %s TYPE loopy_mod_pos_b_##SUFFIX(TYPE a, TYPE b) \ { \ TYPE result = a%b; \ if (result < 0) \ @@ -148,7 +148,7 @@ def _preamble_generator(preamble_info): } LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_MOD_POS_B) #undef LOOPY_DEFINE_MOD_POS_B - """, + """ % func_qualifier, } c_funcs = set(func.c_name for func in preamble_info.seen_functions) @@ -469,6 +469,9 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): class CFamilyASTBuilder(ASTBuilderBase): + + function_qualifier = 'inline' + # {{{ library def function_manglers(self): @@ -486,7 +489,8 @@ class CFamilyASTBuilder(ASTBuilderBase): def preamble_generators(self): return ( super(CFamilyASTBuilder, self).preamble_generators() + [ - _preamble_generator, + lambda preamble_info: _preamble_generator(preamble_info, + self.function_qualifier), ]) # }}} diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 50fd1026f..72952f7d9 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -217,6 +217,9 @@ class CudaTarget(CFamilyTarget): # {{{ ast builder class CUDACASTBuilder(CFamilyASTBuilder): + + function_qualifier = 'inline __device__' + # {{{ library def function_manglers(self): -- GitLab From 22e6e2b7653f235ae134c0092d25d746be964416 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 20 Dec 2019 17:45:10 -0600 Subject: [PATCH 4/9] adds missing options in add_prefetch --- loopy/transform/data.py | 15 +++++++++++++-- 1 file changed, 13 insertions(+), 2 deletions(-) diff --git a/loopy/transform/data.py b/loopy/transform/data.py index a6a2d7b4f..da2097aa4 100644 --- a/loopy/transform/data.py +++ 
b/loopy/transform/data.py @@ -150,7 +150,9 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, temporary_address_space=None, temporary_scope=None, footprint_subscripts=None, fetch_bounding_box=False, - fetch_outer_inames=None): + fetch_outer_inames=None, + prefetch_insn_id=None, + within=None): """Prefetch all accesses to the variable *var_name*, with all accesses being swept through *sweep_inames*. @@ -237,6 +239,13 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, :arg fetch_outer_inames: The inames within which the fetch instruction is nested. If *None*, make an educated guess. + :arg prefetch_insn_id: The ID of the instruction generated to perform the + prefetch. + + :arg within: a stack match as understood by + :func:`loopy.match.parse_stack_match` to select the instructions where + *var_name* is to be prefetched. + This function internally uses :func:`extract_subst` and :func:`precompute`. """ @@ -336,7 +345,9 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, temporary_name=temporary_name, temporary_address_space=temporary_address_space, temporary_scope=temporary_scope, - precompute_outer_inames=fetch_outer_inames) + precompute_outer_inames=fetch_outer_inames, + compute_insn_id=prefetch_insn_id, + within=within) # {{{ remove inames that were temporarily added by slice sweeps -- GitLab From a914f8ec85265ed202c564ad3105a4a854bab7d7 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Fri, 20 Dec 2019 18:02:09 -0600 Subject: [PATCH 5/9] amends string formatting --- loopy/target/c/__init__.py | 32 ++++++++++++++++---------------- 1 file changed, 16 insertions(+), 16 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index cb9465a9a..2b1c7bc03 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -100,55 +100,55 @@ def _preamble_generator(preamble_info, func_qualifier='inline'): function_defs = { "loopy_floor_div": r""" #define
LOOPY_DEFINE_FLOOR_DIV(SUFFIX, TYPE) \ - %s TYPE loopy_floor_div_##SUFFIX(TYPE a, TYPE b) \ - { \ + {} TYPE loopy_floor_div_##SUFFIX(TYPE a, TYPE b) \ + {{ \ if ((a<0) != (b<0)) \ a = a - (b + (b<0) - (b>=0)); \ return a/b; \ - } + }} LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV) #undef LOOPY_DEFINE_FLOOR_DIV - """ % func_qualifier, + """.format(func_qualifier), "loopy_floor_div_pos_b": r""" #define LOOPY_DEFINE_FLOOR_DIV_POS_B(SUFFIX, TYPE) \ - %s TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \ - { \ + {} TYPE loopy_floor_div_pos_b_##SUFFIX(TYPE a, TYPE b) \ + {{ \ if (a<0) \ a = a - (b-1); \ return a/b; \ - } + }} LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_FLOOR_DIV_POS_B) #undef LOOPY_DEFINE_FLOOR_DIV_POS_B - """ % func_qualifier, + """.format(func_qualifier), "loopy_mod": r""" #define LOOPY_DEFINE_MOD(SUFFIX, TYPE) \ - %s TYPE loopy_mod_##SUFFIX(TYPE a, TYPE b) \ - { \ + {} TYPE loopy_mod_##SUFFIX(TYPE a, TYPE b) \ + {{ \ TYPE result = a%b; \ if (result < 0 && b > 0) \ result += b; \ if (result > 0 && b < 0) \ result = result + b; \ return result; \ - } + }} LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_MOD) #undef LOOPY_DEFINE_MOD - """ % func_qualifier, + """ .format(func_qualifier), "loopy_mod_pos_b": r""" #define LOOPY_DEFINE_MOD_POS_B(SUFFIX, TYPE) \ - %s TYPE loopy_mod_pos_b_##SUFFIX(TYPE a, TYPE b) \ - { \ + {} TYPE loopy_mod_pos_b_##SUFFIX(TYPE a, TYPE b) \ + {{ \ TYPE result = a%b; \ if (result < 0) \ result += b; \ return result; \ - } + }} LOOPY_CALL_WITH_INTEGER_TYPES(LOOPY_DEFINE_MOD_POS_B) #undef LOOPY_DEFINE_MOD_POS_B - """ % func_qualifier, + """.format(func_qualifier), } c_funcs = set(func.c_name for func in preamble_info.seen_functions) -- GitLab From be20301e3a475f656da1c3dfe83d8504e4019798 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Dec 2019 21:12:22 -0600 Subject: [PATCH 6/9] function_qualifier->preamble_function_qualifier --- loopy/target/c/__init__.py | 4 ++-- loopy/target/cuda.py | 2 +- 2 files changed, 3 
insertions(+), 3 deletions(-) diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 2b1c7bc03..62ca90f88 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -470,7 +470,7 @@ def c_math_mangler(target, name, arg_dtypes, modify_name=True): class CFamilyASTBuilder(ASTBuilderBase): - function_qualifier = 'inline' + preamble_function_qualifier = 'inline' # {{{ library @@ -490,7 +490,7 @@ class CFamilyASTBuilder(ASTBuilderBase): return ( super(CFamilyASTBuilder, self).preamble_generators() + [ lambda preamble_info: _preamble_generator(preamble_info, - self.function_qualifier), + self.preamble_function_qualifier), ]) # }}} diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 72952f7d9..007871059 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -218,7 +218,7 @@ class CudaTarget(CFamilyTarget): class CUDACASTBuilder(CFamilyASTBuilder): - function_qualifier = 'inline __device__' + preamble_function_qualifier = 'inline __device__' # {{{ library -- GitLab From f5cf867dc5b13243bf57d17affe5a3654dc158a3 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Dec 2019 21:54:42 -0600 Subject: [PATCH 7/9] remove_instructions: insn_ids can be of type MatchExpressionBase --- loopy/transform/instruction.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index da927d950..809fa4f7c 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -131,15 +131,20 @@ def remove_instructions(kernel, insn_ids): This also updates *no_sync_with* for all instructions. :arg insn_ids: An instance of :class:`set` or :class:`str` as - understood by :func:`loopy.match.parse_match`. + understood by :func:`loopy.match.parse_match` or + :class:`loopy.match.MatchExpressionBase`. 
""" if not insn_ids: return kernel + from loopy.match import MatchExpressionBase + if isinstance(insn_ids, str): from loopy.match import parse_match - within = parse_match(insn_ids) + insn_ids = parse_match(insn_ids) + if isinstance(insn_ids, MatchExpressionBase): + within = insn_ids insn_ids = set([insn.id for insn in kernel.instructions if within(kernel, insn)]) -- GitLab From 948e1dc99f6c126a26afaf68ec0e402c52607bfd Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 30 Dec 2019 21:57:28 -0600 Subject: [PATCH 8/9] :zap: improves performance of remove_instructions --- loopy/transform/instruction.py | 27 +++++++++++++++++++-------- 1 file changed, 19 insertions(+), 8 deletions(-) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 809fa4f7c..7d1fae06d 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -152,6 +152,20 @@ def remove_instructions(kernel, insn_ids): assert isinstance(insn_ids, set) id_to_insn = kernel.id_to_insn + # for each insn_id to be removed get deps in terms of remaining + # insns + retargeted_deps = {} + for insn_id in insn_ids: + depends_on = id_to_insn[insn_id].depends_on + while depends_on & insn_ids: + new_deps = depends_on - insn_ids + for dep_id in depends_on & insn_ids: + new_deps = new_deps | id_to_insn[dep_id].depends_on + + depends_on = new_deps.copy() + + retargeted_deps[insn_id] = depends_on + new_insns = [] for insn in kernel.instructions: if insn.id in insn_ids: @@ -164,23 +178,20 @@ def remove_instructions(kernel, insn_ids): else: depends_on = insn.depends_on - while depends_on & insn_ids: - new_deps = depends_on - insn_ids - for dep_id in depends_on & insn_ids: - new_deps = new_deps | id_to_insn[dep_id].depends_on + new_deps = depends_on - insn_ids - depends_on = new_deps.copy() + for insn_id in (depends_on & insn_ids): + new_deps = new_deps | retargeted_deps[insn_id] - assert not (depends_on & insn_ids) + assert not (new_deps & insn_ids) # update no_sync_with 
- new_no_sync_with = frozenset((insn_id, scope) for insn_id, scope in insn.no_sync_with if insn_id not in insn_ids) new_insns.append( - insn.copy(depends_on=depends_on, no_sync_with=new_no_sync_with)) + insn.copy(depends_on=new_deps, no_sync_with=new_no_sync_with)) return kernel.copy( instructions=new_insns) -- GitLab From 91aebf5bcd9b1f32cc36b717b519458727a952a8 Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Tue, 31 Dec 2019 14:06:36 -0600 Subject: [PATCH 9/9] adds 'within' arg in extract_subst - passes prefetch's within to extract_subst - tests it --- loopy/transform/data.py | 3 ++- loopy/transform/subst.py | 17 +++++++++++++---- test/test_transform.py | 26 ++++++++++++++++++++++++++ 3 files changed, 41 insertions(+), 5 deletions(-) diff --git a/loopy/transform/data.py b/loopy/transform/data.py index da2097aa4..27c3fb185 100644 --- a/loopy/transform/data.py +++ b/loopy/transform/data.py @@ -320,7 +320,8 @@ def add_prefetch(kernel, var_name, sweep_inames=[], dim_arg_names=None, # }}} from loopy.transform.subst import extract_subst - kernel = extract_subst(kernel, rule_name, uni_template, parameters) + kernel = extract_subst(kernel, rule_name, uni_template, parameters, + within=within) if isinstance(sweep_inames, str): sweep_inames = [s.strip() for s in sweep_inames.split(",")] diff --git a/loopy/transform/subst.py b/loopy/transform/subst.py index b92698ffa..1f05d36a3 100644 --- a/loopy/transform/subst.py +++ b/loopy/transform/subst.py @@ -44,12 +44,14 @@ class ExprDescriptor(ImmutableRecord): # {{{ extract_subst -def extract_subst(kernel, subst_name, template, parameters=()): +def extract_subst(kernel, subst_name, template, parameters=(), within=None): """ :arg subst_name: The name of the substitution rule to be created. :arg template: Unification template expression. :arg parameters: An iterable of parameters used in *template*, or a comma-separated string of the same. 
+ :arg within: An instance of :class:`loopy.match.MatchExpressionBase` or + :class:`str` as understood by :func:`loopy.match.parse_match`. All targeted subexpressions must match ('unify with') *template* The template may contain '*' wildcards that will have to match exactly across all @@ -64,6 +66,9 @@ def extract_subst(kernel, subst_name, template, parameters=()): parameters = tuple( s.strip() for s in parameters.split(",")) + from loopy.match import parse_match + within = parse_match(within) + var_name_gen = kernel.get_var_name_generator() # {{{ replace any wildcards in template with new variables @@ -140,8 +145,9 @@ def extract_subst(kernel, subst_name, template, parameters=()): dfmapper = CallbackMapper(gather_exprs, WalkMapper()) for insn in kernel.instructions: - dfmapper(insn.assignees) - dfmapper(insn.expression) + if within(kernel, insn): + dfmapper(insn.assignees) + dfmapper(insn.expression) for sr in six.itervalues(kernel.substitutions): dfmapper(sr.expression) @@ -178,7 +184,10 @@ def extract_subst(kernel, subst_name, template, parameters=()): new_insns = [] for insn in kernel.instructions: - new_insns.append(insn.with_transformed_expressions(cbmapper)) + if within(kernel, insn): + new_insns.append(insn.with_transformed_expressions(cbmapper)) + else: + new_insns.append(insn) from loopy.kernel.data import SubstitutionRule new_substs = { diff --git a/test/test_transform.py b/test/test_transform.py index dc854fb0b..ad634a551 100644 --- a/test/test_transform.py +++ b/test/test_transform.py @@ -586,6 +586,32 @@ def test_remove_instructions_with_recursive_deps(): assert knl.id_to_insn['insn0'].depends_on == frozenset() +def test_prefetch_with_within(ctx_factory): + knl = lp.make_kernel( + "{[i, j, k]: 0<=i<100 and 0<=j,k<256}", + """ + f[j] = 3.14 * j {id=set_f} + ... 
gbarrier {id=insn_gbar} + y[i, k] = f[k] * x[i, k] {id=set_y} + """, [lp.GlobalArg('x', shape=lp.auto, dtype=float), '...'], + seq_dependencies=True) + + ref_knl = knl + + knl = lp.split_iname(knl, 'j', 32, inner_tag="l.0", outer_tag="g.0") + knl = lp.split_iname(knl, 'i', 32, inner_tag="l.0", outer_tag="g.0") + + knl = lp.add_prefetch(knl, 'f', prefetch_insn_id='f_prftch', within='id:set_y', + sweep_inames='k', + dim_arg_names='iprftch', + default_tag=None, + temporary_address_space=lp.AddressSpace.LOCAL) + knl = lp.add_dependency(knl, 'id:f_prftch', 'id:insn_gbar') + knl = lp.split_iname(knl, 'iprftch', 32, inner_tag="l.0") + + lp.auto_test_vs_ref(ref_knl, ctx_factory(), knl) + + if __name__ == "__main__": if len(sys.argv) > 1: exec(sys.argv[1]) -- GitLab