From 9bf5677fbb4ccd6ac2c8dafddab574ac8e090dbe Mon Sep 17 00:00:00 2001 From: Kaushik Kulkarni Date: Mon, 28 Oct 2019 04:34:00 -0500 Subject: [PATCH] minor changes in docs --- doc/tutorial.rst | 98 +++++++++++++++++++++--------------------- test/test_callables.py | 2 - 2 files changed, 49 insertions(+), 51 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index e6ef54b66..708d0520f 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -325,7 +325,7 @@ an explicit dependency: ... """ ... out[j,i] = a[i,j] {id=transpose} ... out[i,j] = 2*out[i,j] {dep=transpose} - ... """) + ... """, name="transpose_and_dbl") ``{id=transpose}`` assigns the identifier *transpose* to the first instruction, and ``{dep=transpose}`` declares a dependency of the second @@ -334,9 +334,9 @@ that these dependencies show up there, too: .. doctest:: - >>> print(knl.root_kernel.stringify(with_dependencies=True)) + >>> print(knl["transpose_and_dbl"].stringify(with_dependencies=True)) --------------------------------------------------------------------------- - KERNEL: loopy_kernel + KERNEL: transpose_and_dbl --------------------------------------------------------------------------- ... --------------------------------------------------------------------------- @@ -386,7 +386,7 @@ Let us take a look at the generated code for the above kernel: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) transpose_and_dbl(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) { for (int i = 0; i <= -1 + n; ++i) for (int j = 0; j <= -1 + n; ++j) @@ -735,7 +735,7 @@ those for us: .. doctest:: - >>> glob, loc = knl.get_grid_size_upper_bounds() + >>> glob, loc = knl["loopy_kernel"].get_grid_size_upper_bounds(knl.callables_table) >>> print(glob) (Aff("[n] -> { [(floor((127 + n)/128))] }"),) >>> print(loc) @@ -1207,8 +1207,8 @@ happens when the instruction schedule is generated. To see the schedule, we should call :func:`loopy.get_one_scheduled_kernel`: >>> prog = lp.preprocess_kernel(prog) - >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) - >>> prog = prog.with_root_kernel(knl) + >>> knl = lp.get_one_scheduled_kernel(prog["rotate_v2"], prog.callables_table) + >>> prog = prog.with_kernel(knl) >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 @@ -1239,8 +1239,8 @@ that :func:`loopy.get_one_scheduled_kernel` needs to be called one more time to put those instructions into the schedule. >>> prog = lp.save_and_reload_temporaries(prog) - >>> knl = lp.get_one_scheduled_kernel(prog.root_kernel, prog.callables_table) # Schedule added instructions - >>> prog = prog.with_root_kernel(knl) + >>> knl = lp.get_one_scheduled_kernel(prog["rotate_v2"], prog.callables_table) # Schedule added instructions + >>> prog = prog.with_kernel(knl) >>> print(prog) --------------------------------------------------------------------------- KERNEL: rotate_v2 @@ -1306,7 +1306,7 @@ Now we can execute the kernel. >>> arr = cl.array.arange(queue, 16, dtype=np.int32) >>> print(arr) [ 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15] - >>> evt, (out,) = knl(queue, arr=arr) + >>> evt, (out,) = prog(queue, arr=arr) >>> print(arr) [15 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14] @@ -1543,7 +1543,7 @@ containing different types of data: ... """ ... c[i, j, k] = a[i,j,k]*b[i,j,k]/3.0+a[i,j,k] ... e[i, k] = g[i,k]*(2+h[i,k+1]) - ... """) + ... """, name="stats_knl") >>> knl = lp.add_and_infer_dtypes(knl, ... dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) @@ -1554,7 +1554,7 @@ information provided. Now we will count the operations: >>> op_map = lp.get_op_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(op_map)) - Op(np:dtype('float32'), add, subgroup, loopy_kernel) : ... + Op(np:dtype('float32'), add, subgroup, stats_knl) : ... Each line of output will look roughly like:: @@ -1580,12 +1580,12 @@ One way to evaluate these polynomials is with :func:`islpy.eval_with_dict`: >>> param_dict = {'n': 256, 'm': 256, 'l': 8} >>> from loopy.statistics import CountGranularity as CG - >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) - >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) - >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) - >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) - >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) - >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP, knl.name)].eval_with_dict(param_dict) + >>> f32add = op_map[lp.Op(np.float32, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f32div = op_map[lp.Op(np.float32, 'div', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f32mul = op_map[lp.Op(np.float32, 'mul', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f64add = op_map[lp.Op(np.float64, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> f64mul = op_map[lp.Op(np.float64, 'mul', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) + >>> i32add = op_map[lp.Op(np.int32, 'add', CG.SUBGROUP, "stats_knl")].eval_with_dict(param_dict) >>> print("%i\n%i\n%i\n%i\n%i\n%i" % ... (f32add, f32div, f32mul, f64add, f64mul, i32add)) 524288 @@ -1642,15 +1642,15 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, stats_knl) : ... Each line of output will look roughly like:: - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, loopy_kernel) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup, loopy_kernel) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup, loopy_kernel) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, stats_knl) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup, stats_knl) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup, stats_knl) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1685,13 +1685,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP, knl.name) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP, knl.name) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP, knl.name) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP, knl.name) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP, "stats_knl") ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1709,7 +1709,7 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup, stats_knl) : ... >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') @@ -1752,12 +1752,12 @@ this time. ... outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem, stats_knl) : ... With this parallelization, consecutive work-items will access consecutive array @@ -1767,13 +1767,13 @@ array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM, knl.name) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM, knl.name) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM, knl.name) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM, knl.name) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1793,12 +1793,12 @@ we'll switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem, loopy_kernel) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem, loopy_kernel) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem, stats_knl) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem, stats_knl) : ... With this parallelization, consecutive work-items will access *nonconsecutive* @@ -1807,13 +1807,13 @@ changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM, knl.name) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM, knl.name) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM, knl.name) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM, knl.name) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM, "stats_knl") ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1847,14 +1847,14 @@ kernel from the previous example: >>> sync_map = lp.get_synchronization_map(knl) >>> print(lp.stringify_stats_mapping(sync_map)) - Sync(kernel_launch, loopy_kernel) : [l, m, n] -> { 1 } + Sync(kernel_launch, stats_knl) : [l, m, n] -> { 1 } We can evaluate this polynomial using :func:`islpy.eval_with_dict`: .. doctest:: - >>> launch_count = sync_map[lp.Sync("kernel_launch", knl.name)].eval_with_dict(param_dict) + >>> launch_count = sync_map[lp.Sync("kernel_launch", "stats_knl")].eval_with_dict(param_dict) >>> print("Kernel launch count: %s" % launch_count) Kernel launch count: 1 diff --git a/test/test_callables.py b/test/test_callables.py index 111861f4e..32e12ded0 100644 --- a/test/test_callables.py +++ b/test/test_callables.py @@ -438,8 +438,6 @@ def test_non_sub_array_refs_arguments(ctx_factory): print(inlined) - print(inlined) - @pytest.mark.parametrize("inline", [False, True]) def test_empty_sub_array_refs(ctx_factory, inline): -- GitLab