From ff08b1a23ad2c0b40490aaefe82b4822366eb872 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Sun, 23 Sep 2018 13:28:39 -0400 Subject: [PATCH 01/34] Add LICENSE --- LICENSE | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) create mode 100644 LICENSE diff --git a/LICENSE b/LICENSE new file mode 100644 index 000000000..601df74bd --- /dev/null +++ b/LICENSE @@ -0,0 +1,21 @@ +MIT License + +Copyright (c) 2018 Andreas Klöckner and contributors + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in all +copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE +SOFTWARE. -- GitLab From 4751b84e2690e05d02342f51fcf6b7c303f2ac6b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Sun, 23 Sep 2018 13:30:06 -0400 Subject: [PATCH 02/34] Add Pytest/JUnit/Gitlab integration --- .gitlab-ci.yml | 17 +++++++++++++++++ 1 file changed, 17 insertions(+) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 1caef802b..f9ed13c52 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -12,6 +12,10 @@ Python 2.7 POCL: - pocl except: - tags + artifacts: + reports: + junit: test/pytest.xml + Python 2.7 with legacy PyOpenCL: script: @@ -29,6 +33,10 @@ Python 2.7 with legacy PyOpenCL: except: - tags retry: 2 + artifacts: + reports: + junit: test/pytest.xml + Python 3.6 POCL: script: @@ -43,6 +51,10 @@ Python 3.6 POCL: - pocl except: - tags + artifacts: + reports: + junit: test/pytest.xml + Python 3.6 POCL Twice With Cache: script: @@ -59,6 +71,10 @@ Python 3.6 POCL Twice With Cache: - pocl except: - tags + artifacts: + reports: + junit: test/pytest.xml + # PyPy POCL: # script: @@ -87,6 +103,7 @@ Python 3.6 POCL Examples: except: - tags + CentOS binary: script: - (cd build-helpers; ./make-linux-build-docker.sh --nodate) -- GitLab From 429f7ab3e732255ca84192d28d3cef5e31a76d91 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Mon, 1 Oct 2018 14:49:46 -0400 Subject: [PATCH 03/34] Try explicitly installing ipython to avoid incompatible prompt-toolkit being installed (e.g. https://gitlab.tiker.net/inducer/loopy/-/jobs/63712) --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f9ed13c52..ec2000c3d 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -93,7 +93,7 @@ Python 3.6 POCL Examples: script: - export PY_EXE=python3.6 - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib jupyter nbconvert" + - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile ipython matplotlib jupyter nbconvert" - ". ./build-py-project-and-run-examples.sh" tags: - python3.6 -- GitLab From 0dd11214191d31417b7555977837329f92e58bbb Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Tue, 2 Oct 2018 13:31:00 -0400 Subject: [PATCH 04/34] Don't install all of jupyter in examples CI run May help with https://github.com/jupyter/jupyter_console/issues/158 --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index ec2000c3d..29a4b657b 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -93,7 +93,7 @@ Python 3.6 POCL Examples: script: - export PY_EXE=python3.6 - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile ipython matplotlib jupyter nbconvert" + - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib nbconvert" - ". ./build-py-project-and-run-examples.sh" tags: - python3.6 -- GitLab From 97999153a0236ed2f7f2e510292bb063f1b67c5b Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Tue, 2 Oct 2018 13:58:08 -0400 Subject: [PATCH 05/34] Localize dep on 'packaging' to CentOS package --- build-helpers/make-linux-build-docker-inner-part-2.sh | 4 ++++ requirements.txt | 5 +---- 2 files changed, 5 insertions(+), 4 deletions(-) diff --git a/build-helpers/make-linux-build-docker-inner-part-2.sh b/build-helpers/make-linux-build-docker-inner-part-2.sh index 1e35a1e1b..035634b16 100755 --- a/build-helpers/make-linux-build-docker-inner-part-2.sh +++ b/build-helpers/make-linux-build-docker-inner-part-2.sh @@ -23,6 +23,10 @@ git clone --recursive git://github.com/inducer/loopy cd loopy grep -v pyopencl requirements.txt > myreq.txt + +# needed for pyinstaller package to be usable +echo packaging >> myreq.txt + pip install -r myreq.txt python setup.py install diff --git a/requirements.txt b/requirements.txt index a3e88cfea..97c202476 100644 --- a/requirements.txt +++ b/requirements.txt @@ -9,7 +9,4 @@ git+https://github.com/inducer/codepy.git git+https://github.com/inducer/f2py # Optional, needed for using the C preprocessor on Fortran -ply>=3.6 - -# This is needed for the pyinstaller executable to be usable. -packaging +ply>=3.6 \ No newline at end of file -- GitLab From bc5d195d15ed019e9dc039b0ace8b744de083270 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Tue, 2 Oct 2018 13:58:44 -0400 Subject: [PATCH 06/34] Add deps needed to keep nbconvert execute happy --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 29a4b657b..913460d81 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -93,7 +93,7 @@ Python 3.6 POCL Examples: script: - export PY_EXE=python3.6 - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib nbconvert" + - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipython jupyter_client nbconvert" - ". ./build-py-project-and-run-examples.sh" tags: - python3.6 -- GitLab From da44bc51994bb7a21d595dd7859aa0c013ab545d Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= Date: Tue, 2 Oct 2018 14:16:59 -0400 Subject: [PATCH 07/34] More futzing with examples CI package deps: Use ipykernel instead of ipython --- .gitlab-ci.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 913460d81..ea69114d6 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -93,7 +93,7 @@ Python 3.6 POCL Examples: script: - export PY_EXE=python3.6 - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipython jupyter_client nbconvert" + - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert" - ". ./build-py-project-and-run-examples.sh" tags: - python3.6 -- GitLab From 5d64d8244ff100180134e4e04313fa0c1cb86212 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 21 Oct 2018 01:25:42 -0500 Subject: [PATCH 08/34] Fix, test kernel dep arrow printing for continuing downward (i.e. bad) dependencies --- loopy/kernel/tools.py | 4 ++-- test/test_loopy.py | 17 +++++++++++++++++ 2 files changed, 19 insertions(+), 2 deletions(-) diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py index 95c3c336c..b8be6191d 100644 --- a/loopy/kernel/tools.py +++ b/loopy/kernel/tools.py @@ -1241,7 +1241,7 @@ def draw_dependencies_as_unicode_arrows( for dep in insn.depends_on: reverse_deps.setdefault(dep, set()).add(insn.id) - # mapping of (from_id, to_id) tuples to column_index + # mapping of to_id tuples to column_index dep_to_column = {} # {{{ find column assignments @@ -1318,7 +1318,7 @@ def draw_dependencies_as_unicode_arrows( elif insn.id in starts: starts.remove(insn.id) - if starts: + if starts or pointed_at_insn_id not in processed_ids: # will continue downward row[col] = do_flag_downward(u"├", pointed_at_insn_id) diff --git a/test/test_loopy.py b/test/test_loopy.py index accf9c1df..f19c76026 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2908,6 +2908,23 @@ def test_dep_cycle_printing_and_error(): print(lp.generate_code(knl)[0]) +def test_backwards_dep_printing_and_error(): + knl = lp.make_kernel( + "{[i]: 0<=i 1: exec(sys.argv[1]) -- GitLab From 29e133e54b051bfadc2173eb5a4b75a76fc9c153 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 21 Oct 2018 02:05:09 -0500 Subject: [PATCH 09/34] Py2 fix --- test/test_loopy.py | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_loopy.py b/test/test_loopy.py index f19c76026..38d1cd6b0 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -2918,8 +2918,10 @@ def test_backwards_dep_printing_and_error(): b[i] = b[i] + c[i] {id=insn4, dep=insn3} d[i] = 7*a[i ] {id=insn5, dep=insn4} a[i] = a[i] + d[i] {id=insn6, dep=insn5} - """, [lp.GlobalArg('a, b', dtype=np.float64), - ...], lang_version=(2018, 2)) + """, [ + lp.GlobalArg('a, b', dtype=np.float64), + "..." + ]) # Used to crash with KeyError print(knl) -- GitLab From 4c4ff5076661c0ef9bacd8ea119a96645453964f Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sun, 28 Oct 2018 14:36:19 -0500 Subject: [PATCH 10/34] Placate flake8 3.6 --- loopy/frontend/fortran/tree.py | 2 +- loopy/symbolic.py | 2 +- setup.cfg | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py index b1df6e3d0..6939bb6ad 100644 --- a/loopy/frontend/fortran/tree.py +++ b/loopy/frontend/fortran/tree.py @@ -53,7 +53,7 @@ class FTreeWalkerBase(object): ENTITY_RE = re.compile( r"^(?P[_0-9a-zA-Z]+)" - "(\((?P[-+*0-9:a-zA-Z, \t]+)\))?$") + r"(\((?P[-+*0-9:a-zA-Z, \t]+)\))?$") def parse_dimension_specs(self, node, dim_decls): def parse_bounds(bounds_str): diff --git a/loopy/symbolic.py b/loopy/symbolic.py index 8927cd6fb..f4d46854b 100644 --- a/loopy/symbolic.py +++ b/loopy/symbolic.py @@ -1629,7 +1629,7 @@ def get_access_range(domain, subscript, assumptions, shape=None, if shape is not None: try: shape_aff = guarded_aff_from_expr(access_map.space, shape[idim]) - except ExpressionToAffineConversionError as sub_err: + except ExpressionToAffineConversionError: pass if shape_aff is None: diff --git a/setup.cfg b/setup.cfg index b939ce0cf..eec3dfd1f 100644 --- a/setup.cfg +++ b/setup.cfg @@ -1,5 +1,5 @@ [flake8] -ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814 +ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814,W504 max-line-length=85 exclude= loopy/target/c/compyte/ndarray, -- GitLab From 1d7a5162a595bd490ea5e06800ac616920b2358e Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Thu, 1 Nov 2018 14:05:25 -0500 Subject: [PATCH 11/34] Fix test parameter order --- test/test_numa_diff.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py index 15d5ea7c9..54b608a18 100644 --- a/test/test_numa_diff.py +++ b/test/test_numa_diff.py @@ -47,8 +47,8 @@ __all__ = [ from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2 # noqa -@pytest.mark.parametrize("Nq", [7]) @pytest.mark.parametrize("ilp_multiple", [1, 2]) +@pytest.mark.parametrize("Nq", [7]) @pytest.mark.parametrize("opt_level", [11]) def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level): # noqa ctx = ctx_factory() -- GitLab From cded42ec99b1bccae276789ffc83ff6d3e615db9 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 10 Nov 2018 16:36:00 -0600 Subject: [PATCH 12/34] Fix CUDA short vector codegen --- loopy/target/cuda.py | 2 +- test/test_target.py | 17 +++++++++++++++++ 2 files changed, 18 insertions(+), 1 deletion(-) diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py index 673d3b284..d6f55091a 100644 --- a/loopy/target/cuda.py +++ b/loopy/target/cuda.py @@ -290,7 +290,7 @@ class CUDACASTBuilder(CASTBuilder): _VEC_AXES = "xyzw" def add_vector_access(self, access_expr, index): - return access_expr.a(self._VEC_AXES[index]) + return access_expr.attr(self._VEC_AXES[index]) def emit_barrier(self, synchronization_kind, mem_kind, comment): """ diff --git a/test/test_target.py b/test/test_target.py index 75b3c05ae..bcf85a340 100644 --- a/test/test_target.py +++ b/test/test_target.py @@ -358,6 +358,23 @@ def test_ispc_streaming_stores(): lp.generate_code_v2(knl).all_code() +def test_cuda_short_vector(): + knl = lp.make_kernel( + "{ [i]: 0<=i 1: exec(sys.argv[1]) -- GitLab From acc42897335d379d062b049d03ac8ab71361e994 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 10 Nov 2018 17:29:58 -0600 Subject: [PATCH 13/34] Tweak reduction example for linear stride in reduced array --- loopy/schedule/__init__.py | 10 +++++++--- test/test_reduction.py | 12 +++++++++--- 2 files changed, 16 insertions(+), 6 deletions(-) diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py index 652f8b893..58b68486b 100644 --- a/loopy/schedule/__init__.py +++ b/loopy/schedule/__init__.py @@ -794,9 +794,13 @@ def generate_loop_schedules_internal( if not is_ready: if debug_mode: - print("instruction '%s' is missing insn depedencies '%s'" % ( - format_insn(kernel, insn.id), ",".join( - insn.depends_on - sched_state.scheduled_insn_ids))) + # These are not that interesting when understanding scheduler + # failures. + + # print("instruction '%s' is missing insn depedencies '%s'" % ( + # format_insn(kernel, insn.id), ",".join( + # insn.depends_on - sched_state.scheduled_insn_ids))) + pass continue want = kernel.insn_inames(insn) - sched_state.parallel_inames diff --git a/test/test_reduction.py b/test/test_reduction.py index 78eca4d0c..ef229d5cd 100644 --- a/test/test_reduction.py +++ b/test/test_reduction.py @@ -225,22 +225,28 @@ def test_global_parallel_reduction(ctx_factory, size): "{[i]: 0 <= i < n }", """ # Using z[0] instead of z works around a bug in ancient PyOpenCL. - z[0] = sum(i, i/13) + z[0] = sum(i, a[i]) """) + knl = lp.add_and_infer_dtypes(knl, {"a": np.float32}) ref_knl = knl gsize = 128 knl = lp.split_iname(knl, "i", gsize * 20) - knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0") - knl = lp.split_reduction_inward(knl, "i_inner_inner") + knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0") + knl = lp.split_reduction_outward(knl, "i_outer") knl = lp.split_reduction_inward(knl, "i_inner_outer") from loopy.transform.data import reduction_arg_to_subst_rule knl = reduction_arg_to_subst_rule(knl, "i_outer") + knl = lp.precompute(knl, "red_i_outer_arg", "i_outer", temporary_scope=lp.temp_var_scope.GLOBAL, default_tag="l.auto") knl = lp.realize_reduction(knl) + knl = lp.tag_inames(knl, "i_outer_0:g.0") + + # Keep the i_outer accumulator on the correct (lower) side of the barrier, + # otherwise there will be useless save/reload code generated. knl = lp.add_dependency( knl, "writes:acc_i_outer", "id:red_i_outer_arg_barrier") -- GitLab From d69b0e24095eea943e38fd7dcb1a106b43e506b5 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 10 Nov 2018 20:15:54 -0600 Subject: [PATCH 14/34] Bank conflict avoidance: Use tuples for shape --- loopy/target/pyopencl.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 73e8e0092..2fc3bc1ed 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -125,7 +125,7 @@ def adjust_local_temp_var_storage(kernel, device): new_storage_shape = storage_shape - new_temp_vars[temp_var.name] = temp_var.copy(storage_shape=new_storage_shape) + new_temp_vars[temp_var.name] = temp_var.copy(storage_shape=tuple(new_storage_shape)) return kernel.copy(temporary_variables=new_temp_vars) -- GitLab From ba58da9ded42bccb28b1a30048ae74fd3916abe3 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Sat, 10 Nov 2018 20:33:26 -0600 Subject: [PATCH 15/34] Placate flake8 --- loopy/target/pyopencl.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py index 2fc3bc1ed..34faf0a03 100644 --- a/loopy/target/pyopencl.py +++ b/loopy/target/pyopencl.py @@ -125,7 +125,8 @@ def adjust_local_temp_var_storage(kernel, device): new_storage_shape = storage_shape - new_temp_vars[temp_var.name] = temp_var.copy(storage_shape=tuple(new_storage_shape)) + new_temp_vars[temp_var.name] = temp_var.copy( + storage_shape=tuple(new_storage_shape)) return kernel.copy(temporary_variables=new_temp_vars) -- GitLab From 9b519ecbac9db836460462dfd89deb47a79ff0d1 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 13 Nov 2018 06:44:26 -0600 Subject: [PATCH 16/34] added count_within_subscripts boolean to OpCounter --- loopy/statistics.py | 15 +++++++++++---- test/test_statistics.py | 14 +++++++++----- 2 files changed, 20 insertions(+), 9 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index 3fecfb778..b467e3334 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -697,8 +697,9 @@ class CounterBase(CombineMapper): # {{{ ExpressionOpCounter class ExpressionOpCounter(CounterBase): - def __init__(self, knl): + def __init__(self, knl, count_within_subscripts=True): self.knl = knl + self.count_within_subscripts = count_within_subscripts from loopy.type_inference import TypeInferenceMapper self.type_inf = TypeInferenceMapper(knl) @@ -719,7 +720,10 @@ class ExpressionOpCounter(CounterBase): ) + self.rec(expr.parameters) def map_subscript(self, expr): - return self.rec(expr.index) + if self.count_within_subscripts: + return self.rec(expr.index) + else: + return ToCountMap() def map_sum(self, expr): assert expr.children @@ -1314,7 +1318,7 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work, # {{{ get_op_map def get_op_map(knl, numpy_types=True, count_redundant_work=False, - subgroup_size=None): + count_within_subscripts=True, subgroup_size=None): """Count the number of operations in a loopy kernel. @@ -1330,6 +1334,9 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, (Likely desirable for performance modeling, but undesirable for code optimization.) + :arg count_within_subscripts: A :class:`bool` specifying whether to + count operations inside array indices. + :arg subgroup_size: (currently unused) An :class:`int`, :class:`str` ``'guess'``, or *None* that specifies the sub-group size. An OpenCL sub-group is an implementation-dependent grouping of work-items within @@ -1382,7 +1389,7 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False, knl = preprocess_kernel(knl) op_map = ToCountMap() - op_counter = ExpressionOpCounter(knl) + op_counter = ExpressionOpCounter(knl, count_within_subscripts) from loopy.kernel.instruction import ( CallInstruction, CInstruction, Assignment, diff --git a/test/test_statistics.py b/test/test_statistics.py index 3f2366521..41b44b5a7 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -57,7 +57,8 @@ def test_op_counter_basic(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) @@ -161,7 +162,8 @@ def test_op_counter_specialops(): knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32, g=np.float64, h=np.float64)) - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) @@ -206,7 +208,8 @@ def test_op_counter_bitwise(): a=np.int32, b=np.int32, g=np.int64, h=np.int64)) - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=False) n_workgroups = 1 group_size = 1 subgroups_per_group = div_ceil(group_size, SGS) @@ -226,7 +229,7 @@ def test_op_counter_bitwise(): i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP) ].eval_with_dict(params) # (count-per-sub-group)*n_subgroups - assert i32add == n*m+n*m*ell*n_subgroups + assert i32add == n*m*ell*n_subgroups assert i32bw == 2*n*m*ell*n_subgroups assert i64bw == 2*n*m*n_subgroups assert i64add == i64mul == n*m*n_subgroups @@ -1153,7 +1156,8 @@ def test_summations_and_filters(): assert f32lall == (3*n*m*ell)*n_subgroups assert f64lall == (2*n*m)*n_subgroups - op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True) + op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True, + count_within_subscripts=True) #for k, v in op_map.items(): # print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v) -- GitLab From 9317bc4f034aa9624ecf4d6d8f45b78ea844687f Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Sun, 18 Nov 2018 18:41:00 -0600 Subject: [PATCH 17/34] Documentation fixes --- doc/index.rst | 10 +++++----- doc/ref_kernel.rst | 2 +- examples/python/hello-loopy.loopy | 2 +- loopy/kernel/data.py | 2 +- loopy/transform/add_barrier.py | 8 ++++---- loopy/transform/batch.py | 1 + loopy/transform/buffer.py | 2 +- loopy/transform/iname.py | 2 -- 8 files changed, 14 insertions(+), 15 deletions(-) diff --git a/doc/index.rst b/doc/index.rst index d862a8acd..b77bbb16f 100644 --- a/doc/index.rst +++ b/doc/index.rst @@ -25,18 +25,18 @@ Want to try out loopy? There's no need to go through :ref:`installation` if you'd just like to get a feel for what loopy is. Instead, you may -`download a self-contained Linux binary `_. +`download a self-contained Linux binary `_. This is purposefully built on an ancient Linux distribution, so it should work on most versions of Linux that are currently out there. Once you have the binary, do the following:: chmod +x ./loopy-centos6 - ./loopy-centos6 --target=opencl hello-loopy-lp.py - ./loopy-centos6 --target=cuda hello-loopy-lp.py - ./loopy-centos6 --target=ispc hello-loopy-lp.py + ./loopy-centos6 --target=opencl hello-loopy.loopy + ./loopy-centos6 --target=cuda hello-loopy.loopy + ./loopy-centos6 --target=ispc hello-loopy.loopy -Grab the example here: :download:`examples/python/hello-loopy.py <../examples/python/hello-loopy-lp.py>`. +Grab the example here: :download:`examples/python/hello-loopy.loopy <../examples/python/hello-loopy.loopy>`. You may also donwload the most recent version by going to the `list of builds `_, clicking on the newest one diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index c9ce20626..62d76c779 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -406,7 +406,7 @@ Arguments :members: :undoc-members: -.. autoclass:: GlobalArg +.. autofunction:: GlobalArg :members: :undoc-members: diff --git a/examples/python/hello-loopy.loopy b/examples/python/hello-loopy.loopy index 0ba44d6ec..7f7973098 100644 --- a/examples/python/hello-loopy.loopy +++ b/examples/python/hello-loopy.loopy @@ -1,7 +1,7 @@ # This is a version of hello-loopy.py that can be run through # a loopy binary using # -# ./loopy --lang=loopy hello-loopy-lp.py - +# ./loopy --lang=loopy hello-loopy.loopy - knl = lp.make_kernel( "{ [i]: 0<=i Date: Sun, 18 Nov 2018 18:48:12 -0600 Subject: [PATCH 18/34] More fixes --- doc/ref_kernel.rst | 2 -- doc/tutorial.rst | 2 +- loopy/kernel/__init__.py | 2 +- 3 files changed, 2 insertions(+), 4 deletions(-) diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index 62d76c779..11ec7f030 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -407,8 +407,6 @@ Arguments :undoc-members: .. autofunction:: GlobalArg - :members: - :undoc-members: .. autoclass:: ConstantArg :members: diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 1272d2a59..73f5dea75 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1361,7 +1361,7 @@ code-generation however additional implementation may be required for custom functions. The full lists of available functions may be found in a the :class:`TargetBase` implementation (e.g. :class:`CudaTarget`) -Custom user functions may be represented using the method described in :ref:`_functions` +Custom user functions may be represented using the method described in :ref:`functions` Data-dependent control flow diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py index 6b0033808..e3342d0f9 100644 --- a/loopy/kernel/__init__.py +++ b/loopy/kernel/__init__.py @@ -142,7 +142,7 @@ class LoopKernel(ImmutableRecordWithoutPickling): .. note:: This data structure and its attributes should be considered immutable, - even if it contains mutable data types. See :method:`copy` for an easy + even if it contains mutable data types. See :meth:`copy` for an easy way of producing a modified copy. .. attribute:: domains -- GitLab From 18ded6348d7c56c1d8994d7f0c412d9eb2337ed8 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Mon, 19 Nov 2018 17:11:31 -0600 Subject: [PATCH 19/34] Fix a broken doc entry --- loopy/kernel/array.py | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py index 6bf733a84..bae9d7d1f 100644 --- a/loopy/kernel/array.py +++ b/loopy/kernel/array.py @@ -549,15 +549,15 @@ class ArrayBase(ImmutableRecord): .. attribute :: name .. attribute :: dtype - the :class:`loopy.loopytype` of the array. - if this is *none*, :mod:`loopy` will try to continue without - knowing the type of this array, where the idea is that precise - knowledge of the type will become available at invocation time. - :class:`loopy.compiledkernel` (and thereby - :meth:`loopy.loopkernel.__call__`) automatically add this type - information based on invocation arguments. - - note that some transformations, such as :func:`loopy.add_padding` + + The :class:`loopy.types.LoopyType` of the array. If this is *None*, + :mod:`loopy` will try to continue without knowing the type of this + array, where the idea is that precise knowledge of the type will become + available at invocation time. Calling the kernel + (via :meth:`loopy.LoopKernel.__call__`) + automatically adds this type information based on invocation arguments. + + Note that some transformations, such as :func:`loopy.add_padding` cannot be performed without knowledge of the exact *dtype*. .. attribute :: shape -- GitLab From 9cfdd91bdf1b613dc72e96ed4bc70d3b67223163 Mon Sep 17 00:00:00 2001 From: Matt Wala Date: Wed, 21 Nov 2018 00:03:54 -0600 Subject: [PATCH 20/34] Change GlobalArg to ArrayArg --- doc/ref_kernel.rst | 6 ++++-- loopy/kernel/data.py | 2 +- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst index 11ec7f030..896388d29 100644 --- a/doc/ref_kernel.rst +++ b/doc/ref_kernel.rst @@ -406,7 +406,9 @@ Arguments :members: :undoc-members: -.. autofunction:: GlobalArg +.. autoclass:: ArrayArg + :members: + :undoc-members: .. autoclass:: ConstantArg :members: @@ -591,7 +593,7 @@ Do not create :class:`LoopKernel` objects directly. Instead, refer to Implementation Detail: The Base Array ------------------------------------- -All array-like data in :mod:`loopy` (such as :class:`GlobalArg` and +All array-like data in :mod:`loopy` (such as :class:`ArrayArg` and :class:`TemporaryVariable`) derive from single, shared base array type, described next. diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py index d6490aa88..7877f8b93 100644 --- a/loopy/kernel/data.py +++ b/loopy/kernel/data.py @@ -49,7 +49,7 @@ from warnings import warn class auto(object): # noqa """A generic placeholder object for something that should be automatically determined. See, for example, the *shape* or *strides* argument of - :func:`GlobalArg`. + :class:`ArrayArg`. """ -- GitLab From fe4ed770ab4d037c53888d96290bb163ac56e33c Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Wed, 28 Nov 2018 18:58:17 -0600 Subject: [PATCH 21/34] added variable_tag to MemAccess; GlobalMemAccessCounter tracking variable tags for tagged global variables --- loopy/statistics.py | 21 ++++++++++++--- test/test_statistics.py | 59 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 77 insertions(+), 3 deletions(-) diff --git a/loopy/statistics.py b/loopy/statistics.py index b467e3334..9ce2bb081 100755 --- a/loopy/statistics.py +++ b/loopy/statistics.py @@ -581,6 +581,11 @@ class MemAccess(Record): A :class:`str` that specifies the variable name of the data accessed. + .. attribute:: variable_tag + + A :class:`str` that specifies the variable tag of a + :class:`pymbolic.primitives.TaggedVariable`. + .. attribute:: count_granularity A :class:`str` that specifies whether this operation should be counted @@ -597,7 +602,8 @@ class MemAccess(Record): """ def __init__(self, mtype=None, dtype=None, lid_strides=None, gid_strides=None, - direction=None, variable=None, count_granularity=None): + direction=None, variable=None, variable_tag=None, + count_granularity=None): if count_granularity not in CountGranularity.ALL+[None]: raise ValueError("Op.__init__: count_granularity '%s' is " @@ -607,12 +613,14 @@ class MemAccess(Record): if dtype is None: Record.__init__(self, mtype=mtype, dtype=dtype, lid_strides=lid_strides, gid_strides=gid_strides, direction=direction, - variable=variable, count_granularity=count_granularity) + variable=variable, variable_tag=variable_tag, + count_granularity=count_granularity) else: from loopy.types import to_loopy_type Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype), lid_strides=lid_strides, gid_strides=gid_strides, direction=direction, variable=variable, + variable_tag=variable_tag, count_granularity=count_granularity) def __hash__(self): @@ -622,7 +630,7 @@ class MemAccess(Record): def __repr__(self): # Record.__repr__ overridden for consistent ordering and conciseness - return "MemAccess(%s, %s, %s, %s, %s, %s, %s)" % ( + return "MemAccess(%s, %s, %s, %s, %s, %s, %s, %s)" % ( self.mtype, self.dtype, None if self.lid_strides is None else dict( @@ -631,6 +639,7 @@ class MemAccess(Record): sorted(six.iteritems(self.gid_strides))), self.direction, self.variable, + self.variable_tag, self.count_granularity) # }}} @@ -985,6 +994,10 @@ class GlobalMemAccessCounter(MemAccessCounter): def map_subscript(self, expr): name = expr.aggregate.name + try: + var_tag = expr.aggregate.tag + except AttributeError: + var_tag = None if name in self.knl.arg_dict: array = self.knl.arg_dict[name] @@ -1013,6 +1026,7 @@ class GlobalMemAccessCounter(MemAccessCounter): lid_strides=dict(sorted(six.iteritems(lid_strides))), gid_strides=dict(sorted(six.iteritems(gid_strides))), variable=name, + variable_tag=var_tag, count_granularity=count_granularity ): 1} ) + self.rec(expr.index_tuple) @@ -1634,6 +1648,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False, gid_strides=mem_access.gid_strides, direction=mem_access.direction, variable=mem_access.variable, + variable_tag=mem_access.variable_tag, count_granularity=mem_access.count_granularity), ct) for mem_access, ct in six.iteritems(access_map.count_map)), diff --git a/test/test_statistics.py b/test/test_statistics.py index 41b44b5a7..b29edf1ed 100644 --- a/test/test_statistics.py +++ b/test/test_statistics.py @@ -1060,6 +1060,65 @@ def test_all_counters_parallel_matmul(): assert local_mem_s == m*2/bsize*n_subgroups +def test_mem_access_tagged_variables(): + bsize = 16 + knl = lp.make_kernel( + "{[i,k,j]: 0<=i Date: Wed, 28 Nov 2018 19:08:11 -0600 Subject: [PATCH 22/34] updated tutotial w/variable tag printing --- doc/tutorial.rst | 74 ++++++++++++++++++++++++------------------------ 1 file changed, 37 insertions(+), 37 deletions(-) diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 73f5dea75..397f34a98 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -1641,15 +1641,15 @@ we'll continue using the kernel from the previous example: >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ... Each line of output will look roughly like:: - MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, load, b, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, store, c, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 } :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{** :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**. @@ -1684,13 +1684,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', CG.SUBGROUP) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', CG.SUBGROUP) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', CG.SUBGROUP) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', CG.SUBGROUP) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1708,13 +1708,13 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: >>> bytes_map = mem_map.to_bytes() >>> print(lp.stringify_stats_mapping(bytes_map)) - MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : ... + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ... >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global'] ... ).group_by('direction') >>> print(lp.stringify_stats_mapping(global_ld_st_bytes)) - MemAccess(None, None, None, None, load, None, None) : ... - MemAccess(None, None, None, None, store, None, None) : ... + MemAccess(None, None, None, None, load, None, None, None) : ... + MemAccess(None, None, None, None, store, None, None, None) : ... >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load') ... ].eval_with_dict(param_dict) @@ -1726,12 +1726,12 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`: The lines of output above might look like:: - MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, load, b, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float32'), {}, {}, store, c, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), {}, {}, load, g, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), {}, {}, load, h, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } - MemAccess(global, np:dtype('float64'), {}, {}, store, e, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), {}, {}, load, g, None, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), {}, {}, load, h, None, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } + MemAccess(global, np:dtype('float64'), {}, {}, store, e, None, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 } One can see how these functions might be useful in computing, for example, achieved memory bandwidth in byte/sec or performance in FLOP/sec. @@ -1751,12 +1751,12 @@ this time. ... outer_tag="l.1", inner_tag="l.0") >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem) : ... With this parallelization, consecutive work-items will access consecutive array @@ -1766,13 +1766,13 @@ array accesses has not changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) @@ -1792,12 +1792,12 @@ we'll switch the inner and outer tags in our parallelization of the kernel: ... outer_tag="l.0", inner_tag="l.1") >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32) >>> print(lp.stringify_stats_mapping(mem_map)) - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, workitem) : ... - MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, workitem) : ... - MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem) : ... + MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem) : ... + MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem) : ... With this parallelization, consecutive work-items will access *nonconsecutive* @@ -1806,13 +1806,13 @@ changed: .. doctest:: - >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', CG.WORKITEM) + >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', CG.WORKITEM) + >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', CG.WORKITEM) + >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM) ... ].eval_with_dict(param_dict) - >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', CG.WORKITEM) + >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM) ... ].eval_with_dict(param_dict) >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" % ... (f32ld_a, f32st_c, f64ld_g, f64st_e)) -- GitLab From 6ab14cebbacd1ee38df8cc479ce737bce8b741ed Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 3 Dec 2018 23:36:53 -0600 Subject: [PATCH 23/34] allowing (global) variables to be excluded in remove_work() --- loopy/transform/instruction.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 3dd7009ea..2325e1b0b 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -362,9 +362,10 @@ def uniquify_instruction_ids(kernel): # {{{ remove_work class _MemAccessGatherer(CombineMapper): - def __init__(self, kernel, address_space): + def __init__(self, kernel, address_space, exclude_vars=[]): self.kernel = kernel self.address_space = address_space + self.exclude_vars = exclude_vars def combine(self, values): from pytools import flatten @@ -381,7 +382,8 @@ class _MemAccessGatherer(CombineMapper): return set() descr = self.kernel.get_var_descriptor(name) - if descr.address_space == self.address_space: + if descr.address_space == self.address_space and \ + name not in self.exclude_vars: result = set([expr]) else: result = set() @@ -420,7 +422,7 @@ def _make_grid_size_domain(kernel, var_name_gen=None): return ggrid_var_names, lgrid_var_names, grid_range_dom -def remove_work(kernel): +def remove_work(kernel, remove_vars=[]): """This transform removes operations in a kernel, leaving only accesses to global memory. @@ -434,7 +436,8 @@ def remove_work(kernel): kernel = lp.preprocess_kernel(kernel) - gatherer = _MemAccessGatherer(kernel, lp.AddressSpace.GLOBAL) + gatherer = _MemAccessGatherer(kernel, lp.AddressSpace.GLOBAL, + exclude_vars=remove_vars) from loopy.kernel.instruction import MultiAssignmentBase, make_assignment -- GitLab From 2d53c7f9d0b78bd866de9b8b9cfbef3ae2782700 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 4 Dec 2018 00:15:13 -0600 Subject: [PATCH 24/34] (WIP) writing read_tgt_var to global to ensure instructions execute, which means creating a new global arg if output var no longer exists (was removed w/remove_work) --- loopy/transform/instruction.py | 58 +++++++++++++++++++++++++++++++++- 1 file changed, 57 insertions(+), 1 deletion(-) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 2325e1b0b..5fe3c6c6b 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -384,6 +384,7 @@ class _MemAccessGatherer(CombineMapper): descr = self.kernel.get_var_descriptor(name) if descr.address_space == self.address_space and \ name not in self.exclude_vars: + # TODO what about tags? result = set([expr]) else: result = set() @@ -448,6 +449,7 @@ def remove_work(kernel, remove_vars=[]): var_name_gen = kernel.get_var_name_generator() read_tgt_var_name = var_name_gen("read_tgt") new_temporary_variables = kernel.temporary_variables.copy() + new_args = kernel.args.copy() new_temporary_variables[read_tgt_var_name] = lp.TemporaryVariable( read_tgt_var_name, address_space=lp.AddressSpace.PRIVATE) @@ -473,6 +475,8 @@ def remove_work(kernel, remove_vars=[]): read_insn_ids = [] + read_tgt_var_written_to_global = False + for insn in kernel.instructions: if not isinstance(insn, MultiAssignmentBase): new_instructions.append(insn) @@ -502,11 +506,14 @@ def remove_work(kernel, remove_vars=[]): new_instructions.append( make_assignment( (write_expr,), - 17, + #17, + p.Variable(read_tgt_var_name), # TODO temporary hack + # insns won't execute unless output is written to global id=new_id, within_inames=insn.within_inames, depends_on=insn.depends_on)) new_insn_ids.add(new_id) + read_tgt_var_written_to_global = True # TODO part of hack above old_to_new_ids[insn.id] = frozenset(new_insn_ids) @@ -514,6 +521,8 @@ def remove_work(kernel, remove_vars=[]): # {{{ create write-out insn for read target + # TODO writing to temp doesn't guarantee execution, need to write to global mem + """ _, lgrid = kernel.get_grid_size_upper_bounds_as_exprs() read_tgt_local_dest_name = var_name_gen("read_tgt_dest") new_temporary_variables[read_tgt_local_dest_name] = lp.TemporaryVariable( @@ -531,6 +540,52 @@ def remove_work(kernel, remove_vars=[]): id=write_read_tgt_id, depends_on=frozenset(read_insn_ids), within_inames=grid_inames)) + """ + + if not read_tgt_var_written_to_global: + # TODO must write read_tgt_var to global or instructions may not execute, + # TODO if write variable has been removed, need a new write variable + # TODO if write variable has not been removed, just write to that variable? + ggrid, lgrid = kernel.get_grid_size_upper_bounds_as_exprs() + lstrides = [] + gstrides = [] + tot_stride = 1 + lstrides.append(tot_stride) + for size in lgrid[:-1]: + tot_stride *= size + lstrides.append(tot_stride) + tot_stride *= lgrid[-1] + gstrides.append(tot_stride) + for size in ggrid[:-1]: + tot_stride *= size + gstrides.append(tot_stride) + + strides = ",".join(list(reversed([str(s) for s in lstrides+gstrides]))) + + # TODO decide what this mem access pattern should be + read_tgt_global_dest_name = var_name_gen("read_tgt_dest") + new_args.append(lp.GlobalArg( + name=read_tgt_global_dest_name, + shape=tuple(reversed(lgrid + ggrid)), + strides=strides, + )) + # TODO WEIRD BEHAVIOR: when kernel has not been cached, this works fine, + # but when kernel is used again, this new arg ends up const and writing + # to it causes an error... ??? + + write_read_tgt_id = insn_id_gen("write_read_tgt") + old_to_new_ids[write_read_tgt_id] = [write_read_tgt_id] + new_instructions.append( + make_assignment( + (p.Variable(read_tgt_global_dest_name)[ + tuple(p.Variable(gn) for gn in + reversed(lgrid_var_names+ggrid_var_names))],), + p.Variable(read_tgt_var_name), + id=write_read_tgt_id, + depends_on=frozenset(read_insn_ids), + within_inames=grid_inames, + #within_inames=grid_inames_untagged, + )) # }}} @@ -552,6 +607,7 @@ def remove_work(kernel, remove_vars=[]): domains=kernel.domains + [grid_range_dom], state=lp.KernelState.INITIAL, instructions=new_instructions_2, + args=new_args, temporary_variables=new_temporary_variables) from loopy.kernel.data import GroupIndexTag, LocalIndexTag -- GitLab From bbe75e285d709be68e3c02b3ac76de2e156e9b89 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 4 Dec 2018 19:05:16 -0600 Subject: [PATCH 25/34] more standard mem access pattern for newly created output var; made function defining index/shape/stride ordering for consistency --- loopy/transform/instruction.py | 51 ++++++++++++++++++++-------------- 1 file changed, 30 insertions(+), 21 deletions(-) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 5fe3c6c6b..f1b1b3245 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -546,45 +546,54 @@ def remove_work(kernel, remove_vars=[]): # TODO must write read_tgt_var to global or instructions may not execute, # TODO if write variable has been removed, need a new write variable # TODO if write variable has not been removed, just write to that variable? - ggrid, lgrid = kernel.get_grid_size_upper_bounds_as_exprs() - lstrides = [] - gstrides = [] - tot_stride = 1 - lstrides.append(tot_stride) - for size in lgrid[:-1]: - tot_stride *= size - lstrides.append(tot_stride) - tot_stride *= lgrid[-1] - gstrides.append(tot_stride) - for size in ggrid[:-1]: - tot_stride *= size - gstrides.append(tot_stride) - - strides = ",".join(list(reversed([str(s) for s in lstrides+gstrides]))) + # define order for indexing/shape/strides + def index_order(local_list, global_list): + # produce this order: [g.n, l.n, ..., g.1, l.1, g.0, l.0] + # accept both dicts of {dim: val} and ordered lists [val0, val1, ...] + result = [] + for i in reversed(range(len(local_list))): + result.append(global_list[i]) + result.append(local_list[i]) + return result + + # define local/global strides # TODO decide what this mem access pattern should be + ggrid, lgrid = kernel.get_grid_size_upper_bounds_as_exprs() + lstrides = {0: 1} + assert len(lgrid) == len(ggrid) # TODO is this necessary? + for dim in range(1,len(lgrid)): + lstrides[dim] = lstrides[dim-1]*lgrid[dim-1]*ggrid[dim-1] + gstrides = {} + for dim in range(0,len(ggrid)): + gstrides[dim] = lstrides[dim]*lgrid[dim] + + # use consistent index ordering for strides, shape, and index + strides = index_order(lstrides, gstrides) + shape = tuple(index_order(lgrid, ggrid)) + index = tuple(p.Variable(i) for i in index_order(lgrid_var_names, ggrid_var_names)) + + # create new global arg to write results read_tgt_global_dest_name = var_name_gen("read_tgt_dest") new_args.append(lp.GlobalArg( name=read_tgt_global_dest_name, - shape=tuple(reversed(lgrid + ggrid)), - strides=strides, + shape=shape, + strides=",".join(str(s) for s in strides), )) # TODO WEIRD BEHAVIOR: when kernel has not been cached, this works fine, # but when kernel is used again, this new arg ends up const and writing # to it causes an error... ??? + # create instruction writing read_tgt to new global arg write_read_tgt_id = insn_id_gen("write_read_tgt") old_to_new_ids[write_read_tgt_id] = [write_read_tgt_id] new_instructions.append( make_assignment( - (p.Variable(read_tgt_global_dest_name)[ - tuple(p.Variable(gn) for gn in - reversed(lgrid_var_names+ggrid_var_names))],), + (p.Variable(read_tgt_global_dest_name)[index],), p.Variable(read_tgt_var_name), id=write_read_tgt_id, depends_on=frozenset(read_insn_ids), within_inames=grid_inames, - #within_inames=grid_inames_untagged, )) # }}} -- GitLab From 035814da3504b106a29fa6f089083724cade3382 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 6 Dec 2018 19:21:44 -0600 Subject: [PATCH 26/34] setting dtype of global arg --- loopy/transform/instruction.py | 22 ++++++++++++++++++---- 1 file changed, 18 insertions(+), 4 deletions(-) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index f1b1b3245..322e0576f 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -476,7 +476,9 @@ def remove_work(kernel, remove_vars=[]): read_insn_ids = [] read_tgt_var_written_to_global = False - + type_inf = lp.type_inference.TypeInferenceMapper(kernel) + read_tgt_var_dtype = None + read_tgt_var_read_expr_acc = None for insn in kernel.instructions: if not isinstance(insn, MultiAssignmentBase): new_instructions.append(insn) @@ -501,6 +503,16 @@ def remove_work(kernel, remove_vars=[]): depends_on=insn.depends_on | frozenset([read_tgt_init_id]))) new_insn_ids.add(new_id) + # determine type of read_expr + # TODO loopy already has a way of doing this, + # use that instead? (need this to set type of output arg) + if read_tgt_var_dtype is None: + read_tgt_var_dtype = type_inf(read_expr) + read_tgt_var_read_expr_acc = read_expr + elif type_inf(read_expr) != read_tgt_var_dtype: + read_tgt_var_read_expr_acc += read_expr + read_tgt_var_dtype = type_inf(read_tgt_var_read_expr_acc) + for write_expr in writer_accesses: new_id = insn_id_gen(insn.id) new_instructions.append( @@ -562,22 +574,24 @@ def remove_work(kernel, remove_vars=[]): ggrid, lgrid = kernel.get_grid_size_upper_bounds_as_exprs() lstrides = {0: 1} assert len(lgrid) == len(ggrid) # TODO is this necessary? - for dim in range(1,len(lgrid)): + for dim in range(1, len(lgrid)): lstrides[dim] = lstrides[dim-1]*lgrid[dim-1]*ggrid[dim-1] gstrides = {} - for dim in range(0,len(ggrid)): + for dim in range(0, len(ggrid)): gstrides[dim] = lstrides[dim]*lgrid[dim] # use consistent index ordering for strides, shape, and index strides = index_order(lstrides, gstrides) shape = tuple(index_order(lgrid, ggrid)) - index = tuple(p.Variable(i) for i in index_order(lgrid_var_names, ggrid_var_names)) + index = tuple(p.Variable(i) for i in + index_order(lgrid_var_names, ggrid_var_names)) # create new global arg to write results read_tgt_global_dest_name = var_name_gen("read_tgt_dest") new_args.append(lp.GlobalArg( name=read_tgt_global_dest_name, shape=shape, + dtype=read_tgt_var_dtype, strides=",".join(str(s) for s in strides), )) # TODO WEIRD BEHAVIOR: when kernel has not been cached, this works fine, -- GitLab From 656acb204ec547407fdfb115e50099c60baa809e Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 6 Dec 2018 20:00:43 -0600 Subject: [PATCH 27/34] added todo (deal with 'instruction does not use all group hw axes' error) --- loopy/transform/instruction.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 322e0576f..532626396 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -480,6 +480,8 @@ def remove_work(kernel, remove_vars=[]): read_tgt_var_dtype = None read_tgt_var_read_expr_acc = None for insn in kernel.instructions: + # TODO after instructions are removed, could produce + # "instruction does not use all group hw axes" error... if not isinstance(insn, MultiAssignmentBase): new_instructions.append(insn) old_to_new_ids[insn.id] = frozenset([insn.id]) -- GitLab From a0e06bc26251dc313ad2a0cc35bb47ba0f7a5840 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Fri, 7 Dec 2018 16:21:43 -0600 Subject: [PATCH 28/34] don't copy args if they're being removed --- loopy/transform/instruction.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 532626396..dfe0891b5 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -449,7 +449,7 @@ def remove_work(kernel, remove_vars=[]): var_name_gen = kernel.get_var_name_generator() read_tgt_var_name = var_name_gen("read_tgt") new_temporary_variables = kernel.temporary_variables.copy() - new_args = kernel.args.copy() + new_args = [arg.copy() for arg in kernel.args if arg.name not in remove_vars] new_temporary_variables[read_tgt_var_name] = lp.TemporaryVariable( read_tgt_var_name, address_space=lp.AddressSpace.PRIVATE) -- GitLab From 3ba8e3fabf16ad9fa660428ef5823bca8e454200 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Sun, 9 Dec 2018 18:38:11 -0600 Subject: [PATCH 29/34] added todo for adding grid vars to within_inames in insn --- loopy/transform/instruction.py | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index dfe0891b5..667023112 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -501,7 +501,7 @@ def remove_work(kernel, remove_vars=[]): (p.Variable(read_tgt_var_name),), p.Variable(read_tgt_var_name) + read_expr, id=new_id, - within_inames=insn.within_inames, + within_inames=insn.within_inames, # TODO try sticking in grid variables depends_on=insn.depends_on | frozenset([read_tgt_init_id]))) new_insn_ids.add(new_id) @@ -520,9 +520,7 @@ def remove_work(kernel, remove_vars=[]): new_instructions.append( make_assignment( (write_expr,), - #17, - p.Variable(read_tgt_var_name), # TODO temporary hack - # insns won't execute unless output is written to global + p.Variable(read_tgt_var_name), id=new_id, within_inames=insn.within_inames, depends_on=insn.depends_on)) -- GitLab From aaeaa2df5d091336c2239cfbdc42441fa8ecbcb8 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 11 Dec 2018 01:21:32 -0600 Subject: [PATCH 30/34] adding unrepresented axis tags to within_inames --- loopy/transform/instruction.py | 29 ++++++++++++++++++++++++----- 1 file changed, 24 insertions(+), 5 deletions(-) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 667023112..0461c7a2d 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -473,6 +473,24 @@ def remove_work(kernel, remove_vars=[]): # {{{ rewrite instructions + from loopy.kernel.data import GroupIndexTag, LocalIndexTag + + def add_unrepresented_grid_vars_to_inames(inames): + local_axes_needed = set(range(len(lgrid_var_names))) + group_axes_needed = set(range(len(ggrid_var_names))) + #TODO deal with key errors + for iname in inames: + try: + tag = kernel.iname_to_tag[iname] + except KeyError: + continue + if isinstance(tag, LocalIndexTag): + local_axes_needed.remove(tag.axis) + elif isinstance(tag, GroupIndexTag): + group_axes_needed.remove(tag.axis) + return inames | set([lgrid_var_names[axis] for axis in local_axes_needed] + + [ggrid_var_names[axis] for axis in group_axes_needed]) + read_insn_ids = [] read_tgt_var_written_to_global = False @@ -493,6 +511,8 @@ def remove_work(kernel, remove_vars=[]): reader_accesses = gatherer(insn.expression) new_insn_ids = set() + new_within_inames = frozenset( + add_unrepresented_grid_vars_to_inames(insn.within_inames)) for read_expr in reader_accesses: new_id = insn_id_gen(insn.id) read_insn_ids.append(insn.id) @@ -501,13 +521,13 @@ def remove_work(kernel, remove_vars=[]): (p.Variable(read_tgt_var_name),), p.Variable(read_tgt_var_name) + read_expr, id=new_id, - within_inames=insn.within_inames, # TODO try sticking in grid variables + within_inames=new_within_inames, depends_on=insn.depends_on | frozenset([read_tgt_init_id]))) new_insn_ids.add(new_id) # determine type of read_expr - # TODO loopy already has a way of doing this, - # use that instead? (need this to set type of output arg) + # TODO loopy already has a way of figuring this out, + # can we use that instead? (need this to set type of output arg) if read_tgt_var_dtype is None: read_tgt_var_dtype = type_inf(read_expr) read_tgt_var_read_expr_acc = read_expr @@ -522,7 +542,7 @@ def remove_work(kernel, remove_vars=[]): (write_expr,), p.Variable(read_tgt_var_name), id=new_id, - within_inames=insn.within_inames, + within_inames=new_within_inames, depends_on=insn.depends_on)) new_insn_ids.add(new_id) read_tgt_var_written_to_global = True # TODO part of hack above @@ -633,7 +653,6 @@ def remove_work(kernel, remove_vars=[]): args=new_args, temporary_variables=new_temporary_variables) - from loopy.kernel.data import GroupIndexTag, LocalIndexTag kernel = lp.tag_inames(kernel, dict( (ggrid_var_names[i], GroupIndexTag(i)) for i in range(len(ggrid_var_names)))) -- GitLab From 4d2b8460b99dda21960355366d736634684ea035 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Tue, 11 Dec 2018 21:56:32 -0600 Subject: [PATCH 31/34] setting new kernel._cached_written_variables to None since they may have changed --- loopy/transform/instruction.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 0461c7a2d..a92b4e946 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -651,7 +651,8 @@ def remove_work(kernel, remove_vars=[]): state=lp.KernelState.INITIAL, instructions=new_instructions_2, args=new_args, - temporary_variables=new_temporary_variables) + temporary_variables=new_temporary_variables, + _cached_written_variables=None) kernel = lp.tag_inames(kernel, dict( (ggrid_var_names[i], GroupIndexTag(i)) -- GitLab From 943bd65dda1662f7aeb6ef9d96c20f156e030439 Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Thu, 13 Dec 2018 22:07:43 -0600 Subject: [PATCH 32/34] added warning about potential loop priority change in remove_work --- loopy/transform/instruction.py | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index a92b4e946..5ce9938c0 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -661,6 +661,14 @@ def remove_work(kernel, remove_vars=[]): (lgrid_var_names[i], LocalIndexTag(i)) for i in range(len(lgrid_var_names)))) + if not kernel.loop_priority: + from loopy.diagnostic import warn_with_kernel + warn_with_kernel(kernel, "remove_work_loop_priority", + "Kernel loop_priority unspecified. " + "remove_work() may yield loop priority differing " + "from that of original kernel. To ensure desired " + "loop priority, use lp.prioritize_loops().") + return kernel # }}} -- GitLab From 7538c6540c91bc53248602019c949ca3b9c35eff Mon Sep 17 00:00:00 2001 From: jdsteve2 Date: Mon, 31 Dec 2018 01:10:29 -0600 Subject: [PATCH 33/34] remove_work allowing for kernel name change --- loopy/transform/instruction.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 5ce9938c0..8a5bb7604 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -423,7 +423,7 @@ def _make_grid_size_domain(kernel, var_name_gen=None): return ggrid_var_names, lgrid_var_names, grid_range_dom -def remove_work(kernel, remove_vars=[]): +def remove_work(kernel, remove_vars=[], new_knl_name=None): """This transform removes operations in a kernel, leaving only accesses to global memory. @@ -446,6 +446,10 @@ def remove_work(kernel, remove_vars=[]): old_to_new_ids = {} insn_id_gen = kernel.get_instruction_id_generator() + if new_knl_name: + new_name = new_knl_name + else: + new_name = kernel.name var_name_gen = kernel.get_var_name_generator() read_tgt_var_name = var_name_gen("read_tgt") new_temporary_variables = kernel.temporary_variables.copy() @@ -652,7 +656,9 @@ def remove_work(kernel, remove_vars=[]): instructions=new_instructions_2, args=new_args, temporary_variables=new_temporary_variables, - _cached_written_variables=None) + name=new_name, + _cached_written_variables=None, + ) kernel = lp.tag_inames(kernel, dict( (ggrid_var_names[i], GroupIndexTag(i)) -- GitLab From 6444cfd277b4844ae00d742a38933d4b621774b9 Mon Sep 17 00:00:00 2001 From: James Stevens Date: Wed, 2 Jan 2019 02:20:33 -0600 Subject: [PATCH 34/34] adding unused inames to ops when removing work --- loopy/transform/instruction.py | 77 ++++++++++++++++++++++++++++------ 1 file changed, 65 insertions(+), 12 deletions(-) diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py index 8a5bb7604..4d7f70077 100644 --- a/loopy/transform/instruction.py +++ b/loopy/transform/instruction.py @@ -27,6 +27,7 @@ import islpy as isl from loopy.diagnostic import LoopyError from loopy.symbolic import CombineMapper +from pymbolic.mapper import Collector # {{{ find_instructions @@ -400,6 +401,34 @@ class _MemAccessGatherer(CombineMapper): return self._map_access(expr, expr.aggregate.name, expr.index) +class _VariableGatherer(Collector): + # TODO add tests for this + def __init__(self, search_variables): + self.search_variables = search_variables + + #def combine(self, values): + # from pytools import flatten + # return set(flatten(values)) + + def map_variable(self, expr): + if expr.name in self.search_variables: + return set([expr.name]) + else: + return set() + + map_tagged_variable = map_variable + + # TODO do I need this? + def map_reduction(self, expr): + return self.rec(expr.expr) + + # TODO do I need this? + map_linear_subscript = CombineMapper.map_subscript + +#def map_subscript(self, expr): +# return self.rec(expr.index) + + def _make_grid_size_domain(kernel, var_name_gen=None): if var_name_gen is None: var_name_gen = kernel.get_var_name_generator() @@ -423,7 +452,7 @@ def _make_grid_size_domain(kernel, var_name_gen=None): return ggrid_var_names, lgrid_var_names, grid_range_dom -def remove_work(kernel, remove_vars=[], new_knl_name=None): +def remove_work(kernel, remove_vars=[], new_knl_name=None, use_unused_inames=False): """This transform removes operations in a kernel, leaving only accesses to global memory. @@ -459,6 +488,25 @@ def remove_work(kernel, remove_vars=[], new_knl_name=None): new_instructions = [] + # TODO figure out which of these tags I really need to deal with + from loopy.kernel.data import GroupIndexTag, LocalIndexTag, UnrollTag + parallel_inames = set([iname for iname, tag in kernel.iname_to_tag.items() if + (isinstance(tag, LocalIndexTag) or + isinstance(tag, GroupIndexTag))]) + unrolled_inames = set([iname for iname, tag in kernel.iname_to_tag.items() if + isinstance(tag, UnrollTag)]) + + iname_gatherer = _VariableGatherer(kernel.all_inames() - + (parallel_inames | unrolled_inames)) + + def get_unused_inames(insn): + inames_required = insn.within_inames - (parallel_inames | unrolled_inames) + inames_found = set.union( + *[iname_gatherer(assignee) for assignee in insn.assignees], + iname_gatherer(insn.expression), + ) + return inames_required - inames_found + # {{{ create init insn for read target ggrid_var_names, lgrid_var_names, grid_range_dom = _make_grid_size_domain(kernel) @@ -477,8 +525,6 @@ def remove_work(kernel, remove_vars=[], new_knl_name=None): # {{{ rewrite instructions - from loopy.kernel.data import GroupIndexTag, LocalIndexTag - def add_unrepresented_grid_vars_to_inames(inames): local_axes_needed = set(range(len(lgrid_var_names))) group_axes_needed = set(range(len(ggrid_var_names))) @@ -502,8 +548,6 @@ def remove_work(kernel, remove_vars=[], new_knl_name=None): read_tgt_var_dtype = None read_tgt_var_read_expr_acc = None for insn in kernel.instructions: - # TODO after instructions are removed, could produce - # "instruction does not use all group hw axes" error... if not isinstance(insn, MultiAssignmentBase): new_instructions.append(insn) old_to_new_ids[insn.id] = frozenset([insn.id]) @@ -517,26 +561,34 @@ def remove_work(kernel, remove_vars=[], new_knl_name=None): new_insn_ids = set() new_within_inames = frozenset( add_unrepresented_grid_vars_to_inames(insn.within_inames)) + + if use_unused_inames: + inserted_inames = sorted(list(get_unused_inames(insn))) + else: + inserted_inames = [] + + from pytools import product for read_expr in reader_accesses: new_id = insn_id_gen(insn.id) read_insn_ids.append(insn.id) + add_expr = read_expr*product([p.Variable(iname) for iname in inserted_inames]) new_instructions.append( make_assignment( (p.Variable(read_tgt_var_name),), - p.Variable(read_tgt_var_name) + read_expr, + p.Variable(read_tgt_var_name) + add_expr, id=new_id, within_inames=new_within_inames, depends_on=insn.depends_on | frozenset([read_tgt_init_id]))) new_insn_ids.add(new_id) - # determine type of read_expr + # determine type of add_expr # TODO loopy already has a way of figuring this out, # can we use that instead? (need this to set type of output arg) if read_tgt_var_dtype is None: - read_tgt_var_dtype = type_inf(read_expr) - read_tgt_var_read_expr_acc = read_expr - elif type_inf(read_expr) != read_tgt_var_dtype: - read_tgt_var_read_expr_acc += read_expr + read_tgt_var_dtype = type_inf(add_expr) + read_tgt_var_read_expr_acc = add_expr + elif type_inf(add_expr) != read_tgt_var_dtype: + read_tgt_var_read_expr_acc += add_expr read_tgt_var_dtype = type_inf(read_tgt_var_read_expr_acc) for write_expr in writer_accesses: @@ -544,7 +596,8 @@ def remove_work(kernel, remove_vars=[], new_knl_name=None): new_instructions.append( make_assignment( (write_expr,), - p.Variable(read_tgt_var_name), + p.Variable(read_tgt_var_name)*product( + [p.Variable(iname) for iname in inserted_inames]), id=new_id, within_inames=new_within_inames, depends_on=insn.depends_on)) -- GitLab