From ff08b1a23ad2c0b40490aaefe82b4822366eb872 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= <inform@tiker.net>
Date: Sun, 23 Sep 2018 13:28:39 -0400
Subject: [PATCH 01/34] Add LICENSE

---
 LICENSE | 21 +++++++++++++++++++++
 1 file changed, 21 insertions(+)
 create mode 100644 LICENSE

diff --git a/LICENSE b/LICENSE
new file mode 100644
index 000000000..601df74bd
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018 Andreas Klöckner and contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
-- 
GitLab


From 4751b84e2690e05d02342f51fcf6b7c303f2ac6b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= <inform@tiker.net>
Date: Sun, 23 Sep 2018 13:30:06 -0400
Subject: [PATCH 02/34] Add Pytest/JUnit/Gitlab integration

---
 .gitlab-ci.yml | 17 +++++++++++++++++
 1 file changed, 17 insertions(+)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1caef802b..f9ed13c52 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -12,6 +12,10 @@ Python 2.7 POCL:
   - pocl
   except:
   - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
 
 Python 2.7 with legacy PyOpenCL:
   script:
@@ -29,6 +33,10 @@ Python 2.7 with legacy PyOpenCL:
   except:
   - tags
   retry: 2
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
 
 Python 3.6 POCL:
   script:
@@ -43,6 +51,10 @@ Python 3.6 POCL:
   - pocl
   except:
   - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
 
 Python 3.6 POCL Twice With Cache:
   script:
@@ -59,6 +71,10 @@ Python 3.6 POCL Twice With Cache:
   - pocl
   except:
   - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
 
 # PyPy POCL:
 #   script:
@@ -87,6 +103,7 @@ Python 3.6 POCL Examples:
   except:
   - tags
 
+
 CentOS binary:
   script:
   - (cd build-helpers; ./make-linux-build-docker.sh --nodate)
-- 
GitLab


From 429f7ab3e732255ca84192d28d3cef5e31a76d91 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= <inform@tiker.net>
Date: Mon, 1 Oct 2018 14:49:46 -0400
Subject: [PATCH 03/34] Try explicitly installing ipython to avoid incompatible
 prompt-toolkit being installed

(e.g. https://gitlab.tiker.net/inducer/loopy/-/jobs/63712)
---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index f9ed13c52..ec2000c3d 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -93,7 +93,7 @@ Python 3.6 POCL Examples:
   script:
   - export PY_EXE=python3.6
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib jupyter nbconvert"
+  - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile ipython matplotlib jupyter nbconvert"
   - ". ./build-py-project-and-run-examples.sh"
   tags:
   - python3.6
-- 
GitLab


From 0dd11214191d31417b7555977837329f92e58bbb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= <inform@tiker.net>
Date: Tue, 2 Oct 2018 13:31:00 -0400
Subject: [PATCH 04/34] Don't install all of jupyter in examples CI run

May help with https://github.com/jupyter/jupyter_console/issues/158
---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index ec2000c3d..29a4b657b 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -93,7 +93,7 @@ Python 3.6 POCL Examples:
   script:
   - export PY_EXE=python3.6
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile ipython matplotlib jupyter nbconvert"
+  - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib nbconvert"
   - ". ./build-py-project-and-run-examples.sh"
   tags:
   - python3.6
-- 
GitLab


From 97999153a0236ed2f7f2e510292bb063f1b67c5b Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= <inform@tiker.net>
Date: Tue, 2 Oct 2018 13:58:08 -0400
Subject: [PATCH 05/34] Localize dep on 'packaging' to CentOS package

---
 build-helpers/make-linux-build-docker-inner-part-2.sh | 4 ++++
 requirements.txt                                      | 5 +----
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/build-helpers/make-linux-build-docker-inner-part-2.sh b/build-helpers/make-linux-build-docker-inner-part-2.sh
index 1e35a1e1b..035634b16 100755
--- a/build-helpers/make-linux-build-docker-inner-part-2.sh
+++ b/build-helpers/make-linux-build-docker-inner-part-2.sh
@@ -23,6 +23,10 @@ git clone --recursive git://github.com/inducer/loopy
 cd loopy
 
 grep -v pyopencl requirements.txt > myreq.txt
+
+# needed for pyinstaller package to be usable
+echo packaging >> myreq.txt
+
 pip install -r myreq.txt
 python setup.py install
 
diff --git a/requirements.txt b/requirements.txt
index a3e88cfea..97c202476 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,4 @@ git+https://github.com/inducer/codepy.git
 git+https://github.com/inducer/f2py
 
 # Optional, needed for using the C preprocessor on Fortran
-ply>=3.6
-
-# This is needed for the pyinstaller executable to be usable.
-packaging
+ply>=3.6
\ No newline at end of file
-- 
GitLab


From bc5d195d15ed019e9dc039b0ace8b744de083270 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= <inform@tiker.net>
Date: Tue, 2 Oct 2018 13:58:44 -0400
Subject: [PATCH 06/34] Add deps needed to keep nbconvert execute happy

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 29a4b657b..913460d81 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -93,7 +93,7 @@ Python 3.6 POCL Examples:
   script:
   - export PY_EXE=python3.6
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib nbconvert"
+  - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipython jupyter_client nbconvert"
   - ". ./build-py-project-and-run-examples.sh"
   tags:
   - python3.6
-- 
GitLab


From da44bc51994bb7a21d595dd7859aa0c013ab545d Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andreas=20Kl=C3=B6ckner?= <inform@tiker.net>
Date: Tue, 2 Oct 2018 14:16:59 -0400
Subject: [PATCH 07/34] More futzing with examples CI package deps: Use
 ipykernel instead of ipython

---
 .gitlab-ci.yml | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 913460d81..ea69114d6 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -93,7 +93,7 @@ Python 3.6 POCL Examples:
   script:
   - export PY_EXE=python3.6
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipython jupyter_client nbconvert"
+  - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert"
   - ". ./build-py-project-and-run-examples.sh"
   tags:
   - python3.6
-- 
GitLab


From 5d64d8244ff100180134e4e04313fa0c1cb86212 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 21 Oct 2018 01:25:42 -0500
Subject: [PATCH 08/34] Fix, test kernel dep arrow printing for continuing
 downward (i.e. bad) dependencies

---
 loopy/kernel/tools.py |  4 ++--
 test/test_loopy.py    | 17 +++++++++++++++++
 2 files changed, 19 insertions(+), 2 deletions(-)

diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 95c3c336c..b8be6191d 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -1241,7 +1241,7 @@ def draw_dependencies_as_unicode_arrows(
         for dep in insn.depends_on:
             reverse_deps.setdefault(dep, set()).add(insn.id)
 
-    # mapping of (from_id, to_id) tuples to column_index
+    # mapping of to_id tuples to column_index
     dep_to_column = {}
 
     # {{{ find column assignments
@@ -1318,7 +1318,7 @@ def draw_dependencies_as_unicode_arrows(
 
             elif insn.id in starts:
                 starts.remove(insn.id)
-                if starts:
+                if starts or pointed_at_insn_id not in processed_ids:
                     # will continue downward
                     row[col] = do_flag_downward(u"├", pointed_at_insn_id)
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index accf9c1df..f19c76026 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2908,6 +2908,23 @@ def test_dep_cycle_printing_and_error():
         print(lp.generate_code(knl)[0])
 
 
+def test_backwards_dep_printing_and_error():
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<n}",
+            """
+            c[i] = a[i] + b[i]                       {id=insn1}
+            c[i] = 2*c[i]                            {id=insn2, dep=insn1}
+            c[i] = 7*c[i] + a[i]*a[i] + b[i]*b[i]    {id=insn3, dep=insn2}
+            b[i] = b[i] + c[i]                                 {id=insn4, dep=insn3}
+            d[i] = 7*a[i ]                                     {id=insn5, dep=insn4}
+            a[i] = a[i] + d[i]                                 {id=insn6, dep=insn5}
+            """, [lp.GlobalArg('a, b', dtype=np.float64),
+                ...], lang_version=(2018, 2))
+
+    # Used to crash with KeyError
+    print(knl)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
-- 
GitLab


From 29e133e54b051bfadc2173eb5a4b75a76fc9c153 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 21 Oct 2018 02:05:09 -0500
Subject: [PATCH 09/34] Py2 fix

---
 test/test_loopy.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index f19c76026..38d1cd6b0 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2918,8 +2918,10 @@ def test_backwards_dep_printing_and_error():
             b[i] = b[i] + c[i]                                 {id=insn4, dep=insn3}
             d[i] = 7*a[i ]                                     {id=insn5, dep=insn4}
             a[i] = a[i] + d[i]                                 {id=insn6, dep=insn5}
-            """, [lp.GlobalArg('a, b', dtype=np.float64),
-                ...], lang_version=(2018, 2))
+            """, [
+                lp.GlobalArg('a, b', dtype=np.float64),
+                "..."
+            ])
 
     # Used to crash with KeyError
     print(knl)
-- 
GitLab


From 4c4ff5076661c0ef9bacd8ea119a96645453964f Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sun, 28 Oct 2018 14:36:19 -0500
Subject: [PATCH 10/34] Placate flake8 3.6

---
 loopy/frontend/fortran/tree.py | 2 +-
 loopy/symbolic.py              | 2 +-
 setup.cfg                      | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py
index b1df6e3d0..6939bb6ad 100644
--- a/loopy/frontend/fortran/tree.py
+++ b/loopy/frontend/fortran/tree.py
@@ -53,7 +53,7 @@ class FTreeWalkerBase(object):
 
     ENTITY_RE = re.compile(
             r"^(?P<name>[_0-9a-zA-Z]+)"
-            "(\((?P<shape>[-+*0-9:a-zA-Z, \t]+)\))?$")
+            r"(\((?P<shape>[-+*0-9:a-zA-Z, \t]+)\))?$")
 
     def parse_dimension_specs(self, node, dim_decls):
         def parse_bounds(bounds_str):
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 8927cd6fb..f4d46854b 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -1629,7 +1629,7 @@ def get_access_range(domain, subscript, assumptions, shape=None,
             if shape is not None:
                 try:
                     shape_aff = guarded_aff_from_expr(access_map.space, shape[idim])
-                except ExpressionToAffineConversionError as sub_err:
+                except ExpressionToAffineConversionError:
                     pass
 
             if shape_aff is None:
diff --git a/setup.cfg b/setup.cfg
index b939ce0cf..eec3dfd1f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [flake8]
-ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814
+ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814,W504
 max-line-length=85
 exclude=
     loopy/target/c/compyte/ndarray,
-- 
GitLab


From 1d7a5162a595bd490ea5e06800ac616920b2358e Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Thu, 1 Nov 2018 14:05:25 -0500
Subject: [PATCH 11/34] Fix test parameter order

---
 test/test_numa_diff.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index 15d5ea7c9..54b608a18 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -47,8 +47,8 @@ __all__ = [
 from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa
 
 
-@pytest.mark.parametrize("Nq", [7])
 @pytest.mark.parametrize("ilp_multiple", [1, 2])
+@pytest.mark.parametrize("Nq", [7])
 @pytest.mark.parametrize("opt_level", [11])
 def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
     ctx = ctx_factory()
-- 
GitLab


From cded42ec99b1bccae276789ffc83ff6d3e615db9 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sat, 10 Nov 2018 16:36:00 -0600
Subject: [PATCH 12/34] Fix CUDA short vector codegen

---
 loopy/target/cuda.py |  2 +-
 test/test_target.py  | 17 +++++++++++++++++
 2 files changed, 18 insertions(+), 1 deletion(-)

diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 673d3b284..d6f55091a 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -290,7 +290,7 @@ class CUDACASTBuilder(CASTBuilder):
     _VEC_AXES = "xyzw"
 
     def add_vector_access(self, access_expr, index):
-        return access_expr.a(self._VEC_AXES[index])
+        return access_expr.attr(self._VEC_AXES[index])
 
     def emit_barrier(self, synchronization_kind, mem_kind, comment):
         """
diff --git a/test/test_target.py b/test/test_target.py
index 75b3c05ae..bcf85a340 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -358,6 +358,23 @@ def test_ispc_streaming_stores():
     lp.generate_code_v2(knl).all_code()
 
 
+def test_cuda_short_vector():
+    knl = lp.make_kernel(
+        "{ [i]: 0<=i<n }",
+        "out[i] = 2*a[i]",
+        target=lp.CudaTarget())
+
+    knl = lp.set_options(knl, write_code=True)
+    knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec")
+    knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4)
+    knl = lp.tag_array_axes(knl, "a,out", "C,vec")
+
+    knl = lp.set_options(knl, write_wrapper=True)
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
+
+    print(lp.generate_code_v2(knl).device_code())
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
-- 
GitLab


From acc42897335d379d062b049d03ac8ab71361e994 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sat, 10 Nov 2018 17:29:58 -0600
Subject: [PATCH 13/34] Tweak reduction example for linear stride in reduced
 array

---
 loopy/schedule/__init__.py | 10 +++++++---
 test/test_reduction.py     | 12 +++++++++---
 2 files changed, 16 insertions(+), 6 deletions(-)

diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 652f8b893..58b68486b 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -794,9 +794,13 @@ def generate_loop_schedules_internal(
 
         if not is_ready:
             if debug_mode:
-                print("instruction '%s' is missing insn depedencies '%s'" % (
-                        format_insn(kernel, insn.id), ",".join(
-                            insn.depends_on - sched_state.scheduled_insn_ids)))
+                # These are not that interesting when understanding scheduler
+                # failures.
+
+                # print("instruction '%s' is missing insn depedencies '%s'" % (
+                #         format_insn(kernel, insn.id), ",".join(
+                #             insn.depends_on - sched_state.scheduled_insn_ids)))
+                pass
             continue
 
         want = kernel.insn_inames(insn) - sched_state.parallel_inames
diff --git a/test/test_reduction.py b/test/test_reduction.py
index 78eca4d0c..ef229d5cd 100644
--- a/test/test_reduction.py
+++ b/test/test_reduction.py
@@ -225,22 +225,28 @@ def test_global_parallel_reduction(ctx_factory, size):
             "{[i]: 0 <= i < n }",
             """
             # Using z[0] instead of z works around a bug in ancient PyOpenCL.
-            z[0] = sum(i, i/13)
+            z[0] = sum(i, a[i])
             """)
 
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
     ref_knl = knl
 
     gsize = 128
     knl = lp.split_iname(knl, "i", gsize * 20)
-    knl = lp.split_iname(knl, "i_inner", gsize, outer_tag="l.0")
-    knl = lp.split_reduction_inward(knl, "i_inner_inner")
+    knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0")
+    knl = lp.split_reduction_outward(knl, "i_outer")
     knl = lp.split_reduction_inward(knl, "i_inner_outer")
     from loopy.transform.data import reduction_arg_to_subst_rule
     knl = reduction_arg_to_subst_rule(knl, "i_outer")
+
     knl = lp.precompute(knl, "red_i_outer_arg", "i_outer",
             temporary_scope=lp.temp_var_scope.GLOBAL,
             default_tag="l.auto")
     knl = lp.realize_reduction(knl)
+    knl = lp.tag_inames(knl, "i_outer_0:g.0")
+
+    # Keep the i_outer accumulator on the correct (lower) side of the barrier,
+    # otherwise there will be useless save/reload code generated.
     knl = lp.add_dependency(
             knl, "writes:acc_i_outer",
             "id:red_i_outer_arg_barrier")
-- 
GitLab


From d69b0e24095eea943e38fd7dcb1a106b43e506b5 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sat, 10 Nov 2018 20:15:54 -0600
Subject: [PATCH 14/34] Bank conflict avoidance: Use tuples for shape

---
 loopy/target/pyopencl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 73e8e0092..2fc3bc1ed 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -125,7 +125,7 @@ def adjust_local_temp_var_storage(kernel, device):
 
             new_storage_shape = storage_shape
 
-        new_temp_vars[temp_var.name] = temp_var.copy(storage_shape=new_storage_shape)
+        new_temp_vars[temp_var.name] = temp_var.copy(storage_shape=tuple(new_storage_shape))
 
     return kernel.copy(temporary_variables=new_temp_vars)
 
-- 
GitLab


From ba58da9ded42bccb28b1a30048ae74fd3916abe3 Mon Sep 17 00:00:00 2001
From: Andreas Kloeckner <inform@tiker.net>
Date: Sat, 10 Nov 2018 20:33:26 -0600
Subject: [PATCH 15/34] Placate flake8

---
 loopy/target/pyopencl.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index 2fc3bc1ed..34faf0a03 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -125,7 +125,8 @@ def adjust_local_temp_var_storage(kernel, device):
 
             new_storage_shape = storage_shape
 
-        new_temp_vars[temp_var.name] = temp_var.copy(storage_shape=tuple(new_storage_shape))
+        new_temp_vars[temp_var.name] = temp_var.copy(
+                storage_shape=tuple(new_storage_shape))
 
     return kernel.copy(temporary_variables=new_temp_vars)
 
-- 
GitLab


From 9b519ecbac9db836460462dfd89deb47a79ff0d1 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Tue, 13 Nov 2018 06:44:26 -0600
Subject: [PATCH 16/34] added count_within_subscripts boolean to OpCounter

---
 loopy/statistics.py     | 15 +++++++++++----
 test/test_statistics.py | 14 +++++++++-----
 2 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index 3fecfb778..b467e3334 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -697,8 +697,9 @@ class CounterBase(CombineMapper):
 # {{{ ExpressionOpCounter
 
 class ExpressionOpCounter(CounterBase):
-    def __init__(self, knl):
+    def __init__(self, knl, count_within_subscripts=True):
         self.knl = knl
+        self.count_within_subscripts = count_within_subscripts
         from loopy.type_inference import TypeInferenceMapper
         self.type_inf = TypeInferenceMapper(knl)
 
@@ -719,7 +720,10 @@ class ExpressionOpCounter(CounterBase):
                     ) + self.rec(expr.parameters)
 
     def map_subscript(self, expr):
-        return self.rec(expr.index)
+        if self.count_within_subscripts:
+            return self.rec(expr.index)
+        else:
+            return ToCountMap()
 
     def map_sum(self, expr):
         assert expr.children
@@ -1314,7 +1318,7 @@ def _get_insn_count(knl, insn_id, subgroup_size, count_redundant_work,
 # {{{ get_op_map
 
 def get_op_map(knl, numpy_types=True, count_redundant_work=False,
-               subgroup_size=None):
+               count_within_subscripts=True, subgroup_size=None):
 
     """Count the number of operations in a loopy kernel.
 
@@ -1330,6 +1334,9 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False,
         (Likely desirable for performance modeling, but undesirable for code
         optimization.)
 
+    :arg count_within_subscripts: A :class:`bool` specifying whether to
+        count operations inside array indices.
+
     :arg subgroup_size: (currently unused) An :class:`int`, :class:`str`
         ``'guess'``, or *None* that specifies the sub-group size. An OpenCL
         sub-group is an implementation-dependent grouping of work-items within
@@ -1382,7 +1389,7 @@ def get_op_map(knl, numpy_types=True, count_redundant_work=False,
     knl = preprocess_kernel(knl)
 
     op_map = ToCountMap()
-    op_counter = ExpressionOpCounter(knl)
+    op_counter = ExpressionOpCounter(knl, count_within_subscripts)
 
     from loopy.kernel.instruction import (
             CallInstruction, CInstruction, Assignment,
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 3f2366521..41b44b5a7 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -57,7 +57,8 @@ def test_op_counter_basic():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+                           count_within_subscripts=True)
     n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, SGS)
@@ -161,7 +162,8 @@ def test_op_counter_specialops():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+                           count_within_subscripts=True)
     n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, SGS)
@@ -206,7 +208,8 @@ def test_op_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int64, h=np.int64))
 
-    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+                           count_within_subscripts=False)
     n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, SGS)
@@ -226,7 +229,7 @@ def test_op_counter_bitwise():
     i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP)
                       ].eval_with_dict(params)
     # (count-per-sub-group)*n_subgroups
-    assert i32add == n*m+n*m*ell*n_subgroups
+    assert i32add == n*m*ell*n_subgroups
     assert i32bw == 2*n*m*ell*n_subgroups
     assert i64bw == 2*n*m*n_subgroups
     assert i64add == i64mul == n*m*n_subgroups
@@ -1153,7 +1156,8 @@ def test_summations_and_filters():
     assert f32lall == (3*n*m*ell)*n_subgroups
     assert f64lall == (2*n*m)*n_subgroups
 
-    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+                           count_within_subscripts=True)
     #for k, v in op_map.items():
     #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)
 
-- 
GitLab


From 9317bc4f034aa9624ecf4d6d8f45b78ea844687f Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sun, 18 Nov 2018 18:41:00 -0600
Subject: [PATCH 17/34] Documentation fixes

---
 doc/index.rst                     | 10 +++++-----
 doc/ref_kernel.rst                |  2 +-
 examples/python/hello-loopy.loopy |  2 +-
 loopy/kernel/data.py              |  2 +-
 loopy/transform/add_barrier.py    |  8 ++++----
 loopy/transform/batch.py          |  1 +
 loopy/transform/buffer.py         |  2 +-
 loopy/transform/iname.py          |  2 --
 8 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/doc/index.rst b/doc/index.rst
index d862a8acd..b77bbb16f 100644
--- a/doc/index.rst
+++ b/doc/index.rst
@@ -25,18 +25,18 @@ Want to try out loopy?
 
 There's no need to go through :ref:`installation` if you'd just like to get a
 feel for what loopy is.  Instead, you may
-`download a self-contained Linux binary <https://gitlab.tiker.net/inducer/loopy/builds/36708/artifacts/browse/build-helpers/>`_.
+`download a self-contained Linux binary <https://gitlab.tiker.net/inducer/loopy/-/jobs/66778/artifacts/browse/build-helpers/>`_.
 This is purposefully built on an ancient Linux distribution, so it should work
 on most versions of Linux that are currently out there.
 
 Once you have the binary, do the following::
 
     chmod +x ./loopy-centos6
-    ./loopy-centos6 --target=opencl hello-loopy-lp.py
-    ./loopy-centos6 --target=cuda hello-loopy-lp.py
-    ./loopy-centos6 --target=ispc hello-loopy-lp.py
+    ./loopy-centos6 --target=opencl hello-loopy.loopy
+    ./loopy-centos6 --target=cuda hello-loopy.loopy
+    ./loopy-centos6 --target=ispc hello-loopy.loopy
 
-Grab the example here: :download:`examples/python/hello-loopy.py <../examples/python/hello-loopy-lp.py>`.
+Grab the example here: :download:`examples/python/hello-loopy.loopy <../examples/python/hello-loopy.loopy>`.
 
 You may also donwload the most recent version by going to the `list of builds
 <https://gitlab.tiker.net/inducer/loopy/builds>`_, clicking on the newest one
diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index c9ce20626..62d76c779 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -406,7 +406,7 @@ Arguments
     :members:
     :undoc-members:
 
-.. autoclass:: GlobalArg
+.. autofunction:: GlobalArg
     :members:
     :undoc-members:
 
diff --git a/examples/python/hello-loopy.loopy b/examples/python/hello-loopy.loopy
index 0ba44d6ec..7f7973098 100644
--- a/examples/python/hello-loopy.loopy
+++ b/examples/python/hello-loopy.loopy
@@ -1,7 +1,7 @@
 # This is a version of hello-loopy.py that can be run through
 # a loopy binary using
 #
-# ./loopy --lang=loopy hello-loopy-lp.py -
+# ./loopy --lang=loopy hello-loopy.loopy -
 
 knl = lp.make_kernel(
         "{ [i]: 0<=i<n }",
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 3e776bd06..d6490aa88 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -49,7 +49,7 @@ from warnings import warn
 class auto(object):  # noqa
     """A generic placeholder object for something that should be automatically
     determined.  See, for example, the *shape* or *strides* argument of
-    :class:`GlobalArg`.
+    :func:`GlobalArg`.
     """
 
 
diff --git a/loopy/transform/add_barrier.py b/loopy/transform/add_barrier.py
index cfbbd56e9..a20a798cf 100644
--- a/loopy/transform/add_barrier.py
+++ b/loopy/transform/add_barrier.py
@@ -44,15 +44,15 @@ def add_barrier(knl, insn_before="", insn_after="", id_based_on=None,
     be any inputs that are understood by :func:`loopy.match.parse_match`.
 
     :arg insn_before: String expression that specifies the instruction(s)
-    before the barrier which is to be added
+        before the barrier which is to be added
     :arg insn_after: String expression that specifies the instruction(s) after
-    the barrier which is to be added
+        the barrier which is to be added
     :arg id: String on which the id of the barrier would be based on.
     :arg tags: The tag of the group to which the barrier must be added
     :arg synchronization_kind: Kind of barrier to be added. May be "global" or
-    "local"
+        "local"
     :arg kind: Type of memory to be synchronied. May be "global" or "local". Ignored
-    for "global" bariers.  If not supplied, defaults to :arg:`synchronization_kind`
+        for "global" barriers.  If not supplied, defaults to *synchronization_kind*
     """
 
     if mem_kind is None:
diff --git a/loopy/transform/batch.py b/loopy/transform/batch.py
index f0b9814c4..f6568918d 100644
--- a/loopy/transform/batch.py
+++ b/loopy/transform/batch.py
@@ -106,6 +106,7 @@ def to_batched(knl, nbatches, batch_varying_args, batch_iname_prefix="ibatch",
         sequential=False):
     """Takes in a kernel that carries out an operation and returns a kernel
     that carries out a batch of these operations.
+
     .. note::
        For temporaries in a kernel that are private or read only
        globals and if `sequential=True`, loopy does not does not batch these
diff --git a/loopy/transform/buffer.py b/loopy/transform/buffer.py
index 801da4c13..63d3a40fb 100644
--- a/loopy/transform/buffer.py
+++ b/loopy/transform/buffer.py
@@ -160,7 +160,7 @@ def buffer_array(kernel, var_name, buffer_inames, init_expression=None,
         matching contexts.  See :func:`loopy.match.parse_stack_match`
         for syntax.
     :arg temporary_scope: If given, override the choice of
-    :class:`AddressSpace` for the created temporary.
+        :class:`AddressSpace` for the created temporary.
     :arg default_tag: The default :ref:`iname-tags` to be assigned to the
         inames used for fetching and storing
     :arg fetch_bounding_box: If the access footprint is non-convex
diff --git a/loopy/transform/iname.py b/loopy/transform/iname.py
index ad1da3e7e..83598dcc2 100644
--- a/loopy/transform/iname.py
+++ b/loopy/transform/iname.py
@@ -66,8 +66,6 @@ __doc__ = """
 
 .. autofunction:: affine_map_inames
 
-.. autofunction:: realize_ilp
-
 .. autofunction:: find_unused_axis_tag
 
 .. autofunction:: make_reduction_inames_unique
-- 
GitLab


From 98f76aabfaca997bd9a1cc028ee51f57646654a4 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Sun, 18 Nov 2018 18:48:12 -0600
Subject: [PATCH 18/34] More fixes

---
 doc/ref_kernel.rst       | 2 --
 doc/tutorial.rst         | 2 +-
 loopy/kernel/__init__.py | 2 +-
 3 files changed, 2 insertions(+), 4 deletions(-)

diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index 62d76c779..11ec7f030 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -407,8 +407,6 @@ Arguments
     :undoc-members:
 
 .. autofunction:: GlobalArg
-    :members:
-    :undoc-members:
 
 .. autoclass:: ConstantArg
     :members:
diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 1272d2a59..73f5dea75 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1361,7 +1361,7 @@ code-generation however additional implementation may be required for custom
 functions.  The full lists of available functions may be found in a the
 :class:`TargetBase` implementation (e.g. :class:`CudaTarget`)
 
-Custom user functions may be represented using the method described in :ref:`_functions`
+Custom user functions may be represented using the method described in :ref:`functions`
 
 
 Data-dependent control flow
diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 6b0033808..e3342d0f9 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -142,7 +142,7 @@ class LoopKernel(ImmutableRecordWithoutPickling):
     .. note::
 
         This data structure and its attributes should be considered immutable,
-        even if it contains mutable data types. See :method:`copy` for an easy
+        even if it contains mutable data types. See :meth:`copy` for an easy
         way of producing a modified copy.
 
     .. attribute:: domains
-- 
GitLab


From 18ded6348d7c56c1d8994d7f0c412d9eb2337ed8 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Mon, 19 Nov 2018 17:11:31 -0600
Subject: [PATCH 19/34] Fix a broken doc entry

---
 loopy/kernel/array.py | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 6bf733a84..bae9d7d1f 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -549,15 +549,15 @@ class ArrayBase(ImmutableRecord):
     .. attribute :: name
 
     .. attribute :: dtype
-        the :class:`loopy.loopytype` of the array.
-        if this is *none*, :mod:`loopy` will try to continue without
-        knowing the type of this array, where the idea is that precise
-        knowledge of the type will become available at invocation time.
-        :class:`loopy.compiledkernel` (and thereby
-        :meth:`loopy.loopkernel.__call__`) automatically add this type
-        information based on invocation arguments.
-
-        note that some transformations, such as :func:`loopy.add_padding`
+
+        The :class:`loopy.types.LoopyType` of the array. If this is *None*,
+        :mod:`loopy` will try to continue without knowing the type of this
+        array, where the idea is that precise knowledge of the type will become
+        available at invocation time.  Calling the kernel
+        (via :meth:`loopy.LoopKernel.__call__`)
+        automatically adds this type information based on invocation arguments.
+
+        Note that some transformations, such as :func:`loopy.add_padding`
         cannot be performed without knowledge of the exact *dtype*.
 
     .. attribute :: shape
-- 
GitLab


From 9cfdd91bdf1b613dc72e96ed4bc70d3b67223163 Mon Sep 17 00:00:00 2001
From: Matt Wala <wala1@illinois.edu>
Date: Wed, 21 Nov 2018 00:03:54 -0600
Subject: [PATCH 20/34] Change GlobalArg to ArrayArg

---
 doc/ref_kernel.rst   | 6 ++++--
 loopy/kernel/data.py | 2 +-
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/doc/ref_kernel.rst b/doc/ref_kernel.rst
index 11ec7f030..896388d29 100644
--- a/doc/ref_kernel.rst
+++ b/doc/ref_kernel.rst
@@ -406,7 +406,9 @@ Arguments
     :members:
     :undoc-members:
 
-.. autofunction:: GlobalArg
+.. autoclass:: ArrayArg
+    :members:
+    :undoc-members:
 
 .. autoclass:: ConstantArg
     :members:
@@ -591,7 +593,7 @@ Do not create :class:`LoopKernel` objects directly. Instead, refer to
 Implementation Detail: The Base Array
 -------------------------------------
 
-All array-like data in :mod:`loopy` (such as :class:`GlobalArg` and
+All array-like data in :mod:`loopy` (such as :class:`ArrayArg` and
 :class:`TemporaryVariable`) derive from single, shared base array type,
 described next.
 
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index d6490aa88..7877f8b93 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -49,7 +49,7 @@ from warnings import warn
 class auto(object):  # noqa
     """A generic placeholder object for something that should be automatically
     determined.  See, for example, the *shape* or *strides* argument of
-    :func:`GlobalArg`.
+    :class:`ArrayArg`.
     """
 
 
-- 
GitLab


From fe4ed770ab4d037c53888d96290bb163ac56e33c Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Wed, 28 Nov 2018 18:58:17 -0600
Subject: [PATCH 21/34] added variable_tag to MemAccess; GlobalMemAccessCounter
 tracking variable tags for tagged global variables

---
 loopy/statistics.py     | 21 ++++++++++++---
 test/test_statistics.py | 59 +++++++++++++++++++++++++++++++++++++++++
 2 files changed, 77 insertions(+), 3 deletions(-)

diff --git a/loopy/statistics.py b/loopy/statistics.py
index b467e3334..9ce2bb081 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -581,6 +581,11 @@ class MemAccess(Record):
        A :class:`str` that specifies the variable name of the data
        accessed.
 
+    .. attribute:: variable_tag
+
+       A :class:`str` that specifies the variable tag of a
+       :class:`pymbolic.primitives.TaggedVariable`.
+
     .. attribute:: count_granularity
 
        A :class:`str` that specifies whether this operation should be counted
@@ -597,7 +602,8 @@ class MemAccess(Record):
     """
 
     def __init__(self, mtype=None, dtype=None, lid_strides=None, gid_strides=None,
-                 direction=None, variable=None, count_granularity=None):
+                 direction=None, variable=None, variable_tag=None,
+                 count_granularity=None):
 
         if count_granularity not in CountGranularity.ALL+[None]:
             raise ValueError("Op.__init__: count_granularity '%s' is "
@@ -607,12 +613,14 @@ class MemAccess(Record):
         if dtype is None:
             Record.__init__(self, mtype=mtype, dtype=dtype, lid_strides=lid_strides,
                             gid_strides=gid_strides, direction=direction,
-                            variable=variable, count_granularity=count_granularity)
+                            variable=variable, variable_tag=variable_tag,
+                            count_granularity=count_granularity)
         else:
             from loopy.types import to_loopy_type
             Record.__init__(self, mtype=mtype, dtype=to_loopy_type(dtype),
                             lid_strides=lid_strides, gid_strides=gid_strides,
                             direction=direction, variable=variable,
+                            variable_tag=variable_tag,
                             count_granularity=count_granularity)
 
     def __hash__(self):
@@ -622,7 +630,7 @@ class MemAccess(Record):
 
     def __repr__(self):
         # Record.__repr__ overridden for consistent ordering and conciseness
-        return "MemAccess(%s, %s, %s, %s, %s, %s, %s)" % (
+        return "MemAccess(%s, %s, %s, %s, %s, %s, %s, %s)" % (
             self.mtype,
             self.dtype,
             None if self.lid_strides is None else dict(
@@ -631,6 +639,7 @@ class MemAccess(Record):
                 sorted(six.iteritems(self.gid_strides))),
             self.direction,
             self.variable,
+            self.variable_tag,
             self.count_granularity)
 
 # }}}
@@ -985,6 +994,10 @@ class GlobalMemAccessCounter(MemAccessCounter):
 
     def map_subscript(self, expr):
         name = expr.aggregate.name
+        try:
+            var_tag = expr.aggregate.tag
+        except AttributeError:
+            var_tag = None
 
         if name in self.knl.arg_dict:
             array = self.knl.arg_dict[name]
@@ -1013,6 +1026,7 @@ class GlobalMemAccessCounter(MemAccessCounter):
                             lid_strides=dict(sorted(six.iteritems(lid_strides))),
                             gid_strides=dict(sorted(six.iteritems(gid_strides))),
                             variable=name,
+                            variable_tag=var_tag,
                             count_granularity=count_granularity
                             ): 1}
                           ) + self.rec(expr.index_tuple)
@@ -1634,6 +1648,7 @@ def get_mem_access_map(knl, numpy_types=True, count_redundant_work=False,
                             gid_strides=mem_access.gid_strides,
                             direction=mem_access.direction,
                             variable=mem_access.variable,
+                            variable_tag=mem_access.variable_tag,
                             count_granularity=mem_access.count_granularity),
                         ct)
                         for mem_access, ct in six.iteritems(access_map.count_map)),
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 41b44b5a7..b29edf1ed 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -1060,6 +1060,65 @@ def test_all_counters_parallel_matmul():
     assert local_mem_s == m*2/bsize*n_subgroups
 
 
+def test_mem_access_tagged_variables():
+    bsize = 16
+    knl = lp.make_kernel(
+            "{[i,k,j]: 0<=i<n and 0<=k<m and 0<=j<ell}",
+            [
+                "c$mmresult[i, j] = sum(k, a$mmaload[i, k]*b$mmbload[k, j])"
+            ],
+            name="matmul", assumptions="n,m,ell >= 1")
+    knl = lp.add_and_infer_dtypes(knl, dict(a=np.float32, b=np.float32))
+    knl = lp.split_iname(knl, "i", bsize, outer_tag="g.0", inner_tag="l.1")
+    knl = lp.split_iname(knl, "j", bsize, outer_tag="g.1", inner_tag="l.0")
+    knl = lp.split_iname(knl, "k", bsize)
+    # knl = lp.add_prefetch(knl, "a", ["k_inner", "i_inner"], default_tag="l.auto")
+    # knl = lp.add_prefetch(knl, "b", ["j_inner", "k_inner"], default_tag="l.auto")
+
+    n = 512
+    m = 256
+    ell = 128
+    params = {'n': n, 'm': m, 'ell': ell}
+    group_size = bsize*bsize
+    n_workgroups = div_ceil(n, bsize)*div_ceil(ell, bsize)
+    subgroups_per_group = div_ceil(group_size, SGS)
+    n_subgroups = n_workgroups*subgroups_per_group
+
+    mem_access_map = lp.get_mem_access_map(knl, count_redundant_work=True,
+                                           subgroup_size=SGS)
+
+    f32s1lb = mem_access_map[lp.MemAccess('global', np.float32,
+                             lid_strides={0: 1},
+                             gid_strides={1: bsize},
+                             direction='load', variable='b',
+                             variable_tag='mmbload',
+                             count_granularity=CG.WORKITEM)
+                             ].eval_with_dict(params)
+    f32s1la = mem_access_map[lp.MemAccess('global', np.float32,
+                             lid_strides={1: Variable('m')},
+                             gid_strides={0: Variable('m')*bsize},
+                             direction='load',
+                             variable='a',
+                             variable_tag='mmaload',
+                             count_granularity=CG.SUBGROUP)
+                             ].eval_with_dict(params)
+
+    assert f32s1lb == n*m*ell
+
+    # uniform: (count-per-sub-group)*n_subgroups
+    assert f32s1la == m*n_subgroups
+
+    f32coal = mem_access_map[lp.MemAccess('global', np.float32,
+                             lid_strides={0: 1, 1: Variable('ell')},
+                             gid_strides={0: Variable('ell')*bsize, 1: bsize},
+                             direction='store', variable='c',
+                             variable_tag='mmresult',
+                             count_granularity=CG.WORKITEM)
+                             ].eval_with_dict(params)
+
+    assert f32coal == n*ell
+
+
 def test_gather_access_footprint():
     knl = lp.make_kernel(
             "{[i,k,j]: 0<=i,j,k<n}",
-- 
GitLab


From 390a96df3735e3c256538fbde41006ffba4d7608 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Wed, 28 Nov 2018 19:08:11 -0600
Subject: [PATCH 22/34] updated tutorial w/variable tag printing

---
 doc/tutorial.rst | 74 ++++++++++++++++++++++++------------------------
 1 file changed, 37 insertions(+), 37 deletions(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 73f5dea75..397f34a98 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -1641,15 +1641,15 @@ we'll continue using the kernel from the previous example:
 
     >>> mem_map = lp.get_mem_access_map(knl, subgroup_size=32)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : ...
+    MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ...
     <BLANKLINE>
 
 Each line of output will look roughly like::
 
 
-    MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float32'), {}, {}, load, b, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float32'), {}, {}, store, c, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : [m, l, n] -> { 2 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup) : [m, l, n] -> { m * l * n : m > 0 and l > 0 and n > 0 }
 
 :func:`loopy.get_mem_access_map` returns a :class:`loopy.ToCountMap` of **{**
 :class:`loopy.MemAccess` **:** :class:`islpy.PwQPolynomial` **}**.
@@ -1684,13 +1684,13 @@ We can evaluate these polynomials using :func:`islpy.eval_with_dict`:
 
 .. doctest::
 
-    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', CG.SUBGROUP)
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'load', 'g', None, CG.SUBGROUP)
     ...                  ].eval_with_dict(param_dict)
-    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', CG.SUBGROUP)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {}, {}, 'store', 'e', None, CG.SUBGROUP)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', CG.SUBGROUP)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'load', 'a', None, CG.SUBGROUP)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', CG.SUBGROUP)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {}, {}, 'store', 'c', None, CG.SUBGROUP)
     ...                  ].eval_with_dict(param_dict)
     >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
     ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
@@ -1708,13 +1708,13 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
 
     >>> bytes_map = mem_map.to_bytes()
     >>> print(lp.stringify_stats_mapping(bytes_map))
-    MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : ...
+    MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : ...
     <BLANKLINE>
     >>> global_ld_st_bytes = bytes_map.filter_by(mtype=['global']
     ...                                         ).group_by('direction')
     >>> print(lp.stringify_stats_mapping(global_ld_st_bytes))
-    MemAccess(None, None, None, None, load, None, None) : ...
-    MemAccess(None, None, None, None, store, None, None) : ...
+    MemAccess(None, None, None, None, load, None, None, None) : ...
+    MemAccess(None, None, None, None, store, None, None, None) : ...
     <BLANKLINE>
     >>> loaded = global_ld_st_bytes[lp.MemAccess(direction='load')
     ...                            ].eval_with_dict(param_dict)
@@ -1726,12 +1726,12 @@ using :func:`loopy.ToCountMap.to_bytes` and :func:`loopy.ToCountMap.group_by`:
 
 The lines of output above might look like::
 
-    MemAccess(global, np:dtype('float32'), {}, {}, load, a, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float32'), {}, {}, load, b, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float32'), {}, {}, store, c, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float64'), {}, {}, load, g, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float64'), {}, {}, load, h, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
-    MemAccess(global, np:dtype('float64'), {}, {}, store, e, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, {}, load, a, None, subgroup) : [m, l, n] -> { 8 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, {}, load, b, None, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float32'), {}, {}, store, c, None, subgroup) : [m, l, n] -> { 4 * m * l * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), {}, {}, load, g, None, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), {}, {}, load, h, None, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
+    MemAccess(global, np:dtype('float64'), {}, {}, store, e, None, subgroup) : [m, l, n] -> { 8 * m * n : m > 0 and l > 0 and n > 0 }
 
 One can see how these functions might be useful in computing, for example,
 achieved memory bandwidth in byte/sec or performance in FLOP/sec.
@@ -1751,12 +1751,12 @@ this time.
     ...                             outer_tag="l.1", inner_tag="l.0")
     >>> mem_map = lp.get_mem_access_map(knl_consec, subgroup_size=32)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, workitem) : ...
-    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, workitem) : ...
-    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, workitem) : ...
-    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, workitem) : ...
-    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, workitem) : ...
-    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, a, None, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, load, b, None, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 1, 1: 128}, {}, store, c, None, workitem) : ...
+    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, g, None, workitem) : ...
+    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, load, h, None, workitem) : ...
+    MemAccess(global, np:dtype('float64'), {0: 1, 1: 128}, {}, store, e, None, workitem) : ...
     <BLANKLINE>
 
 With this parallelization, consecutive work-items will access consecutive array
@@ -1766,13 +1766,13 @@ array accesses has not changed:
 
 .. doctest::
 
-    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', CG.WORKITEM)
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'load', 'g', None, CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
-    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', CG.WORKITEM)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 1, 1: 128}, {}, 'store', 'e', None, CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', CG.WORKITEM)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'load', 'a', None, CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', CG.WORKITEM)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 1, 1: 128}, {}, 'store', 'c', None, CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
     >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
     ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
@@ -1792,12 +1792,12 @@ we'll switch the inner and outer tags in our parallelization of the kernel:
     ...                                outer_tag="l.0", inner_tag="l.1")
     >>> mem_map = lp.get_mem_access_map(knl_nonconsec, subgroup_size=32)
     >>> print(lp.stringify_stats_mapping(mem_map))
-    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, workitem) : ...
-    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, workitem) : ...
-    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, workitem) : ...
-    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, workitem) : ...
-    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, workitem) : ...
-    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, a, None, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, load, b, None, workitem) : ...
+    MemAccess(global, np:dtype('float32'), {0: 128, 1: 1}, {}, store, c, None, workitem) : ...
+    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, g, None, workitem) : ...
+    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, load, h, None, workitem) : ...
+    MemAccess(global, np:dtype('float64'), {0: 128, 1: 1}, {}, store, e, None, workitem) : ...
     <BLANKLINE>
 
 With this parallelization, consecutive work-items will access *nonconsecutive*
@@ -1806,13 +1806,13 @@ changed:
 
 .. doctest::
 
-    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', CG.WORKITEM)
+    >>> f64ld_g = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'load', 'g', None, CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
-    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', CG.WORKITEM)
+    >>> f64st_e = mem_map[lp.MemAccess('global', np.float64, {0: 128, 1: 1}, {}, 'store', 'e', None, CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', CG.WORKITEM)
+    >>> f32ld_a = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'load', 'a', None, CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
-    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', CG.WORKITEM)
+    >>> f32st_c = mem_map[lp.MemAccess('global', np.float32, {0: 128, 1: 1}, {}, 'store', 'c', None, CG.WORKITEM)
     ...                  ].eval_with_dict(param_dict)
     >>> print("f32 ld a: %i\nf32 st c: %i\nf64 ld g: %i\nf64 st e: %i" %
     ...       (f32ld_a, f32st_c, f64ld_g, f64st_e))
-- 
GitLab


From 6ab14cebbacd1ee38df8cc479ce737bce8b741ed Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Mon, 3 Dec 2018 23:36:53 -0600
Subject: [PATCH 23/34] allowing (global) variables to be excluded in
 remove_work()

---
 loopy/transform/instruction.py | 11 +++++++----
 1 file changed, 7 insertions(+), 4 deletions(-)

diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 3dd7009ea..2325e1b0b 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -362,9 +362,10 @@ def uniquify_instruction_ids(kernel):
 # {{{ remove_work
 
 class _MemAccessGatherer(CombineMapper):
-    def __init__(self, kernel, address_space):
+    def __init__(self, kernel, address_space, exclude_vars=[]):
         self.kernel = kernel
         self.address_space = address_space
+        self.exclude_vars = exclude_vars
 
     def combine(self, values):
         from pytools import flatten
@@ -381,7 +382,8 @@ class _MemAccessGatherer(CombineMapper):
             return set()
 
         descr = self.kernel.get_var_descriptor(name)
-        if descr.address_space == self.address_space:
+        if descr.address_space == self.address_space and \
+                    name not in self.exclude_vars:
             result = set([expr])
         else:
             result = set()
@@ -420,7 +422,7 @@ def _make_grid_size_domain(kernel, var_name_gen=None):
     return ggrid_var_names, lgrid_var_names, grid_range_dom
 
 
-def remove_work(kernel):
+def remove_work(kernel, remove_vars=[]):
     """This transform removes operations in a kernel, leaving only
     accesses to global memory.
 
@@ -434,7 +436,8 @@ def remove_work(kernel):
 
     kernel = lp.preprocess_kernel(kernel)
 
-    gatherer = _MemAccessGatherer(kernel, lp.AddressSpace.GLOBAL)
+    gatherer = _MemAccessGatherer(kernel, lp.AddressSpace.GLOBAL,
+                                  exclude_vars=remove_vars)
 
     from loopy.kernel.instruction import MultiAssignmentBase, make_assignment
 
-- 
GitLab


From 2d53c7f9d0b78bd866de9b8b9cfbef3ae2782700 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Tue, 4 Dec 2018 00:15:13 -0600
Subject: [PATCH 24/34] (WIP) writing read_tgt_var to global to ensure
 instructions execute, which means creating a new global arg if output var no
 longer exists (was removed w/remove_work)

---
 loopy/transform/instruction.py | 58 +++++++++++++++++++++++++++++++++-
 1 file changed, 57 insertions(+), 1 deletion(-)

diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 2325e1b0b..5fe3c6c6b 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -384,6 +384,7 @@ class _MemAccessGatherer(CombineMapper):
         descr = self.kernel.get_var_descriptor(name)
         if descr.address_space == self.address_space and \
                     name not in self.exclude_vars:
+            # TODO what about tags?
             result = set([expr])
         else:
             result = set()
@@ -448,6 +449,7 @@ def remove_work(kernel, remove_vars=[]):
     var_name_gen = kernel.get_var_name_generator()
     read_tgt_var_name = var_name_gen("read_tgt")
     new_temporary_variables = kernel.temporary_variables.copy()
+    new_args = kernel.args.copy()
     new_temporary_variables[read_tgt_var_name] = lp.TemporaryVariable(
             read_tgt_var_name, address_space=lp.AddressSpace.PRIVATE)
 
@@ -473,6 +475,8 @@ def remove_work(kernel, remove_vars=[]):
 
     read_insn_ids = []
 
+    read_tgt_var_written_to_global = False
+
     for insn in kernel.instructions:
         if not isinstance(insn, MultiAssignmentBase):
             new_instructions.append(insn)
@@ -502,11 +506,14 @@ def remove_work(kernel, remove_vars=[]):
             new_instructions.append(
                     make_assignment(
                         (write_expr,),
-                        17,
+                        #17,
+                        p.Variable(read_tgt_var_name),  # TODO temporary hack
+                        # insns won't execute unless output is written to global
                         id=new_id,
                         within_inames=insn.within_inames,
                         depends_on=insn.depends_on))
             new_insn_ids.add(new_id)
+            read_tgt_var_written_to_global = True  # TODO part of hack above
 
         old_to_new_ids[insn.id] = frozenset(new_insn_ids)
 
@@ -514,6 +521,8 @@ def remove_work(kernel, remove_vars=[]):
 
     # {{{ create write-out insn for read target
 
+    # TODO writing to temp doesn't guarantee execution, need to write to global mem
+    """
     _, lgrid = kernel.get_grid_size_upper_bounds_as_exprs()
     read_tgt_local_dest_name = var_name_gen("read_tgt_dest")
     new_temporary_variables[read_tgt_local_dest_name] = lp.TemporaryVariable(
@@ -531,6 +540,52 @@ def remove_work(kernel, remove_vars=[]):
             id=write_read_tgt_id,
             depends_on=frozenset(read_insn_ids),
             within_inames=grid_inames))
+    """
+
+    if not read_tgt_var_written_to_global:
+        # TODO must write read_tgt_var to global or instructions may not execute,
+        # TODO if write variable has been removed, need a new write variable
+        # TODO if write variable has not been removed, just write to that variable?
+        ggrid, lgrid = kernel.get_grid_size_upper_bounds_as_exprs()
+        lstrides = []
+        gstrides = []
+        tot_stride = 1
+        lstrides.append(tot_stride)
+        for size in lgrid[:-1]:
+            tot_stride *= size
+            lstrides.append(tot_stride)
+        tot_stride *= lgrid[-1]
+        gstrides.append(tot_stride)
+        for size in ggrid[:-1]:
+            tot_stride *= size
+            gstrides.append(tot_stride)
+
+        strides = ",".join(list(reversed([str(s) for s in lstrides+gstrides])))
+
+        # TODO decide what this mem access pattern should be
+        read_tgt_global_dest_name = var_name_gen("read_tgt_dest")
+        new_args.append(lp.GlobalArg(
+                name=read_tgt_global_dest_name,
+                shape=tuple(reversed(lgrid + ggrid)),
+                strides=strides,
+                ))
+        # TODO WEIRD BEHAVIOR: when kernel has not been cached, this works fine,
+        # but when kernel is used again, this new arg ends up const and writing
+        # to it causes an error... ???
+
+        write_read_tgt_id = insn_id_gen("write_read_tgt")
+        old_to_new_ids[write_read_tgt_id] = [write_read_tgt_id]
+        new_instructions.append(
+            make_assignment(
+                (p.Variable(read_tgt_global_dest_name)[
+                    tuple(p.Variable(gn) for gn in
+                    reversed(lgrid_var_names+ggrid_var_names))],),
+                p.Variable(read_tgt_var_name),
+                id=write_read_tgt_id,
+                depends_on=frozenset(read_insn_ids),
+                within_inames=grid_inames,
+                #within_inames=grid_inames_untagged,
+                ))
 
     # }}}
 
@@ -552,6 +607,7 @@ def remove_work(kernel, remove_vars=[]):
             domains=kernel.domains + [grid_range_dom],
             state=lp.KernelState.INITIAL,
             instructions=new_instructions_2,
+            args=new_args,
             temporary_variables=new_temporary_variables)
 
     from loopy.kernel.data import GroupIndexTag, LocalIndexTag
-- 
GitLab


From bbe75e285d709be68e3c02b3ac76de2e156e9b89 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Tue, 4 Dec 2018 19:05:16 -0600
Subject: [PATCH 25/34] more standard mem access pattern for newly created
 output var; made function defining index/shape/stride ordering for
 consistency

---
 loopy/transform/instruction.py | 51 ++++++++++++++++++++--------------
 1 file changed, 30 insertions(+), 21 deletions(-)

diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 5fe3c6c6b..f1b1b3245 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -546,45 +546,54 @@ def remove_work(kernel, remove_vars=[]):
         # TODO must write read_tgt_var to global or instructions may not execute,
         # TODO if write variable has been removed, need a new write variable
         # TODO if write variable has not been removed, just write to that variable?
-        ggrid, lgrid = kernel.get_grid_size_upper_bounds_as_exprs()
-        lstrides = []
-        gstrides = []
-        tot_stride = 1
-        lstrides.append(tot_stride)
-        for size in lgrid[:-1]:
-            tot_stride *= size
-            lstrides.append(tot_stride)
-        tot_stride *= lgrid[-1]
-        gstrides.append(tot_stride)
-        for size in ggrid[:-1]:
-            tot_stride *= size
-            gstrides.append(tot_stride)
-
-        strides = ",".join(list(reversed([str(s) for s in lstrides+gstrides])))
 
+        # define order for indexing/shape/strides
+        def index_order(local_list, global_list):
+            # returns [g.n, l.n, ..., g.1, l.1, g.0, l.0]; n+1 == len(local_list)
+            # accepts both dicts of {dim: val} and ordered lists [val0, val1, ...]
+            result = []
+            for i in reversed(range(len(local_list))):  # highest dim first
+                result.append(global_list[i])  # global entry precedes local one
+                result.append(local_list[i])
+            return result
+
+        # define local/global strides
         # TODO decide what this mem access pattern should be
+        ggrid, lgrid = kernel.get_grid_size_upper_bounds_as_exprs()
+        lstrides = {0: 1}
+        assert len(lgrid) == len(ggrid)  # TODO is this necessary?
+        for dim in range(1,len(lgrid)):
+            lstrides[dim] = lstrides[dim-1]*lgrid[dim-1]*ggrid[dim-1]
+        gstrides = {}
+        for dim in range(0,len(ggrid)):
+            gstrides[dim] = lstrides[dim]*lgrid[dim]
+
+        # use consistent index ordering for strides, shape, and index
+        strides = index_order(lstrides, gstrides)
+        shape = tuple(index_order(lgrid, ggrid))
+        index = tuple(p.Variable(i) for i in index_order(lgrid_var_names, ggrid_var_names))
+
+        # create new global arg to write results
         read_tgt_global_dest_name = var_name_gen("read_tgt_dest")
         new_args.append(lp.GlobalArg(
                 name=read_tgt_global_dest_name,
-                shape=tuple(reversed(lgrid + ggrid)),
-                strides=strides,
+                shape=shape,
+                strides=",".join(str(s) for s in strides),
                 ))
         # TODO WEIRD BEHAVIOR: when kernel has not been cached, this works fine,
         # but when kernel is used again, this new arg ends up const and writing
         # to it causes an error... ???
 
+        # create instruction writing read_tgt to new global arg
         write_read_tgt_id = insn_id_gen("write_read_tgt")
         old_to_new_ids[write_read_tgt_id] = [write_read_tgt_id]
         new_instructions.append(
             make_assignment(
-                (p.Variable(read_tgt_global_dest_name)[
-                    tuple(p.Variable(gn) for gn in
-                    reversed(lgrid_var_names+ggrid_var_names))],),
+                (p.Variable(read_tgt_global_dest_name)[index],),
                 p.Variable(read_tgt_var_name),
                 id=write_read_tgt_id,
                 depends_on=frozenset(read_insn_ids),
                 within_inames=grid_inames,
-                #within_inames=grid_inames_untagged,
                 ))
 
     # }}}
-- 
GitLab


From 035814da3504b106a29fa6f089083724cade3382 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Thu, 6 Dec 2018 19:21:44 -0600
Subject: [PATCH 26/34] setting dtype of global arg

---
 loopy/transform/instruction.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index f1b1b3245..322e0576f 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -476,7 +476,9 @@ def remove_work(kernel, remove_vars=[]):
     read_insn_ids = []
 
     read_tgt_var_written_to_global = False
-
+    type_inf = lp.type_inference.TypeInferenceMapper(kernel)
+    read_tgt_var_dtype = None
+    read_tgt_var_read_expr_acc = None
     for insn in kernel.instructions:
         if not isinstance(insn, MultiAssignmentBase):
             new_instructions.append(insn)
@@ -501,6 +503,16 @@ def remove_work(kernel, remove_vars=[]):
                         depends_on=insn.depends_on | frozenset([read_tgt_init_id])))
             new_insn_ids.add(new_id)
 
+            # determine type of read_expr
+            # TODO loopy already has a way of doing this,
+            # use that instead? (need this to set type of output arg)
+            if read_tgt_var_dtype is None:
+                read_tgt_var_dtype = type_inf(read_expr)
+                read_tgt_var_read_expr_acc = read_expr
+            elif type_inf(read_expr) != read_tgt_var_dtype:
+                read_tgt_var_read_expr_acc += read_expr
+                read_tgt_var_dtype = type_inf(read_tgt_var_read_expr_acc)
+
         for write_expr in writer_accesses:
             new_id = insn_id_gen(insn.id)
             new_instructions.append(
@@ -562,22 +574,24 @@ def remove_work(kernel, remove_vars=[]):
         ggrid, lgrid = kernel.get_grid_size_upper_bounds_as_exprs()
         lstrides = {0: 1}
         assert len(lgrid) == len(ggrid)  # TODO is this necessary?
-        for dim in range(1,len(lgrid)):
+        for dim in range(1, len(lgrid)):
             lstrides[dim] = lstrides[dim-1]*lgrid[dim-1]*ggrid[dim-1]
         gstrides = {}
-        for dim in range(0,len(ggrid)):
+        for dim in range(0, len(ggrid)):
             gstrides[dim] = lstrides[dim]*lgrid[dim]
 
         # use consistent index ordering for strides, shape, and index
         strides = index_order(lstrides, gstrides)
         shape = tuple(index_order(lgrid, ggrid))
-        index = tuple(p.Variable(i) for i in index_order(lgrid_var_names, ggrid_var_names))
+        index = tuple(p.Variable(i) for i in
+                      index_order(lgrid_var_names, ggrid_var_names))
 
         # create new global arg to write results
         read_tgt_global_dest_name = var_name_gen("read_tgt_dest")
         new_args.append(lp.GlobalArg(
                 name=read_tgt_global_dest_name,
                 shape=shape,
+                dtype=read_tgt_var_dtype,
                 strides=",".join(str(s) for s in strides),
                 ))
         # TODO WEIRD BEHAVIOR: when kernel has not been cached, this works fine,
-- 
GitLab


From 656acb204ec547407fdfb115e50099c60baa809e Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Thu, 6 Dec 2018 20:00:43 -0600
Subject: [PATCH 27/34] added todo (deal with 'instruction does not use all
 group hw axes' error)

---
 loopy/transform/instruction.py | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 322e0576f..532626396 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -480,6 +480,8 @@ def remove_work(kernel, remove_vars=[]):
     read_tgt_var_dtype = None
     read_tgt_var_read_expr_acc = None
     for insn in kernel.instructions:
+        # TODO after instructions are removed, could produce
+        # "instruction does not use all group hw axes" error...
         if not isinstance(insn, MultiAssignmentBase):
             new_instructions.append(insn)
             old_to_new_ids[insn.id] = frozenset([insn.id])
-- 
GitLab


From a0e06bc26251dc313ad2a0cc35bb47ba0f7a5840 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Fri, 7 Dec 2018 16:21:43 -0600
Subject: [PATCH 28/34] don't copy args if they're being removed

---
 loopy/transform/instruction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 532626396..dfe0891b5 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -449,7 +449,7 @@ def remove_work(kernel, remove_vars=[]):
     var_name_gen = kernel.get_var_name_generator()
     read_tgt_var_name = var_name_gen("read_tgt")
     new_temporary_variables = kernel.temporary_variables.copy()
-    new_args = kernel.args.copy()
+    new_args = [arg.copy() for arg in kernel.args if arg.name not in remove_vars]
     new_temporary_variables[read_tgt_var_name] = lp.TemporaryVariable(
             read_tgt_var_name, address_space=lp.AddressSpace.PRIVATE)
 
-- 
GitLab


From 3ba8e3fabf16ad9fa660428ef5823bca8e454200 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Sun, 9 Dec 2018 18:38:11 -0600
Subject: [PATCH 29/34] added todo for adding grid vars to within_inames in
 insn

---
 loopy/transform/instruction.py | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index dfe0891b5..667023112 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -501,7 +501,7 @@ def remove_work(kernel, remove_vars=[]):
                         (p.Variable(read_tgt_var_name),),
                         p.Variable(read_tgt_var_name) + read_expr,
                         id=new_id,
-                        within_inames=insn.within_inames,
+                        within_inames=insn.within_inames, # TODO try sticking in grid variables
                         depends_on=insn.depends_on | frozenset([read_tgt_init_id])))
             new_insn_ids.add(new_id)
 
@@ -520,9 +520,7 @@ def remove_work(kernel, remove_vars=[]):
             new_instructions.append(
                     make_assignment(
                         (write_expr,),
-                        #17,
-                        p.Variable(read_tgt_var_name),  # TODO temporary hack
-                        # insns won't execute unless output is written to global
+                        p.Variable(read_tgt_var_name),
                         id=new_id,
                         within_inames=insn.within_inames,
                         depends_on=insn.depends_on))
-- 
GitLab


From aaeaa2df5d091336c2239cfbdc42441fa8ecbcb8 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Tue, 11 Dec 2018 01:21:32 -0600
Subject: [PATCH 30/34] adding unrepresented axis tags to within_inames

---
 loopy/transform/instruction.py | 29 ++++++++++++++++++++++++-----
 1 file changed, 24 insertions(+), 5 deletions(-)

diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 667023112..0461c7a2d 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -473,6 +473,24 @@ def remove_work(kernel, remove_vars=[]):
 
     # {{{ rewrite instructions
 
+    from loopy.kernel.data import GroupIndexTag, LocalIndexTag
+
+    def add_unrepresented_grid_vars_to_inames(inames):
+        local_axes_needed = set(range(len(lgrid_var_names)))
+        group_axes_needed = set(range(len(ggrid_var_names)))
+        # Returns inames plus the grid iname of every hw axis no tag in inames uses.
+        for iname in inames:  # TODO deal with key errors
+            try:
+                tag = kernel.iname_to_tag[iname]
+            except KeyError:
+                continue  # iname has no entry in iname_to_tag; skip it
+            if isinstance(tag, LocalIndexTag):
+                local_axes_needed.remove(tag.axis)  # axis already represented
+            elif isinstance(tag, GroupIndexTag):
+                group_axes_needed.remove(tag.axis)  # axis already represented
+        return inames | set([lgrid_var_names[axis] for axis in local_axes_needed] +
+               [ggrid_var_names[axis] for axis in group_axes_needed])
+
     read_insn_ids = []
 
     read_tgt_var_written_to_global = False
@@ -493,6 +511,8 @@ def remove_work(kernel, remove_vars=[]):
         reader_accesses = gatherer(insn.expression)
 
         new_insn_ids = set()
+        new_within_inames = frozenset(
+                add_unrepresented_grid_vars_to_inames(insn.within_inames))
         for read_expr in reader_accesses:
             new_id = insn_id_gen(insn.id)
             read_insn_ids.append(insn.id)
@@ -501,13 +521,13 @@ def remove_work(kernel, remove_vars=[]):
                         (p.Variable(read_tgt_var_name),),
                         p.Variable(read_tgt_var_name) + read_expr,
                         id=new_id,
-                        within_inames=insn.within_inames, # TODO try sticking in grid variables
+                        within_inames=new_within_inames,
                         depends_on=insn.depends_on | frozenset([read_tgt_init_id])))
             new_insn_ids.add(new_id)
 
             # determine type of read_expr
-            # TODO loopy already has a way of doing this,
-            # use that instead? (need this to set type of output arg)
+            # TODO loopy already has a way of figuring this out,
+            # can we use that instead? (need this to set type of output arg)
             if read_tgt_var_dtype is None:
                 read_tgt_var_dtype = type_inf(read_expr)
                 read_tgt_var_read_expr_acc = read_expr
@@ -522,7 +542,7 @@ def remove_work(kernel, remove_vars=[]):
                         (write_expr,),
                         p.Variable(read_tgt_var_name),
                         id=new_id,
-                        within_inames=insn.within_inames,
+                        within_inames=new_within_inames,
                         depends_on=insn.depends_on))
             new_insn_ids.add(new_id)
             read_tgt_var_written_to_global = True  # TODO part of hack above
@@ -633,7 +653,6 @@ def remove_work(kernel, remove_vars=[]):
             args=new_args,
             temporary_variables=new_temporary_variables)
 
-    from loopy.kernel.data import GroupIndexTag, LocalIndexTag
     kernel = lp.tag_inames(kernel, dict(
         (ggrid_var_names[i], GroupIndexTag(i))
         for i in range(len(ggrid_var_names))))
-- 
GitLab


From 4d2b8460b99dda21960355366d736634684ea035 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Tue, 11 Dec 2018 21:56:32 -0600
Subject: [PATCH 31/34] setting new kernel._cached_written_variables to None
 since they may have changed

---
 loopy/transform/instruction.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 0461c7a2d..a92b4e946 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -651,7 +651,8 @@ def remove_work(kernel, remove_vars=[]):
             state=lp.KernelState.INITIAL,
             instructions=new_instructions_2,
             args=new_args,
-            temporary_variables=new_temporary_variables)
+            temporary_variables=new_temporary_variables,
+            _cached_written_variables=None)
 
     kernel = lp.tag_inames(kernel, dict(
         (ggrid_var_names[i], GroupIndexTag(i))
-- 
GitLab


From 943bd65dda1662f7aeb6ef9d96c20f156e030439 Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Thu, 13 Dec 2018 22:07:43 -0600
Subject: [PATCH 32/34] added warning about potential loop priority change in
 remove_work

---
 loopy/transform/instruction.py | 8 ++++++++
 1 file changed, 8 insertions(+)

diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index a92b4e946..5ce9938c0 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -661,6 +661,14 @@ def remove_work(kernel, remove_vars=[]):
         (lgrid_var_names[i], LocalIndexTag(i))
         for i in range(len(lgrid_var_names))))
 
+    if not kernel.loop_priority:
+        from loopy.diagnostic import warn_with_kernel
+        warn_with_kernel(kernel, "remove_work_loop_priority",
+            "Kernel loop_priority unspecified. "
+            "remove_work() may yield loop priority differing "
+            "from that of original kernel. To ensure desired "
+            "loop priority, use lp.prioritize_loops().")
+
     return kernel
 
 # }}}
-- 
GitLab


From 7538c6540c91bc53248602019c949ca3b9c35eff Mon Sep 17 00:00:00 2001
From: jdsteve2 <jdsteve2@illinois.edu>
Date: Mon, 31 Dec 2018 01:10:29 -0600
Subject: [PATCH 33/34] remove_work allowing for kernel name change

---
 loopy/transform/instruction.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 5ce9938c0..8a5bb7604 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -423,7 +423,7 @@ def _make_grid_size_domain(kernel, var_name_gen=None):
     return ggrid_var_names, lgrid_var_names, grid_range_dom
 
 
-def remove_work(kernel, remove_vars=[]):
+def remove_work(kernel, remove_vars=[], new_knl_name=None):
     """This transform removes operations in a kernel, leaving only
     accesses to global memory.
 
@@ -446,6 +446,10 @@ def remove_work(kernel, remove_vars=[]):
     old_to_new_ids = {}
     insn_id_gen = kernel.get_instruction_id_generator()
 
+    if new_knl_name:
+        new_name = new_knl_name
+    else:
+        new_name = kernel.name
     var_name_gen = kernel.get_var_name_generator()
     read_tgt_var_name = var_name_gen("read_tgt")
     new_temporary_variables = kernel.temporary_variables.copy()
@@ -652,7 +656,9 @@ def remove_work(kernel, remove_vars=[]):
             instructions=new_instructions_2,
             args=new_args,
             temporary_variables=new_temporary_variables,
-            _cached_written_variables=None)
+            name=new_name,
+            _cached_written_variables=None,
+            )
 
     kernel = lp.tag_inames(kernel, dict(
         (ggrid_var_names[i], GroupIndexTag(i))
-- 
GitLab


From 6444cfd277b4844ae00d742a38933d4b621774b9 Mon Sep 17 00:00:00 2001
From: James Stevens <jdsteve2@illinois.edu>
Date: Wed, 2 Jan 2019 02:20:33 -0600
Subject: [PATCH 34/34] adding unused inames to ops when removing work

---
 loopy/transform/instruction.py | 77 ++++++++++++++++++++++++++++------
 1 file changed, 65 insertions(+), 12 deletions(-)

diff --git a/loopy/transform/instruction.py b/loopy/transform/instruction.py
index 8a5bb7604..4d7f70077 100644
--- a/loopy/transform/instruction.py
+++ b/loopy/transform/instruction.py
@@ -27,6 +27,7 @@ import islpy as isl
 
 from loopy.diagnostic import LoopyError
 from loopy.symbolic import CombineMapper
+from pymbolic.mapper import Collector
 
 
 # {{{ find_instructions
@@ -400,6 +401,34 @@ class _MemAccessGatherer(CombineMapper):
         return self._map_access(expr, expr.aggregate.name, expr.index)
 
 
+class _VariableGatherer(Collector):
+    """Collect the names in *search_variables* that occur in an expression."""
+    def __init__(self, search_variables):
+        self.search_variables = search_variables
+
+    # TODO add tests for this
+    # No combine() override here: merging the per-subexpression sets is
+    # left to the Collector base class.
+
+    def map_variable(self, expr):
+        if expr.name in self.search_variables:
+            return set([expr.name])
+        else:
+            return set()
+
+    map_tagged_variable = map_variable
+
+    # TODO do I need this?
+    def map_reduction(self, expr):
+        return self.rec(expr.expr)
+
+    # TODO do I need this? (taken from own base class so it binds on Python 2)
+    map_linear_subscript = Collector.map_subscript
+
+#def map_subscript(self, expr):
+#    return self.rec(expr.index)
+
+
 def _make_grid_size_domain(kernel, var_name_gen=None):
     if var_name_gen is None:
         var_name_gen = kernel.get_var_name_generator()
@@ -423,7 +452,7 @@ def _make_grid_size_domain(kernel, var_name_gen=None):
     return ggrid_var_names, lgrid_var_names, grid_range_dom
 
 
-def remove_work(kernel, remove_vars=[], new_knl_name=None):
+def remove_work(kernel, remove_vars=[], new_knl_name=None, use_unused_inames=False):
     """This transform removes operations in a kernel, leaving only
     accesses to global memory.
 
@@ -459,6 +488,25 @@ def remove_work(kernel, remove_vars=[], new_knl_name=None):
 
     new_instructions = []
 
+    # TODO figure out which of these tags I really need to deal with
+    from loopy.kernel.data import  GroupIndexTag, LocalIndexTag, UnrollTag
+    parallel_inames = set([iname for iname, tag in kernel.iname_to_tag.items() if
+                              (isinstance(tag, LocalIndexTag) or
+                               isinstance(tag, GroupIndexTag))])
+    unrolled_inames = set([iname for iname, tag in kernel.iname_to_tag.items() if
+                              isinstance(tag, UnrollTag)])
+
+    iname_gatherer = _VariableGatherer(kernel.all_inames() -
+                                       (parallel_inames | unrolled_inames))
+
+    def get_unused_inames(insn):
+        inames_required = insn.within_inames - (parallel_inames | unrolled_inames)
+        inames_found = set.union(
+                            iname_gatherer(insn.expression),
+                            *[iname_gatherer(assignee) for assignee in insn.assignees]
+                            )  # plain arg must precede *args on Python < 3.5
+        return inames_required - inames_found  # within_inames never referenced
+
     # {{{ create init insn for read target
 
     ggrid_var_names, lgrid_var_names, grid_range_dom = _make_grid_size_domain(kernel)
@@ -477,8 +525,6 @@ def remove_work(kernel, remove_vars=[], new_knl_name=None):
 
     # {{{ rewrite instructions
 
-    from loopy.kernel.data import GroupIndexTag, LocalIndexTag
-
     def add_unrepresented_grid_vars_to_inames(inames):
         local_axes_needed = set(range(len(lgrid_var_names)))
         group_axes_needed = set(range(len(ggrid_var_names)))
@@ -502,8 +548,6 @@ def remove_work(kernel, remove_vars=[], new_knl_name=None):
     read_tgt_var_dtype = None
     read_tgt_var_read_expr_acc = None
     for insn in kernel.instructions:
-        # TODO after instructions are removed, could produce
-        # "instruction does not use all group hw axes" error...
         if not isinstance(insn, MultiAssignmentBase):
             new_instructions.append(insn)
             old_to_new_ids[insn.id] = frozenset([insn.id])
@@ -517,26 +561,34 @@ def remove_work(kernel, remove_vars=[], new_knl_name=None):
         new_insn_ids = set()
         new_within_inames = frozenset(
                 add_unrepresented_grid_vars_to_inames(insn.within_inames))
+
+        if use_unused_inames:
+            inserted_inames = sorted(list(get_unused_inames(insn)))
+        else:
+            inserted_inames = []
+
+        from pytools import product
         for read_expr in reader_accesses:
             new_id = insn_id_gen(insn.id)
             read_insn_ids.append(insn.id)
+            add_expr = read_expr*product([p.Variable(iname) for iname in inserted_inames])
             new_instructions.append(
                     make_assignment(
                         (p.Variable(read_tgt_var_name),),
-                        p.Variable(read_tgt_var_name) + read_expr,
+                        p.Variable(read_tgt_var_name) + add_expr,
                         id=new_id,
                         within_inames=new_within_inames,
                         depends_on=insn.depends_on | frozenset([read_tgt_init_id])))
             new_insn_ids.add(new_id)
 
-            # determine type of read_expr
+            # determine type of add_expr
             # TODO loopy already has a way of figuring this out,
             # can we use that instead? (need this to set type of output arg)
             if read_tgt_var_dtype is None:
-                read_tgt_var_dtype = type_inf(read_expr)
-                read_tgt_var_read_expr_acc = read_expr
-            elif type_inf(read_expr) != read_tgt_var_dtype:
-                read_tgt_var_read_expr_acc += read_expr
+                read_tgt_var_dtype = type_inf(add_expr)
+                read_tgt_var_read_expr_acc = add_expr
+            elif type_inf(add_expr) != read_tgt_var_dtype:
+                read_tgt_var_read_expr_acc += add_expr
                 read_tgt_var_dtype = type_inf(read_tgt_var_read_expr_acc)
 
         for write_expr in writer_accesses:
@@ -544,7 +596,8 @@ def remove_work(kernel, remove_vars=[], new_knl_name=None):
             new_instructions.append(
                     make_assignment(
                         (write_expr,),
-                        p.Variable(read_tgt_var_name),
+                        p.Variable(read_tgt_var_name)*product(
+                            [p.Variable(iname) for iname in inserted_inames]),
                         id=new_id,
                         within_inames=new_within_inames,
                         depends_on=insn.depends_on))
-- 
GitLab