From c1d4f72aec8d1d8d008a29bf4e4fe7bae311129e Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 31 Jan 2018 10:33:51 -0500
Subject: [PATCH 001/144] first crack at issue #124

---
 loopy/kernel/array.py | 30 ++++++++++++++++++++++++++++++
 test/test_loopy.py    | 26 ++++++++++++++++++++++++++
 2 files changed, 56 insertions(+)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index b672f0227..6c8cf82bd 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1245,6 +1245,31 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
 
         return result
 
+    def depends_only_on(idx, iname):
+        """
+        Test if the given iname is the only variable in idx
+        """
+
+        from pymbolic.mapper import WalkMapper
+        from pymbolic.primitives import Variable
+
+        class VariableMapper(WalkMapper):
+            def __init__(self, *args, **kwargs):
+                self.variables = set()
+                super(VariableMapper, self).__init__(*args, **kwargs)
+
+            def visit(self, expr, *args, **kwargs):
+                if isinstance(expr, Variable):
+                    self.variables.add(expr.name)
+                return True
+
+        # feed through mapper
+        mapv = VariableMapper()
+        mapv(idx)
+        if len(mapv.variables) == 1 and iname in mapv.variables:
+            return True
+        return False
+
     def apply_offset(sub):
         import loopy as lp
 
@@ -1328,6 +1353,11 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
                 # in the vector being returned.
                 pass
 
+            elif (vectorization_info is not None and
+                  depends_only_on(index[i], vectorization_info.iname)):
+                assert vector_index is None
+                vector_index = idx
+
             else:
                 idx = eval_expr_assert_integer_constant(i, idx)
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index e36a4c2c3..58ed5ca44 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2788,6 +2788,32 @@ def test_add_prefetch_works_in_lhs_index():
         assert "a1_map" not in get_dependencies(insn.assignees)
 
 
+def test_explicit_simd():
+    def create_and_test(insn):
+        knl = lp.make_kernel(['{[i]: 0 <= i < 12}', '{[j]: 0 <= j < 1}'],
+                             insn,
+                             [lp.GlobalArg('a', shape=(1, 12,), dtype=np.int32),
+                              lp.GlobalArg('b', shape=(1, 14,), dtype=np.int32)])
+
+        knl = lp.split_iname(knl, 'i', 4, inner_tag='vec')
+        knl = lp.tag_inames(knl, [('j', 'g.0')])
+        knl = lp.split_array_axis(knl, ['a', 'b'], 1, 4)
+        knl = lp.tag_array_axes(knl, ['a', 'b'], 'N1,N0,vec')
+
+        print(lp.generate_code_v2(knl).device_code())
+        ctx = cl.create_some_context()
+        queue = cl.CommandQueue(ctx)
+        assert np.array_equal(
+            knl(queue, a=np.zeros((1, 3, 4), dtype=np.int32),
+                b=np.arange(16, dtype=np.int32).reshape((1, 4, 4)))[1][0].flatten(
+                    'C'),
+            np.arange(2, 14, dtype=np.int32))
+
+    create_and_test("a[j, i] = b[j, i + 2]")
+    create_and_test("a[j, i] = b[j, i + 2] + a[j, i]")
+    create_and_test("a[j, i] = a[j, i] + b[j, i + 2]")
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
-- 
GitLab


From f62943dd68da577147e89210ff4efccf326dc152 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 31 Jan 2018 13:02:44 -0500
Subject: [PATCH 002/144] use builtin

---
 loopy/kernel/array.py | 21 +++------------------
 1 file changed, 3 insertions(+), 18 deletions(-)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 6c8cf82bd..5572832f4 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1250,25 +1250,10 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
         Test if the given iname is the only variable in idx
         """
 
-        from pymbolic.mapper import WalkMapper
-        from pymbolic.primitives import Variable
-
-        class VariableMapper(WalkMapper):
-            def __init__(self, *args, **kwargs):
-                self.variables = set()
-                super(VariableMapper, self).__init__(*args, **kwargs)
-
-            def visit(self, expr, *args, **kwargs):
-                if isinstance(expr, Variable):
-                    self.variables.add(expr.name)
-                return True
-
         # feed through mapper
-        mapv = VariableMapper()
-        mapv(idx)
-        if len(mapv.variables) == 1 and iname in mapv.variables:
-            return True
-        return False
+        from loopy.symbolic import get_dependencies
+        idx_vars = get_dependencies(idx)
+        return len(idx_vars) == 1 and iname in idx_vars
 
     def apply_offset(sub):
         import loopy as lp
-- 
GitLab


From 99f2f9ff3cd20a67e3e75162a063f9c409c1db7d Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 1 Feb 2018 10:25:06 -0500
Subject: [PATCH 003/144] force checking of _all_ children to raise
 unvectorizable if any child fails

---
 loopy/expression.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index 3269bc09f..5e92d9abb 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -75,7 +75,7 @@ class VectorizabilityChecker(RecursiveMapper):
         return reduce(and_, vectorizabilities)
 
     def map_sum(self, expr):
-        return any(self.rec(child) for child in expr.children)
+        return any([self.rec(child) for child in expr.children])
 
     map_product = map_sum
 
-- 
GitLab


From 0d660688625d4c1e6506979fc1f736c5d445e155 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 1 Feb 2018 10:25:55 -0500
Subject: [PATCH 004/144] rename

---
 test/test_loopy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index 58ed5ca44..294684ccf 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2788,7 +2788,7 @@ def test_add_prefetch_works_in_lhs_index():
         assert "a1_map" not in get_dependencies(insn.assignees)
 
 
-def test_explicit_simd():
+def test_explicit_simd_offset():
     def create_and_test(insn):
         knl = lp.make_kernel(['{[i]: 0 <= i < 12}', '{[j]: 0 <= j < 1}'],
                              insn,
-- 
GitLab


From c1117907ab80e85d15b22079be055e156bfcb959 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 1 Feb 2018 10:39:41 -0500
Subject: [PATCH 005/144] add small vector shuffle test

---
 test/test_loopy.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index 294684ccf..d7ac4ef0d 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2788,8 +2788,8 @@ def test_add_prefetch_works_in_lhs_index():
         assert "a1_map" not in get_dependencies(insn.assignees)
 
 
-def test_explicit_simd_offset():
-    def create_and_test(insn):
+def test_explicit_simd_shuffles():
+    def create_and_test(insn, answer=None):
         knl = lp.make_kernel(['{[i]: 0 <= i < 12}', '{[j]: 0 <= j < 1}'],
                              insn,
                              [lp.GlobalArg('a', shape=(1, 12,), dtype=np.int32),
@@ -2803,15 +2803,21 @@ def test_explicit_simd_offset():
         print(lp.generate_code_v2(knl).device_code())
         ctx = cl.create_some_context()
         queue = cl.CommandQueue(ctx)
+        if answer is None:
+            answer = np.arange(2, 14, dtype=np.int32)
         assert np.array_equal(
             knl(queue, a=np.zeros((1, 3, 4), dtype=np.int32),
                 b=np.arange(16, dtype=np.int32).reshape((1, 4, 4)))[1][0].flatten(
                     'C'),
-            np.arange(2, 14, dtype=np.int32))
+            answer)
 
     create_and_test("a[j, i] = b[j, i + 2]")
     create_and_test("a[j, i] = b[j, i + 2] + a[j, i]")
     create_and_test("a[j, i] = a[j, i] + b[j, i + 2]")
+    # test small vector shuffle
+    create_and_test("a[j, i] = b[j, (i + 2) % 4]",
+                    np.arange(16, dtype=np.int32)[(np.arange(16) + 2) % 4 +
+                                                  (np.arange(16) // 4) * 4])
 
 
 if __name__ == "__main__":
-- 
GitLab


From 2eef0059b7a45abd395a73bd516ee01e10c952c5 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 1 Feb 2018 11:08:21 -0500
Subject: [PATCH 006/144] force unvec fallback if not exactly vector iname,
 easiest way to simplify

---
 loopy/expression.py   | 11 ++++++-----
 loopy/kernel/array.py | 15 ---------------
 2 files changed, 6 insertions(+), 20 deletions(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index 5e92d9abb..b386cd8e3 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -120,17 +120,18 @@ class VectorizabilityChecker(RecursiveMapper):
 
         possible = None
         for i in range(len(var.shape)):
-            if (
-                    isinstance(var.dim_tags[i], VectorArrayDimTag)
-                    and isinstance(index[i], Variable)
-                    and index[i].name == self.vec_iname):
+            # if index is exactly vector iname
+            if isinstance(var.dim_tags[i], VectorArrayDimTag) and (
+                    (isinstance(index[i], Variable)
+                     and index[i].name == self.vec_iname)):
                 if var.shape[i] != self.vec_iname_length:
                     raise Unvectorizable("vector length was mismatched")
 
                 if possible is None:
                     possible = True
 
-            else:
+            # of if not vector index, and vector iname is present
+            elif not isinstance(var.dim_tags[i], VectorArrayDimTag):
                 if self.vec_iname in get_dependencies(index[i]):
                     raise Unvectorizable("vectorizing iname '%s' occurs in "
                             "unvectorized subscript axis %d (1-based) of "
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 5572832f4..b672f0227 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1245,16 +1245,6 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
 
         return result
 
-    def depends_only_on(idx, iname):
-        """
-        Test if the given iname is the only variable in idx
-        """
-
-        # feed through mapper
-        from loopy.symbolic import get_dependencies
-        idx_vars = get_dependencies(idx)
-        return len(idx_vars) == 1 and iname in idx_vars
-
     def apply_offset(sub):
         import loopy as lp
 
@@ -1338,11 +1328,6 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
                 # in the vector being returned.
                 pass
 
-            elif (vectorization_info is not None and
-                  depends_only_on(index[i], vectorization_info.iname)):
-                assert vector_index is None
-                vector_index = idx
-
             else:
                 idx = eval_expr_assert_integer_constant(i, idx)
 
-- 
GitLab


From 6f265337ab3c942248c0ba9a5bd4987c35291a38 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 1 Feb 2018 11:08:46 -0500
Subject: [PATCH 007/144] raise Unvectorizable if we're not in unvec fallback

---
 loopy/kernel/array.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index b672f0227..d91af7642 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1229,17 +1229,23 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
 
     def eval_expr_assert_integer_constant(i, expr):
         from pymbolic.mapper.evaluator import UnknownVariableError
+        from loopy.codegen import Unvectorizable
+        # determine error type -- if vectorization_info is None, we're in the
+        # unvec fallback (and should raise a LoopyError)
+        # if vectorization_info is not None, we should raise an Unvectorizable
+        # on failure
+        error_type = LoopyError if vectorization_info is None else Unvectorizable
         try:
             result = eval_expr(expr)
         except UnknownVariableError as e:
-            raise LoopyError("When trying to index the array '%s' along axis "
+            raise error_type("When trying to index the array '%s' along axis "
                     "%d (tagged '%s'), the index was not a compile-time "
                     "constant (but it has to be in order for code to be "
                     "generated). You likely want to unroll the iname(s) '%s'."
                     % (ary.name, i, ary.dim_tags[i], str(e)))
 
         if not is_integer(result):
-            raise LoopyError("subscript '%s[%s]' has non-constant "
+            raise error_type("subscript '%s[%s]' has non-constant "
                     "index for separate-array axis %d (0-based)" % (
                         ary.name, index, i))
 
-- 
GitLab


From 2521a232424e6c8c67a65072d5e55276735eaf28 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 1 Feb 2018 11:14:51 -0500
Subject: [PATCH 008/144] update error message

---
 loopy/kernel/array.py | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index d91af7642..52d85e64a 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1238,11 +1238,15 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
         try:
             result = eval_expr(expr)
         except UnknownVariableError as e:
-            raise error_type("When trying to index the array '%s' along axis "
-                    "%d (tagged '%s'), the index was not a compile-time "
-                    "constant (but it has to be in order for code to be "
-                    "generated). You likely want to unroll the iname(s) '%s'."
-                    % (ary.name, i, ary.dim_tags[i], str(e)))
+            err_msg = ("When trying to index the array '%s' along axis "
+                       "%d (tagged '%s'), the index was not a compile-time "
+                       "constant (but it has to be in order for code to be "
+                       "generated)."
+                       % (ary.name, i, ary.dim_tags[i]))
+            if vectorization_info is not None:
+                # add bit about unrolling
+                err_msg += "You likely want to unroll the iname(s) '%s'" % str(e)
+            raise error_type(err_msg)
 
         if not is_integer(result):
             raise error_type("subscript '%s[%s]' has non-constant "
-- 
GitLab


From 3cf7e53dda04c552a6eaff5fd6be0644e4449053 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 1 Feb 2018 11:14:57 -0500
Subject: [PATCH 009/144] fix ans

---
 test/test_loopy.py | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index d7ac4ef0d..387c79874 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2816,8 +2816,7 @@ def test_explicit_simd_shuffles():
     create_and_test("a[j, i] = a[j, i] + b[j, i + 2]")
     # test small vector shuffle
     create_and_test("a[j, i] = b[j, (i + 2) % 4]",
-                    np.arange(16, dtype=np.int32)[(np.arange(16) + 2) % 4 +
-                                                  (np.arange(16) // 4) * 4])
+                    np.arange(12, dtype=np.int32)[(np.arange(12) + 2) % 4])
 
 
 if __name__ == "__main__":
-- 
GitLab


From aae7875ff148559530e35043722d28d8c7e9b86f Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 1 Feb 2018 18:06:20 -0500
Subject: [PATCH 010/144] add atomic example

---
 test/test_loopy.py | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index 387c79874..cdb31bef8 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2788,12 +2788,16 @@ def test_add_prefetch_works_in_lhs_index():
         assert "a1_map" not in get_dependencies(insn.assignees)
 
 
-def test_explicit_simd_shuffles():
-    def create_and_test(insn, answer=None):
+def test_explicit_simd_shuffles(ctx_factory):
+    ctx = ctx_factory()
+
+    def create_and_test(insn, answer=None, atomic=False):
         knl = lp.make_kernel(['{[i]: 0 <= i < 12}', '{[j]: 0 <= j < 1}'],
                              insn,
-                             [lp.GlobalArg('a', shape=(1, 12,), dtype=np.int32),
-                              lp.GlobalArg('b', shape=(1, 14,), dtype=np.int32)])
+                             [lp.GlobalArg('a', shape=(1, 12,), dtype=np.int32,
+                                           for_atomic=atomic),
+                              lp.GlobalArg('b', shape=(1, 14,), dtype=np.int32,
+                                           for_atomic=atomic)])
 
         knl = lp.split_iname(knl, 'i', 4, inner_tag='vec')
         knl = lp.tag_inames(knl, [('j', 'g.0')])
@@ -2801,7 +2805,6 @@ def test_explicit_simd_shuffles():
         knl = lp.tag_array_axes(knl, ['a', 'b'], 'N1,N0,vec')
 
         print(lp.generate_code_v2(knl).device_code())
-        ctx = cl.create_some_context()
         queue = cl.CommandQueue(ctx)
         if answer is None:
             answer = np.arange(2, 14, dtype=np.int32)
@@ -2817,6 +2820,13 @@ def test_explicit_simd_shuffles():
     # test small vector shuffle
     create_and_test("a[j, i] = b[j, (i + 2) % 4]",
                     np.arange(12, dtype=np.int32)[(np.arange(12) + 2) % 4])
+    # test atomics
+    temp = np.arange(12, dtype=np.int32)
+    answer = np.zeros(4, dtype=np.int32)
+    for i in range(4):
+        answer[i] = np.sum(temp[(i + 2) % 4::4])
+    create_and_test("a[j, (i + 2) % 4] = a[j, (i + 2) % 4] + b[j, i] {atomic}",
+                    answer, True)
 
 
 if __name__ == "__main__":
-- 
GitLab


From 407d0882014e5e3dccca5cc0102dfe24237f8f0e Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 2 Feb 2018 09:50:30 -0500
Subject: [PATCH 011/144] raise exception on atomic explicit-simd

---
 loopy/codegen/__init__.py | 11 ++++++++++-
 loopy/kernel/array.py     |  3 +++
 loopy/target/opencl.py    |  4 ++++
 test/test_loopy.py        | 14 ++++++++------
 4 files changed, 25 insertions(+), 7 deletions(-)

diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index e83515d31..d6c3eedf6 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -178,6 +178,10 @@ class CodeGenerationState(object):
 
         None or an instance of :class:`VectorizationInfo`
 
+    .. attribute:: vectorization_fallback
+        True IFF :func:`try_vectorized` failed, and we're in the :func:`unvectorize`
+        fallback
+
     .. attribute:: is_generating_device_code
 
     .. attribute:: gen_program_name
@@ -196,7 +200,8 @@ class CodeGenerationState(object):
             vectorization_info=None, var_name_generator=None,
             is_generating_device_code=None,
             gen_program_name=None,
-            schedule_index_end=None):
+            schedule_index_end=None,
+            vectorization_fallback=False):
         self.kernel = kernel
         self.implemented_data_info = implemented_data_info
         self.implemented_domain = implemented_domain
@@ -207,6 +212,7 @@ class CodeGenerationState(object):
         self.var_subst_map = var_subst_map.copy()
         self.allow_complex = allow_complex
         self.vectorization_info = vectorization_info
+        self.vectorization_fallback = vectorization_fallback
         self.var_name_generator = var_name_generator
         self.is_generating_device_code = is_generating_device_code
         self.gen_program_name = gen_program_name
@@ -227,7 +233,9 @@ class CodeGenerationState(object):
         if implemented_data_info is None:
             implemented_data_info = self.implemented_data_info
 
+        vectorization_fallback = self.vectorization_fallback
         if vectorization_info is False:
+            vectorization_fallback = True
             vectorization_info = None
 
         elif vectorization_info is None:
@@ -254,6 +262,7 @@ class CodeGenerationState(object):
                 var_subst_map=var_subst_map or self.var_subst_map,
                 allow_complex=self.allow_complex,
                 vectorization_info=vectorization_info,
+                vectorization_fallback=vectorization_fallback,
                 var_name_generator=self.var_name_generator,
                 is_generating_device_code=is_generating_device_code,
                 gen_program_name=gen_program_name,
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 52d85e64a..c14b7faaa 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1339,6 +1339,9 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
                 pass
 
             else:
+                # if vectorization_info is not None:
+
+
                 idx = eval_expr_assert_integer_constant(i, idx)
 
                 assert vector_index is None
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 31e0569b9..2ee7006a5 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -522,6 +522,10 @@ class OpenCLCASTBuilder(CASTBuilder):
         # FIXME: Could detect operations, generate atomic_{add,...} when
         # appropriate.
 
+        if codegen_state.vectorization_fallback or codegen_state.vectorization_info:
+            raise LoopyError('Atomic operators not yet implemented for '
+                             'explicit-SIMD vectorization')
+
         if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
                 np.int32, np.int64, np.float32, np.float64]:
             from cgen import Block, DoWhile, Assign
diff --git a/test/test_loopy.py b/test/test_loopy.py
index cdb31bef8..30631fc90 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2821,12 +2821,14 @@ def test_explicit_simd_shuffles(ctx_factory):
     create_and_test("a[j, i] = b[j, (i + 2) % 4]",
                     np.arange(12, dtype=np.int32)[(np.arange(12) + 2) % 4])
     # test atomics
-    temp = np.arange(12, dtype=np.int32)
-    answer = np.zeros(4, dtype=np.int32)
-    for i in range(4):
-        answer[i] = np.sum(temp[(i + 2) % 4::4])
-    create_and_test("a[j, (i + 2) % 4] = a[j, (i + 2) % 4] + b[j, i] {atomic}",
-                    answer, True)
+    from loopy import LoopyError
+    with pytest.raises(LoopyError):
+        temp = np.arange(12, dtype=np.int32)
+        answer = np.zeros(4, dtype=np.int32)
+        for i in range(4):
+            answer[i] = np.sum(temp[(i + 2) % 4::4])
+        create_and_test("a[j, (i + 2) % 4] = a[j, (i + 2) % 4] + b[j, i] {atomic}",
+                        answer, True)
 
 
 if __name__ == "__main__":
-- 
GitLab


From 181e1ee0ada64446c8ff1b9e87d799e0c786f571 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 2 Feb 2018 10:43:14 -0500
Subject: [PATCH 012/144] try simplify vector-iname present in non-vector axis

---
 loopy/expression.py | 32 +++++++++++++++++++++++++-------
 1 file changed, 25 insertions(+), 7 deletions(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index b386cd8e3..8c2c61d95 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -130,14 +130,32 @@ class VectorizabilityChecker(RecursiveMapper):
                 if possible is None:
                     possible = True
 
-            # of if not vector index, and vector iname is present
+            # or, if not vector index, and vector iname is present
             elif not isinstance(var.dim_tags[i], VectorArrayDimTag):
-                if self.vec_iname in get_dependencies(index[i]):
-                    raise Unvectorizable("vectorizing iname '%s' occurs in "
-                            "unvectorized subscript axis %d (1-based) of "
-                            "expression '%s'"
-                            % (self.vec_iname, i+1, expr))
-                    break
+                from loopy.symbolic import DependencyMapper
+                dep_mapper = DependencyMapper(composite_leaves=False)
+                deps = dep_mapper(index[i])
+                if self.vec_iname in set(x.name for x in deps):
+                    # check whether we can simplify out the vector iname
+                    context = {x: x for x in deps if x.name != self.vec_iname}
+                    from pymbolic import substitute
+                    from loopy.tools import is_integer
+                    for veci in range(self.vec_iname_length):
+                        ncontext = context.copy()
+                        ncontext[self.vec_iname] = veci
+                        try:
+                            idi = substitute(index[i], ncontext)
+                            if not is_integer(idi) and not all(
+                                    x in self.kernel.iname_to_tag
+                                    for x in get_dependencies(idi)):
+                                raise Unvectorizable(
+                                    "vectorizing iname '%s' occurs in "
+                                    "unvectorized subscript axis %d (1-based) of "
+                                    "expression '%s', and could not be simplified"
+                                    "to compile-time constants."
+                                    % (self.vec_iname, i+1, expr))
+                        except:
+                            break
 
         return bool(possible)
 
-- 
GitLab


From a362a22efe5b5612d33da3aa8aed7a10e0792f18 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 2 Feb 2018 10:46:47 -0500
Subject: [PATCH 013/144] make eval_expr overloadable, and test explicitly for
 shuffle / load

---
 loopy/kernel/array.py                | 42 +++++++++++++++++++++-------
 loopy/target/c/codegen/expression.py |  8 +++++-
 2 files changed, 39 insertions(+), 11 deletions(-)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index c14b7faaa..908ad42e8 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1226,17 +1226,17 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
 
     import loopy as lp
     from pymbolic import var
+    from loopy.codegen import Unvectorizable
 
-    def eval_expr_assert_integer_constant(i, expr):
+    def eval_expr_assert_integer_constant(i, expr, **kwargs):
         from pymbolic.mapper.evaluator import UnknownVariableError
-        from loopy.codegen import Unvectorizable
         # determine error type -- if vectorization_info is None, we're in the
         # unvec fallback (and should raise a LoopyError)
         # if vectorization_info is not None, we should raise an Unvectorizable
         # on failure
         error_type = LoopyError if vectorization_info is None else Unvectorizable
         try:
-            result = eval_expr(expr)
+            result = eval_expr(expr, **kwargs)
         except UnknownVariableError as e:
             err_msg = ("When trying to index the array '%s' along axis "
                        "%d (tagged '%s'), the index was not a compile-time "
@@ -1339,13 +1339,35 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
                 pass
 
             else:
-                # if vectorization_info is not None:
-
-
-                idx = eval_expr_assert_integer_constant(i, idx)
-
-                assert vector_index is None
-                vector_index = idx
+                if vectorization_info is not None:
+                    # check dependencies
+                    from loopy.symbolic import get_dependencies
+                    deps = get_dependencies(idx)
+                    if len(deps) == 1 and vectorization_info.iname in deps:
+                        # we depend only on the vectorized iname -- see if we can
+                        # simplify to a load / shuffle
+                        evaled = []
+                        for vec_i in range(vector_size):
+                            try:
+                                evaled.append(eval_expr_assert_integer_constant(
+                                    i, idx, **{vectorization_info.iname: vec_i}))
+                            except Unvectorizable:
+                                break
+
+                        seval = sorted(evaled)
+                        if len(evaled) == vector_size and (
+                                seval[-1] - seval[0] + 1) == vector_size:
+                            # we can generate a load or shuffle depending on the
+                            # alignment
+                            if seval[0] == 0:
+                                vector_index = ('shuffle', evaled)
+                            else:
+                                vector_index = ('load', evaled)
+
+                if vector_index is None:
+                    # if we haven't generated a load of shuffle...
+                    idx = eval_expr_assert_integer_constant(i, idx)
+                    vector_index = idx
 
         else:
             raise LoopyError("unsupported array dim implementation tag '%s' "
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index caee73eb1..0c7815c5a 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -189,8 +189,14 @@ class ExpressionToCExpressionMapper(IdentityMapper):
         index_tuple = tuple(
                 simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)
 
+        def eval_expr_overloadable(expr, **extra_kwds):
+            # load any extra substitutions supplied in get_access_info
+            var_subst_map = self.codegen_state.var_subst_map.copy()
+            var_subst_map.update(**extra_kwds)
+            return evaluate(expr, var_subst_map)
+
         access_info = get_access_info(self.kernel.target, ary, index_tuple,
-                lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
+                eval_expr_overloadable,
                 self.codegen_state.vectorization_info)
 
         from loopy.kernel.data import (
-- 
GitLab


From a85b99c383776f38a824b2e8199d3caf93458deb Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 2 Feb 2018 12:23:10 -0500
Subject: [PATCH 014/144] simply pass a copy of the context dict into
 get_access_info

---
 loopy/kernel/array.py                |  8 ++++++--
 loopy/target/c/codegen/expression.py | 12 ++----------
 loopy/target/ispc.py                 |  7 ++-----
 3 files changed, 10 insertions(+), 17 deletions(-)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 908ad42e8..27d7ab96a 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1216,10 +1216,12 @@ class AccessInfo(ImmutableRecord):
     """
 
 
-def get_access_info(target, ary, index, eval_expr, vectorization_info):
+def get_access_info(target, ary, index, var_subst_map, vectorization_info):
     """
     :arg ary: an object of type :class:`ArrayBase`
     :arg index: a tuple of indices representing a subscript into ary
+    :arg var_subst_map: a context of variable substitutions from the calling codegen
+        state
     :arg vectorization_info: an instance of :class:`loopy.codegen.VectorizationInfo`,
         or *None*.
     """
@@ -1227,6 +1229,7 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
     import loopy as lp
     from pymbolic import var
     from loopy.codegen import Unvectorizable
+    from loopy.symbolic import get_dependencies
 
     def eval_expr_assert_integer_constant(i, expr, **kwargs):
         from pymbolic.mapper.evaluator import UnknownVariableError
@@ -1235,8 +1238,9 @@ def get_access_info(target, ary, index, eval_expr, vectorization_info):
         # if vectorization_info is not None, we should raise an Unvectorizable
         # on failure
         error_type = LoopyError if vectorization_info is None else Unvectorizable
+        from pymbolic import evaluate
         try:
-            result = eval_expr(expr, **kwargs)
+            result = evaluate(expr, kwargs)
         except UnknownVariableError as e:
             err_msg = ("When trying to index the array '%s' along axis "
                        "%d (tagged '%s'), the index was not a compile-time "
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 0c7815c5a..d42e02222 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -183,20 +183,13 @@ class ExpressionToCExpressionMapper(IdentityMapper):
         ary = self.find_array(expr)
 
         from loopy.kernel.array import get_access_info
-        from pymbolic import evaluate
 
         from loopy.symbolic import simplify_using_aff
         index_tuple = tuple(
                 simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)
 
-        def eval_expr_overloadable(expr, **extra_kwds):
-            # load any extra substitutions supplied in get_access_info
-            var_subst_map = self.codegen_state.var_subst_map.copy()
-            var_subst_map.update(**extra_kwds)
-            return evaluate(expr, var_subst_map)
-
         access_info = get_access_info(self.kernel.target, ary, index_tuple,
-                eval_expr_overloadable,
+                self.codegen_state.var_subst_map.copy(),
                 self.codegen_state.vectorization_info)
 
         from loopy.kernel.data import (
@@ -404,9 +397,8 @@ class ExpressionToCExpressionMapper(IdentityMapper):
             ary = self.find_array(arg)
 
             from loopy.kernel.array import get_access_info
-            from pymbolic import evaluate
             access_info = get_access_info(self.kernel.target, ary, arg.index,
-                    lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
+                    self.codegen_state.var_subst_map.copy(),
                     self.codegen_state.vectorization_info)
 
             from loopy.kernel.data import ImageArg
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 45a59847b..89b32238c 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -109,10 +109,9 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper):
             if lsize:
                 lsize, = lsize
                 from loopy.kernel.array import get_access_info
-                from pymbolic import evaluate
 
                 access_info = get_access_info(self.kernel.target, ary, expr.index,
-                    lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
+                    self.codegen_state.var_subst_map.copy(),
                     self.codegen_state.vectorization_info)
 
                 subscript, = access_info.subscripts
@@ -390,14 +389,12 @@ class ISPCASTBuilder(CASTBuilder):
             ary = ecm.find_array(lhs)
 
             from loopy.kernel.array import get_access_info
-            from pymbolic import evaluate
-
             from loopy.symbolic import simplify_using_aff
             index_tuple = tuple(
                     simplify_using_aff(kernel, idx) for idx in lhs.index_tuple)
 
             access_info = get_access_info(kernel.target, ary, index_tuple,
-                    lambda expr: evaluate(expr, self.codegen_state.var_subst_map),
+                    self.codegen_state.var_subst_map.copy(),
                     codegen_state.vectorization_info)
 
             from loopy.kernel.data import GlobalArg, TemporaryVariable
-- 
GitLab


From 442ff69f2837e6deba78caa431d45764b030808a Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 2 Feb 2018 12:23:51 -0500
Subject: [PATCH 015/144] add first attempt at detecting shuffle / load &
 corresponding node

---
 loopy/kernel/array.py                | 65 +++++++++++++++++++++-------
 loopy/target/c/codegen/expression.py | 14 ++++++
 loopy/target/opencl.py               | 20 ++++++++-
 3 files changed, 82 insertions(+), 17 deletions(-)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 27d7ab96a..31920b577 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1312,6 +1312,28 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
 
     # }}}
 
+    def is_contiguous(arr):
+        sarr = sorted(arr)
+        return len(arr) == vector_size and (sarr[-1] - sarr[0] + 1) == vector_size
+
+    def is_monotonic(arr):
+        # check if array is monotonic increasing / decreasing
+        signs = [(arr[i + 1] - arr[i]) < 0 for i in range(len(arr) - 1)]
+        return all(s == signs[0] for s in signs[1:])
+
+    def run_over_vecrange(i, idx, base_subs):
+        evaled = []
+        for veci in range(vector_size):
+            try:
+                subsi = base_subs.copy()
+                subsi[vectorization_info.iname] = veci
+                evaled.append(eval_expr_assert_integer_constant(i, idx, **subsi))
+            except Unvectorizable:
+                pass
+        return evaled
+
+    vec_op_type = None
+
     # {{{ process remaining dim tags
 
     for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)):
@@ -1328,6 +1350,30 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
             elif stride is lp.auto:
                 stride = var(array_name + "_stride%d" % i)
 
+            if vectorization_info is not None and \
+                    vectorization_info.iname in get_dependencies(idx):
+                # need to determine here whether the vector iname is aligned with
+                # the vector size -> shuffle, or unaligned -> load
+
+                # TODO: need some way to pass in other inames here, such that
+                # we only eliminate truly "known" quantities
+                subs = {x: 0 for x in get_dependencies(idx)
+                        if x != vectorization_info.iname}
+                evaled = run_over_vecrange(i, idx, subs)
+                if is_monotonic(evaled):
+                    vec_op_type = 'shuffle' if all(x == evaled[0] for x in evaled) \
+                        else 'load'
+
+                    # update vector operation type if necessary
+                    if vector_index is not None and isinstance(vector_index, tuple):
+                        assert vector_index[0] is None
+                        vector_index = (vec_op_type, vector_index[1])
+                else:
+                    raise Unvectorizable('Vectorized iname %s present in '
+                        'unvectorized axis %s (1-based) access "%s", and not '
+                        'simplifiable to compile-time contigous access' % (
+                            vectorization_info.iname, i + 1, idx))
+
             subscripts[dim_tag.target_axis] += (stride // vector_size)*idx
 
         elif isinstance(dim_tag, SeparateArrayArrayDimTag):
@@ -1345,28 +1391,15 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
             else:
                 if vectorization_info is not None:
                     # check dependencies
-                    from loopy.symbolic import get_dependencies
                     deps = get_dependencies(idx)
                     if len(deps) == 1 and vectorization_info.iname in deps:
                         # we depend only on the vectorized iname -- see if we can
                         # simplify to a load / shuffle
-                        evaled = []
-                        for vec_i in range(vector_size):
-                            try:
-                                evaled.append(eval_expr_assert_integer_constant(
-                                    i, idx, **{vectorization_info.iname: vec_i}))
-                            except Unvectorizable:
-                                break
-
-                        seval = sorted(evaled)
-                        if len(evaled) == vector_size and (
-                                seval[-1] - seval[0] + 1) == vector_size:
+                        evaled = run_over_vecrange(i, idx, {})
+                        if is_contiguous(evaled):
                             # we can generate a load or shuffle depending on the
                             # alignment
-                            if seval[0] == 0:
-                                vector_index = ('shuffle', evaled)
-                            else:
-                                vector_index = ('load', evaled)
+                            vector_index = (vec_op_type, evaled)
 
                 if vector_index is None:
                     # if we haven't generated a load of shuffle...
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index d42e02222..a4dff62e3 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -244,6 +244,20 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                 result = make_var(access_info.array_name)[self.rec(subscript, 'i')]
 
             if access_info.vector_index is not None:
+                if isinstance(access_info.vector_index, tuple):
+                    # check for specific vector access nodes
+                    try:
+                        method, ind = access_info.vector_index
+                        method = getattr(self.codegen_state.ast_builder,
+                                          'add_vector_%s' % method)
+                        return method(result, ind)
+                    except AttributeError:
+                        from loopy.codegen import Unvectorizable
+                        raise Unvectorizable('Target %s has no map node for '
+                            'method add_vector_%s' % (
+                                str(type(self.codegen_state.ast_builder)),
+                                method))
+
                 return self.codegen_state.ast_builder.add_vector_access(
                     result, access_info.vector_index)
             else:
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 2ee7006a5..804e75fd5 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -440,7 +440,25 @@ class OpenCLCASTBuilder(CASTBuilder):
 
     def add_vector_access(self, access_expr, index):
         # The 'int' avoids an 'L' suffix for long ints.
-        return access_expr.attr("s%s" % hex(int(index))[2:])
+        def __map(ind, use_prefix=True):
+            strmap = 's%s' if use_prefix else '%s'
+            start = 2
+            return strmap % hex(int(ind))[start:]
+        try:
+            lookup = ''
+            for i, ind in enumerate(index):
+                lookup += __map(ind, not i)
+        except TypeError:
+            # not iterable
+            lookup = __map(index)
+        return access_expr.attr(lookup)
+
+    def add_vector_shuffle(self, access_expr, index):
+        # this can simply call a vector access with the index list
+        return self.add_vector_access(access_expr, index)
+
+    def add_vector_load(self, access_expr, index):
+        raise NotImplementedError()
 
     def emit_barrier(self, synchronization_kind, mem_kind, comment):
         """
-- 
GitLab


From 2b87aa8b2d370b7922f2abb801b96b1731c4c29f Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 2 Feb 2018 12:48:25 -0500
Subject: [PATCH 016/144] fix vecload / shuffle -- still issue with 'invalid
 foreign object'

---
 loopy/target/c/codegen/expression.py |  4 +++-
 loopy/target/opencl.py               | 16 +++++++++++++---
 2 files changed, 16 insertions(+), 4 deletions(-)

diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index a4dff62e3..b3550a9ac 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -250,7 +250,9 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                         method, ind = access_info.vector_index
                         method = getattr(self.codegen_state.ast_builder,
                                           'add_vector_%s' % method)
-                        return method(result, ind)
+                        return method(result,
+                                      self.codegen_state.vectorization_info.iname,
+                                      ary, ind)
                     except AttributeError:
                         from loopy.codegen import Unvectorizable
                         raise Unvectorizable('Target %s has no map node for '
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 804e75fd5..0f9ce188b 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -453,12 +453,22 @@ class OpenCLCASTBuilder(CASTBuilder):
             lookup = __map(index)
         return access_expr.attr(lookup)
 
-    def add_vector_shuffle(self, access_expr, index):
+    def add_vector_shuffle(self, access_expr, vec_iname, array, index):
         # this can simply call a vector access with the index list
         return self.add_vector_access(access_expr, index)
 
-    def add_vector_load(self, access_expr, index):
-        raise NotImplementedError()
+    def add_vector_load(self, access_expr, vec_iname, array, index):
+        from pymbolic import substitute
+        # get ctype for casting
+        ctype = self.target.get_dtype_registry().dtype_to_ctype(array.dtype)
+        # get size of load in bytes
+        size = array.dtype.itemsize * len(index)
+        # and finally, convert the vector access expression to an index based expr
+        # such that we can take the index
+        # to do so, we substitute the vector iname -> 0 to eliminate any term
+        # involving it, and then substitute the first pre-computed index term
+        access_expr = substitute(access_expr, {vec_iname: 0}) + index[0]
+        return 'vload%i(%i, &((%s*)%s))' % (len(index), size, ctype, access_expr)
 
     def emit_barrier(self, synchronization_kind, mem_kind, comment):
         """
-- 
GitLab


From e2553f908bdd7e095d80e167d6b8cd671e7466f6 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Mon, 5 Feb 2018 17:56:08 -0500
Subject: [PATCH 017/144] refine vectorization check to include compile-time
 integer constants

---
 loopy/expression.py | 23 ++++++++++++++++++++++-
 1 file changed, 22 insertions(+), 1 deletion(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index 8c2c61d95..33b14acc3 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -138,6 +138,27 @@ class VectorizabilityChecker(RecursiveMapper):
                 if self.vec_iname in set(x.name for x in deps):
                     # check whether we can simplify out the vector iname
                     context = {x: x for x in deps if x.name != self.vec_iname}
+
+                    # determine allowed symbols as non-vector inames
+                    allowed_symbols = set(sym for sym in self.kernel.iname_to_tag
+                                          if sym != self.vec_iname)
+                    from loopy.kernel.instruction import Assignment
+                    from loopy.tools import is_integer
+                    from six import iteritems
+
+                    # and compile time integer temporaries
+                    compile_time_assign = set([
+                        str(insn.assignee) for insn in self.kernel.instructions if
+                        isinstance(insn, Assignment) and is_integer(
+                            insn.expression)])
+                    allowed_symbols.update(
+                        set(sym for sym, var in iteritems(
+                                self.kernel.temporary_variables)
+                            # temporary variables w/ no initializer, no shape
+                            if var.initializer is None and not var.shape
+                            # compile time integers
+                            and sym in compile_time_assign)
+                            )
                     from pymbolic import substitute
                     from loopy.tools import is_integer
                     for veci in range(self.vec_iname_length):
@@ -146,7 +167,7 @@ class VectorizabilityChecker(RecursiveMapper):
                         try:
                             idi = substitute(index[i], ncontext)
                             if not is_integer(idi) and not all(
-                                    x in self.kernel.iname_to_tag
+                                    x in allowed_symbols
                                     for x in get_dependencies(idi)):
                                 raise Unvectorizable(
                                     "vectorizing iname '%s' occurs in "
-- 
GitLab


From 6aa16ed828b2acc0bbdb12d39b3beaddccdd09e6 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Mon, 5 Feb 2018 18:36:56 -0500
Subject: [PATCH 018/144] move towards compile-time constants

---
 loopy/expression.py                  | 57 ++++++++++++++++++----------
 loopy/kernel/array.py                | 14 ++++---
 loopy/target/c/codegen/expression.py |  7 +++-
 test/test_loopy.py                   |  3 ++
 4 files changed, 55 insertions(+), 26 deletions(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index 33b14acc3..a468616b9 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -97,6 +97,41 @@ class VectorizabilityChecker(RecursiveMapper):
 
         return False
 
+    @staticmethod
+    def allowed_non_vecdim_dependencies(kernel, vec_iname):
+        """
+        Returns the dictionary of non-vector inames and compile time constants
+        mapped to their 'value' (themselves in case of iname, integer value in case
+        of constant)
+
+        .. attribute:: kernel
+            The kernel to check
+        .. attribute:: vec_iname
+            the vector iname
+        """
+
+        # determine allowed symbols as non-vector inames
+        from pymbolic.primitives import Variable
+        allowed_symbols = {sym: Variable(sym) for sym in kernel.iname_to_tag
+                           if sym != vec_iname}
+        from loopy.kernel.instruction import Assignment
+        from loopy.tools import is_integer
+        from six import iteritems
+
+        # and compile time integer temporaries
+        compile_time_assign = {str(insn.assignee): insn.expression
+            for insn in kernel.instructions if
+            isinstance(insn, Assignment) and is_integer(
+                insn.expression)}
+        allowed_symbols.update(
+            {sym: compile_time_assign[sym] for sym, var in iteritems(
+                    kernel.temporary_variables)
+                # temporary variables w/ no initializer, no shape
+                if var.initializer is None and not var.shape
+                # compile time integers
+                and sym in compile_time_assign})
+        return allowed_symbols
+
     def map_subscript(self, expr):
         name = expr.aggregate.name
 
@@ -138,27 +173,9 @@ class VectorizabilityChecker(RecursiveMapper):
                 if self.vec_iname in set(x.name for x in deps):
                     # check whether we can simplify out the vector iname
                     context = {x: x for x in deps if x.name != self.vec_iname}
+                    allowed_symbols = self.allowed_non_vecdim_dependencies(
+                        self.kernel, self.vec_iname)
 
-                    # determine allowed symbols as non-vector inames
-                    allowed_symbols = set(sym for sym in self.kernel.iname_to_tag
-                                          if sym != self.vec_iname)
-                    from loopy.kernel.instruction import Assignment
-                    from loopy.tools import is_integer
-                    from six import iteritems
-
-                    # and compile time integer temporaries
-                    compile_time_assign = set([
-                        str(insn.assignee) for insn in self.kernel.instructions if
-                        isinstance(insn, Assignment) and is_integer(
-                            insn.expression)])
-                    allowed_symbols.update(
-                        set(sym for sym, var in iteritems(
-                                self.kernel.temporary_variables)
-                            # temporary variables w/ no initializer, no shape
-                            if var.initializer is None and not var.shape
-                            # compile time integers
-                            and sym in compile_time_assign)
-                            )
                     from pymbolic import substitute
                     from loopy.tools import is_integer
                     for veci in range(self.vec_iname_length):
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 31920b577..e8cb52030 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1216,7 +1216,8 @@ class AccessInfo(ImmutableRecord):
     """
 
 
-def get_access_info(target, ary, index, var_subst_map, vectorization_info):
+def get_access_info(target, ary, index, var_subst_map, vectorization_info,
+                    compile_time_constants):
     """
     :arg ary: an object of type :class:`ArrayBase`
     :arg index: a tuple of indices representing a subscript into ary
@@ -1224,6 +1225,8 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
         state
     :arg vectorization_info: an instance of :class:`loopy.codegen.VectorizationInfo`,
         or *None*.
+    :arg compile_time_constants: a set of compile time "constants" (inames and
+        integer temporaries w/ known values), used in detection of loads / shuffles
     """
 
     import loopy as lp
@@ -1239,6 +1242,7 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
         # on failure
         error_type = LoopyError if vectorization_info is None else Unvectorizable
         from pymbolic import evaluate
+        from pymbolic.primitives import Remainder
         try:
             result = evaluate(expr, kwargs)
         except UnknownVariableError as e:
@@ -1252,11 +1256,11 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
                 err_msg += "You likely want to unroll the iname(s) '%s'" % str(e)
             raise error_type(err_msg)
 
-        if not is_integer(result):
+        if not (is_integer(result) or (isinstance(result, Remainder) and
+                is_integer(result.denominator))):
             raise error_type("subscript '%s[%s]' has non-constant "
                     "index for separate-array axis %d (0-based)" % (
                         ary.name, index, i))
-
         return result
 
     def apply_offset(sub):
@@ -1391,11 +1395,11 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
             else:
                 if vectorization_info is not None:
                     # check dependencies
-                    deps = get_dependencies(idx)
+                    deps = get_dependencies(idx) - set(compile_time_constants.keys())
                     if len(deps) == 1 and vectorization_info.iname in deps:
                         # we depend only on the vectorized iname -- see if we can
                         # simplify to a load / shuffle
-                        evaled = run_over_vecrange(i, idx, {})
+                        evaled = run_over_vecrange(i, idx, compile_time_constants)
                         if is_contiguous(evaled):
                             # we can generate a load or shuffle depending on the
                             # alignment
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index b3550a9ac..868ed4105 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -188,9 +188,14 @@ class ExpressionToCExpressionMapper(IdentityMapper):
         index_tuple = tuple(
                 simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)
 
+        from loopy.expression import VectorizabilityChecker
         access_info = get_access_info(self.kernel.target, ary, index_tuple,
                 self.codegen_state.var_subst_map.copy(),
-                self.codegen_state.vectorization_info)
+                self.codegen_state.vectorization_info,
+                VectorizabilityChecker.allowed_non_vecdim_dependencies(
+                    self.codegen_state.kernel,
+                    self.codegen_state.vectorization_info.iname)
+                )
 
         from loopy.kernel.data import (
                 ImageArg, GlobalArg, TemporaryVariable, ConstantArg)
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 30631fc90..13a8521cf 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2814,6 +2814,9 @@ def test_explicit_simd_shuffles(ctx_factory):
                     'C'),
             answer)
 
+    # test w/ compile time temporary constant
+    create_and_test("<>c = 2\n" +
+                    "a[j, i] = a[j, i] + b[j, i + c]")
     create_and_test("a[j, i] = b[j, i + 2]")
     create_and_test("a[j, i] = b[j, i + 2] + a[j, i]")
     create_and_test("a[j, i] = a[j, i] + b[j, i + 2]")
-- 
GitLab


From 6dcc1a42f5edc23ad3ad87ab3b3fddea5ca881d5 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Mon, 5 Feb 2018 18:47:32 -0500
Subject: [PATCH 019/144] use simplify_via_aff

---
 loopy/kernel/array.py | 16 +++++++---------
 1 file changed, 7 insertions(+), 9 deletions(-)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index e8cb52030..9210a7768 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1242,7 +1242,6 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info,
         # on failure
         error_type = LoopyError if vectorization_info is None else Unvectorizable
         from pymbolic import evaluate
-        from pymbolic.primitives import Remainder
         try:
             result = evaluate(expr, kwargs)
         except UnknownVariableError as e:
@@ -1256,8 +1255,12 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info,
                 err_msg += "You likely want to unroll the iname(s) '%s'" % str(e)
             raise error_type(err_msg)
 
-        if not (is_integer(result) or (isinstance(result, Remainder) and
-                is_integer(result.denominator))):
+        if not is_integer(result):
+            # try to simplify further
+            from loopy.isl_helpers import simplify_via_aff
+            result = simplify_via_aff(result)
+
+        if not is_integer(result):
             raise error_type("subscript '%s[%s]' has non-constant "
                     "index for separate-array axis %d (0-based)" % (
                         ary.name, index, i))
@@ -1358,12 +1361,7 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info,
                     vectorization_info.iname in get_dependencies(idx):
                 # need to determine here whether the vector iname is aligned with
                 # the vector size -> shuffle, or unaligned -> load
-
-                # TODO: need some way to pass in other inames here, such that
-                # we only eliminate truly "known" quantities
-                subs = {x: 0 for x in get_dependencies(idx)
-                        if x != vectorization_info.iname}
-                evaled = run_over_vecrange(i, idx, subs)
+                evaled = run_over_vecrange(i, idx, compile_time_constants)
                 if is_monotonic(evaled):
                     vec_op_type = 'shuffle' if all(x == evaled[0] for x in evaled) \
                         else 'load'
-- 
GitLab


From 1493067695ff0466f2c94173e40807fb77cf9682 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Mon, 5 Feb 2018 18:50:48 -0500
Subject: [PATCH 020/144] convert assigns to stores

---
 loopy/codegen/instruction.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index e590502fb..ffcefef25 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -113,6 +113,9 @@ def generate_assignment_instruction_code(codegen_state, insn):
         vcheck = VectorizabilityChecker(
                 kernel, vinfo.iname, vinfo.length)
         lhs_is_vector = vcheck(insn.assignee)
+        if isinstance(lhs_is_vector, tuple) and lhs_is_vector[0] == 'load':
+            # convert vector 'load' assignes to stores
+            lhs_is_vector = 'store'
         rhs_is_vector = vcheck(insn.expression)
 
         if not lhs_is_vector and rhs_is_vector:
-- 
GitLab


From 75d6f6e9a69ee81fdd5f9709a0dceae1234277e7 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Mon, 5 Feb 2018 18:56:21 -0500
Subject: [PATCH 021/144] accept compile-time constants

---
 loopy/kernel/array.py | 24 +++++++++++++++++-------
 1 file changed, 17 insertions(+), 7 deletions(-)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 9210a7768..38e45856f 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1234,7 +1234,7 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info,
     from loopy.codegen import Unvectorizable
     from loopy.symbolic import get_dependencies
 
-    def eval_expr_assert_integer_constant(i, expr, **kwargs):
+    def eval_expr_assert_constant(i, expr, **kwargs):
         from pymbolic.mapper.evaluator import UnknownVariableError
         # determine error type -- if vectorization_info is None, we're in the
         # unvec fallback (and should raise a LoopyError)
@@ -1260,7 +1260,7 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info,
             from loopy.isl_helpers import simplify_via_aff
             result = simplify_via_aff(result)
 
-        if not is_integer(result):
+        if any([x not in compile_time_constants for x in get_dependencies(result)]):
             raise error_type("subscript '%s[%s]' has non-constant "
                     "index for separate-array axis %d (0-based)" % (
                         ary.name, index, i))
@@ -1314,18 +1314,28 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info,
 
     for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)):
         if isinstance(dim_tag, SeparateArrayArrayDimTag):
-            idx = eval_expr_assert_integer_constant(i, idx)
+            idx = eval_expr_assert_constant(i, idx)
             array_name += "_s%d" % idx
 
     # }}}
 
+    def __get_simplified(arr):
+        from loopy.isl_helpers import simplify_via_aff
+        return [simplify_via_aff(arr[i]) for i in range(len(arr))]
+
     def is_contiguous(arr):
+        if not len(arr):
+            return False
         sarr = sorted(arr)
         return len(arr) == vector_size and (sarr[-1] - sarr[0] + 1) == vector_size
 
     def is_monotonic(arr):
+        if not len(arr):
+            return False
+        signs = __get_simplified(
+            [arr[i + 1] - arr[i] for i in range(len(arr) - 1)])
         # check if array is monotonic increasing / decreasing
-        signs = [(arr[i + 1] - arr[i]) < 0 for i in range(len(arr) - 1)]
+        signs = [x < 0 for x in signs]
         return all(s == signs[0] for s in signs[1:])
 
     def run_over_vecrange(i, idx, base_subs):
@@ -1334,10 +1344,10 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info,
             try:
                 subsi = base_subs.copy()
                 subsi[vectorization_info.iname] = veci
-                evaled.append(eval_expr_assert_integer_constant(i, idx, **subsi))
+                evaled.append(eval_expr_assert_constant(i, idx, **subsi))
             except Unvectorizable:
                 pass
-        return evaled
+        return __get_simplified(evaled)
 
     vec_op_type = None
 
@@ -1405,7 +1415,7 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info,
 
                 if vector_index is None:
                     # if we haven't generated a load of shuffle...
-                    idx = eval_expr_assert_integer_constant(i, idx)
+                    idx = eval_expr_assert_constant(i, idx)
                     vector_index = idx
 
         else:
-- 
GitLab


From 62f13492ef85526ab6c4f718d1dba170d7955a1b Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Mon, 5 Feb 2018 19:21:57 -0500
Subject: [PATCH 022/144] add check for fixed non-vec iname index

---
 loopy/kernel/array.py | 15 ++++++++++-----
 1 file changed, 10 insertions(+), 5 deletions(-)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 38e45856f..251458051 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1375,16 +1375,18 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info,
                 if is_monotonic(evaled):
                     vec_op_type = 'shuffle' if all(x == evaled[0] for x in evaled) \
                         else 'load'
-
-                    # update vector operation type if necessary
-                    if vector_index is not None and isinstance(vector_index, tuple):
-                        assert vector_index[0] is None
-                        vector_index = (vec_op_type, vector_index[1])
                 else:
                     raise Unvectorizable('Vectorized iname %s present in '
                         'unvectorized axis %s (1-based) access "%s", and not '
                         'simplifiable to compile-time contigous access' % (
                             vectorization_info.iname, i + 1, idx))
+            elif vectorization_info is not None:
+                vec_op_type = 'shuffle'  # independent of vector iname
+
+            # update vector operation type if necessary
+            if vector_index is not None and isinstance(vector_index, tuple):
+                assert vector_index[0] is None
+                vector_index = (vec_op_type, vector_index[1])
 
             subscripts[dim_tag.target_axis] += (stride // vector_size)*idx
 
@@ -1432,6 +1434,9 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info,
 
         subscripts[0] = apply_offset(subscripts[0])
 
+    if isinstance(vector_index, tuple):
+        assert vector_index[0] is not None, 'Unknown vectorization type'
+
     return AccessInfo(
             array_name=array_name,
             vector_index=vector_index,
-- 
GitLab


From faaf15586a338f539543d10ab5a3b9b9e671f9ff Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Mon, 5 Feb 2018 19:22:23 -0500
Subject: [PATCH 023/144] convert both c expression get_access_info's to pass
 compile time constants

---
 loopy/target/c/codegen/expression.py | 19 ++++++++++++++-----
 1 file changed, 14 insertions(+), 5 deletions(-)

diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 868ed4105..bb847b807 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -189,13 +189,15 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                 simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)
 
         from loopy.expression import VectorizabilityChecker
+        ctc = {}
+        if self.codegen_state.vectorization_info is not None:
+            ctc = VectorizabilityChecker.allowed_non_vecdim_dependencies(
+                    self.codegen_state.kernel,
+                    self.codegen_state.vectorization_info.iname)
         access_info = get_access_info(self.kernel.target, ary, index_tuple,
                 self.codegen_state.var_subst_map.copy(),
                 self.codegen_state.vectorization_info,
-                VectorizabilityChecker.allowed_non_vecdim_dependencies(
-                    self.codegen_state.kernel,
-                    self.codegen_state.vectorization_info.iname)
-                )
+                ctc)
 
         from loopy.kernel.data import (
                 ImageArg, GlobalArg, TemporaryVariable, ConstantArg)
@@ -418,9 +420,16 @@ class ExpressionToCExpressionMapper(IdentityMapper):
             ary = self.find_array(arg)
 
             from loopy.kernel.array import get_access_info
+            ctc = {}
+            if self.codegen_state.vectorization_info is not None:
+                from loopy.expression import VectorizabilityChecker
+                ctc = VectorizabilityChecker.allowed_non_vecdim_dependencies(
+                        self.codegen_state.kernel,
+                        self.codegen_state.vectorization_info.iname)
             access_info = get_access_info(self.kernel.target, ary, arg.index,
                     self.codegen_state.var_subst_map.copy(),
-                    self.codegen_state.vectorization_info)
+                    self.codegen_state.vectorization_info,
+                    ctc)
 
             from loopy.kernel.data import ImageArg
             if isinstance(ary, ImageArg):
-- 
GitLab


From 20b2341530a9b8aacd24b33dcf555b6098127321 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 6 Feb 2018 09:38:57 -0500
Subject: [PATCH 024/144] add first pass at vload / vstore manglers

---
 loopy/target/opencl.py | 31 +++++++++++++++++++++++++++++--
 1 file changed, 29 insertions(+), 2 deletions(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 0f9ce188b..81ecef3b0 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -147,8 +147,9 @@ _CL_SIMPLE_MULTI_ARG_FUNCTIONS = {
         }
 
 
-VECTOR_LITERAL_FUNCS = dict(
-        ("make_%s%d" % (name, count), (name, dtype, count))
+def get_vector_func(func, template):
+    return dict(
+        (template % dict(func=func, name=name, count=count), (name, dtype, count))
         for name, dtype in [
             ('char', np.int8),
             ('uchar', np.uint8),
@@ -165,6 +166,11 @@ VECTOR_LITERAL_FUNCS = dict(
         )
 
 
+VECTOR_LITERAL_FUNCS = get_vector_func('make', '%(func)s_%(name)s%(count)d')
+VECTOR_STORE_FUNCS = get_vector_func('vstore', '%(func)s%(count)d')
+VECTOR_LOAD_FUNCS = get_vector_func('vload', '%(func)s%(count)d')
+
+
 def opencl_function_mangler(kernel, name, arg_dtypes):
     if not isinstance(name, str):
         return None
@@ -219,6 +225,27 @@ def opencl_function_mangler(kernel, name, arg_dtypes):
                     NumpyType(dtype), count),),
                 arg_dtypes=(NumpyType(dtype),)*count)
 
+    if name in VECTOR_LOAD_FUNCS or name in VECTOR_STORE_FUNCS:
+        if name in VECTOR_LOAD_FUNCS:
+            load = True
+            _, dtype, count = VECTOR_LOAD_FUNCS[name]
+            result = (kernel.target.vector_dtype(NumpyType(dtype), count),)
+            args = (kernel.index_dtype, NumpyType(dtype))
+        else:
+            load = False
+            _, dtype, count = VECTOR_STORE_FUNCS[name]
+            result = tuple()
+            args = (kernel.target.vector_dtype(NumpyType(dtype), count),
+                    kernel.index_dtype, NumpyType(dtype))
+
+        if not ((count == 2 and load) or (count == 3 and not load)):
+            return None
+
+        return CallMangleInfo(
+                target_name=name,
+                result_dtypes=result,
+                arg_dtypes=args)
+
     return None
 
 # }}}
-- 
GitLab


From 55414e328c1e56a6855c93cbcd3989e2cade5e50 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 6 Feb 2018 09:39:18 -0500
Subject: [PATCH 025/144] convert to pymbolic call

---
 loopy/target/opencl.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 81ecef3b0..1f2b991e9 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -495,7 +495,10 @@ class OpenCLCASTBuilder(CASTBuilder):
         # to do so, we substitute the vector iname -> 0 to eliminate any term
         # involving it, and then substitute the first pre-computed index term
         access_expr = substitute(access_expr, {vec_iname: 0}) + index[0]
-        return 'vload%i(%i, &((%s*)%s))' % (len(index), size, ctype, access_expr)
+        # and stringify
+        access_expr = '&((%s*)%s)' % (ctype, access_expr)
+        from pymbolic.primitives import Call
+        return Call('vload%d' % len(index), (size, access_expr))
 
     def emit_barrier(self, synchronization_kind, mem_kind, comment):
         """
-- 
GitLab


From 6e901e099213681bc1e7f3ec9a7d43c4805888c0 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 6 Feb 2018 10:34:15 -0500
Subject: [PATCH 026/144] convert to variable call

---
 loopy/target/opencl.py | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 1f2b991e9..76b63de5b 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -497,8 +497,9 @@ class OpenCLCASTBuilder(CASTBuilder):
         access_expr = substitute(access_expr, {vec_iname: 0}) + index[0]
         # and stringify
         access_expr = '&((%s*)%s)' % (ctype, access_expr)
-        from pymbolic.primitives import Call
-        return Call('vload%d' % len(index), (size, access_expr))
+        from pymbolic.primitives import Call, Variable, Expression
+        return Call(Variable('vload%d' % len(index)), (
+            size, Expression(access_expr)))
 
     def emit_barrier(self, synchronization_kind, mem_kind, comment):
         """
-- 
GitLab


From 7828ec9c228bc24d751b1fd15bb734a152b221bf Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 13 Feb 2018 19:14:24 -0500
Subject: [PATCH 027/144] d'oh, fix the pymbolification

---
 loopy/target/opencl.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 76b63de5b..1cf8d0937 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -494,12 +494,14 @@ class OpenCLCASTBuilder(CASTBuilder):
         # such that we can take the index
         # to do so, we substitute the vector iname -> 0 to eliminate any term
         # involving it, and then substitute the first pre-computed index term
-        access_expr = substitute(access_expr, {vec_iname: 0}) + index[0]
+        access_expr = str(substitute(access_expr, {vec_iname: 0}) + index[0])
+        # and finally remove the array name
+        access_expr = access_expr[access_expr.index(array.name) + len(array.name):]
         # and stringify
-        access_expr = '&((%s*)%s)' % (ctype, access_expr)
-        from pymbolic.primitives import Call, Variable, Expression
+        access_expr = '&(((%s*)%s)%s' % (ctype, array.name, access_expr)
+        from pymbolic.primitives import Call, Variable
         return Call(Variable('vload%d' % len(index)), (
-            size, Expression(access_expr)))
+            Variable(str(size)), Variable(access_expr)))
 
     def emit_barrier(self, synchronization_kind, mem_kind, comment):
         """
-- 
GitLab


From 487e2be5424772890436e25816d7d45db63214c6 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 13 Feb 2018 19:22:42 -0500
Subject: [PATCH 028/144] fix missing closing parenthesis

---
 loopy/target/opencl.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 1cf8d0937..1851ffdc1 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -498,7 +498,7 @@ class OpenCLCASTBuilder(CASTBuilder):
         # and finally remove the array name
         access_expr = access_expr[access_expr.index(array.name) + len(array.name):]
         # and stringify
-        access_expr = '&(((%s*)%s)%s' % (ctype, array.name, access_expr)
+        access_expr = '&(((%s*)%s)%s)' % (ctype, array.name, access_expr)
         from pymbolic.primitives import Call, Variable
         return Call(Variable('vload%d' % len(index)), (
             Variable(str(size)), Variable(access_expr)))
-- 
GitLab


From 3c9a28471f390e5dcbb5db008dc5737f2900b087 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 14 Feb 2018 08:53:13 -0500
Subject: [PATCH 029/144] fix cast & subst

---
 loopy/target/opencl.py | 23 ++++++++++-------------
 1 file changed, 10 insertions(+), 13 deletions(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 1851ffdc1..6c16488b9 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -487,21 +487,18 @@ class OpenCLCASTBuilder(CASTBuilder):
     def add_vector_load(self, access_expr, vec_iname, array, index):
         from pymbolic import substitute
         # get ctype for casting
-        ctype = self.target.get_dtype_registry().dtype_to_ctype(array.dtype)
-        # get size of load in bytes
-        size = array.dtype.itemsize * len(index)
-        # and finally, convert the vector access expression to an index based expr
-        # such that we can take the index
-        # to do so, we substitute the vector iname -> 0 to eliminate any term
-        # involving it, and then substitute the first pre-computed index term
-        access_expr = str(substitute(access_expr, {vec_iname: 0}) + index[0])
-        # and finally remove the array name
-        access_expr = access_expr[access_expr.index(array.name) + len(array.name):]
-        # and stringify
-        access_expr = '&(((%s*)%s)%s)' % (ctype, array.name, access_expr)
+        ctype = str(array.get_arg_decl(
+            self, '', array.shape, array.dtype, False))
+        ctype = ctype[:ctype.rindex(array.name) - 1]
+        # and convert the vector access expression to a vector offset
+        # to do so, we substitute the vector iname -> 0 to eliminate it from the
+        # expression
+        offset = str(substitute(access_expr.index, {vec_iname: 0}))
+        # and cast / substitute in the calculated vector iname offset
+        cast_expr = '&((%s)%s)[%s]' % (ctype, array.name, index[0])
         from pymbolic.primitives import Call, Variable
         return Call(Variable('vload%d' % len(index)), (
-            Variable(str(size)), Variable(access_expr)))
+            Variable(offset), Variable(cast_expr)))
 
     def emit_barrier(self, synchronization_kind, mem_kind, comment):
         """
-- 
GitLab


From 07a18a9eb7cd63466863f38aa5866606b4199d5b Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 14 Feb 2018 09:51:15 -0500
Subject: [PATCH 030/144] add simplify to avoid troubles with floor division

---
 loopy/target/opencl.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 6c16488b9..9e5f978a4 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -493,12 +493,18 @@ class OpenCLCASTBuilder(CASTBuilder):
         # and convert the vector access expression to a vector offset
         # to do so, we substitute the vector iname -> 0 to eliminate it from the
         # expression
-        offset = str(substitute(access_expr.index, {vec_iname: 0}))
+        offset = substitute(access_expr.index, {vec_iname: 0})
+        # try symplify
+        try:
+            from loopy.isl_helpers import simplify_via_aff
+            offset = simplify_via_aff(offset)
+        except:
+            pass
         # and cast / substitute in the calculated vector iname offset
         cast_expr = '&((%s)%s)[%s]' % (ctype, array.name, index[0])
         from pymbolic.primitives import Call, Variable
         return Call(Variable('vload%d' % len(index)), (
-            Variable(offset), Variable(cast_expr)))
+            Variable(str(offset)), Variable(cast_expr)))
 
     def emit_barrier(self, synchronization_kind, mem_kind, comment):
         """
-- 
GitLab


From 19d3df8561d8f5dc325761dc550eff7bce940d5a Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 14 Feb 2018 09:51:33 -0500
Subject: [PATCH 031/144] update test

---
 test/test_loopy.py | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index 491fc419d..c90f5ddf9 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2800,7 +2800,7 @@ def test_add_prefetch_works_in_lhs_index():
 def test_explicit_simd_shuffles(ctx_factory):
     ctx = ctx_factory()
 
-    def create_and_test(insn, answer=None, atomic=False):
+    def create_and_test(insn, answer=None, atomic=False, additional_check=None):
         knl = lp.make_kernel(['{[i]: 0 <= i < 12}', '{[j]: 0 <= j < 1}'],
                              insn,
                              [lp.GlobalArg('a', shape=(1, 12,), dtype=np.int32,
@@ -2822,10 +2822,14 @@ def test_explicit_simd_shuffles(ctx_factory):
                 b=np.arange(16, dtype=np.int32).reshape((1, 4, 4)))[1][0].flatten(
                     'C'),
             answer)
+        if additional_check is not None:
+            assert additional_check(knl)
 
     # test w/ compile time temporary constant
     create_and_test("<>c = 2\n" +
-                    "a[j, i] = a[j, i] + b[j, i + c]")
+                    "a[j, i] = b[j, i + c]",
+                    additional_check=lambda knl: 'vload' in lp.generate_code_v2(
+                        knl).device_code())
     create_and_test("a[j, i] = b[j, i + 2]")
     create_and_test("a[j, i] = b[j, i + 2] + a[j, i]")
     create_and_test("a[j, i] = a[j, i] + b[j, i + 2]")
-- 
GitLab


From 7320981000b4dcb0d7b38a7a3cb2f85427283054 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 15 Feb 2018 10:29:14 -0500
Subject: [PATCH 032/144] possible fix to show andreas

---
 loopy/kernel/array.py                |  4 ++--
 loopy/target/c/codegen/expression.py | 23 +++++++++++++++++------
 test/test_loopy.py                   |  2 +-
 3 files changed, 20 insertions(+), 9 deletions(-)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 251458051..56ed25e33 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1314,7 +1314,7 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info,
 
     for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)):
         if isinstance(dim_tag, SeparateArrayArrayDimTag):
-            idx = eval_expr_assert_constant(i, idx)
+            idx = eval_expr_assert_constant(i, idx, **compile_time_constants)
             array_name += "_s%d" % idx
 
     # }}}
@@ -1417,7 +1417,7 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info,
 
                 if vector_index is None:
                     # if we haven't generated a load of shuffle...
-                    idx = eval_expr_assert_constant(i, idx)
+                    idx = eval_expr_assert_constant(i, idx, **compile_time_constants)
                     vector_index = idx
 
         else:
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index b55c70d3c..4c6087da9 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -189,11 +189,12 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                 simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)
 
         from loopy.expression import VectorizabilityChecker
-        ctc = {}
+        ctc_iname = ''
         if self.codegen_state.vectorization_info is not None:
-            ctc = VectorizabilityChecker.allowed_non_vecdim_dependencies(
-                    self.codegen_state.kernel,
-                    self.codegen_state.vectorization_info.iname)
+            ctc_iname = self.codegen_state.vectorization_info.iname
+        ctc = VectorizabilityChecker.allowed_non_vecdim_dependencies(
+                self.codegen_state.kernel,
+                ctc_iname)
         access_info = get_access_info(self.kernel.target, ary, index_tuple,
                 self.codegen_state.var_subst_map.copy(),
                 self.codegen_state.vectorization_info,
@@ -267,8 +268,18 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                                 str(type(self.codegen_state.ast_builder)),
                                 method))
 
-                return self.codegen_state.ast_builder.add_vector_access(
-                    result, access_info.vector_index)
+                try:
+                    from loopy.tools import is_integer
+                    assert is_integer(access_info.vector_index)
+                    return self.codegen_state.ast_builder.add_vector_access(
+                        result, access_info.vector_index)
+                except AssertionError:
+                    from loopy.codegen import Unvectorizable
+                    raise Unvectorizable(
+                        "Cannot add vector access for non-integer vector addressing "
+                        "did you mean to tag iname '%s' as a vector index?" % (
+                            access_info.vector_index))
+
             else:
                 return result
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index c90f5ddf9..7b2418663 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -769,7 +769,7 @@ def test_vector_types(ctx_factory, vec_len):
 
     ref_knl = knl
 
-    knl = lp.tag_data_axes(knl, "out", "c,vec")
+    knl = lp.tag_array_axes(knl, "out", "c,vec")
     knl = lp.tag_inames(knl, dict(j="unr"))
 
     knl = lp.split_iname(knl, "i", 128, outer_tag="g.0", inner_tag="l.0")
-- 
GitLab


From 1fa83a952fccb27a2b3f3f428e23aa7e5cb808cc Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 14 Mar 2018 16:44:27 -0400
Subject: [PATCH 033/144] fix for loading git revision from dev-tree without
 installed version of loopy

---
 loopy/version.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/loopy/version.py b/loopy/version.py
index 2f29e806e..965baf092 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -29,13 +29,18 @@ if os.environ.get("AKPYTHON_EXEC_IMPORT_UNAVAILABLE") is not None:
     _git_rev = None
 
 else:
-    import loopy._git_rev as _git_rev_mod
-    _git_rev = _git_rev_mod.GIT_REVISION
+    try:
+        import loopy._git_rev as _git_rev_mod
+        _git_rev = _git_rev_mod.GIT_REVISION
+    except ImportError:
+        _git_rev = None
 
     # If we're running from a dev tree, the last install (and hence the most
     # recent update of the above git rev) could have taken place very long ago.
     from pytools import find_module_git_revision
     _runtime_git_rev = find_module_git_revision(__file__, n_levels_up=1)
+    if _git_rev is None and _runtime_git_rev is None:
+        raise Exception("Cannot determine git revision from install or dev-tree.")
     if _runtime_git_rev is not None:
         _git_rev = _runtime_git_rev
 
-- 
GitLab


From c8fdcb176697b80cc436c0e3ded67595798c784e Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 14 Mar 2018 17:28:29 -0400
Subject: [PATCH 034/144] don't update w/ compile time constants unless we're
 actually trying to vectorize

---
 loopy/kernel/array.py                | 20 +++++++++-----------
 loopy/target/c/codegen/expression.py | 14 +++++++-------
 2 files changed, 16 insertions(+), 18 deletions(-)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 56ed25e33..25f2ce702 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1216,17 +1216,15 @@ class AccessInfo(ImmutableRecord):
     """
 
 
-def get_access_info(target, ary, index, var_subst_map, vectorization_info,
-                    compile_time_constants):
+def get_access_info(target, ary, index, var_subst_map, vectorization_info):
     """
     :arg ary: an object of type :class:`ArrayBase`
     :arg index: a tuple of indices representing a subscript into ary
     :arg var_subst_map: a context of variable substitutions from the calling codegen
-        state
+        state and potentially other compile-time "constants" (inames and
+        integer temporaries w/ known values), used in detection of loads / shuffles
     :arg vectorization_info: an instance of :class:`loopy.codegen.VectorizationInfo`,
         or *None*.
-    :arg compile_time_constants: a set of compile time "constants" (inames and
-        integer temporaries w/ known values), used in detection of loads / shuffles
     """
 
     import loopy as lp
@@ -1260,7 +1258,7 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info,
             from loopy.isl_helpers import simplify_via_aff
             result = simplify_via_aff(result)
 
-        if any([x not in compile_time_constants for x in get_dependencies(result)]):
+        if any([x not in var_subst_map for x in get_dependencies(result)]):
             raise error_type("subscript '%s[%s]' has non-constant "
                     "index for separate-array axis %d (0-based)" % (
                         ary.name, index, i))
@@ -1314,7 +1312,7 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info,
 
     for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)):
         if isinstance(dim_tag, SeparateArrayArrayDimTag):
-            idx = eval_expr_assert_constant(i, idx, **compile_time_constants)
+            idx = eval_expr_assert_constant(i, idx, **var_subst_map)
             array_name += "_s%d" % idx
 
     # }}}
@@ -1371,7 +1369,7 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info,
                     vectorization_info.iname in get_dependencies(idx):
                 # need to determine here whether the vector iname is aligned with
                 # the vector size -> shuffle, or unaligned -> load
-                evaled = run_over_vecrange(i, idx, compile_time_constants)
+                evaled = run_over_vecrange(i, idx, var_subst_map)
                 if is_monotonic(evaled):
                     vec_op_type = 'shuffle' if all(x == evaled[0] for x in evaled) \
                         else 'load'
@@ -1405,11 +1403,11 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info,
             else:
                 if vectorization_info is not None:
                     # check dependencies
-                    deps = get_dependencies(idx) - set(compile_time_constants.keys())
+                    deps = get_dependencies(idx) - set(var_subst_map.keys())
                     if len(deps) == 1 and vectorization_info.iname in deps:
                         # we depend only on the vectorized iname -- see if we can
                         # simplify to a load / shuffle
-                        evaled = run_over_vecrange(i, idx, compile_time_constants)
+                        evaled = run_over_vecrange(i, idx, var_subst_map)
                         if is_contiguous(evaled):
                             # we can generate a load or shuffle depending on the
                             # alignment
@@ -1417,7 +1415,7 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info,
 
                 if vector_index is None:
                     # if we haven't generated a load of shuffle...
-                    idx = eval_expr_assert_constant(i, idx, **compile_time_constants)
+                    idx = eval_expr_assert_constant(i, idx, **var_subst_map)
                     vector_index = idx
 
         else:
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 163d67e7a..63c841dce 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -189,16 +189,16 @@ class ExpressionToCExpressionMapper(IdentityMapper):
                 simplify_using_aff(self.kernel, idx) for idx in expr.index_tuple)
 
         from loopy.expression import VectorizabilityChecker
-        ctc_iname = ''
+        var_subst_map = self.codegen_state.var_subst_map.copy()
         if self.codegen_state.vectorization_info is not None:
             ctc_iname = self.codegen_state.vectorization_info.iname
-        ctc = VectorizabilityChecker.allowed_non_vecdim_dependencies(
-                self.codegen_state.kernel,
-                ctc_iname)
+            ctc = VectorizabilityChecker.allowed_non_vecdim_dependencies(
+                    self.codegen_state.kernel,
+                    ctc_iname)
+            var_subst_map.update(ctc)
+
         access_info = get_access_info(self.kernel.target, ary, index_tuple,
-                self.codegen_state.var_subst_map.copy(),
-                self.codegen_state.vectorization_info,
-                ctc)
+                var_subst_map, self.codegen_state.vectorization_info)
 
         from loopy.kernel.data import (
                 ImageArg, GlobalArg, TemporaryVariable, ConstantArg)
-- 
GitLab


From d175445b11511abbd81f9aa41b7606acf0e5ee88 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 14 Mar 2018 17:37:59 -0400
Subject: [PATCH 035/144] apply to other get access_info calls

---
 loopy/target/c/codegen/expression.py |  6 +++---
 loopy/target/ispc.py                 | 22 ++++++++++++++++++----
 2 files changed, 21 insertions(+), 7 deletions(-)

diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 63c841dce..dcce34e5d 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -432,16 +432,16 @@ class ExpressionToCExpressionMapper(IdentityMapper):
             ary = self.find_array(arg)
 
             from loopy.kernel.array import get_access_info
-            ctc = {}
+            var_subst_map = self.codegen_state.var_subst_map.copy()
             if self.codegen_state.vectorization_info is not None:
                 from loopy.expression import VectorizabilityChecker
                 ctc = VectorizabilityChecker.allowed_non_vecdim_dependencies(
                         self.codegen_state.kernel,
                         self.codegen_state.vectorization_info.iname)
+                var_subst_map.update(ctc)
             access_info = get_access_info(self.kernel.target, ary, arg.index,
                     self.codegen_state.var_subst_map.copy(),
-                    self.codegen_state.vectorization_info,
-                    ctc)
+                    self.codegen_state.vectorization_info)
 
             from loopy.kernel.data import ImageArg
             if isinstance(ary, ImageArg):
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 89b32238c..61e6a7830 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -110,9 +110,16 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper):
                 lsize, = lsize
                 from loopy.kernel.array import get_access_info
 
+                var_subst_map = self.codegen_state.var_subst_map.copy()
+                if self.codegen_state.vectorization_info is not None:
+                    from loopy.expression import VectorizabilityChecker
+                    ctc = VectorizabilityChecker.allowed_non_vecdim_dependencies(
+                            self.codegen_state.kernel,
+                            self.codegen_state.vectorization_info.iname)
+                    var_subst_map.update(ctc)
+
                 access_info = get_access_info(self.kernel.target, ary, expr.index,
-                    self.codegen_state.var_subst_map.copy(),
-                    self.codegen_state.vectorization_info)
+                    var_subst_map, self.codegen_state.vectorization_info)
 
                 subscript, = access_info.subscripts
                 result = var(access_info.array_name)[
@@ -393,9 +400,16 @@ class ISPCASTBuilder(CASTBuilder):
             index_tuple = tuple(
                     simplify_using_aff(kernel, idx) for idx in lhs.index_tuple)
 
+            var_subst_map = codegen_state.var_subst_map.copy()
+            if codegen_state.vectorization_info is not None:
+                from loopy.expression import VectorizabilityChecker
+                ctc = VectorizabilityChecker.allowed_non_vecdim_dependencies(
+                        codegen_state.kernel,
+                        codegen_state.vectorization_info.iname)
+                var_subst_map.update(ctc)
+
             access_info = get_access_info(kernel.target, ary, index_tuple,
-                    self.codegen_state.var_subst_map.copy(),
-                    codegen_state.vectorization_info)
+                    var_subst_map, codegen_state.vectorization_info)
 
             from loopy.kernel.data import GlobalArg, TemporaryVariable
 
-- 
GitLab


From 65f16d3d0f54c20490e555467b9c39201e6f8180 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 14 Mar 2018 17:55:17 -0400
Subject: [PATCH 036/144] fix to avoid potential duplicate kwargs

---
 loopy/kernel/array.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 25f2ce702..33305262c 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1232,7 +1232,7 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
     from loopy.codegen import Unvectorizable
     from loopy.symbolic import get_dependencies
 
-    def eval_expr_assert_constant(i, expr, **kwargs):
+    def eval_expr_assert_constant(i, expr, kwargs):
         from pymbolic.mapper.evaluator import UnknownVariableError
         # determine error type -- if vectorization_info is None, we're in the
         # unvec fallback (and should raise a LoopyError)
@@ -1312,7 +1312,7 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
 
     for i, (idx, dim_tag) in enumerate(zip(index, ary.dim_tags)):
         if isinstance(dim_tag, SeparateArrayArrayDimTag):
-            idx = eval_expr_assert_constant(i, idx, **var_subst_map)
+            idx = eval_expr_assert_constant(i, idx, var_subst_map)
             array_name += "_s%d" % idx
 
     # }}}
@@ -1342,7 +1342,7 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
             try:
                 subsi = base_subs.copy()
                 subsi[vectorization_info.iname] = veci
-                evaled.append(eval_expr_assert_constant(i, idx, **subsi))
+                evaled.append(eval_expr_assert_constant(i, idx, subsi))
             except Unvectorizable:
                 pass
         return __get_simplified(evaled)
@@ -1415,7 +1415,7 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
 
                 if vector_index is None:
                     # if we haven't generated a load of shuffle...
-                    idx = eval_expr_assert_constant(i, idx, **var_subst_map)
+                    idx = eval_expr_assert_constant(i, idx, var_subst_map)
                     vector_index = idx
 
         else:
-- 
GitLab


From 36ad18f0326c9f6cbf7a428377318ef8076791a0 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 14 Mar 2018 17:59:19 -0400
Subject: [PATCH 037/144] add specific exceptions for flake

---
 loopy/expression.py    | 3 ++-
 loopy/target/opencl.py | 5 +++--
 2 files changed, 5 insertions(+), 3 deletions(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index a468616b9..3df637985 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -177,6 +177,7 @@ class VectorizabilityChecker(RecursiveMapper):
                         self.kernel, self.vec_iname)
 
                     from pymbolic import substitute
+                    from pymbolic.mapper.evaluator import UnknownVariableError
                     from loopy.tools import is_integer
                     for veci in range(self.vec_iname_length):
                         ncontext = context.copy()
@@ -192,7 +193,7 @@ class VectorizabilityChecker(RecursiveMapper):
                                     "expression '%s', and could not be simplified"
                                     "to compile-time constants."
                                     % (self.vec_iname, i+1, expr))
-                        except:
+                        except UnknownVariableError:
                             break
 
         return bool(possible)
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 9e5f978a4..21dd01721 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -494,11 +494,12 @@ class OpenCLCASTBuilder(CASTBuilder):
         # to do so, we substitute the vector iname -> 0 to eliminate it from the
         # expression
         offset = substitute(access_expr.index, {vec_iname: 0})
-        # try symplify
+        # try simplify
+        from pymbolic.mapper.evaluator import UnknownVariableError
         try:
             from loopy.isl_helpers import simplify_via_aff
             offset = simplify_via_aff(offset)
-        except:
+        except UnknownVariableError:
             pass
         # and cast / substitute in the calculated vector iname offset
         cast_expr = '&((%s)%s)[%s]' % (ctype, array.name, index[0])
-- 
GitLab


From 7062d45a74af1a6108aefe1038e476010c721a19 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 14 Mar 2018 18:03:44 -0400
Subject: [PATCH 038/144] py2.6 fix

---
 loopy/expression.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index 3df637985..6f76d4761 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -112,8 +112,8 @@ class VectorizabilityChecker(RecursiveMapper):
 
         # determine allowed symbols as non-vector inames
         from pymbolic.primitives import Variable
-        allowed_symbols = {sym: Variable(sym) for sym in kernel.iname_to_tag
-                           if sym != vec_iname}
+        allowed_symbols = dict((sym, Variable(sym)) for sym in kernel.iname_to_tag
+                               if sym != vec_iname)
         from loopy.kernel.instruction import Assignment
         from loopy.tools import is_integer
         from six import iteritems
-- 
GitLab


From e7bc3ac7a12e9d0e74182b53cdf8261503b9f016 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 14 Mar 2018 18:09:47 -0400
Subject: [PATCH 039/144] more py2.6 fixes

---
 loopy/expression.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index 6f76d4761..6d9abeef2 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -119,17 +119,17 @@ class VectorizabilityChecker(RecursiveMapper):
         from six import iteritems
 
         # and compile time integer temporaries
-        compile_time_assign = {str(insn.assignee): insn.expression
-            for insn in kernel.instructions if
-            isinstance(insn, Assignment) and is_integer(
-                insn.expression)}
+        compile_time_assign = dict((str(insn.assignee), insn.expression)
+                                   for insn in kernel.instructions if
+                                   isinstance(insn, Assignment) and is_integer(
+                                   insn.expression))
         allowed_symbols.update(
-            {sym: compile_time_assign[sym] for sym, var in iteritems(
+            dict((sym, compile_time_assign[sym]) for sym, var in iteritems(
                     kernel.temporary_variables)
                 # temporary variables w/ no initializer, no shape
                 if var.initializer is None and not var.shape
                 # compile time integers
-                and sym in compile_time_assign})
+                and sym in compile_time_assign))
         return allowed_symbols
 
     def map_subscript(self, expr):
-- 
GitLab


From 7b4c3d7c8fec98447ca452ec6a657313623146f1 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 14 Mar 2018 18:16:09 -0400
Subject: [PATCH 040/144] still more py2.6 fixes

---
 loopy/expression.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index 6d9abeef2..6c97838cb 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -172,7 +172,7 @@ class VectorizabilityChecker(RecursiveMapper):
                 deps = dep_mapper(index[i])
                 if self.vec_iname in set(x.name for x in deps):
                     # check whether we can simplify out the vector iname
-                    context = {x: x for x in deps if x.name != self.vec_iname}
+                    context = dict((x, x) for x in deps if x.name != self.vec_iname)
                     allowed_symbols = self.allowed_non_vecdim_dependencies(
                         self.kernel, self.vec_iname)
 
-- 
GitLab


From 080cac7219b84012c6e72c28a313ede9b9ca5b99 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 14 Mar 2018 19:03:28 -0400
Subject: [PATCH 041/144] Revert "fix for loading git revision from dev-tree
 without installed version of loopy" -- should only be on its own branch

This reverts commit 1fa83a952fccb27a2b3f3f428e23aa7e5cb808cc.
---
 loopy/version.py | 9 ++-------
 1 file changed, 2 insertions(+), 7 deletions(-)

diff --git a/loopy/version.py b/loopy/version.py
index 965baf092..2f29e806e 100644
--- a/loopy/version.py
+++ b/loopy/version.py
@@ -29,18 +29,13 @@ if os.environ.get("AKPYTHON_EXEC_IMPORT_UNAVAILABLE") is not None:
     _git_rev = None
 
 else:
-    try:
-        import loopy._git_rev as _git_rev_mod
-        _git_rev = _git_rev_mod.GIT_REVISION
-    except ImportError:
-        _git_rev = None
+    import loopy._git_rev as _git_rev_mod
+    _git_rev = _git_rev_mod.GIT_REVISION
 
     # If we're running from a dev tree, the last install (and hence the most
     # recent update of the above git rev) could have taken place very long ago.
     from pytools import find_module_git_revision
     _runtime_git_rev = find_module_git_revision(__file__, n_levels_up=1)
-    if _git_rev is None and _runtime_git_rev is None:
-        raise Exception("Cannot determine git revision from install or dev-tree.")
     if _runtime_git_rev is not None:
         _git_rev = _runtime_git_rev
 
-- 
GitLab


From b7435863d5b4956cd71bc0fbddd670fd056f5f9e Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 15 Mar 2018 10:33:14 -0400
Subject: [PATCH 042/144] refine error messages

---
 loopy/kernel/array.py | 22 ++++++++++++++--------
 1 file changed, 14 insertions(+), 8 deletions(-)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 33305262c..c4ef06b9b 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1243,15 +1243,21 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
         try:
             result = evaluate(expr, kwargs)
         except UnknownVariableError as e:
-            err_msg = ("When trying to index the array '%s' along axis "
-                       "%d (tagged '%s'), the index was not a compile-time "
-                       "constant (but it has to be in order for code to be "
-                       "generated)."
-                       % (ary.name, i, ary.dim_tags[i]))
             if vectorization_info is not None:
-                # add bit about unrolling
-                err_msg += "You likely want to unroll the iname(s) '%s'" % str(e)
-            raise error_type(err_msg)
+                # failed vectorization
+                raise Unvectorizable(
+                    "When trying to vectorize the array '%s' along axis "
+                    "%d (tagged '%s'), the index was not a compile-time "
+                    "constant (but it has to be in order for code to be "
+                    "generated). You likely want to unroll the iname(s) '%s'"
+                    % (ary.name, i, ary.dim_tags[i], str(e)))
+            else:
+                raise LoopyError(
+                    "When trying to unroll the array '%s' along axis "
+                    "%d (tagged '%s'), the index was not an unrollable-iname "
+                    "or constant (but it has to be in order for code to be "
+                    "generated). You likely want to unroll/change array index(s)"
+                    " '%s'" % (ary.name, i, ary.dim_tags[i], str(e)))
 
         if not is_integer(result):
             # try to simplify further
-- 
GitLab


From a49d8072d4c9f501dc620cb016bc1bcf3b9a16ec Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 15 Mar 2018 10:33:14 -0400
Subject: [PATCH 043/144] refine error messages

---
 loopy/codegen/__init__.py            | 16 ++-----------
 loopy/kernel/array.py                | 35 ++++++++++++++++------------
 loopy/target/c/codegen/expression.py |  4 ++--
 loopy/target/ispc.py                 |  4 ++--
 loopy/target/opencl.py               |  8 ++++++-
 5 files changed, 33 insertions(+), 34 deletions(-)

diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index d6c3eedf6..6f7442dc5 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -178,10 +178,6 @@ class CodeGenerationState(object):
 
         None or an instance of :class:`VectorizationInfo`
 
-    .. attribute:: vectorization_fallback
-        True IFF :func:`try_vectorized` failed, and we're in the :func:`unvectorize`
-        fallback
-
     .. attribute:: is_generating_device_code
 
     .. attribute:: gen_program_name
@@ -200,8 +196,7 @@ class CodeGenerationState(object):
             vectorization_info=None, var_name_generator=None,
             is_generating_device_code=None,
             gen_program_name=None,
-            schedule_index_end=None,
-            vectorization_fallback=False):
+            schedule_index_end=None):
         self.kernel = kernel
         self.implemented_data_info = implemented_data_info
         self.implemented_domain = implemented_domain
@@ -212,7 +207,6 @@ class CodeGenerationState(object):
         self.var_subst_map = var_subst_map.copy()
         self.allow_complex = allow_complex
         self.vectorization_info = vectorization_info
-        self.vectorization_fallback = vectorization_fallback
         self.var_name_generator = var_name_generator
         self.is_generating_device_code = is_generating_device_code
         self.gen_program_name = gen_program_name
@@ -233,12 +227,7 @@ class CodeGenerationState(object):
         if implemented_data_info is None:
             implemented_data_info = self.implemented_data_info
 
-        vectorization_fallback = self.vectorization_fallback
-        if vectorization_info is False:
-            vectorization_fallback = True
-            vectorization_info = None
-
-        elif vectorization_info is None:
+        if vectorization_info is None:
             vectorization_info = self.vectorization_info
 
         if is_generating_device_code is None:
@@ -262,7 +251,6 @@ class CodeGenerationState(object):
                 var_subst_map=var_subst_map or self.var_subst_map,
                 allow_complex=self.allow_complex,
                 vectorization_info=vectorization_info,
-                vectorization_fallback=vectorization_fallback,
                 var_name_generator=self.var_name_generator,
                 is_generating_device_code=is_generating_device_code,
                 gen_program_name=gen_program_name,
diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 33305262c..2fd71e3b8 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1236,22 +1236,28 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
         from pymbolic.mapper.evaluator import UnknownVariableError
         # determine error type -- if vectorization_info is None, we're in the
         # unvec fallback (and should raise a LoopyError)
-        # if vectorization_info is not None, we should raise an Unvectorizable
+        # if vectorization_info is 'True', we should raise an Unvectorizable
         # on failure
         error_type = LoopyError if vectorization_info is None else Unvectorizable
         from pymbolic import evaluate
         try:
             result = evaluate(expr, kwargs)
         except UnknownVariableError as e:
-            err_msg = ("When trying to index the array '%s' along axis "
-                       "%d (tagged '%s'), the index was not a compile-time "
-                       "constant (but it has to be in order for code to be "
-                       "generated)."
-                       % (ary.name, i, ary.dim_tags[i]))
-            if vectorization_info is not None:
-                # add bit about unrolling
-                err_msg += "You likely want to unroll the iname(s) '%s'" % str(e)
-            raise error_type(err_msg)
+            if vectorization_info:
+                # failed vectorization
+                raise Unvectorizable(
+                    "When trying to vectorize the array '%s' along axis "
+                    "%d (tagged '%s'), the index was not a compile-time "
+                    "constant (but it has to be in order for code to be "
+                    "generated). You likely want to unroll the iname(s) '%s'"
+                    % (ary.name, i, ary.dim_tags[i], str(e)))
+            else:
+                raise LoopyError(
+                    "When trying to unroll the array '%s' along axis "
+                    "%d (tagged '%s'), the index was not an unrollable-iname "
+                    "or constant (but it has to be in order for code to be "
+                    "generated). You likely want to unroll/change array index(s)"
+                    " '%s'" % (ary.name, i, ary.dim_tags[i], str(e)))
 
         if not is_integer(result):
             # try to simplify further
@@ -1365,7 +1371,7 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
             elif stride is lp.auto:
                 stride = var(array_name + "_stride%d" % i)
 
-            if vectorization_info is not None and \
+            if vectorization_info and \
                     vectorization_info.iname in get_dependencies(idx):
                 # need to determine here whether the vector iname is aligned with
                 # the vector size -> shuffle, or unaligned -> load
@@ -1378,7 +1384,7 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
                         'unvectorized axis %s (1-based) access "%s", and not '
                         'simplifiable to compile-time contigous access' % (
                             vectorization_info.iname, i + 1, idx))
-            elif vectorization_info is not None:
+            elif vectorization_info:
                 vec_op_type = 'shuffle'  # independent of vector iname
 
             # update vector operation type if necessary
@@ -1393,15 +1399,14 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
 
         elif isinstance(dim_tag, VectorArrayDimTag):
             from pymbolic.primitives import Variable
-            if (vectorization_info is not None
-                    and isinstance(index[i], Variable)
+            if (vectorization_info and isinstance(index[i], Variable)
                     and index[i].name == vectorization_info.iname):
                 # We'll do absolutely nothing here, which will result
                 # in the vector being returned.
                 pass
 
             else:
-                if vectorization_info is not None:
+                if vectorization_info:
                     # check dependencies
                     deps = get_dependencies(idx) - set(var_subst_map.keys())
                     if len(deps) == 1 and vectorization_info.iname in deps:
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index dcce34e5d..241ddb979 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -190,7 +190,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
 
         from loopy.expression import VectorizabilityChecker
         var_subst_map = self.codegen_state.var_subst_map.copy()
-        if self.codegen_state.vectorization_info is not None:
+        if self.codegen_state.vectorization_info:
             ctc_iname = self.codegen_state.vectorization_info.iname
             ctc = VectorizabilityChecker.allowed_non_vecdim_dependencies(
                     self.codegen_state.kernel,
@@ -433,7 +433,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
 
             from loopy.kernel.array import get_access_info
             var_subst_map = self.codegen_state.var_subst_map.copy()
-            if self.codegen_state.vectorization_info is not None:
+            if self.codegen_state.vectorization_info:
                 from loopy.expression import VectorizabilityChecker
                 ctc = VectorizabilityChecker.allowed_non_vecdim_dependencies(
                         self.codegen_state.kernel,
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 61e6a7830..9041f946e 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -111,7 +111,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper):
                 from loopy.kernel.array import get_access_info
 
                 var_subst_map = self.codegen_state.var_subst_map.copy()
-                if self.codegen_state.vectorization_info is not None:
+                if self.codegen_state.vectorization_info:
                     from loopy.expression import VectorizabilityChecker
                     ctc = VectorizabilityChecker.allowed_non_vecdim_dependencies(
                             self.codegen_state.kernel,
@@ -401,7 +401,7 @@ class ISPCASTBuilder(CASTBuilder):
                     simplify_using_aff(kernel, idx) for idx in lhs.index_tuple)
 
             var_subst_map = codegen_state.var_subst_map.copy()
-            if codegen_state.vectorization_info is not None:
+            if codegen_state.vectorization_info:
                 from loopy.expression import VectorizabilityChecker
                 ctc = VectorizabilityChecker.allowed_non_vecdim_dependencies(
                         codegen_state.kernel,
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 21dd01721..652fce659 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -587,7 +587,13 @@ class OpenCLCASTBuilder(CASTBuilder):
         # FIXME: Could detect operations, generate atomic_{add,...} when
         # appropriate.
 
-        if codegen_state.vectorization_fallback or codegen_state.vectorization_info:
+        if codegen_state.vectorization_info is not None:
+            # note - this check whether we've previously tried to vectorize and
+            # failed (in which case vectorization_info will be False) or whether
+            # vectorization_info is a valid :class:`VectorizationInfo`
+            #
+            # Both cases should fail (as we can't take the index of an unrolled
+            # atomic)
             raise LoopyError('Atomic operators not yet implemented for '
                              'explicit-SIMD vectorization')
 
-- 
GitLab


From 189ad65aaa84ff9fd1ab402de96c54f0a48a0123 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 15 Mar 2018 17:46:32 -0400
Subject: [PATCH 044/144] rename and update docs

---
 loopy/expression.py                  | 19 ++++++++++++-------
 loopy/target/c/codegen/expression.py |  4 ++--
 loopy/target/ispc.py                 |  4 ++--
 3 files changed, 16 insertions(+), 11 deletions(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index 6c97838cb..3e58451ff 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -98,16 +98,21 @@ class VectorizabilityChecker(RecursiveMapper):
         return False
 
     @staticmethod
-    def allowed_non_vecdim_dependencies(kernel, vec_iname):
+    def compile_time_constants(kernel, vec_iname):
         """
-        Returns the dictionary of non-vector inames and compile time constants
-        mapped to their 'value' (themselves in case of iname, integer value in case
-        of constant)
+        Returns a dictionary of (non-vector) inames and temporary variables whose
+        value is known at "compile" time. These are used (in combination with a
+        codegen state's variable substitution map) to simplifying access expressions
+        in :func:`get_access_info`.
 
-        .. attribute:: kernel
+        Note: inames are mapped to the :class:`Variable` version of themselves,
+              while temporary variables are mapped to their integer value
+
+        .. parameter:: kernel
             The kernel to check
-        .. attribute:: vec_iname
+        .. parameter:: vec_iname
             the vector iname
+
         """
 
         # determine allowed symbols as non-vector inames
@@ -173,7 +178,7 @@ class VectorizabilityChecker(RecursiveMapper):
                 if self.vec_iname in set(x.name for x in deps):
                     # check whether we can simplify out the vector iname
                     context = dict((x, x) for x in deps if x.name != self.vec_iname)
-                    allowed_symbols = self.allowed_non_vecdim_dependencies(
+                    allowed_symbols = self.compile_time_constants(
                         self.kernel, self.vec_iname)
 
                     from pymbolic import substitute
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index 241ddb979..f3590ecf6 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -192,7 +192,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
         var_subst_map = self.codegen_state.var_subst_map.copy()
         if self.codegen_state.vectorization_info:
             ctc_iname = self.codegen_state.vectorization_info.iname
-            ctc = VectorizabilityChecker.allowed_non_vecdim_dependencies(
+            ctc = VectorizabilityChecker.compile_time_constants(
                     self.codegen_state.kernel,
                     ctc_iname)
             var_subst_map.update(ctc)
@@ -435,7 +435,7 @@ class ExpressionToCExpressionMapper(IdentityMapper):
             var_subst_map = self.codegen_state.var_subst_map.copy()
             if self.codegen_state.vectorization_info:
                 from loopy.expression import VectorizabilityChecker
-                ctc = VectorizabilityChecker.allowed_non_vecdim_dependencies(
+                ctc = VectorizabilityChecker.compile_time_constants(
                         self.codegen_state.kernel,
                         self.codegen_state.vectorization_info.iname)
                 var_subst_map.update(ctc)
diff --git a/loopy/target/ispc.py b/loopy/target/ispc.py
index 9041f946e..1cdfb3a69 100644
--- a/loopy/target/ispc.py
+++ b/loopy/target/ispc.py
@@ -113,7 +113,7 @@ class ExprToISPCExprMapper(ExpressionToCExpressionMapper):
                 var_subst_map = self.codegen_state.var_subst_map.copy()
                 if self.codegen_state.vectorization_info:
                     from loopy.expression import VectorizabilityChecker
-                    ctc = VectorizabilityChecker.allowed_non_vecdim_dependencies(
+                    ctc = VectorizabilityChecker.compile_time_constants(
                             self.codegen_state.kernel,
                             self.codegen_state.vectorization_info.iname)
                     var_subst_map.update(ctc)
@@ -403,7 +403,7 @@ class ISPCASTBuilder(CASTBuilder):
             var_subst_map = codegen_state.var_subst_map.copy()
             if codegen_state.vectorization_info:
                 from loopy.expression import VectorizabilityChecker
-                ctc = VectorizabilityChecker.allowed_non_vecdim_dependencies(
+                ctc = VectorizabilityChecker.compile_time_constants(
                         codegen_state.kernel,
                         codegen_state.vectorization_info.iname)
                 var_subst_map.update(ctc)
-- 
GitLab


From 198c0961209559e443d89d3729dbe8208d8cc928 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 15 Mar 2018 17:52:19 -0400
Subject: [PATCH 045/144] unify disparate paths

---
 loopy/expression.py | 69 ++++++++++++++++++++-------------------------
 1 file changed, 31 insertions(+), 38 deletions(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index 3e58451ff..9fe918620 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -154,52 +154,45 @@ class VectorizabilityChecker(RecursiveMapper):
 
         index = expr.index_tuple
 
-        from loopy.symbolic import get_dependencies
+        from loopy.symbolic import get_dependencies, DependencyMapper
         from loopy.kernel.array import VectorArrayDimTag
-        from pymbolic.primitives import Variable
 
         possible = None
         for i in range(len(var.shape)):
-            # if index is exactly vector iname
-            if isinstance(var.dim_tags[i], VectorArrayDimTag) and (
-                    (isinstance(index[i], Variable)
-                     and index[i].name == self.vec_iname)):
+            dep_mapper = DependencyMapper(composite_leaves=False)
+            deps = dep_mapper(index[i])
+            # if we're on the vector index
+            if isinstance(var.dim_tags[i], VectorArrayDimTag):
                 if var.shape[i] != self.vec_iname_length:
                     raise Unvectorizable("vector length was mismatched")
-
                 if possible is None:
-                    possible = True
-
+                    possible = self.vec_iname in [str(x) for x in deps]
             # or, if not vector index, and vector iname is present
-            elif not isinstance(var.dim_tags[i], VectorArrayDimTag):
-                from loopy.symbolic import DependencyMapper
-                dep_mapper = DependencyMapper(composite_leaves=False)
-                deps = dep_mapper(index[i])
-                if self.vec_iname in set(x.name for x in deps):
-                    # check whether we can simplify out the vector iname
-                    context = dict((x, x) for x in deps if x.name != self.vec_iname)
-                    allowed_symbols = self.compile_time_constants(
-                        self.kernel, self.vec_iname)
-
-                    from pymbolic import substitute
-                    from pymbolic.mapper.evaluator import UnknownVariableError
-                    from loopy.tools import is_integer
-                    for veci in range(self.vec_iname_length):
-                        ncontext = context.copy()
-                        ncontext[self.vec_iname] = veci
-                        try:
-                            idi = substitute(index[i], ncontext)
-                            if not is_integer(idi) and not all(
-                                    x in allowed_symbols
-                                    for x in get_dependencies(idi)):
-                                raise Unvectorizable(
-                                    "vectorizing iname '%s' occurs in "
-                                    "unvectorized subscript axis %d (1-based) of "
-                                    "expression '%s', and could not be simplified"
-                                    "to compile-time constants."
-                                    % (self.vec_iname, i+1, expr))
-                        except UnknownVariableError:
-                            break
+            elif self.vec_iname in set(x.name for x in deps):
+                # check whether we can simplify out the vector iname
+                context = dict((x, x) for x in deps if x.name != self.vec_iname)
+                allowed_symbols = self.compile_time_constants(
+                    self.kernel, self.vec_iname)
+
+                from pymbolic import substitute
+                from pymbolic.mapper.evaluator import UnknownVariableError
+                from loopy.tools import is_integer
+                for veci in range(self.vec_iname_length):
+                    ncontext = context.copy()
+                    ncontext[self.vec_iname] = veci
+                    try:
+                        idi = substitute(index[i], ncontext)
+                        if not is_integer(idi) and not all(
+                                x in allowed_symbols
+                                for x in get_dependencies(idi)):
+                            raise Unvectorizable(
+                                "vectorizing iname '%s' occurs in "
+                                "unvectorized subscript axis %d (1-based) of "
+                                "expression '%s', and could not be simplified"
+                                "to compile-time constants."
+                                % (self.vec_iname, i+1, expr))
+                    except UnknownVariableError:
+                        break
 
         return bool(possible)
 
-- 
GitLab


From b24844e80aa7ec510679a71075d1c50c7ca56517 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 15 Mar 2018 18:21:33 -0400
Subject: [PATCH 046/144] remove old 'store' override as it doesn't work here
 anyways

---
 loopy/codegen/instruction.py | 3 ---
 1 file changed, 3 deletions(-)

diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index ffcefef25..e590502fb 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -113,9 +113,6 @@ def generate_assignment_instruction_code(codegen_state, insn):
         vcheck = VectorizabilityChecker(
                 kernel, vinfo.iname, vinfo.length)
         lhs_is_vector = vcheck(insn.assignee)
-        if isinstance(lhs_is_vector, tuple) and lhs_is_vector[0] == 'load':
-            # convert vector 'load' assignes to stores
-            lhs_is_vector = 'store'
         rhs_is_vector = vcheck(insn.expression)
 
         if not lhs_is_vector and rhs_is_vector:
-- 
GitLab


From 96317d0059f2f6b4edf8af878d0f677bbe727102 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 16 Mar 2018 17:52:50 -0400
Subject: [PATCH 047/144] attempt to avoid incorrect vector promotion of
 temporary variables

---
 loopy/kernel/__init__.py | 31 +++++++++++++++++++++++++++++++
 loopy/transform/ilp.py   | 32 +++++++++++++++++++++++++-------
 test/test_loopy.py       | 36 ++++++++++++++++++++++++++++++++++++
 3 files changed, 92 insertions(+), 7 deletions(-)

diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index 32b233900..cf269486d 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -778,6 +778,23 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         return result
 
+    @memoize_method
+    def insn_to_reader_map(self):
+        """
+        :return: a dict that maps instruction names to the variables that they read
+        """
+        result = {}
+
+        admissible_vars = (
+                set(arg.name for arg in self.args)
+                | set(six.iterkeys(self.temporary_variables)))
+
+        for insn in self.instructions:
+            for var_name in insn.read_dependency_names() & admissible_vars:
+                result.setdefault(insn.id, set()).add(var_name)
+
+        return result
+
     @memoize_method
     def writer_map(self):
         """
@@ -792,6 +809,20 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         return result
 
+    @memoize_method
+    def insn_to_writer_map(self):
+        """
+        :return: a dict that maps instruction names to the variables that they write
+            to
+        """
+        result = {}
+
+        for insn in self.instructions:
+            for var_name in insn.assignee_var_names():
+                result.setdefault(insn.id, set()).add(var_name)
+
+        return result
+
     @memoize_method
     def get_read_variables(self):
         result = set()
diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py
index 0ac71d603..ac19d61af 100644
--- a/loopy/transform/ilp.py
+++ b/loopy/transform/ilp.py
@@ -65,12 +65,13 @@ class ExtraInameIndexInserter(IdentityMapper):
 
 
 def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
-    if iname is not None:
-        logger.debug("%s: add axes to temporaries for ilp" % kernel.name)
+    logger.debug("%s: add axes to temporaries for ilp / vec" % kernel.name)
 
     wmap = kernel.writer_map()
+    itr_map = kernel.insn_to_reader_map()
 
     from loopy.kernel.data import IlpBaseTag, VectorizeTag
+    from loopy.kernel.array import VectorArrayDimTag
 
     var_to_new_ilp_inames = {}
 
@@ -81,11 +82,28 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
             writer_insn = kernel.id_to_insn[writer_insn_id]
 
             if iname is None:
-                ilp_inames = frozenset(iname
-                        for iname in kernel.insn_inames(writer_insn)
-                        if isinstance(
-                            kernel.iname_to_tag.get(iname),
-                            (IlpBaseTag, VectorizeTag)))
+                ilp_inames = set()
+                for iname in kernel.insn_inames(writer_insn):
+                    if isinstance(kernel.iname_to_tag.get(iname), IlpBaseTag):
+                        ilp_inames.add(iname)
+                    elif isinstance(kernel.iname_to_tag.get(iname), VectorizeTag):
+                        if itr_map[writer_insn_id]:
+                            # check all things that write to this temporary to see
+                            # if we can glean the intended ilp/vectorness
+                            for writer in itr_map[writer_insn_id]:
+                                if writer in kernel.temporary_variables:
+                                    writer = kernel.temporary_variables[writer]
+                                else:
+                                    writer = kernel.arg_dict[writer]
+                            if any(isinstance(dim, VectorArrayDimTag)
+                                  for dim in writer.dim_tags):
+                                # this is a vector assignment
+                                ilp_inames.add(iname)
+                        else:
+                            # default to vector assignment
+                            ilp_inames.add(iname)
+
+                ilp_inames = frozenset(ilp_inames)
             else:
                 if not isinstance(
                         kernel.iname_to_tag.get(iname),
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 871996bb7..306ab59a6 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2847,6 +2847,42 @@ def test_explicit_simd_shuffles(ctx_factory):
                         answer, True)
 
 
+def test_explicit_simd_temporary_promotion(ctx_factory):
+    from loopy.kernel.data import temp_var_scope as scopes
+
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    # fun with vector temporaries
+
+    # first broken case -- incorrect promotion of temporaries to vector dtypes
+
+    knl = lp.make_kernel(
+        '{[i,j]: 0 <= i,j < 12}',
+        """
+        for j
+            for i
+                <int32> test = mask[i]
+                if test
+                    a[i, j] = 1
+                end
+            end
+        end
+        """,
+        [lp.GlobalArg('a', shape=(12, 12)),
+         lp.TemporaryVariable('mask', shape=(12,), initializer=np.array(
+                              np.arange(12) >= 6, dtype=np.int), read_only=True,
+                              scope=scopes.GLOBAL)])
+
+    knl = lp.split_iname(knl, 'j', 4, inner_tag='vec')
+    knl = lp.split_array_axis(knl, 'a', 1, 4)
+    knl = lp.tag_array_axes(knl, 'a', 'N1,N0,vec')
+
+    ans = np.zeros((12, 3, 4))
+    ans[6:, :, :] = 1
+    assert np.array_equal(knl(queue, a=np.zeros((12, 3, 4)))[1][0], ans)
+
+
 def test_check_for_variable_access_ordering():
     knl = lp.make_kernel(
             "{[i]: 0<=i<n}",
-- 
GitLab


From a349c816d699a726934febc9166566a59379ee86 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Sat, 17 Mar 2018 12:33:14 -0400
Subject: [PATCH 048/144] Revert "attempt to avoid incorrect vector promotion
 of temporary variables"

This reverts commit 96317d0059f2f6b4edf8af878d0f677bbe727102.
---
 loopy/kernel/__init__.py | 31 -------------------------------
 loopy/transform/ilp.py   | 32 +++++++-------------------------
 test/test_loopy.py       | 36 ------------------------------------
 3 files changed, 7 insertions(+), 92 deletions(-)

diff --git a/loopy/kernel/__init__.py b/loopy/kernel/__init__.py
index cf269486d..32b233900 100644
--- a/loopy/kernel/__init__.py
+++ b/loopy/kernel/__init__.py
@@ -778,23 +778,6 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         return result
 
-    @memoize_method
-    def insn_to_reader_map(self):
-        """
-        :return: a dict that maps instruction names to the variables that they read
-        """
-        result = {}
-
-        admissible_vars = (
-                set(arg.name for arg in self.args)
-                | set(six.iterkeys(self.temporary_variables)))
-
-        for insn in self.instructions:
-            for var_name in insn.read_dependency_names() & admissible_vars:
-                result.setdefault(insn.id, set()).add(var_name)
-
-        return result
-
     @memoize_method
     def writer_map(self):
         """
@@ -809,20 +792,6 @@ class LoopKernel(ImmutableRecordWithoutPickling):
 
         return result
 
-    @memoize_method
-    def insn_to_writer_map(self):
-        """
-        :return: a dict that maps instruction names to the variables that they write
-            to
-        """
-        result = {}
-
-        for insn in self.instructions:
-            for var_name in insn.assignee_var_names():
-                result.setdefault(insn.id, set()).add(var_name)
-
-        return result
-
     @memoize_method
     def get_read_variables(self):
         result = set()
diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py
index ac19d61af..0ac71d603 100644
--- a/loopy/transform/ilp.py
+++ b/loopy/transform/ilp.py
@@ -65,13 +65,12 @@ class ExtraInameIndexInserter(IdentityMapper):
 
 
 def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
-    logger.debug("%s: add axes to temporaries for ilp / vec" % kernel.name)
+    if iname is not None:
+        logger.debug("%s: add axes to temporaries for ilp" % kernel.name)
 
     wmap = kernel.writer_map()
-    itr_map = kernel.insn_to_reader_map()
 
     from loopy.kernel.data import IlpBaseTag, VectorizeTag
-    from loopy.kernel.array import VectorArrayDimTag
 
     var_to_new_ilp_inames = {}
 
@@ -82,28 +81,11 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
             writer_insn = kernel.id_to_insn[writer_insn_id]
 
             if iname is None:
-                ilp_inames = set()
-                for iname in kernel.insn_inames(writer_insn):
-                    if isinstance(kernel.iname_to_tag.get(iname), IlpBaseTag):
-                        ilp_inames.add(iname)
-                    elif isinstance(kernel.iname_to_tag.get(iname), VectorizeTag):
-                        if itr_map[writer_insn_id]:
-                            # check all things that write to this temporary to see
-                            # if we can glean the intended ilp/vectorness
-                            for writer in itr_map[writer_insn_id]:
-                                if writer in kernel.temporary_variables:
-                                    writer = kernel.temporary_variables[writer]
-                                else:
-                                    writer = kernel.arg_dict[writer]
-                            if any(isinstance(dim, VectorArrayDimTag)
-                                  for dim in writer.dim_tags):
-                                # this is a vector assignment
-                                ilp_inames.add(iname)
-                        else:
-                            # default to vector assignment
-                            ilp_inames.add(iname)
-
-                ilp_inames = frozenset(ilp_inames)
+                ilp_inames = frozenset(iname
+                        for iname in kernel.insn_inames(writer_insn)
+                        if isinstance(
+                            kernel.iname_to_tag.get(iname),
+                            (IlpBaseTag, VectorizeTag)))
             else:
                 if not isinstance(
                         kernel.iname_to_tag.get(iname),
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 306ab59a6..871996bb7 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2847,42 +2847,6 @@ def test_explicit_simd_shuffles(ctx_factory):
                         answer, True)
 
 
-def test_explicit_simd_temporary_promotion(ctx_factory):
-    from loopy.kernel.data import temp_var_scope as scopes
-
-    ctx = ctx_factory()
-    queue = cl.CommandQueue(ctx)
-
-    # fun with vector temporaries
-
-    # first broken case -- incorrect promotion of temporaries to vector dtypes
-
-    knl = lp.make_kernel(
-        '{[i,j]: 0 <= i,j < 12}',
-        """
-        for j
-            for i
-                <int32> test = mask[i]
-                if test
-                    a[i, j] = 1
-                end
-            end
-        end
-        """,
-        [lp.GlobalArg('a', shape=(12, 12)),
-         lp.TemporaryVariable('mask', shape=(12,), initializer=np.array(
-                              np.arange(12) >= 6, dtype=np.int), read_only=True,
-                              scope=scopes.GLOBAL)])
-
-    knl = lp.split_iname(knl, 'j', 4, inner_tag='vec')
-    knl = lp.split_array_axis(knl, 'a', 1, 4)
-    knl = lp.tag_array_axes(knl, 'a', 'N1,N0,vec')
-
-    ans = np.zeros((12, 3, 4))
-    ans[6:, :, :] = 1
-    assert np.array_equal(knl(queue, a=np.zeros((12, 3, 4)))[1][0], ans)
-
-
 def test_check_for_variable_access_ordering():
     knl = lp.make_kernel(
             "{[i]: 0<=i<n}",
-- 
GitLab


From be7ed95286df35ea74485d02c6960735b7bf9fb8 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Sat, 17 Mar 2018 14:29:02 -0400
Subject: [PATCH 049/144] add ability to force temporary to be a scalar even if
 created in vector-loop & test

---
 loopy/kernel/creation.py    |  8 ++++++-
 loopy/kernel/data.py        | 36 ++++++++++++++++++++++++++++-
 loopy/kernel/instruction.py | 29 ++++++++++++++++++++---
 loopy/symbolic.py           | 35 ++++++++++++++++++++++++----
 loopy/transform/ilp.py      |  5 +++-
 test/test_loopy.py          | 46 +++++++++++++++++++++++++++++++++++++
 6 files changed, 148 insertions(+), 11 deletions(-)

diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index 0daf327f4..e84562255 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -482,6 +482,7 @@ def parse_insn(groups, insn_options):
     new_lhs = []
     assignee_names = []
 
+    force_scalar = False
     for lhs_i in lhs:
         if isinstance(lhs_i, TypeAnnotation):
             if lhs_i.type is None:
@@ -489,6 +490,9 @@ def parse_insn(groups, insn_options):
             else:
                 temp_var_types.append(lhs_i.type)
 
+            if lhs_i.force_scalar:
+                force_scalar = True
+
             lhs_i = lhs_i.child
         else:
             temp_var_types.append(None)
@@ -528,6 +532,7 @@ def parse_insn(groups, insn_options):
                     intern(insn_id)
                     if isinstance(insn_id, str)
                     else insn_id),
+                force_scalar=force_scalar,
                 **insn_options)
 
     from loopy.kernel.instruction import make_assignment
@@ -1445,7 +1450,8 @@ def create_temporaries(knl, default_order):
                         base_indices=lp.auto,
                         shape=lp.auto,
                         order=default_order,
-                        target=knl.target)
+                        target=knl.target,
+                        force_scalar=insn.force_scalar)
 
                 if isinstance(insn, Assignment):
                     insn = insn.copy(temp_var_type=None)
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index c90e8a64b..d06afc77f 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -377,6 +377,18 @@ class TemporaryVariable(ArrayBase):
         the temporary as a ``restrict`` const pointer to the base storage
         memory location. If *True*, the restrict part is omitted on this
         declaration.
+
+    .. attribute:: force_scalar
+
+        If True, this temporary variable is created as an assignee, and will be a
+        scalar variable, regardless of the vector status of the instruction that
+        assigns to it.
+
+        .. note::
+
+            This is useful for OpenCL code-generation, to allow for if-statements
+            that do not depend on a vector temporary (which causes compilation
+            failures).
     """
 
     min_target_axes = 0
@@ -390,13 +402,15 @@ class TemporaryVariable(ArrayBase):
             "initializer",
             "read_only",
             "_base_storage_access_may_be_aliasing",
+            "force_scalar"
             ]
 
     def __init__(self, name, dtype=None, shape=(), scope=auto,
             dim_tags=None, offset=0, dim_names=None, strides=None, order=None,
             base_indices=None, storage_shape=None,
             base_storage=None, initializer=None, read_only=False,
-            _base_storage_access_may_be_aliasing=False, **kwargs):
+            _base_storage_access_may_be_aliasing=False,
+            force_scalar=False, **kwargs):
         """
         :arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype`
         :arg shape: :class:`loopy.auto` or a shape tuple
@@ -411,6 +425,11 @@ class TemporaryVariable(ArrayBase):
                         "temporary variable '%s': "
                         "offset must be 0 if initializer specified"
                         % name)
+            if force_scalar:
+                raise LoopyError(
+                        "temporary variable '%s': "
+                        "cannot specify force_scalar if initializer is specified"
+                        % name)
 
             from loopy.types import NumpyType, to_loopy_type
             if dtype is auto or dtype is None:
@@ -444,6 +463,12 @@ class TemporaryVariable(ArrayBase):
                     "are not currently supported "
                     "(did you mean to set read_only=True?)"
                     % name)
+        elif read_only and force_scalar:
+            raise LoopyError(
+                "temporary variable '%s': "
+                "cannot specify force_scalar for a read_only variable, force_scalar "
+                "applies only to temporary variables resulting from assignments."
+                % name)
 
         if base_storage is not None and initializer is not None:
             raise LoopyError(
@@ -459,6 +484,12 @@ class TemporaryVariable(ArrayBase):
                     "base_storage given!"
                     % name)
 
+        if base_storage is not None and force_scalar:
+            raise LoopyError(
+                "temporary variable '%s': "
+                "cannot specify force_scalar if base_storage is supplied."
+                % name)
+
         ArrayBase.__init__(self, name=intern(name),
                 dtype=dtype, shape=shape, strides=strides,
                 dim_tags=dim_tags, offset=offset, dim_names=dim_names,
@@ -470,6 +501,7 @@ class TemporaryVariable(ArrayBase):
                 read_only=read_only,
                 _base_storage_access_may_be_aliasing=(
                     _base_storage_access_may_be_aliasing),
+                force_scalar=force_scalar,
                 **kwargs)
 
     @property
@@ -534,6 +566,7 @@ class TemporaryVariable(ArrayBase):
                 and self.read_only == other.read_only
                 and (self._base_storage_access_may_be_aliasing
                     == other._base_storage_access_may_be_aliasing)
+                and (self.force_scalar == other.force_scalar)
                 )
 
     def update_persistent_hash(self, key_hash, key_builder):
@@ -555,6 +588,7 @@ class TemporaryVariable(ArrayBase):
 
         key_builder.rec(key_hash, self.read_only)
         key_builder.rec(key_hash, self._base_storage_access_may_be_aliasing)
+        key_builder.rec(key_hash, self.force_scalar)
 
 # }}}
 
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 95001c78b..aaa32d965 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -785,11 +785,22 @@ class Assignment(MultiAssignmentBase):
                 EVALUATE ztemp_new = f(ztemp_old) + a
             WHILE compare_and_swap(z[i], ztemp_new, ztemp_old) did not succeed
 
+    .. attribute:: force_scalar
+
+        If True, temporary variable created from the assignee will be a scalar
+        variable, regardless of the vector status of the instruction.
+
+        .. note::
+
+            This is useful for OpenCL code-generation, to allow for if-statements
+            that do not depend on a vector temporary (which causes compilation
+            failures).
+
     .. automethod:: __init__
     """
 
     fields = MultiAssignmentBase.fields | \
-            set("assignee temp_var_type atomicity".split())
+            set("assignee temp_var_type atomicity force_scalar".split())
     pymbolic_fields = MultiAssignmentBase.pymbolic_fields | set(["assignee"])
 
     def __init__(self,
@@ -806,7 +817,8 @@ class Assignment(MultiAssignmentBase):
             temp_var_type=None, atomicity=(),
             priority=0, predicates=frozenset(),
             insn_deps=None, insn_deps_is_final=None,
-            forced_iname_deps=None, forced_iname_deps_is_final=None):
+            forced_iname_deps=None, forced_iname_deps_is_final=None,
+            force_scalar=False):
 
         super(Assignment, self).__init__(
                 id=id,
@@ -842,6 +854,7 @@ class Assignment(MultiAssignmentBase):
         self.expression = expression
         self.temp_var_type = temp_var_type
         self.atomicity = atomicity
+        self.force_scalar = force_scalar
 
     # {{{ implement InstructionBase interface
 
@@ -857,7 +870,8 @@ class Assignment(MultiAssignmentBase):
                 assignee=f(self.assignee, *args),
                 expression=f(self.expression, *args),
                 predicates=frozenset(
-                    f(pred, *args) for pred in self.predicates))
+                    f(pred, *args) for pred in self.predicates),
+                force_scalar=self.force_scalar)
 
     # }}}
 
@@ -1032,6 +1046,11 @@ class CallInstruction(MultiAssignmentBase):
         # issue altogether by disallowing atomicity.
         return ()
 
+    @property
+    def force_scalar(self):
+        # unified interface with Assignment
+        return False
+
 # }}}
 
 
@@ -1048,6 +1067,10 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs):
             raise LoopyError("right-hand side in multiple assignment must be "
                     "function call or reduction, got: '%s'" % expression)
 
+        if kwargs.pop('force_scalar', False):
+            raise LoopyError("Force scalar option cannot be used with multiple "
+                             "assigments.")
+
         return CallInstruction(
                 assignees=assignees,
                 expression=expression,
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 0cc8f4ba6..a4e8e8d79 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -104,7 +104,7 @@ class IdentityMapperMixin(object):
         return expr
 
     def map_type_annotation(self, expr, *args):
-        return type(expr)(expr.type, self.rec(expr.child))
+        return type(expr)(expr.type, self.rec(expr.child), expr.force_scalar)
 
     map_type_cast = map_type_annotation
 
@@ -416,13 +416,14 @@ class TypeAnnotation(p.Expression):
     assignments that create temporaries.
     """
 
-    def __init__(self, type, child):
+    def __init__(self, type, child, force_scalar=False):
         super(TypeAnnotation, self).__init__()
         self.type = type
         self.child = child
+        self.force_scalar = force_scalar
 
     def __getinitargs__(self):
-        return (self.type, self.child)
+        return (self.type, self.child, self.force_scalar)
 
     def stringifier(self):
         return StringifyMapper
@@ -1128,22 +1129,46 @@ class LoopyParser(ParserBase):
             return float(val)  # generic float
 
     def parse_prefix(self, pstate):
-        from pymbolic.parser import _PREC_UNARY, _less, _greater, _identifier
+        from pymbolic.parser import _PREC_UNARY, _less, _greater, _identifier, _colon
         if pstate.is_next(_less):
             pstate.advance()
+            force_scalar = None
             if pstate.is_next(_greater):
                 typename = None
                 pstate.advance()
+            elif pstate.is_next(_colon):
+                # force scalar specified
+                typename = None
+                pstate.advance()
+                pstate.expect(_identifier)
+                force_scalar = pstate.next_str_and_advance()
+                pstate.expect(_greater)
+                pstate.advance()
             else:
                 pstate.expect(_identifier)
                 typename = pstate.next_str()
                 pstate.advance()
+                force_scalar = None
+                # check for force scalar
+                if pstate.is_next(_colon):
+                    pstate.advance()
+                    pstate.expect(_identifier)
+                    force_scalar = pstate.next_str()
+                    pstate.advance()
+
                 pstate.expect(_greater)
                 pstate.advance()
 
+            if force_scalar:
+                if force_scalar != 's':
+                    raise TypeError("Cannot force assignment to type '{}'"
+                                    "did you mean, 's' (scalar)?" % force_scalar)
+                force_scalar = True
+
             return TypeAnnotation(
                     typename,
-                    self.parse_expression(pstate, _PREC_UNARY))
+                    self.parse_expression(pstate, _PREC_UNARY),
+                    force_scalar=force_scalar)
         else:
             return super(LoopyParser, self).parse_prefix(pstate)
 
diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py
index 0ac71d603..597aa4472 100644
--- a/loopy/transform/ilp.py
+++ b/loopy/transform/ilp.py
@@ -85,7 +85,10 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
                         for iname in kernel.insn_inames(writer_insn)
                         if isinstance(
                             kernel.iname_to_tag.get(iname),
-                            (IlpBaseTag, VectorizeTag)))
+                            (IlpBaseTag, VectorizeTag))
+                        and not (tv.force_scalar and isinstance(
+                            kernel.iname_to_tag.get(iname), VectorizeTag))
+                        )
             else:
                 if not isinstance(
                         kernel.iname_to_tag.get(iname),
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 871996bb7..bbf39d59f 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2847,6 +2847,52 @@ def test_explicit_simd_shuffles(ctx_factory):
                         answer, True)
 
 
+def test_explicit_simd_temporary_promotion(ctx_factory):
+    from loopy.kernel.data import temp_var_scope as scopes
+
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    # fun with vector temporaries
+
+    # first, test parsing
+    knl = lp.make_kernel(
+        '{[i,j]: 0 <= i,j < 12}',
+        """
+        <> t = 1
+        <int32> t1 = 1
+        <int32:s> t2 = 1
+        <:s> t3 = 1
+        """)
+
+    # first broken case -- incorrect promotion of temporaries to vector dtypes
+
+    knl = lp.make_kernel(
+        '{[i,j]: 0 <= i,j < 12}',
+        """
+        for j
+            for i
+                <:s> test = mask[i]
+                if test
+                    a[i, j] = 1
+                end
+            end
+        end
+        """,
+        [lp.GlobalArg('a', shape=(12, 12)),
+         lp.TemporaryVariable('mask', shape=(12,), initializer=np.array(
+                              np.arange(12) >= 6, dtype=np.int), read_only=True,
+                              scope=scopes.GLOBAL)])
+
+    knl = lp.split_iname(knl, 'j', 4, inner_tag='vec')
+    knl = lp.split_array_axis(knl, 'a', 1, 4)
+    knl = lp.tag_array_axes(knl, 'a', 'N1,N0,vec')
+
+    ans = np.zeros((12, 3, 4))
+    ans[6:, :, :] = 1
+    assert np.array_equal(knl(queue, a=np.zeros((12, 3, 4)))[1][0], ans)
+
+
 def test_check_for_variable_access_ordering():
     knl = lp.make_kernel(
             "{[i]: 0<=i<n}",
-- 
GitLab


From 581749b665fb42f42f372513bf93c475c52b7317 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Sun, 18 Mar 2018 12:40:56 -0400
Subject: [PATCH 050/144] compat. w/ TypeCast

---
 loopy/symbolic.py | 5 ++++-
 1 file changed, 4 insertions(+), 1 deletion(-)

diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index a4e8e8d79..a1fe8d29e 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -104,7 +104,10 @@ class IdentityMapperMixin(object):
         return expr
 
     def map_type_annotation(self, expr, *args):
-        return type(expr)(expr.type, self.rec(expr.child), expr.force_scalar)
+        kwargs = {}
+        if isinstance(expr, TypeAnnotation):
+            kwargs['force_scalar'] = expr.force_scalar
+        return type(expr)(expr.type, self.rec(expr.child), **kwargs)
 
     map_type_cast = map_type_annotation
 
-- 
GitLab


From d9e6f772a16b058bd2d6adf2a365edeea7440790 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 23 Mar 2018 10:52:30 -0400
Subject: [PATCH 051/144] first pass at writer heuristic for ILP/vector iname
 expansion

---
 loopy/transform/ilp.py | 38 ++++++++++++++++++--------------------
 1 file changed, 18 insertions(+), 20 deletions(-)

diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py
index 597aa4472..a678b6b6e 100644
--- a/loopy/transform/ilp.py
+++ b/loopy/transform/ilp.py
@@ -65,40 +65,38 @@ class ExtraInameIndexInserter(IdentityMapper):
 
 
 def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
-    if iname is not None:
-        logger.debug("%s: add axes to temporaries for ilp" % kernel.name)
+    logger.debug("%s: add axes to temporaries for ilp%s" % (
+        kernel.name, '' if iname is not None else '/vec'))
 
     wmap = kernel.writer_map()
 
     from loopy.kernel.data import IlpBaseTag, VectorizeTag
+    from loopy.symbolic import get_dependencies
 
     var_to_new_ilp_inames = {}
 
+    def find_ilp_inames(writer_insn, iname, raise_on_missing=False):
+        # test that -- a) the iname is an ILP or vector tag
+        if isinstance(kernel.iname_to_tag.get(iname), (IlpBaseTag, VectorizeTag)):
+            # and b) instruction depends on the ILP/vector iname
+            return set([iname]) & (get_dependencies(writer_insn.expression) |
+                                   get_dependencies(writer_insn.assignee))
+        elif raise_on_missing:
+            raise LoopyError("'%s' is not an ILP iname" % iname)
+        return set()
+
     # {{{ find variables that need extra indices
 
     for tv in six.itervalues(kernel.temporary_variables):
         for writer_insn_id in wmap.get(tv.name, []):
             writer_insn = kernel.id_to_insn[writer_insn_id]
 
-            if iname is None:
-                ilp_inames = frozenset(iname
-                        for iname in kernel.insn_inames(writer_insn)
-                        if isinstance(
-                            kernel.iname_to_tag.get(iname),
-                            (IlpBaseTag, VectorizeTag))
-                        and not (tv.force_scalar and isinstance(
-                            kernel.iname_to_tag.get(iname), VectorizeTag))
-                        )
-            else:
-                if not isinstance(
-                        kernel.iname_to_tag.get(iname),
-                        (IlpBaseTag, VectorizeTag)):
-                    raise LoopyError(
-                            "'%s' is not an ILP iname"
-                            % iname)
-
-                ilp_inames = frozenset([iname])
+            test_inames = kernel.insn_inames(writer_insn) if iname is None else iname
+            ilp_inames = set()
+            for ti in test_inames:
+                ilp_inames |= find_ilp_inames(writer_insn, ti, iname is not None)
 
+            ilp_inames = frozenset(ilp_inames)
             referenced_ilp_inames = (ilp_inames
                     & writer_insn.write_dependency_names())
 
-- 
GitLab


From 61b72d5a04b0ebb9d0a18639010750610eb0a17a Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 23 Mar 2018 11:24:11 -0400
Subject: [PATCH 052/144] add force vector analogous to scalar

---
 loopy/kernel/creation.py    |  4 ++++
 loopy/kernel/instruction.py | 28 +++++++++++++++++++++++++---
 loopy/symbolic.py           | 31 +++++++++++++++++++------------
 3 files changed, 48 insertions(+), 15 deletions(-)

diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index e84562255..f533caa8b 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -483,6 +483,7 @@ def parse_insn(groups, insn_options):
     assignee_names = []
 
     force_scalar = False
+    force_vector = False
     for lhs_i in lhs:
         if isinstance(lhs_i, TypeAnnotation):
             if lhs_i.type is None:
@@ -492,6 +493,8 @@ def parse_insn(groups, insn_options):
 
             if lhs_i.force_scalar:
                 force_scalar = True
+            elif lhs_i.force_vector:
+                force_vector = True
 
             lhs_i = lhs_i.child
         else:
@@ -533,6 +536,7 @@ def parse_insn(groups, insn_options):
                     if isinstance(insn_id, str)
                     else insn_id),
                 force_scalar=force_scalar,
+                force_vector=force_vector,
                 **insn_options)
 
     from loopy.kernel.instruction import make_assignment
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index aaa32d965..b8941c12c 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -788,7 +788,18 @@ class Assignment(MultiAssignmentBase):
     .. attribute:: force_scalar
 
         If True, temporary variable created from the assignee will be a scalar
-        variable, regardless of the vector status of the instruction.
+        variable, regardless of the vector status of this assignment.
+
+        .. note::
+
+            This is useful for OpenCL code-generation, to allow for if-statements
+            that do not depend on a vector temporary (which causes compilation
+            failures).
+
+    .. attribute:: force_vector
+
+        If True, temporary variable created from the assignee will be a vector
+        variable, regardless of the vector status of this assignment.
 
         .. note::
 
@@ -800,7 +811,7 @@ class Assignment(MultiAssignmentBase):
     """
 
     fields = MultiAssignmentBase.fields | \
-            set("assignee temp_var_type atomicity force_scalar".split())
+            set("assignee temp_var_type atomicity force_scalar force_vector".split())
     pymbolic_fields = MultiAssignmentBase.pymbolic_fields | set(["assignee"])
 
     def __init__(self,
@@ -818,7 +829,8 @@ class Assignment(MultiAssignmentBase):
             priority=0, predicates=frozenset(),
             insn_deps=None, insn_deps_is_final=None,
             forced_iname_deps=None, forced_iname_deps_is_final=None,
-            force_scalar=False):
+            force_scalar=False,
+            force_vector=False):
 
         super(Assignment, self).__init__(
                 id=id,
@@ -855,6 +867,7 @@ class Assignment(MultiAssignmentBase):
         self.temp_var_type = temp_var_type
         self.atomicity = atomicity
         self.force_scalar = force_scalar
+        self.force_vector = force_vector
 
     # {{{ implement InstructionBase interface
 
@@ -1051,6 +1064,11 @@ class CallInstruction(MultiAssignmentBase):
         # unified interface with Assignment
         return False
 
+    @property
+    def force_vector(self):
+        # unified interface with Assignment
+        return False
+
 # }}}
 
 
@@ -1071,6 +1089,10 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs):
             raise LoopyError("Force scalar option cannot be used with multiple "
                              "assigments.")
 
+        if kwargs.pop('force_vector', False):
+            raise LoopyError("Force vector option cannot be used with multiple "
+                             "assigments.")
+
         return CallInstruction(
                 assignees=assignees,
                 expression=expression,
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index a1fe8d29e..0910301b1 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -107,6 +107,7 @@ class IdentityMapperMixin(object):
         kwargs = {}
         if isinstance(expr, TypeAnnotation):
             kwargs['force_scalar'] = expr.force_scalar
+            kwargs['force_vector'] = expr.force_vector
         return type(expr)(expr.type, self.rec(expr.child), **kwargs)
 
     map_type_cast = map_type_annotation
@@ -419,14 +420,19 @@ class TypeAnnotation(p.Expression):
     assignments that create temporaries.
     """
 
-    def __init__(self, type, child, force_scalar=False):
+    def __init__(self, type, child, force_scalar=False, force_vector=False):
         super(TypeAnnotation, self).__init__()
         self.type = type
         self.child = child
         self.force_scalar = force_scalar
+        self.force_vector = force_vector
+
+        if (self.force_scalar and self.force_vector):
+            raise TypeError('A type annotation cannot simultaneously be forced to '
+                            'both scalar and vector types')
 
     def __getinitargs__(self):
-        return (self.type, self.child, self.force_scalar)
+        return (self.type, self.child, self.force_scalar, self.force_vector)
 
     def stringifier(self):
         return StringifyMapper
@@ -1135,7 +1141,7 @@ class LoopyParser(ParserBase):
         from pymbolic.parser import _PREC_UNARY, _less, _greater, _identifier, _colon
         if pstate.is_next(_less):
             pstate.advance()
-            force_scalar = None
+            force_type = None
             if pstate.is_next(_greater):
                 typename = None
                 pstate.advance()
@@ -1144,34 +1150,35 @@ class LoopyParser(ParserBase):
                 typename = None
                 pstate.advance()
                 pstate.expect(_identifier)
-                force_scalar = pstate.next_str_and_advance()
+                scalar_or_vec = pstate.next_str_and_advance()
                 pstate.expect(_greater)
                 pstate.advance()
             else:
                 pstate.expect(_identifier)
                 typename = pstate.next_str()
                 pstate.advance()
-                force_scalar = None
-                # check for force scalar
+                # check for scalar / vector specification
                 if pstate.is_next(_colon):
                     pstate.advance()
                     pstate.expect(_identifier)
-                    force_scalar = pstate.next_str()
+                    scalar_or_vec = pstate.next_str()
                     pstate.advance()
 
                 pstate.expect(_greater)
                 pstate.advance()
 
-            if force_scalar:
-                if force_scalar != 's':
+            if scalar_or_vec:
+                if scalar_or_vec not in ['s', 'v']:
                     raise TypeError("Cannot force assignment to type '{}'"
-                                    "did you mean, 's' (scalar)?" % force_scalar)
-                force_scalar = True
+                                    "did you mean, 's' (scalar) or 'v' (vector)?" %
+                                    scalar_or_vec)
+                force_type = scalar_or_vec
 
             return TypeAnnotation(
                     typename,
                     self.parse_expression(pstate, _PREC_UNARY),
-                    force_scalar=force_scalar)
+                    force_scalar=force_type == 's',
+                    force_vector=force_type == 'v')
         else:
             return super(LoopyParser, self).parse_prefix(pstate)
 
-- 
GitLab


From 8b65bd0346271f502da73ec1c3f63d5048cbed4a Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 23 Mar 2018 11:24:57 -0400
Subject: [PATCH 053/144] update

---
 loopy/transform/ilp.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py
index a678b6b6e..bf58dbe29 100644
--- a/loopy/transform/ilp.py
+++ b/loopy/transform/ilp.py
@@ -78,6 +78,11 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
     def find_ilp_inames(writer_insn, iname, raise_on_missing=False):
         # test that -- a) the iname is an ILP or vector tag
         if isinstance(kernel.iname_to_tag.get(iname), (IlpBaseTag, VectorizeTag)):
+            # check for user specified type
+            if writer_insn.force_scalar:
+                return set()
+            elif writer_insn.force_vector:
+                return set([iname])
             # and b) instruction depends on the ILP/vector iname
             return set([iname]) & (get_dependencies(writer_insn.expression) |
                                    get_dependencies(writer_insn.assignee))
-- 
GitLab


From a9b7447f89f8404c72669221fbb3a28ed8316cc6 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 23 Mar 2018 11:27:04 -0400
Subject: [PATCH 054/144] fix parsing

---
 loopy/symbolic.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 0910301b1..a25d9a30d 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -1141,7 +1141,7 @@ class LoopyParser(ParserBase):
         from pymbolic.parser import _PREC_UNARY, _less, _greater, _identifier, _colon
         if pstate.is_next(_less):
             pstate.advance()
-            force_type = None
+            scalar_or_vec = None
             if pstate.is_next(_greater):
                 typename = None
                 pstate.advance()
@@ -1172,13 +1172,12 @@ class LoopyParser(ParserBase):
                     raise TypeError("Cannot force assignment to type '{}'"
                                     "did you mean, 's' (scalar) or 'v' (vector)?" %
                                     scalar_or_vec)
-                force_type = scalar_or_vec
 
             return TypeAnnotation(
                     typename,
                     self.parse_expression(pstate, _PREC_UNARY),
-                    force_scalar=force_type == 's',
-                    force_vector=force_type == 'v')
+                    force_scalar=scalar_or_vec == 's',
+                    force_vector=scalar_or_vec == 'v')
         else:
             return super(LoopyParser, self).parse_prefix(pstate)
 
-- 
GitLab


From e40b0a0bd3a72de22e8f9ca633568b665f100e29 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 23 Mar 2018 11:55:56 -0400
Subject: [PATCH 055/144] make conflict detection smarter

---
 loopy/transform/ilp.py | 21 +++++++++++++++------
 1 file changed, 15 insertions(+), 6 deletions(-)

diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py
index bf58dbe29..63cbbfc79 100644
--- a/loopy/transform/ilp.py
+++ b/loopy/transform/ilp.py
@@ -108,15 +108,24 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
             new_ilp_inames = ilp_inames - referenced_ilp_inames
 
             if not new_ilp_inames:
-                break
+                continue
 
             if tv.name in var_to_new_ilp_inames:
                 if new_ilp_inames != set(var_to_new_ilp_inames[tv.name]):
-                    raise LoopyError("instruction '%s' requires adding "
-                            "indices for ILP inames '%s' on var '%s', but previous "
-                            "instructions required inames '%s'"
-                            % (writer_insn_id, ", ".join(new_ilp_inames),
-                                ", ".join(var_to_new_ilp_inames[tv.name])))
+                    # either 1) the previous iname were empty -> upgrade
+                    if not set(var_to_new_ilp_inames[tv.name]):
+                        logger.debug("Expanding vector/ILP inames considered for "
+                                     "var '%s' from empty set to '%s' for insn '%s'"
+                                     % (tv.name, ", ".join(new_ilp_inames),
+                                        writer_insn_id))
+                    else:
+                        # or 2) there is a conflict
+                        raise LoopyError("instruction '%s' requires adding "
+                                "indices for vector/ILP inames '%s' on var '%s', "
+                                "but previous instructions required inames '%s'"
+                                % (writer_insn_id, ", ".join(new_ilp_inames),
+                                    tv.name, ", ".join(
+                                        var_to_new_ilp_inames[tv.name])))
 
                 continue
 
-- 
GitLab


From 8dabb18e51d6111a826060b9576f8ab59bd81d75 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 23 Mar 2018 13:26:27 -0400
Subject: [PATCH 056/144] unified interface

---
 loopy/kernel/instruction.py | 11 +++++++++++
 1 file changed, 11 insertions(+)

diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index b8941c12c..a571716bc 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -1277,6 +1277,17 @@ class CInstruction(InstructionBase):
         return first_line + "\n    " + "\n    ".join(
                 self.code.split("\n"))
 
+    @property
+    def force_scalar(self):
+        # unified interface with Assignment
+        return False
+
+    @property
+    def force_vector(self):
+        # unified interface with Assignment
+        return False
+
+
 # }}}
 
 
-- 
GitLab


From 8ebed2df4b055da36652492a8f0731625b489e73 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 23 Mar 2018 13:26:42 -0400
Subject: [PATCH 057/144] update heuristic

---
 loopy/transform/ilp.py | 33 ++++++++++++++++-----------------
 1 file changed, 16 insertions(+), 17 deletions(-)

diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py
index 63cbbfc79..cbebb8536 100644
--- a/loopy/transform/ilp.py
+++ b/loopy/transform/ilp.py
@@ -71,7 +71,6 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
     wmap = kernel.writer_map()
 
     from loopy.kernel.data import IlpBaseTag, VectorizeTag
-    from loopy.symbolic import get_dependencies
 
     var_to_new_ilp_inames = {}
 
@@ -84,8 +83,7 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
             elif writer_insn.force_vector:
                 return set([iname])
             # and b) instruction depends on the ILP/vector iname
-            return set([iname]) & (get_dependencies(writer_insn.expression) |
-                                   get_dependencies(writer_insn.assignee))
+            return set([iname]) & writer_insn.dependency_names()
         elif raise_on_missing:
             raise LoopyError("'%s' is not an ILP iname" % iname)
         return set()
@@ -107,25 +105,26 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
 
             new_ilp_inames = ilp_inames - referenced_ilp_inames
 
+            if not new_ilp_inames and writer_insn.force_scalar and \
+                    tv.name in var_to_new_ilp_inames:
+                # conflict
+                raise LoopyError("instruction '%s' requires var '%s' to be a scalar "
+                                 "but previous instructions required vector/ILP "
+                                 "inames '%s'" % (writer_insn_id, tv.name, ", ".join(
+                                        var_to_new_ilp_inames[tv.name])))
+
             if not new_ilp_inames:
                 continue
 
             if tv.name in var_to_new_ilp_inames:
                 if new_ilp_inames != set(var_to_new_ilp_inames[tv.name]):
-                    # either 1) the previous iname were empty -> upgrade
-                    if not set(var_to_new_ilp_inames[tv.name]):
-                        logger.debug("Expanding vector/ILP inames considered for "
-                                     "var '%s' from empty set to '%s' for insn '%s'"
-                                     % (tv.name, ", ".join(new_ilp_inames),
-                                        writer_insn_id))
-                    else:
-                        # or 2) there is a conflict
-                        raise LoopyError("instruction '%s' requires adding "
-                                "indices for vector/ILP inames '%s' on var '%s', "
-                                "but previous instructions required inames '%s'"
-                                % (writer_insn_id, ", ".join(new_ilp_inames),
-                                    tv.name, ", ".join(
-                                        var_to_new_ilp_inames[tv.name])))
+                    # conflict
+                    raise LoopyError("instruction '%s' requires adding "
+                            "indices for vector/ILP inames '%s' on var '%s', "
+                            "but previous instructions required inames '%s'"
+                            % (writer_insn_id, ", ".join(new_ilp_inames),
+                                tv.name, ", ".join(
+                                    var_to_new_ilp_inames[tv.name])))
 
                 continue
 
-- 
GitLab


From 8db84b379b3d2c002b15c31dc56c272b4a579fb0 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 23 Mar 2018 14:50:36 -0400
Subject: [PATCH 058/144] fix

---
 test/test_loopy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index bbf39d59f..1420893b1 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2736,7 +2736,7 @@ def test_preamble_with_separate_temporaries(ctx_factory):
 
     print(lp.generate_code(kernel)[0])
     # and call (functionality unimportant, more that it compiles)
-    ctx = cl.create_some_context()
+    ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
     # check that it actually performs the lookup correctly
     assert np.allclose(kernel(
-- 
GitLab


From 5098ade8c1f26368a827d5c43a9c34164b0f4b49 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 23 Mar 2018 14:50:47 -0400
Subject: [PATCH 059/144] update test

---
 test/test_loopy.py | 57 ++++++++++++++++++++++++++++++----------------
 1 file changed, 37 insertions(+), 20 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index 1420893b1..75827845c 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2863,34 +2863,51 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
         <int32> t1 = 1
         <int32:s> t2 = 1
         <:s> t3 = 1
+        <:v> tv = 1
+        <int32> tv1 = 1
+        <int32:v> tv2 = 1
+        <:v> tv3 = 1
         """)
 
-    # first broken case -- incorrect promotion of temporaries to vector dtypes
-
-    knl = lp.make_kernel(
-        '{[i,j]: 0 <= i,j < 12}',
-        """
-        for j
-            for i
-                <:s> test = mask[i]
-                if test
-                    a[i, j] = 1
+    def make_kernel(insn, ans=None):
+        knl = lp.make_kernel(
+            '{[i,j]: 0 <= i,j < 12}',
+            """
+            for j
+                for i
+                    %(insn)s
+                    if test
+                        a[i, j] = 1
+                    end
                 end
             end
-        end
-        """,
-        [lp.GlobalArg('a', shape=(12, 12)),
-         lp.TemporaryVariable('mask', shape=(12,), initializer=np.array(
-                              np.arange(12) >= 6, dtype=np.int), read_only=True,
-                              scope=scopes.GLOBAL)])
+            """ % dict(insn=insn),
+            [lp.GlobalArg('a', shape=(12, 12)),
+             lp.TemporaryVariable('mask', shape=(12,), initializer=np.array(
+                                  np.arange(12) >= 6, dtype=np.int), read_only=True,
+                                  scope=scopes.GLOBAL)])
+
+        knl = lp.split_iname(knl, 'j', 4, inner_tag='vec')
+        knl = lp.split_array_axis(knl, 'a', 1, 4)
+        knl = lp.tag_array_axes(knl, 'a', 'N1,N0,vec')
+        knl = lp.preprocess_kernel(knl)
 
-    knl = lp.split_iname(knl, 'j', 4, inner_tag='vec')
-    knl = lp.split_array_axis(knl, 'a', 1, 4)
-    knl = lp.tag_array_axes(knl, 'a', 'N1,N0,vec')
+        if ans is not None:
+            assert np.array_equal(knl(queue, a=np.zeros((12, 3, 4), dtype=np.int32))[
+                1][0], ans)
+
+        return knl
 
     ans = np.zeros((12, 3, 4))
     ans[6:, :, :] = 1
-    assert np.array_equal(knl(queue, a=np.zeros((12, 3, 4)))[1][0], ans)
+    # first broken case -- incorrect promotion of temporaries to vector dtypes
+    make_kernel('<> test = mask[i]', ans)
+
+    # next test the writer heuristic
+
+    # case 1) assignment from a vector iname
+    knl = make_kernel('<> test = mask[j]')
+    assert knl.temporary_variables['test'].shape == (4,)
 
 
 def test_check_for_variable_access_ordering():
-- 
GitLab


From 3e755419169550610906f8e1cdae5f024b8f72e6 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 23 Mar 2018 15:15:13 -0400
Subject: [PATCH 060/144] fix for recursive depends & test

---
 loopy/transform/ilp.py | 67 ++++++++++++++++++++++++------------------
 test/test_loopy.py     |  7 +++++
 2 files changed, 46 insertions(+), 28 deletions(-)

diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py
index cbebb8536..3f3075de7 100644
--- a/loopy/transform/ilp.py
+++ b/loopy/transform/ilp.py
@@ -71,6 +71,7 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
     wmap = kernel.writer_map()
 
     from loopy.kernel.data import IlpBaseTag, VectorizeTag
+    from loopy.kernel.tools import find_recursive_dependencies
 
     var_to_new_ilp_inames = {}
 
@@ -92,43 +93,53 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
 
     for tv in six.itervalues(kernel.temporary_variables):
         for writer_insn_id in wmap.get(tv.name, []):
-            writer_insn = kernel.id_to_insn[writer_insn_id]
+            # the instructions we have to consider here are those that directly
+            # write to this variable, and those that are recursive dependencies of
+            # this instruction
 
-            test_inames = kernel.insn_inames(writer_insn) if iname is None else iname
-            ilp_inames = set()
-            for ti in test_inames:
-                ilp_inames |= find_ilp_inames(writer_insn, ti, iname is not None)
+            writer_insns = set([writer_insn_id]) | \
+                find_recursive_dependencies(kernel, frozenset([writer_insn_id]))
 
-            ilp_inames = frozenset(ilp_inames)
-            referenced_ilp_inames = (ilp_inames
-                    & writer_insn.write_dependency_names())
+            for inner_id in writer_insns:
+                writer_insn = kernel.id_to_insn[inner_id]
 
-            new_ilp_inames = ilp_inames - referenced_ilp_inames
+                test_inames = kernel.insn_inames(writer_insn) if iname is None else \
+                    iname
+                ilp_inames = set()
+                for ti in test_inames:
+                    ilp_inames |= find_ilp_inames(writer_insn, ti, iname is not None)
 
-            if not new_ilp_inames and writer_insn.force_scalar and \
-                    tv.name in var_to_new_ilp_inames:
-                # conflict
-                raise LoopyError("instruction '%s' requires var '%s' to be a scalar "
-                                 "but previous instructions required vector/ILP "
-                                 "inames '%s'" % (writer_insn_id, tv.name, ", ".join(
-                                        var_to_new_ilp_inames[tv.name])))
+                ilp_inames = frozenset(ilp_inames)
+                referenced_ilp_inames = (ilp_inames
+                        & writer_insn.write_dependency_names())
 
-            if not new_ilp_inames:
-                continue
+                new_ilp_inames = ilp_inames - referenced_ilp_inames
 
-            if tv.name in var_to_new_ilp_inames:
-                if new_ilp_inames != set(var_to_new_ilp_inames[tv.name]):
+                if not new_ilp_inames and writer_insn.force_scalar and \
+                        tv.name in var_to_new_ilp_inames:
                     # conflict
-                    raise LoopyError("instruction '%s' requires adding "
-                            "indices for vector/ILP inames '%s' on var '%s', "
-                            "but previous instructions required inames '%s'"
-                            % (writer_insn_id, ", ".join(new_ilp_inames),
-                                tv.name, ", ".join(
-                                    var_to_new_ilp_inames[tv.name])))
+                    raise LoopyError("instruction '%s' requires var '%s' to be a "
+                                     "scalar but previous instructions required "
+                                     "vector/ILP inames '%s'" % (
+                                            inner_id, tv.name, ", ".join(
+                                                var_to_new_ilp_inames[tv.name])))
+
+                if not new_ilp_inames:
+                    continue
+
+                if tv.name in var_to_new_ilp_inames:
+                    if new_ilp_inames != set(var_to_new_ilp_inames[tv.name]):
+                        # conflict
+                        raise LoopyError("instruction '%s' requires adding "
+                                "indices for vector/ILP inames '%s' on var '%s', "
+                                "but previous instructions required inames '%s'"
+                                % (inner_id, ", ".join(new_ilp_inames),
+                                    tv.name, ", ".join(
+                                        var_to_new_ilp_inames[tv.name])))
 
-                continue
+                    continue
 
-            var_to_new_ilp_inames[tv.name] = set(new_ilp_inames)
+                var_to_new_ilp_inames[tv.name] = set(new_ilp_inames)
 
     # }}}
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 75827845c..8a7cbe836 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2909,6 +2909,13 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
     knl = make_kernel('<> test = mask[j]')
     assert knl.temporary_variables['test'].shape == (4,)
 
+    # case 2) recursive dependency
+    knl = make_kernel("""
+        <> test = mask[j]
+        <> test2 = test
+        """)
+    assert knl.temporary_variables['test2'].shape == (4,)
+
 
 def test_check_for_variable_access_ordering():
     knl = lp.make_kernel(
-- 
GitLab


From c4e65b9b81439e1ff827e890465f573fe8416b33 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 23 Mar 2018 15:20:30 -0400
Subject: [PATCH 061/144] use getattr to avoid putting this on every insn type

---
 loopy/kernel/instruction.py | 21 ---------------------
 loopy/transform/ilp.py      | 12 +++++++++---
 2 files changed, 9 insertions(+), 24 deletions(-)

diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index a571716bc..73c9cae9e 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -1059,16 +1059,6 @@ class CallInstruction(MultiAssignmentBase):
         # issue altogether by disallowing atomicity.
         return ()
 
-    @property
-    def force_scalar(self):
-        # unified interface with Assignment
-        return False
-
-    @property
-    def force_vector(self):
-        # unified interface with Assignment
-        return False
-
 # }}}
 
 
@@ -1277,17 +1267,6 @@ class CInstruction(InstructionBase):
         return first_line + "\n    " + "\n    ".join(
                 self.code.split("\n"))
 
-    @property
-    def force_scalar(self):
-        # unified interface with Assignment
-        return False
-
-    @property
-    def force_vector(self):
-        # unified interface with Assignment
-        return False
-
-
 # }}}
 
 
diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py
index 3f3075de7..26a7ade67 100644
--- a/loopy/transform/ilp.py
+++ b/loopy/transform/ilp.py
@@ -75,13 +75,19 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
 
     var_to_new_ilp_inames = {}
 
+    def force_scalar(insn):
+        return getattr(insn, 'force_scalar', False)
+
+    def force_vector(insn):
+        return getattr(insn, 'force_vector', False)
+
     def find_ilp_inames(writer_insn, iname, raise_on_missing=False):
         # test that -- a) the iname is an ILP or vector tag
         if isinstance(kernel.iname_to_tag.get(iname), (IlpBaseTag, VectorizeTag)):
             # check for user specified type
-            if writer_insn.force_scalar:
+            if force_scalar(writer_insn):
                 return set()
-            elif writer_insn.force_vector:
+            elif force_vector(writer_insn):
                 return set([iname])
             # and b) instruction depends on the ILP/vector iname
             return set([iname]) & writer_insn.dependency_names()
@@ -115,7 +121,7 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
 
                 new_ilp_inames = ilp_inames - referenced_ilp_inames
 
-                if not new_ilp_inames and writer_insn.force_scalar and \
+                if not new_ilp_inames and force_scalar(writer_insn) and \
                         tv.name in var_to_new_ilp_inames:
                     # conflict
                     raise LoopyError("instruction '%s' requires var '%s' to be a "
-- 
GitLab


From 2ce4c22cd76b7b4e5aa0e3a01a7638d35f47981b Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 23 Mar 2018 15:32:07 -0400
Subject: [PATCH 062/144] move into temporary variable

---
 loopy/kernel/creation.py |  3 ++-
 loopy/kernel/data.py     | 42 +++++++++++++++++++++++++++++++++-------
 loopy/kernel/tools.py    |  2 +-
 loopy/transform/ilp.py   | 18 +++++++----------
 4 files changed, 45 insertions(+), 20 deletions(-)

diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index f533caa8b..9540e127f 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -1455,7 +1455,8 @@ def create_temporaries(knl, default_order):
                         shape=lp.auto,
                         order=default_order,
                         target=knl.target,
-                        force_scalar=insn.force_scalar)
+                        force_scalar=getattr(insn, 'force_scalar', False),
+                        force_vector=getattr(insn, 'force_vector', False))
 
                 if isinstance(insn, Assignment):
                     insn = insn.copy(temp_var_type=None)
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index d06afc77f..fc99aa08e 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -384,6 +384,28 @@ class TemporaryVariable(ArrayBase):
         scalar variable, regardless of the vector status of the instruction that
         assigns to it.
 
+        .. note::
+
+            This is useful for OpenCL code-generation, to allow for if-statements
+            that do not depend on a vector temporary (which causes compilation
+            failures).
+
+    .. attribute:: force_scalar
+
+        If True, temporary variable created from the assignee will be a scalar
+        variable, regardless of the vector status of this assignment.
+
+        .. note::
+
+            This is useful for OpenCL code-generation, to allow for if-statements
+            that do not depend on a vector temporary (which causes compilation
+            failures).
+
+    .. attribute:: force_vector
+
+        If True, temporary variable created from the assignee will be a vector
+        variable, regardless of the vector status of this assignment.
+
         .. note::
 
             This is useful for OpenCL code-generation, to allow for if-statements
@@ -402,7 +424,8 @@ class TemporaryVariable(ArrayBase):
             "initializer",
             "read_only",
             "_base_storage_access_may_be_aliasing",
-            "force_scalar"
+            "force_scalar",
+            "force_vector"
             ]
 
     def __init__(self, name, dtype=None, shape=(), scope=auto,
@@ -410,7 +433,7 @@ class TemporaryVariable(ArrayBase):
             base_indices=None, storage_shape=None,
             base_storage=None, initializer=None, read_only=False,
             _base_storage_access_may_be_aliasing=False,
-            force_scalar=False, **kwargs):
+            force_scalar=False, force_vector=False, **kwargs):
         """
         :arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype`
         :arg shape: :class:`loopy.auto` or a shape tuple
@@ -463,11 +486,12 @@ class TemporaryVariable(ArrayBase):
                     "are not currently supported "
                     "(did you mean to set read_only=True?)"
                     % name)
-        elif read_only and force_scalar:
+        elif read_only and (force_scalar or force_vector):
             raise LoopyError(
                 "temporary variable '%s': "
-                "cannot specify force_scalar for a read_only variable, force_scalar "
-                "applies only to temporary variables resulting from assignments."
+                "cannot specify force_scalar/force_vector for a read_only variable, "
+                "as these options apply only to temporary variables resulting from "
+                "assignments."
                 % name)
 
         if base_storage is not None and initializer is not None:
@@ -484,10 +508,11 @@ class TemporaryVariable(ArrayBase):
                     "base_storage given!"
                     % name)
 
-        if base_storage is not None and force_scalar:
+        if base_storage is not None and (force_scalar or force_vector):
             raise LoopyError(
                 "temporary variable '%s': "
-                "cannot specify force_scalar if base_storage is supplied."
+                "cannot specify force_scalar/force_vector if base_storage is "
+                "supplied."
                 % name)
 
         ArrayBase.__init__(self, name=intern(name),
@@ -502,6 +527,7 @@ class TemporaryVariable(ArrayBase):
                 _base_storage_access_may_be_aliasing=(
                     _base_storage_access_may_be_aliasing),
                 force_scalar=force_scalar,
+                force_vector=force_vector,
                 **kwargs)
 
     @property
@@ -567,6 +593,7 @@ class TemporaryVariable(ArrayBase):
                 and (self._base_storage_access_may_be_aliasing
                     == other._base_storage_access_may_be_aliasing)
                 and (self.force_scalar == other.force_scalar)
+                and (self.force_vector == other.force_vector)
                 )
 
     def update_persistent_hash(self, key_hash, key_builder):
@@ -589,6 +616,7 @@ class TemporaryVariable(ArrayBase):
         key_builder.rec(key_hash, self.read_only)
         key_builder.rec(key_hash, self._base_storage_access_may_be_aliasing)
         key_builder.rec(key_hash, self.force_scalar)
+        key_builder.rec(key_hash, self.force_vector)
 
 # }}}
 
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index ec26916f3..23fa6b3ed 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -34,7 +34,7 @@ import numpy as np
 import islpy as isl
 from islpy import dim_type
 from loopy.diagnostic import LoopyError, warn_with_kernel
-from pytools import memoize_on_first_arg
+from pytools import memoize_on_first_arg, memoize_method
 from loopy.tools import natsorted
 
 import logging
diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py
index 26a7ade67..2dcc5c4e2 100644
--- a/loopy/transform/ilp.py
+++ b/loopy/transform/ilp.py
@@ -75,19 +75,14 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
 
     var_to_new_ilp_inames = {}
 
-    def force_scalar(insn):
-        return getattr(insn, 'force_scalar', False)
-
-    def force_vector(insn):
-        return getattr(insn, 'force_vector', False)
-
-    def find_ilp_inames(writer_insn, iname, raise_on_missing=False):
+    def find_ilp_inames(writer_insn, iname, temp_var,
+                        raise_on_missing=False):
         # test that -- a) the iname is an ILP or vector tag
         if isinstance(kernel.iname_to_tag.get(iname), (IlpBaseTag, VectorizeTag)):
             # check for user specified type
-            if force_scalar(writer_insn):
+            if temp_var.force_scalar:
                 return set()
-            elif force_vector(writer_insn):
+            elif temp_var.force_vector:
                 return set([iname])
             # and b) instruction depends on the ILP/vector iname
             return set([iname]) & writer_insn.dependency_names()
@@ -113,7 +108,8 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
                     iname
                 ilp_inames = set()
                 for ti in test_inames:
-                    ilp_inames |= find_ilp_inames(writer_insn, ti, iname is not None)
+                    ilp_inames |= find_ilp_inames(writer_insn, ti, tv,
+                                                  iname is not None)
 
                 ilp_inames = frozenset(ilp_inames)
                 referenced_ilp_inames = (ilp_inames
@@ -121,7 +117,7 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
 
                 new_ilp_inames = ilp_inames - referenced_ilp_inames
 
-                if not new_ilp_inames and force_scalar(writer_insn) and \
+                if not new_ilp_inames and tv.force_scalar and \
                         tv.name in var_to_new_ilp_inames:
                     # conflict
                     raise LoopyError("instruction '%s' requires var '%s' to be a "
-- 
GitLab


From c147da0463b0b793cf0d73dd20a86e3c5df0426a Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 23 Mar 2018 15:35:05 -0400
Subject: [PATCH 063/144] memoize since it'll be called fairly often

---
 loopy/kernel/tools.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 23fa6b3ed..4383025fe 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -1160,6 +1160,7 @@ def get_visual_iname_order_embedding(kernel):
 
 # {{{ find_recursive_dependencies
 
+@memoize_method
 def find_recursive_dependencies(kernel, insn_ids):
     queue = list(insn_ids)
 
-- 
GitLab


From a0b3a2c56c620571926000fa3ce99767a38456f0 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 27 Mar 2018 13:59:27 -0400
Subject: [PATCH 064/144] fix for inferring recursive dependencies only w/in
 the same inames

---
 loopy/transform/ilp.py | 34 ++++++++++++++++++++--------------
 1 file changed, 20 insertions(+), 14 deletions(-)

diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py
index 2dcc5c4e2..170cc9990 100644
--- a/loopy/transform/ilp.py
+++ b/loopy/transform/ilp.py
@@ -93,23 +93,29 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
     # {{{ find variables that need extra indices
 
     for tv in six.itervalues(kernel.temporary_variables):
-        for writer_insn_id in wmap.get(tv.name, []):
+        writer_insns = set(wmap.get(tv.name, []))
+
+        for writer_insn_id in writer_insns:
+            writer_insn = kernel.id_to_insn[writer_insn_id]
+            inner_ids = set([writer_insn_id])
             # the instructions we have to consider here are those that directly
             # write to this variable, and those that are recursive dependencies of
             # this instruction
-
-            writer_insns = set([writer_insn_id]) | \
-                find_recursive_dependencies(kernel, frozenset([writer_insn_id]))
-
-            for inner_id in writer_insns:
-                writer_insn = kernel.id_to_insn[inner_id]
-
-                test_inames = kernel.insn_inames(writer_insn) if iname is None else \
-                    iname
+            rec_deps = find_recursive_dependencies(kernel, frozenset([
+                writer_insn_id]))
+            # however, we must make sure to limit to those inames that we are
+            # actually inside of
+            inner_ids |= set([
+                x for x in rec_deps if kernel.id_to_insn[x].within_inames <=
+                writer_insn.within_inames])
+
+            for insn_id in inner_ids:
+                insn = kernel.id_to_insn[insn_id]
+                test_inames = (kernel.insn_inames(insn) if iname is None else
+                    set([iname]))
                 ilp_inames = set()
                 for ti in test_inames:
-                    ilp_inames |= find_ilp_inames(writer_insn, ti, tv,
-                                                  iname is not None)
+                    ilp_inames |= find_ilp_inames(insn, ti, tv, iname is not None)
 
                 ilp_inames = frozenset(ilp_inames)
                 referenced_ilp_inames = (ilp_inames
@@ -123,7 +129,7 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
                     raise LoopyError("instruction '%s' requires var '%s' to be a "
                                      "scalar but previous instructions required "
                                      "vector/ILP inames '%s'" % (
-                                            inner_id, tv.name, ", ".join(
+                                            insn_id, tv.name, ", ".join(
                                                 var_to_new_ilp_inames[tv.name])))
 
                 if not new_ilp_inames:
@@ -135,7 +141,7 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
                         raise LoopyError("instruction '%s' requires adding "
                                 "indices for vector/ILP inames '%s' on var '%s', "
                                 "but previous instructions required inames '%s'"
-                                % (inner_id, ", ".join(new_ilp_inames),
+                                % (insn_id, ", ".join(new_ilp_inames),
                                     tv.name, ", ".join(
                                         var_to_new_ilp_inames[tv.name])))
 
-- 
GitLab


From 414ce2a4cc7b267f1d5058438c909a6520972148 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 27 Mar 2018 14:03:55 -0400
Subject: [PATCH 065/144] more tests

---
 test/test_loopy.py | 40 +++++++++++++++++++++++++++++-----------
 1 file changed, 29 insertions(+), 11 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index 8a7cbe836..8ce92a50a 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2869,19 +2869,24 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
         <:v> tv3 = 1
         """)
 
-    def make_kernel(insn, ans=None):
-        knl = lp.make_kernel(
-            '{[i,j]: 0 <= i,j < 12}',
-            """
-            for j
-                for i
-                    %(insn)s
-                    if test
-                        a[i, j] = 1
-                    end
+    def make_kernel(insn, ans=None, preamble=None, extra_inames=None):
+        skeleton = """
+        %(preamble)s
+        for j
+            for i
+                %(insn)s
+                if test
+                    a[i, j] = 1
                 end
             end
-            """ % dict(insn=insn),
+        end
+        """
+        inames = ['i, j']
+        if extra_inames is not None:
+            inames += list(extra_inames)
+        knl = lp.make_kernel(
+            '{[%(inames)s]: 0 <= %(inames)s < 12}' % {'inames': ', '.join(inames)},
+            skeleton % dict(insn=insn, preamble='' if not preamble else preamble),
             [lp.GlobalArg('a', shape=(12, 12)),
              lp.TemporaryVariable('mask', shape=(12,), initializer=np.array(
                                   np.arange(12) >= 6, dtype=np.int), read_only=True,
@@ -2916,6 +2921,19 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
         """)
     assert knl.temporary_variables['test2'].shape == (4,)
 
+    # case 3) test that a conflict in user-specified vector types results in error
+
+    # 3a) initial scalar assignment w/ later vector access
+    preamble = """
+    for k
+        <:s> test = 1
+    end
+    """
+
+    from loopy import LoopyError
+    with pytest.raises(LoopyError):
+        make_kernel('test = mask[j]', preamble=preamble, extra_inames='k')
+
 
 def test_check_for_variable_access_ordering():
     knl = lp.make_kernel(
-- 
GitLab


From 1b81f398e503897daf3bebb58a48cbdc91321636 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 27 Mar 2018 14:14:43 -0400
Subject: [PATCH 066/144] fix sequential conflict detection & test

---
 loopy/transform/ilp.py | 6 ++++++
 test/test_loopy.py     | 2 +-
 2 files changed, 7 insertions(+), 1 deletion(-)

diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py
index 170cc9990..6d81a7103 100644
--- a/loopy/transform/ilp.py
+++ b/loopy/transform/ilp.py
@@ -81,6 +81,12 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
         if isinstance(kernel.iname_to_tag.get(iname), (IlpBaseTag, VectorizeTag)):
             # check for user specified type
             if temp_var.force_scalar:
+                if iname in writer_insn.read_dependency_names():
+                    raise LoopyError(
+                        "Cannot write to (user-specified) scalar variable '%s' "
+                        "using vec/ILP iname '%s' in instruction '%s'." % (
+                            temp_var.name, iname, writer_insn.id)
+                        )
                 return set()
             elif temp_var.force_vector:
                 return set([iname])
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 8ce92a50a..fb6f4cd6b 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2932,7 +2932,7 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
 
     from loopy import LoopyError
     with pytest.raises(LoopyError):
-        make_kernel('test = mask[j]', preamble=preamble, extra_inames='k')
+        k = make_kernel('test = mask[j]', preamble=preamble, extra_inames='k')
 
 
 def test_check_for_variable_access_ordering():
-- 
GitLab


From a479b1f156025ba5bf26822d73893336e3a906c8 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 27 Mar 2018 14:40:24 -0400
Subject: [PATCH 067/144] avoid warning for user-specified vectorize'd function
 & test

---
 loopy/transform/ilp.py | 10 ++++++++--
 test/test_loopy.py     | 17 ++++++++++++++++-
 2 files changed, 24 insertions(+), 3 deletions(-)

diff --git a/loopy/transform/ilp.py b/loopy/transform/ilp.py
index 6d81a7103..e6b155753 100644
--- a/loopy/transform/ilp.py
+++ b/loopy/transform/ilp.py
@@ -213,8 +213,14 @@ def add_axes_to_temporaries_for_ilp_and_vec(kernel, iname=None):
         new_insn = insn.with_transformed_expressions(eiii)
         if not eiii.seen_ilp_inames <= insn.within_inames:
 
-            from loopy.diagnostic import warn_with_kernel
-            warn_with_kernel(
+            # the only O.K. case here is that the user specified that the instruction
+            # should be a vector, and all the missing iname tags are vectors.
+            if not getattr(insn, 'force_vector', False) and all(isinstance(
+                kernel.iname_to_tag.get(iname), VectorizeTag) for x in
+                    eiii.seen_ilp_inames - insn.within_inames):
+
+                from loopy.diagnostic import warn_with_kernel
+                warn_with_kernel(
                     kernel,
                     "implicit_ilp_iname",
                     "Instruction '%s': touched variable that (for ILP) "
diff --git a/test/test_loopy.py b/test/test_loopy.py
index fb6f4cd6b..19e8ac0ff 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2932,7 +2932,22 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
 
     from loopy import LoopyError
     with pytest.raises(LoopyError):
-        k = make_kernel('test = mask[j]', preamble=preamble, extra_inames='k')
+        make_kernel('test = mask[j]', preamble=preamble, extra_inames='k')
+
+    # 3b) initial vector assignment w/ later scalar access -- OK
+
+    preamble = """
+    for k
+        <:v> test = 1
+    end
+    """
+
+    from loopy import LoopyError
+    # treat warning as error to make sure the logic detecting user specified
+    # vectorization is good
+    import warnings
+    warnings.filterwarnings('error')
+    make_kernel('test = mask[i]', preamble=preamble, extra_inames='k')
 
 
 def test_check_for_variable_access_ordering():
-- 
GitLab


From ffc61af09afb645f255dd4195834e2638b7c327c Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 27 Mar 2018 15:13:39 -0400
Subject: [PATCH 068/144] fix test: filter only the ILP warning, reset after

---
 test/test_loopy.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index 19e8ac0ff..cb9713804 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2946,8 +2946,14 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
     # treat warning as error to make sure the logic detecting user specified
     # vectorization is good
     import warnings
-    warnings.filterwarnings('error')
-    make_kernel('test = mask[i]', preamble=preamble, extra_inames='k')
+    try:
+        warnings.filterwarnings(
+            'error', r"Instruction '[^\W]+': touched variable that \(for ILP\)")
+        make_kernel('test = mask[i]', preamble=preamble, extra_inames='k')
+    except Exception:
+        raise
+    finally:
+        warnings.resetwarnings()
 
 
 def test_check_for_variable_access_ordering():
-- 
GitLab


From 3bb0ac40e492dd2532974b7a3dedc663a15b8788 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 25 Apr 2018 16:21:39 -0400
Subject: [PATCH 069/144] reword: renumber test case comments

---
 test/test_loopy.py | 12 ++++++------
 1 file changed, 6 insertions(+), 6 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index cb9713804..95d7b1bd5 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2905,25 +2905,25 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
 
     ans = np.zeros((12, 3, 4))
     ans[6:, :, :] = 1
-    # first broken case -- incorrect promotion of temporaries to vector dtypes
+    # case 1) -- incorrect promotion of temporaries to vector dtypes
     make_kernel('<> test = mask[i]', ans)
 
     # next test the writer heuristic
 
-    # case 1) assignment from a vector iname
+    # case 2) assignment from a vector iname
     knl = make_kernel('<> test = mask[j]')
     assert knl.temporary_variables['test'].shape == (4,)
 
-    # case 2) recursive dependency
+    # case 3) recursive dependency
     knl = make_kernel("""
         <> test = mask[j]
         <> test2 = test
         """)
     assert knl.temporary_variables['test2'].shape == (4,)
 
-    # case 3) test that a conflict in user-specified vector types results in error
+    # case 4) test that a conflict in user-specified vector types results in error
 
-    # 3a) initial scalar assignment w/ later vector access
+    # 4a) initial scalar assignment w/ later vector access
     preamble = """
     for k
         <:s> test = 1
@@ -2934,7 +2934,7 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
     with pytest.raises(LoopyError):
         make_kernel('test = mask[j]', preamble=preamble, extra_inames='k')
 
-    # 3b) initial vector assignment w/ later scalar access -- OK
+    # 4b) initial vector assignment w/ later scalar access -- OK
 
     preamble = """
     for k
-- 
GitLab


From da20a7ed7896b2ddcc97e67f2cea371180ceb69a Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 25 Apr 2018 17:13:36 -0400
Subject: [PATCH 070/144] update for merge/rename

---
 loopy/transform/privatize.py | 192 +++++++++++++++--------------------
 1 file changed, 80 insertions(+), 112 deletions(-)

diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index 12b488e7f..d6396849d 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -104,6 +104,10 @@ def privatize_temporaries_with_inames(
 
     .. versionadded:: 2018.1
     """
+
+    from loopy.kernel.data import VectorizeTag, IlpBaseTag
+    from loopy.kernel.tools import find_recursive_dependencies
+
     if isinstance(privatizing_inames, str):
         privatizing_inames = frozenset(
                 s.strip()
@@ -118,113 +122,86 @@ def privatize_temporaries_with_inames(
 
     var_to_new_priv_axis_iname = {}
 
-    # def find_ilp_inames(writer_insn, iname, temp_var,
-    #                     raise_on_missing=False):
-    #     # test that -- a) the iname is an ILP or vector tag
-    #     if isinstance(kernel.iname_to_tag.get(iname), (IlpBaseTag, VectorizeTag)):
-    #         # check for user specified type
-    #         if temp_var.force_scalar:
-    #             if iname in writer_insn.read_dependency_names():
-    #                 raise LoopyError(
-    #                     "Cannot write to (user-specified) scalar variable '%s' "
-    #                     "using vec/ILP iname '%s' in instruction '%s'." % (
-    #                         temp_var.name, iname, writer_insn.id)
-    #                     )
-    #             return set()
-    #         elif temp_var.force_vector:
-    #             return set([iname])
-    #         # and b) instruction depends on the ILP/vector iname
-    #         return set([iname]) & writer_insn.dependency_names()
-    #     elif raise_on_missing:
-    #         raise LoopyError("'%s' is not an ILP iname" % iname)
-    #     return set()
+    def find_privitzing_inames(writer_insn, iname, temp_var):
+        # test that -- a) the iname is an ILP or vector tag
+        if isinstance(kernel.iname_to_tag.get(iname), (IlpBaseTag, VectorizeTag)):
+            # check for user specified type
+            if temp_var.force_scalar:
+                if iname in writer_insn.read_dependency_names():
+                    raise LoopyError(
+                        "Cannot write to (user-specified) scalar variable '%s' "
+                        "using vec/ILP iname '%s' in instruction '%s'." % (
+                            temp_var.name, iname, writer_insn.id)
+                        )
+                return set()
+            elif temp_var.force_vector:
+                return set([iname])
+            # and b) instruction depends on the ILP/vector iname
+            return set([iname]) & writer_insn.dependency_names()
+        return set()
 
     # {{{ find variables that need extra indices
 
     for tv in six.itervalues(kernel.temporary_variables):
+        # check variables to transform
         if only_var_names is not None and tv.name not in only_var_names:
             continue
 
-        for writer_insn_id in wmap.get(tv.name, []):
+        for writer_insn_id in set(wmap.get(tv.name, [])):
             writer_insn = kernel.id_to_insn[writer_insn_id]
+            inner_ids = set([writer_insn_id])
+            # the instructions we have to consider here are those that directly
+            # write to this variable, and those that are recursive dependencies of
+            # this instruction
+            rec_deps = find_recursive_dependencies(kernel, frozenset([
+                writer_insn_id]))
+            # however, we must make sure to limit to those inames that we are
+            # actually inside of
+            inner_ids |= set([
+                x for x in rec_deps if kernel.id_to_insn[x].within_inames <=
+                writer_insn.within_inames])
+
+            for insn_id in inner_ids:
+                insn = kernel.id_to_insn[insn_id]
+                test_inames = kernel.insn_inames(insn) & privatizing_inames
+
+                priv_axis_inames = set()
+                for ti in test_inames:
+                    priv_axis_inames |= find_privitzing_inames(insn, ti, tv)
+
+                priv_axis_inames = frozenset(priv_axis_inames)
+                referenced_priv_axis_inames = (priv_axis_inames
+                    & writer_insn.write_dependency_names())
 
-            priv_axis_inames = kernel.insn_inames(writer_insn) & privatizing_inames
+                new_priv_axis_inames = priv_axis_inames - referenced_priv_axis_inames
 
-            referenced_priv_axis_inames = (priv_axis_inames
-                    & writer_insn.write_dependency_names())
+                if not new_priv_axis_inames and tv.force_scalar and \
+                        tv.name in var_to_new_priv_axis_iname:
+                    # conflict
+                    raise LoopyError("instruction '%s' requires var '%s' to be a "
+                                     "scalar but previous instructions required "
+                                     "vector/ILP inames '%s'" % (
+                                            insn_id, tv.name, ", ".join(
+                                                var_to_new_priv_axis_iname[
+                                                    tv.name])))
+
+                if not new_priv_axis_inames:
+                    continue
 
-            new_priv_axis_inames = priv_axis_inames - referenced_priv_axis_inames
-
-            if not new_priv_axis_inames:
-                break
-
-            if tv.name in var_to_new_priv_axis_iname:
-                if new_priv_axis_inames != set(var_to_new_priv_axis_iname[tv.name]):
-                    raise LoopyError("instruction '%s' requires adding "
-                            "indices for privatizing var '%s' on iname(s) '%s', "
-                            "but previous instructions required inames '%s'"
-                            % (writer_insn_id, tv.name,
-                                ", ".join(new_priv_axis_inames),
-                                ", ".join(var_to_new_priv_axis_iname[tv.name])))
-
-            var_to_new_priv_axis_iname[tv.name] = set(new_priv_axis_inames)
-
-    # for tv in six.itervalues(kernel.temporary_variables):
-    #     writer_insns = set(wmap.get(tv.name, []))
-
-        # for writer_insn_id in writer_insns:
-        #     writer_insn = kernel.id_to_insn[writer_insn_id]
-        #     inner_ids = set([writer_insn_id])
-        #     # the instructions we have to consider here are those that directly
-        #     # write to this variable, and those that are recursive dependencies of
-        #     # this instruction
-        #     rec_deps = find_recursive_dependencies(kernel, frozenset([
-        #         writer_insn_id]))
-        #     # however, we must make sure to limit to those inames that we are
-        #     # actually inside of
-        #     inner_ids |= set([
-        #         x for x in rec_deps if kernel.id_to_insn[x].within_inames <=
-        #         writer_insn.within_inames])
-
-        #     for insn_id in inner_ids:
-        #         insn = kernel.id_to_insn[insn_id]
-        #         test_inames = (kernel.insn_inames(insn) if iname is None else
-        #             set([iname]))
-        #         ilp_inames = set()
-        #         for ti in test_inames:
-        #             ilp_inames |= find_ilp_inames(insn, ti, tv, iname is not None)
-
-        #         ilp_inames = frozenset(ilp_inames)
-        #         referenced_ilp_inames = (ilp_inames
-        #                 & writer_insn.write_dependency_names())
-
-        #         new_ilp_inames = ilp_inames - referenced_ilp_inames
-
-        #         if not new_ilp_inames and tv.force_scalar and \
-        #                 tv.name in var_to_new_ilp_inames:
-        #             # conflict
-        #             raise LoopyError("instruction '%s' requires var '%s' to be a "
-        #                              "scalar but previous instructions required "
-        #                              "vector/ILP inames '%s'" % (
-        #                                     insn_id, tv.name, ", ".join(
-        #                                         var_to_new_ilp_inames[tv.name])))
-
-        #         if not new_ilp_inames:
-        #             continue
-
-        #         if tv.name in var_to_new_ilp_inames:
-        #             if new_ilp_inames != set(var_to_new_ilp_inames[tv.name]):
-        #                 # conflict
-        #                 raise LoopyError("instruction '%s' requires adding "
-        #                         "indices for vector/ILP inames '%s' on var '%s', "
-        #                         "but previous instructions required inames '%s'"
-        #                         % (insn_id, ", ".join(new_ilp_inames),
-        #                             tv.name, ", ".join(
-        #                                 var_to_new_ilp_inames[tv.name])))
-
-        #             continue
-
-        #         var_to_new_ilp_inames[tv.name] = set(new_ilp_inames)
+                if tv.name in var_to_new_priv_axis_iname:
+                    if new_priv_axis_inames != set(
+                            var_to_new_priv_axis_iname[tv.name]):
+                        # conflict
+                        raise LoopyError("instruction '%s' requires adding "
+                                "indices for vector/ILP inames '%s' on var '%s', "
+                                "but previous instructions required inames '%s'"
+                                % (insn_id, ", ".join(new_priv_axis_inames),
+                                    tv.name, ", ".join(
+                                        var_to_new_priv_axis_iname[tv.name])))
+
+                    continue
+
+                var_to_new_priv_axis_iname[tv.name] = set(new_priv_axis_inames)
 
     # }}}
 
@@ -250,8 +227,6 @@ def privatize_temporaries_with_inames(
 
     # {{{ change temporary variables
 
-    from loopy.kernel.data import VectorizeTag
-
     new_temp_vars = kernel.temporary_variables.copy()
     for tv_name, inames in six.iteritems(var_to_new_priv_axis_iname):
         tv = new_temp_vars[tv_name]
@@ -284,21 +259,14 @@ def privatize_temporaries_with_inames(
     for insn in kernel.instructions:
         eiii = ExtraInameIndexInserter(var_to_extra_iname)
         new_insn = insn.with_transformed_expressions(eiii)
-        # if not eiii.seen_ilp_inames <= insn.within_inames:
-
-        #     # the only O.K. case here is that the user specified that the instruction
-        #     # should be a vector, and all the missing iname tags are vectors.
-        #     if not getattr(insn, 'force_vector', False) and all(isinstance(
-        #         kernel.iname_to_tag.get(iname), VectorizeTag) for x in
-        #             eiii.seen_ilp_inames - insn.within_inames):
-
-        #         from loopy.diagnostic import warn_with_kernel
-        #         warn_with_kernel(
-        #             kernel,
-        #             "implicit_ilp_iname",
-        #             "Instruction '%s': touched variable that (for ILP) "
         if not eiii.seen_priv_axis_inames <= insn.within_inames:
-            raise LoopyError(
+
+            # the only O.K. case here is that the user specified that the instruction
+            # should be a vector, and all the missing iname tags are vectors.
+            if not getattr(insn, 'force_vector', False) and all(isinstance(
+                kernel.iname_to_tag.get(iname), VectorizeTag) for x in
+                    eiii.seen_ilp_inames - insn.within_inames):
+                raise LoopyError(
                     "Kernel '%s': Instruction '%s': touched variable that "
                     "(for privatization, e.g. as performed for ILP) "
                     "required iname(s) '%s', but that the instruction was not "
-- 
GitLab


From a63856bdc7ab0b96239457d474caab90de83426b Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 25 Apr 2018 17:19:17 -0400
Subject: [PATCH 071/144] don't re-process instructions if possible

---
 loopy/transform/privatize.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index d6396849d..cca234c1c 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -147,7 +147,10 @@ def privatize_temporaries_with_inames(
         if only_var_names is not None and tv.name not in only_var_names:
             continue
 
+        seen = set()
         for writer_insn_id in set(wmap.get(tv.name, [])):
+            if writer_insn_id in seen:
+                continue
             writer_insn = kernel.id_to_insn[writer_insn_id]
             inner_ids = set([writer_insn_id])
             # the instructions we have to consider here are those that directly
@@ -162,6 +165,8 @@ def privatize_temporaries_with_inames(
                 writer_insn.within_inames])
 
             for insn_id in inner_ids:
+                seen.add(insn_id)
+
                 insn = kernel.id_to_insn[insn_id]
                 test_inames = kernel.insn_inames(insn) & privatizing_inames
 
-- 
GitLab


From d9686a9e315675308e468aeceaab0c8c0ce4e599 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 26 Apr 2018 12:12:09 -0400
Subject: [PATCH 072/144] add first pass at vector store implementation

---
 loopy/target/opencl.py | 88 ++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 85 insertions(+), 3 deletions(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 652fce659..97716eec0 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -34,6 +34,7 @@ from loopy.types import NumpyType
 from loopy.target.c import DTypeRegistryWrapper, c_math_mangler
 from loopy.kernel.data import temp_var_scope, CallMangleInfo
 from pymbolic import var
+from pymbolic.primitives import Call
 
 from functools import partial
 
@@ -387,6 +388,67 @@ class OpenCLTarget(CTarget):
 
 # }}}
 
+# {{{ simple opencl function wrappers
+
+
+class VectorFunc(Call):
+    def __init__(self, function, parameters):
+        # check that function and parameters are variables
+        from pymbolic.primitives import Variable, Expression
+        if not isinstance(function, Variable):
+            function = var(function)
+        parameters = list(parameters)
+        for i, param in enumerate(parameters):
+            if not isinstance(param, (Variable, Expression)):
+                parameters[i] = var(str(param))
+        super(VectorFunc, self).__init__(function, tuple(parameters))
+
+
+class VectorStore(VectorFunc):
+    def __init__(self, vector_width, store, offset, array):
+        """
+        Represents a vstoren
+
+        :arg vector_width: the SIMD vector-width
+        :arg store: the data to store
+        :arg offset: the offset in the array
+        :arg array: the array to store the data in
+        """
+
+        name = 'vstore%d' % vector_width
+        super(VectorStore, self).__init__(name, (store, offset, array))
+
+
+class VectorLoad(VectorFunc):
+    def __init__(self, vector_width, offset, array):
+        """
+        Represents a vloadn
+
+        :arg vector_width: the SIMD vector-width
+        :arg offset: the offset in the array
+        :arg array: the array to store the data in
+        """
+
+        name = 'vload%d' % vector_width
+        super(VectorLoad, self).__init__(name, (offset, array))
+
+
+class VectorSelect(VectorFunc):
+    def __init__(self, select_if_true, select_if_false, select_on):
+        """
+        Represents a vector-select
+
+        :arg select_if_true: the value to be chosen if select_on is true
+        :arg select_if_false: the value to be chosen if select_on is false
+        :arg select_on: the conditional selection value
+        """
+
+        name = 'select'
+        super(VectorSelect, self).__init__(name, (
+            select_if_true, select_if_false, select_on))
+
+# }}}
+
 
 # {{{ ast builder
 
@@ -465,6 +527,28 @@ class OpenCLCASTBuilder(CASTBuilder):
     def get_expression_to_c_expression_mapper(self, codegen_state):
         return ExpressionToOpenCLCExpressionMapper(codegen_state)
 
+    def emit_assignment(self, codegen_state, insn):
+        """
+        A wrapper around the base C-target emit_assignment, to handle explicit-SIMD
+        functions, such as selects, vstore's and vload's and shuffles
+        """
+
+        assignment = super(OpenCLCASTBuilder, self).emit_assignment(
+            codegen_state, insn)
+
+        # fix-up
+        if isinstance(assignment.lvalue.expr, VectorLoad):
+            from cgen import Statement
+            # get vector width
+            func = str(assignment.lvalue.expr.function)
+            vw = int(func[func.index('vload') + len('vload'):])
+            # convert to vector store
+            store = VectorStore(vw, assignment.rvalue.expr,
+                                *assignment.lvalue.expr.parameters)
+            # and to statement
+            assignment = Statement(str(store))
+        return assignment
+
     def add_vector_access(self, access_expr, index):
         # The 'int' avoids an 'L' suffix for long ints.
         def __map(ind, use_prefix=True):
@@ -503,9 +587,7 @@ class OpenCLCASTBuilder(CASTBuilder):
             pass
         # and cast / substitute in the calculated vector iname offset
         cast_expr = '&((%s)%s)[%s]' % (ctype, array.name, index[0])
-        from pymbolic.primitives import Call, Variable
-        return Call(Variable('vload%d' % len(index)), (
-            Variable(str(offset)), Variable(cast_expr)))
+        return VectorLoad(len(index), str(offset), str(cast_expr))
 
     def emit_barrier(self, synchronization_kind, mem_kind, comment):
         """
-- 
GitLab


From 06f3a665c51df654b043f93212e9fd01bac5bec1 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 26 Apr 2018 12:18:29 -0400
Subject: [PATCH 073/144] Add store tests

---
 test/test_loopy.py | 22 ++++++++++++++++++----
 1 file changed, 18 insertions(+), 4 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index 572605f0b..4d1219005 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2800,10 +2800,11 @@ def test_add_prefetch_works_in_lhs_index():
 def test_explicit_simd_shuffles(ctx_factory):
     ctx = ctx_factory()
 
-    def create_and_test(insn, answer=None, atomic=False, additional_check=None):
+    def create_and_test(insn, answer=None, atomic=False, additional_check=None,
+                        store=False):
         knl = lp.make_kernel(['{[i]: 0 <= i < 12}', '{[j]: 0 <= j < 1}'],
                              insn,
-                             [lp.GlobalArg('a', shape=(1, 12,), dtype=np.int32,
+                             [lp.GlobalArg('a', shape=(1, 14,), dtype=np.int32,
                                            for_atomic=atomic),
                               lp.GlobalArg('b', shape=(1, 14,), dtype=np.int32,
                                            for_atomic=atomic)])
@@ -2816,9 +2817,13 @@ def test_explicit_simd_shuffles(ctx_factory):
         print(lp.generate_code_v2(knl).device_code())
         queue = cl.CommandQueue(ctx)
         if answer is None:
-            answer = np.arange(2, 14, dtype=np.int32)
+            answer = np.zeros(16, dtype=np.int32)
+            if store:
+                answer[2:-2] = np.arange(0, 12, dtype=np.int32)
+            else:
+                answer[:-4] = np.arange(2, 14, dtype=np.int32)
         assert np.array_equal(
-            knl(queue, a=np.zeros((1, 3, 4), dtype=np.int32),
+            knl(queue, a=np.zeros((1, 4, 4), dtype=np.int32),
                 b=np.arange(16, dtype=np.int32).reshape((1, 4, 4)))[1][0].flatten(
                     'C'),
             answer)
@@ -2833,6 +2838,15 @@ def test_explicit_simd_shuffles(ctx_factory):
     create_and_test("a[j, i] = b[j, i + 2]")
     create_and_test("a[j, i] = b[j, i + 2] + a[j, i]")
     create_and_test("a[j, i] = a[j, i] + b[j, i + 2]")
+    # test vector stores
+    create_and_test("<>c = 2\n" +
+                    "a[j, i + c] = b[j, i]",
+                    additional_check=lambda knl: 'vstore' in lp.generate_code_v2(
+                        knl).device_code(),
+                    store=True)
+    create_and_test("a[j, i + 2] = b[j, i]", store=True)
+    create_and_test("a[j, i + 2] = b[j, i] + a[j, i + 2]", store=True)
+    create_and_test("a[j, i + 2] = a[j, i + 2] + b[j, i]", store=True)
     # test small vector shuffle
     create_and_test("a[j, i] = b[j, (i + 2) % 4]",
                     np.arange(12, dtype=np.int32)[(np.arange(12) + 2) % 4])
-- 
GitLab


From c1adf4705dcf2945b352203a87f83c61936ee387 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 26 Apr 2018 12:30:30 -0400
Subject: [PATCH 074/144] fix answers for shuffle test

---
 test/test_loopy.py | 18 +++++++++++-------
 1 file changed, 11 insertions(+), 7 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index 4d1219005..9d096da34 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2822,11 +2822,12 @@ def test_explicit_simd_shuffles(ctx_factory):
                 answer[2:-2] = np.arange(0, 12, dtype=np.int32)
             else:
                 answer[:-4] = np.arange(2, 14, dtype=np.int32)
-        assert np.array_equal(
-            knl(queue, a=np.zeros((1, 4, 4), dtype=np.int32),
-                b=np.arange(16, dtype=np.int32).reshape((1, 4, 4)))[1][0].flatten(
-                    'C'),
-            answer)
+
+        a = np.zeros((1, 4, 4), dtype=np.int32)
+        b = np.arange(16, dtype=np.int32).reshape((1, 4, 4))
+        result = knl(queue, a=a, b=b)[1][0]
+
+        assert np.array_equal(result.flatten('C'), answer)
         if additional_check is not None:
             assert additional_check(knl)
 
@@ -2848,8 +2849,11 @@ def test_explicit_simd_shuffles(ctx_factory):
     create_and_test("a[j, i + 2] = b[j, i] + a[j, i + 2]", store=True)
     create_and_test("a[j, i + 2] = a[j, i + 2] + b[j, i]", store=True)
     # test small vector shuffle
-    create_and_test("a[j, i] = b[j, (i + 2) % 4]",
-                    np.arange(12, dtype=np.int32)[(np.arange(12) + 2) % 4])
+    shuffled = np.arange(16, dtype=np.int32)[(np.arange(16) + 2) % 4 +
+                                              4 * (np.arange(16) // 4)]
+    shuffled[12:] = 0
+    create_and_test("a[j, i] = b[j, (i + 2) % 4 + 4 * (i // 4)]", shuffled)
+    create_and_test("a[j, (i + 2) % 4 + 4 * (i // 4)] = b[j, i]", shuffled)
     # test atomics
     from loopy import LoopyError
     with pytest.raises(LoopyError):
-- 
GitLab


From 9e763dadad7ec7c10bbca61c52bae1a6b8ba7d06 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 26 Apr 2018 13:17:11 -0400
Subject: [PATCH 075/144] add guard

---
 loopy/target/opencl.py | 23 +++++++++++++----------
 1 file changed, 13 insertions(+), 10 deletions(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 97716eec0..92efffabf 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -537,16 +537,19 @@ class OpenCLCASTBuilder(CASTBuilder):
             codegen_state, insn)
 
         # fix-up
-        if isinstance(assignment.lvalue.expr, VectorLoad):
-            from cgen import Statement
-            # get vector width
-            func = str(assignment.lvalue.expr.function)
-            vw = int(func[func.index('vload') + len('vload'):])
-            # convert to vector store
-            store = VectorStore(vw, assignment.rvalue.expr,
-                                *assignment.lvalue.expr.parameters)
-            # and to statement
-            assignment = Statement(str(store))
+        try:
+            if isinstance(assignment.lvalue.expr, VectorLoad):
+                from cgen import Statement
+                # get vector width
+                func = str(assignment.lvalue.expr.function)
+                vw = int(func[func.index('vload') + len('vload'):])
+                # convert to vector store
+                store = VectorStore(vw, assignment.rvalue.expr,
+                                    *assignment.lvalue.expr.parameters)
+                # and to statement
+                assignment = Statement(str(store))
+        except AttributeError:
+            pass
         return assignment
 
     def add_vector_access(self, access_expr, index):
-- 
GitLab


From 9b0196ff9a37398527291229db263a95f4801e3e Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 26 Apr 2018 15:48:22 -0400
Subject: [PATCH 076/144] add first pass at a vector select & test

---
 loopy/codegen/control.py | 31 +++++++++++++++++++++++++-
 loopy/codegen/result.py  |  8 +++++--
 loopy/target/__init__.py |  3 +++
 loopy/target/opencl.py   | 18 ++++++++++++---
 test/test_loopy.py       | 48 ++++++++++++++++++++++++++++++++++++++++
 5 files changed, 102 insertions(+), 6 deletions(-)

diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index e3e209726..c2741147f 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -507,10 +507,39 @@ def build_loop_nest(codegen_state, schedule_index):
 
                     prev_result = prev_gen_code(inner_codegen_state)
 
+                    # determine if any conditions are vector
+                    try:
+                        vec_if = False
+                        vec_iname = inner_codegen_state.vectorization_info.iname
+                    except AttributeError:
+                        # no vectorization info
+                        pass
+                    else:
+                        deps = set()
+                        for x in condition_exprs:
+                            deps |= get_dependencies(x)
+                        deps = frozenset(deps)
+                        if deps & set([vec_iname]):
+                            # we'd have to insert our own mirror temporary of the
+                            # vector iname here
+                            vec_if = True
+                            raise NotImplementedError(
+                                "Can't use vector iname directly in conditional")
+                        else:
+                            from loopy.kernel.array import VectorArrayDimTag
+                            # check if any vector arrays are in condition
+                            knl = inner_codegen_state.kernel
+                            vec_arys = set([x.name for x in knl.args + list(
+                                knl.temporary_variables.values()) if any(
+                                    isinstance(dt, VectorArrayDimTag)
+                                    for dt in x.dim_tags)])
+                            vec_if |= vec_arys & deps
+
                     return [wrap_in_if(
                         inner_codegen_state,
                         condition_exprs,
-                        merge_codegen_results(codegen_state, prev_result))]
+                        merge_codegen_results(codegen_state, prev_result),
+                        vector=vec_if)]
 
                 cannot_vectorize = False
                 if new_codegen_state.vectorization_info is not None:
diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index 4318ad71c..de90420b5 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -254,14 +254,18 @@ def merge_codegen_results(codegen_state, elements, collapse=True):
                 **kwargs))
 
 
-def wrap_in_if(codegen_state, condition_exprs, inner):
+def wrap_in_if(codegen_state, condition_exprs, inner, vector=False):
     if condition_exprs:
         from pymbolic.primitives import LogicalAnd
         from pymbolic.mapper.stringifier import PREC_NONE
         cur_ast = inner.current_ast(codegen_state)
+        if vector:
+            method = codegen_state.ast_builder.emit_if
+        else:
+            method = codegen_state.ast_builder.emit_vector_if
         return inner.with_new_ast(
                 codegen_state,
-                codegen_state.ast_builder.emit_if(
+                method(
                     codegen_state.expression_to_code_mapper(
                         LogicalAnd(tuple(condition_exprs)), PREC_NONE),
                     cur_ast))
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index a08b406f5..c3b7a739c 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -220,6 +220,9 @@ class ASTBuilderBase(object):
     def emit_if(self, condition_str, ast):
         raise NotImplementedError()
 
+    def emit_vector_if(self, condition_str, ast):
+        raise NotImplementedError()
+
     def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
         raise NotImplementedError()
 
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 92efffabf..fed3c49b7 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -434,18 +434,18 @@ class VectorLoad(VectorFunc):
 
 
 class VectorSelect(VectorFunc):
-    def __init__(self, select_if_true, select_if_false, select_on):
+    def __init__(self, select_if_true, select_if_false, condition):
         """
         Represents a vector-select
 
         :arg select_if_true: the value to be chosen if select_on is true
         :arg select_if_false: the value to be chosen if select_on is false
-        :arg select_on: the conditional selection value
+        :arg condition: the conditional selection value
         """
 
         name = 'select'
         super(VectorSelect, self).__init__(name, (
-            select_if_true, select_if_false, select_on))
+            select_if_true, select_if_false, condition))
 
 # }}}
 
@@ -552,6 +552,18 @@ class OpenCLCASTBuilder(CASTBuilder):
             pass
         return assignment
 
+    def emit_vector_if(self, condition_str, ast):
+        """
+        Emit's a vector select function
+        """
+
+        try:
+            return VectorSelect(ast.rvalue.expr, ast.lvalue.expr, condition_str)
+        except AttributeError:
+            raise LoopyError("Vector conditionals can only be generated for simple"
+                             "assign statements, condition (%s) on instruction (%s) "
+                             "invalid" % (condition_str, str(ast)))
+
     def add_vector_access(self, access_expr, index):
         # The 'int' avoids an 'L' suffix for long ints.
         def __map(ind, use_prefix=True):
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 9d096da34..92dc5c78a 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2974,6 +2974,54 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
         warnings.resetwarnings()
 
 
+def test_explicit_simd_selects(ctx_factory):
+    ctx = ctx_factory()
+
+    def create_and_test(insn, condition, answer, exception=None, a=None, b=None):
+        a = np.zeros(12, dtype=np.int32) if a is None else a
+        data = [lp.GlobalArg('a', shape=a.shape, dtype=a.dtype)]
+        kwargs = dict(a=a)
+        if b is not None:
+            data += [lp.GlobalArg('b', shape=b.shape, dtype=b.dtype)]
+            kwargs['b'] = b
+        names = [d.name for d in data]
+
+        knl = lp.make_kernel(['{[i]: 0 <= i < 12}'],
+            """
+            if %(condition)s
+                %(insn)s
+            end
+            """ % dict(condition=condition,
+                       insn=insn),
+            data
+            )
+
+        knl = lp.split_iname(knl, 'i', 4, inner_tag='vec')
+        knl = lp.split_array_axis(knl, names, 0, 4)
+        knl = lp.tag_array_axes(knl, names, 'N0,vec')
+
+        queue = cl.CommandQueue(ctx)
+        if exception is not None:
+            with pytest.raises(exception):
+                print(lp.generate_code_v2(knl).device_code())
+
+        if exception is not None:
+            with pytest.raises(exception):
+                knl(queue, **kwargs)
+        else:
+            result = knl(queue, **kwargs)[1][0]
+            assert np.array_equal(result.flatten('C'), answer)
+
+    ans = np.zeros(12, dtype=np.int32)
+    ans[7:] = 1
+    # 1) test a conditional on a vector iname -- currently unimplemented
+    create_and_test('a[i] = 1', 'i > 6', ans, exception=NotImplementedError)
+    # 2) condition on a vector variable -- unimplemented as the i_inner in the
+    # condition currently isn't resolved
+    create_and_test('a[i] = 1', 'b[i] > 6', ans, b=np.arange(12, dtype=np.int32),
+                    exception=NotImplementedError)
+
+
 def test_check_for_variable_access_ordering():
     knl = lp.make_kernel(
             "{[i]: 0<=i<n}",
-- 
GitLab


From 953598c8e23bcd4e1d38fc3ba8dc5e8359304537 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 26 Apr 2018 15:57:39 -0400
Subject: [PATCH 077/144] fixes

---
 loopy/target/opencl.py | 4 ++--
 test/test_loopy.py     | 9 +++++++--
 2 files changed, 9 insertions(+), 4 deletions(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index fed3c49b7..ad1ec787c 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -560,9 +560,9 @@ class OpenCLCASTBuilder(CASTBuilder):
         try:
             return VectorSelect(ast.rvalue.expr, ast.lvalue.expr, condition_str)
         except AttributeError:
-            raise LoopyError("Vector conditionals can only be generated for simple"
+            raise LoopyError("Vector conditionals can only be generated for simple "
                              "assign statements, condition (%s) on instruction (%s) "
-                             "invalid" % (condition_str, str(ast)))
+                             "invalid" % (str(condition_str), str(ast)))
 
     def add_vector_access(self, access_expr, index):
         # The 'int' avoids an 'L' suffix for long ints.
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 92dc5c78a..34ca505b1 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2977,7 +2977,8 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
 def test_explicit_simd_selects(ctx_factory):
     ctx = ctx_factory()
 
-    def create_and_test(insn, condition, answer, exception=None, a=None, b=None):
+    def create_and_test(insn, condition, answer, exception=None, a=None, b=None,
+                        extra_insns=''):
         a = np.zeros(12, dtype=np.int32) if a is None else a
         data = [lp.GlobalArg('a', shape=a.shape, dtype=a.dtype)]
         kwargs = dict(a=a)
@@ -2988,11 +2989,13 @@ def test_explicit_simd_selects(ctx_factory):
 
         knl = lp.make_kernel(['{[i]: 0 <= i < 12}'],
             """
+            %(extra_insns)s
             if %(condition)s
                 %(insn)s
             end
             """ % dict(condition=condition,
-                       insn=insn),
+                       insn=insn,
+                       extra_insns=extra_insns),
             data
             )
 
@@ -3020,6 +3023,8 @@ def test_explicit_simd_selects(ctx_factory):
     # condition currently isn't resolved
     create_and_test('a[i] = 1', 'b[i] > 6', ans, b=np.arange(12, dtype=np.int32),
                     exception=NotImplementedError)
+    # and 3) just so we have something to test, a scalar condition
+    create_and_test('a[i] = 1', 'c > 6', np.ones_like(ans), extra_insns='<>c = 7')
 
 
 def test_check_for_variable_access_ordering():
-- 
GitLab


From aefbad56517c4336ed8e4664cd6fa41eff1be968 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 26 Apr 2018 16:35:43 -0400
Subject: [PATCH 078/144] fixes -- mainly to package to ask andreas

---
 loopy/codegen/result.py |  4 ++--
 test/test_loopy.py      | 21 ++++++++++++++-------
 2 files changed, 16 insertions(+), 9 deletions(-)

diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index de90420b5..52caf3283 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -260,9 +260,9 @@ def wrap_in_if(codegen_state, condition_exprs, inner, vector=False):
         from pymbolic.mapper.stringifier import PREC_NONE
         cur_ast = inner.current_ast(codegen_state)
         if vector:
-            method = codegen_state.ast_builder.emit_if
-        else:
             method = codegen_state.ast_builder.emit_vector_if
+        else:
+            method = codegen_state.ast_builder.emit_if
         return inner.with_new_ast(
                 codegen_state,
                 method(
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 34ca505b1..ee36f175f 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2974,28 +2974,32 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
         warnings.resetwarnings()
 
 
+@pytest.mark.xfail('Weird conditional dropping for the case that actually should'
+                   'work')
 def test_explicit_simd_selects(ctx_factory):
     ctx = ctx_factory()
 
     def create_and_test(insn, condition, answer, exception=None, a=None, b=None,
-                        extra_insns=''):
-        a = np.zeros(12, dtype=np.int32) if a is None else a
-        data = [lp.GlobalArg('a', shape=a.shape, dtype=a.dtype)]
+                        c=None):
+        a = np.zeros((3, 4), dtype=np.int32) if a is None else a
+        data = [lp.GlobalArg('a', shape=(12,), dtype=a.dtype)]
         kwargs = dict(a=a)
         if b is not None:
             data += [lp.GlobalArg('b', shape=b.shape, dtype=b.dtype)]
             kwargs['b'] = b
         names = [d.name for d in data]
 
+        if c is not None:
+            data += [lp.ValueArg('c', dtype=c.dtype)]
+            kwargs['c'] = c
+
         knl = lp.make_kernel(['{[i]: 0 <= i < 12}'],
             """
-            %(extra_insns)s
             if %(condition)s
                 %(insn)s
             end
             """ % dict(condition=condition,
-                       insn=insn,
-                       extra_insns=extra_insns),
+                       insn=insn),
             data
             )
 
@@ -3007,6 +3011,8 @@ def test_explicit_simd_selects(ctx_factory):
         if exception is not None:
             with pytest.raises(exception):
                 print(lp.generate_code_v2(knl).device_code())
+        else:
+            print(lp.generate_code_v2(knl).device_code())
 
         if exception is not None:
             with pytest.raises(exception):
@@ -3024,7 +3030,8 @@ def test_explicit_simd_selects(ctx_factory):
     create_and_test('a[i] = 1', 'b[i] > 6', ans, b=np.arange(12, dtype=np.int32),
                     exception=NotImplementedError)
     # and 3) just so we have something to test, a scalar condition
-    create_and_test('a[i] = 1', 'c > 6', np.ones_like(ans), extra_insns='<>c = 7')
+    create_and_test('a[i] = 1', 'c > 6', ans, c=np.array(
+        7, dtype=np.int32))
 
 
 def test_check_for_variable_access_ordering():
-- 
GitLab


From 7aa2f888a14774ee0f752e243a748b584891ef6c Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 26 Apr 2018 16:52:20 -0400
Subject: [PATCH 079/144] kill the ugly tri-state flag

---
 loopy/codegen/__init__.py | 15 +++++++++++++--
 loopy/target/opencl.py    |  7 ++++---
 2 files changed, 17 insertions(+), 5 deletions(-)

diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 6f7442dc5..fe2eb9347 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -178,6 +178,11 @@ class CodeGenerationState(object):
 
         None or an instance of :class:`VectorizationInfo`
 
+    .. attribute:: insn_was_not_vectorizable
+
+        If true, we have a call to :func:`try_vectorized` has failed, and we are
+        in the :func:`unvectorize` fallback
+
     .. attribute:: is_generating_device_code
 
     .. attribute:: gen_program_name
@@ -219,7 +224,8 @@ class CodeGenerationState(object):
             var_subst_map=None, vectorization_info=None,
             is_generating_device_code=None,
             gen_program_name=None,
-            schedule_index_end=None):
+            schedule_index_end=None,
+            insn_was_not_vectorizable=False):
 
         if kernel is None:
             kernel = self.kernel
@@ -227,6 +233,10 @@ class CodeGenerationState(object):
         if implemented_data_info is None:
             implemented_data_info = self.implemented_data_info
 
+        if vectorization_info is False:
+            insn_was_not_vectorizable = True
+            vectorization_info = None
+
         if vectorization_info is None:
             vectorization_info = self.vectorization_info
 
@@ -254,7 +264,8 @@ class CodeGenerationState(object):
                 var_name_generator=self.var_name_generator,
                 is_generating_device_code=is_generating_device_code,
                 gen_program_name=gen_program_name,
-                schedule_index_end=schedule_index_end)
+                schedule_index_end=schedule_index_end,
+                insn_was_not_vectorizable=insn_was_not_vectorizable)
 
     def copy_and_assign(self, name, value):
         """Make a copy of self with variable *name* fixed to *value*."""
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index ad1ec787c..c94146591 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -684,10 +684,11 @@ class OpenCLCASTBuilder(CASTBuilder):
         # FIXME: Could detect operations, generate atomic_{add,...} when
         # appropriate.
 
-        if codegen_state.vectorization_info is not None:
+        if codegen_state.vectorization_info is not None or \
+                codegen_state.insn_was_not_vectorizable:
             # note - this check whether we've previously tried to vectorize and
-            # failed (in which case vectorization_info will be False) or whether
-            # vectorization_info is a valid :class:`VectorizationInfo`
+            # failed (in which case insn_was_not_vectorizable will be True) or
+            # whether vectorization_info is a valid :class:`VectorizationInfo`
             #
             # Both cases should fail (as we can't take the index of an unrolled
             # atomic)
-- 
GitLab


From 37c6e18f9ee63e461b347b115d9cce649c03f4ac Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 26 Apr 2018 16:54:38 -0400
Subject: [PATCH 080/144] missing in init

---
 loopy/codegen/__init__.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index fe2eb9347..b275e6630 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -201,7 +201,8 @@ class CodeGenerationState(object):
             vectorization_info=None, var_name_generator=None,
             is_generating_device_code=None,
             gen_program_name=None,
-            schedule_index_end=None):
+            schedule_index_end=None,
+            insn_was_not_vectorizable=False):
         self.kernel = kernel
         self.implemented_data_info = implemented_data_info
         self.implemented_domain = implemented_domain
@@ -216,6 +217,7 @@ class CodeGenerationState(object):
         self.is_generating_device_code = is_generating_device_code
         self.gen_program_name = gen_program_name
         self.schedule_index_end = schedule_index_end
+        self.insn_was_not_vectorizable = insn_was_not_vectorizable
 
     # {{{ copy helpers
 
-- 
GitLab


From ea049a57edc7c9d85d8b3b5c2ed52f4db1899f27 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 26 Apr 2018 17:11:21 -0400
Subject: [PATCH 081/144] fix boolean update

---
 loopy/codegen/control.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index c2741147f..d18f813f4 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -533,7 +533,7 @@ def build_loop_nest(codegen_state, schedule_index):
                                 knl.temporary_variables.values()) if any(
                                     isinstance(dt, VectorArrayDimTag)
                                     for dt in x.dim_tags)])
-                            vec_if |= vec_arys & deps
+                            vec_if |= len(vec_arys & deps)
 
                     return [wrap_in_if(
                         inner_codegen_state,
-- 
GitLab


From 9c93d550d2975d6918486c3d4caf012b20db1132 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 26 Apr 2018 17:11:31 -0400
Subject: [PATCH 082/144] Test fixes

---
 loopy/target/opencl.py | 2 +-
 test/test_loopy.py     | 7 ++++---
 2 files changed, 5 insertions(+), 4 deletions(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index c94146591..f7e4b2548 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -692,7 +692,7 @@ class OpenCLCASTBuilder(CASTBuilder):
             #
             # Both cases should fail (as we can't take the index of an unrolled
             # atomic)
-            raise LoopyError('Atomic operators not yet implemented for '
+            raise LoopyError('Atomic operators not implemented for '
                              'explicit-SIMD vectorization')
 
         if isinstance(lhs_dtype, NumpyType) and lhs_dtype.numpy_dtype in [
diff --git a/test/test_loopy.py b/test/test_loopy.py
index ee36f175f..c7c6cc9dc 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2856,7 +2856,8 @@ def test_explicit_simd_shuffles(ctx_factory):
     create_and_test("a[j, (i + 2) % 4 + 4 * (i // 4)] = b[j, i]", shuffled)
     # test atomics
     from loopy import LoopyError
-    with pytest.raises(LoopyError):
+    from loopy.codegen import Unvectorizable
+    with pytest.raises((LoopyError, Unvectorizable)):
         temp = np.arange(12, dtype=np.int32)
         answer = np.zeros(4, dtype=np.int32)
         for i in range(4):
@@ -2974,8 +2975,8 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
         warnings.resetwarnings()
 
 
-@pytest.mark.xfail('Weird conditional dropping for the case that actually should'
-                   'work')
+@pytest.mark.xfail("Weird conditional dropping for the case that actually "
+                   "should work")
 def test_explicit_simd_selects(ctx_factory):
     ctx = ctx_factory()
 
-- 
GitLab


From d52fa34ac953c80dfca5d417369cda73f974817f Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Mon, 30 Apr 2018 15:49:53 -0400
Subject: [PATCH 083/144] can't use this conditional, as it gets elevated into
 the python wrapper

---
 test/test_loopy.py | 12 +++---------
 1 file changed, 3 insertions(+), 9 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index c7c6cc9dc..aef6640be 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2975,8 +2975,6 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
         warnings.resetwarnings()
 
 
-@pytest.mark.xfail("Weird conditional dropping for the case that actually "
-                   "should work")
 def test_explicit_simd_selects(ctx_factory):
     ctx = ctx_factory()
 
@@ -3009,11 +3007,10 @@ def test_explicit_simd_selects(ctx_factory):
         knl = lp.tag_array_axes(knl, names, 'N0,vec')
 
         queue = cl.CommandQueue(ctx)
-        if exception is not None:
-            with pytest.raises(exception):
-                print(lp.generate_code_v2(knl).device_code())
-        else:
+        try:
             print(lp.generate_code_v2(knl).device_code())
+        except exception:
+            pass
 
         if exception is not None:
             with pytest.raises(exception):
@@ -3030,9 +3027,6 @@ def test_explicit_simd_selects(ctx_factory):
     # condition currently isn't resolved
     create_and_test('a[i] = 1', 'b[i] > 6', ans, b=np.arange(12, dtype=np.int32),
                     exception=NotImplementedError)
-    # and 3) just so we have something to test, a scalar condition
-    create_and_test('a[i] = 1', 'c > 6', ans, c=np.array(
-        7, dtype=np.int32))
 
 
 def test_check_for_variable_access_ordering():
-- 
GitLab


From 5ffde833c40c9a20e573e5cbbdc1c4dab00e806a Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Mon, 30 Apr 2018 16:30:24 -0400
Subject: [PATCH 084/144] switch order

---
 loopy/codegen/__init__.py | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index b275e6630..5911c5ca5 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -235,13 +235,13 @@ class CodeGenerationState(object):
         if implemented_data_info is None:
             implemented_data_info = self.implemented_data_info
 
+        if vectorization_info is None:
+            vectorization_info = self.vectorization_info
+
         if vectorization_info is False:
             insn_was_not_vectorizable = True
             vectorization_info = None
 
-        if vectorization_info is None:
-            vectorization_info = self.vectorization_info
-
         if is_generating_device_code is None:
             is_generating_device_code = self.is_generating_device_code
 
-- 
GitLab


From 86f9a552efacfd40fbee76d10372aed51def0095 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Mon, 30 Apr 2018 17:17:53 -0400
Subject: [PATCH 085/144] fix insn_was_not_vectorizable copy

---
 loopy/codegen/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 5911c5ca5..8a57c3b2d 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -226,8 +226,7 @@ class CodeGenerationState(object):
             var_subst_map=None, vectorization_info=None,
             is_generating_device_code=None,
             gen_program_name=None,
-            schedule_index_end=None,
-            insn_was_not_vectorizable=False):
+            schedule_index_end=None):
 
         if kernel is None:
             kernel = self.kernel
@@ -238,6 +237,7 @@ class CodeGenerationState(object):
         if vectorization_info is None:
             vectorization_info = self.vectorization_info
 
+        insn_was_not_vectorizable = self.insn_was_not_vectorizable
         if vectorization_info is False:
             insn_was_not_vectorizable = True
             vectorization_info = None
-- 
GitLab


From 1e3f64d234426b0b7dce215ea2bb45c3db82b50d Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Mon, 30 Apr 2018 17:50:01 -0400
Subject: [PATCH 086/144] select is functional

---
 loopy/codegen/control.py | 37 ++++---------------------------
 loopy/codegen/result.py  | 48 +++++++++++++++++++++++++++++++++-------
 loopy/target/opencl.py   |  6 +++--
 test/test_loopy.py       |  2 +-
 4 files changed, 49 insertions(+), 44 deletions(-)

diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index d18f813f4..17a7a4a48 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -507,39 +507,10 @@ def build_loop_nest(codegen_state, schedule_index):
 
                     prev_result = prev_gen_code(inner_codegen_state)
 
-                    # determine if any conditions are vector
-                    try:
-                        vec_if = False
-                        vec_iname = inner_codegen_state.vectorization_info.iname
-                    except AttributeError:
-                        # no vectorization info
-                        pass
-                    else:
-                        deps = set()
-                        for x in condition_exprs:
-                            deps |= get_dependencies(x)
-                        deps = frozenset(deps)
-                        if deps & set([vec_iname]):
-                            # we'd have to insert our own mirror temporary of the
-                            # vector iname here
-                            vec_if = True
-                            raise NotImplementedError(
-                                "Can't use vector iname directly in conditional")
-                        else:
-                            from loopy.kernel.array import VectorArrayDimTag
-                            # check if any vector arrays are in condition
-                            knl = inner_codegen_state.kernel
-                            vec_arys = set([x.name for x in knl.args + list(
-                                knl.temporary_variables.values()) if any(
-                                    isinstance(dt, VectorArrayDimTag)
-                                    for dt in x.dim_tags)])
-                            vec_if |= len(vec_arys & deps)
-
-                    return [wrap_in_if(
-                        inner_codegen_state,
-                        condition_exprs,
-                        merge_codegen_results(codegen_state, prev_result),
-                        vector=vec_if)]
+                    inner = merge_codegen_results(codegen_state, prev_result)
+                    return [new_codegen_state.try_vectorized(
+                        inner.current_ast(inner_codegen_state),
+                        lambda ics: wrap_in_if(ics, condition_exprs, inner))]
 
                 cannot_vectorize = False
                 if new_codegen_state.vectorization_info is not None:
diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index 52caf3283..2783c191b 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -254,21 +254,53 @@ def merge_codegen_results(codegen_state, elements, collapse=True):
                 **kwargs))
 
 
-def wrap_in_if(codegen_state, condition_exprs, inner, vector=False):
+def wrap_in_if(codegen_state, condition_exprs, inner):
     if condition_exprs:
         from pymbolic.primitives import LogicalAnd
         from pymbolic.mapper.stringifier import PREC_NONE
         cur_ast = inner.current_ast(codegen_state)
-        if vector:
+        method = codegen_state.ast_builder.emit_if
+
+        def condition_mapper():
+            return codegen_state.expression_to_code_mapper(
+                    LogicalAnd(tuple(condition_exprs)), PREC_NONE)
+        mapper = condition_mapper
+
+        if codegen_state.vectorization_info is not None:
+            from loopy.symbolic import get_dependencies
             method = codegen_state.ast_builder.emit_vector_if
-        else:
-            method = codegen_state.ast_builder.emit_if
+            vec_iname = codegen_state.vectorization_info.iname
+
+            def check_vec_dep(condition):
+                # check conditions for explicit vector iname dependecies
+                return len(get_dependencies(condition) & set([vec_iname]))
+
+            if any(check_vec_dep(cond) for cond in condition_exprs):
+                # condition directly involves a vector array or iname
+
+                def condition_mapper_wrapper():
+                    condition = condition_mapper()
+                    from loopy.diagnostic import LoopyError
+                    deps = set()
+                    try:
+                        for condition in condition.expr.children:
+                            deps |= get_dependencies(condition)
+
+                        if deps & set([vec_iname]):
+                            # we'd have to insert our own mirror temporary of the
+                            # vector iname here
+                            raise LoopyError("Can't directly use vector iname in "
+                                             "conditional")
+                    except (AttributeError, TypeError):
+                        pass
+
+                    return condition
+
+                mapper = condition_mapper_wrapper
+
         return inner.with_new_ast(
                 codegen_state,
-                method(
-                    codegen_state.expression_to_code_mapper(
-                        LogicalAnd(tuple(condition_exprs)), PREC_NONE),
-                    cur_ast))
+                method(mapper(), cur_ast))
 
     return inner
 
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index f7e4b2548..653a8e18d 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -445,7 +445,7 @@ class VectorSelect(VectorFunc):
 
         name = 'select'
         super(VectorSelect, self).__init__(name, (
-            select_if_true, select_if_false, condition))
+            select_if_false, select_if_true, condition))
 
 # }}}
 
@@ -558,7 +558,9 @@ class OpenCLCASTBuilder(CASTBuilder):
         """
 
         try:
-            return VectorSelect(ast.rvalue.expr, ast.lvalue.expr, condition_str)
+            from cgen import Assign
+            return Assign(str(ast.lvalue.expr), str(VectorSelect(
+                ast.rvalue.expr, ast.lvalue.expr, condition_str)))
         except AttributeError:
             raise LoopyError("Vector conditionals can only be generated for simple "
                              "assign statements, condition (%s) on instruction (%s) "
diff --git a/test/test_loopy.py b/test/test_loopy.py
index aef6640be..e97a1b333 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2984,7 +2984,7 @@ def test_explicit_simd_selects(ctx_factory):
         data = [lp.GlobalArg('a', shape=(12,), dtype=a.dtype)]
         kwargs = dict(a=a)
         if b is not None:
-            data += [lp.GlobalArg('b', shape=b.shape, dtype=b.dtype)]
+            data += [lp.GlobalArg('b', shape=(12,), dtype=b.dtype)]
             kwargs['b'] = b
         names = [d.name for d in data]
 
-- 
GitLab


From 2813a802f8276dc382d3f22405f68cef6ac4f17a Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Mon, 30 Apr 2018 17:53:14 -0400
Subject: [PATCH 087/144] update test

---
 test/test_loopy.py | 15 ++++++---------
 1 file changed, 6 insertions(+), 9 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index e97a1b333..a930df3e6 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3007,11 +3007,6 @@ def test_explicit_simd_selects(ctx_factory):
         knl = lp.tag_array_axes(knl, names, 'N0,vec')
 
         queue = cl.CommandQueue(ctx)
-        try:
-            print(lp.generate_code_v2(knl).device_code())
-        except exception:
-            pass
-
         if exception is not None:
             with pytest.raises(exception):
                 knl(queue, **kwargs)
@@ -3021,12 +3016,14 @@ def test_explicit_simd_selects(ctx_factory):
 
     ans = np.zeros(12, dtype=np.int32)
     ans[7:] = 1
-    # 1) test a conditional on a vector iname -- currently unimplemented
-    create_and_test('a[i] = 1', 'i > 6', ans, exception=NotImplementedError)
+    from loopy.diagnostic import LoopyError
+    # 1) test a conditional on a vector iname -- currently unimplemented as it
+    # would require creating a 'shadow' vector iname temporary
+    create_and_test('a[i] = 1', 'i > 6', ans, exception=LoopyError)
     # 2) condition on a vector variable -- unimplemented as the i_inner in the
     # condition currently isn't resolved
-    create_and_test('a[i] = 1', 'b[i] > 6', ans, b=np.arange(12, dtype=np.int32),
-                    exception=NotImplementedError)
+    create_and_test('a[i] = 1', 'b[i] > 6', ans, b=np.arange(
+        12, dtype=np.int32).reshape((3, 4)))
 
 
 def test_check_for_variable_access_ordering():
-- 
GitLab


From 88f4ad57dcef7a70accfd26bb9e210055950ba9b Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Mon, 30 Apr 2018 17:59:56 -0400
Subject: [PATCH 088/144] move into vectorization conditional

---
 loopy/codegen/result.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index 2783c191b..b2a29e571 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -268,7 +268,6 @@ def wrap_in_if(codegen_state, condition_exprs, inner):
 
         if codegen_state.vectorization_info is not None:
             from loopy.symbolic import get_dependencies
-            method = codegen_state.ast_builder.emit_vector_if
             vec_iname = codegen_state.vectorization_info.iname
 
             def check_vec_dep(condition):
@@ -296,6 +295,7 @@ def wrap_in_if(codegen_state, condition_exprs, inner):
 
                     return condition
 
+                method = codegen_state.ast_builder.emit_vector_if
                 mapper = condition_mapper_wrapper
 
         return inner.with_new_ast(
-- 
GitLab


From aa4c5b3828df38c0578c5308f2139cbf1f506e45 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 1 May 2018 10:49:33 -0400
Subject: [PATCH 089/144] in order to get around the vector iname conditional
 issue, we have to fix this parameter before generating the kernel

---
 test/test_transform.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/test_transform.py b/test/test_transform.py
index 210984512..8b91d0a52 100644
--- a/test/test_transform.py
+++ b/test/test_transform.py
@@ -260,6 +260,9 @@ def test_vectorize(ctx_factory):
             split_kwargs=dict(slabs=(0, 1)))
 
     knl = lp.tag_data_axes(knl, "a,b", "c,vec")
+    # note: in order to eliminate a vector iname from the (implicit) conditional on
+    # the size of the i-loop, we must fix `n` before generating the kernel
+    knl = lp.fix_parameters(knl, n=30)
     ref_knl = knl
     ref_knl = lp.tag_inames(ref_knl, {"i_inner": "unr"})
 
@@ -270,8 +273,7 @@ def test_vectorize(ctx_factory):
     code, inf = lp.generate_code(knl)
 
     lp.auto_test_vs_ref(
-            ref_knl, ctx, knl,
-            parameters=dict(n=30))
+            ref_knl, ctx, knl)
 
 
 def test_extract_subst(ctx_factory):
-- 
GitLab


From eaa117816f41fe189ac6bbb6f3d86d545c7f5dc5 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 1 May 2018 11:16:21 -0400
Subject: [PATCH 090/144] fix n in the vector types example

---
 examples/python/vector-types.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/examples/python/vector-types.py b/examples/python/vector-types.py
index 328aea154..17bc07755 100644
--- a/examples/python/vector-types.py
+++ b/examples/python/vector-types.py
@@ -14,8 +14,9 @@ knl = lp.make_kernel(
         "out[i] = 2*a[i]")
 
 knl = lp.set_options(knl, write_code=True)
+knl = lp.set_parameters(knl, n=n)
 knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec")
 knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4)
 knl = lp.tag_array_axes(knl, "a,out", "C,vec")
 
-knl(queue, a=a.reshape(-1, 4), n=n)
+knl(queue, a=a.reshape(-1, 4))
-- 
GitLab


From 3ea05074064ec713500edb48fb3032919401db79 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 1 May 2018 11:41:55 -0400
Subject: [PATCH 091/144] s/set/fix

---
 examples/python/vector-types.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/python/vector-types.py b/examples/python/vector-types.py
index 17bc07755..82aadb817 100644
--- a/examples/python/vector-types.py
+++ b/examples/python/vector-types.py
@@ -14,7 +14,7 @@ knl = lp.make_kernel(
         "out[i] = 2*a[i]")
 
 knl = lp.set_options(knl, write_code=True)
-knl = lp.set_parameters(knl, n=n)
+knl = lp.fix_parameters(knl, n=n)
 knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec")
 knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4)
 knl = lp.tag_array_axes(knl, "a,out", "C,vec")
-- 
GitLab


From 7aa4ac91d907ec33b23cd51f780d2ccf8ae8336d Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 2 May 2018 14:58:21 -0400
Subject: [PATCH 092/144] update vectorizability checker

---
 loopy/expression.py | 48 +++++++++++++++++++++++++++++++++++++++++----
 1 file changed, 44 insertions(+), 4 deletions(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index 9fe918620..cb7926bce 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -90,10 +90,35 @@ class VectorizabilityChecker(RecursiveMapper):
     def map_call(self, expr):
         # FIXME: Should implement better vectorization check for function calls
 
+        # https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/mathFunctions.html
+        # this is a simple list of math functions from OpenCL-1.2
+        functions = """acos    acosh   acospi  asin
+        asinh   asinpi  atan    atan2
+        atanh   atanpi  atan2pi cbrt
+        ceil    copysign    cos cosh
+        cospi   erfc    erf exp
+        exp2    exp10   expm1   fabs
+        fdim    floor   fma fmax
+        fmin    fmod    fract   frexp
+        hypot   ilogb   ldexp   lgamma
+        lgamma_r    log log2    log10
+        log1p   logb    mad maxmag
+        minmag  modf    nan nextafter
+        pow pown    powr    remainder
+        remquo  rint    rootn   round
+        rsqrt   sin sincos  sinh
+        sinpi   sqrt    tan tanh
+        tanpi   tgamma  trunc"""
+
+        functions = [x.strip() for x in functions.split() if x.strip()]
+
         rec_pars = [
                 self.rec(child) for child in expr.parameters]
         if any(rec_pars):
-            raise Unvectorizable("fucntion calls cannot yet be vectorized")
+            if expr.name not in functions:
+                return Unvectorizable(
+                    'Function {} is not known to be vectorizable'.format(expr.name))
+            return True
 
         return False
 
@@ -216,17 +241,32 @@ class VectorizabilityChecker(RecursiveMapper):
         return False
 
     def map_comparison(self, expr):
-        # FIXME: These actually can be vectorized:
         # https://www.khronos.org/registry/cl/sdk/1.0/docs/man/xhtml/relationalFunctions.html
 
+        # even better for OpenCL <, <=, >, >=, !=, == are all vectorizable by default
+        # (see: sec 6.3.d-6.d.3 in OpenCL-1.2 docs)
+
+        if expr.operator in ["<", "<=", ">", ">=", "!=", "=="]:
+            return any(self.rec(x) for x in [expr.left, expr.right])
+
         raise Unvectorizable()
 
     def map_logical_not(self, expr):
-        raise Unvectorizable()
+        # 6.3.h in OpenCL-1.2 docs
+        return self.rec(expr.child)
+
+    def map_logical_and(self, expr):
+        # 6.3.h in OpenCL-1.2 docs
+        return any(self.rec(x) for x in expr.children)
 
-    map_logical_and = map_logical_not
     map_logical_or = map_logical_not
 
+    # sec 6.3.f in OpenCL-1.2 docs
+    map_bitwise_not = map_logical_not
+    map_bitwise_or = map_logical_and
+    map_bitwise_xor = map_logical_and
+    map_bitwise_and = map_logical_and
+
     def map_reduction(self, expr):
         # FIXME: Do this more carefully
         raise Unvectorizable()
-- 
GitLab


From e611207f924169d7ff8c6b9846a4720333ccd549 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 2 May 2018 16:28:46 -0400
Subject: [PATCH 093/144] update vectorizability checker to recognize (some)
 functions, and conditionals & test

---
 loopy/expression.py | 50 ++++++++++++++-------------
 test/test_loopy.py  | 83 +++++++++++++++++++++++++++++++++++++++------
 2 files changed, 99 insertions(+), 34 deletions(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index cb7926bce..fae65f043 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -63,6 +63,30 @@ class VectorizabilityChecker(RecursiveMapper):
     .. attribute:: vec_iname
     """
 
+    # this is a simple list of math functions from OpenCL-1.2
+    # https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/mathFunctions.html
+    # this could be expanded / moved to it's own target specific VecCheck if
+    # necessary
+    functions = """acos    acosh   acospi  asin
+    asinh   asinpi  atan    atan2
+    atanh   atanpi  atan2pi cbrt
+    ceil    copysign    cos cosh
+    cospi   erfc    erf exp
+    exp2    exp10   expm1   fabs
+    fdim    floor   fma fmax
+    fmin    fmod    fract   frexp
+    hypot   ilogb   ldexp   lgamma
+    lgamma_r    log log2    log10
+    log1p   logb    mad maxmag
+    minmag  modf    nan nextafter
+    pow pown    powr    remainder
+    remquo  rint    rootn   round
+    rsqrt   sin sincos  sinh
+    sinpi   sqrt    tan tanh
+    tanpi   tgamma  trunc"""
+
+    functions = [x.strip() for x in functions.split() if x.strip()]
+
     def __init__(self, kernel, vec_iname, vec_iname_length):
         self.kernel = kernel
         self.vec_iname = vec_iname
@@ -90,32 +114,10 @@ class VectorizabilityChecker(RecursiveMapper):
     def map_call(self, expr):
         # FIXME: Should implement better vectorization check for function calls
 
-        # https://www.khronos.org/registry/OpenCL/sdk/1.2/docs/man/xhtml/mathFunctions.html
-        # this is a simple list of math functions from OpenCL-1.2
-        functions = """acos    acosh   acospi  asin
-        asinh   asinpi  atan    atan2
-        atanh   atanpi  atan2pi cbrt
-        ceil    copysign    cos cosh
-        cospi   erfc    erf exp
-        exp2    exp10   expm1   fabs
-        fdim    floor   fma fmax
-        fmin    fmod    fract   frexp
-        hypot   ilogb   ldexp   lgamma
-        lgamma_r    log log2    log10
-        log1p   logb    mad maxmag
-        minmag  modf    nan nextafter
-        pow pown    powr    remainder
-        remquo  rint    rootn   round
-        rsqrt   sin sincos  sinh
-        sinpi   sqrt    tan tanh
-        tanpi   tgamma  trunc"""
-
-        functions = [x.strip() for x in functions.split() if x.strip()]
-
         rec_pars = [
                 self.rec(child) for child in expr.parameters]
         if any(rec_pars):
-            if expr.name not in functions:
+            if str(expr.function) not in VectorizabilityChecker.functions:
                 return Unvectorizable(
                     'Function {} is not known to be vectorizable'.format(expr.name))
             return True
@@ -259,7 +261,7 @@ class VectorizabilityChecker(RecursiveMapper):
         # 6.3.h in OpenCL-1.2 docs
         return any(self.rec(x) for x in expr.children)
 
-    map_logical_or = map_logical_not
+    map_logical_or = map_logical_and
 
     # sec 6.3.f in OpenCL-1.2 docs
     map_bitwise_not = map_logical_not
diff --git a/test/test_loopy.py b/test/test_loopy.py
index a930df3e6..df3e34fde 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2979,7 +2979,7 @@ def test_explicit_simd_selects(ctx_factory):
     ctx = ctx_factory()
 
     def create_and_test(insn, condition, answer, exception=None, a=None, b=None,
-                        c=None):
+                        extra_insns=None):
         a = np.zeros((3, 4), dtype=np.int32) if a is None else a
         data = [lp.GlobalArg('a', shape=(12,), dtype=a.dtype)]
         kwargs = dict(a=a)
@@ -2988,17 +2988,17 @@ def test_explicit_simd_selects(ctx_factory):
             kwargs['b'] = b
         names = [d.name for d in data]
 
-        if c is not None:
-            data += [lp.ValueArg('c', dtype=c.dtype)]
-            kwargs['c'] = c
-
         knl = lp.make_kernel(['{[i]: 0 <= i < 12}'],
             """
-            if %(condition)s
-                %(insn)s
+            for i
+                %(extra)s
+                if %(condition)s
+                    %(insn)s
+                end
             end
             """ % dict(condition=condition,
-                       insn=insn),
+                       insn=insn,
+                       extra=extra_insns if extra_insns else ''),
             data
             )
 
@@ -3020,10 +3020,73 @@ def test_explicit_simd_selects(ctx_factory):
     # 1) test a conditional on a vector iname -- currently unimplemented as it
     # would require creating a 'shadow' vector iname temporary
     create_and_test('a[i] = 1', 'i > 6', ans, exception=LoopyError)
-    # 2) condition on a vector variable -- unimplemented as the i_inner in the
-    # condition currently isn't resolved
+    # 2) condition on a vector variable
     create_and_test('a[i] = 1', 'b[i] > 6', ans, b=np.arange(
         12, dtype=np.int32).reshape((3, 4)))
+    # 3) condition on a vector temporary
+    create_and_test('a[i] = 1', '1', ans, extra_insns='<> c = i < 6')
+
+
+def test_vectorizability():
+    # check new vectorizability conditions
+    from loopy.kernel.array import VectorArrayDimTag
+    from loopy.kernel.data import VectorizeTag
+
+    def create_and_test(insn, exception=None, a=None, b=None):
+        a = np.zeros((3, 4), dtype=np.int32) if a is None else a
+        data = [lp.GlobalArg('a', shape=(12,), dtype=a.dtype)]
+        kwargs = dict(a=a)
+        if b is not None:
+            data += [lp.GlobalArg('b', shape=(12,), dtype=b.dtype)]
+            kwargs['b'] = b
+        names = [d.name for d in data]
+
+        knl = lp.make_kernel(['{[i]: 0 <= i < 12}'],
+            """
+            for i
+                %(insn)s
+            end
+            """ % dict(insn=insn),
+            data
+            )
+
+        knl = lp.split_iname(knl, 'i', 4, inner_tag='vec')
+        knl = lp.split_array_axis(knl, names, 0, 4)
+        knl = lp.tag_array_axes(knl, names, 'N0,vec')
+        knl = lp.preprocess_kernel(knl)
+        lp.generate_code_v2(knl).device_code()
+        assert knl.instructions[0].within_inames & set(['i_inner'])
+        assert isinstance(knl.args[0].dim_tags[-1], VectorArrayDimTag)
+        assert isinstance(knl.args[0].dim_tags[-1], VectorArrayDimTag)
+        assert isinstance(knl.iname_to_tag['i_inner'], VectorizeTag)
+
+    def run(op_list=[], unary_operators=[], func_list=[], unary_funcs=[]):
+        for op in op_list:
+            template = 'a[i] = a[i] %(op)s %(rval)s' \
+                if op not in unary_operators else 'a[i] = %(op)s a[i]'
+
+            create_and_test(template % dict(op=op, rval='1'))
+            create_and_test(template % dict(op=op, rval='a[i]'))
+        for func in func_list:
+            template = 'a[i] = %(func)s(a[i], %(rval)s)' \
+                if func not in unary_funcs else 'a[i] = %(func)s(a[i])'
+            create_and_test(template % dict(func=func, rval='1'))
+            create_and_test(template % dict(func=func, rval='a[i]'))
+
+    # 1) comparisons
+    run(['>', '>=', '<', '<=', '==', '!='])
+
+    # 2) logical operators
+    run(['and', 'or', 'not'], ['not'])
+
+    # 3) bitwize operators
+    # bitwize and '&' is broken in parsing currently (#139)
+    # bitwize xor '^' not not implemented in codegen
+    run(['~', '|'], ['~'])
+
+    # 4) functions -- a random selection of the enabled math functions in opencl
+    run(func_list=['acos', 'exp10', 'atan2', 'round'],
+        unary_funcs=['round', 'acos', 'exp10'])
 
 
 def test_check_for_variable_access_ordering():
-- 
GitLab


From ea7c371538ce56271d6f2f775ef4c522bdb466e1 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 2 May 2018 17:05:45 -0400
Subject: [PATCH 094/144] force check all children

---
 loopy/expression.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index fae65f043..3340cf46d 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -249,7 +249,7 @@ class VectorizabilityChecker(RecursiveMapper):
         # (see: sec 6.3.d-6.d.3 in OpenCL-1.2 docs)
 
         if expr.operator in ["<", "<=", ">", ">=", "!=", "=="]:
-            return any(self.rec(x) for x in [expr.left, expr.right])
+            return any([self.rec(x) for x in [expr.left, expr.right]])
 
         raise Unvectorizable()
 
-- 
GitLab


From dc27caa383bdbd0717dc480b961e51a1339841ae Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 2 May 2018 17:06:06 -0400
Subject: [PATCH 095/144] Better detection of vectorized conditionals (consider
 vector arrays)

---
 loopy/codegen/result.py | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index b2a29e571..23309747b 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -268,11 +268,25 @@ def wrap_in_if(codegen_state, condition_exprs, inner):
 
         if codegen_state.vectorization_info is not None:
             from loopy.symbolic import get_dependencies
+            from loopy.kernel.array import VectorArrayDimTag
+
             vec_iname = codegen_state.vectorization_info.iname
 
+            # precalculate vector arrays / temporaries
+            knl = codegen_state.kernel
+            vec_arys = set([x.name for x in knl.args + list(
+                knl.temporary_variables.values()) if any(
+                    isinstance(dt, VectorArrayDimTag)
+                    for dt in x.dim_tags)])
+
             def check_vec_dep(condition):
+                deps = get_dependencies(condition)
                 # check conditions for explicit vector iname dependecies
-                return len(get_dependencies(condition) & set([vec_iname]))
+                if len(deps & set([vec_iname])):
+                    return True
+                # check for vector temporaries / arrays in conditional
+                if len(deps & vec_arys):
+                    return True
 
             if any(check_vec_dep(cond) for cond in condition_exprs):
                 # condition directly involves a vector array or iname
-- 
GitLab


From 98f249c004072eff16ac55515ee93fb151d0b9b2 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 2 May 2018 17:02:13 -0400
Subject: [PATCH 096/144] update selection test

---
 loopy/target/opencl.py | 14 ++++++++++++++
 test/test_loopy.py     | 12 +++++++++---
 2 files changed, 23 insertions(+), 3 deletions(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 653a8e18d..1ce8bc59e 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -326,6 +326,20 @@ class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper):
     def map_local_hw_index(self, expr, type_context):
         return var("lid")(expr.axis)
 
+    def map_comparison(self, expr, type_context):
+        from loopy.symbolic import get_dependencies
+        from loopy.kernel.data import VectorizeTag
+        vec_inames = set([x for x in self.kernel.iname_to_tag
+                          if isinstance(self.kernel.iname_to_tag[x], VectorizeTag)])
+        if get_dependencies(expr) & vec_inames and \
+                self.codegen_state.insn_was_not_vectorizable:
+            raise LoopyError("Cannot unroll a vector-iname comparison, as scalar"
+                             " assignment results in incorrect 'truthiness' for "
+                             " vector dtypes.")
+
+        return super(ExpressionToOpenCLCExpressionMapper, self).map_comparison(
+            expr, type_context)
+
 # }}}
 
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index df3e34fde..20aaebbc6 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3020,11 +3020,17 @@ def test_explicit_simd_selects(ctx_factory):
     # 1) test a conditional on a vector iname -- currently unimplemented as it
     # would require creating a 'shadow' vector iname temporary
     create_and_test('a[i] = 1', 'i > 6', ans, exception=LoopyError)
-    # 2) condition on a vector variable
+    # 2) condition on a vector array
     create_and_test('a[i] = 1', 'b[i] > 6', ans, b=np.arange(
         12, dtype=np.int32).reshape((3, 4)))
-    # 3) condition on a vector temporary
-    create_and_test('a[i] = 1', '1', ans, extra_insns='<> c = i < 6')
+    # 3) condition on a vector temporary -- this is currently broken for the
+    # same reason as #1
+    create_and_test('a[i] = 1', 'c', ans, extra_insns='<> c = i < 6',
+                    exception=LoopyError)
+    # 4) condition on an assigned vector array, this should work as assignment to a
+    # vector can be safely unrolled
+    create_and_test('a[i] = 1', 'b[i] > 6', ans, b=np.zeros((3, 4), dtype=np.int32),
+                    extra_insns='b[i] = i')
 
 
 def test_vectorizability():
-- 
GitLab


From 00380d232c76f81137abea16d38370eb894760a1 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 3 May 2018 10:54:53 -0400
Subject: [PATCH 097/144] switch to iname_to_tags / filter_iname_tags_by_type
 for most recent merge

---
 loopy/expression.py          | 4 ++--
 loopy/target/opencl.py       | 7 ++++---
 loopy/transform/privatize.py | 3 ++-
 test/test_loopy.py           | 4 ++--
 4 files changed, 10 insertions(+), 8 deletions(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index 3340cf46d..a93f7127d 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -144,7 +144,7 @@ class VectorizabilityChecker(RecursiveMapper):
 
         # determine allowed symbols as non-vector inames
         from pymbolic.primitives import Variable
-        allowed_symbols = dict((sym, Variable(sym)) for sym in kernel.iname_to_tag
+        allowed_symbols = dict((sym, Variable(sym)) for sym in kernel.all_inames()
                                if sym != vec_iname)
         from loopy.kernel.instruction import Assignment
         from loopy.tools import is_integer
@@ -197,7 +197,7 @@ class VectorizabilityChecker(RecursiveMapper):
             # or, if not vector index, and vector iname is present
             elif self.vec_iname in set(x.name for x in deps):
                 # check whether we can simplify out the vector iname
-                context = dict((x, x) for x in deps if x.name != self.vec_iname)
+                context = dict((str(x), x) for x in deps if x.name != self.vec_iname)
                 allowed_symbols = self.compile_time_constants(
                     self.kernel, self.vec_iname)
 
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 1ce8bc59e..75fbc8894 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -328,9 +328,10 @@ class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper):
 
     def map_comparison(self, expr, type_context):
         from loopy.symbolic import get_dependencies
-        from loopy.kernel.data import VectorizeTag
-        vec_inames = set([x for x in self.kernel.iname_to_tag
-                          if isinstance(self.kernel.iname_to_tag[x], VectorizeTag)])
+        from loopy.kernel.data import VectorizeTag, filter_iname_tags_by_type
+        vec_inames = set([x for x in self.kernel.all_inames()
+                          if filter_iname_tags_by_type(
+                            self.kernel.iname_to_tags[x], VectorizeTag)])
         if get_dependencies(expr) & vec_inames and \
                 self.codegen_state.insn_was_not_vectorizable:
             raise LoopyError("Cannot unroll a vector-iname comparison, as scalar"
diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index c7d6fe62c..15aa3d945 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -123,7 +123,8 @@ def privatize_temporaries_with_inames(
 
     def find_privitzing_inames(writer_insn, iname, temp_var):
         # test that -- a) the iname is an ILP or vector tag
-        if isinstance(kernel.iname_to_tag.get(iname), (IlpBaseTag, VectorizeTag)):
+        if filter_iname_tags_by_type(kernel.iname_to_tag[iname],
+                                     (IlpBaseTag, VectorizeTag)):
             # check for user specified type
             if temp_var.force_scalar:
                 if iname in writer_insn.read_dependency_names():
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 20aaebbc6..568d686bd 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3036,7 +3036,7 @@ def test_explicit_simd_selects(ctx_factory):
 def test_vectorizability():
     # check new vectorizability conditions
     from loopy.kernel.array import VectorArrayDimTag
-    from loopy.kernel.data import VectorizeTag
+    from loopy.kernel.data import VectorizeTag, filter_iname_tags_by_type
 
     def create_and_test(insn, exception=None, a=None, b=None):
         a = np.zeros((3, 4), dtype=np.int32) if a is None else a
@@ -3064,7 +3064,7 @@ def test_vectorizability():
         assert knl.instructions[0].within_inames & set(['i_inner'])
         assert isinstance(knl.args[0].dim_tags[-1], VectorArrayDimTag)
         assert isinstance(knl.args[0].dim_tags[-1], VectorArrayDimTag)
-        assert isinstance(knl.iname_to_tag['i_inner'], VectorizeTag)
+        assert filter_iname_tags_by_type(knl.iname_to_tags['i_inner'], VectorizeTag)
 
     def run(op_list=[], unary_operators=[], func_list=[], unary_funcs=[]):
         for op in op_list:
-- 
GitLab


From c640ec0d41af3dbaf0a51268c767aaf557ade6c6 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 3 May 2018 10:56:23 -0400
Subject: [PATCH 098/144] s/tag/tags

---
 loopy/transform/privatize.py | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index 15aa3d945..618bdb7a4 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -123,7 +123,7 @@ def privatize_temporaries_with_inames(
 
     def find_privitzing_inames(writer_insn, iname, temp_var):
         # test that -- a) the iname is an ILP or vector tag
-        if filter_iname_tags_by_type(kernel.iname_to_tag[iname],
+        if filter_iname_tags_by_type(kernel.iname_to_tags[iname],
                                      (IlpBaseTag, VectorizeTag)):
             # check for user specified type
             if temp_var.force_scalar:
@@ -268,9 +268,10 @@ def privatize_temporaries_with_inames(
 
             # the only O.K. case here is that the user specified that the instruction
             # should be a vector, and all the missing iname tags are vectors.
-            if not getattr(insn, 'force_vector', False) and all(isinstance(
-                kernel.iname_to_tag.get(iname), VectorizeTag) for x in
-                    eiii.seen_ilp_inames - insn.within_inames):
+            if not getattr(insn, 'force_vector', False) and all(
+                    filter_iname_tags_by_type(kernel.iname_to_tags[iname],
+                                              VectorizeTag)
+                    for x in eiii.seen_ilp_inames - insn.within_inames):
                 raise LoopyError(
                     "Kernel '%s': Instruction '%s': touched variable that "
                     "(for privatization, e.g. as performed for ILP) "
-- 
GitLab


From 0cccfe6116f06a029d4f2f4342db8d56d83610d7 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 3 May 2018 11:10:22 -0400
Subject: [PATCH 099/144] add back bitwise and ('&'), since #139 was caused by
 my old version of pymbolic

---
 test/test_loopy.py | 7 +++----
 1 file changed, 3 insertions(+), 4 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index 568d686bd..a76cb9400 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3085,10 +3085,9 @@ def test_vectorizability():
     # 2) logical operators
     run(['and', 'or', 'not'], ['not'])
 
-    # 3) bitwize operators
-    # bitwize and '&' is broken in parsing currently (#139)
-    # bitwize xor '^' not not implemented in codegen
-    run(['~', '|'], ['~'])
+    # 3) bitwise operators
+    # bitwise xor '^' not not implemented in codegen
+    run(['~', '|', '&'], ['~'])
 
     # 4) functions -- a random selection of the enabled math functions in opencl
     run(func_list=['acos', 'exp10', 'atan2', 'round'],
-- 
GitLab


From 49b44df66966eefd1df96d31b05069e70021b81d Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 8 May 2018 17:06:28 -0400
Subject: [PATCH 100/144] Add support for multiple (simple) assignments in a
 single vector-conditional, and test

---
 loopy/target/opencl.py | 37 +++++++++++++++++++++++++++++--------
 test/test_loopy.py     | 18 +++++++++++++++---
 2 files changed, 44 insertions(+), 11 deletions(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 75fbc8894..740a19901 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -572,14 +572,35 @@ class OpenCLCASTBuilder(CASTBuilder):
         Emit's a vector select function
         """
 
-        try:
-            from cgen import Assign
-            return Assign(str(ast.lvalue.expr), str(VectorSelect(
-                ast.rvalue.expr, ast.lvalue.expr, condition_str)))
-        except AttributeError:
-            raise LoopyError("Vector conditionals can only be generated for simple "
-                             "assign statements, condition (%s) on instruction (%s) "
-                             "invalid" % (str(condition_str), str(ast)))
+        def vecify(assign):
+            try:
+                # treat it as an assignment
+                return Assign(str(assign.lvalue.expr), str(VectorSelect(
+                    assign.rvalue.expr, assign.lvalue.expr, condition_str)))
+            except AttributeError:
+                return False
+
+        from cgen import Assign, Block
+        vec_if = vecify(ast)
+        if not vec_if:
+            try:
+                vec_if = []
+                for assign in ast.contents:
+                    vec_if.append(vecify(assign))
+                if any(not x for x in vec_if):
+                    # one 'assign' failed
+                    vec_if = False
+                else:
+                    vec_if = Block(vec_if)
+            except AttributeError:
+                vec_if = False
+        if not vec_if:
+            raise LoopyError(
+                "Vector conditionals can only be generated for simple "
+                "assign statements, condition (%s) on instruction (%s) "
+                "invalid" % (str(condition_str), str(ast)))
+
+        return vec_if
 
     def add_vector_access(self, access_expr, index):
         # The 'int' avoids an 'L' suffix for long ints.
diff --git a/test/test_loopy.py b/test/test_loopy.py
index a76cb9400..c63bf2ba2 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2979,13 +2979,16 @@ def test_explicit_simd_selects(ctx_factory):
     ctx = ctx_factory()
 
     def create_and_test(insn, condition, answer, exception=None, a=None, b=None,
-                        extra_insns=None):
+                        extra_insns=None, c=None):
         a = np.zeros((3, 4), dtype=np.int32) if a is None else a
         data = [lp.GlobalArg('a', shape=(12,), dtype=a.dtype)]
         kwargs = dict(a=a)
         if b is not None:
             data += [lp.GlobalArg('b', shape=(12,), dtype=b.dtype)]
             kwargs['b'] = b
+        if c is not None:
+            data += [lp.GlobalArg('c', shape=(12,), dtype=b.dtype)]
+            kwargs['c'] = c
         names = [d.name for d in data]
 
         knl = lp.make_kernel(['{[i]: 0 <= i < 12}'],
@@ -3011,8 +3014,11 @@ def test_explicit_simd_selects(ctx_factory):
             with pytest.raises(exception):
                 knl(queue, **kwargs)
         else:
-            result = knl(queue, **kwargs)[1][0]
-            assert np.array_equal(result.flatten('C'), answer)
+            if not isinstance(answer, tuple):
+                answer = (answer,)
+            result = knl(queue, **kwargs)[1]
+            for r, a in zip(result, answer):
+                assert np.array_equal(r.flatten('C'), a)
 
     ans = np.zeros(12, dtype=np.int32)
     ans[7:] = 1
@@ -3031,6 +3037,12 @@ def test_explicit_simd_selects(ctx_factory):
     # vector can be safely unrolled
     create_and_test('a[i] = 1', 'b[i] > 6', ans, b=np.zeros((3, 4), dtype=np.int32),
                     extra_insns='b[i] = i')
+    # 5) a block of simple assignments, this should be seemlessly translated to
+    # multiple vector if statements
+    c_ans = np.ones(12, dtype=np.int32)
+    c_ans[7:] = 0
+    create_and_test('a[i] = 1\nc[i] = 0', 'b[i] > 6', (ans, c_ans), b=np.arange(
+        12, dtype=np.int32).reshape((3, 4)), c=np.ones((3, 4), dtype=np.int32))
 
 
 def test_vectorizability():
-- 
GitLab


From fa37fd34e8451d7710f909e77df25fcef5ba0b69 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 8 May 2018 17:59:59 -0400
Subject: [PATCH 101/144] inner loop was overwriting condition expression --
 and add test

---
 loopy/codegen/result.py | 4 ++--
 test/test_loopy.py      | 4 ++++
 2 files changed, 6 insertions(+), 2 deletions(-)

diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index 23309747b..b8c654f91 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -296,8 +296,8 @@ def wrap_in_if(codegen_state, condition_exprs, inner):
                     from loopy.diagnostic import LoopyError
                     deps = set()
                     try:
-                        for condition in condition.expr.children:
-                            deps |= get_dependencies(condition)
+                        for c in condition.expr.children:
+                            deps |= get_dependencies(c)
 
                         if deps & set([vec_iname]):
                             # we'd have to insert our own mirror temporary of the
diff --git a/test/test_loopy.py b/test/test_loopy.py
index c63bf2ba2..cc287f0de 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3043,6 +3043,10 @@ def test_explicit_simd_selects(ctx_factory):
     c_ans[7:] = 0
     create_and_test('a[i] = 1\nc[i] = 0', 'b[i] > 6', (ans, c_ans), b=np.arange(
         12, dtype=np.int32).reshape((3, 4)), c=np.ones((3, 4), dtype=np.int32))
+    # 6) test a negated conditional
+    ans_negated = np.invert(ans) + 2
+    create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange(
+        12, dtype=np.int32).reshape((3, 4)))
 
 
 def test_vectorizability():
-- 
GitLab


From 59cc874bf71ba0d4826fc41ef859f3f1d7de86b9 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 10 May 2018 11:10:20 -0400
Subject: [PATCH 102/144] add explicit simd type conversions and tests

---
 loopy/target/opencl.py | 12 ++++++++++++
 test/test_loopy.py     | 27 +++++++++++++++++++++++++++
 2 files changed, 39 insertions(+)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 740a19901..fa2557b80 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -341,6 +341,18 @@ class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper):
         return super(ExpressionToOpenCLCExpressionMapper, self).map_comparison(
             expr, type_context)
 
+    def wrap_in_typecast(self, actual_type, needed_dtype, s):
+        wrap = super(ExpressionToOpenCLCExpressionMapper, self).wrap_in_typecast(
+            actual_type, needed_dtype, s)
+        if self.codegen_state.vectorization_info is not None and (
+                actual_type != needed_dtype):
+            ctype = self.kernel.target.get_dtype_registry().dtype_to_ctype(
+                needed_dtype)
+            vw = self.codegen_state.vectorization_info.length
+            # need to add an explicit conversion
+            return var("convert_%s%d(%s)" % (ctype, vw, wrap))
+        return wrap
+
 # }}}
 
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index cc287f0de..a8c3edc3e 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3049,6 +3049,33 @@ def test_explicit_simd_selects(ctx_factory):
         12, dtype=np.int32).reshape((3, 4)))
 
 
+@pytest.mark.parametrize(('lhs_dtype', 'rhs_dtype'), [
+    (np.int32, np.int64),
+    (np.float32, np.float64)])
+def test_explicit_vector_dtype_conversion(ctx_factory, lhs_dtype, rhs_dtype):
+    ctx = ctx_factory()
+
+    # test that dtype conversion happens correctly between differing vector-dtypes
+
+    vw = 4
+    a_lp = lp.GlobalArg('a', shape=(12,), dtype=rhs_dtype)
+    temp_lp = lp.TemporaryVariable('temp', dtype=lhs_dtype)
+
+    knl = lp.make_kernel(['{[i]: 0 <= i < 12}'],
+            """
+            for i
+                temp = a[i]
+            end
+            """,
+            [a_lp, temp_lp])
+    knl = lp.split_iname(knl, 'i', vw, inner_tag='vec')
+    knl = lp.split_array_axis(knl, 'a', 0, 4)
+    knl = lp.tag_array_axes(knl, 'a', 'N0,vec')
+
+    queue = cl.CommandQueue(ctx)
+    knl(queue, a=np.zeros((12,), dtype=rhs_dtype).reshape((3, 4)))
+
+
 def test_vectorizability():
     # check new vectorizability conditions
     from loopy.kernel.array import VectorArrayDimTag
-- 
GitLab


From d6003640c8ff703b502b46a73c1d149a35b477cf Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 10 May 2018 12:17:02 -0400
Subject: [PATCH 103/144] fix to avoid convert_T on literals

---
 loopy/target/opencl.py | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index fa2557b80..cc1ed0e8a 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -346,6 +346,11 @@ class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper):
             actual_type, needed_dtype, s)
         if self.codegen_state.vectorization_info is not None and (
                 actual_type != needed_dtype):
+            from loopy.symbolic import Literal
+            if isinstance(s, Literal):
+                # if its a literal, no need for explicit conversion
+                return wrap
+
             ctype = self.kernel.target.get_dtype_registry().dtype_to_ctype(
                 needed_dtype)
             vw = self.codegen_state.vectorization_info.length
-- 
GitLab


From 59403ff33ebc4b4940d376f433fb128211f009bf Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 10 May 2018 14:34:56 -0400
Subject: [PATCH 104/144] modify emit_if / emit_vector_if such that they take
 the condition_mapper as an argument, rather than the condition_str itself --
 additionally, modify the condition_mappers such that they can take an
 optional parameter that is the current inner AST to be wrapped in an
 if-statement

---
 loopy/codegen/result.py    | 27 ++++++++++++++++++++-------
 loopy/target/__init__.py   |  4 ++--
 loopy/target/c/__init__.py |  4 ++--
 loopy/target/opencl.py     |  9 +++++----
 loopy/target/python.py     |  4 ++--
 5 files changed, 31 insertions(+), 17 deletions(-)

diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index b8c654f91..cf6fe3ff9 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -261,9 +261,10 @@ def wrap_in_if(codegen_state, condition_exprs, inner):
         cur_ast = inner.current_ast(codegen_state)
         method = codegen_state.ast_builder.emit_if
 
-        def condition_mapper():
+        def condition_mapper(ast=None, type_context=None, needed_dtype=None):
             return codegen_state.expression_to_code_mapper(
-                    LogicalAnd(tuple(condition_exprs)), PREC_NONE)
+                    LogicalAnd(tuple(condition_exprs)), PREC_NONE,
+                    type_context=type_context, needed_dtype=needed_dtype)
         mapper = condition_mapper
 
         if codegen_state.vectorization_info is not None:
@@ -291,12 +292,17 @@ def wrap_in_if(codegen_state, condition_exprs, inner):
             if any(check_vec_dep(cond) for cond in condition_exprs):
                 # condition directly involves a vector array or iname
 
-                def condition_mapper_wrapper():
-                    condition = condition_mapper()
+                def condition_mapper_wrapper(ast=None):
+                    if ast is None:
+                        # default case for printing
+                        return condition_mapper()
+
+                    # get the default condition to check for vectorizability
+                    check = condition_mapper()
                     from loopy.diagnostic import LoopyError
                     deps = set()
                     try:
-                        for c in condition.expr.children:
+                        for c in check.expr.children:
                             deps |= get_dependencies(c)
 
                         if deps & set([vec_iname]):
@@ -307,14 +313,21 @@ def wrap_in_if(codegen_state, condition_exprs, inner):
                     except (AttributeError, TypeError):
                         pass
 
-                    return condition
+                    # get LHS dtype for (potential) casting
+                    from loopy.expression import dtype_to_type_context
+                    lhs_dtype = codegen_state.expression_to_code_mapper.infer_type(
+                        ast.lvalue.expr)
+                    type_context = dtype_to_type_context(codegen_state.kernel.target,
+                        lhs_dtype)
+                    return condition_mapper(
+                        type_context=type_context, needed_dtype=lhs_dtype)
 
                 method = codegen_state.ast_builder.emit_vector_if
                 mapper = condition_mapper_wrapper
 
         return inner.with_new_ast(
                 codegen_state,
-                method(mapper(), cur_ast))
+                method(mapper, cur_ast))
 
     return inner
 
diff --git a/loopy/target/__init__.py b/loopy/target/__init__.py
index c3b7a739c..7dbec5a1a 100644
--- a/loopy/target/__init__.py
+++ b/loopy/target/__init__.py
@@ -217,10 +217,10 @@ class ASTBuilderBase(object):
     def can_implement_conditionals(self):
         return False
 
-    def emit_if(self, condition_str, ast):
+    def emit_if(self, condition_mapper, ast):
         raise NotImplementedError()
 
-    def emit_vector_if(self, condition_str, ast):
+    def emit_vector_if(self, condition_mapper, ast):
         raise NotImplementedError()
 
     def emit_initializer(self, codegen_state, dtype, name, val_str, is_const):
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 8e69793e8..d63cdfd58 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -957,9 +957,9 @@ class CASTBuilder(ASTBuilderBase):
     def can_implement_conditionals(self):
         return True
 
-    def emit_if(self, condition_str, ast):
+    def emit_if(self, condition_mapper, ast):
         from cgen import If
-        return If(condition_str, ast)
+        return If(condition_mapper(), ast)
 
     # }}}
 
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index cc1ed0e8a..8ff26fe1a 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -355,7 +355,7 @@ class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper):
                 needed_dtype)
             vw = self.codegen_state.vectorization_info.length
             # need to add an explicit conversion
-            return var("convert_%s%d(%s)" % (ctype, vw, wrap))
+            return var("convert_%s%d" % (ctype, vw))(wrap)
         return wrap
 
 # }}}
@@ -584,7 +584,7 @@ class OpenCLCASTBuilder(CASTBuilder):
             pass
         return assignment
 
-    def emit_vector_if(self, condition_str, ast):
+    def emit_vector_if(self, condition_mapper, ast):
         """
         Emit's a vector select function
         """
@@ -593,7 +593,8 @@ class OpenCLCASTBuilder(CASTBuilder):
             try:
                 # treat it as an assignment
                 return Assign(str(assign.lvalue.expr), str(VectorSelect(
-                    assign.rvalue.expr, assign.lvalue.expr, condition_str)))
+                    assign.rvalue.expr, assign.lvalue.expr,
+                    condition_mapper(assign))))
             except AttributeError:
                 return False
 
@@ -615,7 +616,7 @@ class OpenCLCASTBuilder(CASTBuilder):
             raise LoopyError(
                 "Vector conditionals can only be generated for simple "
                 "assign statements, condition (%s) on instruction (%s) "
-                "invalid" % (str(condition_str), str(ast)))
+                "invalid" % (str(condition_mapper()), str(ast)))
 
         return vec_if
 
diff --git a/loopy/target/python.py b/loopy/target/python.py
index ce04986d3..ca4b116d0 100644
--- a/loopy/target/python.py
+++ b/loopy/target/python.py
@@ -287,9 +287,9 @@ class PythonASTBuilderBase(ASTBuilderBase):
     def can_implement_conditionals(self):
         return True
 
-    def emit_if(self, condition_str, ast):
+    def emit_if(self, condition_mapper, ast):
         from genpy import If
-        return If(condition_str, ast)
+        return If(condition_mapper(), ast)
 
     def emit_assignment(self, codegen_state, insn):
         ecm = codegen_state.expression_to_code_mapper
-- 
GitLab


From 6c0b86a7d66484a471025a34500dc62d4deb7036 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 10 May 2018 14:36:02 -0400
Subject: [PATCH 105/144] update type inferencing for conditionals /
 comparisons, to pick up the right dtype (int64 vs int32), as this is required
 for proper conversion of conditions in vector selects

---
 loopy/type_inference.py | 22 +++++++++++++++++-----
 1 file changed, 17 insertions(+), 5 deletions(-)

diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 010a0658f..4b616f71c 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -349,14 +349,26 @@ class TypeInferenceMapper(CombineMapper):
         dtype = field[0]
         return [NumpyType(dtype)]
 
-    def map_comparison(self, expr):
+    def map_comparison_types(self, dtype):
         # "bool" is unusable because OpenCL's bool has indeterminate memory
         # format.
-        return [NumpyType(np.dtype(np.int32))]
 
-    map_logical_not = map_comparison
-    map_logical_and = map_comparison
-    map_logical_or = map_comparison
+        if dtype[0].itemsize == 8:
+            return [NumpyType(np.dtype(np.int64))]
+        else:
+            return [NumpyType(np.dtype(np.int32))]
+
+    def map_logical_not(self, expr):
+        return self.map_comparison_types(self.rec(expr.child))
+
+    def map_logical_and(self, expr):
+        return self.map_comparison_types(
+            self.combine([self.rec(x) for x in expr.children]))
+    map_logical_or = map_logical_and
+
+    def map_comparison(self, expr):
+        return self.map_comparison_types(
+            self.combine([self.rec(expr.left), self.rec(expr.right)]))
 
     def map_group_hw_index(self, expr, *args):
         return [self.kernel.index_dtype]
-- 
GitLab


From 1de5aabb23a812ec20d9c21eafa0016e4c2378d5 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 10 May 2018 14:37:25 -0400
Subject: [PATCH 106/144] add test for implicit conversion of dtypes within a
 vector-select conditional

---
 test/test_loopy.py | 4 ++++
 1 file changed, 4 insertions(+)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index a8c3edc3e..3131f1731 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3047,6 +3047,10 @@ def test_explicit_simd_selects(ctx_factory):
     ans_negated = np.invert(ans) + 2
     create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange(
         12, dtype=np.int32).reshape((3, 4)))
+    # 7) test conditional on differing dtype
+    ans_negated = np.invert(ans) + 2
+    create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange(
+        12, dtype=np.int64).reshape((3, 4)))
 
 
 @pytest.mark.parametrize(('lhs_dtype', 'rhs_dtype'), [
-- 
GitLab


From 7f3473b73aa6074f42fd19e0e3f34fb97ceb17a7 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 10 May 2018 14:59:37 -0400
Subject: [PATCH 107/144] convert to callable

---
 loopy/codegen/instruction.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/codegen/instruction.py b/loopy/codegen/instruction.py
index e590502fb..e370eef67 100644
--- a/loopy/codegen/instruction.py
+++ b/loopy/codegen/instruction.py
@@ -64,7 +64,7 @@ def to_codegen_result(
         from pymbolic.primitives import LogicalAnd
         from pymbolic.mapper.stringifier import PREC_NONE
         ast = codegen_state.ast_builder.emit_if(
-                codegen_state.expression_to_code_mapper(
+                lambda: codegen_state.expression_to_code_mapper(
                     LogicalAnd(tuple(condition_exprs)), PREC_NONE),
                 ast)
 
-- 
GitLab


From 978e1f08ea0ba4446d6c13e46e0daf26c3a82f0c Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 10 May 2018 15:30:15 -0400
Subject: [PATCH 108/144] add an empty stub for literal mapping (and some tests
 that triggered the error)

---
 loopy/symbolic.py  | 3 +++
 test/test_loopy.py | 6 ++++++
 2 files changed, 9 insertions(+)

diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index a25d9a30d..52996456d 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -291,6 +291,9 @@ class DependencyMapper(DependencyMapperBase):
     def map_type_cast(self, expr):
         return self.rec(expr.child)
 
+    def map_literal(self, expr):
+        return set()
+
 
 class SubstitutionRuleExpander(IdentityMapper):
     def __init__(self, rules):
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 3131f1731..1fac7ecb7 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3051,6 +3051,12 @@ def test_explicit_simd_selects(ctx_factory):
     ans_negated = np.invert(ans) + 2
     create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange(
         12, dtype=np.int64).reshape((3, 4)))
+    # 7) test conditional on differing dtype (float->int) and (int->float)
+    ans_negated = np.invert(ans) + 2
+    create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange(
+        12, dtype=np.float64).reshape((3, 4)))
+    create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange(
+        12, dtype=np.int64).reshape((3, 4)), a=np.zeros((3, 4), dtype=np.int32))
 
 
 @pytest.mark.parametrize(('lhs_dtype', 'rhs_dtype'), [
-- 
GitLab


From 72da53b80b734a5897cc8fc71e1511464b4d7780 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 10 May 2018 17:27:53 -0400
Subject: [PATCH 109/144] convert conditional dtype to corresponding integer of
 floating point assignment & fix test

---
 loopy/codegen/result.py | 11 ++++++++++-
 test/test_loopy.py      |  2 +-
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index cf6fe3ff9..424cbe6f0 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -313,10 +313,19 @@ def wrap_in_if(codegen_state, condition_exprs, inner):
                     except (AttributeError, TypeError):
                         pass
 
-                    # get LHS dtype for (potential) casting
+                    # get LHS dtype for (potential) casting of condition
                     from loopy.expression import dtype_to_type_context
                     lhs_dtype = codegen_state.expression_to_code_mapper.infer_type(
                         ast.lvalue.expr)
+                    if not lhs_dtype.is_integral():
+                        # the necessary dtype is the integer version of the floating
+                        # point type (e.g., float64 -> int64)
+                        from loopy.types import to_loopy_type
+                        import numpy as np
+                        lhs_dtype = to_loopy_type(
+                            np.dtype('i%d' % lhs_dtype.itemsize),
+                            lhs_dtype.target)
+
                     type_context = dtype_to_type_context(codegen_state.kernel.target,
                         lhs_dtype)
                     return condition_mapper(
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 1fac7ecb7..687310107 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3056,7 +3056,7 @@ def test_explicit_simd_selects(ctx_factory):
     create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange(
         12, dtype=np.float64).reshape((3, 4)))
     create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange(
-        12, dtype=np.int64).reshape((3, 4)), a=np.zeros((3, 4), dtype=np.int32))
+        12, dtype=np.int64).reshape((3, 4)), a=np.zeros((3, 4), dtype=np.float32))
 
 
 @pytest.mark.parametrize(('lhs_dtype', 'rhs_dtype'), [
-- 
GitLab


From 93b19970ced25e9e4ce44e96a0b814704ca129f4 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 10 May 2018 18:06:32 -0400
Subject: [PATCH 110/144] s/warn/warn_with_kernel

---
 loopy/codegen/__init__.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 8a57c3b2d..ab2702d23 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -24,7 +24,7 @@ THE SOFTWARE.
 
 import six
 
-from loopy.diagnostic import LoopyError, warn
+from loopy.diagnostic import LoopyError, warn_with_kernel
 from pytools import ImmutableRecord
 import islpy as isl
 
@@ -333,7 +333,7 @@ class CodeGenerationState(object):
         try:
             return func(self)
         except Unvectorizable as e:
-            warn(self.kernel, "vectorize_failed",
+            warn_with_kernel(self.kernel, "vectorize_failed",
                     "Vectorization of '%s' failed because '%s'"
                     % (what, e))
 
-- 
GitLab


From 8b4745ff6dc1c59bbdba6762bcf0cccfed3e6bb5 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 11 May 2018 11:06:03 -0400
Subject: [PATCH 111/144] ok, so the test finally reproduces the pyjac error

---
 test/test_loopy.py | 42 +++++++++++++++++++++++++++++++++++++-----
 1 file changed, 37 insertions(+), 5 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index 687310107..8158f2d63 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2888,7 +2888,8 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
         <:v> tv3 = 1
         """)
 
-    def make_kernel(insn, ans=None, preamble=None, extra_inames=None):
+    def make_kernel(insn, ans=None, preamble=None, extra_inames=None, skeleton=None,
+                    dtype=None):
         skeleton = """
         %(preamble)s
         for j
@@ -2899,16 +2900,18 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
                 end
             end
         end
-        """
+        """ if skeleton is None else skeleton
+        dtype = dtype if dtype is not None else (
+            ans.dtype if ans is not None else np.int32)
         inames = ['i, j']
         if extra_inames is not None:
             inames += list(extra_inames)
         knl = lp.make_kernel(
             '{[%(inames)s]: 0 <= %(inames)s < 12}' % {'inames': ', '.join(inames)},
             skeleton % dict(insn=insn, preamble='' if not preamble else preamble),
-            [lp.GlobalArg('a', shape=(12, 12)),
+            [lp.GlobalArg('a', shape=(12, 12), dtype=dtype),
              lp.TemporaryVariable('mask', shape=(12,), initializer=np.array(
-                                  np.arange(12) >= 6, dtype=np.int), read_only=True,
+                                  np.arange(12) >= 6, dtype=dtype), read_only=True,
                                   scope=scopes.GLOBAL)])
 
         knl = lp.split_iname(knl, 'j', 4, inner_tag='vec')
@@ -2917,7 +2920,7 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
         knl = lp.preprocess_kernel(knl)
 
         if ans is not None:
-            assert np.array_equal(knl(queue, a=np.zeros((12, 3, 4), dtype=np.int32))[
+            assert np.array_equal(knl(queue, a=np.zeros((12, 3, 4), dtype=dtype))[
                 1][0], ans)
 
         return knl
@@ -2974,6 +2977,35 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
     finally:
         warnings.resetwarnings()
 
+    # modified case from pyjac
+    skeleton = """
+    for j
+        %(preamble)s
+        for i
+            %(insn)s
+            if i > 6
+                <> P_val = 100 {id=pset0, nosync=pset1}
+            else
+                P_val = 0.01 {id=pset1, nosync=pset0}
+            end
+            <> B_sum = 0 {id=bset0}
+            for k
+                B_sum = B_sum + k * a[i, j] {id=bset1, dep=*:bset0}
+            end
+            # here, we are testing that Kc is properly promoted to a vector dtype
+            <> P_sum = P_val * i {id=pset2, dep=pset0:pset1}
+            B_sum = exp(B_sum) {id=bset2, dep=bset0:bset1}
+            <> Kc = P_sum * B_sum {id=kset, dep=bset*:pset2}
+            a[i, j] = Kc {dep=*:kset, nosync=pset0:pset1}
+        end
+    end
+    """
+
+    knl = make_kernel('', dtype=np.float32, skeleton=skeleton, extra_inames='k')
+    from loopy.kernel.array import VectorArrayDimTag
+    assert any(isinstance(x, VectorArrayDimTag)
+               for x in knl.temporary_variables['Kc'].dim_tags)
+
 
 def test_explicit_simd_selects(ctx_factory):
     ctx = ctx_factory()
-- 
GitLab


From 302df79e0a625d125effa1919ac8af477c4d621d Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 11 May 2018 12:10:56 -0400
Subject: [PATCH 112/144] add vector temporary write heuristic

---
 loopy/transform/privatize.py | 41 +++++++++++++++++++++++++++++++++++-
 1 file changed, 40 insertions(+), 1 deletion(-)

diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index 618bdb7a4..565170e98 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -120,6 +120,7 @@ def privatize_temporaries_with_inames(
     wmap = kernel.writer_map()
 
     var_to_new_priv_axis_iname = {}
+    tv_wmap = {}
 
     def find_privitzing_inames(writer_insn, iname, temp_var):
         # test that -- a) the iname is an ILP or vector tag
@@ -153,6 +154,7 @@ def privatize_temporaries_with_inames(
                 continue
             writer_insn = kernel.id_to_insn[writer_insn_id]
             inner_ids = set([writer_insn_id])
+
             # the instructions we have to consider here are those that directly
             # write to this variable, and those that are recursive dependencies of
             # this instruction
@@ -170,13 +172,29 @@ def privatize_temporaries_with_inames(
                 insn = kernel.id_to_insn[insn_id]
                 test_inames = kernel.insn_inames(insn) & privatizing_inames
 
+                # while we're here, we also build a temporary variable write map
+                # the reason being that a temporary variable that's only assigned to
+                # from other vector temporaries will never have a direct-dependency
+                # on the privitizing iname
+
+                # if we build this, we can recursively travel down the
+                # temporary variable write-map of any newly privitized variable
+                # and add the privitizing iname to any temporary variable it assigns
+                # to
+                for tv_read in insn.read_dependency_names():
+                    if tv_read in kernel.temporary_variables:
+                        if tv_read not in tv_wmap:
+                            tv_wmap[tv_read] = set()
+
+                        tv_wmap[tv_read].add(tv.name)
+
                 priv_axis_inames = set()
                 for ti in test_inames:
                     priv_axis_inames |= find_privitzing_inames(insn, ti, tv)
 
                 priv_axis_inames = frozenset(priv_axis_inames)
                 referenced_priv_axis_inames = (priv_axis_inames
-                    & writer_insn.write_dependency_names())
+                    & insn.write_dependency_names())
 
                 new_priv_axis_inames = priv_axis_inames - referenced_priv_axis_inames
 
@@ -210,6 +228,27 @@ def privatize_temporaries_with_inames(
 
     # }}}
 
+    # {{{ recursively apply vector temporary write heuristic
+
+    applied = set()
+
+    def apply(varname, starting_dict):
+        if varname not in tv_wmap or varname in applied:
+            return starting_dict
+        applied.add(varname)
+        for written_to in tv_wmap[varname]:
+            if written_to not in starting_dict:
+                starting_dict[written_to] = set()
+            starting_dict[written_to] |= starting_dict[varname]
+            starting_dict.update(apply(written_to, starting_dict.copy()))
+        return starting_dict
+
+    for varname in list(var_to_new_priv_axis_iname.keys()):
+        var_to_new_priv_axis_iname.update(apply(
+            varname, var_to_new_priv_axis_iname.copy()))
+
+    # }}}
+
     # {{{ find ilp iname lengths
 
     from loopy.isl_helpers import static_max_of_pw_aff
-- 
GitLab


From 4a7b65e87127cf073050624f1ef51c70af9126bd Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 23 May 2018 16:24:41 -0400
Subject: [PATCH 113/144] fix

---
 loopy/transform/privatize.py | 10 ++++++----
 1 file changed, 6 insertions(+), 4 deletions(-)

diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index 565170e98..3feadd8d6 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -194,7 +194,7 @@ def privatize_temporaries_with_inames(
 
                 priv_axis_inames = frozenset(priv_axis_inames)
                 referenced_priv_axis_inames = (priv_axis_inames
-                    & insn.write_dependency_names())
+                    & writer_insn.write_dependency_names())
 
                 new_priv_axis_inames = priv_axis_inames - referenced_priv_axis_inames
 
@@ -243,9 +243,11 @@ def privatize_temporaries_with_inames(
             starting_dict.update(apply(written_to, starting_dict.copy()))
         return starting_dict
 
-    for varname in list(var_to_new_priv_axis_iname.keys()):
-        var_to_new_priv_axis_iname.update(apply(
-            varname, var_to_new_priv_axis_iname.copy()))
+    for varname, inames in six.iteritems(var_to_new_priv_axis_iname):
+        for iname in inames:
+            if filter_iname_tags_by_type(kernel.iname_to_tags[iname], VectorizeTag):
+                var_to_new_priv_axis_iname.update(apply(
+                    varname, var_to_new_priv_axis_iname.copy()))
 
     # }}}
 
-- 
GitLab


From defc18be76443662789a758e2a4f2d3396059fa1 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 23 May 2018 16:25:05 -0400
Subject: [PATCH 114/144] fix old reference

---
 loopy/transform/privatize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index 3feadd8d6..a65b645bb 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -312,7 +312,7 @@ def privatize_temporaries_with_inames(
             if not getattr(insn, 'force_vector', False) and all(
                     filter_iname_tags_by_type(kernel.iname_to_tags[iname],
                                               VectorizeTag)
-                    for x in eiii.seen_ilp_inames - insn.within_inames):
+                    for x in eiii.seen_priv_axis_inames - insn.within_inames):
                 raise LoopyError(
                     "Kernel '%s': Instruction '%s': touched variable that "
                     "(for privatization, e.g. as performed for ILP) "
-- 
GitLab


From b71e895ce34a55bd31071e7b3b6579377b1327ad Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 23 May 2018 16:25:18 -0400
Subject: [PATCH 115/144] add description of test and reason for it

---
 test/test_loopy.py | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index 8158f2d63..b1c1a7a05 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2977,7 +2977,11 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
     finally:
         warnings.resetwarnings()
 
-    # modified case from pyjac
+    # modified case from pyjac -- what makes this case special is that
+    # Kc is never directly assigned to in an instruction that directly references
+    # the vector iname, j_inner.  Instead, it is a good test of the recursive
+    # vector temporary promotion, as it is written to by B_sum, which _is_ directly
+    # written to from an instruction (bset1) that references j_inner
     skeleton = """
     for j
         %(preamble)s
-- 
GitLab


From a2e2491e8fc6d2b12ecdeb96f637fe8fa908fb0b Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 23 May 2018 16:42:00 -0400
Subject: [PATCH 116/144] fix for not changing dict size during iter

---
 loopy/transform/privatize.py | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index a65b645bb..897ce3d60 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -243,11 +243,11 @@ def privatize_temporaries_with_inames(
             starting_dict.update(apply(written_to, starting_dict.copy()))
         return starting_dict
 
-    for varname, inames in six.iteritems(var_to_new_priv_axis_iname):
-        for iname in inames:
-            if filter_iname_tags_by_type(kernel.iname_to_tags[iname], VectorizeTag):
-                var_to_new_priv_axis_iname.update(apply(
-                    varname, var_to_new_priv_axis_iname.copy()))
+    for varname, inames in list(var_to_new_priv_axis_iname.keys()):
+        if any(filter_iname_tags_by_type(kernel.iname_to_tags[iname], VectorizeTag)
+               for iname in var_to_new_priv_axis_iname[varname]):
+            var_to_new_priv_axis_iname.update(apply(
+                varname, var_to_new_priv_axis_iname.copy()))
 
     # }}}
 
-- 
GitLab


From 352f1aefd0dd108f27acdd5ab31b5955955c5629 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 23 May 2018 16:42:46 -0400
Subject: [PATCH 117/144] doh

---
 loopy/transform/privatize.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index 897ce3d60..aaebd041f 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -243,7 +243,7 @@ def privatize_temporaries_with_inames(
             starting_dict.update(apply(written_to, starting_dict.copy()))
         return starting_dict
 
-    for varname, inames in list(var_to_new_priv_axis_iname.keys()):
+    for varname in list(var_to_new_priv_axis_iname.keys()):
         if any(filter_iname_tags_by_type(kernel.iname_to_tags[iname], VectorizeTag)
                for iname in var_to_new_priv_axis_iname[varname]):
             var_to_new_priv_axis_iname.update(apply(
-- 
GitLab


From c122dfbf94a8317733fc38218961aa63db8a89e1 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Wed, 23 May 2018 17:03:25 -0400
Subject: [PATCH 118/144] Fix incorrect stringification

---
 loopy/expression.py | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index a93f7127d..721a95f74 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -119,7 +119,8 @@ class VectorizabilityChecker(RecursiveMapper):
         if any(rec_pars):
             if str(expr.function) not in VectorizabilityChecker.functions:
                 return Unvectorizable(
-                    'Function {} is not known to be vectorizable'.format(expr.name))
+                    'Function {} is not known to be vectorizable'.format(
+                        str(expr.function)))
             return True
 
         return False
-- 
GitLab


From 0a648e565b545cabbc2331c5f0142c1bf692868a Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Mon, 11 Jun 2018 15:55:26 -0400
Subject: [PATCH 119/144] fix for vector conditional on value-arg

---
 loopy/codegen/result.py | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index 424cbe6f0..f38585c46 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -270,13 +270,15 @@ def wrap_in_if(codegen_state, condition_exprs, inner):
         if codegen_state.vectorization_info is not None:
             from loopy.symbolic import get_dependencies
             from loopy.kernel.array import VectorArrayDimTag
+            from loopy.kernel.data import ValueArg
 
             vec_iname = codegen_state.vectorization_info.iname
 
             # precalculate vector arrays / temporaries
             knl = codegen_state.kernel
             vec_arys = set([x.name for x in knl.args + list(
-                knl.temporary_variables.values()) if any(
+                knl.temporary_variables.values())
+                    if not isinstance(x, ValueArg) and any(
                     isinstance(dt, VectorArrayDimTag)
                     for dt in x.dim_tags)])
 
-- 
GitLab


From 18ee5634f40e637ec6ec08bf987e6c6e543b4bdc Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Mon, 11 Jun 2018 16:02:18 -0400
Subject: [PATCH 120/144] add test for valuearg vector conditional

---
 test/test_loopy.py | 9 +++++++--
 1 file changed, 7 insertions(+), 2 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index b1c1a7a05..6395e254e 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3015,7 +3015,7 @@ def test_explicit_simd_selects(ctx_factory):
     ctx = ctx_factory()
 
     def create_and_test(insn, condition, answer, exception=None, a=None, b=None,
-                        extra_insns=None, c=None):
+                        extra_insns=None, c=None, v=None):
         a = np.zeros((3, 4), dtype=np.int32) if a is None else a
         data = [lp.GlobalArg('a', shape=(12,), dtype=a.dtype)]
         kwargs = dict(a=a)
@@ -3025,6 +3025,9 @@ def test_explicit_simd_selects(ctx_factory):
         if c is not None:
             data += [lp.GlobalArg('c', shape=(12,), dtype=b.dtype)]
             kwargs['c'] = c
+        if v is not None:
+            data += [lp.ValueArg('v', dtype=v.dtype)]
+            kwargs['v'] = v
         names = [d.name for d in data]
 
         knl = lp.make_kernel(['{[i]: 0 <= i < 12}'],
@@ -3087,12 +3090,14 @@ def test_explicit_simd_selects(ctx_factory):
     ans_negated = np.invert(ans) + 2
     create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange(
         12, dtype=np.int64).reshape((3, 4)))
-    # 7) test conditional on differing dtype (float->int) and (int->float)
+    # 8) test conditional on differing dtype (float->int) and (int->float)
     ans_negated = np.invert(ans) + 2
     create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange(
         12, dtype=np.float64).reshape((3, 4)))
     create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange(
         12, dtype=np.int64).reshape((3, 4)), a=np.zeros((3, 4), dtype=np.float32))
+    # 9) test conditional on valuearg
+    create_and_test('a[i] = 1', 'not v', np.zeros_like(a), v=1)
 
 
 @pytest.mark.parametrize(('lhs_dtype', 'rhs_dtype'), [
-- 
GitLab


From 8f4cbfa41cca5267074be4bc1953bd46050b9f86 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Mon, 11 Jun 2018 16:56:10 -0400
Subject: [PATCH 121/144] test fix

---
 test/test_loopy.py | 18 ++++++++++++------
 1 file changed, 12 insertions(+), 6 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index 6395e254e..41dd76fbc 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3015,7 +3015,7 @@ def test_explicit_simd_selects(ctx_factory):
     ctx = ctx_factory()
 
     def create_and_test(insn, condition, answer, exception=None, a=None, b=None,
-                        extra_insns=None, c=None, v=None):
+                        extra_insns=None, c=None, v=None, check=None):
         a = np.zeros((3, 4), dtype=np.int32) if a is None else a
         data = [lp.GlobalArg('a', shape=(12,), dtype=a.dtype)]
         kwargs = dict(a=a)
@@ -3025,10 +3025,11 @@ def test_explicit_simd_selects(ctx_factory):
         if c is not None:
             data += [lp.GlobalArg('c', shape=(12,), dtype=b.dtype)]
             kwargs['c'] = c
+        names = [d.name for d in data]
+        # add after defining names to avoid trying to split value arg
         if v is not None:
-            data += [lp.ValueArg('v', dtype=v.dtype)]
+            data += [lp.ValueArg('v', dtype=np.int32)]
             kwargs['v'] = v
-        names = [d.name for d in data]
 
         knl = lp.make_kernel(['{[i]: 0 <= i < 12}'],
             """
@@ -3047,9 +3048,13 @@ def test_explicit_simd_selects(ctx_factory):
         knl = lp.split_iname(knl, 'i', 4, inner_tag='vec')
         knl = lp.split_array_axis(knl, names, 0, 4)
         knl = lp.tag_array_axes(knl, names, 'N0,vec')
+        if v is not None:
+            knl = lp.set_options(knl, write_wrapper=True)
 
         queue = cl.CommandQueue(ctx)
-        if exception is not None:
+        if check is not None:
+            assert check(knl)
+        elif exception is not None:
             with pytest.raises(exception):
                 knl(queue, **kwargs)
         else:
@@ -3096,8 +3101,9 @@ def test_explicit_simd_selects(ctx_factory):
         12, dtype=np.float64).reshape((3, 4)))
     create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange(
         12, dtype=np.int64).reshape((3, 4)), a=np.zeros((3, 4), dtype=np.float32))
-    # 9) test conditional on valuearg
-    create_and_test('a[i] = 1', 'not v', np.zeros_like(a), v=1)
+    # 9) test conditional on valuearg, the "test" here is that we can actually
+    # generate the code
+    create_and_test('a[i] = 1', 'v', np.ones_like(ans), v=1)
 
 
 @pytest.mark.parametrize(('lhs_dtype', 'rhs_dtype'), [
-- 
GitLab


From bc3f8e8a71b1acb7fb9a9bc49647a0d6ec082354 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 26 Jun 2018 15:33:48 -0400
Subject: [PATCH 122/144] fix vector iname lookup to avoid new (non-default)
 dictionary iname_to_tags

---
 loopy/target/opencl.py | 9 ++++++---
 1 file changed, 6 insertions(+), 3 deletions(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 8ff26fe1a..0825abab5 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -329,9 +329,12 @@ class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper):
     def map_comparison(self, expr, type_context):
         from loopy.symbolic import get_dependencies
         from loopy.kernel.data import VectorizeTag, filter_iname_tags_by_type
-        vec_inames = set([x for x in self.kernel.all_inames()
-                          if filter_iname_tags_by_type(
-                            self.kernel.iname_to_tags[x], VectorizeTag)])
+        from six import iteritems
+
+        vec_inames = set([iname for iname, tags in
+                          iteritems(self.kernel.iname_to_tags)
+                          if filter_iname_tags_by_type(tags, VectorizeTag)])
+
         if get_dependencies(expr) & vec_inames and \
                 self.codegen_state.insn_was_not_vectorizable:
             raise LoopyError("Cannot unroll a vector-iname comparison, as scalar"
-- 
GitLab


From 0d4ff5da0daffb10a6c2ded0783f047e35311fbf Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 29 Jun 2018 11:31:54 -0400
Subject: [PATCH 123/144] Correct improper use of
 'find_recursive_dependencies': it refers to user-specified instruction
 dependencies, not general 'dependencies'. Besides, the recursive write-map
 application technique is capable of handling the cases the recursive
 instruction dependency approach was trying to fix

---
 loopy/kernel/tools.py        |   1 -
 loopy/transform/privatize.py | 152 ++++++++++++++++-------------------
 2 files changed, 70 insertions(+), 83 deletions(-)

diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index ea57b9ec0..669011964 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -1184,7 +1184,6 @@ def get_visual_iname_order_embedding(kernel):
 
 # {{{ find_recursive_dependencies
 
-@memoize_method
 def find_recursive_dependencies(kernel, insn_ids):
     queue = list(insn_ids)
 
diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index 9a11ee674..483b37d1f 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -105,7 +105,6 @@ def privatize_temporaries_with_inames(
     """
 
     from loopy.kernel.data import VectorizeTag, IlpBaseTag, filter_iname_tags_by_type
-    from loopy.kernel.tools import find_recursive_dependencies
 
     if isinstance(privatizing_inames, str):
         privatizing_inames = frozenset(
@@ -117,11 +116,6 @@ def privatize_temporaries_with_inames(
                 s.strip()
                 for s in only_var_names.split(","))
 
-    wmap = kernel.writer_map()
-
-    var_to_new_priv_axis_iname = {}
-    tv_wmap = {}
-
     def find_privitzing_inames(writer_insn, iname, temp_var):
         # test that -- a) the iname is an ILP or vector tag
         if filter_iname_tags_by_type(kernel.iname_to_tags[iname],
@@ -143,6 +137,11 @@ def privatize_temporaries_with_inames(
 
     # {{{ find variables that need extra indices
 
+    from collections import defaultdict
+    tv_wmap = defaultdict(lambda: set())
+    wmap = kernel.writer_map()
+    var_to_new_priv_axis_iname = {}
+
     for tv in six.itervalues(kernel.temporary_variables):
         # check variables to transform
         if only_var_names is not None and tv.name not in only_var_names:
@@ -153,100 +152,89 @@ def privatize_temporaries_with_inames(
             if writer_insn_id in seen:
                 continue
             writer_insn = kernel.id_to_insn[writer_insn_id]
-            inner_ids = set([writer_insn_id])
-
-            # the instructions we have to consider here are those that directly
-            # write to this variable, and those that are recursive dependencies of
-            # this instruction
-            rec_deps = find_recursive_dependencies(kernel, frozenset([
-                writer_insn_id]))
-            # however, we must make sure to limit to those inames that we are
-            # actually inside of
-            inner_ids |= set([
-                x for x in rec_deps if kernel.id_to_insn[x].within_inames <=
-                writer_insn.within_inames])
-
-            for insn_id in inner_ids:
-                seen.add(insn_id)
-
-                insn = kernel.id_to_insn[insn_id]
-                test_inames = kernel.insn_inames(insn) & privatizing_inames
-
-                # while we're here, we also build a temporary variable write map
-                # the reason being that a temporary variable that's only assigned to
-                # from other vector temporaries will never have a direct-dependency
-                # on the privitizing iname
-
-                # if we build this, we can recursively travel down the
-                # temporary variable write-map of any newly privitized variable
-                # and add the privitizing iname to any temporary variable it assigns
-                # to
-                for tv_read in insn.read_dependency_names():
-                    if tv_read in kernel.temporary_variables:
-                        if tv_read not in tv_wmap:
-                            tv_wmap[tv_read] = set()
-
-                        tv_wmap[tv_read].add(tv.name)
-
-                priv_axis_inames = set()
-                for ti in test_inames:
-                    priv_axis_inames |= find_privitzing_inames(insn, ti, tv)
-
-                priv_axis_inames = frozenset(priv_axis_inames)
-                referenced_priv_axis_inames = (priv_axis_inames
-                    & writer_insn.write_dependency_names())
-
-                new_priv_axis_inames = priv_axis_inames - referenced_priv_axis_inames
-
-                if not new_priv_axis_inames and tv.force_scalar and \
-                        tv.name in var_to_new_priv_axis_iname:
+            seen.add(writer_insn_id)
+
+            test_inames = kernel.insn_inames(writer_insn) & privatizing_inames
+
+            # A temporary variable that's only assigned to from other vector or ILP
+            # temporaries will never have a direct-dependency on the privitizing
+            # iname. After building a map of which temporary variables write to
+            # others, we can recursively travel down the temporary variable write-map
+            # of any newly privitized temporary variable, and extend the
+            # privitization to those temporary variables dependent on it.
+
+            for tv_read in writer_insn.read_dependency_names():
+                if tv_read in kernel.temporary_variables:
+                    tv_wmap[tv_read].add(tv.name)
+
+            priv_axis_inames = set()
+            for ti in test_inames:
+                priv_axis_inames |= find_privitzing_inames(writer_insn, ti, tv)
+
+            priv_axis_inames = frozenset(priv_axis_inames)
+            referenced_priv_axis_inames = (priv_axis_inames
+                & writer_insn.write_dependency_names())
+
+            new_priv_axis_inames = priv_axis_inames - referenced_priv_axis_inames
+
+            if not new_priv_axis_inames and tv.force_scalar and \
+                    tv.name in var_to_new_priv_axis_iname:
+                # conflict
+                raise LoopyError("instruction '%s' requires var '%s' to be a "
+                                 "scalar but previous instructions required "
+                                 "vector/ILP inames '%s'" % (
+                                        writer_insn_id, tv.name, ", ".join(
+                                            var_to_new_priv_axis_iname[
+                                                tv.name])))
+
+            if not new_priv_axis_inames:
+                continue
+
+            if tv.name in var_to_new_priv_axis_iname:
+                if new_priv_axis_inames != set(
+                        var_to_new_priv_axis_iname[tv.name]):
                     # conflict
-                    raise LoopyError("instruction '%s' requires var '%s' to be a "
-                                     "scalar but previous instructions required "
-                                     "vector/ILP inames '%s'" % (
-                                            insn_id, tv.name, ", ".join(
-                                                var_to_new_priv_axis_iname[
-                                                    tv.name])))
-
-                if not new_priv_axis_inames:
-                    continue
-
-                if tv.name in var_to_new_priv_axis_iname:
-                    if new_priv_axis_inames != set(
-                            var_to_new_priv_axis_iname[tv.name]):
-                        # conflict
-                        raise LoopyError("instruction '%s' requires adding "
-                                "indices for vector/ILP inames '%s' on var '%s', "
-                                "but previous instructions required inames '%s'"
-                                % (insn_id, ", ".join(new_priv_axis_inames),
-                                    tv.name, ", ".join(
-                                        var_to_new_priv_axis_iname[tv.name])))
-
-                    continue
-
-                var_to_new_priv_axis_iname[tv.name] = set(new_priv_axis_inames)
+                    raise LoopyError("instruction '%s' requires adding "
+                            "indices for vector/ILP inames '%s' on var '%s', "
+                            "but previous instructions required inames '%s'"
+                            % (writer_insn_id, ", ".join(new_priv_axis_inames),
+                                tv.name, ", ".join(
+                                    var_to_new_priv_axis_iname[tv.name])))
+
+                continue
+
+            var_to_new_priv_axis_iname[tv.name] = set(new_priv_axis_inames)
 
     # }}}
 
-    # {{{ recursively apply vector temporary write heuristic
+    # {{{ recursively apply vector / ILP temporary write heuristic
 
-    applied = set()
+    def recursively_apply(varname, starting_dict, applied=None):
+        if applied is None:
+            # root case, set up set of variables we've already applied to act as
+            # a base case and avoid infinite recursion.
+            applied = set()
 
-    def apply(varname, starting_dict):
         if varname not in tv_wmap or varname in applied:
+            # if no other variables depend on the starting variable, or the starting
+            # variable's privitizing inames have already been applied
             return starting_dict
+
         applied.add(varname)
         for written_to in tv_wmap[varname]:
             if written_to not in starting_dict:
                 starting_dict[written_to] = set()
+            # update the dependency
             starting_dict[written_to] |= starting_dict[varname]
-            starting_dict.update(apply(written_to, starting_dict.copy()))
+            # and recursively apply to the dependecy's dependencies
+            starting_dict.update(recursively_apply(written_to, starting_dict.copy()))
+
         return starting_dict
 
     for varname in list(var_to_new_priv_axis_iname.keys()):
         if any(filter_iname_tags_by_type(kernel.iname_to_tags[iname], VectorizeTag)
                for iname in var_to_new_priv_axis_iname[varname]):
-            var_to_new_priv_axis_iname.update(apply(
+            var_to_new_priv_axis_iname.update(recursively_apply(
                 varname, var_to_new_priv_axis_iname.copy()))
 
     # }}}
-- 
GitLab


From c21c832c094112f72b9c586b45a25067e29ebba0 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 29 Jun 2018 12:07:45 -0400
Subject: [PATCH 124/144] remove old scalar / vector type annotation code, as
 it doesn't work properly anyway

---
 loopy/kernel/creation.py     | 13 +-------
 loopy/kernel/data.py         | 65 ++----------------------------------
 loopy/kernel/instruction.py  | 41 ++---------------------
 loopy/symbolic.py            | 41 +++--------------------
 loopy/transform/privatize.py | 40 ++++++----------------
 test/test_loopy.py           | 49 +--------------------------
 6 files changed, 22 insertions(+), 227 deletions(-)

diff --git a/loopy/kernel/creation.py b/loopy/kernel/creation.py
index f808ffa86..a41af3cf1 100644
--- a/loopy/kernel/creation.py
+++ b/loopy/kernel/creation.py
@@ -484,8 +484,6 @@ def parse_insn(groups, insn_options):
     new_lhs = []
     assignee_names = []
 
-    force_scalar = False
-    force_vector = False
     for lhs_i in lhs:
         if isinstance(lhs_i, TypeAnnotation):
             if lhs_i.type is None:
@@ -493,11 +491,6 @@ def parse_insn(groups, insn_options):
             else:
                 temp_var_types.append(lhs_i.type)
 
-            if lhs_i.force_scalar:
-                force_scalar = True
-            elif lhs_i.force_vector:
-                force_vector = True
-
             lhs_i = lhs_i.child
         else:
             temp_var_types.append(None)
@@ -537,8 +530,6 @@ def parse_insn(groups, insn_options):
                     intern(insn_id)
                     if isinstance(insn_id, str)
                     else insn_id),
-                force_scalar=force_scalar,
-                force_vector=force_vector,
                 **insn_options)
 
     from loopy.kernel.instruction import make_assignment
@@ -1456,9 +1447,7 @@ def create_temporaries(knl, default_order):
                         base_indices=lp.auto,
                         shape=lp.auto,
                         order=default_order,
-                        target=knl.target,
-                        force_scalar=getattr(insn, 'force_scalar', False),
-                        force_vector=getattr(insn, 'force_vector', False))
+                        target=knl.target)
 
                 if isinstance(insn, Assignment):
                     insn = insn.copy(temp_var_type=None)
diff --git a/loopy/kernel/data.py b/loopy/kernel/data.py
index 955b6ff82..616ba0097 100644
--- a/loopy/kernel/data.py
+++ b/loopy/kernel/data.py
@@ -412,39 +412,6 @@ class TemporaryVariable(ArrayBase):
         memory location. If *True*, the restrict part is omitted on this
         declaration.
 
-    .. attribute:: force_scalar
-
-        If True, this temporary variable is created as an assignee, and will be a
-        scalar variable, regardless of the vector status of the instruction that
-        assigns to it.
-
-        .. note::
-
-            This is useful for OpenCL code-generation, to allow for if-statements
-            that do not depend on a vector temporary (which causes compilation
-            failures).
-
-    .. attribute:: force_scalar
-
-        If True, temporary variable created from the assignee will be a scalar
-        variable, regardless of the vector status of this assignment.
-
-        .. note::
-
-            This is useful for OpenCL code-generation, to allow for if-statements
-            that do not depend on a vector temporary (which causes compilation
-            failures).
-
-    .. attribute:: force_vector
-
-        If True, temporary variable created from the assignee will be a vector
-        variable, regardless of the vector status of this assignment.
-
-        .. note::
-
-            This is useful for OpenCL code-generation, to allow for if-statements
-            that do not depend on a vector temporary (which causes compilation
-            failures).
     """
 
     min_target_axes = 0
@@ -457,17 +424,14 @@ class TemporaryVariable(ArrayBase):
             "base_storage",
             "initializer",
             "read_only",
-            "_base_storage_access_may_be_aliasing",
-            "force_scalar",
-            "force_vector"
+            "_base_storage_access_may_be_aliasing"
             ]
 
     def __init__(self, name, dtype=None, shape=(), scope=auto,
             dim_tags=None, offset=0, dim_names=None, strides=None, order=None,
             base_indices=None, storage_shape=None,
             base_storage=None, initializer=None, read_only=False,
-            _base_storage_access_may_be_aliasing=False,
-            force_scalar=False, force_vector=False, **kwargs):
+            _base_storage_access_may_be_aliasing=False, **kwargs):
         """
         :arg dtype: :class:`loopy.auto` or a :class:`numpy.dtype`
         :arg shape: :class:`loopy.auto` or a shape tuple
@@ -482,11 +446,6 @@ class TemporaryVariable(ArrayBase):
                         "temporary variable '%s': "
                         "offset must be 0 if initializer specified"
                         % name)
-            if force_scalar:
-                raise LoopyError(
-                        "temporary variable '%s': "
-                        "cannot specify force_scalar if initializer is specified"
-                        % name)
 
             from loopy.types import NumpyType, to_loopy_type
             if dtype is auto or dtype is None:
@@ -520,13 +479,6 @@ class TemporaryVariable(ArrayBase):
                     "are not currently supported "
                     "(did you mean to set read_only=True?)"
                     % name)
-        elif read_only and (force_scalar or force_vector):
-            raise LoopyError(
-                "temporary variable '%s': "
-                "cannot specify force_scalar/force_vector for a read_only variable, "
-                "as these options apply only to temporary variables resulting from "
-                "assignments."
-                % name)
 
         if base_storage is not None and initializer is not None:
             raise LoopyError(
@@ -542,13 +494,6 @@ class TemporaryVariable(ArrayBase):
                     "base_storage given!"
                     % name)
 
-        if base_storage is not None and (force_scalar or force_vector):
-            raise LoopyError(
-                "temporary variable '%s': "
-                "cannot specify force_scalar/force_vector if base_storage is "
-                "supplied."
-                % name)
-
         ArrayBase.__init__(self, name=intern(name),
                 dtype=dtype, shape=shape, strides=strides,
                 dim_tags=dim_tags, offset=offset, dim_names=dim_names,
@@ -560,8 +505,6 @@ class TemporaryVariable(ArrayBase):
                 read_only=read_only,
                 _base_storage_access_may_be_aliasing=(
                     _base_storage_access_may_be_aliasing),
-                force_scalar=force_scalar,
-                force_vector=force_vector,
                 **kwargs)
 
     @property
@@ -626,8 +569,6 @@ class TemporaryVariable(ArrayBase):
                 and self.read_only == other.read_only
                 and (self._base_storage_access_may_be_aliasing
                     == other._base_storage_access_may_be_aliasing)
-                and (self.force_scalar == other.force_scalar)
-                and (self.force_vector == other.force_vector)
                 )
 
     def update_persistent_hash(self, key_hash, key_builder):
@@ -649,8 +590,6 @@ class TemporaryVariable(ArrayBase):
 
         key_builder.rec(key_hash, self.read_only)
         key_builder.rec(key_hash, self._base_storage_access_may_be_aliasing)
-        key_builder.rec(key_hash, self.force_scalar)
-        key_builder.rec(key_hash, self.force_vector)
 
 # }}}
 
diff --git a/loopy/kernel/instruction.py b/loopy/kernel/instruction.py
index 878fb0d04..5c238ec28 100644
--- a/loopy/kernel/instruction.py
+++ b/loopy/kernel/instruction.py
@@ -785,33 +785,11 @@ class Assignment(MultiAssignmentBase):
                 EVALUATE ztemp_new = f(ztemp_old) + a
             WHILE compare_and_swap(z[i], ztemp_new, ztemp_old) did not succeed
 
-    .. attribute:: force_scalar
-
-        If True, temporary variable created from the assignee will be a scalar
-        variable, regardless of the vector status of this assignment.
-
-        .. note::
-
-            This is useful for OpenCL code-generation, to allow for if-statements
-            that do not depend on a vector temporary (which causes compilation
-            failures).
-
-    .. attribute:: force_vector
-
-        If True, temporary variable created from the assignee will be a vector
-        variable, regardless of the vector status of this assignment.
-
-        .. note::
-
-            This is useful for OpenCL code-generation, to allow for if-statements
-            that do not depend on a vector temporary (which causes compilation
-            failures).
-
     .. automethod:: __init__
     """
 
     fields = MultiAssignmentBase.fields | \
-            set("assignee temp_var_type atomicity force_scalar force_vector".split())
+            set("assignee temp_var_type atomicity".split())
     pymbolic_fields = MultiAssignmentBase.pymbolic_fields | set(["assignee"])
 
     def __init__(self,
@@ -828,9 +806,7 @@ class Assignment(MultiAssignmentBase):
             temp_var_type=None, atomicity=(),
             priority=0, predicates=frozenset(),
             insn_deps=None, insn_deps_is_final=None,
-            forced_iname_deps=None, forced_iname_deps_is_final=None,
-            force_scalar=False,
-            force_vector=False):
+            forced_iname_deps=None, forced_iname_deps_is_final=None):
 
         super(Assignment, self).__init__(
                 id=id,
@@ -866,8 +842,6 @@ class Assignment(MultiAssignmentBase):
         self.expression = expression
         self.temp_var_type = temp_var_type
         self.atomicity = atomicity
-        self.force_scalar = force_scalar
-        self.force_vector = force_vector
 
     # {{{ implement InstructionBase interface
 
@@ -883,8 +857,7 @@ class Assignment(MultiAssignmentBase):
                 assignee=f(self.assignee, *args),
                 expression=f(self.expression, *args),
                 predicates=frozenset(
-                    f(pred, *args) for pred in self.predicates),
-                force_scalar=self.force_scalar)
+                    f(pred, *args) for pred in self.predicates))
 
     # }}}
 
@@ -1076,14 +1049,6 @@ def make_assignment(assignees, expression, temp_var_types=None, **kwargs):
             raise LoopyError("right-hand side in multiple assignment must be "
                     "function call or reduction, got: '%s'" % expression)
 
-        if kwargs.pop('force_scalar', False):
-            raise LoopyError("Force scalar option cannot be used with multiple "
-                             "assigments.")
-
-        if kwargs.pop('force_vector', False):
-            raise LoopyError("Force vector option cannot be used with multiple "
-                             "assigments.")
-
         return CallInstruction(
                 assignees=assignees,
                 expression=expression,
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 52996456d..19795fb4a 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -105,9 +105,6 @@ class IdentityMapperMixin(object):
 
     def map_type_annotation(self, expr, *args):
         kwargs = {}
-        if isinstance(expr, TypeAnnotation):
-            kwargs['force_scalar'] = expr.force_scalar
-            kwargs['force_vector'] = expr.force_vector
         return type(expr)(expr.type, self.rec(expr.child), **kwargs)
 
     map_type_cast = map_type_annotation
@@ -423,19 +420,13 @@ class TypeAnnotation(p.Expression):
     assignments that create temporaries.
     """
 
-    def __init__(self, type, child, force_scalar=False, force_vector=False):
+    def __init__(self, type, child):
         super(TypeAnnotation, self).__init__()
         self.type = type
         self.child = child
-        self.force_scalar = force_scalar
-        self.force_vector = force_vector
-
-        if (self.force_scalar and self.force_vector):
-            raise TypeError('A type annotation cannot simultaneously be forced to '
-                            'both scalar and vector types')
 
     def __getinitargs__(self):
-        return (self.type, self.child, self.force_scalar, self.force_vector)
+        return (self.type, self.child)
 
     def stringifier(self):
         return StringifyMapper
@@ -1141,46 +1132,22 @@ class LoopyParser(ParserBase):
             return float(val)  # generic float
 
     def parse_prefix(self, pstate):
-        from pymbolic.parser import _PREC_UNARY, _less, _greater, _identifier, _colon
+        from pymbolic.parser import _PREC_UNARY, _less, _greater, _identifier
         if pstate.is_next(_less):
             pstate.advance()
-            scalar_or_vec = None
             if pstate.is_next(_greater):
                 typename = None
                 pstate.advance()
-            elif pstate.is_next(_colon):
-                # force scalar specified
-                typename = None
-                pstate.advance()
-                pstate.expect(_identifier)
-                scalar_or_vec = pstate.next_str_and_advance()
-                pstate.expect(_greater)
-                pstate.advance()
             else:
                 pstate.expect(_identifier)
                 typename = pstate.next_str()
                 pstate.advance()
-                # check for scalar / vector specification
-                if pstate.is_next(_colon):
-                    pstate.advance()
-                    pstate.expect(_identifier)
-                    scalar_or_vec = pstate.next_str()
-                    pstate.advance()
-
                 pstate.expect(_greater)
                 pstate.advance()
 
-            if scalar_or_vec:
-                if scalar_or_vec not in ['s', 'v']:
-                    raise TypeError("Cannot force assignment to type '{}'"
-                                    "did you mean, 's' (scalar) or 'v' (vector)?" %
-                                    scalar_or_vec)
-
             return TypeAnnotation(
                     typename,
-                    self.parse_expression(pstate, _PREC_UNARY),
-                    force_scalar=scalar_or_vec == 's',
-                    force_vector=scalar_or_vec == 'v')
+                    self.parse_expression(pstate, _PREC_UNARY))
         else:
             return super(LoopyParser, self).parse_prefix(pstate)
 
diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index 483b37d1f..5ae9e5393 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -120,17 +120,6 @@ def privatize_temporaries_with_inames(
         # test that -- a) the iname is an ILP or vector tag
         if filter_iname_tags_by_type(kernel.iname_to_tags[iname],
                                      (IlpBaseTag, VectorizeTag)):
-            # check for user specified type
-            if temp_var.force_scalar:
-                if iname in writer_insn.read_dependency_names():
-                    raise LoopyError(
-                        "Cannot write to (user-specified) scalar variable '%s' "
-                        "using vec/ILP iname '%s' in instruction '%s'." % (
-                            temp_var.name, iname, writer_insn.id)
-                        )
-                return set()
-            elif temp_var.force_vector:
-                return set([iname])
             # and b) instruction depends on the ILP/vector iname
             return set([iname]) & writer_insn.dependency_names()
         return set()
@@ -177,8 +166,7 @@ def privatize_temporaries_with_inames(
 
             new_priv_axis_inames = priv_axis_inames - referenced_priv_axis_inames
 
-            if not new_priv_axis_inames and tv.force_scalar and \
-                    tv.name in var_to_new_priv_axis_iname:
+            if not new_priv_axis_inames and tv.name in var_to_new_priv_axis_iname:
                 # conflict
                 raise LoopyError("instruction '%s' requires var '%s' to be a "
                                  "scalar but previous instructions required "
@@ -227,7 +215,8 @@ def privatize_temporaries_with_inames(
             # update the dependency
             starting_dict[written_to] |= starting_dict[varname]
             # and recursively apply to the dependecy's dependencies
-            starting_dict.update(recursively_apply(written_to, starting_dict.copy()))
+            starting_dict.update(recursively_apply(
+                written_to, starting_dict.copy(), applied=applied))
 
         return starting_dict
 
@@ -294,21 +283,14 @@ def privatize_temporaries_with_inames(
         eiii = ExtraInameIndexInserter(var_to_extra_iname)
         new_insn = insn.with_transformed_expressions(eiii)
         if not eiii.seen_priv_axis_inames <= insn.within_inames:
-
-            # the only O.K. case here is that the user specified that the instruction
-            # should be a vector, and all the missing iname tags are vectors.
-            if not getattr(insn, 'force_vector', False) and all(
-                    filter_iname_tags_by_type(kernel.iname_to_tags[iname],
-                                              VectorizeTag)
-                    for x in eiii.seen_priv_axis_inames - insn.within_inames):
-                raise LoopyError(
-                    "Kernel '%s': Instruction '%s': touched variable that "
-                    "(for privatization, e.g. as performed for ILP) "
-                    "required iname(s) '%s', but that the instruction was not "
-                    "previously within the iname(s). To remedy this, first promote"
-                    "the instruction into the iname."
-                    % (kernel.name, insn.id, ", ".join(
-                        eiii.seen_priv_axis_inames - insn.within_inames)))
+            raise LoopyError(
+                "Kernel '%s': Instruction '%s': touched variable that "
+                "(for privatization, e.g. as performed for ILP) "
+                "required iname(s) '%s', but that the instruction was not "
+                "previously within the iname(s). To remedy this, first promote"
+                "the instruction into the iname."
+                % (kernel.name, insn.id, ", ".join(
+                    eiii.seen_priv_axis_inames - insn.within_inames)))
 
         new_insns.append(new_insn)
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index f379319f4..3f01b0bc9 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2880,20 +2880,6 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
 
     # fun with vector temporaries
 
-    # first, test parsing
-    knl = lp.make_kernel(
-        '{[i,j]: 0 <= i,j < 12}',
-        """
-        <> t = 1
-        <int32> t1 = 1
-        <int32:s> t2 = 1
-        <:s> t3 = 1
-        <:v> tv = 1
-        <int32> tv1 = 1
-        <int32:v> tv2 = 1
-        <:v> tv3 = 1
-        """)
-
     def make_kernel(insn, ans=None, preamble=None, extra_inames=None, skeleton=None,
                     dtype=None):
         skeleton = """
@@ -2949,40 +2935,7 @@ def test_explicit_simd_temporary_promotion(ctx_factory):
         """)
     assert knl.temporary_variables['test2'].shape == (4,)
 
-    # case 4) test that a conflict in user-specified vector types results in error
-
-    # 4a) initial scalar assignment w/ later vector access
-    preamble = """
-    for k
-        <:s> test = 1
-    end
-    """
-
-    from loopy import LoopyError
-    with pytest.raises(LoopyError):
-        make_kernel('test = mask[j]', preamble=preamble, extra_inames='k')
-
-    # 4b) initial vector assignment w/ later scalar access -- OK
-
-    preamble = """
-    for k
-        <:v> test = 1
-    end
-    """
-
-    from loopy import LoopyError
-    # treat warning as error to make sure the logic detecting user specified
-    # vectorization is good
-    import warnings
-    try:
-        warnings.filterwarnings(
-            'error', r"Instruction '[^\W]+': touched variable that \(for ILP\)")
-        make_kernel('test = mask[i]', preamble=preamble, extra_inames='k')
-    except Exception:
-        raise
-    finally:
-        warnings.resetwarnings()
-
+    # case 4)
     # modified case from pyjac -- what makes this case special is that
     # Kc is never directly assigned to in an instruction that directly references
     # the vector iname, j_inner.  Instead, it is a good test of the recursive
-- 
GitLab


From 837db1d8b4abd7c81764e414c4464ecb533fba91 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 29 Jun 2018 13:09:07 -0400
Subject: [PATCH 125/144] remove old force_scalar error check

---
 loopy/transform/privatize.py | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index 5ae9e5393..1e9b726f3 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -166,15 +166,6 @@ def privatize_temporaries_with_inames(
 
             new_priv_axis_inames = priv_axis_inames - referenced_priv_axis_inames
 
-            if not new_priv_axis_inames and tv.name in var_to_new_priv_axis_iname:
-                # conflict
-                raise LoopyError("instruction '%s' requires var '%s' to be a "
-                                 "scalar but previous instructions required "
-                                 "vector/ILP inames '%s'" % (
-                                        writer_insn_id, tv.name, ", ".join(
-                                            var_to_new_priv_axis_iname[
-                                                tv.name])))
-
             if not new_priv_axis_inames:
                 continue
 
-- 
GitLab


From 1ed10445615dd4f76001a1c7b971a5e106e7898c Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 29 Jun 2018 14:15:35 -0400
Subject: [PATCH 126/144] incorporate two different privatizing 'flavors', one
 for ILP and one for vectorization -- the main difference is that
 vectorization requires a _direct_ write-dependency on the vectorizing iname,
 whereas an ILP promotion simply requires that the temporary is inside an
 ILP-loop

---
 loopy/transform/privatize.py | 31 ++++++++++++++++---------------
 1 file changed, 16 insertions(+), 15 deletions(-)

diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index 1e9b726f3..cb541b573 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -117,10 +117,15 @@ def privatize_temporaries_with_inames(
                 for s in only_var_names.split(","))
 
     def find_privitzing_inames(writer_insn, iname, temp_var):
-        # test that -- a) the iname is an ILP or vector tag
-        if filter_iname_tags_by_type(kernel.iname_to_tags[iname],
-                                     (IlpBaseTag, VectorizeTag)):
-            # and b) instruction depends on the ILP/vector iname
+        # test that the iname is an ILP or vector tag
+        if filter_iname_tags_by_type(kernel.iname_to_tags[iname], IlpBaseTag):
+            # ILP inames have no additional requirements for promotion
+            return set([iname])
+        if filter_iname_tags_by_type(kernel.iname_to_tags[iname], VectorizeTag):
+            # For vector inames, we should only consider an iname if the
+            # instruction _directly_ depends on it (to avoid spurious vector
+            # promotions).  Missed promotions will be handled in the recursive
+            # application step
             return set([iname]) & writer_insn.dependency_names()
         return set()
 
@@ -136,21 +141,16 @@ def privatize_temporaries_with_inames(
         if only_var_names is not None and tv.name not in only_var_names:
             continue
 
-        seen = set()
         for writer_insn_id in set(wmap.get(tv.name, [])):
-            if writer_insn_id in seen:
-                continue
             writer_insn = kernel.id_to_insn[writer_insn_id]
-            seen.add(writer_insn_id)
-
             test_inames = kernel.insn_inames(writer_insn) & privatizing_inames
 
-            # A temporary variable that's only assigned to from other vector or ILP
-            # temporaries will never have a direct-dependency on the privitizing
+            # A temporary variable that's only assigned to from other vector
+            # temporaries will never have a direct-dependency on the vector
             # iname. After building a map of which temporary variables write to
             # others, we can recursively travel down the temporary variable write-map
-            # of any newly privitized temporary variable, and extend the
-            # privitization to those temporary variables dependent on it.
+            # of any newly vectorized temporary variable, and extend the
+            # vectorization to those temporary variables dependent on it.
 
             for tv_read in writer_insn.read_dependency_names():
                 if tv_read in kernel.temporary_variables:
@@ -186,7 +186,7 @@ def privatize_temporaries_with_inames(
 
     # }}}
 
-    # {{{ recursively apply vector / ILP temporary write heuristic
+    # {{{ recursively apply vector temporary write heuristic
 
     def recursively_apply(varname, starting_dict, applied=None):
         if applied is None:
@@ -211,6 +211,7 @@ def privatize_temporaries_with_inames(
 
         return starting_dict
 
+    # apply recursive write heuristic
     for varname in list(var_to_new_priv_axis_iname.keys()):
         if any(filter_iname_tags_by_type(kernel.iname_to_tags[iname], VectorizeTag)
                for iname in var_to_new_priv_axis_iname[varname]):
@@ -219,7 +220,7 @@ def privatize_temporaries_with_inames(
 
     # }}}
 
-    # {{{ find ilp iname lengths
+    # {{{ find privitizing iname lengths
 
     from loopy.isl_helpers import static_max_of_pw_aff
     from loopy.symbolic import pw_aff_to_expr
-- 
GitLab


From 601f6c7615bd98206fdaa5988be8ba3be70d834e Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 29 Jun 2018 14:27:17 -0400
Subject: [PATCH 127/144] provide much more detailed reasoning of heuristic
 flavors

---
 loopy/transform/privatize.py | 55 ++++++++++++++++++++++++++++--------
 1 file changed, 43 insertions(+), 12 deletions(-)

diff --git a/loopy/transform/privatize.py b/loopy/transform/privatize.py
index cb541b573..5c149f177 100644
--- a/loopy/transform/privatize.py
+++ b/loopy/transform/privatize.py
@@ -117,19 +117,46 @@ def privatize_temporaries_with_inames(
                 for s in only_var_names.split(","))
 
     def find_privitzing_inames(writer_insn, iname, temp_var):
-        # test that the iname is an ILP or vector tag
+        # There are now two flavors of privitzing iname promotion, one for ILP and
+        # another for vectorization
+
+        # Temporaries inside an ILP loop have no additional requirements for
+        # promotion
+
+        # However, we should _not_ assume that this is the case for temporaries
+        # inside a vectorizing loop.  Instead, only temporaries written to by
+        # instructions that directly depend on the vector iname should be promoted.
+        # This is to avoid spurious promotions of constants (not-dependent on the
+        # vector iname) to vector dtypes, for example (w/ j_inner the vectorizing
+        # iname, and 'a' a data-array with a vector dtype on the second axis):
+        #
+        # ```
+        #   for j_outer
+        #       for j_inner
+        #           <> c = function()
+        #           a[c, j_inner] = 1
+        #       end
+        #   end
+        # ```
+        #
+        # is perfectly valid -- however, if c is promoted to a vector-dtype, we will
+        # hit issues with a (potentially) non-constant "vector" index being in a
+        # non-vector axis.  Hence, we must be cautious in vector promotions; those
+        # vector temporaries _not_ written to by a directly vector-iname dependent
+        # instruction will be promoted in the second stage (recursive application of
+        # the write map)
+
         if filter_iname_tags_by_type(kernel.iname_to_tags[iname], IlpBaseTag):
-            # ILP inames have no additional requirements for promotion
             return set([iname])
         if filter_iname_tags_by_type(kernel.iname_to_tags[iname], VectorizeTag):
             # For vector inames, we should only consider an iname if the
-            # instruction _directly_ depends on it (to avoid spurious vector
+            # instruction has a _direct_ dependency on it (to avoid spurious vector
             # promotions).  Missed promotions will be handled in the recursive
             # application step
             return set([iname]) & writer_insn.dependency_names()
         return set()
 
-    # {{{ find variables that need extra indices
+    # {{{ Stage 1: find variables that need extra indices
 
     from collections import defaultdict
     tv_wmap = defaultdict(lambda: set())
@@ -145,13 +172,7 @@ def privatize_temporaries_with_inames(
             writer_insn = kernel.id_to_insn[writer_insn_id]
             test_inames = kernel.insn_inames(writer_insn) & privatizing_inames
 
-            # A temporary variable that's only assigned to from other vector
-            # temporaries will never have a direct-dependency on the vector
-            # iname. After building a map of which temporary variables write to
-            # others, we can recursively travel down the temporary variable write-map
-            # of any newly vectorized temporary variable, and extend the
-            # vectorization to those temporary variables dependent on it.
-
+            # see stage 2
             for tv_read in writer_insn.read_dependency_names():
                 if tv_read in kernel.temporary_variables:
                     tv_wmap[tv_read].add(tv.name)
@@ -186,7 +207,17 @@ def privatize_temporaries_with_inames(
 
     # }}}
 
-    # {{{ recursively apply vector temporary write heuristic
+    # {{{ Stage 2: recursively apply vector temporary write heuristic
+
+    # A temporary variable that's only assigned to from other vector
+    # temporaries will never have a direct-dependency on the vector
+    # iname. After building a map of which temporary variables write to
+    # others, we can recursively travel down the temporary variable write-map
+    # of any newly vectorized temporary variable, and extend the
+    # vectorization to those temporary variables dependent on it.
+    #
+    # See ..func: `find_privitzing_inames` for reasoning about vector temporary
+    # promotion
 
     def recursively_apply(varname, starting_dict, applied=None):
         if applied is None:
-- 
GitLab


From 61653be00e6d595239a766ef1d30b76bf42265c6 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 29 Jun 2018 14:32:18 -0400
Subject: [PATCH 128/144] remove unused commit

---
 loopy/tools.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/loopy/tools.py b/loopy/tools.py
index 8c5d36390..b20ca10d5 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -26,7 +26,6 @@ import six
 
 import collections
 import numpy as np
-from pytools import memoize_method
 from pytools.persistent_dict import KeyBuilder as KeyBuilderBase
 from loopy.symbolic import WalkMapper as LoopyWalkMapper
 from pymbolic.mapper.persistent_hash import (
-- 
GitLab


From be8566a2aec005306248304f1fb400b41c161e97 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 29 Jun 2018 14:41:33 -0400
Subject: [PATCH 129/144] Revert "remove unused commit"

This reverts commit 61653be00e6d595239a766ef1d30b76bf42265c6.
---
 loopy/tools.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/loopy/tools.py b/loopy/tools.py
index b20ca10d5..8c5d36390 100644
--- a/loopy/tools.py
+++ b/loopy/tools.py
@@ -26,6 +26,7 @@ import six
 
 import collections
 import numpy as np
+from pytools import memoize_method
 from pytools.persistent_dict import KeyBuilder as KeyBuilderBase
 from loopy.symbolic import WalkMapper as LoopyWalkMapper
 from pymbolic.mapper.persistent_hash import (
-- 
GitLab


From fdec20e9ad832e63bcb461a995f1644bf66b05b1 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Fri, 29 Jun 2018 14:42:26 -0400
Subject: [PATCH 130/144] remove unused import for flake

---
 loopy/kernel/tools.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 669011964..df89e6c6c 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -34,7 +34,7 @@ import numpy as np
 import islpy as isl
 from islpy import dim_type
 from loopy.diagnostic import LoopyError, warn_with_kernel
-from pytools import memoize_on_first_arg, memoize_method
+from pytools import memoize_on_first_arg
 from loopy.tools import natsorted
 
 import logging
-- 
GitLab


From 9ae0a8a949417eb7e36e428b4aba782a75ba9ba5 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Mon, 2 Jul 2018 10:42:29 -0400
Subject: [PATCH 131/144] Improved detection of when to use explicit vector
 conversion and update conversion test to include:

1.  Literal to vector assignment (no explicit conversion should be used)
2.  Non-vector temporary variable to vector assignment (no explicit conversion)
3.  Vector-temporary to vector variable assignment (explicit conversion)

Previously, only vector to vector assignment was tested.
---
 loopy/target/opencl.py | 28 +++++++++++------
 test/test_loopy.py     | 71 ++++++++++++++++++++++++++++++++----------
 2 files changed, 73 insertions(+), 26 deletions(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 0825abab5..3965e24bf 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -349,16 +349,24 @@ class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper):
             actual_type, needed_dtype, s)
         if self.codegen_state.vectorization_info is not None and (
                 actual_type != needed_dtype):
-            from loopy.symbolic import Literal
-            if isinstance(s, Literal):
-                # if its a literal, no need for explicit conversion
-                return wrap
-
-            ctype = self.kernel.target.get_dtype_registry().dtype_to_ctype(
-                needed_dtype)
-            vw = self.codegen_state.vectorization_info.length
-            # need to add an explicit conversion
-            return var("convert_%s%d" % (ctype, vw))(wrap)
+            from loopy.symbolic import get_dependencies
+            from loopy.kernel.array import VectorArrayDimTag
+            rhs_deps = get_dependencies(s)
+
+            def is_vector(var):
+                return any(isinstance(x, VectorArrayDimTag) for x in var.dim_tags)
+            # if we have a vector-type on the RHS and the RHS dtype != LHS dtype,
+            # we need an explicit conversion
+            rhs_temp_vars = rhs_deps & set(self.kernel.temporary_variables.keys())
+            rhs_args = rhs_deps & set(self.kernel.arg_dict.keys())
+            if any(is_vector(self.kernel.temporary_variables[x])
+                   for x in rhs_temp_vars) or any(
+                   is_vector(self.kernel.arg_dict[x]) for x in rhs_args):
+                ctype = self.kernel.target.get_dtype_registry().dtype_to_ctype(
+                    needed_dtype)
+                vw = self.codegen_state.vectorization_info.length
+                # need to add an explicit conversion
+                return var("convert_%s%d" % (ctype, vw))(wrap)
         return wrap
 
 # }}}
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 3f01b0bc9..c9ec37574 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3072,24 +3072,63 @@ def test_explicit_vector_dtype_conversion(ctx_factory, lhs_dtype, rhs_dtype):
     ctx = ctx_factory()
 
     # test that dtype conversion happens correctly between differing vector-dtypes
+    def __make_kernel(insn, has_conversion=True, uses_temp=True):
+        vw = 4
+        a_lp = lp.GlobalArg('a', shape=(12,), dtype=rhs_dtype)
+        temp_lp = lp.TemporaryVariable('temp', dtype=lhs_dtype)
 
-    vw = 4
-    a_lp = lp.GlobalArg('a', shape=(12,), dtype=rhs_dtype)
-    temp_lp = lp.TemporaryVariable('temp', dtype=lhs_dtype)
-
-    knl = lp.make_kernel(['{[i]: 0 <= i < 12}'],
-            """
-            for i
-                temp = a[i]
-            end
-            """,
-            [a_lp, temp_lp])
-    knl = lp.split_iname(knl, 'i', vw, inner_tag='vec')
-    knl = lp.split_array_axis(knl, 'a', 0, 4)
-    knl = lp.tag_array_axes(knl, 'a', 'N0,vec')
+        knl = lp.make_kernel(['{[i]: 0 <= i < 12}'],
+                """
+                for i
+                    {insn}
+                end
+                """.format(insn=insn),
+                [a_lp, temp_lp],
+                target=lp.PyOpenCLTarget(ctx.devices[0]),
+                silenced_warnings=['temp_to_write(temp)'] if not uses_temp else [])
+        knl = lp.split_iname(knl, 'i', vw, inner_tag='vec')
+        knl = lp.split_array_axis(knl, 'a', 0, 4)
+        knl = lp.tag_array_axes(knl, 'a', 'N0,vec')
 
-    queue = cl.CommandQueue(ctx)
-    knl(queue, a=np.zeros((12,), dtype=rhs_dtype).reshape((3, 4)))
+        queue = cl.CommandQueue(ctx)
+        # check that the kernel compiles correctly
+        knl(queue, a=np.zeros((12,), dtype=rhs_dtype).reshape((3, 4)))
+
+        # check that we have or don't have a conversion
+        assert ('convert_' in lp.generate_code_v2(knl).device_code()) == \
+            has_conversion
+
+    # test simple dtype conversion
+    __make_kernel("temp = a[i]")
+
+    # test literal assignment
+    __make_kernel("a[i] = 1", False, False)
+
+    # test that a non-vector temporary doesn't trigger conversion
+    #
+    # this should generate the code (e.g.,):
+    #   __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1)))
+    #   loopy_kernel(__global long4 *__restrict__ a)
+    #   {
+    #      int temp;
+    #      for (int i_outer = 0; i_outer <= 2; ++i_outer)
+    #      {
+    #        temp = 1;
+    #        a[i_outer] = temp;
+    #      }
+    #    }
+    #
+    # that is, temp should _not_ be assigned to "a" w/ convert_long4
+    __make_kernel("""
+                      temp = 1
+                      a[i] = temp
+                  """, has_conversion=False)
+
+    # test that the inverse _does_ result in a conversion
+    __make_kernel("""
+                      temp = a[i] {id=1, dep=*}
+                      a[i] = temp {id=2, dep=1}
+                  """)
 
 
 def test_vectorizability():
-- 
GitLab


From 1857b74e75c3e23b57babecdd0b47d74c02dbed5 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 3 Jul 2018 15:35:00 -0400
Subject: [PATCH 132/144] make remainder vectorizable, and add to
 vectorizability check

---
 loopy/expression.py |  2 ++
 test/test_loopy.py  | 15 +++++++++------
 2 files changed, 11 insertions(+), 6 deletions(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index 721a95f74..0a4761eda 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -108,6 +108,8 @@ class VectorizabilityChecker(RecursiveMapper):
                 or
                 self.rec(expr.denominator))
 
+    map_remainder = map_quotient
+
     def map_linear_subscript(self, expr):
         return False
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index c9ec37574..27dae31a5 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3164,18 +3164,18 @@ def test_vectorizability():
         assert isinstance(knl.args[0].dim_tags[-1], VectorArrayDimTag)
         assert filter_iname_tags_by_type(knl.iname_to_tags['i_inner'], VectorizeTag)
 
-    def run(op_list=[], unary_operators=[], func_list=[], unary_funcs=[]):
+    def run(op_list=[], unary_operators=[], func_list=[], unary_funcs=[],
+            rvals=['1', 'a[i]']):
         for op in op_list:
             template = 'a[i] = a[i] %(op)s %(rval)s' \
                 if op not in unary_operators else 'a[i] = %(op)s a[i]'
-
-            create_and_test(template % dict(op=op, rval='1'))
-            create_and_test(template % dict(op=op, rval='a[i]'))
+            for rval in rvals:
+                create_and_test(template % dict(op=op, rval=rval))
         for func in func_list:
             template = 'a[i] = %(func)s(a[i], %(rval)s)' \
                 if func not in unary_funcs else 'a[i] = %(func)s(a[i])'
-            create_and_test(template % dict(func=func, rval='1'))
-            create_and_test(template % dict(func=func, rval='a[i]'))
+            for rval in rvals:
+                create_and_test(template % dict(func=func, rval=rval))
 
     # 1) comparisons
     run(['>', '>=', '<', '<=', '==', '!='])
@@ -3191,6 +3191,9 @@ def test_vectorizability():
     run(func_list=['acos', 'exp10', 'atan2', 'round'],
         unary_funcs=['round', 'acos', 'exp10'])
 
+    # 5) remainders
+    run(['%'])
+
 
 def test_check_for_variable_access_ordering():
     knl = lp.make_kernel(
-- 
GitLab


From 13a891727dcde7cb8bea58b79a60ff54ec4cb3e5 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 3 Jul 2018 15:57:38 -0400
Subject: [PATCH 133/144] enable floor-div vectorizability check and test

---
 loopy/expression.py | 8 ++++++++
 test/test_loopy.py  | 5 +++--
 2 files changed, 11 insertions(+), 2 deletions(-)

diff --git a/loopy/expression.py b/loopy/expression.py
index 0a4761eda..06fe3bb06 100644
--- a/loopy/expression.py
+++ b/loopy/expression.py
@@ -110,6 +110,14 @@ class VectorizabilityChecker(RecursiveMapper):
 
     map_remainder = map_quotient
 
+    def map_floor_div(self, expr):
+        """
+        ( (a) - ( ((a)<0) ? ((b)-1) : 0 ) ) / (b)
+        """
+        a, b = expr.numerator, expr.denominator
+        return self.rec(a) and self.rec(a.lt(0)) and self.rec(b - 1) and \
+            self.rec((a - (b - 1)) / b) and self.rec(a / b)
+
     def map_linear_subscript(self, expr):
         return False
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 27dae31a5..002925137 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3191,8 +3191,9 @@ def test_vectorizability():
     run(func_list=['acos', 'exp10', 'atan2', 'round'],
         unary_funcs=['round', 'acos', 'exp10'])
 
-    # 5) remainders
-    run(['%'])
+    # 5) remainders and floor division (use 4 instead of 1 to avoid pymbolic
+    #    optimizing out the a[i] % 1)
+    run(['%', '//'], rvals=['a[i]', '4'])
 
 
 def test_check_for_variable_access_ordering():
-- 
GitLab


From 029a6692b3e8dfe588f63e241d5df5934ba2c22b Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Mon, 27 Aug 2018 17:37:03 -0400
Subject: [PATCH 134/144] better explanation of why this is here

---
 loopy/codegen/result.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index f38585c46..18091371d 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -320,8 +320,10 @@ def wrap_in_if(codegen_state, condition_exprs, inner):
                     lhs_dtype = codegen_state.expression_to_code_mapper.infer_type(
                         ast.lvalue.expr)
                     if not lhs_dtype.is_integral():
-                        # the necessary dtype is the integer version of the floating
-                        # point type (e.g., float64 -> int64)
+                        # in OpenCL, the dtype of the conditional in a select call
+                        # must be an integer of the same 'bitness' as the dtype of
+                        # the operands being selected (https://www.khronos.org/registry/OpenCL/sdk/1.0/docs/man/xhtml/select.html)  # noqa
+                        # (e.g., float64 -> int64)
                         from loopy.types import to_loopy_type
                         import numpy as np
                         lhs_dtype = to_loopy_type(
-- 
GitLab


From c39194f1845caf56cd583d996a4fe88bf5bff9b0 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 28 Aug 2018 10:34:16 -0400
Subject: [PATCH 135/144] Add the ability to reference a vector iname directly
 in a conditional. This is achieved via:

1.  A new VectorLiteral class, based off ArrayLiteral, but with the string mapping changed to have enclosing parentheses rather than curly braces.
2.  A new VectorTypeCast class, based off TypeCast, but explicitly storing both the base numpy dtype of the VectorLiteral (for inferencing) and the stringified vector dtype (e.g., 'int4') for generation
3.  Modification of the condition mapper wrapper in `wrap_in_if` to properly generate the VectorTypeCast

and is tested in test/test_loopy.py::test_explicit_simd_vector_iname_in_conditional
---
 loopy/codegen/result.py              | 60 ++++++++++++-----
 loopy/symbolic.py                    | 97 ++++++++++++++++++++++++++++
 loopy/target/c/codegen/expression.py |  7 ++
 loopy/target/opencl.py               |  3 +
 loopy/type_inference.py              |  5 ++
 test/test_loopy.py                   | 35 ++++++++++
 6 files changed, 190 insertions(+), 17 deletions(-)

diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index 18091371d..d4b85b9b9 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -261,9 +261,15 @@ def wrap_in_if(codegen_state, condition_exprs, inner):
         cur_ast = inner.current_ast(codegen_state)
         method = codegen_state.ast_builder.emit_if
 
-        def condition_mapper(ast=None, type_context=None, needed_dtype=None):
+        def condition_mapper(ast=None, type_context=None, needed_dtype=None,
+                             condition=None):
+            if condition is not None:
+                # explicit vectorization override
+                pass
+            else:
+                condition = LogicalAnd(tuple(condition_exprs))
             return codegen_state.expression_to_code_mapper(
-                    LogicalAnd(tuple(condition_exprs)), PREC_NONE,
+                    condition, PREC_NONE,
                     type_context=type_context, needed_dtype=needed_dtype)
         mapper = condition_mapper
 
@@ -301,19 +307,6 @@ def wrap_in_if(codegen_state, condition_exprs, inner):
 
                     # get the default condition to check for vectorizability
                     check = condition_mapper()
-                    from loopy.diagnostic import LoopyError
-                    deps = set()
-                    try:
-                        for c in check.expr.children:
-                            deps |= get_dependencies(c)
-
-                        if deps & set([vec_iname]):
-                            # we'd have to insert our own mirror temporary of the
-                            # vector iname here
-                            raise LoopyError("Can't directly use vector iname in "
-                                             "conditional")
-                    except (AttributeError, TypeError):
-                        pass
 
                     # get LHS dtype for (potential) casting of condition
                     from loopy.expression import dtype_to_type_context
@@ -329,11 +322,44 @@ def wrap_in_if(codegen_state, condition_exprs, inner):
                         lhs_dtype = to_loopy_type(
                             np.dtype('i%d' % lhs_dtype.itemsize),
                             lhs_dtype.target)
-
                     type_context = dtype_to_type_context(codegen_state.kernel.target,
                         lhs_dtype)
+
+                    from loopy.symbolic import VectorTypeCast
+                    from loopy.types import to_loopy_type
+                    from pymbolic.primitives import Variable
+                    from pymbolic.mapper.substitutor import substitute
+                    import numpy as np
+                    kwargs = {}
+                    deps = set()
+                    try:
+                        for c in check.expr.children:
+                            deps |= get_dependencies(c)
+
+                        if deps & set([vec_iname]):
+                            # we have to insert our own temporary version of the
+                            # vector iname here
+                            # first, determine the dtype
+                            size = lhs_dtype.itemsize
+                            np_dtype = np.dtype('i%d' % lhs_dtype.itemsize)
+                            dtype = codegen_state.kernel.target.\
+                                get_dtype_registry().dtype_to_ctype(
+                                    to_loopy_type(np_dtype,
+                                    target=codegen_state.kernel.target))
+                            name = '%s%d' % (dtype, size)
+                            # next, get the base of a vector temporary
+                            init = range(size)
+                            # finally, put in a vector typecast
+                            temp_iname = VectorTypeCast(np_dtype, init, name)
+                            kwargs['condition'] = substitute(
+                                check.expr, {Variable(vec_iname): temp_iname})
+
+                    except (AttributeError, TypeError):
+                        pass
+
                     return condition_mapper(
-                        type_context=type_context, needed_dtype=lhs_dtype)
+                        type_context=type_context, needed_dtype=lhs_dtype,
+                        **kwargs)
 
                 method = codegen_state.ast_builder.emit_vector_if
                 mapper = condition_mapper_wrapper
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index b1bbad176..e89d3109a 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -75,6 +75,8 @@ class IdentityMapperMixin(object):
     def map_array_literal(self, expr, *args):
         return type(expr)(tuple(self.rec(ch, *args) for ch in expr.children))
 
+    map_vector_literal = map_array_literal
+
     def map_group_hw_index(self, expr, *args):
         return expr
 
@@ -110,6 +112,7 @@ class IdentityMapperMixin(object):
         return type(expr)(expr.type, self.rec(expr.child), **kwargs)
 
     map_type_cast = map_type_annotation
+    map_vector_type_cast = map_type_annotation
 
     map_linear_subscript = IdentityMapperBase.map_subscript
 
@@ -140,6 +143,8 @@ class WalkMapper(WalkMapperBase):
         for ch in expr.children:
             self.rec(ch, *args)
 
+    map_vector_literal = map_array_literal
+
     def map_group_hw_index(self, expr, *args):
         self.visit(expr)
 
@@ -157,6 +162,7 @@ class WalkMapper(WalkMapperBase):
             return
         self.rec(expr.child, *args)
 
+    map_vector_type_cast = map_type_cast
     map_tagged_variable = WalkMapperBase.map_variable
 
     def map_loopy_function_identifier(self, expr, *args):
@@ -196,6 +202,10 @@ class StringifyMapper(StringifyMapperBase):
     def map_array_literal(self, expr, *args):
         return "{%s}" % ", ".join(self.rec(ch) for ch in expr.children)
 
+    def map_vector_literal(self, expr, *args):
+        from pymbolic.mapper.stringifier import PREC_NONE
+        return "(%s)" % ", ".join(self.rec(ch, PREC_NONE) for ch in expr.children)
+
     def map_group_hw_index(self, expr, enclosing_prec):
         return "grp.%d" % expr.index
 
@@ -233,6 +243,11 @@ class StringifyMapper(StringifyMapperBase):
         from pymbolic.mapper.stringifier import PREC_NONE
         return "cast(%s, %s)" % (repr(expr.type), self.rec(expr.child, PREC_NONE))
 
+    def map_vector_type_cast(self, expr, enclosing_prec):
+        from pymbolic.mapper.stringifier import PREC_NONE
+        return "cast(%s, %s)" % (repr(expr.type_name), self.rec(
+            expr.child, PREC_NONE))
+
 
 class UnidirectionalUnifier(UnidirectionalUnifierBase):
     def map_reduction(self, expr, other, unis):
@@ -290,9 +305,14 @@ class DependencyMapper(DependencyMapperBase):
     def map_type_cast(self, expr):
         return self.rec(expr.child)
 
+    map_vector_type_cast = map_type_cast
+
     def map_literal(self, expr):
         return set()
 
+    def map_vector_literal(self, expr):
+        return self.combine(self.rec(child) for child in expr.children)
+
 
 class SubstitutionRuleExpander(IdentityMapper):
     def __init__(self, rules):
@@ -369,6 +389,25 @@ class ArrayLiteral(p.Leaf):
     mapper_method = "map_array_literal"
 
 
+class VectorLiteral(p.Leaf):
+    """A vector dtype literal."""
+
+    # Currently only in conjunction with the VectorTypeCast
+
+    def __init__(self, children):
+        self.children = children
+
+    def stringifier(self):
+        return StringifyMapper
+
+    def __getinitargs__(self):
+        return (self.children,)
+
+    init_arg_names = ("children",)
+
+    mapper_method = "map_vector_literal"
+
+
 class HardwareAxisIndex(p.Leaf):
     def __init__(self, axis):
         self.axis = axis
@@ -479,6 +518,62 @@ class TypeCast(p.Expression):
     mapper_method = intern("map_type_cast")
 
 
+class VectorTypeCast(p.Expression):
+    """
+    A workaround for casts of vector temporaries, e.g.:
+        (int4)(0, 1, 2, 3)
+
+    Useful for inserting temporaries into expressions to avoid unvectorizable code
+
+    .. attribute:: type
+
+        The (non-vector) numpy type to cast to. e.g., if using 'int4', the type
+        would be np.int32
+
+    .. attribute:: child
+
+        The :class:`VectorLiteral` initializer list to convert to via typecast
+
+    .. attribute:: type_name
+
+        The stringified type (including vector size), e.g., 'int4'
+    """
+
+    def __init__(self, type, init, type_name):
+        super(VectorTypeCast, self).__init__()
+
+        from loopy.types import to_loopy_type, NumpyType
+        type = to_loopy_type(type)
+
+        if (not isinstance(type, NumpyType)
+                or not issubclass(type.dtype.type, np.number)):
+            from loopy.diagnostic import LoopyError
+            raise LoopyError("TypeCast only supports numerical numpy types, "
+                    "not '%s'" % type)
+
+        # We're storing the type as a name for now to avoid
+        # numpy pickling bug madness. (see loopy.types)
+        self.type_name = type_name
+        self.child = VectorLiteral(tuple(s for s in init))
+        self._base_type = type.dtype
+
+    @property
+    def type(self):
+        from loopy.types import NumpyType
+        return NumpyType(self._base_type)
+
+    # init_arg_names is a misnomer--they're attribute names used for pickling.
+    init_arg_names = ("type_name", "child", "_base_type")
+
+    def __getinitargs__(self):
+        return (self.type_name, self.child, self._base_type)
+
+    def stringifier(self):
+        return StringifyMapper
+
+    mapper_method = intern("map_vector_type_cast")
+
+
 class TaggedVariable(p.Variable):
     """This is an identifier with a tag, such as 'matrix$one', where
     'one' identifies this specific use of the identifier. This mechanism
@@ -1742,6 +1837,8 @@ class BatchedAccessRangeMapper(WalkMapper):
     def map_type_cast(self, expr, inames):
         return self.rec(expr.child, inames)
 
+    map_vector_type_cast = map_type_cast
+
 
 class AccessRangeMapper(object):
     """**IMPORTANT**
diff --git a/loopy/target/c/codegen/expression.py b/loopy/target/c/codegen/expression.py
index b82f0275d..b65004d21 100644
--- a/loopy/target/c/codegen/expression.py
+++ b/loopy/target/c/codegen/expression.py
@@ -380,6 +380,10 @@ class ExpressionToCExpressionMapper(IdentityMapper):
         cast = var("(%s)" % registry.dtype_to_ctype(expr.type))
         return cast(self.rec(expr.child, type_context))
 
+    def map_vector_type_cast(self, expr, type_context):
+        cast = var("(%s)" % expr.type_name)
+        return cast(self.rec(expr.child, type_context))
+
     def map_constant(self, expr, type_context):
         if isinstance(expr, (complex, np.complexfloating)):
             try:
@@ -935,6 +939,9 @@ class CExpressionToCodeMapper(RecursiveMapper):
     def map_array_literal(self, expr, enclosing_prec):
         return "{ %s }" % self.join_rec(", ", expr.children, PREC_NONE)
 
+    def map_vector_literal(self, expr, enclosing_prec):
+        return "( %s )" % self.join_rec(", ", expr.children, PREC_NONE)
+
 # }}}
 
 # vim: fdm=marker
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 7c5dd0ad7..5a670e80c 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -327,6 +327,9 @@ class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper):
     def map_local_hw_index(self, expr, type_context):
         return var("lid")(expr.axis)
 
+    def map_vector_literal(self, expr, type_context):
+        return var(', '.join('%s' % x for x in expr.children))
+
     def map_comparison(self, expr, type_context):
         from loopy.symbolic import get_dependencies
         from loopy.kernel.data import VectorizeTag, filter_iname_tags_by_type
diff --git a/loopy/type_inference.py b/loopy/type_inference.py
index 4b616f71c..7716bcf46 100644
--- a/loopy/type_inference.py
+++ b/loopy/type_inference.py
@@ -243,6 +243,11 @@ class TypeInferenceMapper(CombineMapper):
             raise LoopyError("Can't cast a '%s' to '%s'" % (subtype, expr.type))
         return [expr.type]
 
+    map_vector_type_cast = map_type_cast
+
+    def map_vector_literal(self, expr):
+        return self.combine([self.rec(child) for child in expr.children])
+
     def map_subscript(self, expr):
         return self.rec(expr.aggregate)
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 67a355bb9..359e673e6 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3128,6 +3128,41 @@ def test_explicit_vector_dtype_conversion(ctx_factory, lhs_dtype, rhs_dtype):
                   """)
 
 
+def test_explicit_simd_vector_iname_in_conditional(ctx_factory):
+    ctx = ctx_factory()
+
+    def create_and_test(insn, answer, debug=False):
+        knl = lp.make_kernel(['{[i]: 0 <= i < 12}', '{[j]: 0 <= j < 1}'],
+                             insn,
+                             [lp.GlobalArg('a', shape=(1, 12,), dtype=np.int32),
+                              lp.GlobalArg('b', shape=(1, 12,), dtype=np.int32)])
+
+        knl = lp.split_iname(knl, 'i', 4, inner_tag='vec')
+        knl = lp.tag_inames(knl, [('j', 'g.0')])
+        knl = lp.split_array_axis(knl, ['a', 'b'], 1, 4)
+        knl = lp.tag_array_axes(knl, ['a', 'b'], 'N1,N0,vec')
+
+        # ensure we can generate code
+        code = lp.generate_code_v2(knl).device_code()
+        if debug:
+            print(code)
+        # and check answer
+        queue = cl.CommandQueue(ctx)
+        a = np.zeros((1, 3, 4), dtype=np.int32)
+        b = np.arange(12, dtype=np.int32).reshape((1, 3, 4))
+        result = knl(queue, a=a, b=b)[1][0]
+
+        assert np.array_equal(result.flatten('C'), answer)
+
+    ans = np.arange(12, dtype=np.int32)
+    ans[:7] = 0
+    create_and_test("""
+        if i >= 7
+            a[j, i] = b[j, i]
+        end
+    """, ans)
+
+
 def test_vectorizability():
     # check new vectorizability conditions
     from loopy.kernel.array import VectorArrayDimTag
-- 
GitLab


From db160e79087cb156fc0def4013c75721e91822bd Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 28 Aug 2018 11:05:13 -0400
Subject: [PATCH 136/144] can now remove expected error in select test, as
 conditional in if statement is implemented

---
 test/test_loopy.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index 359e673e6..681423690 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3025,7 +3025,7 @@ def test_explicit_simd_selects(ctx_factory):
     from loopy.diagnostic import LoopyError
     # 1) test a conditional on a vector iname -- currently unimplemented as it
     # would require creating a 'shadow' vector iname temporary
-    create_and_test('a[i] = 1', 'i > 6', ans, exception=LoopyError)
+    create_and_test('a[i] = 1', 'i > 6', ans)
     # 2) condition on a vector array
     create_and_test('a[i] = 1', 'b[i] > 6', ans, b=np.arange(
         12, dtype=np.int32).reshape((3, 4)))
-- 
GitLab


From 39214f7792d0d1208d3f09017b2c32080cb9d504 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 28 Aug 2018 12:40:03 -0400
Subject: [PATCH 137/144] test multiple dtypes, and correct the specification
 of the vector length in the condition_mapper_wrapper

---
 loopy/codegen/result.py |  6 ++++--
 test/test_loopy.py      | 11 ++++++-----
 2 files changed, 10 insertions(+), 7 deletions(-)

diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index d4b85b9b9..2d1be769e 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -339,13 +339,15 @@ def wrap_in_if(codegen_state, condition_exprs, inner):
                         if deps & set([vec_iname]):
                             # we have to insert our own temporary version of the
                             # vector iname here
-                            # first, determine the dtype
-                            size = lhs_dtype.itemsize
+                            # get the vector size
+                            size = codegen_state.vectorization_info.length
+                            # determine the dtype
                             np_dtype = np.dtype('i%d' % lhs_dtype.itemsize)
                             dtype = codegen_state.kernel.target.\
                                 get_dtype_registry().dtype_to_ctype(
                                     to_loopy_type(np_dtype,
                                     target=codegen_state.kernel.target))
+                            # get the string form
                             name = '%s%d' % (dtype, size)
                             # next, get the base of a vector temporary
                             init = range(size)
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 681423690..c4878dfe0 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3128,14 +3128,15 @@ def test_explicit_vector_dtype_conversion(ctx_factory, lhs_dtype, rhs_dtype):
                   """)
 
 
-def test_explicit_simd_vector_iname_in_conditional(ctx_factory):
+@pytest.mark.parametrize(('dtype'), [np.int32, np.int64, np.float32, np.float64])
+def test_explicit_simd_vector_iname_in_conditional(ctx_factory, dtype):
     ctx = ctx_factory()
 
     def create_and_test(insn, answer, debug=False):
         knl = lp.make_kernel(['{[i]: 0 <= i < 12}', '{[j]: 0 <= j < 1}'],
                              insn,
-                             [lp.GlobalArg('a', shape=(1, 12,), dtype=np.int32),
-                              lp.GlobalArg('b', shape=(1, 12,), dtype=np.int32)])
+                             [lp.GlobalArg('a', shape=(1, 12,), dtype=dtype),
+                              lp.GlobalArg('b', shape=(1, 12,), dtype=dtype)])
 
         knl = lp.split_iname(knl, 'i', 4, inner_tag='vec')
         knl = lp.tag_inames(knl, [('j', 'g.0')])
@@ -3148,8 +3149,8 @@ def test_explicit_simd_vector_iname_in_conditional(ctx_factory):
             print(code)
         # and check answer
         queue = cl.CommandQueue(ctx)
-        a = np.zeros((1, 3, 4), dtype=np.int32)
-        b = np.arange(12, dtype=np.int32).reshape((1, 3, 4))
+        a = np.zeros((1, 3, 4), dtype=dtype)
+        b = np.arange(12, dtype=dtype).reshape((1, 3, 4))
         result = knl(queue, a=a, b=b)[1][0]
 
         assert np.array_equal(result.flatten('C'), answer)
-- 
GitLab


From eb93e4a9d00154d93d19b15f1630c24de0fd1f23 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 28 Aug 2018 18:26:28 -0400
Subject: [PATCH 138/144] Correct handling of emit of unvectorized conditionals
 after failed vectorization and add test.

---
 loopy/codegen/__init__.py | 16 ++++++++----
 loopy/codegen/control.py  |  5 ++--
 loopy/codegen/result.py   | 18 ++++++++++---
 loopy/target/opencl.py    | 13 ++++++----
 test/test_loopy.py        | 54 ++++++++++++++++++++++++++-------------
 5 files changed, 72 insertions(+), 34 deletions(-)

diff --git a/loopy/codegen/__init__.py b/loopy/codegen/__init__.py
index 73d7d8528..43d5a6112 100644
--- a/loopy/codegen/__init__.py
+++ b/loopy/codegen/__init__.py
@@ -226,7 +226,8 @@ class CodeGenerationState(object):
             var_subst_map=None, vectorization_info=None,
             is_generating_device_code=None,
             gen_program_name=None,
-            schedule_index_end=None):
+            schedule_index_end=None,
+            removed_predicates=frozenset()):
 
         if kernel is None:
             kernel = self.kernel
@@ -256,7 +257,8 @@ class CodeGenerationState(object):
                 implemented_data_info=implemented_data_info,
                 implemented_domain=implemented_domain or self.implemented_domain,
                 implemented_predicates=(
-                    implemented_predicates or self.implemented_predicates),
+                    (implemented_predicates or self.implemented_predicates) -
+                    removed_predicates),
                 seen_dtypes=self.seen_dtypes,
                 seen_functions=self.seen_functions,
                 seen_atomic_dtypes=self.seen_atomic_dtypes,
@@ -316,7 +318,7 @@ class CodeGenerationState(object):
         return self.copy_and_assign(iname, expr).copy(
                 implemented_domain=new_impl_domain)
 
-    def try_vectorized(self, what, func):
+    def try_vectorized(self, what, func, vector_kwargs={}):
         """If *self* is in a vectorizing state (:attr:`vectorization_info` is
         not None), tries to call func (which must be a callable accepting a
         single :class:`CodeGenerationState` argument). If this fails with
@@ -331,7 +333,7 @@ class CodeGenerationState(object):
             return func(self)
 
         try:
-            return func(self)
+            return func(self, **vector_kwargs)
         except Unvectorizable as e:
             warn_with_kernel(self.kernel, "vectorize_failed",
                     "Vectorization of '%s' failed because '%s'"
@@ -342,7 +344,11 @@ class CodeGenerationState(object):
     def unvectorize(self, func):
         vinf = self.vectorization_info
         result = []
-        novec_self = self.copy(vectorization_info=False)
+        novec_self = self.copy(
+            vectorization_info=False,
+            # we must clear the implemented predicates, as they may have been
+            # generated as vector conditionals, and no longer be valid
+            removed_predicates=self.implemented_predicates)
 
         for i in range(vinf.length):
             idx_aff = isl.Aff.zero_on_domain(vinf.space.params()) + i
diff --git a/loopy/codegen/control.py b/loopy/codegen/control.py
index 27e92b911..14a8a6e26 100644
--- a/loopy/codegen/control.py
+++ b/loopy/codegen/control.py
@@ -510,11 +510,12 @@ def build_loop_nest(codegen_state, schedule_index):
                                 pred_chk for pred_chk in pred_checks]
 
                     prev_result = prev_gen_code(inner_codegen_state)
-
                     inner = merge_codegen_results(codegen_state, prev_result)
                     return [new_codegen_state.try_vectorized(
                         inner.current_ast(inner_codegen_state),
-                        lambda ics: wrap_in_if(ics, condition_exprs, inner))]
+                        lambda ics, **kwargs: wrap_in_if(
+                            ics, condition_exprs, inner, **kwargs),
+                        vector_kwargs={'is_vectorized': True})]
 
                 cannot_vectorize = False
                 if new_codegen_state.vectorization_info is not None:
diff --git a/loopy/codegen/result.py b/loopy/codegen/result.py
index 2d1be769e..eb8586f0d 100644
--- a/loopy/codegen/result.py
+++ b/loopy/codegen/result.py
@@ -88,6 +88,11 @@ class CodeGenerationResult(ImmutableRecord):
 
         a list of :class:`loopy.codegen.ImplementedDataInfo` objects.
         Only added at the very end of code generation.
+
+    .. attribute:: vectorize_failed
+
+        If True, the currently generated instructions are in the unrolled failed
+        vectorization state (i.e., 'unvectorize')
     """
 
     @staticmethod
@@ -100,12 +105,12 @@ class CodeGenerationResult(ImmutableRecord):
         if codegen_state.is_generating_device_code:
             kwargs = {
                     "host_program": None,
-                    "device_programs": [prg],
+                    "device_programs": [prg]
                     }
         else:
             kwargs = {
                     "host_program": prg,
-                    "device_programs": [],
+                    "device_programs": []
                     }
 
         return CodeGenerationResult(
@@ -254,7 +259,11 @@ def merge_codegen_results(codegen_state, elements, collapse=True):
                 **kwargs))
 
 
-def wrap_in_if(codegen_state, condition_exprs, inner):
+def wrap_in_if(codegen_state, condition_exprs, inner, is_vectorized=False):
+    """
+    :param is_vectorized: indicates whether the generated AST was successfully
+        vectorized, or whether it was fed through unvectorize
+    """
     if condition_exprs:
         from pymbolic.primitives import LogicalAnd
         from pymbolic.mapper.stringifier import PREC_NONE
@@ -273,7 +282,7 @@ def wrap_in_if(codegen_state, condition_exprs, inner):
                     type_context=type_context, needed_dtype=needed_dtype)
         mapper = condition_mapper
 
-        if codegen_state.vectorization_info is not None:
+        if codegen_state.vectorization_info is not None and is_vectorized:
             from loopy.symbolic import get_dependencies
             from loopy.kernel.array import VectorArrayDimTag
             from loopy.kernel.data import ValueArg
@@ -363,6 +372,7 @@ def wrap_in_if(codegen_state, condition_exprs, inner):
                         type_context=type_context, needed_dtype=lhs_dtype,
                         **kwargs)
 
+                # mark as vector predicates
                 method = codegen_state.ast_builder.emit_vector_if
                 mapper = condition_mapper_wrapper
 
diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 5a670e80c..64d7716d7 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -625,11 +625,14 @@ class OpenCLCASTBuilder(CASTBuilder):
                     vec_if = Block(vec_if)
             except AttributeError:
                 vec_if = False
-        if not vec_if:
-            raise LoopyError(
-                "Vector conditionals can only be generated for simple "
-                "assign statements, condition (%s) on instruction (%s) "
-                "invalid" % (str(condition_mapper()), str(ast)))
+        if not vec_if and isinstance(ast, Block):
+            import logging
+            logger = logging.getLogger(__name__)
+            logger.info('Cannot convert non-simple assign statement for instruction '
+                        '(%s) to vectorized conditional. '
+                        'Assuming that this is the result of a previous unvectorize '
+                        'call.')
+            return ast
 
         return vec_if
 
diff --git a/test/test_loopy.py b/test/test_loopy.py
index c4878dfe0..9f4577be1 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2971,7 +2971,7 @@ def test_explicit_simd_selects(ctx_factory):
     ctx = ctx_factory()
 
     def create_and_test(insn, condition, answer, exception=None, a=None, b=None,
-                        extra_insns=None, c=None, v=None, check=None):
+                        extra_insns=None, c=None, v=None, check=None, debug=False):
         a = np.zeros((3, 4), dtype=np.int32) if a is None else a
         data = [lp.GlobalArg('a', shape=(12,), dtype=a.dtype)]
         kwargs = dict(a=a)
@@ -3016,33 +3016,33 @@ def test_explicit_simd_selects(ctx_factory):
         else:
             if not isinstance(answer, tuple):
                 answer = (answer,)
+            if debug:
+                print(lp.generate_code_v2(knl).device_code())
             result = knl(queue, **kwargs)[1]
             for r, a in zip(result, answer):
                 assert np.array_equal(r.flatten('C'), a)
 
     ans = np.zeros(12, dtype=np.int32)
     ans[7:] = 1
-    from loopy.diagnostic import LoopyError
-    # 1) test a conditional on a vector iname -- currently unimplemented as it
-    # would require creating a 'shadow' vector iname temporary
+    # 1) test a conditional on a vector iname
     create_and_test('a[i] = 1', 'i > 6', ans)
     # 2) condition on a vector array
     create_and_test('a[i] = 1', 'b[i] > 6', ans, b=np.arange(
         12, dtype=np.int32).reshape((3, 4)))
-    # 3) condition on a vector temporary -- this is currently broken for the
-    # same reason as #1
-    create_and_test('a[i] = 1', 'c', ans, extra_insns='<> c = i < 6',
-                    exception=LoopyError)
+    # 3) condition on a vector temporary
+    create_and_test('a[i] = 1', 'c', ans, extra_insns='<> c = (i < 7) - 1')
     # 4) condition on an assigned vector array, this should work as assignment to a
     # vector can be safely unrolled
-    create_and_test('a[i] = 1', 'b[i] > 6', ans, b=np.zeros((3, 4), dtype=np.int32),
+    create_and_test('a[i] = 1', '(b[i] > 6)', ans,
+                    b=np.zeros((3, 4), dtype=np.int32),
                     extra_insns='b[i] = i')
     # 5) a block of simple assignments, this should be seemlessly translated to
     # multiple vector if statements
     c_ans = np.ones(12, dtype=np.int32)
     c_ans[7:] = 0
-    create_and_test('a[i] = 1\nc[i] = 0', 'b[i] > 6', (ans, c_ans), b=np.arange(
-        12, dtype=np.int32).reshape((3, 4)), c=np.ones((3, 4), dtype=np.int32))
+    create_and_test('a[i] = 1\nc[i] = 0', '(b[i] > 6)', (ans, c_ans), b=np.arange(
+        12, dtype=np.int32).reshape((3, 4)), c=np.ones((3, 4), dtype=np.int32),
+        debug=True)
     # 6) test a negated conditional
     ans_negated = np.invert(ans) + 2
     create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange(
@@ -3132,16 +3132,19 @@ def test_explicit_vector_dtype_conversion(ctx_factory, lhs_dtype, rhs_dtype):
 def test_explicit_simd_vector_iname_in_conditional(ctx_factory, dtype):
     ctx = ctx_factory()
 
-    def create_and_test(insn, answer, debug=False):
-        knl = lp.make_kernel(['{[i]: 0 <= i < 12}', '{[j]: 0 <= j < 1}'],
+    def create_and_test(insn, answer, shape=(1, 12), debug=False,
+                        vectors=['a', 'b']):
+        num_conditions = shape[0]
+        knl = lp.make_kernel(['{[i]: 0 <= i < 12}',
+                              '{{[j]: 0 <= j < {}}}'.format(num_conditions)],
                              insn,
-                             [lp.GlobalArg('a', shape=(1, 12,), dtype=dtype),
-                              lp.GlobalArg('b', shape=(1, 12,), dtype=dtype)])
+                             [lp.GlobalArg('a', shape=shape, dtype=dtype),
+                              lp.GlobalArg('b', shape=shape, dtype=dtype)])
 
         knl = lp.split_iname(knl, 'i', 4, inner_tag='vec')
         knl = lp.tag_inames(knl, [('j', 'g.0')])
         knl = lp.split_array_axis(knl, ['a', 'b'], 1, 4)
-        knl = lp.tag_array_axes(knl, ['a', 'b'], 'N1,N0,vec')
+        knl = lp.tag_array_axes(knl, vectors, 'N1,N0,vec')
 
         # ensure we can generate code
         code = lp.generate_code_v2(knl).device_code()
@@ -3149,8 +3152,11 @@ def test_explicit_simd_vector_iname_in_conditional(ctx_factory, dtype):
             print(code)
         # and check answer
         queue = cl.CommandQueue(ctx)
-        a = np.zeros((1, 3, 4), dtype=dtype)
-        b = np.arange(12, dtype=dtype).reshape((1, 3, 4))
+
+        num_vectors = int(shape[1] / 4)
+        a = np.zeros((num_conditions, num_vectors, 4), dtype=dtype)
+        b = np.arange(num_conditions * num_vectors * 4, dtype=dtype).reshape(
+            (num_conditions, num_vectors, 4))
         result = knl(queue, a=a, b=b)[1][0]
 
         assert np.array_equal(result.flatten('C'), answer)
@@ -3163,6 +3169,18 @@ def test_explicit_simd_vector_iname_in_conditional(ctx_factory, dtype):
         end
     """, ans)
 
+    # a case that will result in an unvectorized evaluation
+    # this tests that we are properly able to unwind any vectorized conditional that
+    # has been applied, and then reapply the correct scalar conditional in
+    # unvectorize
+    ans = np.arange(144, dtype=np.int32)
+    ans[:7] = 0
+    create_and_test("""
+        if j * 12 + i >= 7
+            a[j, i] = b[j, i]
+        end
+    """, ans, shape=(12, 12), vectors=['b'])
+
 
 def test_vectorizability():
     # check new vectorizability conditions
-- 
GitLab


From 87141c1698097c97afe997e5c3f608016ffd30ee Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Tue, 28 Aug 2018 18:46:11 -0400
Subject: [PATCH 139/144] convert loopy error into warning

---
 loopy/target/opencl.py | 10 ++++++----
 test/test_loopy.py     |  3 +--
 2 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/loopy/target/opencl.py b/loopy/target/opencl.py
index 64d7716d7..b2a1ceb03 100644
--- a/loopy/target/opencl.py
+++ b/loopy/target/opencl.py
@@ -340,10 +340,12 @@ class ExpressionToOpenCLCExpressionMapper(ExpressionToCExpressionMapper):
                           if filter_iname_tags_by_type(tags, VectorizeTag)])
 
         if get_dependencies(expr) & vec_inames and \
-                self.codegen_state.insn_was_not_vectorizable:
-            raise LoopyError("Cannot unroll a vector-iname comparison, as scalar"
-                             " assignment results in incorrect 'truthiness' for "
-                             " vector dtypes.")
+               self.codegen_state.insn_was_not_vectorizable:
+            from loopy.diagnostic import warn_with_kernel
+            warn_with_kernel(self.codegen_state.kernel,
+                             'unrolled_vector_iname_conditional',
+                             'Unrolled vector-loop iname detected in vector ' +
+                             'comparison; this may in unexpected truth-values.')
 
         return super(ExpressionToOpenCLCExpressionMapper, self).map_comparison(
             expr, type_context)
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 9f4577be1..b68998e77 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3041,8 +3041,7 @@ def test_explicit_simd_selects(ctx_factory):
     c_ans = np.ones(12, dtype=np.int32)
     c_ans[7:] = 0
     create_and_test('a[i] = 1\nc[i] = 0', '(b[i] > 6)', (ans, c_ans), b=np.arange(
-        12, dtype=np.int32).reshape((3, 4)), c=np.ones((3, 4), dtype=np.int32),
-        debug=True)
+        12, dtype=np.int32).reshape((3, 4)), c=np.ones((3, 4), dtype=np.int32))
     # 6) test a negated conditional
     ans_negated = np.invert(ans) + 2
     create_and_test('a[i] = 1', 'not (b[i] > 6)', ans_negated, b=np.arange(
-- 
GitLab


From 35492a13db73f390b5df0c164bf26a23a5cc8924 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 30 Aug 2018 17:47:39 -0400
Subject: [PATCH 140/144] Improve test of contiguity in get_access_info, and
 add a test similar to what triggered the failure in pyjac

---
 loopy/kernel/array.py |  8 +++++---
 test/test_loopy.py    | 38 ++++++++++++++++++++++++++++++++++++++
 2 files changed, 43 insertions(+), 3 deletions(-)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 63734ba97..323b28424 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1323,10 +1323,12 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
         return [simplify_via_aff(arr[i]) for i in range(len(arr))]
 
     def is_contiguous(arr):
-        if not len(arr):
+        from loopy.isl_helpers import simplify_via_aff
+        from functools import cmp_to_key
+        if not len(arr) or len(arr) != vector_size:
             return False
-        sarr = sorted(arr)
-        return len(arr) == vector_size and (sarr[-1] - sarr[0] + 1) == vector_size
+        sarr = sorted(arr, key=cmp_to_key(lambda x, y: simplify_via_aff(x - y) > 0))
+        return simplify_via_aff(sarr[-1] - sarr[0] + 1) == vector_size
 
     def is_monotonic(arr):
         if not len(arr):
diff --git a/test/test_loopy.py b/test/test_loopy.py
index b68998e77..74e9e1762 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2869,6 +2869,44 @@ def test_explicit_simd_shuffles(ctx_factory):
                         answer, True)
 
 
+def test_explicit_simd_unr_iname(ctx_factory):
+    """
+    tests a scatter load to a specific lane of a vector array via an unrolled iname
+    """
+    ctx = ctx_factory()
+    queue = cl.CommandQueue(ctx)
+
+    insns = """
+        for j
+            for i
+                for lane
+                    a[j, i, lane] = b[j + lane, i]
+                end
+            end
+        end
+        """
+    dtype = np.int32
+    knl = lp.make_kernel(
+        ['{[j]: 0 <= j < 9}', '{[i]: 0 <= i < 3}', '{[lane]: 0 <= lane < 4}'],
+        insns,
+        [lp.GlobalArg('a', shape=(12, 3, 4), dtype=dtype),
+         lp.GlobalArg('b', shape=(12, 12), dtype=dtype)])
+
+    knl = lp.tag_array_axes(knl, 'a', 'N1,N0,vec')
+    knl = lp.tag_inames(knl, {'lane': 'unr'})
+    knl = lp.prioritize_loops(knl, 'j, i, lane')
+    knl = lp.preprocess_kernel(knl)
+
+    b = np.arange(144, dtype=dtype).reshape((12, 12))
+    a = knl(queue, b=b, a=np.zeros((12, 3, 4), dtype=dtype))[1][0]
+
+    ans = np.tile(np.arange(4, dtype=dtype), int(144 / 4)).reshape((12, 3, 4))
+    ans[:9] = (ans[:9] + np.arange(9)[:, np.newaxis, np.newaxis]) * 12
+    ans[:9] = (ans[:9] + np.arange(3)[np.newaxis, :, np.newaxis])
+    ans[9:] = 0
+    assert np.array_equal(a, ans)
+
+
 def test_explicit_simd_temporary_promotion(ctx_factory):
     from loopy.kernel.data import temp_var_scope as scopes
 
-- 
GitLab


From 88b07bf8bdbcea677dc3cbabaee0f7b9bbb35036 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 30 Aug 2018 18:30:31 -0400
Subject: [PATCH 141/144] improve vector iname in conditional test to include
 different vector widths / dtypes

---
 test/test_loopy.py | 17 ++++++++++-------
 1 file changed, 10 insertions(+), 7 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index 74e9e1762..42afc4236 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -3165,14 +3165,17 @@ def test_explicit_vector_dtype_conversion(ctx_factory, lhs_dtype, rhs_dtype):
                   """)
 
 
-@pytest.mark.parametrize(('dtype'), [np.int32, np.int64, np.float32, np.float64])
-def test_explicit_simd_vector_iname_in_conditional(ctx_factory, dtype):
+@pytest.mark.parametrize('dtype', [np.int32, np.int64, np.float32, np.float64])
+@pytest.mark.parametrize('vec_width', [2, 3, 4, 8, 16])
+def test_explicit_simd_vector_iname_in_conditional(ctx_factory, dtype, vec_width):
     ctx = ctx_factory()
 
-    def create_and_test(insn, answer, shape=(1, 12), debug=False,
+    size = vec_width * 4
+
+    def create_and_test(insn, answer, shape=(1, size), debug=False,
                         vectors=['a', 'b']):
         num_conditions = shape[0]
-        knl = lp.make_kernel(['{[i]: 0 <= i < 12}',
+        knl = lp.make_kernel(['{{[i]: 0 <= i < {}}}'.format(size),
                               '{{[j]: 0 <= j < {}}}'.format(num_conditions)],
                              insn,
                              [lp.GlobalArg('a', shape=shape, dtype=dtype),
@@ -3198,7 +3201,7 @@ def test_explicit_simd_vector_iname_in_conditional(ctx_factory, dtype):
 
         assert np.array_equal(result.flatten('C'), answer)
 
-    ans = np.arange(12, dtype=np.int32)
+    ans = np.arange(size, dtype=np.int32)
     ans[:7] = 0
     create_and_test("""
         if i >= 7
@@ -3210,13 +3213,13 @@ def test_explicit_simd_vector_iname_in_conditional(ctx_factory, dtype):
     # this tests that we are properly able to unwind any vectorized conditional that
     # has been applied, and then reapply the correct scalar conditional in
     # unvectorize
-    ans = np.arange(144, dtype=np.int32)
+    ans = np.arange(12 * size, dtype=np.int32)
     ans[:7] = 0
     create_and_test("""
         if j * 12 + i >= 7
             a[j, i] = b[j, i]
         end
-    """, ans, shape=(12, 12), vectors=['b'])
+    """, ans, shape=(12, size), vectors=['b'])
 
 
 def test_vectorizability():
-- 
GitLab


From ae023cd218765fbe23d0a91c4a5765d601f2129f Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 30 Aug 2018 19:15:09 -0400
Subject: [PATCH 142/144] Make the comparison sort fallback to avoid issues w/
 shuffles

---
 loopy/kernel/array.py | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 323b28424..59731d705 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1327,7 +1327,12 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
         from functools import cmp_to_key
         if not len(arr) or len(arr) != vector_size:
             return False
-        sarr = sorted(arr, key=cmp_to_key(lambda x, y: simplify_via_aff(x - y) > 0))
+        try:
+            sarr = sorted(arr)
+        except TypeError:
+            # tried to sort a pymbolic expression, try w/ comparison sort
+            sarr = sorted(arr, key=cmp_to_key(
+                lambda x, y: simplify_via_aff(x - y) > 0))
         return simplify_via_aff(sarr[-1] - sarr[0] + 1) == vector_size
 
     def is_monotonic(arr):
-- 
GitLab


From 9ca6b667a4a0aaed827914e4e19eecb4e459c131 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 30 Aug 2018 19:55:23 -0400
Subject: [PATCH 143/144] Fix the unrolled vector iname test to match what I'm
 actually doing.

---
 test/test_loopy.py | 34 ++++++++++++++++------------------
 1 file changed, 16 insertions(+), 18 deletions(-)

diff --git a/test/test_loopy.py b/test/test_loopy.py
index 42afc4236..63e05eb0a 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2877,33 +2877,31 @@ def test_explicit_simd_unr_iname(ctx_factory):
     queue = cl.CommandQueue(ctx)
 
     insns = """
-        for j
-            for i
-                for lane
-                    a[j, i, lane] = b[j + lane, i]
-                end
-            end
+        for j_outer, lane, i
+            a[j_outer, i, lane] = b[j_outer + lane, i]
         end
         """
-    dtype = np.int32
     knl = lp.make_kernel(
-        ['{[j]: 0 <= j < 9}', '{[i]: 0 <= i < 3}', '{[lane]: 0 <= lane < 4}'],
+        ['{[j_outer]: 0 <= j_outer < 4}',
+         '{[i]: 0 <= i < 4}',
+         '{[lane]: 0 <= lane < 4}'],
         insns,
-        [lp.GlobalArg('a', shape=(12, 3, 4), dtype=dtype),
-         lp.GlobalArg('b', shape=(12, 12), dtype=dtype)])
+        [lp.GlobalArg('a', shape=(4, 4, 4)),
+         lp.GlobalArg('b', shape=(8, 4))])
 
     knl = lp.tag_array_axes(knl, 'a', 'N1,N0,vec')
     knl = lp.tag_inames(knl, {'lane': 'unr'})
-    knl = lp.prioritize_loops(knl, 'j, i, lane')
-    knl = lp.preprocess_kernel(knl)
+    knl = lp.prioritize_loops(knl, 'j_outer, i, lane')
+
+    a = np.zeros((4, 4, 4))
+    b = np.arange(8 * 4).reshape((8, 4))
 
-    b = np.arange(144, dtype=dtype).reshape((12, 12))
-    a = knl(queue, b=b, a=np.zeros((12, 3, 4), dtype=dtype))[1][0]
+    a = knl(queue, a=a, b=b)[1][0]
+    # create answer
+    ans = np.tile(np.arange(4, dtype=np.float64), 16).reshape((4, 4, 4))
+    ans *= 4
+    ans += 4 * np.arange(4)[:, np.newaxis, np.newaxis] + np.arange(4)[:, np.newaxis]
 
-    ans = np.tile(np.arange(4, dtype=dtype), int(144 / 4)).reshape((12, 3, 4))
-    ans[:9] = (ans[:9] + np.arange(9)[:, np.newaxis, np.newaxis]) * 12
-    ans[:9] = (ans[:9] + np.arange(3)[np.newaxis, :, np.newaxis])
-    ans[9:] = 0
     assert np.array_equal(a, ans)
 
 
-- 
GitLab


From 0cc7cea839a8dfb2ed2e0581fb2ce81bbfc77426 Mon Sep 17 00:00:00 2001
From: Nick <nicholas.curtis@uconn.edu>
Date: Thu, 30 May 2019 20:28:01 -0400
Subject: [PATCH 144/144] fix pylint errors; probably a better way to store the
 vector_index variable such that pylint doesn't get mad, but we'll get there
 when this PR is more active

---
 loopy/kernel/array.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/loopy/kernel/array.py b/loopy/kernel/array.py
index 3dd330d6f..f7025f6ee 100644
--- a/loopy/kernel/array.py
+++ b/loopy/kernel/array.py
@@ -1393,8 +1393,8 @@ def get_access_info(target, ary, index, var_subst_map, vectorization_info):
 
             # update vector operation type if necessary
             if vector_index is not None and isinstance(vector_index, tuple):
-                assert vector_index[0] is None
-                vector_index = (vec_op_type, vector_index[1])
+                assert vector_index[0] is None  # pylint: disable=E1136
+                vector_index = (vec_op_type, vector_index[1])  # pylint: disable=E1136; # noqa
 
             subscripts[dim_tag.target_axis] += (stride // vector_size)*idx
 
-- 
GitLab