From c12c610978b2b1ecab1a6b619f64315b241bfa0e Mon Sep 17 00:00:00 2001
From: Kaushik Kulkarni <kaushikcfd@gmail.com>
Date: Tue, 20 Nov 2018 12:45:04 -0600
Subject: [PATCH] Merge 'master' into 'new_function_interface'

---
 .gitlab-ci.yml                                | 19 ++++++++++-
 LICENSE                                       | 21 ++++++++++++
 .../make-linux-build-docker-inner-part-2.sh   |  4 +++
 loopy/frontend/fortran/tree.py                |  2 +-
 loopy/kernel/tools.py                         |  4 +--
 loopy/schedule/__init__.py                    | 10 ++++--
 loopy/statistics.py                           | 20 ++++++++----
 loopy/symbolic.py                             |  2 +-
 loopy/target/cuda.py                          |  2 +-
 loopy/target/pyopencl.py                      |  3 +-
 requirements.txt                              |  5 +--
 setup.cfg                                     |  2 +-
 test/test_loopy.py                            | 19 +++++++++++
 test/test_numa_diff.py                        |  2 +-
 test/test_reduction.py                        | 32 +++++++++++--------
 test/test_statistics.py                       | 14 +++++---
 test/test_target.py                           | 17 ++++++++++
 17 files changed, 137 insertions(+), 41 deletions(-)
 create mode 100644 LICENSE

diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 1caef802b..ea69114d6 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -12,6 +12,10 @@ Python 2.7 POCL:
   - pocl
   except:
   - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
 
 Python 2.7 with legacy PyOpenCL:
   script:
@@ -29,6 +33,10 @@ Python 2.7 with legacy PyOpenCL:
   except:
   - tags
   retry: 2
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
 
 Python 3.6 POCL:
   script:
@@ -43,6 +51,10 @@ Python 3.6 POCL:
   - pocl
   except:
   - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
 
 Python 3.6 POCL Twice With Cache:
   script:
@@ -59,6 +71,10 @@ Python 3.6 POCL Twice With Cache:
   - pocl
   except:
   - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
 
 # PyPy POCL:
 #   script:
@@ -77,7 +93,7 @@ Python 3.6 POCL Examples:
   script:
   - export PY_EXE=python3.6
   - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib jupyter nbconvert"
+  - export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert"
   - ". ./build-py-project-and-run-examples.sh"
   tags:
   - python3.6
@@ -87,6 +103,7 @@ Python 3.6 POCL Examples:
   except:
   - tags
 
+
 CentOS binary:
   script:
   - (cd build-helpers; ./make-linux-build-docker.sh --nodate)
diff --git a/LICENSE b/LICENSE
new file mode 100644
index 000000000..601df74bd
--- /dev/null
+++ b/LICENSE
@@ -0,0 +1,21 @@
+MIT License
+
+Copyright (c) 2018 Andreas Klöckner and contributors
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
diff --git a/build-helpers/make-linux-build-docker-inner-part-2.sh b/build-helpers/make-linux-build-docker-inner-part-2.sh
index 1e35a1e1b..035634b16 100755
--- a/build-helpers/make-linux-build-docker-inner-part-2.sh
+++ b/build-helpers/make-linux-build-docker-inner-part-2.sh
@@ -23,6 +23,10 @@ git clone --recursive git://github.com/inducer/loopy
 cd loopy
 
 grep -v pyopencl requirements.txt > myreq.txt
+
+# 'packaging' is needed for the pyinstaller executable to be usable
+echo packaging >> myreq.txt
+
 pip install -r myreq.txt
 python setup.py install
 
diff --git a/loopy/frontend/fortran/tree.py b/loopy/frontend/fortran/tree.py
index b1df6e3d0..6939bb6ad 100644
--- a/loopy/frontend/fortran/tree.py
+++ b/loopy/frontend/fortran/tree.py
@@ -53,7 +53,7 @@ class FTreeWalkerBase(object):
 
     ENTITY_RE = re.compile(
             r"^(?P<name>[_0-9a-zA-Z]+)"
-            "(\((?P<shape>[-+*0-9:a-zA-Z, \t]+)\))?$")
+            r"(\((?P<shape>[-+*0-9:a-zA-Z, \t]+)\))?$")
 
     def parse_dimension_specs(self, node, dim_decls):
         def parse_bounds(bounds_str):
diff --git a/loopy/kernel/tools.py b/loopy/kernel/tools.py
index 006ac6ba3..3aaa8d56a 100644
--- a/loopy/kernel/tools.py
+++ b/loopy/kernel/tools.py
@@ -1253,7 +1253,7 @@ def draw_dependencies_as_unicode_arrows(
         for dep in insn.depends_on:
             reverse_deps.setdefault(dep, set()).add(insn.id)
 
-    # mapping of (from_id, to_id) tuples to column_index
+    # mapping of to_id to column_index
     dep_to_column = {}
 
     # {{{ find column assignments
@@ -1330,7 +1330,7 @@ def draw_dependencies_as_unicode_arrows(
 
             elif insn.id in starts:
                 starts.remove(insn.id)
-                if starts:
+                if starts or pointed_at_insn_id not in processed_ids:
                     # will continue downward
                     row[col] = do_flag_downward(u"├", pointed_at_insn_id)
 
diff --git a/loopy/schedule/__init__.py b/loopy/schedule/__init__.py
index 2b3f7a3b9..3dc1c0bbe 100644
--- a/loopy/schedule/__init__.py
+++ b/loopy/schedule/__init__.py
@@ -794,9 +794,13 @@ def generate_loop_schedules_internal(
 
         if not is_ready:
             if debug_mode:
-                print("instruction '%s' is missing insn depedencies '%s'" % (
-                        format_insn(kernel, insn.id), ",".join(
-                            insn.depends_on - sched_state.scheduled_insn_ids)))
+                # These are not that interesting when understanding scheduler
+                # failures.
+
+                # print("instruction '%s' is missing insn dependencies '%s'" % (
+                #         format_insn(kernel, insn.id), ",".join(
+                #             insn.depends_on - sched_state.scheduled_insn_ids)))
+                pass
             continue
 
         want = kernel.insn_inames(insn) - sched_state.parallel_inames
diff --git a/loopy/statistics.py b/loopy/statistics.py
index d65387d16..454cca18e 100755
--- a/loopy/statistics.py
+++ b/loopy/statistics.py
@@ -707,9 +707,10 @@ class CounterBase(CombineMapper):
 # {{{ ExpressionOpCounter
 
 class ExpressionOpCounter(CounterBase):
-    def __init__(self, knl, callables_table):
+    def __init__(self, knl, callables_table, count_within_subscripts=True):
         self.knl = knl
         self.callables_table = callables_table
+        self.count_within_subscripts = count_within_subscripts
         from loopy.type_inference import TypeInferenceMapper
         self.type_inf = TypeInferenceMapper(knl, callables_table)
 
@@ -737,7 +738,10 @@ class ExpressionOpCounter(CounterBase):
                     ) + self.rec(expr.parameters)
 
     def map_subscript(self, expr):
-        return self.rec(expr.index)
+        if self.count_within_subscripts:
+            return self.rec(expr.index)
+        else:
+            return ToCountMap()
 
     def map_sum(self, expr):
         assert expr.children
@@ -1343,10 +1347,9 @@ def _get_insn_count(knl, callables_table, insn_id, subgroup_size,
 
 # {{{ get_op_map
 
-
 def get_op_map_for_single_kernel(knl, callables_table,
         numpy_types=True, count_redundant_work=False,
-               subgroup_size=None):
+        count_within_subscripts=True, subgroup_size=None):
 
     if not knl.options.ignore_boostable_into:
         raise LoopyError("Kernel '%s': Using operation counting requires the option "
@@ -1394,7 +1397,7 @@ def get_op_map_for_single_kernel(knl, callables_table,
 
 
 def get_op_map(program, numpy_types=True, count_redundant_work=False,
-               subgroup_size=None):
+               count_within_subscripts=True, subgroup_size=None):
 
     """Count the number of operations in a loopy kernel.
 
@@ -1410,6 +1413,9 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False,
         (Likely desirable for performance modeling, but undesirable for code
         optimization.)
 
+    :arg count_within_subscripts: A :class:`bool` specifying whether to
+        count operations inside array indices.
+
     :arg subgroup_size: (currently unused) An :class:`int`, :class:`str`
         ``'guess'``, or *None* that specifies the sub-group size. An OpenCL
         sub-group is an implementation-dependent grouping of work-items within
@@ -1464,8 +1470,8 @@ def get_op_map(program, numpy_types=True, count_redundant_work=False,
         if isinstance(in_knl_callable, CallableKernel):
             knl = in_knl_callable.subkernel
             knl_op_map = get_op_map_for_single_kernel(knl,
-                        program.callables_table, numpy_types,
-                        count_redundant_work, subgroup_size)
+                    program.callables_table, numpy_types, count_redundant_work,
+                    count_within_subscripts, subgroup_size)
 
             for i in range(callables_count[func_id]):
                 op_map += knl_op_map
diff --git a/loopy/symbolic.py b/loopy/symbolic.py
index 92b209ac9..04cf2d02b 100644
--- a/loopy/symbolic.py
+++ b/loopy/symbolic.py
@@ -1696,7 +1696,7 @@ def get_access_range(domain, subscript, assumptions, shape=None,
             if shape is not None:
                 try:
                     shape_aff = guarded_aff_from_expr(access_map.space, shape[idim])
-                except ExpressionToAffineConversionError as sub_err:
+                except ExpressionToAffineConversionError:
                     pass
 
             if shape_aff is None:
diff --git a/loopy/target/cuda.py b/loopy/target/cuda.py
index 32b810eb3..6b4385bff 100644
--- a/loopy/target/cuda.py
+++ b/loopy/target/cuda.py
@@ -344,7 +344,7 @@ class CUDACASTBuilder(CASTBuilder):
     _VEC_AXES = "xyzw"
 
     def add_vector_access(self, access_expr, index):
-        return access_expr.a(self._VEC_AXES[index])
+        return access_expr.attr(self._VEC_AXES[index])
 
     def emit_barrier(self, synchronization_kind, mem_kind, comment):
         """
diff --git a/loopy/target/pyopencl.py b/loopy/target/pyopencl.py
index d98b6cdd6..5ef564572 100644
--- a/loopy/target/pyopencl.py
+++ b/loopy/target/pyopencl.py
@@ -125,7 +125,8 @@ def adjust_local_temp_var_storage(kernel, device):
 
             new_storage_shape = storage_shape
 
-        new_temp_vars[temp_var.name] = temp_var.copy(storage_shape=new_storage_shape)
+        new_temp_vars[temp_var.name] = temp_var.copy(
+                storage_shape=tuple(new_storage_shape))
 
     return kernel.copy(temporary_variables=new_temp_vars)
 
diff --git a/requirements.txt b/requirements.txt
index a3e88cfea..97c202476 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -9,7 +9,4 @@ git+https://github.com/inducer/codepy.git
 git+https://github.com/inducer/f2py
 
 # Optional, needed for using the C preprocessor on Fortran
-ply>=3.6
-
-# This is needed for the pyinstaller executable to be usable.
-packaging
+ply>=3.6
\ No newline at end of file
diff --git a/setup.cfg b/setup.cfg
index b939ce0cf..eec3dfd1f 100644
--- a/setup.cfg
+++ b/setup.cfg
@@ -1,5 +1,5 @@
 [flake8]
-ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814
+ignore = E126,E127,E128,E123,E226,E241,E242,E265,N802,W503,E402,N814,W504
 max-line-length=85
 exclude=
     loopy/target/c/compyte/ndarray,
diff --git a/test/test_loopy.py b/test/test_loopy.py
index fa32ca04c..b770497f1 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -2890,6 +2890,25 @@ def test_dep_cycle_printing_and_error():
         print(lp.generate_code(knl).device_code())
 
 
+def test_backwards_dep_printing_and_error():
+    knl = lp.make_kernel(
+            "{[i]: 0<=i<n}",
+            """
+            c[i] = a[i] + b[i]                       {id=insn1}
+            c[i] = 2*c[i]                            {id=insn2, dep=insn1}
+            c[i] = 7*c[i] + a[i]*a[i] + b[i]*b[i]    {id=insn3, dep=insn2}
+            b[i] = b[i] + c[i]                                 {id=insn4, dep=insn3}
+            d[i] = 7*a[i ]                                     {id=insn5, dep=insn4}
+            a[i] = a[i] + d[i]                                 {id=insn6, dep=insn5}
+            """, [
+                lp.GlobalArg('a, b', dtype=np.float64),
+                "..."
+            ])
+
+    # Printing the kernel used to crash with KeyError
+    print(knl)
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
diff --git a/test/test_numa_diff.py b/test/test_numa_diff.py
index 62f490cee..1ba44e77e 100644
--- a/test/test_numa_diff.py
+++ b/test/test_numa_diff.py
@@ -47,8 +47,8 @@ __all__ = [
 from loopy.version import LOOPY_USE_LANGUAGE_VERSION_2018_2  # noqa
 
 
-@pytest.mark.parametrize("Nq", [7])
 @pytest.mark.parametrize("ilp_multiple", [1, 2])
+@pytest.mark.parametrize("Nq", [7])
 @pytest.mark.parametrize("opt_level", [11])
 def test_gnuma_horiz_kernel(ctx_factory, ilp_multiple, Nq, opt_level):  # noqa
     ctx = ctx_factory()
diff --git a/test/test_reduction.py b/test/test_reduction.py
index 96dab405a..aaf11ee29 100644
--- a/test/test_reduction.py
+++ b/test/test_reduction.py
@@ -219,32 +219,38 @@ def test_local_parallel_reduction(ctx_factory, size):
 def test_global_parallel_reduction(ctx_factory, size):
     ctx = ctx_factory()
 
-    prog = lp.make_kernel(
+    knl = lp.make_kernel(
             "{[i]: 0 <= i < n }",
             """
             # Using z[0] instead of z works around a bug in ancient PyOpenCL.
-            z[0] = sum(i, i/13)
+            z[0] = sum(i, a[i])
             """)
 
-    ref_prog = prog
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
+    ref_knl = knl
 
     gsize = 128
-    prog = lp.split_iname(prog, "i", gsize * 20)
-    prog = lp.split_iname(prog, "i_inner", gsize, outer_tag="l.0")
-    prog = lp.split_reduction_inward(prog, "i_inner_inner")
-    prog = lp.split_reduction_inward(prog, "i_inner_outer")
+    knl = lp.split_iname(knl, "i", gsize * 20)
+    knl = lp.split_iname(knl, "i_inner", gsize, inner_tag="l.0")
+    knl = lp.split_reduction_outward(knl, "i_outer")
+    knl = lp.split_reduction_inward(knl, "i_inner_outer")
     from loopy.transform.data import reduction_arg_to_subst_rule
-    prog = reduction_arg_to_subst_rule(prog, "i_outer")
-    prog = lp.precompute(prog, "red_i_outer_arg", "i_outer",
+    knl = reduction_arg_to_subst_rule(knl, "i_outer")
+
+    knl = lp.precompute(knl, "red_i_outer_arg", "i_outer",
             temporary_scope=lp.temp_var_scope.GLOBAL,
             default_tag="l.auto")
-    prog = lp.realize_reduction(prog)
-    prog = lp.add_dependency(
-            prog, "writes:acc_i_outer",
+    knl = lp.realize_reduction(knl)
+    knl = lp.tag_inames(knl, "i_outer_0:g.0")
+
+    # Keep the i_outer accumulator on the correct (lower) side of the barrier,
+    # otherwise there will be useless save/reload code generated.
+    knl = lp.add_dependency(
+            knl, "writes:acc_i_outer",
             "id:red_i_outer_arg_barrier")
 
     lp.auto_test_vs_ref(
-            ref_prog, ctx, prog, parameters={"n": size},
+            ref_knl, ctx, knl, parameters={"n": size},
             print_ref_code=True)
 
 
diff --git a/test/test_statistics.py b/test/test_statistics.py
index 3f2366521..41b44b5a7 100644
--- a/test/test_statistics.py
+++ b/test/test_statistics.py
@@ -57,7 +57,8 @@ def test_op_counter_basic():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+                           count_within_subscripts=True)
     n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, SGS)
@@ -161,7 +162,8 @@ def test_op_counter_specialops():
     knl = lp.add_and_infer_dtypes(knl,
                                   dict(a=np.float32, b=np.float32,
                                        g=np.float64, h=np.float64))
-    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+                           count_within_subscripts=True)
     n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, SGS)
@@ -206,7 +208,8 @@ def test_op_counter_bitwise():
                 a=np.int32, b=np.int32,
                 g=np.int64, h=np.int64))
 
-    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+                           count_within_subscripts=False)
     n_workgroups = 1
     group_size = 1
     subgroups_per_group = div_ceil(group_size, SGS)
@@ -226,7 +229,7 @@ def test_op_counter_bitwise():
     i64shift = op_map[lp.Op(np.dtype(np.int64), 'shift', CG.SUBGROUP)
                       ].eval_with_dict(params)
     # (count-per-sub-group)*n_subgroups
-    assert i32add == n*m+n*m*ell*n_subgroups
+    assert i32add == n*m*ell*n_subgroups
     assert i32bw == 2*n*m*ell*n_subgroups
     assert i64bw == 2*n*m*n_subgroups
     assert i64add == i64mul == n*m*n_subgroups
@@ -1153,7 +1156,8 @@ def test_summations_and_filters():
     assert f32lall == (3*n*m*ell)*n_subgroups
     assert f64lall == (2*n*m)*n_subgroups
 
-    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True)
+    op_map = lp.get_op_map(knl, subgroup_size=SGS, count_redundant_work=True,
+                           count_within_subscripts=True)
     #for k, v in op_map.items():
     #    print(type(k), "\n", k.name, k.dtype, type(k.dtype), " :\n", v)
 
diff --git a/test/test_target.py b/test/test_target.py
index a5186c71c..095bf0939 100644
--- a/test/test_target.py
+++ b/test/test_target.py
@@ -350,6 +350,23 @@ def test_ispc_streaming_stores():
     lp.generate_code_v2(knl).all_code()
 
 
+def test_cuda_short_vector():
+    knl = lp.make_kernel(
+        "{ [i]: 0<=i<n }",
+        "out[i] = 2*a[i]",
+        target=lp.CudaTarget())
+
+    knl = lp.set_options(knl, write_code=True)
+    knl = lp.split_iname(knl, "i", 4, slabs=(0, 1), inner_tag="vec")
+    knl = lp.split_array_axis(knl, "a,out", axis_nr=0, count=4)
+    knl = lp.tag_array_axes(knl, "a,out", "C,vec")
+
+    knl = lp.set_options(knl, write_wrapper=True)
+    knl = lp.add_and_infer_dtypes(knl, {"a": np.float32})
+
+    print(lp.generate_code_v2(knl).device_code())
+
+
 if __name__ == "__main__":
     if len(sys.argv) > 1:
         exec(sys.argv[1])
-- 
GitLab