From 698cc8721466ae242ba5c21bc71a606e0ed254bf Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lawrence@wence.uk>
Date: Wed, 2 Oct 2019 20:21:42 +0100
Subject: [PATCH 1/2] target: Fix order of Const application for temporary
 declarations

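A recent bugfix in cgen exposed incorrect usage here. For a read-only
array temporary we applied Const before ArrayOf, which produced
ArrayOf(Const(POD)). Apply Const after ArrayOf instead, so that we ask
for Const(ArrayOf(POD)).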
---
 loopy/target/c/__init__.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)
diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py
index 25b190809..892b2fa36 100644
--- a/loopy/target/c/__init__.py
+++ b/loopy/target/c/__init__.py
@@ -759,10 +759,6 @@ class CASTBuilder(ASTBuilderBase):
     def get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info):
         temp_var_decl = POD(self, decl_info.dtype, decl_info.name)
 
-        if temp_var.read_only:
-            from cgen import Const
-            temp_var_decl = Const(temp_var_decl)
-
         if decl_info.shape:
             from cgen import ArrayOf
             ecm = self.get_expression_to_code_mapper(codegen_state)
@@ -770,6 +766,10 @@ class CASTBuilder(ASTBuilderBase):
                     ecm(p.flattened_product(decl_info.shape),
                         prec=PREC_NONE, type_context="i"))
 
+        if temp_var.read_only:
+            from cgen import Const
+            temp_var_decl = Const(temp_var_decl)
+
         if temp_var.alignment:
             from cgen import AlignedAttribute
             temp_var_decl = AlignedAttribute(temp_var.alignment, temp_var_decl)
-- 
GitLab


From 9b7640784d900673663a9fd5cb47ae9fe69a502c Mon Sep 17 00:00:00 2001
From: Lawrence Mitchell <lawrence@wence.uk>
Date: Thu, 3 Oct 2019 08:25:13 +0100
Subject: [PATCH 2/2] tests: Fix reference string comparison output

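cgen now parenthesises the declared name in pointer and array
declarators (e.g. `float (a_fetch)[16]`, `float *__restrict__ (T)`).
Update the expected output in the tutorial and the reference strings in
test_header_extract to match.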
---
 doc/tutorial.rst   | 26 +++++++++++++-------------
 test/test_loopy.py |  4 ++--
 2 files changed, 15 insertions(+), 15 deletions(-)

diff --git a/doc/tutorial.rst b/doc/tutorial.rst
index 753b09b5d..ec2960080 100644
--- a/doc/tutorial.rst
+++ b/doc/tutorial.rst
@@ -188,7 +188,7 @@ by passing :attr:`loopy.Options.write_cl`.
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ (a), int const n, __global float *__restrict__ (out))
     {
       for (int i = 0; i <= -1 + n; ++i)
         out[i] = 2.0f * a[i];
@@ -262,7 +262,7 @@ call :func:`loopy.generate_code`:
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ (a), int const n, __global float *__restrict__ (out))
     {
       for (int i = 0; i <= -1 + n; ++i)
         out[i] = 2.0f * a[i];
@@ -275,7 +275,7 @@ the :func:`loopy.generate_header`:
 
     >>> header = str(lp.generate_header(typed_knl)[0])
     >>> print(header)
-    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out);
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ (a), int const n, __global float *__restrict__ (out));
 
 .. }}}
 
@@ -386,7 +386,7 @@ Let us take a look at the generated code for the above kernel:
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ (a), int const n, __global float *__restrict__ (out))
     {
       for (int i = 0; i <= -1 + n; ++i)
         for (int j = 0; j <= -1 + n; ++j)
@@ -435,7 +435,7 @@ Now the intended code is generated and our test passes.
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
+    __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ (a), int const n, __global float *__restrict__ (out))
     {
       for (int i = 0; i <= -1 + n; ++i)
         for (int j = 0; j <= -1 + n; ++j)
@@ -720,7 +720,7 @@ Let's try this out on our vector fill kernel by creating workgroups of size
     >>> evt, (out,) = knl(queue, a=x_vec_dev)
     #define lid(N) ((int) get_local_id(N))
     ...
-    __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n)
+    __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *__restrict__ (a), int const n)
     {
       if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0)
         a[128 * gid(0) + lid(0)] = 0.0f;
@@ -952,7 +952,7 @@ Consider the following example:
     #define lid(N) ((int) get_local_id(N))
     ...
     {
-      __local float a_temp[16];
+      __local float (a_temp)[16];
       float acc_k;
     <BLANKLINE>
       if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0)
@@ -1283,7 +1283,7 @@ The kernel translates into two OpenCL kernels.
    #define lid(N) ((int) get_local_id(N))
    #define gid(N) ((int) get_group_id(N))
    <BLANKLINE>
-   __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot)
+   __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ (arr), int const n, __global int *__restrict__ (tmp_save_slot))
    {
      int tmp;
    <BLANKLINE>
@@ -1291,7 +1291,7 @@ The kernel translates into two OpenCL kernels.
      tmp_save_slot[16 * gid(0) + lid(0)] = tmp;
    }
    <BLANKLINE>
-   __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot)
+   __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ (arr), int const n, __global int *__restrict__ (tmp_save_slot))
    {
      int tmp;
    <BLANKLINE>
@@ -1476,9 +1476,9 @@ When we ask to see the code, the issue becomes apparent:
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *__restrict__ a, int const n, __global float *__restrict__ out)
+    __kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *__restrict__ (a), int const n, __global float *__restrict__ (out))
     {
-      float a_fetch[16];
+      float (a_fetch)[16];
     <BLANKLINE>
       ...
           a_fetch[lid(0)] = a[n * (16 * gid(1) + lid(0)) + 16 * gid(0) + lid(1)];
@@ -1878,9 +1878,9 @@ Now to make things more interesting, we'll create a kernel with barriers:
     #define lid(N) ((int) get_local_id(N))
     #define gid(N) ((int) get_group_id(N))
     <BLANKLINE>
-    __kernel void __attribute__ ((reqd_work_group_size(97, 1, 1))) loopy_kernel(__global int const *__restrict__ a, __global int *__restrict__ e)
+    __kernel void __attribute__ ((reqd_work_group_size(97, 1, 1))) loopy_kernel(__global int const *__restrict__ (a), __global int *__restrict__ (e))
     {
-      __local int c[50 * 10 * 99];
+      __local int (c)[50 * 10 * 99];
     <BLANKLINE>
       {
         int const k_outer = 0;
diff --git a/test/test_loopy.py b/test/test_loopy.py
index d101f6fd0..e59cafdc3 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -1916,7 +1916,7 @@ def test_header_extract():
     #test C
     cknl = knl.copy(target=lp.CTarget())
     assert str(lp.generate_header(cknl)[0]) == (
-            'void loopy_kernel(float *__restrict__ T);')
+            'void loopy_kernel(float *__restrict__ (T));')
 
     #test CUDA
     cuknl = knl.copy(target=lp.CudaTarget())
@@ -1928,7 +1928,7 @@ def test_header_extract():
     oclknl = knl.copy(target=lp.PyOpenCLTarget())
     assert str(lp.generate_header(oclknl)[0]) == (
             '__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) '
-            'loopy_kernel(__global float *__restrict__ T);')
+            'loopy_kernel(__global float *__restrict__ (T));')
 
 
 def test_scalars_with_base_storage(ctx_factory):
-- 
GitLab