diff --git a/doc/tutorial.rst b/doc/tutorial.rst index 753b09b5da42835b88a000bc0400fa18a254d80f..ec296008060e82c5c60fdb4c0246d75815dbb47f 100644 --- a/doc/tutorial.rst +++ b/doc/tutorial.rst @@ -188,7 +188,7 @@ by passing :attr:`loopy.Options.write_cl`. #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ (a), int const n, __global float *__restrict__ (out)) { for (int i = 0; i <= -1 + n; ++i) out[i] = 2.0f * a[i]; @@ -262,7 +262,7 @@ call :func:`loopy.generate_code`: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ (a), int const n, __global float *__restrict__ (out)) { for (int i = 0; i <= -1 + n; ++i) out[i] = 2.0f * a[i]; @@ -275,7 +275,7 @@ the :func:`loopy.generate_header`: >>> header = str(lp.generate_header(typed_knl)[0]) >>> print(header) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out); + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ (a), int const n, __global float *__restrict__ (out)); .. }}} @@ -386,7 +386,7 @@ Let us take a look at the generated code for the above kernel: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ (a), int const n, __global float *__restrict__ (out)) { for (int i = 0; i <= -1 + n; ++i) for (int j = 0; j <= -1 + n; ++j) @@ -435,7 +435,7 @@ Now the intended code is generated and our test passes. #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) loopy_kernel(__global float const *__restrict__ (a), int const n, __global float *__restrict__ (out)) { for (int i = 0; i <= -1 + n; ++i) for (int j = 0; j <= -1 + n; ++j) @@ -720,7 +720,7 @@ Let's try this out on our vector fill kernel by creating workgroups of size >>> evt, (out,) = knl(queue, a=x_vec_dev) #define lid(N) ((int) get_local_id(N)) ... - __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *__restrict__ a, int const n) + __kernel void __attribute__ ((reqd_work_group_size(128, 1, 1))) loopy_kernel(__global float *__restrict__ (a), int const n) { if (-1 + -128 * gid(0) + -1 * lid(0) + n >= 0) a[128 * gid(0) + lid(0)] = 0.0f; @@ -952,7 +952,7 @@ Consider the following example: #define lid(N) ((int) get_local_id(N)) ... { - __local float a_temp[16]; + __local float (a_temp)[16]; float acc_k; if (-1 + -16 * gid(0) + -1 * lid(0) + n >= 0) @@ -1283,7 +1283,7 @@ The kernel translates into two OpenCL kernels. #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot) + __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2(__global int *__restrict__ (arr), int const n, __global int *__restrict__ (tmp_save_slot)) { int tmp; @@ -1291,7 +1291,7 @@ The kernel translates into two OpenCL kernels. tmp_save_slot[16 * gid(0) + lid(0)] = tmp; } - __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ arr, int const n, __global int *__restrict__ tmp_save_slot) + __kernel void __attribute__ ((reqd_work_group_size(16, 1, 1))) rotate_v2_0(__global int *__restrict__ (arr), int const n, __global int *__restrict__ (tmp_save_slot)) { int tmp; @@ -1476,9 +1476,9 @@ When we ask to see the code, the issue becomes apparent: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *__restrict__ a, int const n, __global float *__restrict__ out) + __kernel void __attribute__ ((reqd_work_group_size(16, 16, 1))) transpose(__global float const *__restrict__ (a), int const n, __global float *__restrict__ (out)) { - float a_fetch[16]; + float (a_fetch)[16]; ... a_fetch[lid(0)] = a[n * (16 * gid(1) + lid(0)) + 16 * gid(0) + lid(1)]; @@ -1878,9 +1878,9 @@ Now to make things more interesting, we'll create a kernel with barriers: #define lid(N) ((int) get_local_id(N)) #define gid(N) ((int) get_group_id(N)) - __kernel void __attribute__ ((reqd_work_group_size(97, 1, 1))) loopy_kernel(__global int const *__restrict__ a, __global int *__restrict__ e) + __kernel void __attribute__ ((reqd_work_group_size(97, 1, 1))) loopy_kernel(__global int const *__restrict__ (a), __global int *__restrict__ (e)) { - __local int c[50 * 10 * 99]; + __local int (c)[50 * 10 * 99]; { int const k_outer = 0; diff --git a/loopy/target/c/__init__.py b/loopy/target/c/__init__.py index 25b190809fdc38341c811ede15a8baae693a3116..892b2fa3672a34306456b68887bb67a75647bea0 100644 --- a/loopy/target/c/__init__.py +++ b/loopy/target/c/__init__.py @@ -759,10 +759,6 @@ class CASTBuilder(ASTBuilderBase): def get_temporary_decl(self, codegen_state, schedule_index, temp_var, decl_info): temp_var_decl = POD(self, decl_info.dtype, decl_info.name) - if temp_var.read_only: - from cgen import Const - temp_var_decl = Const(temp_var_decl) - if decl_info.shape: from cgen import ArrayOf ecm = self.get_expression_to_code_mapper(codegen_state) @@ -770,6 +766,10 @@ class CASTBuilder(ASTBuilderBase): ecm(p.flattened_product(decl_info.shape), prec=PREC_NONE, type_context="i")) + if temp_var.read_only: + from cgen import Const + temp_var_decl = Const(temp_var_decl) + if temp_var.alignment: from cgen import AlignedAttribute temp_var_decl = AlignedAttribute(temp_var.alignment, temp_var_decl) diff --git a/test/test_loopy.py b/test/test_loopy.py index d101f6fd0ce86f74a76416305857c3c681c0722d..e59cafdc3d01cb624b4fa99137a81139a5d91102 100644 --- a/test/test_loopy.py +++ b/test/test_loopy.py @@ -1916,7 +1916,7 @@ def test_header_extract(): #test C cknl = knl.copy(target=lp.CTarget()) assert str(lp.generate_header(cknl)[0]) == ( - 'void loopy_kernel(float *__restrict__ T);') + 'void loopy_kernel(float *__restrict__ (T));') #test CUDA cuknl = knl.copy(target=lp.CudaTarget()) @@ -1928,7 +1928,7 @@ def test_header_extract(): oclknl = knl.copy(target=lp.PyOpenCLTarget()) assert str(lp.generate_header(oclknl)[0]) == ( '__kernel void __attribute__ ((reqd_work_group_size(1, 1, 1))) ' - 'loopy_kernel(__global float *__restrict__ T);') + 'loopy_kernel(__global float *__restrict__ (T));') def test_scalars_with_base_storage(ctx_factory):