From 5d90fcf4ad3b23aaa60c9a838da995aa1556efa3 Mon Sep 17 00:00:00 2001 From: Andreas Kloeckner Date: Thu, 23 Jun 2011 00:05:46 -0400 Subject: [PATCH] Fix multiple bugs in M2P kernel. --- sumpy/m2p.py | 64 +++++++++++++++++++++++++++++++------------- sumpy/p2m.py | 4 +-- test/test_kernels.py | 20 +++++++++++--- 3 files changed, 63 insertions(+), 25 deletions(-) diff --git a/sumpy/m2p.py b/sumpy/m2p.py index 82558a97..b86009ed 100644 --- a/sumpy/m2p.py +++ b/sumpy/m2p.py @@ -1,6 +1,7 @@ from __future__ import division import numpy as np +import sympy as sp from mako.template import Template from pytools import memoize_method @@ -15,7 +16,7 @@ from sumpy.kernel_common import ( M2P_KERNEL = Template( COMMON_PREAMBLE + -"""//CL// +r"""//CL// typedef ${offset_type} offset_t; typedef ${mpole_offset_type} mpole_offset_t; typedef ${coefficient_type} coeff_t; @@ -33,7 +34,7 @@ typedef ${output_type} output_t; // --------------- // Each work item is responsible for computing the summed-up multipole expansions // at one target point. Each work group is responsible for one target cell. -// To achieve the, the kernel loops first over batches of particles within the target +// To achieve this, the kernel loops first over batches of particles within the target // cell (if necessary) and then over source cells. The outer "loop over source // cell batches" orchestrates cooperative loading of multipole coefficients. @@ -65,31 +66,50 @@ void m2p( __local coeff_t mpole_coeff_l[${par_cell_cnt} * ${ctr_coeff_cnt_size}]; // index into this cell's list of particles - uint plist_idx = lid; + uint plist_base = 0; // loop over particle batches - while (plist_idx < tgt_cell_particle_count) + + // NOTE: Since there is barrier synchronization at the innermost + // level of the loop, all work items must participate until the + // end of the loop, whether they need to or not. + + // TODO: Convert to batched loop + while (plist_base < tgt_cell_particle_count) { + uint plist_idx = plist_base+lid; geometry_vec_t tgt; - ${load_vector_g("tgt", "t", "tgt_cell_particle_offset + plist_idx")} + + if (plist_idx < tgt_cell_particle_count) + { + ${load_vector_g("tgt", "t", "tgt_cell_particle_offset + plist_idx")} + } % for i in range(output_count): output_t output${i} = 0; % endfor - // index into overall M2P interaction list - offset_t ilist_idx = ilist_start + get_local_id(1); + offset_t ilist_base = ilist_start; // loop over source cell batches - while (ilist_idx < ilist_end) + // TODO: Convert to batched loop + while (ilist_base < ilist_end) { + // index into overall M2P interaction list + offset_t ilist_idx = ilist_base + get_local_id(1); + barrier(CLK_LOCAL_MEM_FENCE); - mpole_offset_t mpole_offset = m2p_ilist_mpole_offsets_g[ilist_idx]; - mpole_coeff_l[lid] = mpole_coeff_g[mpole_offset + get_local_id(0)]; + if (ilist_idx < ilist_end) + { + mpole_offset_t mpole_offset = m2p_ilist_mpole_offsets_g[ilist_idx]; + mpole_coeff_l[lid] = mpole_coeff_g[mpole_offset + get_local_id(0)]; + } barrier(CLK_LOCAL_MEM_FENCE); + uint batch_cell_cnt = min(${par_cell_cnt}, ilist_end-ilist_base); + // loop over source cells - for (uint src_cell = 0; src_cell < ${par_cell_cnt}; ++src_cell) + for (uint src_cell = 0; src_cell < batch_cell_cnt; ++src_cell) { uint loc_mpole_base = src_cell*${ctr_coeff_cnt_size}; geometry_vec_t ctr; @@ -98,7 +118,7 @@ void m2p( % endfor loc_mpole_base += ${dimensions}; - // FIXME: Are mpole_coeff_l actually used? + #define COEFF(i) mpole_coeff_l[loc_mpole_base+${dimensions}+i] % for var, expr in vars_and_exprs: % if var.startswith("output"): @@ -107,16 +127,19 @@ void m2p( output_t ${var} = ${expr}; % endif % endfor - } + } - ilist_idx += ${par_cell_cnt}; + ilist_base += ${par_cell_cnt}; } - % for i in range(output_count): - output${i}_g[tgt_cell_particle_offset + plist_idx] = output${i}; - % endfor + if (plist_idx < tgt_cell_particle_count) + { + % for i in range(output_count): + output${i}_g[tgt_cell_particle_offset + plist_idx] = output${i}; + % endfor + } - plist_idx += ${wg_size}; + plist_base += ${wg_size}; } } """, strict_undefined=True, disable_unicode=True) @@ -159,7 +182,9 @@ class M2PKernel(object): outputs = [ ("output%d" % output_idx, - output_map(sum(tc_basis_fun for tc_basis_fun in tc_basis))) + output_map(sum( + sp.Symbol("COEFF(%d)" % i)*tc_basis_fun + for i, tc_basis_fun in enumerate(tc_basis)))) for output_idx, output_map in enumerate(self.output_maps) ] @@ -189,6 +214,7 @@ class M2PKernel(object): output_type=dtype_to_ctype(output_dtype), ) + print kernel_src prg = cl.Program(self.context, kernel_src).build(self.options) kernel = getattr(prg, self.name) kernel.set_scalar_arg_dtypes( diff --git a/sumpy/p2m.py b/sumpy/p2m.py index c5ea0c24..45c9910f 100644 --- a/sumpy/p2m.py +++ b/sumpy/p2m.py @@ -182,7 +182,7 @@ class P2MKernel(object): from pytools import div_ceil wg_count = div_ceil(cell_count, wg_size) - kernel(queue, (wg_count*wg_size,), (wg_size,), + kernel(queue, (wg_count,), (wg_size,), *( [cell_count] + [cell_idx_to_particle_offset.data, @@ -191,7 +191,7 @@ class P2MKernel(object): + [strength.data] + [ctr_i.data for ctr_i in cell_centers] + [mpole_coeff.data] - ), wait_for=wait_for) + ), wait_for=wait_for, g_times_l=True) return mpole_coeff diff --git a/test/test_kernels.py b/test/test_kernels.py index 5df81f77..81e56c9e 100644 --- a/test/test_kernels.py +++ b/test/test_kernels.py @@ -74,10 +74,12 @@ def test_p2m2p(ctx_getter): ctx = ctx_getter() queue = cl.CommandQueue(ctx) + res = 10 + dimensions = 3 sources = np.random.rand(dimensions, 5).astype(np.float32) - targets = np.random.rand(dimensions, 10).astype(np.float32) - targets[0] += 2 + #targets = np.random.rand(dimensions, 10).astype(np.float32) + targets = np.mgrid[-2:2:res*1j, -2:2:res*1j, 2:2:1j].reshape(3, -1).astype(np.float32) centers = np.array([[0.5]]*dimensions).astype(np.float32) from sumpy.tools import vector_to_device @@ -102,7 +104,7 @@ def test_p2m2p(ctx_getter): make_coulomb_kernel_in("b", dimensions), order=2, dimensions=dimensions) - coeff_dtype = np.float64 + coeff_dtype = np.float32 # {{{ apply P2M @@ -146,7 +148,17 @@ def test_p2m2p(ctx_getter): potential_dev_direct, x_derivative_dir = knl(targets_dev, sources_dev, strengths_dev) - print potential_dev - potential_dev_direct + if 0: + import matplotlib.pyplot as pt + pt.imshow(potential_dev_direct.get().reshape(res, res)) + pt.show() + pt.imshow(potential_dev.get().reshape(res, res)) + pt.show() + + print potential_dev-potential_dev_direct + print potential_dev + print potential_dev_direct + #print mpole_coeff # }}} -- GitLab