diff --git a/sumpy/p2p.py b/sumpy/p2p.py index b76e78900f74c32920d7f1c582784eaef1db35ee..e7b47d1e68a464dba2cabc3e396ca0a9d09d389e 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -201,8 +201,7 @@ class SingleSrcTgtListP2PBase(P2PComputationBase): if targets_is_obj_array: knl = lp.tag_array_axes(knl, "targets", "sep,C") - # FIXME: how to split when using blocks? - # knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0") + knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0") return knl @@ -300,9 +299,8 @@ class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase): ] def get_domains(self): - # FIXME: this doesn't work when separating j and k return [ - "{[i]: 0 <= i < nranges - 1}", + "{[irange]: 0 <= irange < nranges - 1}", "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}", "{[idim]: 0 <= idim < dim}" ] @@ -310,12 +308,12 @@ class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase): def get_loop_begin(self): return [ """ - for i - <> tgtstart = tgtranges[i] - <> tgtend = tgtranges[i + 1] + for irange + <> tgtstart = tgtranges[irange] + <> tgtend = tgtranges[irange + 1] <> tgt_length = tgtend - tgtstart - <> srcstart = srcranges[i] - <> srcend = srcranges[i + 1] + <> srcstart = srcranges[irange] + <> srcend = srcranges[irange + 1] <> src_length = srcend - srcstart for j, k <> itgt = tgtindices[tgtstart + j] @@ -341,11 +339,10 @@ class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase): ] def get_result_store_instructions(self): - # FIXME: doesn't work without inames=i. check how the loops are nested! return [ """ result_{i}[tgtstart + j, srcstart + k] = \ - knl_{i}_scaling * pair_result_{i} {{inames=i}} + knl_{i}_scaling * pair_result_{i} {{inames=irange}} """.format(i=iknl) for iknl in range(len(self.kernels)) ] @@ -353,14 +350,27 @@ class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase): def get_assumptions(self): return "nranges>=2" + def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array): + # FIXME + knl = self.get_kernel() + + if sources_is_obj_array: + knl = lp.tag_array_axes(knl, "sources", "sep,C") + if targets_is_obj_array: + knl = lp.tag_array_axes(knl, "targets", "sep,C") + + knl = lp.split_iname(knl, "irange", 128, outer_tag="g.0") + return knl + def __call__(self, queue, targets, sources, tgtindices, srcindices, tgtranges, srcranges, **kwargs): from pytools.obj_array import is_obj_array - knl = self.get_optimized_kernel( + knl = self.get_cached_optimized_kernel( targets_is_obj_array=( is_obj_array(targets) or isinstance(targets, (tuple, list))), sources_is_obj_array=( is_obj_array(sources) or isinstance(sources, (tuple, list)))) + print(knl) return knl(queue, targets=targets, sources=sources, tgtindices=tgtindices, srcindices=srcindices, diff --git a/sumpy/qbx.py b/sumpy/qbx.py index 64ea5578c9a24638b9becb0c4dfd907cae49023c..52db31f0145b16a6fdc88482fbc0fba541659e23 100644 --- a/sumpy/qbx.py +++ b/sumpy/qbx.py @@ -212,7 +212,6 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper): def get_optimized_kernel(self): # FIXME specialize/tune for GPU/CPU loopy_knl = self.get_kernel() - return loopy_knl # FIXME: how to tune for blocks? import pyopencl as cl @@ -335,7 +334,7 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): def get_domains(self): # FIXME: this doesn't work when separating j and k return [ - "{[i]: 0 <= i < nranges - 1}", + "{[irange]: 0 <= irange < nranges - 1}", "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}", "{[idim]: 0 <= idim < dim}" ] @@ -343,12 +342,12 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): def get_loop_begin(self): return [ """ - for i - <> tgtstart = tgtranges[i] - <> tgtend = tgtranges[i + 1] + for irange + <> tgtstart = tgtranges[irange] + <> tgtend = tgtranges[irange + 1] <> tgt_length = tgtend - tgtstart - <> srcstart = srcranges[i] - <> srcend = srcranges[i + 1] + <> srcstart = srcranges[irange] + <> srcend = srcranges[irange + 1] <> src_length = srcend - srcstart for j, k <> itgt = tgtindices[tgtstart + j] @@ -377,7 +376,7 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): return [ """ result_KNLIDX[tgtstart + j, srcstart + k] = \ - knl_KNLIDX_scaling*pair_result_KNLIDX {inames=i} + knl_KNLIDX_scaling*pair_result_KNLIDX {inames=irange} """.replace("KNLIDX", str(iknl)) for iknl in range(len(self.expansions)) ] @@ -385,6 +384,14 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): def get_assumptions(self): return "nranges>=2" + @memoize_method + def get_optimized_kernel(self): + # FIXME + loopy_knl = self.get_kernel() + + loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0") + return loopy_knl + def __call__(self, queue, targets, sources, centers, expansion_radii, tgtindices, srcindices, tgtranges, srcranges, **kwargs): knl = self.get_optimized_kernel()