diff --git a/sumpy/p2p.py b/sumpy/p2p.py index 8c37b0e06ea93f5cf07c561b05b188671f6de36d..d8586cc228e47d6aa9f12d7c7c436726ad28bc2e 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -120,10 +120,26 @@ class SingleSrcTgtListP2PBase(P2PComputationBase): shape=(self.dim, "nsources")), lp.GlobalArg("targets", None, shape=(self.dim, "ntargets")), - lp.ValueArg("nsources", None), - lp.ValueArg("ntargets", None) + lp.ValueArg("nsources", np.int32), + lp.ValueArg("ntargets", np.int32) ] + def get_domains(self): + return [ + "{[itgt]: 0 <= itgt < ntargets}", + "{[isrc]: 0 <= isrc < nsources}", + "{[idim]: 0 <= idim < dim}" + ] + + def get_loop_begin(self): + return ["for itgt, isrc"] + + def get_loop_end(self): + return ["end"] + + def get_assumptions(self): + return "nsources>=1 and ntargets>=1" + def get_kernel(self): loopy_insns, result_names = self.get_loopy_insns_and_result_names() @@ -146,25 +162,24 @@ class SingleSrcTgtListP2PBase(P2PComputationBase): + gather_loopy_source_arguments(self.kernels)) loopy_knl = lp.make_kernel( - "{[isrc,itgt,idim]: 0<=itgt d[idim] = targets[idim,itgt] - sources[idim,isrc]"] + ["<> is_self = (isrc == target_to_source[itgt])" if self.exclude_self else ""] + + loopy_insns + [ lp.Assignment(id=None, assignee="pair_result_%d" % i, expression=expr, temp_var_type=lp.auto) for i, expr in enumerate(exprs) ] - + ["end"] + + self.get_loop_end() + self.get_result_store_instructions(), arguments, + assumptions=self.get_assumptions(), name=self.name, - assumptions="nsources>=1 and ntargets>=1", fixed_parameters=dict( dim=self.dim, nstrengths=self.strength_count, @@ -266,6 +281,103 @@ class P2PMatrixGenerator(SingleSrcTgtListP2PBase): # }}} +# {{{ + +class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase): + default_name = "p2p_block" + + def get_src_tgt_arguments(self): + return super(P2PMatrixBlockGenerator, self).get_src_tgt_arguments() \ + + [ + lp.GlobalArg("srcindices", None, shape="nsrcindices"), + lp.GlobalArg("tgtindices", None, shape="ntgtindices"), + lp.GlobalArg("srcranges", None, shape="nranges + 1"), + lp.GlobalArg("tgtranges", None, shape="nranges + 1"), + lp.ValueArg("nsrcindices", np.int32), + lp.ValueArg("ntgtindices", np.int32), + lp.ValueArg("nranges", None) + ] + + def get_domains(self): + return [ + "{[irange]: 0 <= irange < nranges}", + "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}", + "{[idim]: 0 <= idim < dim}" + ] + + def get_loop_begin(self): + return [ + """ + for irange + <> tgtstart = tgtranges[irange] + <> tgtend = tgtranges[irange + 1] + <> tgt_length = tgtend - tgtstart + <> srcstart = srcranges[irange] + <> srcend = srcranges[irange + 1] + <> src_length = srcend - srcstart + for j, k + <> itgt = tgtindices[tgtstart + j] + <> isrc = srcindices[srcstart + k] + """ + ] + + def get_loop_end(self): + return [ + """ + end + end + """ + ] + + def get_strength_or_not(self, isrc, kernel_idx): + return 1 + + def get_input_and_output_arguments(self): + return [ + lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices") + for i, dtype in enumerate(self.value_dtypes) + ] + + def get_result_store_instructions(self): + return [ + """ + result_{i}[tgtstart + j, srcstart + k] = \ + knl_{i}_scaling * pair_result_{i} {{inames=irange}} + """.format(i=iknl) + for iknl in range(len(self.kernels)) + ] + + def get_assumptions(self): + return "nranges>=1" + + def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array): + # FIXME + knl = self.get_kernel() + + if sources_is_obj_array: + knl = lp.tag_array_axes(knl, "sources", "sep,C") + if targets_is_obj_array: + knl = lp.tag_array_axes(knl, "targets", "sep,C") + + knl = lp.split_iname(knl, "irange", 128, outer_tag="g.0") + return knl + + def __call__(self, queue, targets, sources, tgtindices, srcindices, + tgtranges, srcranges, **kwargs): + from pytools.obj_array import is_obj_array + knl = self.get_cached_optimized_kernel( + targets_is_obj_array=( + is_obj_array(targets) or isinstance(targets, (tuple, list))), + sources_is_obj_array=( + is_obj_array(sources) or isinstance(sources, (tuple, list)))) + + return knl(queue, targets=targets, sources=sources, + tgtindices=tgtindices, srcindices=srcindices, + tgtranges=tgtranges, srcranges=srcranges, **kwargs) + +# }}} + + # {{{ P2P from CSR-like interaction list class P2PFromCSR(P2PComputationBase): diff --git a/sumpy/qbx.py b/sumpy/qbx.py index cde20ef235ee2cd7f5b7dfdf57e30be2d4d8c20c..0f617e62990c4aad8f187b03d22e2ac9c72b46fd 100644 --- a/sumpy/qbx.py +++ b/sumpy/qbx.py @@ -1,6 +1,9 @@ from __future__ import division, absolute_import -__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" +__copyright__ = """ +Copyright (C) 2012 Andreas Kloeckner +Copyright (C) 2018 Alexandru Fikl +""" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -116,10 +119,26 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper): lp.GlobalArg("center", None, shape=(self.dim, "ntargets"), order="C"), lp.GlobalArg("expansion_radii", None, shape="ntargets"), - lp.ValueArg("nsources", None), - lp.ValueArg("ntargets", None), + lp.ValueArg("nsources", np.int32), + lp.ValueArg("ntargets", np.int32), + ] + + def get_domains(self): + return [ + "{[itgt]: 0 <= itgt < ntargets}", + "{[isrc]: 0 <= isrc < nsources}", + "{[idim]: 0 <= idim < dim}" ] + def get_loop_begin(self): + return ["for itgt, isrc"] + + def get_loop_end(self): + return ["end"] + + def get_assumptions(self): + return "nsources>=1 and ntargets>=1" + @memoize_method def get_kernel(self): from sumpy.symbolic import make_sym_vector @@ -162,10 +181,9 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper): + gather_loopy_source_arguments(self.kernels)) loopy_knl = lp.make_kernel( - "{[isrc,itgt,idim]: 0<=itgt tgtstart = tgtranges[irange] + <> tgtend = tgtranges[irange + 1] + <> tgt_length = tgtend - tgtstart + <> srcstart = srcranges[irange] + <> srcend = srcranges[irange + 1] + <> src_length = srcend - srcstart + for j, k + <> itgt = tgtindices[tgtstart + j] + <> isrc = srcindices[srcstart + k] + """ + ] + + def get_loop_end(self): + return [ + """ + end + end + """ + ] + + def get_strength_or_not(self, isrc, kernel_idx): + return 1 + + def get_input_and_output_arguments(self): + return [ + lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices") + for i, dtype in enumerate(self.value_dtypes) + ] + + def get_result_store_instructions(self): + return [ + """ + result_KNLIDX[tgtstart + j, srcstart + k] = \ + knl_KNLIDX_scaling*pair_result_KNLIDX {inames=irange} + """.replace("KNLIDX", str(iknl)) + for iknl in range(len(self.expansions)) + ] + + def get_assumptions(self): + return "nranges>=1" + + @memoize_method + def get_optimized_kernel(self): + # FIXME + loopy_knl = self.get_kernel() + + loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0") + return loopy_knl + + def __call__(self, queue, targets, sources, centers, expansion_radii, + tgtindices, srcindices, tgtranges, srcranges, **kwargs): + knl = self.get_optimized_kernel() + + return knl(queue, src=sources, tgt=targets, center=centers, + expansion_radii=expansion_radii, + tgtindices=tgtindices, srcindices=srcindices, + tgtranges=tgtranges, srcranges=srcranges, **kwargs) + +# }}} + # }}} diff --git a/sumpy/version.py b/sumpy/version.py index 98f05a2db719067b04cb7b36457b24c84b4ddddf..7edeb71ff7b0f7325e82ac21ed43b7c3c5cebc15 100644 --- a/sumpy/version.py +++ b/sumpy/version.py @@ -29,4 +29,4 @@ VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS # branch name, so as to avoid conflicts with the master branch. Make sure # to reset this to the next number up with "master" before merging into # master. -KERNEL_VERSION = ("master", 28) +KERNEL_VERSION = ("master", 29) diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py index 244a555b30e8be3a10c0bf635f936d89f3ebdce7..ed5cbae7039ed56df6882352354fe2904f807326 100644 --- a/test/test_matrixgen.py +++ b/test/test_matrixgen.py @@ -73,15 +73,19 @@ def test_qbx_direct(ctx_getter): from sumpy.kernel import LaplaceKernel lknl = LaplaceKernel(2) + nblks = 10 order = 12 mode_nr = 25 from sumpy.qbx import LayerPotential from sumpy.qbx import LayerPotentialMatrixGenerator + from sumpy.qbx import LayerPotentialMatrixBlockGenerator from sumpy.expansion.local import LineTaylorLocalExpansion + lpot = LayerPotential(ctx, [LineTaylorLocalExpansion(lknl, order)]) mat_gen = LayerPotentialMatrixGenerator(ctx, [LineTaylorLocalExpansion(lknl, order)]) - lpot = LayerPotential(ctx, [LineTaylorLocalExpansion(lknl, order)]) + blk_gen = LayerPotentialMatrixBlockGenerator(ctx, + [LineTaylorLocalExpansion(lknl, order)]) for n in [200, 300, 400]: targets, sources, centers, sigma, expansion_radii = \ @@ -90,16 +94,37 @@ def test_qbx_direct(ctx_getter): h = 2 * np.pi / n strengths = (sigma * h,) + tgtindices = np.arange(0, n) + tgtindices = np.random.choice(tgtindices, size=int(0.8 * n)) + tgtranges = np.arange(0, tgtindices.shape[0] + 1, + tgtindices.shape[0] // nblks) + srcindices = np.arange(0, n) + srcindices = np.random.choice(srcindices, size=int(0.8 * n)) + srcranges = np.arange(0, srcindices.shape[0] + 1, + srcindices.shape[0] // nblks) + assert tgtranges.shape == srcranges.shape + _, (mat,) = mat_gen(queue, targets, sources, centers, - expansion_radii=expansion_radii) + expansion_radii) result_mat = mat.dot(strengths[0]) _, (result_lpot,) = lpot(queue, targets, sources, centers, strengths, - expansion_radii=expansion_radii) + expansion_radii) eps = 1.0e-10 * la.norm(result_lpot) assert la.norm(result_mat - result_lpot) < eps + _, (blk,) = blk_gen(queue, targets, sources, centers, expansion_radii, + tgtindices, srcindices, tgtranges, srcranges) + + for i in range(srcranges.shape[0] - 1): + itgt = np.s_[tgtranges[i]:tgtranges[i + 1]] + isrc = np.s_[srcranges[i]:srcranges[i + 1]] + block = np.ix_(tgtindices[itgt], srcindices[isrc]) + + eps = 1.0e-10 * la.norm(mat[block]) + assert la.norm(blk[itgt, isrc] - mat[block]) < eps + @pytest.mark.parametrize("exclude_self", [True, False]) def test_p2p_direct(ctx_getter, exclude_self): @@ -112,12 +137,14 @@ def test_p2p_direct(ctx_getter, exclude_self): from sumpy.kernel import LaplaceKernel lknl = LaplaceKernel(2) + nblks = 10 mode_nr = 25 from sumpy.p2p import P2P - from sumpy.p2p import P2PMatrixGenerator - mat_gen = P2PMatrixGenerator(ctx, [lknl], exclude_self=exclude_self) + from sumpy.p2p import P2PMatrixGenerator, P2PMatrixBlockGenerator lpot = P2P(ctx, [lknl], exclude_self=exclude_self) + mat_gen = P2PMatrixGenerator(ctx, [lknl], exclude_self=exclude_self) + blk_gen = P2PMatrixBlockGenerator(ctx, [lknl], exclude_self=exclude_self) for n in [200, 300, 400]: targets, sources, _, sigma, _ = \ @@ -126,6 +153,16 @@ def test_p2p_direct(ctx_getter, exclude_self): h = 2 * np.pi / n strengths = (sigma * h,) + tgtindices = np.arange(0, n) + tgtindices = np.random.choice(tgtindices, size=int(0.8 * n)) + tgtranges = np.arange(0, tgtindices.shape[0] + 1, + tgtindices.shape[0] // nblks) + srcindices = np.arange(0, n) + srcindices = np.random.choice(srcindices, size=int(0.8 * n)) + srcranges = np.arange(0, srcindices.shape[0] + 1, + srcindices.shape[0] // nblks) + assert tgtranges.shape == srcranges.shape + extra_kwargs = {} if exclude_self: extra_kwargs["target_to_source"] = np.arange(n, dtype=np.int32) @@ -139,6 +176,18 @@ def test_p2p_direct(ctx_getter, exclude_self): eps = 1.0e-10 * la.norm(result_lpot) assert la.norm(result_mat - result_lpot) < eps + _, (blk,) = blk_gen(queue, targets, sources, + tgtindices, srcindices, tgtranges, srcranges, + **extra_kwargs) + + for i in range(srcranges.shape[0] - 1): + itgt = np.s_[tgtranges[i]:tgtranges[i + 1]] + isrc = np.s_[srcranges[i]:srcranges[i + 1]] + block = np.ix_(tgtindices[itgt], srcindices[isrc]) + + eps = 1.0e-10 * la.norm(mat[block]) + assert la.norm(blk[itgt, isrc] - mat[block]) < eps + # You can test individual routines by typing # $ python test_kernels.py 'test_p2p(cl.create_some_context)'