Skip to content
Snippets Groups Projects
Commit 2df8cd3b authored by Alexandru Fikl's avatar Alexandru Fikl
Browse files

add block matrix generators

parent d06c01d5
No related branches found
No related tags found
1 merge request!73Add block matrix generators
......@@ -120,10 +120,26 @@ class SingleSrcTgtListP2PBase(P2PComputationBase):
shape=(self.dim, "nsources")),
lp.GlobalArg("targets", None,
shape=(self.dim, "ntargets")),
lp.ValueArg("nsources", None),
lp.ValueArg("ntargets", None)
lp.ValueArg("nsources", np.int32),
lp.ValueArg("ntargets", np.int32)
]
def get_domains(self):
return [
"{[itgt]: 0 <= itgt < ntargets}",
"{[isrc]: 0 <= isrc < nsources}",
"{[idim]: 0 <= idim < dim}"
]
def get_loop_begin(self):
return ["for itgt, isrc"]
def get_loop_end(self):
return ["end"]
def get_assumptions(self):
return "nsources>=1 and ntargets>=1"
def get_kernel(self):
loopy_insns, result_names = self.get_loopy_insns_and_result_names()
......@@ -146,25 +162,24 @@ class SingleSrcTgtListP2PBase(P2PComputationBase):
+ gather_loopy_source_arguments(self.kernels))
loopy_knl = lp.make_kernel(
"{[isrc,itgt,idim]: 0<=itgt<ntargets and 0<=isrc<nsources \
and 0<=idim<dim}",
self.get_domains(),
self.get_kernel_scaling_assignments()
+ ["for itgt, isrc"]
+ loopy_insns
+ self.get_loop_begin()
+ ["<> d[idim] = targets[idim,itgt] - sources[idim,isrc]"]
+ ["<> is_self = (isrc == target_to_source[itgt])"
if self.exclude_self else ""]
+ loopy_insns
+ [
lp.Assignment(id=None,
assignee="pair_result_%d" % i, expression=expr,
temp_var_type=lp.auto)
for i, expr in enumerate(exprs)
]
+ ["end"]
+ self.get_loop_end()
+ self.get_result_store_instructions(),
arguments,
assumptions=self.get_assumptions(),
name=self.name,
assumptions="nsources>=1 and ntargets>=1",
fixed_parameters=dict(
dim=self.dim,
nstrengths=self.strength_count,
......@@ -186,7 +201,8 @@ class SingleSrcTgtListP2PBase(P2PComputationBase):
if targets_is_obj_array:
knl = lp.tag_array_axes(knl, "targets", "sep,C")
knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0")
# FIXME: how to split when using blocks?
# knl = lp.split_iname(knl, "itgt", 1024, outer_tag="g.0")
return knl
......@@ -266,6 +282,93 @@ class P2PMatrixGenerator(SingleSrcTgtListP2PBase):
# }}}
# {{{
class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase):
default_name = "p2p_block"
def get_src_tgt_arguments(self):
return super(P2PMatrixBlockGenerator, self).get_src_tgt_arguments() \
+ [
lp.GlobalArg("srcindices", None, shape="nsrcindices"),
lp.GlobalArg("tgtindices", None, shape="ntgtindices"),
lp.GlobalArg("srcranges", None, shape="nranges"),
lp.GlobalArg("tgtranges", None, shape="nranges"),
lp.ValueArg("nsrcindices", np.int32),
lp.ValueArg("ntgtindices", np.int32),
lp.ValueArg("nranges", None)
]
def get_domains(self):
# FIXME: this doesn't work when separating j and k
return [
"{[i]: 0 <= i < nranges - 1}",
"{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}",
"{[idim]: 0 <= idim < dim}"
]
def get_loop_begin(self):
return [
"""
for i
<> tgtstart = tgtranges[i]
<> tgtend = tgtranges[i + 1]
<> tgt_length = tgtend - tgtstart
<> srcstart = srcranges[i]
<> srcend = srcranges[i + 1]
<> src_length = srcend - srcstart
for j, k
<> itgt = tgtindices[tgtstart + j]
<> isrc = srcindices[srcstart + k]
"""
]
def get_loop_end(self):
return [
"""
end
end
"""
]
def get_strength_or_not(self, isrc, kernel_idx):
return 1
def get_input_and_output_arguments(self):
return [
lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices")
for i, dtype in enumerate(self.value_dtypes)
]
def get_result_store_instructions(self):
# FIXME: doesn't work without inames=i. check how the loops are nested!
return [
"""
result_{i}[tgtstart + j, srcstart + k] = \
knl_{i}_scaling * pair_result_{i} {{inames=i}}
""".format(i=iknl)
for iknl in range(len(self.kernels))
]
def get_assumptions(self):
return "nranges>=2"
def __call__(self, queue, targets, sources, tgtindices, srcindices,
tgtranges, srcranges, **kwargs):
from pytools.obj_array import is_obj_array
knl = self.get_optimized_kernel(
targets_is_obj_array=(
is_obj_array(targets) or isinstance(targets, (tuple, list))),
sources_is_obj_array=(
is_obj_array(sources) or isinstance(sources, (tuple, list))))
return knl(queue, targets=targets, sources=sources,
tgtindices=tgtindices, srcindices=srcindices,
tgtranges=tgtranges, srcranges=srcranges, **kwargs)
# }}}
# {{{ P2P from CSR-like interaction list
class P2PFromCSR(P2PComputationBase):
......
......@@ -116,10 +116,26 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
lp.GlobalArg("center", None,
shape=(self.dim, "ntargets"), order="C"),
lp.GlobalArg("expansion_radii", None, shape="ntargets"),
lp.ValueArg("nsources", None),
lp.ValueArg("ntargets", None),
lp.ValueArg("nsources", np.int32),
lp.ValueArg("ntargets", np.int32),
]
def get_domains(self):
return [
"{[itgt]: 0 <= itgt < ntargets}",
"{[isrc]: 0 <= isrc < nsources}",
"{[idim]: 0 <= idim < dim}"
]
def get_loop_begin(self):
return ["for itgt, isrc"]
def get_loop_end(self):
return ["end"]
def get_assumptions(self):
return "nsources>=1 and ntargets>=1"
@memoize_method
def get_kernel(self):
from sumpy.symbolic import make_sym_vector
......@@ -162,10 +178,9 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
+ gather_loopy_source_arguments(self.kernels))
loopy_knl = lp.make_kernel(
"{[isrc,itgt,idim]: 0<=itgt<ntargets and 0<=isrc<nsources "
"and 0<=idim<dim}",
self.get_domains(),
self.get_kernel_scaling_assignments()
+ ["for itgt, isrc"]
+ self.get_loop_begin()
+ [self.get_compute_a_and_b_vecs()]
+ loopy_insns
+ [
......@@ -174,11 +189,11 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
temp_var_type=lp.auto)
for i, (expr, dtype) in enumerate(zip(exprs, self.value_dtypes))
]
+ ["end"]
+ self.get_loop_end()
+ self.get_result_store_instructions(),
arguments,
name=self.name,
assumptions="nsources>=1 and ntargets>=1",
assumptions=self.get_assumptions(),
default_offset=lp.auto,
silenced_warnings="write_race(write_lpot*)",
fixed_parameters=dict(dim=self.dim),
......@@ -197,7 +212,9 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
def get_optimized_kernel(self):
# FIXME specialize/tune for GPU/CPU
loopy_knl = self.get_kernel()
return loopy_knl
# FIXME: how to tune for blocks?
import pyopencl as cl
dev = self.context.devices[0]
if dev.type & cl.device_type.CPU:
......@@ -297,6 +314,88 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase):
# }}}
# {{{
class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
default_name = "qbx_block"
def get_src_tgt_arguments(self):
return LayerPotentialBase.get_src_tgt_arguments(self) \
+ [
lp.GlobalArg("srcindices", None, shape="nsrcindices"),
lp.GlobalArg("tgtindices", None, shape="ntgtindices"),
lp.GlobalArg("srcranges", None, shape="nranges"),
lp.GlobalArg("tgtranges", None, shape="nranges"),
lp.ValueArg("nsrcindices", np.int32),
lp.ValueArg("ntgtindices", np.int32),
lp.ValueArg("nranges", None)
]
def get_domains(self):
# FIXME: this doesn't work when separating j and k
return [
"{[i]: 0 <= i < nranges - 1}",
"{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}",
"{[idim]: 0 <= idim < dim}"
]
def get_loop_begin(self):
return [
"""
for i
<> tgtstart = tgtranges[i]
<> tgtend = tgtranges[i + 1]
<> tgt_length = tgtend - tgtstart
<> srcstart = srcranges[i]
<> srcend = srcranges[i + 1]
<> src_length = srcend - srcstart
for j, k
<> itgt = tgtindices[tgtstart + j]
<> isrc = srcindices[srcstart + k]
"""
]
def get_loop_end(self):
return [
"""
end
end
"""
]
def get_strength_or_not(self, isrc, kernel_idx):
return 1
def get_input_and_output_arguments(self):
return [
lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices")
for i, dtype in enumerate(self.value_dtypes)
]
def get_result_store_instructions(self):
return [
"""
result_KNLIDX[tgtstart + j, srcstart + k] = \
knl_KNLIDX_scaling*pair_result_KNLIDX {inames=i}
""".replace("KNLIDX", str(iknl))
for iknl in range(len(self.expansions))
]
def get_assumptions(self):
return "nranges>=2"
def __call__(self, queue, targets, sources, centers, expansion_radii,
tgtindices, srcindices, tgtranges, srcranges, **kwargs):
knl = self.get_optimized_kernel()
return knl(queue, src=sources, tgt=targets, center=centers,
expansion_radii=expansion_radii,
tgtindices=tgtindices, srcindices=srcindices,
tgtranges=tgtranges, srcranges=srcranges, **kwargs)
# }}}
# }}}
......
......@@ -73,15 +73,19 @@ def test_qbx_direct(ctx_getter):
from sumpy.kernel import LaplaceKernel
lknl = LaplaceKernel(2)
nblks = 10
order = 12
mode_nr = 25
from sumpy.qbx import LayerPotential
from sumpy.qbx import LayerPotentialMatrixGenerator
from sumpy.qbx import LayerPotentialMatrixBlockGenerator
from sumpy.expansion.local import LineTaylorLocalExpansion
lpot = LayerPotential(ctx, [LineTaylorLocalExpansion(lknl, order)])
mat_gen = LayerPotentialMatrixGenerator(ctx,
[LineTaylorLocalExpansion(lknl, order)])
lpot = LayerPotential(ctx, [LineTaylorLocalExpansion(lknl, order)])
blk_gen = LayerPotentialMatrixBlockGenerator(ctx,
[LineTaylorLocalExpansion(lknl, order)])
for n in [200, 300, 400]:
targets, sources, centers, sigma, expansion_radii = \
......@@ -90,16 +94,37 @@ def test_qbx_direct(ctx_getter):
h = 2 * np.pi / n
strengths = (sigma * h,)
tgtindices = np.arange(0, n)
tgtindices = np.random.choice(tgtindices, size=int(0.8 * n))
tgtranges = np.arange(0, tgtindices.shape[0] + 1,
tgtindices.shape[0] // nblks)
srcindices = np.arange(0, n)
srcindices = np.random.choice(srcindices, size=int(0.8 * n))
srcranges = np.arange(0, srcindices.shape[0] + 1,
srcindices.shape[0] // nblks)
assert tgtranges.shape == srcranges.shape
_, (mat,) = mat_gen(queue, targets, sources, centers,
expansion_radii=expansion_radii)
expansion_radii)
result_mat = mat.dot(strengths[0])
_, (result_lpot,) = lpot(queue, targets, sources, centers, strengths,
expansion_radii=expansion_radii)
expansion_radii)
eps = 1.0e-10 * la.norm(result_lpot)
assert la.norm(result_mat - result_lpot) < eps
_, (blk,) = blk_gen(queue, targets, sources, centers, expansion_radii,
tgtindices, srcindices, tgtranges, srcranges)
for i in range(srcranges.shape[0] - 1):
itgt = np.s_[tgtranges[i]:tgtranges[i + 1]]
isrc = np.s_[srcranges[i]:srcranges[i + 1]]
block = np.ix_(tgtindices[itgt], srcindices[isrc])
eps = 1.0e-10 * la.norm(mat[block])
assert la.norm(blk[itgt, isrc] - mat[block]) < eps
@pytest.mark.parametrize("exclude_self", [True, False])
def test_p2p_direct(ctx_getter, exclude_self):
......@@ -112,12 +137,14 @@ def test_p2p_direct(ctx_getter, exclude_self):
from sumpy.kernel import LaplaceKernel
lknl = LaplaceKernel(2)
nblks = 10
mode_nr = 25
from sumpy.p2p import P2P
from sumpy.p2p import P2PMatrixGenerator
mat_gen = P2PMatrixGenerator(ctx, [lknl], exclude_self=exclude_self)
from sumpy.p2p import P2PMatrixGenerator, P2PMatrixBlockGenerator
lpot = P2P(ctx, [lknl], exclude_self=exclude_self)
mat_gen = P2PMatrixGenerator(ctx, [lknl], exclude_self=exclude_self)
blk_gen = P2PMatrixBlockGenerator(ctx, [lknl], exclude_self=exclude_self)
for n in [200, 300, 400]:
targets, sources, _, sigma, _ = \
......@@ -126,6 +153,16 @@ def test_p2p_direct(ctx_getter, exclude_self):
h = 2 * np.pi / n
strengths = (sigma * h,)
tgtindices = np.arange(0, n)
tgtindices = np.random.choice(tgtindices, size=int(0.8 * n))
tgtranges = np.arange(0, tgtindices.shape[0] + 1,
tgtindices.shape[0] // nblks)
srcindices = np.arange(0, n)
srcindices = np.random.choice(srcindices, size=int(0.8 * n))
srcranges = np.arange(0, srcindices.shape[0] + 1,
srcindices.shape[0] // nblks)
assert tgtranges.shape == srcranges.shape
extra_kwargs = {}
if exclude_self:
extra_kwargs["target_to_source"] = np.arange(n, dtype=np.int32)
......@@ -139,6 +176,18 @@ def test_p2p_direct(ctx_getter, exclude_self):
eps = 1.0e-10 * la.norm(result_lpot)
assert la.norm(result_mat - result_lpot) < eps
_, (blk,) = blk_gen(queue, targets, sources,
tgtindices, srcindices, tgtranges, srcranges,
**extra_kwargs)
for i in range(srcranges.shape[0] - 1):
itgt = np.s_[tgtranges[i]:tgtranges[i + 1]]
isrc = np.s_[srcranges[i]:srcranges[i + 1]]
block = np.ix_(tgtindices[itgt], srcindices[isrc])
eps = 1.0e-10 * la.norm(mat[block])
assert la.norm(blk[itgt, isrc] - mat[block]) < eps
# You can test individual routines by typing
# $ python test_kernels.py 'test_p2p(cl.create_some_context)'
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment