diff --git a/.gitignore b/.gitignore index 71f03d3ad65ed14d6e37cfb471ba601adf355b4b..e0287a537d33cf2e5d567c9ee2d2463955238248 100644 --- a/.gitignore +++ b/.gitignore @@ -16,5 +16,7 @@ doc/_build .cache .DS_Store .ipynb_checkpoints +.pytest_cache sumpy/_git_rev.py +.asv diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 946ff3b118a645e8d78c8489789480c60e8c0d94..97c8ee2404035df17dc5dade007d6593a117b3a1 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -2,7 +2,7 @@ Python 3.5 Titan X: script: - py_version=2.7 - export PYOPENCL_TEST=nvi:titan - - EXTRA_INSTALL="numpy mako" + - EXTRA_INSTALL="pybind11 numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: @@ -32,7 +32,7 @@ Python 2.7 POCL: script: - export PY_EXE=python2.7 - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="numpy mako" + - export EXTRA_INSTALL="pybind11 numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: @@ -45,7 +45,7 @@ Python 3.5 POCL: script: - export PY_EXE=python3.5 - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="numpy mako" + - export EXTRA_INSTALL="pybind11 numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: @@ -58,7 +58,7 @@ Python 3.6 POCL: script: - export PY_EXE=python3.6 - export PYOPENCL_TEST=portable - - export EXTRA_INSTALL="numpy mako" + - export EXTRA_INSTALL="pybind11 numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh - ". ./build-and-test-py-project.sh" tags: @@ -83,7 +83,7 @@ Python 3.5 Conda: Documentation: script: - - EXTRA_INSTALL="numpy mako" + - EXTRA_INSTALL="pybind11 numpy mako" - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-docs.sh - ". ./build-docs.sh" tags: @@ -99,3 +99,17 @@ Flake8: - python3.5 except: - tags + +Benchmarks: + script: + - CONDA_ENVIRONMENT=.test-conda-env-py3.yml + - REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt + - PROJECT=sumpy + - PYOPENCL_TEST=portable + - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-benchmark-py-project.sh + - ". ./build-and-benchmark-py-project.sh" + tags: + - linux + - benchmark + except: + - tags diff --git a/README.rst b/README.rst new file mode 100644 index 0000000000000000000000000000000000000000..1f6e76d1584fbdfd2075d965b492507a742c92a0 --- /dev/null +++ b/README.rst @@ -0,0 +1,35 @@ +sumpy: n-body kernels and translation operators +=============================================== + +.. image:: https://gitlab.tiker.net/inducer/sumpy/badges/master/pipeline.svg + :target: https://gitlab.tiker.net/inducer/sumpy/commits/master +.. image:: https://badge.fury.io/py/sumpy.png + :target: http://pypi.python.org/pypi/sumpy + +Sumpy is mainly a 'scaffolding' package for Fast Multipole and quadrature methods. +If you're building one of those and need code generation for the required Multipole +and local expansions, come right on in. Together with boxtree, there is a full, +symbolically kernel-independent FMM implementation here. + +Sumpy relies on + +* `numpy <http://pypi.python.org/pypi/numpy>`_ for arrays +* `boxtree <http://pypi.python.org/pypi/boxtree>`_ for FMM tree building +* `sumpy <http://pypi.python.org/pypi/sumpy>`_ for expansions and analytical routines +* `loopy <http://pypi.python.org/pypi/loo.py>`_ for fast array operations +* `pytest <http://pypi.python.org/pypi/pytest>`_ for automated testing + +and, indirectly, + +* `PyOpenCL <http://pypi.python.org/pypi/pyopencl>`_ as computational infrastructure + +PyOpenCL is likely the only package you'll have to install +by hand, all the others will be installed automatically. + +Resources: + +* `documentation <http://documen.tician.de/sumpy>`_ +* `source code via git <http://github.com/inducer/sumpy>`_ + +If you can see inside the UIUC firewall, you may browse +`benchmark results <http://koelsch.d.tiker.net/benchmarks/asv/sumpy/>`_. \ No newline at end of file diff --git a/asv.conf.json b/asv.conf.json new file mode 100644 index 0000000000000000000000000000000000000000..9632c4fb70310c528b43e3d055a96d96875e629f --- /dev/null +++ b/asv.conf.json @@ -0,0 +1,160 @@ +{ + // The version of the config file format. Do not change, unless + // you know what you are doing. + "version": 1, + + // The name of the project being benchmarked + "project": "sumpy", + + // The project's homepage + "project_url": "https://documen.tician.de/sumpy", + + // The URL or local path of the source code repository for the + // project being benchmarked + "repo": ".", + + // The Python project's subdirectory in your repo. If missing or + // the empty string, the project is assumed to be located at the root + // of the repository. + // "repo_subdir": "", + + // List of branches to benchmark. If not provided, defaults to "master" + // (for git) or "default" (for mercurial). + // "branches": ["master"], // for git + // "branches": ["default"], // for mercurial + + // The DVCS being used. If not set, it will be automatically + // determined from "repo" by looking at the protocol in the URL + // (if remote), or by looking for special directories, such as + // ".git" (if local). + // "dvcs": "git", + + // The tool to use to create environments. May be "conda", + // "virtualenv" or other value depending on the plugins in use. + // If missing or the empty string, the tool will be automatically + // determined by looking for tools on the PATH environment + // variable. + "environment_type": "conda", + + // timeout in seconds for installing any dependencies in environment + // defaults to 10 min + //"install_timeout": 600, + + // the base URL to show a commit for the project. + "show_commit_url": "http://gitlab.tiker.net/inducer/sumpy/commits/", + + // The Pythons you'd like to test against. If not provided, defaults + // to the current version of Python used to run `asv`. + // "pythons": ["2.7", "3.6"], + + // The list of conda channel names to be searched for benchmark + // dependency packages in the specified order + "conda_channels": ["conda-forge", "defaults"], + + // The matrix of dependencies to test. Each key is the name of a + // package (in PyPI) and the values are version numbers. An empty + // list or empty string indicates to just test against the default + // (latest) version. null indicates that the package is to not be + // installed. If the package to be tested is only available from + // PyPi, and the 'environment_type' is conda, then you can preface + // the package name by 'pip+', and the package will be installed via + // pip (with all the conda available packages installed first, + // followed by the pip installed packages). + // + // "matrix": { + // "numpy": ["1.6", "1.7"], + // "six": ["", null], // test with and without six installed + // "pip+emcee": [""], // emcee is only available for install with pip. + // }, + "matrix": { + "numpy" : [""], + "sympy" : ["1.0"], + "pyopencl" : [""], + "islpy" : [""], + "pocl" : [""], + "pip+git+https://github.com/inducer/pymbolic#egg=pymbolic": [""], + "pip+git+https://gitlab.tiker.net/inducer/boxtree#egg=boxtree": [""], + "pip+git+https://github.com/inducer/loopy#egg=loopy": [""], + }, + + // Combinations of libraries/python versions can be excluded/included + // from the set to test. Each entry is a dictionary containing additional + // key-value pairs to include/exclude. + // + // An exclude entry excludes entries where all values match. The + // values are regexps that should match the whole string. + // + // An include entry adds an environment. Only the packages listed + // are installed. The 'python' key is required. The exclude rules + // do not apply to includes. + // + // In addition to package names, the following keys are available: + // + // - python + // Python version, as in the *pythons* variable above. + // - environment_type + // Environment type, as above. + // - sys_platform + // Platform, as in sys.platform. Possible values for the common + // cases: 'linux2', 'win32', 'cygwin', 'darwin'. + // + // "exclude": [ + // {"python": "3.2", "sys_platform": "win32"}, // skip py3.2 on windows + // {"environment_type": "conda", "six": null}, // don't run without six on conda + // ], + // + // "include": [ + // // additional env for python2.7 + // {"python": "2.7", "numpy": "1.8"}, + // // additional env if run on windows+conda + // {"platform": "win32", "environment_type": "conda", "python": "2.7", "libpython": ""}, + // ], + + // The directory (relative to the current directory) that benchmarks are + // stored in. If not provided, defaults to "benchmarks" + // "benchmark_dir": "benchmarks", + + // The directory (relative to the current directory) to cache the Python + // environments in. If not provided, defaults to "env" + "env_dir": ".asv/env", + + // The directory (relative to the current directory) that raw benchmark + // results are stored in. If not provided, defaults to "results". + "results_dir": ".asv/results", + + // The directory (relative to the current directory) that the html tree + // should be written to. If not provided, defaults to "html". + "html_dir": ".asv/html", + + // The number of characters to retain in the commit hashes. + // "hash_length": 8, + + // `asv` will cache wheels of the recent builds in each + // environment, making them faster to install next time. This is + // number of builds to keep, per environment. + // "wheel_cache_size": 0 + + // The commits after which the regression search in `asv publish` + // should start looking for regressions. Dictionary whose keys are + // regexps matching to benchmark names, and values corresponding to + // the commit (exclusive) after which to start looking for + // regressions. The default is to start from the first commit + // with results. If the commit is `null`, regression detection is + // skipped for the matching benchmark. + // + // "regressions_first_commits": { + // "some_benchmark": "352cdf", // Consider regressions only after this commit + // "another_benchmark": null, // Skip regression detection altogether + // } + + // The thresholds for relative change in results, after which `asv + // publish` starts reporting regressions. Dictionary of the same + // form as in ``regressions_first_commits``, with values + // indicating the thresholds. If multiple entries match, the + // maximum is taken. If no entry matches, the default is 5%. + // + // "regressions_thresholds": { + // "some_benchmark": 0.01, // Threshold of 1% + // "another_benchmark": 0.5, // Threshold of 50% + // } +} diff --git a/benchmarks/__init__.py b/benchmarks/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/benchmarks/bench_translations.py b/benchmarks/bench_translations.py new file mode 100644 index 0000000000000000000000000000000000000000..8d6cfdd882be3c8bc91f0078e9c1f82cfc1c841c --- /dev/null +++ b/benchmarks/bench_translations.py @@ -0,0 +1,129 @@ +import numpy as np + +import pytest +import pyopencl as cl +from pyopencl.tools import ( # noqa + pytest_generate_tests_for_pyopencl as pytest_generate_tests) + +from sumpy.expansion.multipole import ( + VolumeTaylorMultipoleExpansion, H2DMultipoleExpansion, + VolumeTaylorMultipoleExpansionBase, + LaplaceConformingVolumeTaylorMultipoleExpansion, + HelmholtzConformingVolumeTaylorMultipoleExpansion) +from sumpy.expansion.local import ( + VolumeTaylorLocalExpansion, H2DLocalExpansion, + LaplaceConformingVolumeTaylorLocalExpansion, + HelmholtzConformingVolumeTaylorLocalExpansion) + +from sumpy.kernel import (LaplaceKernel, HelmholtzKernel, AxisTargetDerivative, + DirectionalSourceDerivative) + +import logging +logger = logging.getLogger(__name__) + +import sympy +import six +import pymbolic.mapper.flop_counter + +import sumpy.symbolic as sym +from sumpy.assignment_collection import SymbolicAssignmentCollection +from sumpy.codegen import to_loopy_insns + +class Param: + def __init__(self, dim, order): + self.dim = dim + self.order = order + + def __repr__(self): + return "{}D_order_{}".format(self.dim, self.order) + + +class TranslationBenchmarkSuite: + + params = [ + Param(2, 10), + Param(2, 15), + Param(2, 20), + Param(3, 5), + Param(3, 10), + ] + + param_names = ['order'] + + def setup(self, param): + logging.basicConfig(level=logging.INFO) + np.random.seed(17) + if self.__class__ == TranslationBenchmarkSuite: + raise NotImplementedError + mpole_expn_class = self.mpole_expn_class + if param.order == 3 and H2DMultipoleExpansion == mpole_expn_class: + raise NotImplementedError + + def track_m2l_op_count(self, param): + knl = self.knl(param.dim) + m_expn = self.mpole_expn_class(knl, order=param.order) + l_expn = self.local_expn_class(knl, order=param.order) + + src_coeff_exprs = [sym.Symbol("src_coeff%d" % i) + for i in range(len(m_expn))] + dvec = sym.make_sym_vector("d", knl.dim) + src_rscale = sym.Symbol("src_rscale") + tgt_rscale = sym.Symbol("tgt_rscale") + result = l_expn.translate_from(m_expn, src_coeff_exprs, src_rscale, + dvec, tgt_rscale) + sac = SymbolicAssignmentCollection() + for i, expr in enumerate(result): + sac.assign_unique("coeff%d" % i, expr) + sac.run_global_cse() + insns = to_loopy_insns(six.iteritems(sac.assignments)) + counter = pymbolic.mapper.flop_counter.CSEAwareFlopCounter() + + return sum([counter.rec(insn.expression)+1 for insn in insns]) + + track_m2l_op_count.unit = "ops" + track_m2l_op_count.timeout = 200.0 + + +class LaplaceVolumeTaylorTranslation(TranslationBenchmarkSuite): + knl = LaplaceKernel + local_expn_class = VolumeTaylorLocalExpansion + mpole_expn_class = VolumeTaylorMultipoleExpansion + params = [ + Param(2, 10), + Param(3, 5), + ] + + +class LaplaceConformingVolumeTaylorTranslation(TranslationBenchmarkSuite): + knl = LaplaceKernel + local_expn_class = LaplaceConformingVolumeTaylorLocalExpansion + mpole_expn_class = LaplaceConformingVolumeTaylorMultipoleExpansion + + +class HelmholtzVolumeTaylorTranslation(TranslationBenchmarkSuite): + knl = HelmholtzKernel + local_expn_class = VolumeTaylorLocalExpansion + mpole_expn_class = VolumeTaylorMultipoleExpansion + params = [ + Param(2, 10), + Param(3, 5), + ] + + +class HelmholtzConformingVolumeTaylorTranslation(TranslationBenchmarkSuite): + knl = HelmholtzKernel + local_expn_class = HelmholtzConformingVolumeTaylorLocalExpansion + mpole_expn_class = HelmholtzConformingVolumeTaylorMultipoleExpansion + + +class Helmholtz2DTranslation(TranslationBenchmarkSuite): + knl = HelmholtzKernel + local_expn_class = H2DLocalExpansion + mpole_expn_class = H2DMultipoleExpansion + params = [ + Param(2, 10), + Param(2, 15), + Param(2, 20), + ] + + diff --git a/contrib/translations/PDE-reduction-symbolic.ipynb b/contrib/translations/PDE-reduction-symbolic.ipynb new file mode 100644 index 0000000000000000000000000000000000000000..bffcdf9e1e24beb09f0da9fbf50bf6d0ecdf1903 --- /dev/null +++ b/contrib/translations/PDE-reduction-symbolic.ipynb @@ -0,0 +1,133 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "import pyopencl as cl\n", + "import sumpy.toys as t\n", + "import numpy as np\n", + "import numpy.linalg as la\n", + "import matplotlib.pyplot as plt\n", + "from sumpy.visualization import FieldPlotter\n", + "from pytools import add_tuples\n", + "\n", + "from sumpy.expansion.local import VolumeTaylorLocalExpansion\n", + "from sumpy.expansion.multipole import (VolumeTaylorMultipoleExpansion,\n", + " LaplaceConformingVolumeTaylorMultipoleExpansion,\n", + " HelmholtzConformingVolumeTaylorMultipoleExpansion)\n", + " \n", + "from sumpy.kernel import (YukawaKernel, HelmholtzKernel, LaplaceKernel)\n", + "\n", + "from sumpy.symbolic import make_sym_vector\n", + "\n", + "order = 2\n", + "dim = 2\n", + "\n", + "if 0:\n", + " knl = LaplaceKernel(dim)\n", + " extra_kernel_kwargs = {}\n", + " mpole_expn_reduced_class = LaplaceConformingVolumeTaylorMultipoleExpansion\n", + " \n", + "else:\n", + " helm_k = 1.2\n", + " knl = HelmholtzKernel(dim)\n", + " extra_kernel_kwargs={\"k\": helm_k}\n", + " mpole_expn_reduced_class = HelmholtzConformingVolumeTaylorMultipoleExpansion\n", + "\n", + "mpole_expn_reduced = mpole_expn_reduced_class(knl, order)\n", + "mpole_expn = VolumeTaylorMultipoleExpansion(knl, order)\n", + "local_expn = VolumeTaylorLocalExpansion(knl, order)\n" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "from pytools import factorial\n", + "\n", + "def mi_factorial(n):\n", + " return np.prod([factorial(n1) for n1 in n])" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "reduced_wrangler = mpole_expn_reduced.derivative_wrangler\n", + "full_wrangler = mpole_expn.derivative_wrangler\n", + "\n", + "reduced_derivatives = list(make_sym_vector(\"deriv\", len(reduced_wrangler.stored_identifiers)))\n", + "full_derivatives = reduced_wrangler.get_full_kernel_derivatives_from_stored(reduced_derivatives, 1)\n", + "\n", + "print(reduced_derivatives)\n", + "print(full_derivatives)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "full_coeffs = list(make_sym_vector(\"coeff\", len(reduced_wrangler.get_full_coefficient_identifiers())))\n", + "\n", + "reduced_coeffs = reduced_wrangler.get_stored_mpole_coefficients_from_full(full_mpole_coefficients=full_coeffs, rscale=1)\n", + "\n", + "print(full_coeffs)\n", + "print(reduced_coeffs)" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "dvec = make_sym_vector(\"d\", dim)\n", + "translated_reduce_coeffs = mpole_expn_reduced.translate_from(mpole_expn_reduced, reduced_coeffs, 1, dvec, 1)\n", + "translated_full_coeffs = mpole_expn.translate_from(mpole_expn, full_coeffs, 1, dvec, 1)\n", + "translated_full_coeffs" + ] + }, + { + "cell_type": "code", + "execution_count": null, + "metadata": {}, + "outputs": [], + "source": [ + "eval_reduced = sum(a*b for a, b in zip(translated_reduce_coeffs, reduced_derivatives))\n", + "eval_full = sum(a*b for a, b in zip(translated_full_coeffs, full_derivatives))\n", + "\n", + "(eval_full-eval_reduced).simplify()" + ] + } + ], + "metadata": { + "kernelspec": { + "display_name": "Python 3", + "language": "python", + "name": "python3" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 3 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython3", + "version": "3.6.4" + } + }, + "nbformat": 4, + "nbformat_minor": 2 +} diff --git a/requirements.txt b/requirements.txt index 1e86e61017a9f70631b25ba46ad808e16f0b715e..8e48bc1c8b0da2b4d81682839c688c0ac69637d5 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,5 +1,5 @@ numpy -sympy==1.0 +sympy==1.1.1 git+https://github.com/inducer/pymbolic git+https://github.com/inducer/islpy git+https://github.com/inducer/pyopencl diff --git a/setup.py b/setup.py index 8d0850540362ddfc3ecedf453f2ccc53bba21a1c..9457bbdc269697f61153e755a930efc9fccae41d 100644 --- a/setup.py +++ b/setup.py @@ -94,7 +94,7 @@ setup(name="sumpy", install_requires=[ "pytools>=2018.2", "loo.py>=2017.2", - "boxtree>=2013.1", + "boxtree>=2018.1", "pytest>=2.3", "six", diff --git a/sumpy/fmm.py b/sumpy/fmm.py index ed2a1e4bd35b07c4bb1f601c6b850044fdf5539a..bd1c51d9b8050529c27174fdf068ffafbb766351 100644 --- a/sumpy/fmm.py +++ b/sumpy/fmm.py @@ -55,6 +55,10 @@ class SumpyExpansionWranglerCodeContainer(object): necessarily must have a :class:`pyopencl.CommandQueue`, but this queue is allowed to be more ephemeral than the code, the code's lifetime is decoupled by storing it in this object. + + Timing results returned by this wrangler contain the values *wall_elapsed* + which measures elapsed wall time. This requires a command queue with + profiling enabled. """ def __init__(self, cl_context, @@ -145,6 +149,53 @@ class SumpyExpansionWranglerCodeContainer(object): # }}} +# {{{ timing future + +_SECONDS_PER_NANOSECOND = 1e-9 + + +class UnableToCollectTimingData(UserWarning): + pass + + +class SumpyTimingFuture(object): + + def __init__(self, queue, events): + self.queue = queue + self.events = events + + @memoize_method + def result(self): + from boxtree.fmm import TimingResult + + if not self.queue.properties & cl.command_queue_properties.PROFILING_ENABLE: + from warnings import warn + warn( + "Profiling was not enabled in the command queue. " + "Timing data will not be collected.", + category=UnableToCollectTimingData, + stacklevel=3) + return TimingResult(wall_elapsed=None) + + pyopencl.wait_for_events(self.events) + + result = 0 + for event in self.events: + result += ( + (event.profile.end - event.profile.start) + * _SECONDS_PER_NANOSECOND) + + return TimingResult(wall_elapsed=result) + + def done(self): + return all( + event.get_info(cl.event_info.COMMAND_EXECUTION_STATUS) + == cl.command_execution_status.COMPLETE + for event in self.events) + +# }}} + + # {{{ expansion wrangler class SumpyExpansionWrangler(object): @@ -175,6 +226,7 @@ class SumpyExpansionWrangler(object): self.code = code_container self.queue = queue self.tree = tree + self.issued_timing_data_warning = False self.dtype = dtype @@ -305,6 +357,8 @@ class SumpyExpansionWrangler(object): kwargs = self.extra_kwargs.copy() kwargs.update(self.box_source_list_kwargs()) + events = [] + for lev in range(self.tree.nlevels): p2m = self.code.p2m(self.level_orders[lev]) start, stop = level_start_source_box_nrs[lev:lev+2] @@ -325,10 +379,11 @@ class SumpyExpansionWrangler(object): rscale=level_to_rscale(self.tree, lev), **kwargs) + events.append(evt) assert mpoles_res is mpoles_view - return mpoles + return (mpoles, SumpyTimingFuture(self.queue, events)) def coarsen_multipoles(self, level_start_source_parent_box_nrs, @@ -336,7 +391,7 @@ class SumpyExpansionWrangler(object): mpoles): tree = self.tree - evt = None + events = [] # nlevels-1 is the last valid level index # nlevels-2 is the last valid level that could have children @@ -378,12 +433,14 @@ class SumpyExpansionWrangler(object): tgt_rscale=level_to_rscale(self.tree, target_level), **self.kernel_extra_kwargs) + events.append(evt) + assert mpoles_res is target_mpoles_view - if evt is not None: - mpoles.add_event(evt) + if events: + mpoles.add_event(events[-1]) - return mpoles + return (mpoles, SumpyTimingFuture(self.queue, events)) def eval_direct(self, target_boxes, source_box_starts, source_box_lists, src_weights): @@ -394,6 +451,8 @@ class SumpyExpansionWrangler(object): kwargs.update(self.box_source_list_kwargs()) kwargs.update(self.box_target_list_kwargs()) + events = [] + evt, pot_res = self.code.p2p()(self.queue, target_boxes=target_boxes, source_box_starts=source_box_starts, @@ -402,12 +461,13 @@ class SumpyExpansionWrangler(object): result=pot, **kwargs) + events.append(evt) for pot_i, pot_res_i in zip(pot, pot_res): assert pot_i is pot_res_i pot_i.add_event(evt) - return pot + return (pot, SumpyTimingFuture(self.queue, events)) def multipole_to_local(self, level_start_target_box_nrs, @@ -415,6 +475,8 @@ class SumpyExpansionWrangler(object): mpole_exps): local_exps = self.local_expansion_zeros() + events = [] + for lev in range(self.tree.nlevels): start, stop = level_start_target_box_nrs[lev:lev+2] if start == stop: @@ -445,8 +507,9 @@ class SumpyExpansionWrangler(object): tgt_rscale=level_to_rscale(self.tree, lev), **self.kernel_extra_kwargs) + events.append(evt) - return local_exps + return (local_exps, SumpyTimingFuture(self.queue, events)) def eval_multipoles(self, target_boxes_by_source_level, source_boxes_by_level, mpole_exps): @@ -455,9 +518,10 @@ class SumpyExpansionWrangler(object): kwargs = self.kernel_extra_kwargs.copy() kwargs.update(self.box_target_list_kwargs()) + events = [] + wait_for = mpole_exps.events - has_evt = False for isrc_level, ssn in enumerate(source_boxes_by_level): if len(target_boxes_by_source_level[isrc_level]) == 0: continue @@ -484,19 +548,18 @@ class SumpyExpansionWrangler(object): wait_for=wait_for, **kwargs) + events.append(evt) - has_evt = True wait_for = [evt] for pot_i, pot_res_i in zip(pot, pot_res): assert pot_i is pot_res_i - if has_evt: + if events: for pot_i in pot: - # Intentionally only adding the last event. - pot_i.add_event(evt) + pot_i.add_event(events[-1]) - return pot + return (pot, SumpyTimingFuture(self.queue, events)) def form_locals(self, level_start_target_or_target_parent_box_nrs, @@ -506,6 +569,8 @@ class SumpyExpansionWrangler(object): kwargs = self.extra_kwargs.copy() kwargs.update(self.box_source_list_kwargs()) + events = [] + for lev in range(self.tree.nlevels): start, stop = \ level_start_target_or_target_parent_box_nrs[lev:lev+2] @@ -531,15 +596,19 @@ class SumpyExpansionWrangler(object): rscale=level_to_rscale(self.tree, lev), **kwargs) + events.append(evt) assert result is target_local_exps_view - return local_exps + return (local_exps, SumpyTimingFuture(self.queue, events)) def refine_locals(self, level_start_target_or_target_parent_box_nrs, target_or_target_parent_boxes, local_exps): + + events = [] + for target_lev in range(1, self.tree.nlevels): start, stop = level_start_target_or_target_parent_box_nrs[ target_lev:target_lev+2] @@ -570,12 +639,13 @@ class SumpyExpansionWrangler(object): tgt_rscale=level_to_rscale(self.tree, target_lev), **self.kernel_extra_kwargs) + events.append(evt) assert local_exps_res is target_local_exps_view local_exps.add_event(evt) - return local_exps + return (local_exps, SumpyTimingFuture(self.queue, [evt])) def eval_locals(self, level_start_target_box_nrs, target_boxes, local_exps): pot = self.output_zeros() @@ -583,6 +653,8 @@ class SumpyExpansionWrangler(object): kwargs = self.kernel_extra_kwargs.copy() kwargs.update(self.box_target_list_kwargs()) + events = [] + for lev in range(self.tree.nlevels): start, stop = level_start_target_box_nrs[lev:lev+2] if start == stop: @@ -606,11 +678,12 @@ class SumpyExpansionWrangler(object): rscale=level_to_rscale(self.tree, lev), **kwargs) + events.append(evt) for pot_i, pot_res_i in zip(pot, pot_res): assert pot_i is pot_res_i - return pot + return (pot, SumpyTimingFuture(self.queue, events)) def finalize_potentials(self, potentials): return potentials diff --git a/sumpy/kernel.py b/sumpy/kernel.py index 40e82dc72af28830e548e42d1d8e1d6d9e77c9c1..12255a93f6d686c003f8f99f16f2f43db22ba037 100644 --- a/sumpy/kernel.py +++ b/sumpy/kernel.py @@ -269,12 +269,12 @@ class Kernel(object): return expr def get_global_scaling_const(self): - """Return a global scaling constant of the kernel. - Typically, this ensures that the kernel is scaled so that - :math:`\mathcal L(G)(x)=C\delta(x)` with a constant of 1, where - :math:`\mathcal L` is the PDE operator associated with the kernel. - Not to be confused with *rscale*, which keeps expansion - coefficients benignly scaled. + r"""Return a global scaling constant of the kernel. + Typically, this ensures that the kernel is scaled so that + :math:`\mathcal L(G)(x)=C\delta(x)` with a constant of 1, where + :math:`\mathcal L` is the PDE operator associated with the kernel. + Not to be confused with *rscale*, which keeps expansion + coefficients benignly scaled. """ raise NotImplementedError @@ -302,7 +302,7 @@ class ExpressionKernel(Kernel): def __init__(self, dim, expression, global_scaling_const, is_complex_valued): - """ + r""" :arg expression: A :mod:`pymbolic` expression depending on variables *d_1* through *d_N* where *N* equals *dim*. (These variables match what is returned from diff --git a/sumpy/p2e.py b/sumpy/p2e.py index e9037a3f55396d57103a06970de3a1418adf00f3..daa7d93dc5b393d358424031500a4915e2c2dd9f 100644 --- a/sumpy/p2e.py +++ b/sumpy/p2e.py @@ -27,6 +27,7 @@ from six.moves import range import numpy as np import loopy as lp +from loopy.version import MOST_RECENT_LANGUAGE_VERSION from sumpy.tools import KernelCacheWrapper @@ -159,7 +160,8 @@ class P2EFromSingleBox(P2EBase): assumptions="nsrc_boxes>=1", silenced_warnings="write_race(write_expn*)", default_offset=lp.auto, - fixed_parameters=dict(dim=self.dim)) + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") @@ -265,7 +267,8 @@ class P2EFromCSR(P2EBase): assumptions="ntgt_boxes>=1", silenced_warnings="write_race(write_expn*)", default_offset=lp.auto, - fixed_parameters=dict(dim=self.dim)) + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) loopy_knl = self.expansion.prepare_loopy_kernel(loopy_knl) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") diff --git a/sumpy/p2p.py b/sumpy/p2p.py index d8586cc228e47d6aa9f12d7c7c436726ad28bc2e..a0e89f242f958bbaea411090582e885d942b2a1d 100644 --- a/sumpy/p2p.py +++ b/sumpy/p2p.py @@ -30,6 +30,7 @@ from six.moves import range import numpy as np import loopy as lp +from loopy.version import MOST_RECENT_LANGUAGE_VERSION from pymbolic import var from sumpy.tools import KernelComputation, KernelCacheWrapper @@ -40,10 +41,10 @@ __doc__ = """ Particle-to-particle -------------------- -.. autoclass:: P2PComputationBase -.. autoclass:: SingleSrcTgtListP2PBase +.. autoclass:: P2PBase .. autoclass:: P2P .. autoclass:: P2PMatrixGenerator +.. autoclass:: P2PMatrixBlockGenerator .. autoclass:: P2PFromCSR """ @@ -54,7 +55,7 @@ Particle-to-particle # {{{ p2p base class -class P2PComputationBase(KernelComputation, KernelCacheWrapper): +class P2PBase(KernelComputation, KernelCacheWrapper): def __init__(self, ctx, kernels, exclude_self, strength_usage=None, value_dtypes=None, options=[], name=None, device=None): @@ -74,6 +75,10 @@ class P2PComputationBase(KernelComputation, KernelCacheWrapper): from pytools import single_valued self.dim = single_valued(knl.dim for knl in self.kernels) + def get_cache_key(self): + return (type(self).__name__, tuple(self.kernels), self.exclude_self, + tuple(self.strength_usage), tuple(self.value_dtypes)) + def get_loopy_insns_and_result_names(self): from sumpy.symbolic import make_sym_vector dvec = make_sym_vector("d", self.dim) @@ -104,93 +109,40 @@ class P2PComputationBase(KernelComputation, KernelCacheWrapper): return loopy_insns, result_names - def get_cache_key(self): - return (type(self).__name__, tuple(self.kernels), self.exclude_self, - tuple(self.strength_usage), tuple(self.value_dtypes)) - -# }}} - - -# {{{ P2P with list of sources and list of targets - -class SingleSrcTgtListP2PBase(P2PComputationBase): - def get_src_tgt_arguments(self): - return [ - lp.GlobalArg("sources", None, - shape=(self.dim, "nsources")), - lp.GlobalArg("targets", None, - shape=(self.dim, "ntargets")), - lp.ValueArg("nsources", np.int32), - lp.ValueArg("ntargets", np.int32) - ] - - def get_domains(self): - return [ - "{[itgt]: 0 <= itgt < ntargets}", - "{[isrc]: 0 <= isrc < nsources}", - "{[idim]: 0 <= idim < dim}" - ] - - def get_loop_begin(self): - return ["for itgt, isrc"] - - def get_loop_end(self): - return ["end"] - - def get_assumptions(self): - return "nsources>=1 and ntargets>=1" + def get_strength_or_not(self, isrc, kernel_idx): + return var("strength").index((self.strength_usage[kernel_idx], isrc)) - def get_kernel(self): - loopy_insns, result_names = self.get_loopy_insns_and_result_names() + def get_kernel_exprs(self, result_names): + from pymbolic import var isrc_sym = var("isrc") - exprs = [ - var(name) - * self.get_strength_or_not(isrc_sym, i) - for i, name in enumerate(result_names)] + exprs = [var(name) * self.get_strength_or_not(isrc_sym, i) + for i, name in enumerate(result_names)] if self.exclude_self: from pymbolic.primitives import If, Variable exprs = [If(Variable("is_self"), 0, expr) for expr in exprs] - from sumpy.tools import gather_loopy_source_arguments - arguments = ( - self.get_src_tgt_arguments() - + self.get_input_and_output_arguments() - + ([lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))] - if self.exclude_self else []) - + gather_loopy_source_arguments(self.kernels)) - - loopy_knl = lp.make_kernel( - self.get_domains(), - self.get_kernel_scaling_assignments() - + self.get_loop_begin() - + ["<> d[idim] = targets[idim,itgt] - sources[idim,isrc]"] - + ["<> is_self = (isrc == target_to_source[itgt])" - if self.exclude_self else ""] - + loopy_insns - + [ - lp.Assignment(id=None, - assignee="pair_result_%d" % i, expression=expr, - temp_var_type=lp.auto) - for i, expr in enumerate(exprs) - ] - + self.get_loop_end() - + self.get_result_store_instructions(), - arguments, - assumptions=self.get_assumptions(), - name=self.name, - fixed_parameters=dict( - dim=self.dim, - nstrengths=self.strength_count, - nresults=len(self.kernels))) + return [lp.Assignment(id=None, + assignee="pair_result_%d" % i, expression=expr, + temp_var_type=lp.auto) + for i, expr in enumerate(exprs)] - loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") - - for knl in self.kernels: - loopy_knl = knl.prepare_loopy_kernel(loopy_knl) + def get_default_src_tgt_arguments(self): + from sumpy.tools import gather_loopy_source_arguments + return ([ + lp.GlobalArg("sources", None, + shape=(self.dim, "nsources")), + lp.GlobalArg("targets", None, + shape=(self.dim, "ntargets")), + lp.ValueArg("nsources", None), + lp.ValueArg("ntargets", None)] + + ([lp.GlobalArg("target_to_source", None, shape=("ntargets",))] + if self.exclude_self else []) + + gather_loopy_source_arguments(self.kernels)) - return loopy_knl + def get_kernel(self): + raise NotImplementedError def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array): # FIXME @@ -210,26 +162,56 @@ class SingleSrcTgtListP2PBase(P2PComputationBase): # {{{ P2P point-interaction calculation -class P2P(SingleSrcTgtListP2PBase): - default_name = "p2p_apply" +class P2P(P2PBase): + """Direct applier for P2P interactions.""" - def get_strength_or_not(self, isrc, kernel_idx): - return var("strength").index((self.strength_usage[kernel_idx], isrc)) + default_name = "p2p_apply" - def get_input_and_output_arguments(self): - return [ + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [ lp.GlobalArg("strength", None, - shape="nstrengths,nsources", dim_tags="sep,C"), + shape="nstrengths, nsources", dim_tags="sep,C"), lp.GlobalArg("result", None, - shape="nresults,ntargets", dim_tags="sep,C") - ] + shape="nresults, ntargets", dim_tags="sep,C") + ]) + + loopy_knl = lp.make_kernel([""" + {[itgt, isrc, idim]: \ + 0 <= itgt < ntargets and \ + 0 <= isrc < nsources and \ + 0 <= idim < dim} + """], + self.get_kernel_scaling_assignments() + + ["for itgt, isrc"] + + ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"] + + ["<> is_self = (isrc == target_to_source[itgt])" + if self.exclude_self else ""] + + loopy_insns + kernel_exprs + + [""" + result[{i}, itgt] = knl_{i}_scaling * \ + simul_reduce(sum, isrc, pair_result_{i}) {{inames=itgt}} + """.format(i=iknl) + for iknl in range(len(self.kernels))] + + ["end"], + arguments, + assumptions="nsources>=1 and ntargets>=1", + name=self.name, + fixed_parameters=dict( + dim=self.dim, + nstrengths=self.strength_count, + nresults=len(self.kernels)), + lang_version=MOST_RECENT_LANGUAGE_VERSION) - def get_result_store_instructions(self): - return [""" - result[{i}, itgt] = knl_{i}_scaling \ - * simul_reduce(sum, isrc, pair_result_{i}) {{inames=itgt}} - """.format(i=iknl) - for iknl in range(len(self.kernels))] + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + + for knl in self.kernels: + loopy_knl = knl.prepare_loopy_kernel(loopy_knl) + + return loopy_knl def __call__(self, queue, targets, sources, strength, **kwargs): from pytools.obj_array import is_obj_array @@ -247,26 +229,53 @@ class P2P(SingleSrcTgtListP2PBase): # {{{ P2P matrix writer -class P2PMatrixGenerator(SingleSrcTgtListP2PBase): +class P2PMatrixGenerator(P2PBase): + """Generator for P2P interaction matrix entries.""" + default_name = "p2p_matrix" def get_strength_or_not(self, isrc, kernel_idx): return 1 - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("result_%d" % i, dtype, shape="ntargets,nsources") - for i, dtype in enumerate(self.value_dtypes) - ] - - def get_result_store_instructions(self): - return [ - """ + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [lp.GlobalArg("result_%d" % i, dtype, + shape="ntargets,nsources") + for i, dtype in enumerate(self.value_dtypes)]) + + loopy_knl = lp.make_kernel([""" + {[itgt, isrc, idim]: \ + 0 <= itgt < ntargets and \ + 0 <= isrc < nsources and \ + 0 <= idim < dim} + """], + self.get_kernel_scaling_assignments() + + ["for itgt, isrc"] + + ["<> d[idim] = targets[idim, itgt] - sources[idim, isrc]"] + + ["<> is_self = (isrc == target_to_source[itgt])" + if self.exclude_self else ""] + + loopy_insns + kernel_exprs + + [""" result_{i}[itgt, isrc] = \ knl_{i}_scaling * pair_result_{i} {{inames=isrc:itgt}} """.format(i=iknl) - for iknl in range(len(self.kernels)) - ] + for iknl in range(len(self.kernels))] + + ["end"], + arguments, + assumptions="nsources>=1 and ntargets>=1", + name=self.name, + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + + for knl in self.kernels: + loopy_knl = knl.prepare_loopy_kernel(loopy_knl) + + return loopy_knl def __call__(self, queue, targets, sources, **kwargs): from pytools.obj_array import is_obj_array @@ -281,74 +290,67 @@ class P2PMatrixGenerator(SingleSrcTgtListP2PBase): # }}} -# {{{ +# {{{ P2P matrix block writer -class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase): - default_name = "p2p_block" +class P2PMatrixBlockGenerator(P2PBase): + """Generator for a subset of P2P interaction matrix entries. - def get_src_tgt_arguments(self): - return super(P2PMatrixBlockGenerator, self).get_src_tgt_arguments() \ - + [ - lp.GlobalArg("srcindices", None, shape="nsrcindices"), - lp.GlobalArg("tgtindices", None, shape="ntgtindices"), - lp.GlobalArg("srcranges", None, shape="nranges + 1"), - lp.GlobalArg("tgtranges", None, shape="nranges + 1"), - lp.ValueArg("nsrcindices", np.int32), - lp.ValueArg("ntgtindices", np.int32), - lp.ValueArg("nranges", None) - ] - - def get_domains(self): - return [ - "{[irange]: 0 <= irange < nranges}", - "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}", - "{[idim]: 0 <= idim < dim}" - ] - - def get_loop_begin(self): - return [ - """ - for irange - <> tgtstart = tgtranges[irange] - <> tgtend = tgtranges[irange + 1] - <> tgt_length = tgtend - tgtstart - <> srcstart = srcranges[irange] - <> srcend = srcranges[irange + 1] - <> src_length = srcend - srcstart - for j, k - <> itgt = tgtindices[tgtstart + j] - <> isrc = srcindices[srcstart + k] - """ - ] - - def get_loop_end(self): - return [ - """ - end - end - """ - ] + .. automethod:: __call__ + """ + + default_name = "p2p_block" def get_strength_or_not(self, isrc, kernel_idx): return 1 - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices") - for i, dtype in enumerate(self.value_dtypes) - ] - - def get_result_store_instructions(self): - return [ - """ - result_{i}[tgtstart + j, srcstart + k] = \ - knl_{i}_scaling * pair_result_{i} {{inames=irange}} + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [ + lp.GlobalArg("srcindices", None, shape="nresult"), + lp.GlobalArg("tgtindices", None, shape="nresult"), + lp.ValueArg("nresult", None) + ] + + [lp.GlobalArg("result_%d" % i, dtype, shape="nresult") + for i, dtype in enumerate(self.value_dtypes)]) + + loopy_knl = lp.make_kernel( + "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}", + self.get_kernel_scaling_assignments() + + [""" + for imat + <> d[idim] = targets[idim, tgtindices[imat]] - \ + sources[idim, srcindices[imat]] + """] + + [""" + <> is_self = (srcindices[imat] == + target_to_source[tgtindices[imat]]) + """ if self.exclude_self else ""] + + loopy_insns + kernel_exprs + + [""" + result_{i}[imat] = \ + knl_{i}_scaling * pair_result_{i} \ + {{id_prefix=write_p2p}} """.format(i=iknl) - for iknl in range(len(self.kernels)) - ] + for iknl in range(len(self.kernels))] + + ["end"], + arguments, + assumptions="nresult>=1", + silenced_warnings="write_race(write_p2p*)", + name=self.name, + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + loopy_knl = lp.add_dtypes(loopy_knl, + dict(nsources=np.int32, ntargets=np.int32)) - def get_assumptions(self): - return "nranges>=1" + for knl in self.kernels: + loopy_knl = knl.prepare_loopy_kernel(loopy_knl) + + return loopy_knl def get_optimized_kernel(self, targets_is_obj_array, sources_is_obj_array): # FIXME @@ -359,11 +361,31 @@ class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase): if targets_is_obj_array: knl = lp.tag_array_axes(knl, "targets", "sep,C") - knl = lp.split_iname(knl, "irange", 128, outer_tag="g.0") + knl = lp.split_iname(knl, "imat", 1024, outer_tag="g.0") return knl - def __call__(self, queue, targets, sources, tgtindices, srcindices, - tgtranges, srcranges, **kwargs): + def __call__(self, queue, targets, sources, index_set, **kwargs): + """Construct a set of blocks of the full P2P interaction matrix. + + The blocks are returned as one-dimensional arrays, for performance + and storage reasons. If the two-dimensional form is desired, it can + be obtained using the information in the `index_set` for a block + :math:`i` in the following way: + + .. code-block:: python + + blkranges = index_set.linear_ranges() + blkshape = index_set.block_shape(i) + + block2d = result[blkranges[i]:blkranges[i + 1]].reshape(*blkshape) + + :arg targets: target point coordinates. + :arg sources: source point coordinates. + :arg index_set: a :class:`sumpy.tools.MatrixBlockIndexRanges` used + to define the blocks. + :return: a tuple of one-dimensional arrays of kernel evaluations at + target-source pairs described by `index_set`. + """ from pytools.obj_array import is_obj_array knl = self.get_cached_optimized_kernel( targets_is_obj_array=( @@ -371,111 +393,104 @@ class P2PMatrixBlockGenerator(SingleSrcTgtListP2PBase): sources_is_obj_array=( is_obj_array(sources) or isinstance(sources, (tuple, list)))) - return knl(queue, targets=targets, sources=sources, - tgtindices=tgtindices, srcindices=srcindices, - tgtranges=tgtranges, srcranges=srcranges, **kwargs) + return knl(queue, + targets=targets, + sources=sources, + tgtindices=index_set.linear_row_indices, + srcindices=index_set.linear_col_indices, **kwargs) # }}} # {{{ P2P from CSR-like interaction list -class P2PFromCSR(P2PComputationBase): +class P2PFromCSR(P2PBase): default_name = "p2p_from_csr" def get_kernel(self): loopy_insns, result_names = self.get_loopy_insns_and_result_names() - - from pymbolic import var - exprs = [ - var(name) - * var("strength").index((self.strength_usage[i], var("isrc"))) - for i, name in enumerate(result_names)] - - if self.exclude_self: - from pymbolic.primitives import If, Variable - exprs = [If(Variable("is_self"), 0, expr) for expr in exprs] - - from sumpy.tools import gather_loopy_source_arguments - loopy_knl = lp.make_kernel( + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + [ - "{[itgt_box]: 0<=itgt_box<ntgt_boxes}", - "{[isrc_box]: isrc_box_start<=isrc_box<isrc_box_end}", - "{[itgt,isrc,idim]: \ - itgt_start<=itgt<itgt_end and \ - isrc_start<=isrc<isrc_end and \ - 0<=idim<dim }", - ], + lp.GlobalArg("box_target_starts", + None, shape=None), + lp.GlobalArg("box_target_counts_nonchild", + None, shape=None), + lp.GlobalArg("box_source_starts", + None, shape=None), + lp.GlobalArg("box_source_counts_nonchild", + None, shape=None), + lp.GlobalArg("source_box_starts", + None, shape=None), + lp.GlobalArg("source_box_lists", + None, shape=None), + lp.GlobalArg("strength", None, + shape="nstrengths, nsources", dim_tags="sep,C"), + lp.GlobalArg("result", None, + shape="nkernels, ntargets", dim_tags="sep,C"), + "..." + ]) + + loopy_knl = lp.make_kernel([ + "{[itgt_box]: 0 <= itgt_box < ntgt_boxes}", + "{[isrc_box]: isrc_box_start <= isrc_box < isrc_box_end}", + "{[itgt, isrc, idim]: \ + itgt_start <= itgt < itgt_end and \ + isrc_start <= isrc < isrc_end and \ + 0 <= idim < dim}", + ], self.get_kernel_scaling_assignments() - + [ - """ + + [""" for itgt_box - <> tgt_ibox = target_boxes[itgt_box] - <> itgt_start = box_target_starts[tgt_ibox] - <> itgt_end = itgt_start+box_target_counts_nonchild[tgt_ibox] - - <> isrc_box_start = source_box_starts[itgt_box] - <> isrc_box_end = source_box_starts[itgt_box+1] - - for isrc_box - <> src_ibox = source_box_lists[isrc_box] - <> isrc_start = box_source_starts[src_ibox] - <> isrc_end = isrc_start+box_source_counts_nonchild[src_ibox] - - for itgt - for isrc - <> d[idim] = \ - targets[idim,itgt] - sources[idim,isrc] \ - {dup=idim} - """] + [""" - <> is_self = (isrc == target_to_source[itgt]) - """ if self.exclude_self else ""] + [ - ] + loopy_insns + [ - lp.Assignment(id=None, - assignee="pair_result_%d" % i, expression=expr, - temp_var_type=lp.auto) - for i, expr in enumerate(exprs) - ] + [ - """ - end - """] + [""" - result[KNLIDX, itgt] = result[KNLIDX, itgt] + \ - knl_KNLIDX_scaling \ - * simul_reduce(sum, isrc, pair_result_KNLIDX) - """.replace("KNLIDX", str(iknl)) - for iknl in range(len(exprs))] + [""" - end + <> tgt_ibox = target_boxes[itgt_box] + <> itgt_start = box_target_starts[tgt_ibox] + <> itgt_end = itgt_start + box_target_counts_nonchild[tgt_ibox] + + <> isrc_box_start = source_box_starts[itgt_box] + <> isrc_box_end = source_box_starts[itgt_box+1] + + for isrc_box + <> src_ibox = source_box_lists[isrc_box] + <> isrc_start = box_source_starts[src_ibox] + <> isrc_end = isrc_start + box_source_counts_nonchild[src_ibox] + + for itgt + for isrc + <> d[idim] = \ + targets[idim, itgt] - sources[idim, isrc] {dup=idim} + """] + [""" + <> is_self = (isrc == target_to_source[itgt]) + """ if self.exclude_self else ""] + + loopy_insns + kernel_exprs + + [" end"] + + [""" + result[{i}, itgt] = result[{i}, itgt] + \ + knl_{i}_scaling * simul_reduce(sum, isrc, pair_result_{i}) \ + {{id_prefix=write_csr}} + """.format(i=iknl) + for iknl in range(len(self.kernels))] + + [""" end end - """], - [ - lp.GlobalArg("box_target_starts,box_target_counts_nonchild," - "box_source_starts,box_source_counts_nonchild,", - None, shape=None), - lp.GlobalArg("source_box_starts, source_box_lists,", - None, shape=None), - lp.GlobalArg("strength", None, shape="nstrengths,nsources"), - lp.GlobalArg("result", None, - shape="nkernels,ntargets", dim_tags="sep,c"), - lp.GlobalArg("targets", None, - shape="dim,ntargets", dim_tags="sep,c"), - lp.GlobalArg("sources", None, - shape="dim,nsources", dim_tags="sep,c"), - lp.ValueArg("nsources", np.int32), - lp.ValueArg("ntargets", np.int32), - "...", - ] + ( - [lp.GlobalArg("target_to_source", np.int32, shape=("ntargets",))] - if self.exclude_self else [] - ) + gather_loopy_source_arguments(self.kernels), - name=self.name, assumptions="ntgt_boxes>=1", + end + """], + arguments, + assumptions="ntgt_boxes>=1", + name=self.name, + silenced_warnings="write_race(write_csr*)", fixed_parameters=dict( dim=self.dim, nstrengths=self.strength_count, - nkernels=len(self.kernels))) + nkernels=len(self.kernels)), + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.add_dtypes(loopy_knl, + dict(nsources=np.int32, ntargets=np.int32)) loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") - loopy_knl = lp.tag_array_axes(loopy_knl, "strength", "sep,C") + loopy_knl = lp.tag_array_axes(loopy_knl, "targets", "sep,C") + loopy_knl = lp.tag_array_axes(loopy_knl, "sources", "sep,C") for knl in self.kernels: loopy_knl = knl.prepare_loopy_kernel(loopy_knl) @@ -485,12 +500,14 @@ class P2PFromCSR(P2PComputationBase): def get_optimized_kernel(self): # FIXME knl = self.get_kernel() + import pyopencl as cl dev = self.context.devices[0] if dev.type & cl.device_type.CPU: knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0") else: knl = lp.split_iname(knl, "itgt_box", 4, outer_tag="g.0") + return knl def __call__(self, queue, **kwargs): diff --git a/sumpy/qbx.py b/sumpy/qbx.py index 0f617e62990c4aad8f187b03d22e2ac9c72b46fd..a0814e01c412727a908ec72758e62014b3278708 100644 --- a/sumpy/qbx.py +++ b/sumpy/qbx.py @@ -27,7 +27,7 @@ THE SOFTWARE. import six -from six.moves import range, zip +from six.moves import range import numpy as np import loopy as lp from loopy.version import MOST_RECENT_LANGUAGE_VERSION @@ -49,6 +49,7 @@ QBX for Layer Potentials .. autoclass:: LayerPotentialBase .. autoclass:: LayerPotential .. autoclass:: LayerPotentialMatrixGenerator +.. autoclass:: LayerPotentialMatrixBlockGenerator """ @@ -87,7 +88,7 @@ def expand(expansion_nr, sac, expansion, avec, bvec): class LayerPotentialBase(KernelComputation, KernelCacheWrapper): def __init__(self, ctx, expansions, strength_usage=None, value_dtypes=None, - options=[], name="layerpot", device=None): + options=[], name=None, device=None): KernelComputation.__init__(self, ctx, expansions, strength_usage, value_dtypes, name, options, device) @@ -103,46 +104,8 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper): def expansions(self): return self.kernels - def get_compute_a_and_b_vecs(self): - return """ - <> a[idim] = center[idim,itgt] - src[idim,isrc] {dup=idim} - <> b[idim] = tgt[idim,itgt] - center[idim,itgt] {dup=idim} - <> rscale = expansion_radii[itgt] - """ - - def get_src_tgt_arguments(self): - return [ - lp.GlobalArg("src", None, - shape=(self.dim, "nsources"), order="C"), - lp.GlobalArg("tgt", None, - shape=(self.dim, "ntargets"), order="C"), - lp.GlobalArg("center", None, - shape=(self.dim, "ntargets"), order="C"), - lp.GlobalArg("expansion_radii", None, shape="ntargets"), - lp.ValueArg("nsources", np.int32), - lp.ValueArg("ntargets", np.int32), - ] - - def get_domains(self): - return [ - "{[itgt]: 0 <= itgt < ntargets}", - "{[isrc]: 0 <= isrc < nsources}", - "{[idim]: 0 <= idim < dim}" - ] - - def get_loop_begin(self): - return ["for itgt, isrc"] - - def get_loop_end(self): - return ["end"] - - def get_assumptions(self): - return "nsources>=1 and ntargets>=1" - - @memoize_method - def get_kernel(self): + def get_loopy_insns_and_result_names(self): from sumpy.symbolic import make_sym_vector - avec = make_sym_vector("a", self.dim) bvec = make_sym_vector("b", self.dim) @@ -152,7 +115,7 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper): logger.info("compute expansion expressions: start") result_names = [expand(i, sac, expn, avec, bvec) - for i, expn in enumerate(self.expansions)] + for i, expn in enumerate(self.expansions)] logger.info("compute expansion expressions: done") @@ -168,55 +131,43 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper): complex_dtype=np.complex128 # FIXME ) - isrc_sym = var("isrc") - exprs = [ - var(name) - * self.get_strength_or_not(isrc_sym, i) - for i, name in enumerate(result_names)] + return loopy_insns, result_names - from sumpy.tools import gather_loopy_source_arguments - arguments = ( - self.get_src_tgt_arguments() - + self.get_input_and_output_arguments() - + gather_loopy_source_arguments(self.kernels)) - - loopy_knl = lp.make_kernel( - self.get_domains(), - self.get_kernel_scaling_assignments() - + self.get_loop_begin() - + [self.get_compute_a_and_b_vecs()] - + loopy_insns - + [ - lp.Assignment(id=None, - assignee="pair_result_%d" % i, expression=expr, - temp_var_type=lp.auto) - for i, (expr, dtype) in enumerate(zip(exprs, self.value_dtypes)) - ] - + self.get_loop_end() - + self.get_result_store_instructions(), - arguments, - name=self.name, - assumptions=self.get_assumptions(), - default_offset=lp.auto, - silenced_warnings="write_race(write_lpot*)", - fixed_parameters=dict(dim=self.dim), - lang_version=MOST_RECENT_LANGUAGE_VERSION) + def get_strength_or_not(self, isrc, kernel_idx): + return var("strength_%d" % self.strength_usage[kernel_idx]).index(isrc) - loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + def get_kernel_exprs(self, result_names): + isrc_sym = var("isrc") + exprs = [var(name) * self.get_strength_or_not(isrc_sym, i) + for i, name in enumerate(result_names)] - for expn in self.expansions: - loopy_knl = expn.prepare_loopy_kernel(loopy_knl) + return [lp.Assignment(id=None, + assignee="pair_result_%d" % i, expression=expr, + temp_var_type=lp.auto) + for i, expr in enumerate(exprs)] - loopy_knl = lp.tag_array_axes(loopy_knl, "center", "sep,C") + def get_default_src_tgt_arguments(self): + from sumpy.tools import gather_loopy_source_arguments + return ([ + lp.GlobalArg("src", None, + shape=(self.dim, "nsources"), order="C"), + lp.GlobalArg("tgt", None, + shape=(self.dim, "ntargets"), order="C"), + lp.GlobalArg("center", None, + shape=(self.dim, "ntargets"), dim_tags="sep,C"), + lp.GlobalArg("expansion_radii", + None, shape="ntargets"), + lp.ValueArg("nsources", None), + lp.ValueArg("ntargets", None)] + + gather_loopy_source_arguments(self.kernels)) - return loopy_knl + def get_kernel(self): + raise NotImplementedError - @memoize_method def get_optimized_kernel(self): # FIXME specialize/tune for GPU/CPU loopy_knl = self.get_kernel() - # FIXME: how to tune for blocks? import pyopencl as cl dev = self.context.devices[0] if dev.type & cl.device_type.CPU: @@ -238,32 +189,58 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper): # {{{ direct applier class LayerPotential(LayerPotentialBase): - """Direct applier for the layer potential.""" + """Direct applier for the layer potential. - default_name = "qbx_apply" + .. automethod:: __call__ + """ - def get_strength_or_not(self, isrc, kernel_idx): - return var("strength_%d" % self.strength_usage[kernel_idx]).index(isrc) + default_name = "qbx_apply" - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("strength_%d" % i, None, shape="nsources", order="C") - for i in range(self.strength_count) - ]+[ - lp.GlobalArg("result_%d" % i, None, shape="ntargets", order="C") - for i in range(len(self.kernels)) - ] - - def get_result_store_instructions(self): - return [ - """ - result_{i}[itgt] = \ - knl_{i}_scaling*simul_reduce( - sum, isrc, pair_result_{i}) \ - {{id_prefix=write_lpot,inames=itgt}} + @memoize_method + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [lp.GlobalArg("strength_%d" % i, + None, shape="nsources", order="C") + for i in range(self.strength_count)] + + [lp.GlobalArg("result_%d" % i, + None, shape="ntargets", order="C") + for i in range(len(self.kernels))]) + + loopy_knl = lp.make_kernel([""" + {[itgt, isrc, idim]: \ + 0 <= itgt < ntargets and \ + 0 <= isrc < nsources and \ + 0 <= idim < dim} + """], + self.get_kernel_scaling_assignments() + + ["for itgt, isrc"] + + ["<> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}"] + + ["<> b[idim] = tgt[idim, itgt] - center[idim, itgt] {dup=idim}"] + + ["<> rscale = expansion_radii[itgt]"] + + loopy_insns + kernel_exprs + + [""" + result_{i}[itgt] = knl_{i}_scaling * \ + simul_reduce(sum, isrc, pair_result_{i}) \ + {{id_prefix=write_lpot,inames=itgt}} """.format(i=iknl) - for iknl in range(len(self.expansions)) - ] + for iknl in range(len(self.expansions))] + + ["end"], + arguments, + name=self.name, + assumptions="ntargets>=1 and nsources>=1", + default_offset=lp.auto, + silenced_warnings="write_race(write_lpot*)", + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + for expn in self.expansions: + loopy_knl = expn.prepare_loopy_kernel(loopy_knl) + + return loopy_knl def __call__(self, queue, targets, sources, centers, strengths, expansion_radii, **kwargs): @@ -293,23 +270,50 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase): def get_strength_or_not(self, isrc, kernel_idx): return 1 - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("result_%d" % i, dtype, shape="ntargets,nsources") - for i, dtype in enumerate(self.value_dtypes) - ] - - def get_result_store_instructions(self): - return [ - """ - result_KNLIDX[itgt, isrc] = \ - knl_KNLIDX_scaling*pair_result_KNLIDX {inames=isrc:itgt} - """.replace("KNLIDX", str(iknl)) - for iknl in range(len(self.expansions)) - ] + @memoize_method + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [lp.GlobalArg("result_%d" % i, + dtype, shape="ntargets, nsources", order="C") + for i, dtype in enumerate(self.value_dtypes)]) + + loopy_knl = lp.make_kernel([""" + {[itgt, isrc, idim]: \ + 0 <= itgt < ntargets and \ + 0 <= isrc < nsources and \ + 0 <= idim < dim} + """], + self.get_kernel_scaling_assignments() + + ["for itgt, isrc"] + + ["<> a[idim] = center[idim, itgt] - src[idim, isrc] {dup=idim}"] + + ["<> b[idim] = tgt[idim, itgt] - center[idim, itgt] {dup=idim}"] + + ["<> rscale = expansion_radii[itgt]"] + + loopy_insns + kernel_exprs + + [""" + result_{i}[itgt, isrc] = \ + knl_{i}_scaling * pair_result_{i} \ + {{inames=isrc:itgt}} + """.format(i=iknl) + for iknl in range(len(self.expansions))] + + ["end"], + arguments, + name=self.name, + assumptions="ntargets>=1 and nsources>=1", + default_offset=lp.auto, + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) + + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + for expn in self.expansions: + loopy_knl = expn.prepare_loopy_kernel(loopy_knl) + + return loopy_knl def __call__(self, queue, targets, sources, centers, expansion_radii, **kwargs): - knl = self.get_optimized_kernel() + knl = self.get_cached_optimized_kernel() return knl(queue, src=sources, tgt=targets, center=centers, expansion_radii=expansion_radii, **kwargs) @@ -320,89 +324,97 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase): # {{{ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase): - default_name = "qbx_block" + """Generator for a subset of the layer potential matrix entries. - def get_src_tgt_arguments(self): - return LayerPotentialBase.get_src_tgt_arguments(self) \ - + [ - lp.GlobalArg("srcindices", None, shape="nsrcindices"), - lp.GlobalArg("tgtindices", None, shape="ntgtindices"), - lp.GlobalArg("srcranges", None, shape="nranges + 1"), - lp.GlobalArg("tgtranges", None, shape="nranges + 1"), - lp.ValueArg("nsrcindices", np.int32), - lp.ValueArg("ntgtindices", np.int32), - lp.ValueArg("nranges", None) - ] - - def get_domains(self): - # FIXME: this doesn't work when separating j and k - return [ - "{[irange]: 0 <= irange < nranges}", - "{[j, k]: 0 <= j < tgt_length and 0 <= k < src_length}", - "{[idim]: 0 <= idim < dim}" - ] - - def get_loop_begin(self): - return [ - """ - for irange - <> tgtstart = tgtranges[irange] - <> tgtend = tgtranges[irange + 1] - <> tgt_length = tgtend - tgtstart - <> srcstart = srcranges[irange] - <> srcend = srcranges[irange + 1] - <> src_length = srcend - srcstart - for j, k - <> itgt = tgtindices[tgtstart + j] - <> isrc = srcindices[srcstart + k] - """ - ] - - def get_loop_end(self): - return [ - """ - end - end - """ - ] + .. automethod:: __call__ + """ + + default_name = "qbx_block" def get_strength_or_not(self, isrc, kernel_idx): return 1 - def get_input_and_output_arguments(self): - return [ - lp.GlobalArg("result_%d" % i, dtype, shape="ntgtindices,nsrcindices") - for i, dtype in enumerate(self.value_dtypes) - ] + @memoize_method + def get_kernel(self): + loopy_insns, result_names = self.get_loopy_insns_and_result_names() + kernel_exprs = self.get_kernel_exprs(result_names) + arguments = ( + self.get_default_src_tgt_arguments() + + [ + lp.GlobalArg("srcindices", None, shape="nresult"), + lp.GlobalArg("tgtindices", None, shape="nresult"), + lp.ValueArg("nresult", None) + ] + + [lp.GlobalArg("result_%d" % i, dtype, shape="nresult") + for i, dtype in enumerate(self.value_dtypes)]) + + loopy_knl = lp.make_kernel([ + "{[imat, idim]: 0 <= imat < nresult and 0 <= idim < dim}" + ], + self.get_kernel_scaling_assignments() + + [""" + for imat + <> itgt = tgtindices[imat] + <> isrc = srcindices[imat] + + <> a[idim] = center[idim, tgtindices[imat]] - \ + src[idim, srcindices[imat]] {dup=idim} + <> b[idim] = tgt[idim, tgtindices[imat]] - \ + center[idim, tgtindices[imat]] {dup=idim} + <> rscale = expansion_radii[tgtindices[imat]] + """] + + loopy_insns + kernel_exprs + + [""" + result_{i}[imat] = knl_{i}_scaling * pair_result_{i} \ + {{id_prefix=write_lpot}} + """.format(i=iknl) + for iknl in range(len(self.expansions))] + + ["end"], + arguments, + name=self.name, + assumptions="nresult>=1", + default_offset=lp.auto, + silenced_warnings="write_race(write_lpot*)", + fixed_parameters=dict(dim=self.dim), + lang_version=MOST_RECENT_LANGUAGE_VERSION) - def get_result_store_instructions(self): - return [ - """ - result_KNLIDX[tgtstart + j, srcstart + k] = \ - knl_KNLIDX_scaling*pair_result_KNLIDX {inames=irange} - """.replace("KNLIDX", str(iknl)) - for iknl in range(len(self.expansions)) - ] + loopy_knl = lp.tag_inames(loopy_knl, "idim*:unr") + loopy_knl = lp.add_dtypes(loopy_knl, + dict(nsources=np.int32, ntargets=np.int32)) - def get_assumptions(self): - return "nranges>=1" + for expn in self.expansions: + loopy_knl = expn.prepare_loopy_kernel(loopy_knl) + + return loopy_knl - @memoize_method def get_optimized_kernel(self): - # FIXME loopy_knl = self.get_kernel() - loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0") + loopy_knl = lp.split_iname(loopy_knl, "imat", 1024, outer_tag="g.0") return loopy_knl def __call__(self, queue, targets, sources, centers, expansion_radii, - tgtindices, srcindices, tgtranges, srcranges, **kwargs): - knl = self.get_optimized_kernel() + index_set, **kwargs): + """ + :arg targets: target point coordinates. + :arg sources: source point coordinates. + :arg centers: QBX target expansion centers. + :arg expansion_radii: radii for each expansion center. + :arg index_set: a :class:`sumpy.tools.MatrixBlockIndexRanges` used + to define the blocks. + :return: a tuple of one-dimensional arrays of kernel evaluations at + target-source pairs described by `index_set`. + """ - return knl(queue, src=sources, tgt=targets, center=centers, - expansion_radii=expansion_radii, - tgtindices=tgtindices, srcindices=srcindices, - tgtranges=tgtranges, srcranges=srcranges, **kwargs) + knl = self.get_cached_optimized_kernel() + + return knl(queue, + src=sources, + tgt=targets, + center=centers, + expansion_radii=expansion_radii, + tgtindices=index_set.linear_row_indices, + srcindices=index_set.linear_col_indices, **kwargs) # }}} diff --git a/sumpy/tools.py b/sumpy/tools.py index dd98325bef11bc122c34cf9a34114e5ff19673f0..c5438320517642aad4ef7acb590511e42290d3f1 100644 --- a/sumpy/tools.py +++ b/sumpy/tools.py @@ -1,6 +1,9 @@ from __future__ import division, absolute_import -__copyright__ = "Copyright (C) 2012 Andreas Kloeckner" +__copyright__ = """ +Copyright (C) 2012 Andreas Kloeckner +Copyright (C) 2018 Alexandru Fikl +""" __license__ = """ Permission is hereby granted, free of charge, to any person obtaining a copy @@ -24,10 +27,16 @@ THE SOFTWARE. import six from six.moves import range, zip -from pytools import memoize_method +from pytools import memoize_method, memoize_in import numpy as np import sumpy.symbolic as sym +import pyopencl as cl +import pyopencl.array # noqa + +import loopy as lp +from loopy.version import MOST_RECENT_LANGUAGE_VERSION + import logging logger = logging.getLogger(__name__) @@ -245,6 +254,253 @@ class KernelComputation(object): # }}} +# {{{ + + +def _to_host(x, queue=None): + if isinstance(x, cl.array.Array): + queue = queue or x.queue + return x.get(queue) + return x + + +class BlockIndexRanges(object): + """Convenience class for working with blocks of a global array. + + .. attribute:: indices + + A list of not necessarily continuous or increasing integers + representing the indices of a global array. The individual blocks are + delimited using :attr:`ranges`. + + .. attribute:: ranges + + A list of nondecreasing integers used to index into :attr:`indices`. + A block :math:`i` can be retrieved using + `indices[ranges[i]:ranges[i + 1]]`. + + .. automethod:: block_shape + .. automethod:: get + .. automethod:: take + """ + + def __init__(self, cl_context, indices, ranges): + self.cl_context = cl_context + self.indices = indices + self.ranges = ranges + + @property + @memoize_method + def _ranges(self): + with cl.CommandQueue(self.cl_context) as queue: + return _to_host(self.ranges, queue=queue) + + @property + def nblocks(self): + return self.ranges.shape[0] - 1 + + def block_shape(self, i): + return (self._ranges[i + 1] - self._ranges[i],) + + def block_indices(self, i): + return self.indices[self._ranges[i]:self._ranges[i + 1]] + + def get(self, queue=None): + return BlockIndexRanges(self.cl_context, + _to_host(self.indices, queue=queue), + _to_host(self.ranges, queue=queue)) + + def take(self, x, i): + """Return the subset of a global array `x` that is defined by + the :attr:`indices` in block :math:`i`. + """ + + return x[self.block_indices(i)] + + +class MatrixBlockIndexRanges(object): + """Keep track of different ways to index into matrix blocks. + + .. attribute:: row + + A :class:`BlockIndexRanges` encapsulating row block indices. + + .. attribute:: col + + A :class:`BlockIndexRanges` encapsulating column block indices. + + .. automethod:: block_shape + .. automethod:: block_take + .. automethod:: get + .. automethod:: take + + """ + + def __init__(self, cl_context, row, col): + self.cl_context = cl_context + self.row = row + self.col = col + assert self.row.nblocks == self.col.nblocks + + self.blkranges = np.cumsum([0] + [ + self.row.block_shape(i)[0] * self.col.block_shape(i)[0] + for i in range(self.row.nblocks)]) + + if isinstance(self.row.indices, cl.array.Array): + with cl.CommandQueue(self.cl_context) as queue: + self.blkranges = \ + cl.array.to_device(queue, self.blkranges).with_queue(None) + + @property + def nblocks(self): + return self.row.nblocks + + def block_shape(self, i): + return self.row.block_shape(i) + self.col.block_shape(i) + + def block_indices(self, i): + return (self.row.block_indices(i), + self.col.block_indices(i)) + + @property + def linear_row_indices(self): + r, _ = self._linear_indices() + return r + + @property + def linear_col_indices(self): + _, c = self._linear_indices() + return c + + @property + def linear_ranges(self): + return self.blkranges + + def get(self, queue=None): + """Transfer data to the host. Only the initial given data is + transfered, not the arrays returned by :meth:`linear_row_indices` and + friends. + + :return: a copy of `self` in which all data lives on the host, i.e. + all :class:`pyopencl.array.Array` instances are replaces by + :class:`numpy.ndarray` instances. + """ + return MatrixBlockIndexRanges(self.cl_context, + row=self.row.get(queue=queue), + col=self.col.get(queue=queue)) + + def take(self, x, i): + """Retrieve a block from a global matrix. + + :arg x: a 2D :class:`numpy.ndarray`. + :arg i: block index. + :return: requested block from the matrix. + """ + + if isinstance(self.row.indices, cl.array.Array) or \ + isinstance(self.col.indices, cl.array.Array): + raise ValueError("CL `Array`s are not supported." + "Use MatrixBlockIndexRanges.get() and then view into matrices.") + + irow, icol = self.block_indices(i) + return x[np.ix_(irow, icol)] + + def block_take(self, x, i): + """Retrieve a block from a linear representation of the matrix blocks. + A linear representation of the matrix blocks can be obtained, or + should be consistent with + + .. code-block:: python + + i = index.linear_row_indices() + j = index.linear_col_indices() + linear_blks = global_mat[i, j] + + for k in range(index.nblocks): + assert np.allclose(index.block_take(linear_blks, k), + index.take(global_mat, k)) + + :arg x: a 1D :class:`numpy.ndarray`. + :arg i: block index. + :return: requested block, reshaped into a 2D array. + """ + + iblk = np.s_[self.blkranges[i]:self.blkranges[i + 1]] + return x[iblk].reshape(*self.block_shape(i)) + + @memoize_method + def _linear_indices(self): + """ + :return: a tuple of `(rowindices, colindices)` that can be + used to provide linear indexing into a set of matrix blocks. These + index arrays are just the concatenated Cartesian products of all + the block arrays described by :attr:`row` and :attr:`col`. + + They can be used to index directly into a matrix as follows: + + .. code-block:: python + + mat[rowindices[blkranges[i]:blkranges[i + 1]], + colindices[blkranges[i]:blkranges[i + 1]]] + + The same block can be obtained more easily using + + .. code-block:: python + + index.view(mat, i).reshape(-1) + """ + + @memoize_in(self, "block_index_knl") + def _build_index(): + loopy_knl = lp.make_kernel([ + "{[irange]: 0 <= irange < nranges}", + "{[itgt, isrc]: 0 <= itgt < ntgtblock and 0 <= isrc < nsrcblock}" + ], + """ + for irange + <> ntgtblock = tgtranges[irange + 1] - tgtranges[irange] + <> nsrcblock = srcranges[irange + 1] - srcranges[irange] + + for itgt, isrc + <> imat = blkranges[irange] + (nsrcblock * itgt + isrc) + + rowindices[imat] = tgtindices[tgtranges[irange] + itgt] \ + {id_prefix=write_index} + colindices[imat] = srcindices[srcranges[irange] + isrc] \ + {id_prefix=write_index} + end + end + """, + [ + lp.GlobalArg('blkranges', None, shape="nranges + 1"), + lp.GlobalArg("rowindices", None, shape="nresults"), + lp.GlobalArg("colindices", None, shape="nresults"), + lp.ValueArg("nresults", None), + '...' + ], + name="block_index_knl", + default_offset=lp.auto, + assumptions='nranges>=1', + silenced_warnings="write_race(write_index*)", + lang_version=MOST_RECENT_LANGUAGE_VERSION) + loopy_knl = lp.split_iname(loopy_knl, "irange", 128, outer_tag="g.0") + + return loopy_knl + + with cl.CommandQueue(self.cl_context) as queue: + _, (rowindices, colindices) = _build_index()(queue, + tgtindices=self.row.indices, + srcindices=self.col.indices, + tgtranges=self.row.ranges, + srcranges=self.col.ranges, + blkranges=self.blkranges, + nresults=_to_host(self.blkranges[-1], queue=queue)) + return (rowindices.with_queue(None), + colindices.with_queue(None)) + +# }}} + + # {{{ OrderedSet # Source: http://code.activestate.com/recipes/576694-orderedset/ diff --git a/test/test_fmm.py b/test/test_fmm.py index 0331db6c696043b2efb8fe27286a2d48cbd16b59..71e3f044432b4bd612337bdd55d56e421540bcc4 100644 --- a/test/test_fmm.py +++ b/test/test_fmm.py @@ -234,6 +234,62 @@ def test_sumpy_fmm(ctx_getter, knl, local_expn_class, mpole_expn_class): pconv_verifier() +def test_sumpy_fmm_timing_data_collection(ctx_getter): + logging.basicConfig(level=logging.INFO) + + ctx = ctx_getter() + queue = cl.CommandQueue( + ctx, + properties=cl.command_queue_properties.PROFILING_ENABLE) + + nsources = 500 + dtype = np.float64 + + from boxtree.tools import ( + make_normal_particle_array as p_normal) + + knl = LaplaceKernel(2) + local_expn_class = VolumeTaylorLocalExpansion + mpole_expn_class = VolumeTaylorMultipoleExpansion + order = 1 + + sources = p_normal(queue, nsources, knl.dim, dtype, seed=15) + + from boxtree import TreeBuilder + tb = TreeBuilder(ctx) + + tree, _ = tb(queue, sources, + max_particles_in_box=30, debug=True) + + from boxtree.traversal import FMMTraversalBuilder + tbuild = FMMTraversalBuilder(ctx) + trav, _ = tbuild(queue, tree, debug=True) + + from pyopencl.clrandom import PhiloxGenerator + rng = PhiloxGenerator(ctx) + weights = rng.uniform(queue, nsources, dtype=np.float64) + + out_kernels = [knl] + + from functools import partial + + from sumpy.fmm import SumpyExpansionWranglerCodeContainer + wcc = SumpyExpansionWranglerCodeContainer( + ctx, + partial(mpole_expn_class, knl), + partial(local_expn_class, knl), + out_kernels) + + wrangler = wcc.get_wrangler(queue, tree, dtype, + fmm_level_to_order=lambda kernel, kernel_args, tree, lev: order) + from boxtree.fmm import drive_fmm + + timing_data = {} + pot, = drive_fmm(trav, wrangler, weights, timing_data=timing_data) + print(timing_data) + assert timing_data + + def test_sumpy_fmm_exclude_self(ctx_getter): logging.basicConfig(level=logging.INFO) diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py index 4e2b3ad7c449ba9acb37955cd167d94fe3c80d83..2f3aabc3082b99707ab86b5461390e866350fdc3 100644 --- a/test/test_matrixgen.py +++ b/test/test_matrixgen.py @@ -26,11 +26,17 @@ import sys import numpy as np import numpy.linalg as la -import pytest import pyopencl as cl +import pyopencl.array # noqa + +from sumpy.tools import vector_to_device +from sumpy.tools import MatrixBlockIndexRanges + +import pytest from pyopencl.tools import ( # noqa pytest_generate_tests_for_pyopencl as pytest_generate_tests) + import logging logger = logging.getLogger(__name__) @@ -42,7 +48,7 @@ else: faulthandler.enable() -def create_arguments(n, mode, target_radius=1.0): +def _build_geometry(queue, n, mode, target_radius=1.0): # parametrize circle t = np.linspace(0.0, 2.0 * np.pi, n, endpoint=False) unit_circle = np.exp(1j * t) @@ -60,133 +66,183 @@ def create_arguments(n, mode, target_radius=1.0): centers = unit_circle * (1.0 - radius) expansion_radii = radius * np.ones(n) - return targets, sources, centers, sigma, expansion_radii + return (cl.array.to_device(queue, targets), + cl.array.to_device(queue, sources), + vector_to_device(queue, centers), + cl.array.to_device(queue, expansion_radii), + cl.array.to_device(queue, sigma)) + + +def _build_block_index(queue, nnodes, nblks, factor): + indices = np.arange(0, nnodes) + ranges = np.arange(0, nnodes + 1, nnodes // nblks) + + if abs(factor - 1.0) < 1.0e-14: + ranges_ = ranges + indices_ = indices + else: + indices_ = np.empty(ranges.shape[0] - 1, dtype=np.object) + for i in range(ranges.shape[0] - 1): + iidx = indices[np.s_[ranges[i]:ranges[i + 1]]] + indices_[i] = np.sort(np.random.choice(iidx, + size=int(factor * len(iidx)), replace=False)) + + ranges_ = np.cumsum([0] + [r.shape[0] for r in indices_]) + indices_ = np.hstack(indices_) + + from sumpy.tools import BlockIndexRanges + return BlockIndexRanges(queue.context, + cl.array.to_device(queue, indices_).with_queue(None), + cl.array.to_device(queue, ranges_).with_queue(None)) -def test_qbx_direct(ctx_getter): - # This evaluates a single layer potential on a circle. +@pytest.mark.parametrize('factor', [1.0, 0.6]) +@pytest.mark.parametrize('lpot_id', [1, 2]) +def test_qbx_direct(ctx_getter, factor, lpot_id): logging.basicConfig(level=logging.INFO) ctx = ctx_getter() queue = cl.CommandQueue(ctx) - from sumpy.kernel import LaplaceKernel - lknl = LaplaceKernel(2) - + ndim = 2 nblks = 10 order = 12 mode_nr = 25 + from sumpy.kernel import LaplaceKernel, DirectionalSourceDerivative + if lpot_id == 1: + knl = LaplaceKernel(ndim) + elif lpot_id == 2: + knl = LaplaceKernel(ndim) + knl = DirectionalSourceDerivative(knl, dir_vec_name="dsource_vec") + else: + raise ValueError("unknow lpot_id") + + from sumpy.expansion.local import LineTaylorLocalExpansion + lknl = LineTaylorLocalExpansion(knl, order) + from sumpy.qbx import LayerPotential + lpot = LayerPotential(ctx, [lknl]) + from sumpy.qbx import LayerPotentialMatrixGenerator + mat_gen = LayerPotentialMatrixGenerator(ctx, [lknl]) + from sumpy.qbx import LayerPotentialMatrixBlockGenerator - from sumpy.expansion.local import LineTaylorLocalExpansion - lpot = LayerPotential(ctx, [LineTaylorLocalExpansion(lknl, order)]) - mat_gen = LayerPotentialMatrixGenerator(ctx, - [LineTaylorLocalExpansion(lknl, order)]) - blk_gen = LayerPotentialMatrixBlockGenerator(ctx, - [LineTaylorLocalExpansion(lknl, order)]) + blk_gen = LayerPotentialMatrixBlockGenerator(ctx, [lknl]) for n in [200, 300, 400]: - targets, sources, centers, sigma, expansion_radii = \ - create_arguments(n, mode_nr) + targets, sources, centers, expansion_radii, sigma = \ + _build_geometry(queue, n, mode_nr) h = 2 * np.pi / n strengths = (sigma * h,) - tgtindices = np.arange(0, n) - tgtindices = np.random.choice(tgtindices, size=int(0.8 * n)) - tgtranges = np.arange(0, tgtindices.shape[0] + 1, - tgtindices.shape[0] // nblks) - srcindices = np.arange(0, n) - srcindices = np.random.choice(srcindices, size=int(0.8 * n)) - srcranges = np.arange(0, srcindices.shape[0] + 1, - srcindices.shape[0] // nblks) - assert tgtranges.shape == srcranges.shape - - _, (mat,) = mat_gen(queue, targets, sources, centers, - expansion_radii) - result_mat = mat.dot(strengths[0]) + tgtindices = _build_block_index(queue, n, nblks, factor) + srcindices = _build_block_index(queue, n, nblks, factor) + index_set = MatrixBlockIndexRanges(ctx, tgtindices, srcindices) - _, (result_lpot,) = lpot(queue, targets, sources, centers, strengths, - expansion_radii) + extra_kwargs = {} + if lpot_id == 2: + extra_kwargs["dsource_vec"] = \ + vector_to_device(queue, np.ones((ndim, n))) + + _, (result_lpot,) = lpot(queue, + targets=targets, + sources=sources, + centers=centers, + expansion_radii=expansion_radii, + strengths=strengths, **extra_kwargs) + result_lpot = result_lpot.get() + + _, (mat,) = mat_gen(queue, + targets=targets, + sources=sources, + centers=centers, + expansion_radii=expansion_radii, **extra_kwargs) + mat = mat.get() + result_mat = mat.dot(strengths[0].get()) + + _, (blk,) = blk_gen(queue, + targets=targets, + sources=sources, + centers=centers, + expansion_radii=expansion_radii, + index_set=index_set, **extra_kwargs) + blk = blk.get() + + rowindices = index_set.linear_row_indices.get(queue) + colindices = index_set.linear_col_indices.get(queue) eps = 1.0e-10 * la.norm(result_lpot) assert la.norm(result_mat - result_lpot) < eps - - _, (blk,) = blk_gen(queue, targets, sources, centers, expansion_radii, - tgtindices, srcindices, tgtranges, srcranges) - - for i in range(srcranges.shape[0] - 1): - itgt = np.s_[tgtranges[i]:tgtranges[i + 1]] - isrc = np.s_[srcranges[i]:srcranges[i + 1]] - block = np.ix_(tgtindices[itgt], srcindices[isrc]) - - eps = 1.0e-10 * la.norm(mat[block]) - assert la.norm(blk[itgt, isrc] - mat[block]) < eps + assert la.norm(blk - mat[rowindices, colindices]) < eps @pytest.mark.parametrize("exclude_self", [True, False]) -def test_p2p_direct(ctx_getter, exclude_self): - # This evaluates a single layer potential on a circle. +@pytest.mark.parametrize("factor", [1.0, 0.6]) +def test_p2p_direct(ctx_getter, exclude_self, factor): logging.basicConfig(level=logging.INFO) ctx = ctx_getter() queue = cl.CommandQueue(ctx) - from sumpy.kernel import LaplaceKernel - lknl = LaplaceKernel(2) - + ndim = 2 nblks = 10 mode_nr = 25 + from sumpy.kernel import LaplaceKernel + lknl = LaplaceKernel(ndim) + from sumpy.p2p import P2P - from sumpy.p2p import P2PMatrixGenerator, P2PMatrixBlockGenerator lpot = P2P(ctx, [lknl], exclude_self=exclude_self) + + from sumpy.p2p import P2PMatrixGenerator mat_gen = P2PMatrixGenerator(ctx, [lknl], exclude_self=exclude_self) + + from sumpy.p2p import P2PMatrixBlockGenerator blk_gen = P2PMatrixBlockGenerator(ctx, [lknl], exclude_self=exclude_self) for n in [200, 300, 400]: - targets, sources, _, sigma, _ = \ - create_arguments(n, mode_nr, target_radius=1.2) + targets, sources, _, _, sigma = \ + _build_geometry(queue, n, mode_nr, target_radius=1.2) h = 2 * np.pi / n strengths = (sigma * h,) - tgtindices = np.arange(0, n) - tgtindices = np.random.choice(tgtindices, size=int(0.8 * n)) - tgtranges = np.arange(0, tgtindices.shape[0] + 1, - tgtindices.shape[0] // nblks) - srcindices = np.arange(0, n) - srcindices = np.random.choice(srcindices, size=int(0.8 * n)) - srcranges = np.arange(0, srcindices.shape[0] + 1, - srcindices.shape[0] // nblks) - assert tgtranges.shape == srcranges.shape + tgtindices = _build_block_index(queue, n, nblks, factor) + srcindices = _build_block_index(queue, n, nblks, factor) + index_set = MatrixBlockIndexRanges(ctx, tgtindices, srcindices) extra_kwargs = {} if exclude_self: - extra_kwargs["target_to_source"] = np.arange(n, dtype=np.int32) - - _, (mat,) = mat_gen(queue, targets, sources, **extra_kwargs) - result_mat = mat.dot(strengths[0]) - - _, (result_lpot,) = lpot(queue, targets, sources, strengths, - **extra_kwargs) + extra_kwargs["target_to_source"] = \ + cl.array.arange(queue, 0, n, dtype=np.int) + + _, (result_lpot,) = lpot(queue, + targets=targets, + sources=sources, + strength=strengths, **extra_kwargs) + result_lpot = result_lpot.get() + + _, (mat,) = mat_gen(queue, + targets=targets, + sources=sources, **extra_kwargs) + mat = mat.get() + result_mat = mat.dot(strengths[0].get()) + + _, (blk,) = blk_gen(queue, + targets=targets, + sources=sources, + index_set=index_set, **extra_kwargs) + blk = blk.get() eps = 1.0e-10 * la.norm(result_lpot) assert la.norm(result_mat - result_lpot) < eps - _, (blk,) = blk_gen(queue, targets, sources, - tgtindices, srcindices, tgtranges, srcranges, - **extra_kwargs) - - for i in range(srcranges.shape[0] - 1): - itgt = np.s_[tgtranges[i]:tgtranges[i + 1]] - isrc = np.s_[srcranges[i]:srcranges[i + 1]] - block = np.ix_(tgtindices[itgt], srcindices[isrc]) - - eps = 1.0e-10 * la.norm(mat[block]) - assert la.norm(blk[itgt, isrc] - mat[block]) < eps + index_set = index_set.get(queue) + for i in range(index_set.nblocks): + assert la.norm(index_set.block_take(blk, i) - + index_set.take(mat, i)) < eps # You can test individual routines by typing