diff --git a/.editorconfig b/.editorconfig
new file mode 100644
index 0000000000000000000000000000000000000000..817e87ab7d3b043048a79d999f8db6f19e940c8a
--- /dev/null
+++ b/.editorconfig
@@ -0,0 +1,32 @@
+# https://editorconfig.org/
+# https://github.com/editorconfig/editorconfig-vim
+# https://github.com/editorconfig/editorconfig-emacs
+
+root = true
+
+[*]
+indent_style = space
+end_of_line = lf
+charset = utf-8
+trim_trailing_whitespace = true
+insert_final_newline = true
+
+[*.py]
+indent_size = 4
+
+[*.rst]
+indent_size = 4
+
+[*.cpp]
+indent_size = 2
+
+[*.hpp]
+indent_size = 2
+
+# There may be one in doc/
+[Makefile]
+indent_style = tab
+
+# https://github.com/microsoft/vscode/issues/1679
+[*.md]
+trim_trailing_whitespace = false
\ No newline at end of file
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
new file mode 100644
index 0000000000000000000000000000000000000000..149f0473317cd4ffbe6e1f35ed5d2a8f02105369
--- /dev/null
+++ b/.github/workflows/ci.yml
@@ -0,0 +1,50 @@
+name: CI
+on:
+    push:
+        branches:
+        - master
+    pull_request:
+        paths-ignore:
+        - 'doc/*.rst'
+    schedule:
+        - cron:  '17 3 * * 0'
+
+jobs:
+    flake8:
+        name: Flake8
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v2
+        -
+            uses: actions/setup-python@v1
+            with:
+                python-version: '3.x' 
+        -   name: "Main Script"
+            run: |
+                curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh
+                . ./prepare-and-run-flake8.sh sumpy test
+
+    pytest3:
+        name: Conda Pytest Py3
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v2
+        -   name: "Main Script"
+            run: |
+                grep -v symengine .test-conda-env-py3.yml > .test-conda-env.yml
+                CONDA_ENVIRONMENT=.test-conda-env.yml
+                curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
+                . ./build-and-test-py-project-within-miniconda.sh
+
+    pytest3symengine:
+        name: Conda Pytest Py3 Symengine
+        runs-on: ubuntu-latest
+        steps:
+        -   uses: actions/checkout@v2
+        -   name: "Main Script"
+            run: |
+                CONDA_ENVIRONMENT=.test-conda-env-py3.yml
+                curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
+                . ./build-and-test-py-project-within-miniconda.sh
+
+# vim: sw=4
diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index d74930eb8b76f1a0ee99701eb03a8e4c69ca1d7f..e1579ae818bfedb952d960b231627eeadf413695 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -17,26 +17,10 @@
 #    reports:
 #      junit: test/pytest.xml
 
-Python 2.7 POCL:
-  script:
-  - export PY_EXE=python2.7
-  - export PYOPENCL_TEST=portable
-  - export EXTRA_INSTALL="pybind11 numpy mako"
-  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
-  - ". ./build-and-test-py-project.sh"
-  tags:
-  - python2.7
-  - pocl
-  except:
-  - tags
-  artifacts:
-    reports:
-      junit: test/pytest.xml
-
 Python 3 POCL:
   script:
   - export PY_EXE=python3
-  - export PYOPENCL_TEST=portable
+  - export PYOPENCL_TEST=portable:pthread
   - export EXTRA_INSTALL="pybind11 numpy mako"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project.sh
   - ". ./build-and-test-py-project.sh"
@@ -66,16 +50,34 @@ Python 3 Titan X:
     reports:
       junit: test/pytest.xml
 
-Python 3.6 Conda:
+Python 3 Conda:
   script:
   # Disable caching to ensure SymEngine code generation is exercised.
   - export SUMPY_NO_CACHE=1
   - export SUMPY_FORCE_SYMBOLIC_BACKEND=symengine
+  - export PYOPENCL_TEST=portable:pthread
   - CONDA_ENVIRONMENT=.test-conda-env-py3.yml
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
   - ". ./build-and-test-py-project-within-miniconda.sh"
   tags:
-  - linux
+  - porter
+  except:
+  - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
+Python 3.6 POCL Titan X:
+  script:
+  # Disable caching to ensure SymEngine code generation is exercised.
+  - export SUMPY_NO_CACHE=1
+  - export SUMPY_FORCE_SYMBOLIC_BACKEND=symengine
+  - export PYOPENCL_TEST=portable:titan
+  - CONDA_ENVIRONMENT=.test-conda-env-py3.yml
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
+  - ". ./build-and-test-py-project-within-miniconda.sh"
+  tags:
+  - nvidia-titan-x
   except:
   - tags
   artifacts:
@@ -106,7 +108,7 @@ Benchmarks:
   - sed 's/python=3/python=3.6/' .test-conda-env-py3.yml > .test-conda-env.yml
   - CONDA_ENVIRONMENT=.test-conda-env.yml
   - PROJECT=sumpy
-  - PYOPENCL_TEST=portable
+  - PYOPENCL_TEST=portable:pthread
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-benchmark-py-project.sh
   - ". ./build-and-benchmark-py-project.sh"
   tags:
diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml
index aa8ae80adf6740131933528c34ecb98d8d7fb0c3..f9ede8fcf96f730ed9d14ce03d0504b81ab7ee13 100644
--- a/.test-conda-env-py3.yml
+++ b/.test-conda-env-py3.yml
@@ -1,19 +1,18 @@
 name: test-conda-env-py3
 channels:
-- inducer
 - conda-forge
-- defaults
+- nodefaults
 
 dependencies:
 - git
-- conda-forge::numpy
-- conda-forge::sympy
+- numpy
+- sympy
 - pocl
+- pocl-cuda
 - islpy
 - pyopencl
 - python=3
-- symengine=0.3.0
-- python-symengine=0.3.0
+- python-symengine=0.6.0
 - pyfmmlib
 
 - pip
diff --git a/README.rst b/README.rst
index 884bef17cbaea4675daced30e117881e603fb7dd..63c0a5e2599bad6c7792ae07b80f3eff81913a24 100644
--- a/README.rst
+++ b/README.rst
@@ -4,9 +4,9 @@ sumpy: n-body kernels and translation operators
 .. image:: https://gitlab.tiker.net/inducer/sumpy/badges/master/pipeline.svg
     :alt: Gitlab Build Status
     :target: https://gitlab.tiker.net/inducer/sumpy/commits/master
-.. image:: https://dev.azure.com/ak-spam/inducer/_apis/build/status/inducer.sumpy?branchName=master
-    :alt: Azure Build Status
-    :target: https://dev.azure.com/ak-spam/inducer/_build/latest?definitionId=17&branchName=master
+.. image:: https://github.com/inducer/sumpy/workflows/CI/badge.svg?branch=master&event=push
+    :alt: Github Build Status
+    :target: https://github.com/inducer/sumpy/actions?query=branch%3Amaster+workflow%3ACI+event%3Apush
 .. image:: https://badge.fury.io/py/sumpy.png
     :alt: Python Package Index Release Page
     :target: https://pypi.org/project/sumpy/
diff --git a/asv.conf.json b/asv.conf.json
index 9632c4fb70310c528b43e3d055a96d96875e629f..6d6fd675c48cbdf02e0bc8448f22890dde2a3e26 100644
--- a/asv.conf.json
+++ b/asv.conf.json
@@ -68,7 +68,7 @@
     // },
     "matrix": {
         "numpy" : [""],
-        "sympy" : ["1.0"],
+        "sympy" : ["1.4"],
         "pyopencl" : [""],
         "islpy" : [""],
         "pocl" : [""],
diff --git a/azure-pipelines.yml b/azure-pipelines.yml
deleted file mode 100644
index 9f1f92a26ed125d3614d9a46d3436711e0b2b942..0000000000000000000000000000000000000000
--- a/azure-pipelines.yml
+++ /dev/null
@@ -1,96 +0,0 @@
-jobs:
--
-    job: 'Python2'
-    pool:
-        vmImage: 'ubuntu-latest'
-
-    steps:
-    -
-        script: |
-            set -e
-            sed 's/python=3/python=2.7/' .test-conda-env-py3.yml > .test-conda-env-py2-pre.yml
-            grep -v symengine .test-conda-env-py2-pre.yml > .test-conda-env-py2.yml
-            cat .test-conda-env-py2.yml
-            CONDA_ENVIRONMENT=.test-conda-env-py2.yml
-            curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
-            . ./build-and-test-py-project-within-miniconda.sh
-
-        displayName: 'Pytest Conda'
-    -
-        task: PublishTestResults@2
-        inputs:
-            testResultsFormat: 'JUnit'
-            testResultsFiles: 'test/pytest.xml'
-
--
-    job: 'Python3'
-    pool:
-        vmImage: 'ubuntu-latest'
-
-    steps:
-    -
-        script: |
-            set -e
-            grep -v symengine .test-conda-env-py3.yml > .test-conda-env.yml
-            CONDA_ENVIRONMENT=.test-conda-env.yml
-            curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
-            . ./build-and-test-py-project-within-miniconda.sh
-
-        displayName: 'Pytest Conda'
-    -
-        task: PublishTestResults@2
-        inputs:
-            testResultsFormat: 'JUnit'
-            testResultsFiles: 'test/pytest.xml'
-
--
-    job: 'Python3Symengine'
-    pool:
-        vmImage: 'ubuntu-latest'
-
-    steps:
-    -
-        script: |
-            set -e
-            CONDA_ENVIRONMENT=.test-conda-env-py3.yml
-            curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
-            . ./build-and-test-py-project-within-miniconda.sh
-
-        displayName: 'Pytest Conda'
-
-    -
-        task: PublishTestResults@2
-        inputs:
-            testResultsFormat: 'JUnit'
-            testResultsFiles: 'test/pytest.xml'
-
--
-    job: 'Flake8'
-    pool:
-        vmImage: 'ubuntu-latest'
-    strategy:
-        matrix:
-            Python37:
-                python.version: '3.7'
-
-    steps:
-    -
-        task: UsePythonVersion@0
-        inputs:
-            versionSpec: '$(python.version)'
-
-    -
-        script: |
-            set -e
-            curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/prepare-and-run-flake8.sh
-            . ./prepare-and-run-flake8.sh sumpy test
-
-        displayName: 'Flake8'
-
-schedules:
--
-    cron: "0 0 * * 0"
-    displayName: Weekly build
-    branches:
-        include:
-        - master
diff --git a/setup.py b/setup.py
index 9457bbdc269697f61153e755a930efc9fccae41d..28130655fab89415905e4532d408a3485025243e 100644
--- a/setup.py
+++ b/setup.py
@@ -91,6 +91,7 @@ setup(name="sumpy",
       license="MIT",
       packages=["sumpy", "sumpy.expansion"],
 
+      python_requires="~=3.6",
       install_requires=[
           "pytools>=2018.2",
           "loo.py>=2017.2",
diff --git a/sumpy/expansion/local.py b/sumpy/expansion/local.py
index a58c598173a352afaa92d6a6d06855ca07275457..b4038796c6926c34e97bfbf856eaac48e23616ea 100644
--- a/sumpy/expansion/local.py
+++ b/sumpy/expansion/local.py
@@ -382,10 +382,10 @@ class _FourierBesselLocalExpansion(LocalExpansionBase):
         source_angle_rel_center = sym.atan2(-avec[1], -avec[0])
         avec_len = sym_real_norm_2(avec)
         return [self.kernel.postprocess_at_source(
-                    hankel_1(l, arg_scale * avec_len)
-                    * rscale ** abs(l)
-                    * sym.exp(sym.I * l * source_angle_rel_center), avec)
-                    for l in self.get_coefficient_identifiers()]
+                    hankel_1(c, arg_scale * avec_len)
+                    * rscale ** abs(c)
+                    * sym.exp(sym.I * c * source_angle_rel_center), avec)
+                    for c in self.get_coefficient_identifiers()]
 
     def evaluate(self, coeffs, bvec, rscale, sac=None):
         if not self.use_rscale:
@@ -398,12 +398,12 @@ class _FourierBesselLocalExpansion(LocalExpansionBase):
 
         arg_scale = self.get_bessel_arg_scaling()
 
-        return sum(coeffs[self.get_storage_index(l)]
+        return sum(coeffs[self.get_storage_index(c)]
                    * self.kernel.postprocess_at_target(
-                       bessel_j(l, arg_scale * bvec_len)
-                       / rscale ** abs(l)
-                       * sym.exp(sym.I * l * -target_angle_rel_center), bvec)
-                for l in self.get_coefficient_identifiers())
+                       bessel_j(c, arg_scale * bvec_len)
+                       / rscale ** abs(c)
+                       * sym.exp(sym.I * c * -target_angle_rel_center), bvec)
+                for c in self.get_coefficient_identifiers())
 
     def translate_from(self, src_expansion, src_coeff_exprs, src_rscale,
             dvec, tgt_rscale):
@@ -420,13 +420,13 @@ class _FourierBesselLocalExpansion(LocalExpansionBase):
             bessel_j = sym.Function("bessel_j")
             new_center_angle_rel_old_center = sym.atan2(dvec[1], dvec[0])
             translated_coeffs = []
-            for l in self.get_coefficient_identifiers():
+            for j in self.get_coefficient_identifiers():
                 translated_coeffs.append(
                     sum(src_coeff_exprs[src_expansion.get_storage_index(m)]
-                        * bessel_j(m - l, arg_scale * dvec_len)
+                        * bessel_j(m - j, arg_scale * dvec_len)
                         / src_rscale ** abs(m)
-                        * tgt_rscale ** abs(l)
-                        * sym.exp(sym.I * (m - l) * -new_center_angle_rel_old_center)
+                        * tgt_rscale ** abs(j)
+                        * sym.exp(sym.I * (m - j) * -new_center_angle_rel_old_center)
                     for m in src_expansion.get_coefficient_identifiers()))
             return translated_coeffs
 
@@ -435,14 +435,14 @@ class _FourierBesselLocalExpansion(LocalExpansionBase):
             hankel_1 = sym.Function("hankel_1")
             new_center_angle_rel_old_center = sym.atan2(dvec[1], dvec[0])
             translated_coeffs = []
-            for l in self.get_coefficient_identifiers():
+            for j in self.get_coefficient_identifiers():
                 translated_coeffs.append(
                     sum(
-                        (-1) ** l
-                        * hankel_1(m + l, arg_scale * dvec_len)
+                        (-1) ** j
+                        * hankel_1(m + j, arg_scale * dvec_len)
                         * src_rscale ** abs(m)
-                        * tgt_rscale ** abs(l)
-                        * sym.exp(sym.I * (m + l) * new_center_angle_rel_old_center)
+                        * tgt_rscale ** abs(j)
+                        * sym.exp(sym.I * (m + j) * new_center_angle_rel_old_center)
                         * src_coeff_exprs[src_expansion.get_storage_index(m)]
                         for m in src_expansion.get_coefficient_identifiers()))
             return translated_coeffs
diff --git a/sumpy/expansion/multipole.py b/sumpy/expansion/multipole.py
index 1a53aa40f15f78fb3d94c1b2f8bb9932d6daba90..3328286e1b7117f0c083f566166faa05f864817a 100644
--- a/sumpy/expansion/multipole.py
+++ b/sumpy/expansion/multipole.py
@@ -171,8 +171,7 @@ class VolumeTaylorMultipoleExpansionBase(MultipoleExpansionBase):
 
         src_coeff_exprs = list(src_coeff_exprs)
         for i, mi in enumerate(src_expansion.get_coefficient_identifiers()):
-            src_coeff_exprs[i] *= mi_factorial(mi) * \
-                sym.UnevaluatedExpr(src_rscale/tgt_rscale)**sum(mi)
+            src_coeff_exprs[i] *= sym.UnevaluatedExpr(src_rscale/tgt_rscale)**sum(mi)
 
         result = [0] * len(self.get_full_coefficient_identifiers())
 
@@ -220,21 +219,15 @@ class VolumeTaylorMultipoleExpansionBase(MultipoleExpansionBase):
                         continue
 
                     contrib = dim_coeffs_to_translate[src_index]
-
                     for idim in range(self.dim):
                         n = tgt_mi[idim]
                         k = src_mi[idim]
                         assert n >= k
-                        from sympy import binomial
-                        contrib *= (binomial(n, k)
-                                * sym.UnevaluatedExpr(dvec[idim]/tgt_rscale)**(n-k))
+                        contrib /= mi_factorial((n-k,))
+                        contrib *= sym.UnevaluatedExpr(dvec[idim]/tgt_rscale)**(n-k)
 
                     result[i] += contrib
 
-                # Defer division by target factorial until the very end
-                if d == self.dim-1:
-                    result[i] /= mi_factorial(tgt_mi)
-
             dim_coeffs_to_translate = result[:]
             mi_to_index = tgt_mi_to_index
 
@@ -308,11 +301,11 @@ class _HankelBased2DMultipoleExpansion(MultipoleExpansionBase):
         source_angle_rel_center = sym.atan2(-avec[1], -avec[0])
         return [
                 self.kernel.postprocess_at_source(
-                    bessel_j(l, arg_scale * avec_len)
-                    / rscale ** abs(l)
-                    * sym.exp(sym.I * l * -source_angle_rel_center),
+                    bessel_j(c, arg_scale * avec_len)
+                    / rscale ** abs(c)
+                    * sym.exp(sym.I * c * -source_angle_rel_center),
                     avec)
-                for l in self.get_coefficient_identifiers()]
+                for c in self.get_coefficient_identifiers()]
 
     def evaluate(self, coeffs, bvec, rscale, sac=None):
         if not self.use_rscale:
@@ -325,12 +318,12 @@ class _HankelBased2DMultipoleExpansion(MultipoleExpansionBase):
 
         arg_scale = self.get_bessel_arg_scaling()
 
-        return sum(coeffs[self.get_storage_index(l)]
+        return sum(coeffs[self.get_storage_index(c)]
                    * self.kernel.postprocess_at_target(
-                       hankel_1(l, arg_scale * bvec_len)
-                       * rscale ** abs(l)
-                       * sym.exp(sym.I * l * target_angle_rel_center), bvec)
-                for l in self.get_coefficient_identifiers())
+                       hankel_1(c, arg_scale * bvec_len)
+                       * rscale ** abs(c)
+                       * sym.exp(sym.I * c * target_angle_rel_center), bvec)
+                for c in self.get_coefficient_identifiers())
 
     def translate_from(self, src_expansion, src_coeff_exprs, src_rscale,
             dvec, tgt_rscale):
@@ -351,13 +344,13 @@ class _HankelBased2DMultipoleExpansion(MultipoleExpansionBase):
         arg_scale = self.get_bessel_arg_scaling()
 
         translated_coeffs = []
-        for l in self.get_coefficient_identifiers():
+        for j in self.get_coefficient_identifiers():
             translated_coeffs.append(
                 sum(src_coeff_exprs[src_expansion.get_storage_index(m)]
-                    * bessel_j(m - l, arg_scale * dvec_len)
+                    * bessel_j(m - j, arg_scale * dvec_len)
                     * src_rscale ** abs(m)
-                    / tgt_rscale ** abs(l)
-                    * sym.exp(sym.I * (m - l) * new_center_angle_rel_old_center)
+                    / tgt_rscale ** abs(j)
+                    * sym.exp(sym.I * (m - j) * new_center_angle_rel_old_center)
                 for m in src_expansion.get_coefficient_identifiers()))
         return translated_coeffs
 
diff --git a/sumpy/fmm.py b/sumpy/fmm.py
index a233175686f2c395fe7783235ddeb5e351b09ff0..668734fdc8a433a65b3b80b722f825b2aa2f743b 100644
--- a/sumpy/fmm.py
+++ b/sumpy/fmm.py
@@ -320,13 +320,16 @@ class SumpyExpansionWrangler(object):
         return source_array.with_queue(self.queue)[self.tree.user_source_ids]
 
     def reorder_potentials(self, potentials):
-        from pytools.obj_array import is_obj_array, with_object_array_or_scalar
-        assert is_obj_array(potentials)
+        from pytools.obj_array import obj_array_vectorize
+        import numpy as np
+        assert (
+                isinstance(potentials, np.ndarray)
+                and potentials.dtype.char == "O")
 
         def reorder(x):
             return x.with_queue(self.queue)[self.tree.sorted_target_ids]
 
-        return with_object_array_or_scalar(reorder, potentials)
+        return obj_array_vectorize(reorder, potentials)
 
     # }}}
 
diff --git a/sumpy/kernel.py b/sumpy/kernel.py
index 6383d8d57f4fc3e429333e686631c8bbcad1b6c6..53b8a5506d4945fa54fb7229040fd57ee723f6bc 100644
--- a/sumpy/kernel.py
+++ b/sumpy/kernel.py
@@ -559,14 +559,30 @@ class YukawaKernel(ExpressionKernel):
         """
         lam = var(yukawa_lambda_name)
 
-        if dim == 2:
-            r = pymbolic_real_norm_2(make_sym_vector("d", dim))
+        # NOTE: The Yukawa kernel is given by [1]
+        #   -1/(2 pi)**(n/2) * (lam/r)**(n/2-1) * K(n/2-1, lam r)
+        # where K is a modified Bessel function of the second kind.
+        #
+        # [1] https://en.wikipedia.org/wiki/Green%27s_function
+        # [2] http://dlmf.nist.gov/10.27#E8
+        # [3] https://dlmf.nist.gov/10.47#E2
+        # [4] https://dlmf.nist.gov/10.49
 
-            # http://dlmf.nist.gov/10.27#E8
+        r = pymbolic_real_norm_2(make_sym_vector("d", dim))
+        if dim == 2:
+            # NOTE: transform K(0, lam r) into a Hankel function using [2]
             expr = var("hankel_1")(0, var("I")*lam*r)
-            scaling_for_K0 = 1/2*var("pi")*var("I")  # noqa: N806
+            scaling_for_K0 = var("pi")/2*var("I")       # noqa: N806
 
             scaling = -1/(2*var("pi")) * scaling_for_K0
+        elif dim == 3:
+            # NOTE: to get the expression, we do the following and simplify
+            # 1. express K(1/2, lam r) as a modified spherical Bessel function
+            #   k(0, lam r) using [3] and use expression for k(0, lam r) from [4]
+            # 2. or use (AS 10.2.17) directly
+            expr = var("exp")(-lam*r) / r
+
+            scaling = -1/(4 * var("pi")**2)
         else:
             raise RuntimeError("unsupported dimensionality")
 
diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index e3b457dd52e885bf66462e475b3b58edb78bd098..c585d0eae7ec73de1231850b3b3be628ba9e3d4d 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -33,7 +33,8 @@ import loopy as lp
 from loopy.version import MOST_RECENT_LANGUAGE_VERSION
 from pymbolic import var
 
-from sumpy.tools import KernelComputation, KernelCacheWrapper
+from sumpy.tools import (
+        KernelComputation, KernelCacheWrapper, is_obj_array_like)
 
 
 __doc__ = """
@@ -214,12 +215,9 @@ class P2P(P2PBase):
         return loopy_knl
 
     def __call__(self, queue, targets, sources, strength, **kwargs):
-        from pytools.obj_array import is_obj_array
         knl = self.get_cached_optimized_kernel(
-                targets_is_obj_array=(
-                    is_obj_array(targets) or isinstance(targets, (tuple, list))),
-                sources_is_obj_array=(
-                    is_obj_array(sources) or isinstance(sources, (tuple, list))))
+                targets_is_obj_array=is_obj_array_like(targets),
+                sources_is_obj_array=is_obj_array_like(sources))
 
         return knl(queue, sources=sources, targets=targets, strength=strength,
                 **kwargs)
@@ -278,12 +276,9 @@ class P2PMatrixGenerator(P2PBase):
         return loopy_knl
 
     def __call__(self, queue, targets, sources, **kwargs):
-        from pytools.obj_array import is_obj_array
         knl = self.get_cached_optimized_kernel(
-                targets_is_obj_array=(
-                    is_obj_array(targets) or isinstance(targets, (tuple, list))),
-                sources_is_obj_array=(
-                    is_obj_array(sources) or isinstance(sources, (tuple, list))))
+                targets_is_obj_array=is_obj_array_like(targets),
+                sources_is_obj_array=is_obj_array_like(sources))
 
         return knl(queue, sources=sources, targets=targets, **kwargs)
 
@@ -390,12 +385,9 @@ class P2PMatrixBlockGenerator(P2PBase):
         :return: a tuple of one-dimensional arrays of kernel evaluations at
             target-source pairs described by `index_set`.
         """
-        from pytools.obj_array import is_obj_array
         knl = self.get_cached_optimized_kernel(
-                targets_is_obj_array=(
-                    is_obj_array(targets) or isinstance(targets, (tuple, list))),
-                sources_is_obj_array=(
-                    is_obj_array(sources) or isinstance(sources, (tuple, list))))
+                targets_is_obj_array=is_obj_array_like(targets),
+                sources_is_obj_array=is_obj_array_like(sources))
 
         return knl(queue,
                    targets=targets,
diff --git a/sumpy/point_calculus.py b/sumpy/point_calculus.py
index f8b4f5bdf38ec7cd9043047f07e5f2c2ce31b69c..fc164d81b226c0ed2e38f2e60c68d4cd466c8b7e 100644
--- a/sumpy/point_calculus.py
+++ b/sumpy/point_calculus.py
@@ -231,9 +231,9 @@ class CalculusPatch(object):
         from pytools.obj_array import make_obj_array
         return make_obj_array([
             sum(
-                levi_civita((l, m, n)) * self.diff(m, arg[n])
+                levi_civita((k, m, n)) * self.diff(m, arg[n])
                 for m in range(3) for n in range(3))
-            for l in range(3)])
+            for k in range(3)])
 
     def eval_at_center(self, f_values):
         """Interpolate *f_values* to the center point.
diff --git a/sumpy/qbx.py b/sumpy/qbx.py
index 9708764c01d0448d7c7e8314efb461989c838acd..dd6a827928832aadf74d6108df0489950836cc59 100644
--- a/sumpy/qbx.py
+++ b/sumpy/qbx.py
@@ -35,7 +35,8 @@ import sumpy.symbolic as sym
 from pytools import memoize_method
 from pymbolic import parse, var
 
-from sumpy.tools import KernelComputation, KernelCacheWrapper
+from sumpy.tools import (
+        KernelComputation, KernelCacheWrapper, is_obj_array_like)
 
 import logging
 logger = logging.getLogger(__name__)
@@ -154,7 +155,7 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
                 lp.GlobalArg("tgt", None,
                     shape=(self.dim, "ntargets"), order="C"),
                 lp.GlobalArg("center", None,
-                    shape=(self.dim, "ntargets"), dim_tags="sep,C"),
+                    shape=(self.dim, "ntargets"), order="C"),
                 lp.GlobalArg("expansion_radii",
                     None, shape="ntargets"),
                 lp.ValueArg("nsources", None),
@@ -164,10 +165,18 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
     def get_kernel(self):
         raise NotImplementedError
 
-    def get_optimized_kernel(self):
+    def get_optimized_kernel(self,
+            targets_is_obj_array, sources_is_obj_array, centers_is_obj_array):
         # FIXME specialize/tune for GPU/CPU
         loopy_knl = self.get_kernel()
 
+        if targets_is_obj_array:
+            loopy_knl = lp.tag_array_axes(loopy_knl, "tgt", "sep,C")
+        if sources_is_obj_array:
+            loopy_knl = lp.tag_array_axes(loopy_knl, "src", "sep,C")
+        if centers_is_obj_array:
+            loopy_knl = lp.tag_array_axes(loopy_knl, "center", "sep,C")
+
         import pyopencl as cl
         dev = self.context.devices[0]
         if dev.type & cl.device_type.CPU:
@@ -249,7 +258,10 @@ class LayerPotential(LayerPotentialBase):
             already multiplied in.
         """
 
-        knl = self.get_cached_optimized_kernel()
+        knl = self.get_cached_optimized_kernel(
+                targets_is_obj_array=is_obj_array_like(targets),
+                sources_is_obj_array=is_obj_array_like(sources),
+                centers_is_obj_array=is_obj_array_like(centers))
 
         for i, dens in enumerate(strengths):
             kwargs["strength_%d" % i] = dens
@@ -313,7 +325,10 @@ class LayerPotentialMatrixGenerator(LayerPotentialBase):
         return loopy_knl
 
     def __call__(self, queue, targets, sources, centers, expansion_radii, **kwargs):
-        knl = self.get_cached_optimized_kernel()
+        knl = self.get_cached_optimized_kernel(
+                targets_is_obj_array=is_obj_array_like(targets),
+                sources_is_obj_array=is_obj_array_like(sources),
+                centers_is_obj_array=is_obj_array_like(centers))
 
         return knl(queue, src=sources, tgt=targets, center=centers,
                 expansion_radii=expansion_radii, **kwargs)
@@ -388,9 +403,17 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
 
         return loopy_knl
 
-    def get_optimized_kernel(self):
+    def get_optimized_kernel(self,
+            targets_is_obj_array, sources_is_obj_array, centers_is_obj_array):
         loopy_knl = self.get_kernel()
 
+        if targets_is_obj_array:
+            loopy_knl = lp.tag_array_axes(loopy_knl, "tgt", "sep,C")
+        if sources_is_obj_array:
+            loopy_knl = lp.tag_array_axes(loopy_knl, "src", "sep,C")
+        if centers_is_obj_array:
+            loopy_knl = lp.tag_array_axes(loopy_knl, "center", "sep,C")
+
         loopy_knl = lp.split_iname(loopy_knl, "imat", 1024, outer_tag="g.0")
         return loopy_knl
 
@@ -406,8 +429,10 @@ class LayerPotentialMatrixBlockGenerator(LayerPotentialBase):
         :return: a tuple of one-dimensional arrays of kernel evaluations at
             target-source pairs described by `index_set`.
         """
-
-        knl = self.get_cached_optimized_kernel()
+        knl = self.get_cached_optimized_kernel(
+                targets_is_obj_array=is_obj_array_like(targets),
+                sources_is_obj_array=is_obj_array_like(sources),
+                centers_is_obj_array=is_obj_array_like(centers))
 
         return knl(queue,
                    src=sources,
diff --git a/sumpy/tools.py b/sumpy/tools.py
index 2719f43a2e128da47997f0d1a010072d9bcec1dd..1742d0441b82e88155bfaa03eba063ed62f9556e 100644
--- a/sumpy/tools.py
+++ b/sumpy/tools.py
@@ -134,18 +134,18 @@ def build_matrix(op, dtype=None, shape=None):
 
 
 def vector_to_device(queue, vec):
-    from pytools.obj_array import with_object_array_or_scalar
+    from pytools.obj_array import obj_array_vectorize
 
     from pyopencl.array import to_device
 
     def to_dev(ary):
         return to_device(queue, ary)
 
-    return with_object_array_or_scalar(to_dev, vec)
+    return obj_array_vectorize(to_dev, vec)
 
 
 def vector_from_device(queue, vec):
-    from pytools.obj_array import with_object_array_or_scalar
+    from pytools.obj_array import obj_array_vectorize
 
     def from_dev(ary):
         from numbers import Number
@@ -155,7 +155,7 @@ def vector_from_device(queue, vec):
 
         return ary.get(queue=queue)
 
-    return with_object_array_or_scalar(from_dev, vec)
+    return obj_array_vectorize(from_dev, vec)
 
 
 def _merge_kernel_arguments(dictionary, arg):
@@ -760,4 +760,11 @@ def solve_symbolic(A, b):  # noqa: N803
     red = reduced_row_echelon_form(big)[0]
     return red[:, big.shape[0]:]
 
+
+def is_obj_array_like(ary):
+    return (
+            isinstance(ary, (tuple, list))
+            or (isinstance(ary, np.ndarray) and ary.dtype.char == "O"))
+
+
 # vim: fdm=marker
diff --git a/sumpy/version.py b/sumpy/version.py
index f42335928e7fcb0cf6f93e8461714fb436691f8d..ced3319c9da32decaffd911398df47c239e034b9 100644
--- a/sumpy/version.py
+++ b/sumpy/version.py
@@ -43,7 +43,7 @@ else:
 # }}}
 
 
-VERSION = (2016, 1)
+VERSION = (2020, 2)
 VERSION_STATUS = "beta1"
 VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
 
diff --git a/sumpy/visualization.py b/sumpy/visualization.py
index bf5f016b4c089be7723a2ff92c91530c998603c2..d1eee517a9d99b1544f8e30c566e91debe9a72d8 100644
--- a/sumpy/visualization.py
+++ b/sumpy/visualization.py
@@ -33,34 +33,22 @@ from six.moves import range
 
 
 def separate_by_real_and_imag(data, real_only):
+    from pytools.obj_array import obj_array_real_copy, obj_array_imag_copy
+
     for name, field in data:
-        from pytools.obj_array import log_shape
-        ls = log_shape(field)
-
-        if ls != () and ls[0] > 1:
-            assert len(ls) == 1
-            from pytools.obj_array import (
-                    oarray_real_copy, oarray_imag_copy,
-                    with_object_array_or_scalar)
-
-            if field[0].dtype.kind == "c":
-                if real_only:
-                    yield (name,
-                            with_object_array_or_scalar(oarray_real_copy, field))
-                else:
-                    yield (name+"_r",
-                            with_object_array_or_scalar(oarray_real_copy, field))
-                    yield (name+"_i",
-                            with_object_array_or_scalar(oarray_imag_copy, field))
-            else:
-                yield (name, field)
+        try:
+            # Look inside object arrays to get the entry dtype.
+            entry_dtype = field[0].dtype
+        except AttributeError:
+            entry_dtype = field.dtype
+
+        assert entry_dtype.kind != "O"
+
+        if real_only or entry_dtype.kind != "c":
+            yield (name, obj_array_real_copy(field))
         else:
-            # ls == ()
-            if field.dtype.kind == "c":
-                yield (name+"_r", field.real.copy())
-                yield (name+"_i", field.imag.copy())
-            else:
-                yield (name, field)
+            yield (name + "_r", obj_array_real_copy(field))
+            yield (name + "_i", obj_array_imag_copy(field))
 
 
 def make_field_plotter_from_bbox(bbox, h, extend_factor=0):
diff --git a/test/test_cse.py b/test/test_cse.py
index 703e8c9f93ca2069c273ead893a00ed0ba3dc697..2f54f34bab36e9be4047c05f1a2c663523a83509 100644
--- a/test/test_cse.py
+++ b/test/test_cse.py
@@ -253,12 +253,12 @@ def test_issue_4499():
     G = Function('G')  # noqa
     t = Tuple(
         *(a, a + S(1)/2, 2*a, b, 2*a - b + 1, (sqrt(z)/2)**(-2*a + 1)*B(2*a
-        - b, sqrt(z))*B(b - 1, sqrt(z))*G(b)*G(2*a - b + 1),
-        sqrt(z)*(sqrt(z)/2)**(-2*a + 1)*B(b, sqrt(z))*B(2*a - b,
+        - b, sqrt(z))*B(b - 1, sqrt(z))*G(b)*G(2*a - b + 1),  # noqa
+        sqrt(z)*(sqrt(z)/2)**(-2*a + 1)*B(b, sqrt(z))*B(2*a - b,  # noqa
         sqrt(z))*G(b)*G(2*a - b + 1), sqrt(z)*(sqrt(z)/2)**(-2*a + 1)*B(b - 1,
         sqrt(z))*B(2*a - b + 1, sqrt(z))*G(b)*G(2*a - b + 1),
-        (sqrt(z)/2)**(-2*a + 1)*B(b, sqrt(z))*B(2*a - b + 1,
-        sqrt(z))*G(b)*G(2*a - b + 1), 1, 0, S(1)/2, z/2, -b + 1, -2*a + b,
+        (sqrt(z)/2)**(-2*a + 1)*B(b, sqrt(z))*B(2*a - b + 1,  # noqa
+        sqrt(z))*G(b)*G(2*a - b + 1), 1, 0, S(1)/2, z/2, -b + 1, -2*a + b, # noqa
         -2*a))  # noqa
     c = cse(t)
     assert len(c[0]) == 11
diff --git a/test/test_fmm.py b/test/test_fmm.py
index 2bc6d69d6bcafedf338cc78aea0591d6d5ac3845..b429456628e35d997cb0cbe8f4eec4588e1b66fd 100644
--- a/test/test_fmm.py
+++ b/test/test_fmm.py
@@ -72,10 +72,10 @@ else:
                          HelmholtzConformingVolumeTaylorMultipoleExpansion),
     (YukawaKernel(2), Y2DLocalExpansion, Y2DMultipoleExpansion),
     ])
-def test_sumpy_fmm(ctx_getter, knl, local_expn_class, mpole_expn_class):
+def test_sumpy_fmm(ctx_factory, knl, local_expn_class, mpole_expn_class):
     logging.basicConfig(level=logging.INFO)
 
-    ctx = ctx_getter()
+    ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
     nsources = 1000
@@ -201,10 +201,10 @@ def test_sumpy_fmm(ctx_getter, knl, local_expn_class, mpole_expn_class):
     pconv_verifier()
 
 
-def test_sumpy_fmm_timing_data_collection(ctx_getter):
+def test_sumpy_fmm_timing_data_collection(ctx_factory):
     logging.basicConfig(level=logging.INFO)
 
-    ctx = ctx_getter()
+    ctx = ctx_factory()
     queue = cl.CommandQueue(
             ctx,
             properties=cl.command_queue_properties.PROFILING_ENABLE)
@@ -257,10 +257,10 @@ def test_sumpy_fmm_timing_data_collection(ctx_getter):
     assert timing_data
 
 
-def test_sumpy_fmm_exclude_self(ctx_getter):
+def test_sumpy_fmm_exclude_self(ctx_factory):
     logging.basicConfig(level=logging.INFO)
 
-    ctx = ctx_getter()
+    ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
     nsources = 500
diff --git a/test/test_kernels.py b/test/test_kernels.py
index 8225723cdd4f7df2afb7c487a578cc8c9ee81d28..2af6a86a06cf303723380e25e383e1c87fb13511 100644
--- a/test/test_kernels.py
+++ b/test/test_kernels.py
@@ -58,8 +58,8 @@ else:
 
 
 @pytest.mark.parametrize("exclude_self", (True, False))
-def test_p2p(ctx_getter, exclude_self):
-    ctx = ctx_getter()
+def test_p2p(ctx_factory, exclude_self):
+    ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
     dimensions = 3
@@ -140,13 +140,13 @@ def test_p2p(ctx_getter, exclude_self):
     True
     ])
 # Sample: test_p2e2p(cl._csc, LaplaceKernel(2), VolumeTaylorLocalExpansion, 4, False)
-def test_p2e2p(ctx_getter, base_knl, expn_class, order, with_source_derivative):
+def test_p2e2p(ctx_factory, base_knl, expn_class, order, with_source_derivative):
     #logging.basicConfig(level=logging.INFO)
 
     from sympy.core.cache import clear_cache
     clear_cache()
 
-    ctx = ctx_getter()
+    ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
     np.random.seed(17)
@@ -359,13 +359,13 @@ def test_p2e2p(ctx_getter, base_knl, expn_class, order, with_source_derivative):
     (StokesletKernel(2, 0, 0), BiharmonicConformingVolumeTaylorLocalExpansion,
      BiharmonicConformingVolumeTaylorMultipoleExpansion),
     ])
-def test_translations(ctx_getter, knl, local_expn_class, mpole_expn_class):
+def test_translations(ctx_factory, knl, local_expn_class, mpole_expn_class):
     logging.basicConfig(level=logging.INFO)
 
     from sympy.core.cache import clear_cache
     clear_cache()
 
-    ctx = ctx_getter()
+    ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
     np.random.seed(17)
diff --git a/test/test_matrixgen.py b/test/test_matrixgen.py
index 2c07b40a5c753e6c362be61263f0bf388784fc4a..463e7be6221a603ba3684b1c764d8905f35ca7e7 100644
--- a/test/test_matrixgen.py
+++ b/test/test_matrixgen.py
@@ -98,10 +98,10 @@ def _build_block_index(queue, nnodes, nblks, factor):
 
 @pytest.mark.parametrize('factor', [1.0, 0.6])
 @pytest.mark.parametrize('lpot_id', [1, 2])
-def test_qbx_direct(ctx_getter, factor, lpot_id):
+def test_qbx_direct(ctx_factory, factor, lpot_id):
     logging.basicConfig(level=logging.INFO)
 
-    ctx = ctx_getter()
+    ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
     ndim = 2
@@ -143,8 +143,9 @@ def test_qbx_direct(ctx_getter, factor, lpot_id):
 
         extra_kwargs = {}
         if lpot_id == 2:
+            from pytools.obj_array import make_obj_array
             extra_kwargs["dsource_vec"] = \
-                    vector_to_device(queue, np.ones((ndim, n)))
+                    vector_to_device(queue, make_obj_array(np.ones((ndim, n))))
 
         _, (result_lpot,) = lpot(queue,
                 targets=targets,
@@ -181,10 +182,10 @@ def test_qbx_direct(ctx_getter, factor, lpot_id):
 @pytest.mark.parametrize("exclude_self", [True, False])
 @pytest.mark.parametrize("factor", [1.0, 0.6])
 @pytest.mark.parametrize('lpot_id', [1, 2])
-def test_p2p_direct(ctx_getter, exclude_self, factor, lpot_id):
+def test_p2p_direct(ctx_factory, exclude_self, factor, lpot_id):
     logging.basicConfig(level=logging.INFO)
 
-    ctx = ctx_getter()
+    ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
     ndim = 2
@@ -225,8 +226,9 @@ def test_p2p_direct(ctx_getter, exclude_self, factor, lpot_id):
             extra_kwargs["target_to_source"] = \
                 cl.array.arange(queue, 0, n, dtype=np.int)
         if lpot_id == 2:
+            from pytools.obj_array import make_obj_array
             extra_kwargs["dsource_vec"] = \
-                    vector_to_device(queue, np.ones((ndim, n)))
+                    vector_to_device(queue, make_obj_array(np.ones((ndim, n))))
 
         _, (result_lpot,) = lpot(queue,
                 targets=targets,
diff --git a/test/test_misc.py b/test/test_misc.py
index a25b2e800ab6f9942395821cf0cb4345e80b6ccf..d8aff52f0e7918b65ef5572eae577e4a77983a5c 100644
--- a/test/test_misc.py
+++ b/test/test_misc.py
@@ -68,6 +68,7 @@ class YukawaKernelInfo:
     BiharmonicKernelInfo(2),
     BiharmonicKernelInfo(3),
     YukawaKernelInfo(2, 5),
+    YukawaKernelInfo(3, 5),
     ])
 def test_pde_check_kernels(ctx_factory, knl_info, order=5):
     dim = knl_info.kernel.dim
@@ -242,7 +243,7 @@ RTOL_P2E2E2P = 1e-2
 
 
 @pytest.mark.parametrize("case", P2E2E2P_TEST_CASES)
-def test_toy_p2e2e2p(ctx_getter, case):
+def test_toy_p2e2e2p(ctx_factory, case):
     dim = case.dim
 
     src = case.source.reshape(dim, -1)
@@ -255,7 +256,7 @@ def test_toy_p2e2e2p(ctx_getter, case):
     from sumpy.expansion.local import VolumeTaylorLocalExpansion
     from sumpy.expansion.multipole import VolumeTaylorMultipoleExpansion
 
-    cl_ctx = ctx_getter()
+    cl_ctx = ctx_factory()
     ctx = t.ToyContext(cl_ctx,
              LaplaceKernel(dim),
              VolumeTaylorMultipoleExpansion,
diff --git a/test/test_qbx.py b/test/test_qbx.py
index d408c4a4b82b907a6af801227f2c0e3ffbbb2f5a..8b73224c0ec23dc8f9249e2baad5fd92afb2d638 100644
--- a/test/test_qbx.py
+++ b/test/test_qbx.py
@@ -41,11 +41,11 @@ else:
     faulthandler.enable()
 
 
-def test_direct(ctx_getter):
+def test_direct(ctx_factory):
     # This evaluates a single layer potential on a circle.
     logging.basicConfig(level=logging.INFO)
 
-    ctx = ctx_getter()
+    ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
 
     from sumpy.kernel import LaplaceKernel