diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 6ee07ae45d90f7fa66cf6d8ca7e219d5cde2017b..63768430126b56370b840737f682d2107c6c55fb 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -66,6 +66,18 @@ Python 3.6 POCL:
   except:
   - tags
 
+Python 3.5 Conda:
+  script:
+  - export SUMPY_NO_CACHE=1
+  - CONDA_ENVIRONMENT=.test-conda-env-py3.yml
+  - REQUIREMENTS_TXT=.test-conda-env-py3-requirements.txt
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/master/build-and-test-py-project-within-miniconda.sh
+  - ". ./build-and-test-py-project-within-miniconda.sh"
+  tags:
+  - linux
+  except:
+  - tags
+
 Documentation:
   script:
   - EXTRA_INSTALL="numpy mako"
diff --git a/.test-conda-env-py3-requirements.txt b/.test-conda-env-py3-requirements.txt
new file mode 100644
index 0000000000000000000000000000000000000000..01f08662149acd51177b1f4c3a96d873555a5a9e
--- /dev/null
+++ b/.test-conda-env-py3-requirements.txt
@@ -0,0 +1,3 @@
+git+https://github.com/inducer/boxtree
+git+https://github.com/inducer/pymbolic
+git+https://github.com/inducer/loopy
diff --git a/.test-conda-env-py3.yml b/.test-conda-env-py3.yml
new file mode 100644
index 0000000000000000000000000000000000000000..f336c9e94a3638652f794f6771e91ddc958e4354
--- /dev/null
+++ b/.test-conda-env-py3.yml
@@ -0,0 +1,16 @@
+name: test-conda-env-py3
+channels:
+- inducer
+- symengine/label/dev
+- conda-forge
+- defaults
+dependencies:
+- git
+- conda-forge::numpy
+- conda-forge::sympy
+- pocl
+- islpy
+- pyopencl
+- python=3.5
+- python-symengine=0.2.0.53.g83912b7=py35_1
+# things not in here: loopy boxtree pymbolic pyfmmlib
diff --git a/doc/misc.rst b/doc/misc.rst
index c6ea75c30ea68e35999e273889af4d6f0ccc4178..c77c7c2252b5d92978c1f8ab33d28a7cabea4913 100644
--- a/doc/misc.rst
+++ b/doc/misc.rst
@@ -19,6 +19,19 @@ and say::
 
 In addition, you need to have :mod:`numpy` installed.
 
+Symbolic backends
+=================
+
+:mod:`sumpy` supports two symbolic backends: sympy and SymEngine. To use the
+SymEngine backend, ensure that the `SymEngine library
+<https://github.com/symengine/symengine>`_ and the `SymEngine Python bindings
+<https://github.com/symengine/symengine.py>`_ are installed.
+
+By default, :mod:`sumpy` prefers using SymEngine but falls back to sympy if it
+detects that SymEngine is not installed. To force the use of a particular
+backend, set the environment variable `SUMPY_FORCE_SYMBOLIC_BACKEND` to
+`symengine` or `sympy`.
+
 User-visible Changes
 ====================
 
diff --git a/requirements.txt b/requirements.txt
index abe39e8935e461184ed65898c568e8697162594b..6e23ae957e1531c3a3dba6762f93010b69f94b07 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -1,5 +1,6 @@
 numpy
 sympy==1.0
+git+https://github.com/inducer/pymbolic
 git+https://github.com/inducer/islpy
 git+https://github.com/pyopencl/pyopencl
 git+https://github.com/inducer/boxtree
diff --git a/sumpy/assignment_collection.py b/sumpy/assignment_collection.py
index c1e9197b516de9081ad230b021c4c02a4597670c..a12a17fd84b7158628c120a4626464fd288940bb 100644
--- a/sumpy/assignment_collection.py
+++ b/sumpy/assignment_collection.py
@@ -26,7 +26,7 @@ THE SOFTWARE.
 """
 
 
-import sympy as sp
+import sumpy.symbolic as sym
 
 import logging
 logger = logging.getLogger(__name__)
@@ -70,7 +70,7 @@ class _SymbolGenerator(object):
 
         self.base_to_count[base] = count + 1
 
-        return sp.Symbol(id_str)
+        return sym.Symbol(id_str)
 
     def __iter__(self):
         return self
@@ -132,7 +132,7 @@ class SymbolicAssignmentCollection(object):
 
         result = set()
         for dep in self.assignments[var_name].atoms():
-            if not isinstance(dep, sp.Symbol):
+            if not isinstance(dep, sym.Symbol):
                 continue
 
             dep_name = dep.name
@@ -153,7 +153,7 @@ class SymbolicAssignmentCollection(object):
         if root_name is None:
             root_name = name
 
-        self.assignments[name] = sp.sympify(expr)
+        self.assignments[name] = sym.sympify(expr)
 
     def assign_unique(self, name_base, expr):
         """Assign *expr* to a new variable whose name is based on *name_base*.
@@ -177,7 +177,7 @@ class SymbolicAssignmentCollection(object):
         # Options here:
         # - checked_cse: if you mistrust the result of the cse.
         #   Uses maxima to verify.
-        # - sp.cse: The sympy thing.
+        # - sym.cse: The sympy thing.
         # - sumpy.cse.cse: Based on sympy, designed to go faster.
         #from sumpy.symbolic import checked_cse
 
@@ -192,41 +192,13 @@ class SymbolicAssignmentCollection(object):
             self.assignments[name] = new_expr
 
         for name, value in new_assignments:
-            assert isinstance(name, sp.Symbol)
+            assert isinstance(name, sym.Symbol)
             self.add_assignment(name.name, value)
 
         logger.info("common subexpression elimination: done after {dur:.2f} s"
                     .format(dur=time.time() - start_time))
         return new_extra_exprs
 
-    def kill_trivial_assignments(self, exprs):
-        logger.info("kill trivial assignments: start")
-
-        approved_assignments = []
-        rejected_assignments = []
-
-        from sumpy.symbolic import is_assignment_nontrivial
-        for name, value in six.iteritems(self.assignments):
-            if name in self.user_symbols or is_assignment_nontrivial(name, value):
-                approved_assignments.append((name, value))
-            else:
-                rejected_assignments.append((name, value))
-
-        # un-substitute rejected assignments
-        from sumpy.symbolic import make_one_step_subst
-        unsubst_rej = make_one_step_subst(rejected_assignments)
-
-        new_assignments = dict(
-                (name, expr.subs(unsubst_rej))
-                for name, expr in approved_assignments)
-
-        exprs = [expr.subs(unsubst_rej) for expr in exprs]
-        self.assignments = new_assignments
-
-        logger.info("kill trivial assignments: done")
-
-        return exprs
-
 # }}}
 
 # vim: fdm=marker
diff --git a/sumpy/codegen.py b/sumpy/codegen.py
index 0cb14028261ca7d8a977c26f340c5606f6fd26e3..99e4f47d4c241efcf13fbfd4f180f37a1c914782 100644
--- a/sumpy/codegen.py
+++ b/sumpy/codegen.py
@@ -29,6 +29,7 @@ import pyopencl as cl
 import pyopencl.tools  # noqa
 import loopy as lp
 
+import six
 import re
 
 from pymbolic.mapper import IdentityMapper, WalkMapper, CSECachingMapperMixin
@@ -38,8 +39,7 @@ from loopy.types import NumpyType
 
 from pytools import memoize_method
 
-from pymbolic.interop.sympy import (
-        SympyToPymbolicMapper as SympyToPymbolicMapperBase)
+from sumpy.symbolic import (SympyToPymbolicMapper as SympyToPymbolicMapperBase)
 
 import logging
 logger = logging.getLogger(__name__)
@@ -58,25 +58,138 @@ Conversion of :mod:`sympy` expressions to :mod:`loopy`
 
 # {{{ sympy -> pymbolic mapper
 
+import sumpy.symbolic as sym
+_SPECIAL_FUNCTION_NAMES = frozenset(dir(sym.functions))
+
+
 class SympyToPymbolicMapper(SympyToPymbolicMapperBase):
-    def __init__(self, assignments):
-        self.assignments = assignments
-        self.derivative_cse_names = set()
 
-    def map_Derivative(self, expr):  # noqa
-        # Sympy has picked up the habit of picking arguments out of derivatives
-        # and pronounce them common subexpressions. Me no like. Undo it, so
-        # that the bessel substitutor further down can do its job.
+    def not_supported(self, expr):
+        if isinstance(expr, int):
+            return expr
+        elif getattr(expr, "is_Function", False):
+            func_name = SympyToPymbolicMapperBase.function_name(self, expr)
+            # SymEngine capitalizes the names of the special functions.
+            if func_name.lower() in _SPECIAL_FUNCTION_NAMES:
+                func_name = func_name.lower()
+            return prim.Variable(func_name)(
+                    *tuple(self.rec(arg) for arg in expr.args))
+        else:
+            return SympyToPymbolicMapperBase.not_supported(self, expr)
+
+# }}}
+
+
+# {{{ trivial assignment elimination
+
+def make_one_step_subst(assignments):
+    assignments = dict(assignments)
+    unwanted_vars = set(six.iterkeys(assignments))
+
+    # Ensure no re-assignments.
+    assert len(unwanted_vars) == len(assignments)
+
+    from loopy.symbolic import get_dependencies
+    unwanted_deps = dict(
+        (name, get_dependencies(value) & unwanted_vars)
+        for name, value in six.iteritems(assignments))
+
+    # {{{ compute substitution order
+
+    toposort = []
+    visited = set()
+    visiting = set()
+
+    while unwanted_vars:
+        stack = [unwanted_vars.pop()]
+
+        while stack:
+            top = stack[-1]
+
+            if top in visiting:
+                visiting.remove(top)
+                toposort.append(top)
+
+            if top in visited:
+                stack.pop()
+                continue
+
+            visited.add(top)
+            visiting.add(top)
+
+            for dep in unwanted_deps[top]:
+                # Check for no cycles.
+                assert dep not in visiting
+                stack.append(dep)
 
-        if expr.expr.is_Symbol:
-            # These will get removed, because loopy wont' be able to deal
-            # with them--they contain undefined placeholder symbols.
-            self.derivative_cse_names.add(expr.expr.name)
+    # }}}
 
-        return prim.Derivative(self.rec(
-            expr.expr.subs(self.assignments)),
-            tuple(v.name for v in expr.variables))
+    # {{{ make substitution
 
+    from pymbolic import substitute
+
+    result = {}
+    used_name_to_var = {}
+    from pymbolic import evaluate
+    from functools import partial
+    simplify = partial(evaluate, context=used_name_to_var)
+
+    for name in toposort:
+        value = assignments[name]
+        value = substitute(value, result)
+        used_name_to_var.update(
+            (used_name, prim.Variable(used_name))
+            for used_name in get_dependencies(value)
+            if used_name not in used_name_to_var)
+
+        result[name] = simplify(value)
+
+    # }}}
+
+    return result
+
+
+def is_assignment_nontrivial(name, value):
+    if prim.is_constant(value):
+        return False
+    elif isinstance(value, prim.Variable):
+        return False
+    elif (isinstance(value, prim.Product)
+            and len(value.children) == 2
+            and sum(1 for arg in value.children if prim.is_constant(arg)) == 1
+            and sum(1 for arg in value.children
+                    if isinstance(arg, prim.Variable)) == 1):
+        # const*var: not good enough
+        return False
+
+    return True
+
+
+def kill_trivial_assignments(assignments, retain_names=set()):
+    logger.info("kill trivial assignments (plain): start")
+    approved_assignments = []
+    rejected_assignments = []
+
+    for name, value in assignments:
+        if name in retain_names or is_assignment_nontrivial(name, value):
+            approved_assignments.append((name, value))
+        else:
+            rejected_assignments.append((name, value))
+
+    # un-substitute rejected assignments
+    unsubst_rej = make_one_step_subst(rejected_assignments)
+
+    result = []
+    from pymbolic import substitute
+    for name, expr in approved_assignments:
+        r = substitute(expr, unsubst_rej)
+        result.append((name, r))
+
+    logger.info(
+        "kill trivial assignments (plain): done, {nrej} assignments killed"
+        .format(nrej=len(rejected_assignments)))
+
+    return result
 
 # }}}
 
@@ -318,7 +431,7 @@ class BesselDerivativeReplacer(CSECachingMapperMixin, IdentityMapper):
             arg, = expr.values
 
             n_derivs = len(expr.child.variables)
-            import sympy as sp
+            import sympy as sym
 
             # AS (9.1.31)
             # http://dlmf.nist.gov/10.6.7
@@ -329,7 +442,7 @@ class BesselDerivativeReplacer(CSECachingMapperMixin, IdentityMapper):
             k = n_derivs
             return prim.CommonSubexpression(
                     2**(-k)*sum(
-                        (-1)**idx*int(sp.binomial(k, idx)) * function(i, arg)
+                        (-1)**idx*int(sym.binomial(k, idx)) * function(i, arg)
                         for idx, i in enumerate(range(order-k, order+k+1, 2))),
                     "d%d_%s_%s" % (n_derivs, function.name, order_str))
         else:
@@ -538,17 +651,15 @@ class MathConstantRewriter(CSECachingMapperMixin, IdentityMapper):
 
 
 def to_loopy_insns(assignments, vector_names=set(), pymbolic_expr_maps=[],
-        complex_dtype=None):
+                   complex_dtype=None, retain_names=set()):
     logger.info("loopy instruction generation: start")
     assignments = list(assignments)
 
     # convert from sympy
-    sympy_conv = SympyToPymbolicMapper(assignments)
+    sympy_conv = SympyToPymbolicMapper()
     assignments = [(name, sympy_conv(expr)) for name, expr in assignments]
-    assignments = [
-            (name, expr) for name, expr in assignments
-            if name not in sympy_conv.derivative_cse_names
-            ]
+
+    assignments = kill_trivial_assignments(assignments, retain_names)
 
     bdr = BesselDerivativeReplacer()
     assignments = [(name, bdr(expr)) for name, expr in assignments]
diff --git a/sumpy/cse.py b/sumpy/cse.py
index ad44fc41c8516dfc90587ce61cab6a3e5070e6ea..573c78c93961c8fe0e69c9650c4df7d10133e98b 100644
--- a/sumpy/cse.py
+++ b/sumpy/cse.py
@@ -66,8 +66,8 @@ DAMAGE.
 
 # }}}
 
-from sympy.core import Basic, Mul, Add, Pow
-from sympy.core.function import _coeff_isneg
+from sumpy.symbolic import (
+    Basic, Mul, Add, Pow, Symbol, _coeff_isneg, Derivative, Subs)
 from sympy.core.compatibility import iterable
 from sympy.utilities.iterables import numbered_symbols
 
@@ -82,6 +82,10 @@ Common subexpression elimination
 """
 
 
+# Don't CSE child nodes of these classes.
+CSE_NO_DESCEND_CLASSES = (Derivative, Subs)
+
+
 # {{{ cse pre/postprocessing
 
 def preprocess_for_cse(expr, optimizations):
@@ -248,6 +252,19 @@ class FuncArgTracker(object):
         self.func_to_argset[func_i].update(new_args)
 
 
+class Unevaluated(object):
+
+    def __init__(self, func, args):
+        self.func = func
+        self.args = args
+
+    def __str__(self):
+        return "Uneval<{}>({})".format(
+                self.func, ", ".join(str(a) for a in self.args))
+
+    __repr__ = __str__
+
+
 def match_common_args(func_class, funcs, opt_subs):
     """
     Recognize and extract common subexpressions of function arguments within a
@@ -305,9 +322,8 @@ def match_common_args(func_class, funcs, opt_subs):
             diff_i = arg_tracker.func_to_argset[i].difference(com_args)
             if diff_i:
                 # com_func needs to be unevaluated to allow for recursive matches.
-                com_func = func_class(
-                        *arg_tracker.get_args_in_value_order(com_args),
-                        evaluate=False)
+                com_func = Unevaluated(
+                        func_class, arg_tracker.get_args_in_value_order(com_args))
                 com_func_number = arg_tracker.get_or_add_value_number(com_func)
                 arg_tracker.update_func_argset(i, diff_i | set([com_func_number]))
                 changed.add(i)
@@ -334,9 +350,8 @@ def match_common_args(func_class, funcs, opt_subs):
                 changed.add(k)
 
         if i in changed:
-            opt_subs[funcs[i]] = func_class(
-                *arg_tracker.get_args_in_value_order(arg_tracker.func_to_argset[i]),
-                evaluate=False)
+            opt_subs[funcs[i]] = Unevaluated(func_class,
+                arg_tracker.get_args_in_value_order(arg_tracker.func_to_argset[i]))
 
         arg_tracker.stop_arg_tracking(i)
 
@@ -366,6 +381,9 @@ def opt_cse(exprs):
         if expr.is_Atom:
             return
 
+        if isinstance(expr, CSE_NO_DESCEND_CLASSES):
+            return
+
         if iterable(expr):
             for item in expr:
                 find_opts(item)
@@ -382,7 +400,7 @@ def opt_cse(exprs):
         if _coeff_isneg(expr):
             neg_expr = -expr
             if not neg_expr.is_Atom:
-                opt_subs[expr] = Mul(-1, neg_expr, evaluate=False)
+                opt_subs[expr] = Unevaluated(Mul, (-1, neg_expr))
                 seen_subexp.add(neg_expr)
                 expr = neg_expr
 
@@ -395,7 +413,7 @@ def opt_cse(exprs):
         elif isinstance(expr, Pow):
             base, exp = expr.args
             if _coeff_isneg(exp):
-                opt_subs[expr] = Pow(Pow(base, -exp), -1, evaluate=False)
+                opt_subs[expr] = Unevaluated(Pow, (Pow(base, -exp), -1))
 
     # }}}
 
@@ -436,10 +454,10 @@ def tree_cse(exprs, symbols, opt_subs=None):
     excluded_symbols = set()
 
     def find_repeated(expr):
-        if not isinstance(expr, Basic):
+        if not isinstance(expr, (Basic, Unevaluated)):
             return
 
-        if expr.is_Atom:
+        if isinstance(expr, Basic) and expr.is_Atom:
             if expr.is_Symbol:
                 excluded_symbols.add(expr)
             return
@@ -457,7 +475,10 @@ def tree_cse(exprs, symbols, opt_subs=None):
             if expr in opt_subs:
                 expr = opt_subs[expr]
 
-            args = expr.args
+            if isinstance(expr, CSE_NO_DESCEND_CLASSES):
+                args = ()
+            else:
+                args = expr.args
 
         for arg in args:
             find_repeated(arg)
@@ -478,7 +499,7 @@ def tree_cse(exprs, symbols, opt_subs=None):
     subs = dict()
 
     def rebuild(expr):
-        if not isinstance(expr, Basic):
+        if not isinstance(expr, (Basic, Unevaluated)):
             return expr
 
         if not expr.args:
@@ -495,11 +516,11 @@ def tree_cse(exprs, symbols, opt_subs=None):
         if expr in opt_subs:
             expr = opt_subs[expr]
 
-        new_args = [rebuild(arg) for arg in expr.args]
-        if new_args != expr.args:
-            new_expr = expr.func(*new_args)
-        else:
-            new_expr = expr
+        new_expr = expr
+        if not isinstance(expr, CSE_NO_DESCEND_CLASSES):
+            new_args = tuple(rebuild(arg) for arg in expr.args)
+            if isinstance(expr, Unevaluated) or new_args != expr.args:
+                new_expr = expr.func(*new_args)
 
         if orig_expr in to_eliminate:
             try:
@@ -560,7 +581,7 @@ def cse(exprs, symbols=None, optimizations=None):
     reduced_exprs = [preprocess_for_cse(e, optimizations) for e in exprs]
 
     if symbols is None:
-        symbols = numbered_symbols()
+        symbols = numbered_symbols(cls=Symbol)
     else:
         # In case we get passed an iterable with an __iter__ method instead of
         # an actual iterator.
diff --git a/sumpy/e2e.py b/sumpy/e2e.py
index f5b349401fee21112ae5e72b28d1d1ec8183046a..66e5e97567d8675834c54f7291dba7d49bd20b19 100644
--- a/sumpy/e2e.py
+++ b/sumpy/e2e.py
@@ -27,7 +27,7 @@ from six.moves import range
 
 import numpy as np
 import loopy as lp
-import sympy as sp
+import sumpy.symbolic as sym
 from sumpy.tools import KernelCacheWrapper
 
 import logging
@@ -77,10 +77,10 @@ class E2EBase(KernelCacheWrapper):
         self.dim = src_expansion.dim
 
     def get_translation_loopy_insns(self):
-        from sumpy.symbolic import make_sympy_vector
-        dvec = make_sympy_vector("d", self.dim)
+        from sumpy.symbolic import make_sym_vector
+        dvec = make_sym_vector("d", self.dim)
 
-        src_coeff_exprs = [sp.Symbol("src_coeff%d" % i)
+        src_coeff_exprs = [sym.Symbol("src_coeff%d" % i)
                 for i in range(len(self.src_expansion))]
 
         from sumpy.assignment_collection import SymbolicAssignmentCollection
@@ -93,17 +93,12 @@ class E2EBase(KernelCacheWrapper):
 
         sac.run_global_cse()
 
-        from sumpy.symbolic import kill_trivial_assignments
-        assignments = kill_trivial_assignments([
-                (name, expr)
-                for name, expr in six.iteritems(sac.assignments)],
-                retain_names=tgt_coeff_names)
-
         from sumpy.codegen import to_loopy_insns
         return to_loopy_insns(
-                assignments,
+                six.iteritems(sac.assignments),
                 vector_names=set(["d"]),
                 pymbolic_expr_maps=[self.tgt_expansion.get_code_transformer()],
+                retain_names=tgt_coeff_names,
                 complex_dtype=np.complex128  # FIXME
                 )
 
diff --git a/sumpy/e2p.py b/sumpy/e2p.py
index b2e0e58569ad91cc5416f529bc766b9ee7950859..6a77a5cc416af3c01df9786bac5fd236927e787c 100644
--- a/sumpy/e2p.py
+++ b/sumpy/e2p.py
@@ -27,7 +27,7 @@ from six.moves import range
 
 import numpy as np
 import loopy as lp
-import sympy as sp
+import sumpy.symbolic as sym
 from sumpy.tools import KernelCacheWrapper
 
 
@@ -74,15 +74,16 @@ class E2PBase(KernelCacheWrapper):
             assert tdr(knl) == expansion.kernel
 
     def get_loopy_insns_and_result_names(self):
-        from sumpy.symbolic import make_sympy_vector
-        bvec = make_sympy_vector("b", self.dim)
+        from sumpy.symbolic import make_sym_vector
+        bvec = make_sym_vector("b", self.dim)
 
         from sumpy.assignment_collection import SymbolicAssignmentCollection
         sac = SymbolicAssignmentCollection()
 
-        coeff_exprs = [sp.Symbol("coeff%d" % i)
+        coeff_exprs = [sym.Symbol("coeff%d" % i)
                 for i in range(len(self.expansion.get_coefficient_identifiers()))]
         value = self.expansion.evaluate(coeff_exprs, bvec)
+
         result_names = [
             sac.assign_unique("result_%d_p" % i,
                 knl.postprocess_at_target(value, bvec))
@@ -91,23 +92,19 @@ class E2PBase(KernelCacheWrapper):
 
         sac.run_global_cse()
 
-        from sumpy.symbolic import kill_trivial_assignments
-        assignments = kill_trivial_assignments([
-                (name, expr)
-                for name, expr in six.iteritems(sac.assignments)],
-                retain_names=result_names)
-
         from sumpy.codegen import to_loopy_insns
-        loopy_insns = to_loopy_insns(assignments,
+        loopy_insns = to_loopy_insns(
+                six.iteritems(sac.assignments),
                 vector_names=set(["b"]),
                 pymbolic_expr_maps=[self.expansion.get_code_transformer()],
+                retain_names=result_names,
                 complex_dtype=np.complex128  # FIXME
                 )
 
         return loopy_insns, result_names
 
     def get_kernel_scaling_assignment(self):
-        from pymbolic.interop.sympy import SympyToPymbolicMapper
+        from sumpy.symbolic import SympyToPymbolicMapper
         sympy_conv = SympyToPymbolicMapper()
         return [lp.Assignment(id=None,
                     assignee="kernel_scaling",
diff --git a/sumpy/expansion/__init__.py b/sumpy/expansion/__init__.py
index cc49c6ca2e712ecbf43a0a6d460d0e0dd2667152..f50bd5fbb9facd732c0c192c787a0a420d72b58c 100644
--- a/sumpy/expansion/__init__.py
+++ b/sumpy/expansion/__init__.py
@@ -25,9 +25,9 @@ THE SOFTWARE.
 """
 
 import numpy as np
-import sympy as sp
 import logging
 from pytools import memoize_method
+import sumpy.symbolic as sym
 from sumpy.tools import MiDerivativeTaker
 
 __doc__ = """
@@ -356,7 +356,7 @@ class HelmholtzDerivativeWrangler(LinearRecurrenceBasedDerivativeWrangler):
 
                 coeffs[needed_deriv] = -1
             else:
-                k = sp.Symbol(self.helmholtz_k_name)
+                k = sym.Symbol(self.helmholtz_k_name)
                 coeffs[tuple(reduced_deriv)] = -k*k
                 return coeffs
 
diff --git a/sumpy/expansion/local.py b/sumpy/expansion/local.py
index 3ba643cd2e1bdf6af214f069b278f01d650b26d3..f9ec106a349eb392b54dc4fd68992d38f759c298 100644
--- a/sumpy/expansion/local.py
+++ b/sumpy/expansion/local.py
@@ -23,7 +23,7 @@ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 THE SOFTWARE.
 """
 
-import sympy as sp
+import sumpy.symbolic as sym
 
 from sumpy.expansion import (
     ExpansionBase, VolumeTaylorExpansion, LaplaceConformingVolumeTaylorExpansion,
@@ -61,24 +61,29 @@ class LineTaylorLocalExpansion(LocalExpansionBase):
             raise RuntimeError("cannot use line-Taylor expansions in a setting "
                     "where the center-target vector is not known at coefficient "
                     "formation")
-        avec_line = avec + sp.Symbol("tau")*bvec
+
+        tau = sym.Symbol("tau")
+
+        avec_line = avec + tau*bvec
 
         line_kernel = self.kernel.get_expression(avec_line)
 
-        return [
-                self.kernel.postprocess_at_target(
-                    self.kernel.postprocess_at_source(
-                        line_kernel.diff("tau", i),
-                        avec),
-                    bvec)
-                .subs("tau", 0)
+        from sumpy.tools import MiDerivativeTaker, my_syntactic_subs
+        deriv_taker = MiDerivativeTaker(line_kernel, (tau,))
+
+        return [my_syntactic_subs(
+                    self.kernel.postprocess_at_target(
+                        self.kernel.postprocess_at_source(
+                            deriv_taker.diff(i),
+                            avec), bvec),
+                    {tau: 0})
                 for i in self.get_coefficient_identifiers()]
 
     def evaluate(self, coeffs, bvec):
         from pytools import factorial
-        return sum(
+        return sym.Add(*(
                 coeffs[self.get_storage_index(i)] / factorial(i)
-                for i in self.get_coefficient_identifiers())
+                for i in self.get_coefficient_identifiers()))
 
 # }}}
 
@@ -140,7 +145,7 @@ class VolumeTaylorLocalExpansionBase(LocalExpansionBase):
                         src_expansion.get_coefficient_identifiers()):
                     kernel_deriv = taker.diff(add_mi(deriv, term))
                     local_result.append(coeff * kernel_deriv)
-                result.append(sp.Add(*local_result))
+                result.append(sym.Add(*local_result))
         else:
             from sumpy.tools import MiDerivativeTaker
             expr = src_expansion.evaluate(src_coeff_exprs, dvec)
@@ -197,61 +202,61 @@ class H2DLocalExpansion(LocalExpansionBase):
         return list(range(-self.order, self.order+1))
 
     def coefficients_from_source(self, avec, bvec):
-        from sumpy.symbolic import sympy_real_norm_2
-        hankel_1 = sp.Function("hankel_1")
+        from sumpy.symbolic import sym_real_norm_2
+        hankel_1 = sym.Function("hankel_1")
 
-        k = sp.Symbol(self.kernel.get_base_kernel().helmholtz_k_name)
+        k = sym.Symbol(self.kernel.get_base_kernel().helmholtz_k_name)
 
         # The coordinates are negated since avec points from source to center.
-        source_angle_rel_center = sp.atan2(-avec[1], -avec[0])
-        avec_len = sympy_real_norm_2(avec)
+        source_angle_rel_center = sym.atan2(-avec[1], -avec[0])
+        avec_len = sym_real_norm_2(avec)
         return [self.kernel.postprocess_at_source(
                     hankel_1(l, k * avec_len)
-                    * sp.exp(sp.I * l * source_angle_rel_center), avec)
+                    * sym.exp(sym.I * l * source_angle_rel_center), avec)
                     for l in self.get_coefficient_identifiers()]
 
     def evaluate(self, coeffs, bvec):
-        from sumpy.symbolic import sympy_real_norm_2
-        bessel_j = sp.Function("bessel_j")
-        bvec_len = sympy_real_norm_2(bvec)
-        target_angle_rel_center = sp.atan2(bvec[1], bvec[0])
+        from sumpy.symbolic import sym_real_norm_2
+        bessel_j = sym.Function("bessel_j")
+        bvec_len = sym_real_norm_2(bvec)
+        target_angle_rel_center = sym.atan2(bvec[1], bvec[0])
 
-        k = sp.Symbol(self.kernel.get_base_kernel().helmholtz_k_name)
+        k = sym.Symbol(self.kernel.get_base_kernel().helmholtz_k_name)
 
         return sum(coeffs[self.get_storage_index(l)]
                    * self.kernel.postprocess_at_target(
                        bessel_j(l, k * bvec_len)
-                       * sp.exp(sp.I * l * -target_angle_rel_center), bvec)
+                       * sym.exp(sym.I * l * -target_angle_rel_center), bvec)
                 for l in self.get_coefficient_identifiers())
 
     def translate_from(self, src_expansion, src_coeff_exprs, dvec):
-        from sumpy.symbolic import sympy_real_norm_2
+        from sumpy.symbolic import sym_real_norm_2
 
-        k = sp.Symbol(self.kernel.get_base_kernel().helmholtz_k_name)
+        k = sym.Symbol(self.kernel.get_base_kernel().helmholtz_k_name)
 
         if isinstance(src_expansion, H2DLocalExpansion):
-            dvec_len = sympy_real_norm_2(dvec)
-            bessel_j = sp.Function("bessel_j")
-            new_center_angle_rel_old_center = sp.atan2(dvec[1], dvec[0])
+            dvec_len = sym_real_norm_2(dvec)
+            bessel_j = sym.Function("bessel_j")
+            new_center_angle_rel_old_center = sym.atan2(dvec[1], dvec[0])
             translated_coeffs = []
             for l in self.get_coefficient_identifiers():
                 translated_coeffs.append(
                     sum(src_coeff_exprs[src_expansion.get_storage_index(m)]
                         * bessel_j(m - l, k * dvec_len)
-                        * sp.exp(sp.I * (m - l) * -new_center_angle_rel_old_center)
+                        * sym.exp(sym.I * (m - l) * -new_center_angle_rel_old_center)
                     for m in src_expansion.get_coefficient_identifiers()))
             return translated_coeffs
 
         from sumpy.expansion.multipole import H2DMultipoleExpansion
         if isinstance(src_expansion, H2DMultipoleExpansion):
-            dvec_len = sympy_real_norm_2(dvec)
-            hankel_1 = sp.Function("hankel_1")
-            new_center_angle_rel_old_center = sp.atan2(dvec[1], dvec[0])
+            dvec_len = sym_real_norm_2(dvec)
+            hankel_1 = sym.Function("hankel_1")
+            new_center_angle_rel_old_center = sym.atan2(dvec[1], dvec[0])
             translated_coeffs = []
             for l in self.get_coefficient_identifiers():
                 translated_coeffs.append(
                     sum((-1) ** l * hankel_1(m + l, k * dvec_len)
-                        * sp.exp(sp.I * (m + l) * new_center_angle_rel_old_center)
+                        * sym.exp(sym.I * (m + l) * new_center_angle_rel_old_center)
                         * src_coeff_exprs[src_expansion.get_storage_index(m)]
                     for m in src_expansion.get_coefficient_identifiers()))
             return translated_coeffs
diff --git a/sumpy/expansion/multipole.py b/sumpy/expansion/multipole.py
index fa227e89ab3a0290dd64d9420340ceeec8d93843..ce3b78d8ae25049f48f1b1569ac7483bf8586923 100644
--- a/sumpy/expansion/multipole.py
+++ b/sumpy/expansion/multipole.py
@@ -23,7 +23,7 @@ THE SOFTWARE.
 """
 
 from six.moves import range, zip
-import sympy as sp  # noqa
+import sumpy.symbolic as sym  # noqa
 
 from sumpy.expansion import (
     ExpansionBase, VolumeTaylorExpansion, LaplaceConformingVolumeTaylorExpansion,
@@ -63,8 +63,8 @@ class VolumeTaylorMultipoleExpansionBase(MultipoleExpansionBase):
                 raise NotImplementedError("more than one source derivative "
                         "not supported at present")
 
-            from sumpy.symbolic import make_sympy_vector
-            dir_vec = make_sympy_vector(kernel.dir_vec_name, kernel.dim)
+            from sumpy.symbolic import make_sym_vector
+            dir_vec = make_sym_vector(kernel.dir_vec_name, kernel.dim)
 
             coeff_identifiers = self.get_full_coefficient_identifiers()
             result = [0] * len(coeff_identifiers)
@@ -91,9 +91,9 @@ class VolumeTaylorMultipoleExpansionBase(MultipoleExpansionBase):
 
     def evaluate(self, coeffs, bvec):
         taker = self.get_kernel_derivative_taker(bvec)
-        result = sum(
+        result = sym.Add(*tuple(
                 coeff * taker.diff(mi)
-                for coeff, mi in zip(coeffs, self.get_coefficient_identifiers()))
+                for coeff, mi in zip(coeffs, self.get_coefficient_identifiers())))
         return result
 
     def get_kernel_derivative_taker(self, bvec):
@@ -143,7 +143,8 @@ class VolumeTaylorMultipoleExpansionBase(MultipoleExpansionBase):
                     n = tgt_mi[idim]
                     k = src_mi[idim]
                     assert n >= k
-                    contrib *= (sp.binomial(n, k)
+                    from sympy import binomial
+                    contrib *= (binomial(n, k)
                             * dvec[idim]**(n-k))
 
                 result[i] += contrib
@@ -201,31 +202,31 @@ class H2DMultipoleExpansion(MultipoleExpansionBase):
         return list(range(-self.order, self.order+1))
 
     def coefficients_from_source(self, avec, bvec):
-        from sumpy.symbolic import sympy_real_norm_2
-        bessel_j = sp.Function("bessel_j")
-        avec_len = sympy_real_norm_2(avec)
+        from sumpy.symbolic import sym_real_norm_2
+        bessel_j = sym.Function("bessel_j")
+        avec_len = sym_real_norm_2(avec)
 
-        k = sp.Symbol(self.kernel.get_base_kernel().helmholtz_k_name)
+        k = sym.Symbol(self.kernel.get_base_kernel().helmholtz_k_name)
 
         # The coordinates are negated since avec points from source to center.
-        source_angle_rel_center = sp.atan2(-avec[1], -avec[0])
+        source_angle_rel_center = sym.atan2(-avec[1], -avec[0])
         return [self.kernel.postprocess_at_source(
                     bessel_j(l, k * avec_len) *
-                    sp.exp(sp.I * l * -source_angle_rel_center), avec)
+                    sym.exp(sym.I * l * -source_angle_rel_center), avec)
                 for l in self.get_coefficient_identifiers()]
 
     def evaluate(self, coeffs, bvec):
-        from sumpy.symbolic import sympy_real_norm_2
-        hankel_1 = sp.Function("hankel_1")
-        bvec_len = sympy_real_norm_2(bvec)
-        target_angle_rel_center = sp.atan2(bvec[1], bvec[0])
+        from sumpy.symbolic import sym_real_norm_2
+        hankel_1 = sym.Function("hankel_1")
+        bvec_len = sym_real_norm_2(bvec)
+        target_angle_rel_center = sym.atan2(bvec[1], bvec[0])
 
-        k = sp.Symbol(self.kernel.get_base_kernel().helmholtz_k_name)
+        k = sym.Symbol(self.kernel.get_base_kernel().helmholtz_k_name)
 
         return sum(coeffs[self.get_storage_index(l)]
                    * self.kernel.postprocess_at_target(
                        hankel_1(l, k * bvec_len)
-                       * sp.exp(sp.I * l * target_angle_rel_center), bvec)
+                       * sym.exp(sym.I * l * target_angle_rel_center), bvec)
                 for l in self.get_coefficient_identifiers())
 
     def translate_from(self, src_expansion, src_coeff_exprs, dvec):
@@ -233,19 +234,19 @@ class H2DMultipoleExpansion(MultipoleExpansionBase):
             raise RuntimeError("do not know how to translate %s to "
                                "multipole 2D Helmholtz Bessel expansion"
                                % type(src_expansion).__name__)
-        from sumpy.symbolic import sympy_real_norm_2
-        dvec_len = sympy_real_norm_2(dvec)
-        bessel_j = sp.Function("bessel_j")
-        new_center_angle_rel_old_center = sp.atan2(dvec[1], dvec[0])
+        from sumpy.symbolic import sym_real_norm_2
+        dvec_len = sym_real_norm_2(dvec)
+        bessel_j = sym.Function("bessel_j")
+        new_center_angle_rel_old_center = sym.atan2(dvec[1], dvec[0])
 
-        k = sp.Symbol(self.kernel.get_base_kernel().helmholtz_k_name)
+        k = sym.Symbol(self.kernel.get_base_kernel().helmholtz_k_name)
 
         translated_coeffs = []
         for l in self.get_coefficient_identifiers():
             translated_coeffs.append(
                 sum(src_coeff_exprs[src_expansion.get_storage_index(m)]
                     * bessel_j(m - l, k * dvec_len)
-                    * sp.exp(sp.I * (m - l) * new_center_angle_rel_old_center)
+                    * sym.exp(sym.I * (m - l) * new_center_angle_rel_old_center)
                 for m in src_expansion.get_coefficient_identifiers()))
         return translated_coeffs
 
diff --git a/sumpy/kernel.py b/sumpy/kernel.py
index e29ac5167051e5d781a9fdb331d97fb3fef84df2..ea0ea02b513775391bdfed754cb3f69274533037 100644
--- a/sumpy/kernel.py
+++ b/sumpy/kernel.py
@@ -264,10 +264,11 @@ class ExpressionKernel(Kernel):
         if self.dim != len(dist_vec):
             raise ValueError("dist_vec length does not match expected dimension")
 
-        expr = expr.subs([
-            ("d%d" % i, dist_vec_i)
+        from sumpy.symbolic import Symbol
+        expr = expr.subs(dict(
+            (Symbol("d%d" % i), dist_vec_i)
             for i, dist_vec_i in enumerate(dist_vec)
-            ])
+            ))
 
         return expr
 
@@ -715,7 +716,7 @@ class DirectionalTargetDerivative(DirectionalDerivative):
         dim = len(bvec)
         assert dim == self.dim
 
-        from sumpy.symbolic import make_sympy_vector
+        from sumpy.symbolic import make_sym_vector as make_sympy_vector
         dir_vec = make_sympy_vector(self.dir_vec_name, dim)
 
         # bvec = tgt-center
@@ -746,7 +747,7 @@ class DirectionalSourceDerivative(DirectionalDerivative):
         dimensions = len(avec)
         assert dimensions == self.dim
 
-        from sumpy.symbolic import make_sympy_vector
+        from sumpy.symbolic import make_sym_vector as make_sympy_vector
         dir_vec = make_sympy_vector(self.dir_vec_name, dimensions)
 
         # avec = center-src -> minus sign from chain rule
diff --git a/sumpy/p2e.py b/sumpy/p2e.py
index 1b037dd4ad2ced517ddabff925ba97e6c5527348..6a8fcfa9d2ea8dd7a034d0adb1a7cc152c8db7bc 100644
--- a/sumpy/p2e.py
+++ b/sumpy/p2e.py
@@ -70,8 +70,8 @@ class P2EBase(KernelCacheWrapper):
         self.dim = expansion.dim
 
     def get_loopy_instructions(self):
-        from sumpy.symbolic import make_sympy_vector
-        avec = make_sympy_vector("a", self.dim)
+        from sumpy.symbolic import make_sym_vector
+        avec = make_sym_vector("a", self.dim)
 
         from sumpy.assignment_collection import SymbolicAssignmentCollection
         sac = SymbolicAssignmentCollection()
@@ -83,17 +83,12 @@ class P2EBase(KernelCacheWrapper):
 
         sac.run_global_cse()
 
-        from sumpy.symbolic import kill_trivial_assignments
-        assignments = kill_trivial_assignments([
-                (name, expr)
-                for name, expr in six.iteritems(sac.assignments)],
-                retain_names=coeff_names)
-
         from sumpy.codegen import to_loopy_insns
         return to_loopy_insns(
-                assignments,
+                six.iteritems(sac.assignments),
                 vector_names=set(["a"]),
                 pymbolic_expr_maps=[self.expansion.get_code_transformer()],
+                retain_names=coeff_names,
                 complex_dtype=np.complex128  # FIXME
                 )
 
diff --git a/sumpy/p2p.py b/sumpy/p2p.py
index 3071a3b88c9b1fc45c7d4ce3470535f084aba0a9..39a9020544e05675903b0b4624db3fd95300830c 100644
--- a/sumpy/p2p.py
+++ b/sumpy/p2p.py
@@ -69,8 +69,8 @@ class P2PBase(KernelComputation, KernelCacheWrapper):
         self.dim = single_valued(knl.dim for knl in self.kernels)
 
     def get_loopy_insns_and_result_names(self):
-        from sumpy.symbolic import make_sympy_vector
-        dvec = make_sympy_vector("d", self.dim)
+        from sumpy.symbolic import make_sym_vector
+        dvec = make_sym_vector("d", self.dim)
 
         from sumpy.assignment_collection import SymbolicAssignmentCollection
         sac = SymbolicAssignmentCollection()
@@ -90,6 +90,7 @@ class P2PBase(KernelComputation, KernelCacheWrapper):
                 vector_names=set(["d"]),
                 pymbolic_expr_maps=[
                         knl.get_code_transformer() for knl in self.kernels],
+                retain_names=result_names,
                 complex_dtype=np.complex128  # FIXME
                 )
 
diff --git a/sumpy/qbx.py b/sumpy/qbx.py
index 0b3dc03537e788a75412b712e1c3ad18b14c11db..8892739e3b9050d3d617e8f210b288b3da4b7e28 100644
--- a/sumpy/qbx.py
+++ b/sumpy/qbx.py
@@ -27,7 +27,7 @@ import six
 from six.moves import range, zip
 import numpy as np
 import loopy as lp
-import sympy as sp
+import sumpy.symbolic as sym
 from pytools import memoize_method
 from pymbolic import parse, var
 
@@ -64,7 +64,7 @@ def expand(expansion_nr, sac, expansion, avec, bvec):
     coefficients = expansion.coefficients_from_source(avec, bvec)
 
     assigned_coeffs = [
-            sp.Symbol(
+            sym.Symbol(
                     sac.assign_unique("expn%dcoeff%s" % (
                         expansion_nr, stringify_expn_index(i)),
                         coefficients[expansion.get_storage_index(i)]))
@@ -117,10 +117,10 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
 
     @memoize_method
     def get_kernel(self):
-        from sumpy.symbolic import make_sympy_vector
+        from sumpy.symbolic import make_sym_vector
 
-        avec = make_sympy_vector("a", self.dim)
-        bvec = make_sympy_vector("b", self.dim)
+        avec = make_sym_vector("a", self.dim)
+        bvec = make_sym_vector("b", self.dim)
 
         from sumpy.assignment_collection import SymbolicAssignmentCollection
         sac = SymbolicAssignmentCollection()
@@ -134,17 +134,13 @@ class LayerPotentialBase(KernelComputation, KernelCacheWrapper):
 
         sac.run_global_cse()
 
-        from sumpy.symbolic import kill_trivial_assignments
-        assignments = kill_trivial_assignments([
-                (name, expr.subs("tau", 0))
-                for name, expr in six.iteritems(sac.assignments)],
-                retain_names=result_names)
-
         from sumpy.codegen import to_loopy_insns
-        loopy_insns = to_loopy_insns(assignments,
+        loopy_insns = to_loopy_insns(
+                six.iteritems(sac.assignments),
                 vector_names=set(["a", "b"]),
                 pymbolic_expr_maps=[
                     expn.kernel.get_code_transformer() for expn in self.expansions],
+                retain_names=result_names,
                 complex_dtype=np.complex128  # FIXME
                 )
 
diff --git a/sumpy/symbolic.py b/sumpy/symbolic.py
index 86c2666db8611d98c2cd6882db8c77a2abb04854..85dbb278eb88e82d753aee0f22b958a23bc67eba 100644
--- a/sumpy/symbolic.py
+++ b/sumpy/symbolic.py
@@ -27,68 +27,77 @@ import six
 from six.moves import range
 from six.moves import zip
 
-import sympy as sp
 import numpy as np
 from pymbolic.mapper import IdentityMapper as IdentityMapperBase
-from pymbolic.interop.sympy import PymbolicToSympyMapper
 import pymbolic.primitives as prim
 
 import logging
 logger = logging.getLogger(__name__)
 
 
-# {{{ trivial assignment elimination
+# {{{ symbolic backend
 
-def make_one_step_subst(assignments):
-    unwanted_vars = set(sp.Symbol(name) for name, value in assignments)
+def _find_symbolic_backend():
+    global USE_SYMENGINE
 
-    result = {}
-    for name, value in assignments:
-        while value.atoms() & unwanted_vars:
-            value = value.subs(assignments)
+    try:
+        import symengine  # noqa
+        symengine_found = True
+    except ImportError:
+        symengine_found = False
 
-        result[sp.Symbol(name)] = value
-
-    return result
+    ALLOWED_BACKENDS = ("sympy", "symengine")  # noqa
+    BACKEND_ENV_VAR = "SUMPY_FORCE_SYMBOLIC_BACKEND"  # noqa
 
+    import os
+    backend = os.environ.get(BACKEND_ENV_VAR)
+    if backend is not None:
+        if backend not in ALLOWED_BACKENDS:
+            raise RuntimeError(
+                "%s value is unrecognized: '%s' "
+                "(allowed values are %s)" % (
+                    BACKEND_ENV_VAR,
+                    backend,
+                    ", ".join("'%s'" % val for val in ALLOWED_BACKENDS)))
 
-def is_assignment_nontrivial(name, value):
-    if value.is_Number:
-        return False
-    elif isinstance(value, sp.Symbol):
-        return False
-    elif (isinstance(value, sp.Mul)
-            and len(value.args) == 2
-            and sum(1 for arg in value.args if arg.is_Number) == 1
-            and sum(1 for arg in value.args if isinstance(arg, sp.Symbol)) == 1):
-        # const*var: not good enough
-        return False
+        if backend == "symengine" and not symengine_found:
+            from warnings import warn
+            warn("%s=symengine was specified, but could not find symengine. "
+                 "Using sympy." % BACKEND_ENV_VAR, RuntimeWarning)
 
-    return True
+        USE_SYMENGINE = backend == "symengine" and symengine_found
+    else:
+        USE_SYMENGINE = symengine_found
 
 
-def kill_trivial_assignments(assignments, retain_names=set()):
-    logger.info("kill trivial assignments (plain): start")
-    approved_assignments = []
-    rejected_assignments = []
+_find_symbolic_backend()
 
-    for name, value in assignments:
-        if name in retain_names or is_assignment_nontrivial(name, value):
-            approved_assignments.append((name, value))
-        else:
-            rejected_assignments.append((name, value))
+# }}}
 
-    # un-substitute rejected assignments
-    unsubst_rej = make_one_step_subst(rejected_assignments)
+# Symbolic API common to SymEngine and sympy.
+# Before adding a function here, make sure it's present in both modules.
+SYMBOLIC_API = """
+Add Basic Mul Pow exp sqrt symbols sympify cos sin atan2 Function Symbol
+Derivative Integer Matrix Subs I pi functions""".split()
 
-    result = [(name, expr.xreplace(unsubst_rej))
-            for name, expr in approved_assignments]
+if USE_SYMENGINE:
+    from symengine import sympy_compat as sym
+    from pymbolic.interop.symengine import (
+        PymbolicToSymEngineMapper as PymbolicToSympyMapper,
+        SymEngineToPymbolicMapper as SympyToPymbolicMapper)
+else:
+    import sympy as sym
+    from pymbolic.interop.sympy import (
+        PymbolicToSympyMapper, SympyToPymbolicMapper)
 
-    logger.info("kill trivial assignments (plain): done")
+for _apifunc in SYMBOLIC_API:
+    globals()[_apifunc] = getattr(sym, _apifunc)
 
-    return result
 
-# }}}
+def _coeff_isneg(a):
+    if a.is_Mul:
+        a = a.args[0]
+    return a.is_Number and a.is_negative
 
 
 # {{{ debugging of sympy CSE via Maxima
@@ -111,7 +120,6 @@ def _get_assignments_in_maxima(assignments, prefix=""):
 
     from pymbolic.maxima import MaximaStringifyMapper
     mstr = MaximaStringifyMapper()
-    from pymbolic.interop.sympy import SympyToPymbolicMapper
     s2p = SympyToPymbolicMapper()
     dkill = _DerivativeKiller()
 
@@ -119,12 +127,12 @@ def _get_assignments_in_maxima(assignments, prefix=""):
 
     def write_assignment(name):
         symbols = [atm for atm in assignments[name].atoms()
-                if isinstance(atm, sp.Symbol)
+                if isinstance(atm, sym.Symbol)
                 and atm.name in my_variable_names]
 
-        for sym in symbols:
-            if sym.name not in written_assignments:
-                write_assignment(sym.name)
+        for symb in symbols:
+            if symb.name not in written_assignments:
+                write_assignment(symb.name)
 
         result.append("%s%s : %s;" % (
             prefix, name, mstr(dkill(s2p(
@@ -143,7 +151,7 @@ def checked_cse(exprs, symbols=None):
     if symbols is not None:
         kwargs["symbols"] = symbols
 
-    new_assignments, new_exprs = sp.cse(exprs, **kwargs)
+    new_assignments, new_exprs = sym.cse(exprs, **kwargs)
 
     max_old = _get_assignments_in_maxima(dict(
             ("old_expr%d" % i, expr)
@@ -171,8 +179,8 @@ def checked_cse(exprs, symbols=None):
 # }}}
 
 
-def sympy_real_norm_2(x):
-    return sp.sqrt((x.T*x)[0, 0])
+def sym_real_norm_2(x):
+    return sym.sqrt((x.T*x)[0, 0])
 
 
 def pymbolic_real_norm_2(x):
@@ -180,9 +188,9 @@ def pymbolic_real_norm_2(x):
     return var("sqrt")(np.dot(x, x))
 
 
-def make_sympy_vector(name, components):
-    return sp.Matrix(
-            [sp.Symbol("%s%d" % (name, i)) for i in range(components)])
+def make_sym_vector(name, components):
+    return sym.Matrix(
+            [sym.Symbol("%s%d" % (name, i)) for i in range(components)])
 
 
 def vector_subs(expr, old, new):
@@ -194,8 +202,8 @@ def vector_subs(expr, old, new):
 
 
 def find_power_of(base, prod):
-    remdr = sp.Wild("remdr")
-    power = sp.Wild("power")
+    remdr = sym.Wild("remdr")
+    power = sym.Wild("power")
     result = prod.match(remdr*base**power)
     if result is None:
         return 0
@@ -205,17 +213,16 @@ def find_power_of(base, prod):
 class PymbolicToSympyMapperWithSymbols(PymbolicToSympyMapper):
     def map_variable(self, expr):
         if expr.name == "I":
-            return sp.I
+            return sym.I
         elif expr.name == "pi":
-            return sp.pi
+            return sym.pi
         else:
             return PymbolicToSympyMapper.map_variable(self, expr)
 
     def map_subscript(self, expr):
         if isinstance(expr.aggregate, prim.Variable) and isinstance(expr.index, int):
-            return sp.Symbol("%s%d" % (expr.aggregate.name, expr.index))
+            return sym.Symbol("%s%d" % (expr.aggregate.name, expr.index))
         else:
-            raise RuntimeError("do not know how to translate '%s' to sympy"
-                    % expr)
+            self.raise_conversion_error(expr)
 
 # vim: fdm=marker
diff --git a/sumpy/tools.py b/sumpy/tools.py
index 60687510c66cd76aaca87af035464c858133d766..f5014295c9f9f560470a85ec2f8d6a3dcc3a42ae 100644
--- a/sumpy/tools.py
+++ b/sumpy/tools.py
@@ -28,6 +28,7 @@ THE SOFTWARE.
 
 from pytools import memoize_method
 import numpy as np
+import sumpy.symbolic as sym
 
 import logging
 logger = logging.getLogger(__name__)
@@ -57,6 +58,7 @@ def mi_power(vector, mi):
 class MiDerivativeTaker(object):
 
     def __init__(self, expr, var_list):
+        assert isinstance(expr, sym.Basic)
         self.var_list = var_list
         empty_mi = (0,) * len(var_list)
         self.cache_by_mi = {empty_mi: expr}
@@ -107,7 +109,7 @@ class LinearRecurrenceBasedMiDerivativeTaker(MiDerivativeTaker):
             expr = self.cache_by_mi[mi]
         except KeyError:
             from six import iteritems
-            from sympy import Add
+            from sumpy.symbolic import Add
 
             closest_mi = self.get_closest_cached_mi(mi)
             expr = self.cache_by_mi[closest_mi]
@@ -264,7 +266,7 @@ class KernelComputation(object):
         self.name = name or self.default_name
 
     def get_kernel_scaling_assignments(self):
-        from pymbolic.interop.sympy import SympyToPymbolicMapper
+        from sumpy.symbolic import SympyToPymbolicMapper
         sympy_conv = SympyToPymbolicMapper()
 
         import loopy as lp
@@ -383,4 +385,54 @@ class KernelCacheWrapper(object):
 
         return knl
 
+
+def my_syntactic_subs(expr, subst_dict):
+    # Workaround for differing substitution semantics between sympy and symengine.
+    # FIXME: This is a hack.
+    from sumpy.symbolic import Basic, Subs, Derivative, USE_SYMENGINE
+
+    if not isinstance(expr, Basic):
+        return expr
+
+    elif expr.is_Symbol:
+        return subst_dict.get(expr, expr)
+
+    elif isinstance(expr, Subs):
+        new_point = tuple(my_syntactic_subs(p, subst_dict) for p in expr.point)
+
+        import six
+        new_subst_dict = dict(
+            (var, subs) for var, subs in six.iteritems(subst_dict)
+            if var not in expr.variables)
+
+        new_expr = my_syntactic_subs(expr.expr, new_subst_dict)
+
+        if new_point != expr.point or new_expr != expr.expr:
+            return Subs(new_expr, expr.variables, new_point)
+
+        return expr
+
+    elif isinstance(expr, Derivative):
+        new_expr = my_syntactic_subs(expr.expr, subst_dict)
+        new_variables = my_syntactic_subs(expr.variables, subst_dict)
+
+        if new_expr != expr.expr or any(new_var != var for new_var, var in
+                                          zip(new_variables, expr.variables)):
+            # FIXME in SymEngine
+            if USE_SYMENGINE:
+                return Derivative(new_expr, new_variables)
+            else:
+                return Derivative(new_expr, *new_variables)
+
+        return expr
+
+    else:
+        new_args = tuple(my_syntactic_subs(arg, subst_dict) for arg in expr.args)
+
+        if any(new_arg != arg for arg, new_arg in zip(expr.args, new_args)):
+            return expr.func(*new_args)
+
+        return expr
+
+
 # vim: fdm=marker
diff --git a/sumpy/version.py b/sumpy/version.py
index be874f65b3e22b32ad34f07f49670b51f8e32f5d..f0497a83b01f992cea2293bddf4b04a64fbe3702 100644
--- a/sumpy/version.py
+++ b/sumpy/version.py
@@ -25,4 +25,4 @@ VERSION = (2016, 1)
 VERSION_STATUS = "beta1"
 VERSION_TEXT = ".".join(str(x) for x in VERSION) + VERSION_STATUS
 
-KERNEL_VERSION = 13
+KERNEL_VERSION = 14
diff --git a/test/test_codegen.py b/test/test_codegen.py
index 59fe0340caceb7eaf281f7d725d0eb7343c82bcc..c7b5e0bc357ace12ecc59d11ead7ac9af7d19b1d 100644
--- a/test/test_codegen.py
+++ b/test/test_codegen.py
@@ -29,6 +29,38 @@ import logging
 logger = logging.getLogger(__name__)
 
 
+def test_kill_trivial_assignments():
+    from pymbolic import var
+    x, y, t0, t1, t2 = [var(s) for s in "x y t0 t1 t2".split()]
+
+    assignments = (
+        ("t0", 6),
+        ("t1", -t0),
+        ("t2", 6*x),
+        ("nt", x**y),
+        # users of trivial assignments
+        ("u0", t0 + 1),
+        ("u1", t1 + 1),
+        ("u2", t2 + 1),
+    )
+
+    from sumpy.codegen import kill_trivial_assignments
+    result = kill_trivial_assignments(
+        assignments,
+        retain_names=("u0", "u1", "u2"))
+
+    from pymbolic.primitives import Sum
+
+    def _s(*vals):
+        return Sum(vals)
+
+    assert result == [
+        ('nt', x**y),
+        ('u0', _s(6, 1)),
+        ('u1', _s(-6, 1)),
+        ('u2', _s(6*x, 1))]
+
+
 def test_symbolic_assignment_name_uniqueness():
     # https://gitlab.tiker.net/inducer/sumpy/issues/13
     from sumpy.assignment_collection import SymbolicAssignmentCollection
diff --git a/test/test_cse.py b/test/test_cse.py
index d2904176e486ee16bb92bae9c8702f76969a4bae..4f957aaa5d20ef4b02d40a5b7c2052c4c6a5345a 100644
--- a/test/test_cse.py
+++ b/test/test_cse.py
@@ -69,12 +69,13 @@ DAMAGE.
 import pytest
 import sys
 
-from sympy import (Add, Pow, exp, sqrt, symbols, sympify, S, cos, sin, Eq,
-                   Function, Tuple, CRootOf, IndexedBase, Idx, Piecewise)
-from sympy.simplify.cse_opts import sub_pre, sub_post
-from sympy.functions.special.hyper import meijerg
-from sympy.simplify import cse_opts
+from sumpy.symbolic import (
+    Add, Pow, exp, sqrt, symbols, sympify, cos, sin, Function, USE_SYMENGINE)
 
+if not USE_SYMENGINE:
+    from sympy.simplify.cse_opts import sub_pre, sub_post
+    from sympy.functions.special.hyper import meijerg
+    from sympy.simplify import cse_opts
 
 from sumpy.cse import (
     cse, preprocess_for_cse, postprocess_for_cse)
@@ -83,6 +84,9 @@ from sumpy.cse import (
 w, x, y, z = symbols('w,x,y,z')
 x0, x1, x2, x3, x4, x5, x6, x7, x8, x9, x10, x11, x12 = symbols('x:13')
 
+sympyonly = (
+    pytest.mark.skipif(USE_SYMENGINE, reason="uses a sympy-only feature"))
+
 
 # Dummy "optimization" functions for testing.
 
@@ -122,6 +126,7 @@ def test_cse_single():
     assert reduced == [sqrt(x0) + x0**2]
 
 
+@sympyonly
 def test_cse_not_possible():
     # No substitution possible.
     e = Add(x, y)
@@ -142,6 +147,7 @@ def test_nested_substitution():
     assert reduced == [sqrt(x0) + x0**2]
 
 
+@sympyonly
 def test_subtraction_opt():
     # Make sure subtraction is optimized.
     e = (x - y)*(z - y) + exp((x - y)*(z - y))
@@ -177,12 +183,13 @@ def test_multiple_expressions():
     rsubsts, _ = cse(reversed(l))
     assert substs == rsubsts
     assert reduced == [x1, x1 + z, x0]
-    l = [(x - z)*(y - z), x - z, y - z]
+    f = Function("f")
+    l = [f(x - z, y - z), x - z, y - z]
     substs, reduced = cse(l)
     rsubsts, _ = cse(reversed(l))
     assert substs == [(x0, -z), (x1, x + x0), (x2, x0 + y)]
     assert rsubsts == [(x0, -z), (x1, x0 + y), (x2, x + x0)]
-    assert reduced == [x1*x2, x1, x2]
+    assert reduced == [f(x1, x2), x1, x2]
     l = [w*y + w + x + y + z, w*x*y]
     assert cse(l) == ([(x0, w*y)], [w + x + x0 + y + z, x*x0])
     assert cse([x + y, x + y + z]) == ([(x0, x + y)], [x0, z + x0])
@@ -195,26 +202,31 @@ def test_issue_4203():
     assert cse(sin(x**x)/x**x) == ([(x0, x**x)], [sin(x0)/x0])
 
 
-def test_dont_cse_tuples():
-    from sympy import Subs
+def test_dont_cse_subs():
+    from sumpy.symbolic import Subs
     f = Function("f")
     g = Function("g")
 
     name_val, (expr,) = cse(
-        Subs(f(x, y), (x, y), (0, 1))
-        + Subs(g(x, y), (x, y), (0, 1)))
+        Subs(f(x, y), (x, y), (0, x + y))
+        + Subs(g(x, y), (x, y), (0, x + y)))
 
     assert name_val == []
-    assert expr == (Subs(f(x, y), (x, y), (0, 1))
-            + Subs(g(x, y), (x, y), (0, 1)))
+    assert expr == Subs(f(x, y), (x, y), (0, x + y)) + \
+        Subs(g(x, y), (x, y), (0, x + y))
 
-    name_val, (expr,) = cse(
-        Subs(f(x, y), (x, y), (0, x + y))
-        + Subs(g(x, y), (x, y), (0, x + y)))
 
-    assert name_val == [(x0, x + y)]
-    assert expr == Subs(f(x, y), (x, y), (0, x0)) + \
-        Subs(g(x, y), (x, y), (0, x0))
+def test_dont_cse_derivative():
+    from sumpy.symbolic import Derivative
+    f = Function("f")
+
+    # FIXME
+    deriv = Derivative(f(x+y), (x,)) if USE_SYMENGINE else Derivative(f(x+y), x)
+
+    name_val, (expr,) = cse(x + y + deriv)
+
+    assert name_val == []
+    assert expr == x + y + deriv
 
 
 def test_pow_invpow():
@@ -238,9 +250,11 @@ def test_pow_invpow():
         ([(x0, x**(2*y))], [x0 + 1/x0])
 
 
+@sympyonly
 def test_issue_4499():
     # previously, this gave 16 constants
     from sympy.abc import a, b
+    from sympy import Tuple, S
     B = Function('B')  # noqa
     G = Function('G')  # noqa
     t = Tuple(*
@@ -256,7 +270,9 @@ def test_issue_4499():
     assert len(c[0]) == 11
 
 
+@sympyonly
 def test_issue_6169():
+    from sympy import CRootOf
     r = CRootOf(x**6 - 4*x**5 - 2, 1)
     assert cse(r) == ([], [r])
     # and a check that the right thing is done with the new
@@ -264,7 +280,9 @@ def test_issue_6169():
     assert sub_post(sub_pre((-x - y)*z - x - y)) == -z*(x + y) - x - y
 
 
+@sympyonly
 def test_cse_Indexed():  # noqa
+    from sympy import IndexedBase, Idx
     len_y = 5
     y = IndexedBase('y', shape=(len_y,))
     x = IndexedBase('x', shape=(len_y,))
@@ -277,7 +295,9 @@ def test_cse_Indexed():  # noqa
     assert len(replacements) > 0
 
 
+@sympyonly
 def test_Piecewise():  # noqa
+    from sympy import Piecewise, Eq
     f = Piecewise((-z + x*y, Eq(y, 0)), (-z - x*y, True))
     ans = cse(f)
     actual_ans = ([(x0, -z), (x1, x*y)],
@@ -290,7 +310,7 @@ def test_name_conflict():
     z2 = x2 + x3
     l = [cos(z1) + z1, cos(z2) + z2, x0 + x2]
     substs, reduced = cse(l)
-    assert [e.subs(reversed(substs)) for e in reduced] == l
+    assert [e.subs(dict(substs)) for e in reduced] == l
 
 
 def test_name_conflict_cust_symbols():
@@ -298,7 +318,7 @@ def test_name_conflict_cust_symbols():
     z2 = x2 + x3
     l = [cos(z1) + z1, cos(z2) + z2, x0 + x2]
     substs, reduced = cse(l, symbols("x:10"))
-    assert [e.subs(reversed(substs)) for e in reduced] == l
+    assert [e.subs(dict(substs)) for e in reduced] == l
 
 
 def test_symbols_exhausted_error():
@@ -308,6 +328,7 @@ def test_symbols_exhausted_error():
         print(cse(l, symbols=sym))
 
 
+@sympyonly
 def test_issue_7840():
     # daveknippers' example
     C393 = sympify(  # noqa
diff --git a/test/test_fmm.py b/test/test_fmm.py
index 2e2bfea4803b6a5f5e17a6d05379620adff09855..33d5ddada5a67acef8b1911bc7bd0011441e3b2c 100644
--- a/test/test_fmm.py
+++ b/test/test_fmm.py
@@ -63,6 +63,7 @@ else:
     (l2d_level_to_order_lookup, ()),
     ])
 def test_level_to_order_lookup(ctx_getter, lookup_func, extra_args):
+    pytest.importorskip("pyfmmlib")
     ctx = ctx_getter()
     queue = cl.CommandQueue(ctx)