diff --git a/arraycontext/.pytest.py.swm b/arraycontext/.pytest.py.swm
new file mode 100644
index 0000000000000000000000000000000000000000..76d766eb6afded79da5bfdcbeed45a1d8c874513
Binary files /dev/null and b/arraycontext/.pytest.py.swm differ
diff --git a/arraycontext/.pytest.py.swn b/arraycontext/.pytest.py.swn
new file mode 100644
index 0000000000000000000000000000000000000000..f96e886cfe2870ff96eadde22407c174a8facfd3
Binary files /dev/null and b/arraycontext/.pytest.py.swn differ
diff --git a/arraycontext/__init__.py b/arraycontext/__init__.py
index 76242ef45afa83aca5d734ac39a59c77e8f58ea8..1d286603a6f7fec0cdb806e63cb1805f69383615 100644
--- a/arraycontext/__init__.py
+++ b/arraycontext/__init__.py
@@ -60,9 +60,11 @@ from .container.traversal import (
 
 from .impl.pyopencl import PyOpenCLArrayContext
 from .impl.pytato import PytatoPyOpenCLArrayContext
+from .impl.pycuda import PyCUDAArrayContext
 
 from .pytest import (
         PytestPyOpenCLArrayContextFactory,
+        PytestPyCUDAArrayContextFactory,
         pytest_generate_tests_for_array_contexts,
         pytest_generate_tests_for_pyopencl_array_context)
 
@@ -91,11 +93,12 @@ __all__ = (
         "thaw", "freeze",
         "from_numpy", "to_numpy",
 
-        "PyOpenCLArrayContext", "PytatoPyOpenCLArrayContext",
+        "PyOpenCLArrayContext", "PytatoPyOpenCLArrayContext", "PyCUDAArrayContext",
 
         "make_loopy_program",
 
         "PytestPyOpenCLArrayContextFactory",
+        "PytestPyCUDAArrayContextFactory",
         "pytest_generate_tests_for_array_contexts",
         "pytest_generate_tests_for_pyopencl_array_context"
         )
diff --git a/arraycontext/fake_numpy.py b/arraycontext/fake_numpy.py
index cdb95348c6ca912bb39b01428aa7a0a96ecbfdb2..fdfc3c6dc38855f40ffdf7f08f2fe75548e1fbb9 100644
--- a/arraycontext/fake_numpy.py
+++ b/arraycontext/fake_numpy.py
@@ -145,31 +145,6 @@ class BaseFakeNumpyNamespace:
     _c_to_numpy_arc_functions = {c_name: numpy_name
             for numpy_name, c_name in _numpy_to_c_arc_functions.items()}
 
-    def __getattr__(self, name):
-        def loopy_implemented_elwise_func(*args):
-            actx = self._array_context
-            prg = _get_scalar_func_loopy_program(actx,
-                    c_name, nargs=len(args), naxes=len(args[0].shape))
-            outputs = actx.call_loopy(prg,
-                    **{"inp%d" % i: arg for i, arg in enumerate(args)})
-            return outputs["out"]
-
-        if name in self._c_to_numpy_arc_functions:
-            from warnings import warn
-            warn(f"'{name}' in ArrayContext.np is deprecated. "
-                    "Use '{c_to_numpy_arc_functions[name]}' as in numpy. "
-                    "The old name will stop working in 2021.",
-                    DeprecationWarning, stacklevel=3)
-
-        # normalize to C names anyway
-        c_name = self._numpy_to_c_arc_functions.get(name, name)
-
-        # limit which functions we try to hand off to loopy
-        if name in self._numpy_math_functions:
-            return multimapped_over_array_containers(loopy_implemented_elwise_func)
-        else:
-            raise AttributeError(name)
-
     def _new_like(self, ary, alloc_like):
         from numbers import Number
 
diff --git a/arraycontext/impl/pycuda/__init__.py b/arraycontext/impl/pycuda/__init__.py
new file mode 100644
index 0000000000000000000000000000000000000000..b6a06111b96dcdb9b888bb57816da0ee21048e0e
--- /dev/null
+++ b/arraycontext/impl/pycuda/__init__.py
@@ -0,0 +1,119 @@
+"""
+.. currentmodule:: arraycontext
+.. autoclass:: PyCUDAArrayContext
+"""
+
+__copyright__ = """
+Copyright (C) 2021 University of Illinois Board of Trustees
+"""
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from warnings import warn
+from typing import Dict, List, Sequence, Optional, Union, TYPE_CHECKING
+
+import numpy as np
+
+from pytools.tag import Tag
+
+from arraycontext.context import ArrayContext
+
+
+if TYPE_CHECKING:
+    import pycuda
+
+
+# {{{ PyCUDAArrayContext
+
+class PyCUDAArrayContext(ArrayContext):
+    """
+    A :class:`ArrayContext` that uses :class:`pycuda.gpuarray.GPUArray` instances
+    for its base array class.
+
+    .. attribute:: allocator
+
+        The PyCUDA memory allocator passed to every array allocation.
+
+    .. automethod:: __init__
+    """
+
+    def __init__(self, allocator=None):
+        """
+        :arg allocator: a PyCUDA memory allocator. *None* (default) or *False*
+            selects :func:`pycuda.driver.mem_alloc`.
+        """
+        # 'import pycuda' alone does not load the 'driver' submodule.
+        import pycuda.driver
+        super().__init__()
+        if allocator is None or allocator is False:
+            allocator = pycuda.driver.mem_alloc
+        self.allocator = allocator
+
+    def _get_fake_numpy_namespace(self):
+        from arraycontext.impl.pycuda.fake_numpy import PyCUDAFakeNumpyNamespace
+        return PyCUDAFakeNumpyNamespace(self)
+
+    # {{{ ArrayContext interface
+
+    def empty(self, shape, dtype):
+        import pycuda.gpuarray as gpuarray
+        return gpuarray.empty(shape=shape, dtype=dtype, allocator=self.allocator)
+
+    def zeros(self, shape, dtype):
+        import pycuda.gpuarray as gpuarray
+        return gpuarray.zeros(shape=shape, dtype=dtype, allocator=self.allocator)
+
+    def from_numpy(self, array: np.ndarray):
+        import pycuda.gpuarray as gpuarray
+        return gpuarray.to_gpu(array, allocator=self.allocator)
+
+    def to_numpy(self, array):
+        return array.get()
+
+    def call_loopy(self, t_unit, **kwargs):
+        raise NotImplementedError("Waiting for loopy to be more capable")
+
+    def freeze(self, array):
+        return array
+
+    def thaw(self, array):
+        return array
+
+    # }}}
+
+    def clone(self):
+        return type(self)(self.allocator)
+
+    # NOTE(review): tag/tag_axis accept only *array*; confirm this matches the
+    # signatures declared on ArrayContext, which is not visible from here.
+    def tag(self, array):
+        return array
+
+    def tag_axis(self, array):
+        return array
+
+    @property
+    def permits_inplace_modification(self):
+        return True
+
+# }}}
+
+# vim: foldmethod=marker
diff --git a/arraycontext/impl/pycuda/fake_numpy.py b/arraycontext/impl/pycuda/fake_numpy.py
new file mode 100644
index 0000000000000000000000000000000000000000..ce9ceb2167b81bc8b67f173923fafb8a6e61977c
--- /dev/null
+++ b/arraycontext/impl/pycuda/fake_numpy.py
@@ -0,0 +1,152 @@
+"""
+.. currentmodule:: arraycontext
+.. autoclass:: PyCUDAArrayContext
+"""
+__copyright__ = """
+Copyright (C) 2021 University of Illinois Board of Trustees
+"""
+
+__license__ = """
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+"""
+
+from functools import partial, reduce
+import operator
+
+from arraycontext.fake_numpy import \
+        BaseFakeNumpyNamespace, BaseFakeNumpyLinalgNamespace
+from arraycontext.container.traversal import (
+        rec_multimap_array_container, rec_map_array_container,
+        rec_map_reduce_array_container,
+        )
+
+import pycuda 
+
+try:
+    import pycuda.gpuarray as gpuarray
+except ImportError:
+    pass
+
+
+# {{{ fake numpy
+
+class PyCUDAFakeNumpyNamespace(BaseFakeNumpyNamespace):
+    """:mod:`numpy` workalike operating on :class:`pycuda.gpuarray.GPUArray`."""
+
+    def _get_fake_numpy_linalg_namespace(self):
+        return _PyCUDAFakeNumpyLinalgNamespace(self._array_context)
+
+    def __getattr__(self, name):
+        # PyCUDA keeps element-wise math in pycuda.cumath (not in the top-level
+        # package), under C names: 'asin' rather than 'arcsin'.
+        pycuda_funcs = ["abs", "sin", "cos", "tan", "arcsin", "arccos", "arctan",
+                "sinh", "cosh", "tanh", "exp", "log", "log10", "isnan", "sqrt"]
+        if name in pycuda_funcs:
+            import pycuda.cumath as cumath
+            # Names cumath lacks raise AttributeError, which hasattr() expects.
+            func = abs if name == "abs" else getattr(
+                    cumath, name.replace("arc", "a"))
+            return partial(rec_map_array_container, func)
+        raise AttributeError(name)
+
+    # {{{ comparisons
+
+    # FIXME: This should be documentation, not a comment.
+    # These are here mainly because some arrays may choose to interpret
+    # equality comparison as a binary predicate of structural identity,
+    # i.e. more like "are you two equal", and not like numpy semantics.
+    # These operations provide access to numpy-style comparisons in that
+    # case.
+
+    def equal(self, x, y):
+        return rec_multimap_array_container(operator.eq, x, y)
+
+    def not_equal(self, x, y):
+        return rec_multimap_array_container(operator.ne, x, y)
+
+    def greater(self, x, y):
+        return rec_multimap_array_container(operator.gt, x, y)
+
+    def greater_equal(self, x, y):
+        return rec_multimap_array_container(operator.ge, x, y)
+
+    def less(self, x, y):
+        return rec_multimap_array_container(operator.lt, x, y)
+
+    def less_equal(self, x, y):
+        return rec_multimap_array_container(operator.le, x, y)
+
+    # }}}
+
+    def maximum(self, x, y):
+        return rec_multimap_array_container(gpuarray.maximum, x, y)
+
+    def minimum(self, x, y):
+        return rec_multimap_array_container(gpuarray.minimum, x, y)
+
+    def where(self, criterion, then, else_):
+        def where_inner(inner_crit, inner_then, inner_else):
+            if isinstance(inner_crit, bool):
+                return inner_then if inner_crit else inner_else
+            return gpuarray.if_positive(inner_crit != 0, inner_then, inner_else)
+
+        return rec_multimap_array_container(where_inner, criterion, then, else_)
+
+    def sum(self, a, dtype=None):
+        def _gpuarray_sum(ary):
+            if dtype not in [ary.dtype, None]:
+                raise NotImplementedError
+            return gpuarray.sum(ary)
+
+        return rec_map_reduce_array_container(sum, _gpuarray_sum, a)
+
+    def min(self, a):
+        return rec_map_reduce_array_container(
+                partial(reduce, gpuarray.minimum), gpuarray.min, a)
+
+    def max(self, a):
+        return rec_map_reduce_array_container(
+                partial(reduce, gpuarray.maximum), gpuarray.max, a)
+
+    def stack(self, arrays, axis=0):
+        return rec_multimap_array_container(
+                lambda *args: gpuarray.stack(arrays=args, axis=axis),
+                *arrays)
+
+    def reshape(self, a, newshape):
+        return gpuarray.reshape(a, newshape)
+
+    def concatenate(self, arrays, axis=0):
+        return gpuarray.concatenate(arrays, axis, self._array_context.allocator)
+
+    def ravel(self, a, order="C"):
+        return gpuarray.reshape(a, -1, order=order)
+
+# }}}
+
+
+# {{{ fake np.linalg
+
+class _PyCUDAFakeNumpyLinalgNamespace(BaseFakeNumpyLinalgNamespace):
+    """PyCUDA :mod:`numpy.linalg` workalike; adds nothing to its base class."""
+
+# }}}
+
+
+# vim: foldmethod=marker
diff --git a/arraycontext/impl/pyopencl/fake_numpy.py b/arraycontext/impl/pyopencl/fake_numpy.py
index 01054bac6b90d2960f3ddc6ee25cd13fc1d91d4d..e9a2cfa1defd09d5633870bc0bddcce0322d87c6 100644
--- a/arraycontext/impl/pyopencl/fake_numpy.py
+++ b/arraycontext/impl/pyopencl/fake_numpy.py
@@ -31,6 +31,8 @@ import operator
 
 from arraycontext.fake_numpy import \
         BaseFakeNumpyNamespace, BaseFakeNumpyLinalgNamespace
+from arraycontext.loopy import \
+        LoopyBasedFakeNumpyNamespace
 from arraycontext.container.traversal import (
         rec_multimap_array_container, rec_map_array_container,
         rec_map_reduce_array_container,
@@ -45,7 +47,7 @@ except ImportError:
 
 # {{{ fake numpy
 
-class PyOpenCLFakeNumpyNamespace(BaseFakeNumpyNamespace):
+class PyOpenCLFakeNumpyNamespace(BaseFakeNumpyNamespace, LoopyBasedFakeNumpyNamespace):
     def _get_fake_numpy_linalg_namespace(self):
         return _PyOpenCLFakeNumpyLinalgNamespace(self._array_context)
 
@@ -58,6 +60,17 @@ class PyOpenCLFakeNumpyNamespace(BaseFakeNumpyNamespace):
     # These operations provide access to numpy-style comparisons in that
     # case.
 
+    def __getattr__(self, name):
+        # Math functions live in pyopencl.clmath under C names, not in pyopencl.
+        cl_funcs = ("abs sin cos tan arcsin arccos arctan sinh cosh tanh "
+                    "exp log log10 isnan sqrt").split()
+        if name in cl_funcs:
+            from functools import partial
+            import pyopencl.clmath as clmath
+            fn = abs if name == "abs" else getattr(clmath, name.replace("arc", "a"))
+            return partial(rec_map_array_container, fn)
+        return super().__getattr__(name)
+
     def equal(self, x, y):
         return rec_multimap_array_container(operator.eq, x, y)
 
diff --git a/arraycontext/loopy.py b/arraycontext/loopy.py
index f4c97754d731961baaaf0191f70dcfeca287b688..b5a33b7b3a8384fab3407194ccb1025b467ed621 100644
--- a/arraycontext/loopy.py
+++ b/arraycontext/loopy.py
@@ -70,5 +70,47 @@ def get_default_entrypoint(t_unit):
 
 # }}}
 
-
-# vim: foldmethod=marker
+class LoopyBasedFakeNumpyNamespace:
+    """Mixin that implements element-wise math functions via loopy kernels."""
+
+    _numpy_to_c_arc_functions = {
+            "arcsin": "asin", "arccos": "acos", "arctan": "atan",
+            "arctan2": "atan2",
+            "arcsinh": "asinh", "arccosh": "acosh", "arctanh": "atanh",
+            }
+
+    _c_to_numpy_arc_functions = {c_name: numpy_name
+            for numpy_name, c_name in _numpy_to_c_arc_functions.items()}
+
+    def __getattr__(self, name):
+        # Module-level helpers of arraycontext.fake_numpy; not defined here.
+        from arraycontext.fake_numpy import (
+                _get_scalar_func_loopy_program, multimapped_over_array_containers)
+
+        def loopy_implemented_elwise_func(*args):
+            actx = self._array_context
+            prg = _get_scalar_func_loopy_program(actx,
+                    c_name, nargs=len(args), naxes=len(args[0].shape))
+            outputs = actx.call_loopy(prg,
+                    **{"inp%d" % i: arg for i, arg in enumerate(args)})
+            return outputs["out"]
+
+        if name in self._c_to_numpy_arc_functions:
+            from warnings import warn
+            warn(f"'{name}' in ArrayContext.np is deprecated. "
+                    f"Use '{self._c_to_numpy_arc_functions[name]}' as in numpy. "
+                    "The old name will stop working in 2022.",
+                    DeprecationWarning, stacklevel=3)
+
+        # normalize to C names anyway
+        c_name = self._numpy_to_c_arc_functions.get(name, name)
+
+        # limit which functions we try to hand off to loopy
+        if name not in self._numpy_math_functions:
+            raise AttributeError(name)
+        return multimapped_over_array_containers(loopy_implemented_elwise_func)
+
+
+
+
+# vim: foldmethod=marker
diff --git a/arraycontext/pytest.py b/arraycontext/pytest.py
index e93a8b38bd8528d8719dfe818bd58c56214a3c66..9b6ebfb3f2f598a2dd3c8e81834e7f62409c1196 100644
--- a/arraycontext/pytest.py
+++ b/arraycontext/pytest.py
@@ -2,6 +2,7 @@
 .. currentmodule:: arraycontext
 
 .. autoclass:: PytestPyOpenCLArrayContextFactory
+.. autoclass:: PytestPyCUDAArrayContextFactory
 
 .. autofunction:: pytest_generate_tests_for_array_contexts
 .. autofunction:: pytest_generate_tests_for_pyopencl_array_context
@@ -34,6 +35,7 @@ THE SOFTWARE.
 from typing import Any, Callable, Dict, Sequence, Type, Union
 
 import pyopencl as cl
+import pycuda
 from arraycontext.context import ArrayContext
 
 
@@ -70,6 +72,22 @@ class PytestPyOpenCLArrayContextFactory:
         raise NotImplementedError
 
 
+class PytestPyCUDAArrayContextFactory:
+    """Base factory for PyCUDA array contexts; ``__call__`` is abstract here.
+
+    .. automethod:: __init__
+    .. automethod:: __call__
+    """
+
+    def __init__(self, allocator):
+        """:arg allocator: stored on the factory as ``self.allocator``."""
+        self.allocator = allocator
+
+    def __call__(self) -> ArrayContext:
+        # Concrete factories override this to build the array context.
+        raise NotImplementedError
+
+
 class _PytestPyOpenCLArrayContextFactoryWithClass(PytestPyOpenCLArrayContextFactory):
     force_device_scalars = True
 
@@ -126,6 +144,75 @@ class _PytestPytatoPyOpenCLArrayContextFactory(
                     self.device.platform.name.strip()))
 
 
+class _PytestPyCUDAArrayContextFactory(
+        PytestPyCUDAArrayContextFactory):
+    """Factory that makes a CUDA context current and returns an array context."""
+
+    @property
+    def actx_class(self):
+        from arraycontext import PyCUDAArrayContext
+        return PyCUDAArrayContext
+
+    def __call__(self):
+        import os
+        import pycuda.driver as cuda
+
+        def make_default_context(ctx_maker=None):
+            if ctx_maker is None:
+                def ctx_maker(dev):
+                    return dev.make_context()
+
+            ndevices = cuda.Device.count()
+            if ndevices == 0:
+                raise RuntimeError(
+                    "No CUDA enabled device found. Please check your installation."
+                )
+
+            # Is CUDA_DEVICE set?
+            devn = os.environ.get("CUDA_DEVICE")
+
+            # Is $HOME/.cuda_device set ?
+            homedir = os.environ.get("HOME")
+            if devn is None and homedir is not None:
+                try:
+                    with open(os.path.join(homedir, ".cuda_device")) as inf:
+                        devn = inf.read().strip()
+                except (OSError, UnicodeError):
+                    # best effort: an absent or unreadable file means "not set"
+                    pass
+
+            # If either CUDA_DEVICE or $HOME/.cuda_device is set, try to use it
+            if devn is not None:
+                try:
+                    devn = int(devn)
+                except ValueError as err:
+                    raise TypeError(
+                        "CUDA device number (CUDA_DEVICE or ~/.cuda_device)"
+                        " must be an integer"
+                    ) from err
+
+                return ctx_maker(cuda.Device(devn))
+
+            # Otherwise, try to use any available device
+            for devn in range(ndevices):
+                try:
+                    return ctx_maker(cuda.Device(devn))
+                except cuda.Error:
+                    pass
+
+            raise RuntimeError(
+                "make_default_context() wasn't able to create a context "
+                "on any of the %d detected devices" % ndevices
+            )
+
+        # Initialize the driver and make a context current before handing out
+        # the array context.
+        cuda.init()
+        # NOTE(review): this context is never popped or detached -- confirm.
+        make_default_context()
+        return self.actx_class(None)
+
+
 _ARRAY_CONTEXT_FACTORY_REGISTRY: \
         Dict[str, Type[PytestPyOpenCLArrayContextFactory]] = {
                 "pyopencl": _PytestPyOpenCLArrayContextFactoryWithClass,
diff --git a/test/test_arraycontext.py b/test/test_arraycontext.py
index 9e855b06943046f1853f75f3e466d472fc0faddf..c87793f8bcd893f045fbf53720930043d36f857b 100644
--- a/test/test_arraycontext.py
+++ b/test/test_arraycontext.py
@@ -34,12 +34,14 @@ from arraycontext import (
         FirstAxisIsElementsTag,
         PyOpenCLArrayContext,
         PytatoPyOpenCLArrayContext,
-        ArrayContainer,)
+        PyCUDAArrayContext,
+        ArrayContainer)
 from arraycontext import (  # noqa: F401
         pytest_generate_tests_for_array_contexts,
         )
 from arraycontext.pytest import (_PytestPyOpenCLArrayContextFactoryWithClass,
-                                 _PytestPytatoPyOpenCLArrayContextFactory)
+                                 _PytestPytatoPyOpenCLArrayContextFactory,
+                                 _PytestPyCUDAArrayContextFactory)
 
 
 import logging
@@ -66,6 +68,15 @@ class _PytatoPyOpenCLArrayContextForTests(PytatoPyOpenCLArrayContext):
     def transform_loopy_program(self, t_unit):
         return t_unit
 
+class _PyCUDAArrayContextForTests(PyCUDAArrayContext):
+    """A :class:`PyCUDAArrayContext` whose loopy program transformation is the
+    identity. Only to be used for testing internal to :mod:`arraycontext`.
+    """
+
+    def transform_loopy_program(self, t_unit):
+        # hand the program back untransformed
+        return t_unit
+
 
 class _PyOpenCLArrayContextWithHostScalarsForTestsFactory(
         _PytestPyOpenCLArrayContextFactoryWithClass):
@@ -82,13 +93,22 @@ class _PytatoPyOpenCLArrayContextForTestsFactory(
     actx_class = _PytatoPyOpenCLArrayContextForTests
 
 
+class _PyCUDAArrayContextForTestsFactory(_PytestPyCUDAArrayContextFactory):
+    # replaces the inherited ``actx_class`` property with a plain attribute
+    actx_class = _PyCUDAArrayContextForTests
+
+
+# Array context factories that the tests in this module are generated for.
+# NOTE(review): the PyCUDA entry presumably needs a CUDA device at test
+# time -- confirm that CI provides one.
 pytest_generate_tests = pytest_generate_tests_for_array_contexts([
     _PyOpenCLArrayContextForTestsFactory,
     _PyOpenCLArrayContextWithHostScalarsForTestsFactory,
     _PytatoPyOpenCLArrayContextForTestsFactory,
+    _PyCUDAArrayContextForTestsFactory,
     ])
 
 
 def _acf():
     import pyopencl as cl
 
@@ -305,6 +325,8 @@ def test_array_context_np_workalike(actx_factory, sym_name, n_args, dtype):
             ])
 def test_array_context_np_like(actx_factory, sym_name, n_args, dtype):
     actx = actx_factory()
+    if not hasattr(actx.np, sym_name):
+        pytest.skip(f"'{sym_name}' not implemented on '{type(actx).__name__}'")
 
     ndofs = 512
     args = [randn(ndofs, dtype) for i in range(n_args)]