diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml
index 2ea1707ff43eb9e1d17760a4ac86bec1a886ae1d..3945734eb1d7e5063f0471b9846ba702121f3c32 100644
--- a/.gitlab-ci.yml
+++ b/.gitlab-ci.yml
@@ -2,10 +2,9 @@ stages:
   - test
   - deploy
 
-Python 3 POCL:
+Pytest POCL:
   stage: test
   script:
-  - export PY_EXE=python3
   - export PYOPENCL_TEST=portable:pthread
   - export EXTRA_INSTALL="pybind11 numpy mako"
   - export LOOPY_NO_CACHE=1
@@ -20,10 +19,27 @@ Python 3 POCL:
     reports:
       junit: test/pytest.xml
 
-Python 3 POCL without arg check:
+Pytest Nvidia Titan V:
+  stage: test
+  script:
+  - export PYOPENCL_TEST=nvi:titan
+  - export EXTRA_INSTALL="pybind11 numpy mako"
+  - export LOOPY_NO_CACHE=1
+  - source /opt/enable-intel-cl.sh
+  - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/build-and-test-py-project.sh
+  - ". ./build-and-test-py-project.sh"
+  tags:
+  - python3
+  - nvidia-titan-v
+  except:
+  - tags
+  artifacts:
+    reports:
+      junit: test/pytest.xml
+
+Pytest POCL without arg check:
   stage: test
   script:
-  - export PY_EXE=python3
   - export PYOPENCL_TEST=portable:pthread
   - export EXTRA_INSTALL="pybind11 numpy mako"
   - export LOOPY_NO_CACHE=1
@@ -39,10 +55,9 @@ Python 3 POCL without arg check:
     reports:
       junit: test/pytest.xml
 
-Python 3 Intel:
+Pytest Intel:
   stage: test
   script:
-  - export PY_EXE=python3
   - export PYOPENCL_TEST=intel
   - export EXTRA_INSTALL="pybind11 numpy mako"
   - export LOOPY_NO_CACHE=1
@@ -60,10 +75,9 @@ Python 3 Intel:
       junit: test/pytest.xml
 
 
-Python 3 POCL Twice With Cache:
+Pytest POCL Twice With Cache:
   stage: test
   script: |
-    export PY_EXE=python3
     export PYOPENCL_TEST=portable:pthread
     export EXTRA_INSTALL="pybind11 numpy mako"
     curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/ci-support.sh
@@ -94,10 +108,9 @@ Python 3 POCL Twice With Cache:
 #   except:
 #   - tags
 
-Python 3 POCL Examples:
+Pytest POCL Examples:
   stage: test
   script: |
-    export PY_EXE=python3
     export PYOPENCL_TEST=portable:pthread
     export EXTRA_INSTALL="pybind11 numpy mako pyvisfile matplotlib ipykernel nbconvert"
 
@@ -126,7 +139,6 @@ Pylint:
   script:
   # Needed to avoid name shadowing issues when running from source directory.
   - PROJECT_INSTALL_FLAGS="--editable"
-  - export PY_EXE=python3
   - EXTRA_INSTALL="pybind11 numpy mako matplotlib ipykernel ply fparser"
   - curl -L -O -k https://gitlab.tiker.net/inducer/ci-support/raw/main/prepare-and-run-pylint.sh
   - . ./prepare-and-run-pylint.sh "$CI_PROJECT_NAME" test/test_*.py
diff --git a/test/test_loopy.py b/test/test_loopy.py
index 6fc6b1329125f92ce24b3300cc1c1ea44db9fe87..39e7429cb9ac5715846913c400e78e11c061ad13 100644
--- a/test/test_loopy.py
+++ b/test/test_loopy.py
@@ -893,52 +893,34 @@ def test_atomic(ctx_factory, dtype):
 def test_atomic_load(ctx_factory, dtype):
     ctx = ctx_factory()
     queue = cl.CommandQueue(ctx)
-    from loopy.kernel.data import AddressSpace
     n = 10
-    vec_width = 4
 
+    # Atomic operations on 8-byte dtypes need the 64-bit atomics extension.
     if (
             np.dtype(dtype).itemsize == 8
             and "cl_khr_int64_base_atomics" not in ctx.devices[0].extensions):
         pytest.skip("64-bit atomics not supported on device")
 
-    import pyopencl.version  # noqa
-    if (
-            cl.version.VERSION < (2015, 2)
-            and dtype == np.int64):
-        pytest.skip("int64 RNG not supported in PyOpenCL < 2015.2")
-
     knl = lp.make_kernel(
-            "{ [i,j]: 0<=i,j<n}",
+            "{ [j]: 0<=j<n}",
             """
             for j
-                <> upper = 0  {id=init_upper}
-                <> lower = 0  {id=init_lower}
                 temp = 0 {id=init, atomic}
-                for i
-                    upper = upper + i * a[i] {id=sum0,dep=init_upper}
-                    lower = lower - b[i] {id=sum1,dep=init_lower}
-                end
-                temp = temp + lower {id=temp_sum, dep=sum*:init, atomic,\
-                                           nosync=init}
+                ... lbarrier {id=lb1, dep=init}
+                temp = temp + 1 {id=temp_sum, dep=lb1, atomic}
                 ... lbarrier {id=lb2, dep=temp_sum}
-                out[j] = upper / temp {id=final, dep=lb2, atomic,\
-                                           nosync=init:temp_sum}
+                out[j] = temp {id=final, dep=lb2, atomic,nosync=init:temp_sum}
             end
             """,
             [
                 lp.GlobalArg("out", dtype, shape=lp.auto, for_atomic=True),
-                lp.GlobalArg("a", dtype, shape=lp.auto),
-                lp.GlobalArg("b", dtype, shape=lp.auto),
                 lp.TemporaryVariable("temp", dtype, for_atomic=True,
-                                     address_space=AddressSpace.LOCAL),
+                                     address_space=lp.AddressSpace.LOCAL),
                 "..."
                 ],
-            silenced_warnings=["write_race(init)", "write_race(temp_sum)"])
+            silenced_warnings=["write_race(temp_sum)", "write_race(init)"])
     knl = lp.fix_parameters(knl, n=n)
-    knl = lp.split_iname(knl, "j", vec_width, inner_tag="l.0")
-    _, out = knl(queue, a=np.arange(n, dtype=dtype), b=np.arange(n, dtype=dtype))
-    assert np.allclose(out, np.full_like(out, ((1 - 2 * n) / 3.0)))
+    knl = lp.tag_inames(knl, {"j": "l.0"})
+    _, (out,) = knl(queue)
+    # Every one of the n values of j adds 1 to temp before it is read back.
+    assert (out.get() == n).all()
 
 
 @pytest.mark.parametrize("dtype", [np.int32, np.int64, np.float32, np.float64])