diff --git a/pycuda/elementwise.py b/pycuda/elementwise.py
index 4e8601f02940075cf17cf873dbf0883225ddf1b9..1c83a7ceaee4d5d0f5b7a96cded6924096f3396e 100644
--- a/pycuda/elementwise.py
+++ b/pycuda/elementwise.py
@@ -464,7 +464,14 @@ def get_linear_combination_kernel(summand_descriptors, dtype_z):
 
 
 @context_dependent_memoize
-def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z):
+def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z,
+                      x_is_scalar=False, y_is_scalar=False):
+    out_t = dtype_to_ctype(dtype_z)
+    x = "x[0]" if x_is_scalar else "x[i]"
+    ax = f"a*(({out_t}) {x})"
+    y = "y[0]" if y_is_scalar else "y[i]"
+    by = f"b*(({out_t}) {y})"
+    result = f"{ax} + {by}"
     return get_elwise_kernel(
         "%(tp_x)s a, %(tp_x)s *x, %(tp_y)s b, %(tp_y)s *y, %(tp_z)s *z"
         % {
@@ -472,7 +479,7 @@ def get_axpbyz_kernel(dtype_x, dtype_y, dtype_z):
             "tp_y": dtype_to_ctype(dtype_y),
             "tp_z": dtype_to_ctype(dtype_z),
         },
-        "z[i] = a*x[i] + b*y[i]",
+        "z[i] = %s" % result,
         "axpbyz",
     )
 
diff --git a/pycuda/gpuarray.py b/pycuda/gpuarray.py
index 36a39dc3d16e6fae90cf5ec5aabb269457a234d3..9ea807e66da906dc53e6ce2f5656f276e3201875 100644
--- a/pycuda/gpuarray.py
+++ b/pycuda/gpuarray.py
@@ -25,6 +25,18 @@ def _get_common_dtype(obj1, obj2):
     return _get_common_dtype_base(obj1, obj2, has_double_support())
 
 
+def _get_broadcasted_binary_op_result(obj1, obj2,
+                                      dtype_getter=_get_common_dtype):
+
+    if obj1.shape == obj2.shape:
+        return obj1._new_like_me(dtype_getter(obj1, obj2))
+    elif obj1.shape == () or obj1.shape == (1,):
+        return obj2._new_like_me(dtype_getter(obj1, obj2))
+    elif obj2.shape == () or obj2.shape == (1,):
+        return obj1._new_like_me(dtype_getter(obj1, obj2))
+    else:
+        raise NotImplementedError("Broadcasting binary operator with shapes:"
+                                  f" {obj1.shape}, {obj2.shape}.")
 # {{{ vector types
 
 
@@ -391,13 +403,22 @@ class GPUArray:
     def _axpbyz(self, selffac, other, otherfac, out, add_timer=None, stream=None):
         """Compute ``out = selffac * self + otherfac*other``, where `other` is a vector.."""
-        assert self.shape == other.shape
         if not self.flags.forc or not other.flags.forc:
             raise RuntimeError(
                 "only contiguous arrays may "
                 "be used as arguments to this operation"
             )
-
-        func = elementwise.get_axpbyz_kernel(self.dtype, other.dtype, out.dtype)
+        self_shape = self.shape
+        other_shape = other.shape
+        out_shape = out.shape
+        assert ((self_shape == other_shape == out_shape)
+                or ((self_shape == () or self_shape == (1,)) and other_shape == out_shape)
+                or ((other_shape == () or other_shape == (1,)) and self_shape == out_shape))
+        if (self.size == 0) or (other.size == 0):
+            return out
+        func = elementwise.get_axpbyz_kernel(
+            self.dtype, other.dtype, out.dtype,
+            x_is_scalar=(self_shape == () or self_shape == (1,)),
+            y_is_scalar=(other_shape == () or other_shape == (1,)))
 
         if add_timer is not None:
             add_timer(
@@ -537,16 +558,17 @@ class GPUArray:
 
         if isinstance(other, GPUArray):
             # add another vector
-            result = self._new_like_me(_get_common_dtype(self, other))
+            result = _get_broadcasted_binary_op_result(self, other)
             return self._axpbyz(1, other, 1, result)
-        else:
+        elif np.isscalar(other):
             # add a scalar
             if other == 0:
                 return self.copy()
             else:
                 result = self._new_like_me(_get_common_dtype(self, other))
                 return self._axpbz(1, other, result)
-
+        else:
+            return NotImplemented
     __radd__ = __add__
 
     def __sub__(self, other):
@@ -1240,7 +1262,7 @@ def to_gpu_async(ary, allocator=drv.mem_alloc, stream=None):
 empty = GPUArray
 
 
-def zeros(shape, dtype, allocator=drv.mem_alloc, order="C"):
+def zeros(shape, dtype=np.float64, allocator=drv.mem_alloc, order="C"):
     """Returns an array of the given shape and dtype filled with 0's."""
     result = GPUArray(shape, dtype, allocator, order=order)
     zero = np.zeros((), dtype)
diff --git a/test/test_gpuarray.py b/test/test_gpuarray.py
index 73ec3ade3f99e8830882b6271aadd70945e4b4f3..1581cbbdf26256bb94363db68576793670ff474a 100644
--- a/test/test_gpuarray.py
+++ b/test/test_gpuarray.py
@@ -133,9 +133,25 @@ class TestGPUArray:
 
         a = np.array([1, 2, 3, 4, 5, 6, 7, 8, 9, 10]).astype(np.float32)
         a_gpu = gpuarray.to_gpu(a)
+        b = np.array([1]).astype(np.float32)
+        b_gpu = gpuarray.to_gpu(b)
+        a_empty = np.array([]).astype(np.float32)
+        a_empty_gpu = gpuarray.to_gpu(a_empty)
         a_added = (a_gpu + a_gpu).get()
+        a_added_scalar = (a_gpu + 1).get()
+        scalar_added_a = (1 + a_gpu).get()
+        a_empty_pl_b = (a_empty_gpu + b_gpu).get()
+        b_pl_a_empty = (b_gpu + a_empty_gpu).get()
+        a_gpu_pl_b_gpu = (a_gpu + b_gpu).get()
+        a_added_empty = (a_empty_gpu + a_empty_gpu).get()
 
         assert (a + a == a_added).all()
+        assert (a + 1 == a_added_scalar).all()
+        assert (1 + a == scalar_added_a).all()
+        assert (a_empty + a_empty == a_added_empty).all()
+        assert (a_empty + b == a_empty_pl_b).all()
+        assert (b + a_empty == b_pl_a_empty).all()
+        assert (a + b == a_gpu_pl_b_gpu).all()
 
     @mark_cuda_test
     def test_iaddition_array(self):
@@ -461,7 +477,7 @@ class TestGPUArray:
             ]
         ):
 
-            a_gpu = gpuarray.zeros((50000,), dtype=np.float32)
+            a_gpu = gpuarray.zeros((50000,))
             a_cpu = np.zeros(a_gpu.shape, a_gpu.dtype)
 
             a_cpu[slc] = 7
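
A short usage sketch of the behavior this patch enables (not part of the diff; it assumes a CUDA device and a context created by `pycuda.autoinit`). When one operand has shape `()` or `(1,)`, `get_axpbyz_kernel` is built with the corresponding `*_is_scalar` flag, so the generated body reads `y[0]` instead of `y[i]`, e.g. `z[i] = a*((float) x[i]) + b*((float) y[0])` for float32 inputs:

```python
# Sketch only: demonstrates the new broadcasting path, assuming a working
# CUDA context via pycuda.autoinit.
import numpy as np

import pycuda.autoinit  # noqa: F401
import pycuda.gpuarray as gpuarray

a = np.arange(10, dtype=np.float32)
a_gpu = gpuarray.to_gpu(a)
b_gpu = gpuarray.to_gpu(np.array([2], dtype=np.float32))  # shape (1,)

# Previously _axpbyz asserted self.shape == other.shape; now the shape-(1,)
# operand is broadcast against the shape-(10,) one.
c_gpu = a_gpu + b_gpu
assert np.allclose(c_gpu.get(), a + 2)

# Shapes that cannot be broadcast this way are still rejected.
try:
    a_gpu + gpuarray.to_gpu(np.ones(3, dtype=np.float32))
except NotImplementedError:
    pass
```

Also visible in the diff: `gpuarray.zeros` now defaults to `dtype=np.float64` (the updated slicing test relies on this), and `__add__` returns `NotImplemented` for operands that are neither `GPUArray` instances nor scalars, letting Python fall back to the reflected operation.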