From af24ae4ce3b2f88357d53f598bd6f124a638bc8b Mon Sep 17 00:00:00 2001 From: Vahid Tavanashad Date: Mon, 3 Feb 2025 09:22:30 -0800 Subject: [PATCH 1/7] fix issue 2270 --- .github/workflows/array-api-skips.txt | 5 +- dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 149 +++++++++----------- dpnp/tests/test_product.py | 124 ++++++++-------- 3 files changed, 140 insertions(+), 138 deletions(-) diff --git a/.github/workflows/array-api-skips.txt b/.github/workflows/array-api-skips.txt index 097e6ffef42..cccb79a51c5 100644 --- a/.github/workflows/array-api-skips.txt +++ b/.github/workflows/array-api-skips.txt @@ -26,13 +26,10 @@ array_api_tests/test_linalg.py::test_svd array_api_tests/test_linalg.py::test_qr array_api_tests/test_operators_and_elementwise_functions.py::test_clip -# unexpected result is returned +# unexpected result is returned - unmute when dpctl-1986 is resolved array_api_tests/test_operators_and_elementwise_functions.py::test_asin array_api_tests/test_operators_and_elementwise_functions.py::test_asinh # missing 'correction' keyword argument array_api_tests/test_signatures.py::test_func_signature[std] array_api_tests/test_signatures.py::test_func_signature[var] - -# arrays have different values -array_api_tests/test_linalg.py::test_linalg_tensordot diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py index 128dd3b78c3..ca57373a536 100644 --- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py +++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py @@ -52,18 +52,12 @@ def _compute_res_dtype(*arrays, sycl_queue, dtype=None, casting="no"): """ - Determines the output array data type and an intermediate data type - used in performing calculations related to a specific math function. + Determines the output array data type. If dtype is ``None``, the output array data type of the operation is determined based on the Promotion Type Rule and device capabilities. Otherwise, `dtype` is used as output array dtype, if input arrays can cast to it according to the casting rule determined. If casting cannot be done, a ``TypeError`` is raised. - The intermediate data type is the data type used for performing the math - function calculations. If output array dtype is a floating-point data type, - it is also used for the intermediate data type. If output array dtype is an - integral data type, the default floating point data type of the device where - input arrays are allocated on are used for intermediate data type. Parameters ---------- @@ -78,17 +72,13 @@ def _compute_res_dtype(*arrays, sycl_queue, dtype=None, casting="no"): Returns ------- - compute_dtype, res_dtype : - `compute_dtype` is the data type used in performing math function calculations. - The input arrays of the math function are cast to `compute_dtype` and then - the calculations are performed. + res_dtype : `res_dtype` is the output data type. When the result is obtained, it is cast to `res_dtype`. 
""" res_dtype = dpnp.result_type(*arrays) - default_dtype = dpnp.default_float_type(sycl_queue=sycl_queue) if dtype is not None: if dpnp.can_cast(res_dtype, dtype, casting=casting): @@ -98,11 +88,7 @@ def _compute_res_dtype(*arrays, sycl_queue, dtype=None, casting="no"): f"Cannot cast from dtype({res_dtype}) to dtype({dtype}) with casting rule {casting}" ) - compute_dtype = ( - res_dtype if dpnp.issubdtype(res_dtype, dpnp.inexact) else default_dtype - ) - - return compute_dtype, res_dtype + return res_dtype def _copy_array(x, copy_flag=False, dtype=None, order="C"): @@ -749,17 +735,17 @@ def dpnp_dot(a, b, /, out=None, *, casting="same_kind", conjugate=False): _validate_out_array(out, exec_q) # Determine the appropriate data types - dot_dtype, res_dtype = _compute_res_dtype(a, b, sycl_queue=exec_q) + res_dtype = _compute_res_dtype(a, b, sycl_queue=exec_q) result = _create_result_array( - a, b, out, (), dot_dtype, res_usm_type, exec_q + a, b, out, (), res_dtype, res_usm_type, exec_q ) # input arrays should have the proper data type if dpnp.issubdtype(res_dtype, dpnp.inexact): # copying is needed if dtypes of input arrays are different - a = _copy_array(a, dtype=dot_dtype) - b = _copy_array(b, dtype=dot_dtype) + a = _copy_array(a, dtype=res_dtype) + b = _copy_array(b, dtype=res_dtype) _manager = dpu.SequentialOrderManager[exec_q] @@ -777,14 +763,11 @@ def dpnp_dot(a, b, /, out=None, *, casting="same_kind", conjugate=False): ) _manager.add_event_pair(ht_ev, dot_ev) else: - # oneapi::mkl::blas::dot is slow for integer data type, + # oneapi::mkl::blas::dot does not support integer dtypes, # so using dpctl.tensor.vecdot instead - dpt_a = dpnp.get_usm_ndarray(a) - dpt_b = dpnp.get_usm_ndarray(b) - result = dpnp_array._create_from_usm_ndarray(dpt.vecdot(dpt_a, dpt_b)) - - if dot_dtype != res_dtype: - result = result.astype(res_dtype, copy=False) + a_usm = dpnp.get_usm_ndarray(a) + b_usm = dpnp.get_usm_ndarray(b) + result = dpnp_array._create_from_usm_ndarray(dpt.vecdot(a_usm, b_usm)) return dpnp.get_result_array(result, out, casting=casting) @@ -902,7 +885,7 @@ def dpnp_multiplication( axes_res = normalize_axis_tuple(axes_res, len(result_shape), "axes") # Determine the appropriate data types - compute_dtype, res_dtype = _compute_res_dtype( + res_dtype = _compute_res_dtype( x1, x2, dtype=dtype, casting=casting, sycl_queue=exec_q ) @@ -998,7 +981,7 @@ def dpnp_multiplication( x2, out, res_shape, - compute_dtype, + res_dtype, res_usm_type, exec_q, res_order, @@ -1010,64 +993,72 @@ def dpnp_multiplication( elif x1.size == 0 or x2.size == 0: result.fill(0) else: - # input arrays should have the proper data type and - # their base (last 2-dimensions) to be c-contiguous or f-contiguous - x1 = _copy_array( - x1, - copy_flag=not x1_contig_flag, - dtype=compute_dtype, - order=res_order, - ) - x2 = _copy_array( - x2, - copy_flag=not x2_contig_flag, - dtype=compute_dtype, - order=res_order, - ) - - if call_flag == "gemv": - if transpose: - a_usm = dpnp.get_usm_ndarray(x2) - x_usm = dpnp.get_usm_ndarray(x1) - else: - a_usm = dpnp.get_usm_ndarray(x1) - x_usm = dpnp.get_usm_ndarray(x2) - - _manager = dpu.SequentialOrderManager[exec_q] - - ht_ev, gemv_ev = bi._gemv( - exec_q, - a_usm, - x_usm, - dpnp.get_usm_ndarray(result), - transpose, - depends=_manager.submitted_events, - ) - _manager.add_event_pair(ht_ev, gemv_ev) - elif call_flag == "gemm": - result = _gemm_matmul( - exec_q, + if dpnp.issubdtype(res_dtype, dpnp.inexact): + # copying is needed if dtypes of input arrays are different or + # their base (last 
2-dimensions) is not c-contiguous or f-contiguous + x1 = _copy_array( x1, - x2, - result, + copy_flag=not x1_contig_flag, + dtype=res_dtype, + order=res_order, ) - else: # call_flag == "gemm_batch" - assert call_flag == "gemm_batch" - result = _gemm_batch_matmul( - exec_q, - x1, + x2 = _copy_array( x2, - result, + copy_flag=not x2_contig_flag, + dtype=res_dtype, + order=res_order, ) + if call_flag == "gemv": + if transpose: + a_usm = dpnp.get_usm_ndarray(x2) + x_usm = dpnp.get_usm_ndarray(x1) + else: + a_usm = dpnp.get_usm_ndarray(x1) + x_usm = dpnp.get_usm_ndarray(x2) + + _manager = dpu.SequentialOrderManager[exec_q] + + ht_ev, gemv_ev = bi._gemv( + exec_q, + a_usm, + x_usm, + dpnp.get_usm_ndarray(result), + transpose, + depends=_manager.submitted_events, + ) + _manager.add_event_pair(ht_ev, gemv_ev) + elif call_flag == "gemm": + result = _gemm_matmul( + exec_q, + x1, + x2, + result, + ) + else: # call_flag == "gemm_batch" + assert call_flag == "gemm_batch" + result = _gemm_batch_matmul( + exec_q, + x1, + x2, + result, + ) + else: + # oneapi::mkl::blas::gemm/gemv do not support integer dtypes, + # so using dpctl.tensor.matmul instead + x1_usm = dpnp.get_usm_ndarray(x1) + x2_usm = dpnp.get_usm_ndarray(x2) + out_usm = dpnp.get_usm_ndarray(result) + res_usm = dpt.matmul( + x1_usm, x2_usm, out=out_usm, dtype=dtype, order=order + ) + result = dpnp_array._create_from_usm_ndarray(res_usm) + if NumPy_special_case: result = dpnp.tile(result, out.shape) elif res_shape != result_shape: result = dpnp.reshape(result, result_shape) - if compute_dtype != res_dtype: - result = dpnp.astype(result, res_dtype, copy=False) - if out is None: if axes is not None: # Move the data back to the appropriate axes of the result array @@ -1207,7 +1198,7 @@ def dpnp_vecdot( ) # Determine the appropriate data types - _, res_dtype = _compute_res_dtype( + res_dtype = _compute_res_dtype( x1, x2, dtype=dtype, casting=casting, sycl_queue=exec_q ) diff --git a/dpnp/tests/test_product.py b/dpnp/tests/test_product.py index 927c3af94d0..0837ad7e4a8 100644 --- a/dpnp/tests/test_product.py +++ b/dpnp/tests/test_product.py @@ -271,7 +271,8 @@ def test_basic(self, dtype, shape1, shape2): expected = a.dot(b) assert_dtype_allclose(result, expected) - @pytest.mark.parametrize("dtype", get_all_dtypes(no_bool=True)) + # Use both integer and float, to test both MKL and dpctl backends + @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32]) @pytest.mark.parametrize("stride", [3, -1, -2, -5]) def test_strided(self, dtype, stride): a = numpy.arange(25, dtype=dtype) @@ -598,6 +599,8 @@ class TestMatmul: def setup_method(self): numpy.random.seed(42) + # Use both integer and float, to test both MKL and dpctl backends + @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32]) @pytest.mark.parametrize( "order1, order2", [("C", "C"), ("C", "F"), ("F", "C"), ("F", "F")] ) @@ -674,10 +677,7 @@ def setup_method(self): ((7, 4, 3), (0, 7, 3, 5)), ], ) - def test_basic(self, order1, order2, shape1, shape2): - # input should be float type otherwise they are copied to c-contigous array - # so testing order becomes meaningless - dtype = dpnp.default_float_type() + def test_basic(self, dtype, order1, order2, shape1, shape2): a = numpy.arange(numpy.prod(shape1), dtype=dtype).reshape(shape1) b = numpy.arange(numpy.prod(shape2), dtype=dtype).reshape(shape2) a = numpy.array(a, order=order1) @@ -691,15 +691,12 @@ def test_basic(self, order1, order2, shape1, shape2): @pytest.mark.parametrize( "shape1, shape2", [ + ((4,), (4, 3)), ((2, 4), (4, 3)), 
((4, 2, 3), (4, 3, 5)), ((6, 7, 4, 3), (6, 7, 3, 5)), ], - ids=[ - "((2, 4), (4, 3))", - "((4, 2, 3), (4, 3, 5))", - "((6, 7, 4, 3), (6, 7, 3, 5))", - ], + ids=["gemv", "gemm", "gemm_batch1", "gemm_batch2"], ) def test_bool(self, shape1, shape2): x = numpy.arange(2, dtype=numpy.bool_) @@ -829,15 +826,12 @@ def test_axes_out_1D(self, axes, b_shape, out_shape): @pytest.mark.parametrize( "shape1, shape2", [ + ((4,), (4, 3)), ((2, 4), (4, 3)), ((4, 2, 3), (4, 3, 5)), ((6, 7, 4, 3), (6, 7, 3, 5)), ], - ids=[ - "((2, 4), (4, 3))", - "((4, 2, 3), (4, 3, 5))", - "((6, 7, 4, 3), (6, 7, 3, 5))", - ], + ids=["gemv", "gemm", "gemm_batch1", "gemm_batch2"], ) def test_dtype_matrix_inout(self, in_dt, out_dt, shape1, shape2): a = generate_random_numpy_array(shape1, in_dt) @@ -857,15 +851,12 @@ def test_dtype_matrix_inout(self, in_dt, out_dt, shape1, shape2): @pytest.mark.parametrize( "shape1, shape2", [ + ((4,), (4, 3)), ((2, 4), (4, 3)), ((4, 2, 3), (4, 3, 5)), ((6, 7, 4, 3), (6, 7, 3, 5)), ], - ids=[ - "((2, 4), (4, 3))", - "((4, 2, 3), (4, 3, 5))", - "((6, 7, 4, 3), (6, 7, 3, 5))", - ], + ids=["gemv", "gemm", "gemm_batch1", "gemm_batch2"], ) def test_dtype_matrix_inputs(self, dtype1, dtype2, shape1, shape2): a = generate_random_numpy_array(shape1, dtype1) @@ -876,25 +867,25 @@ def test_dtype_matrix_inputs(self, dtype1, dtype2, shape1, shape2): expected = numpy.matmul(a, b) assert_dtype_allclose(result, expected) + @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32]) @pytest.mark.parametrize("order1", ["C", "F", "A"]) @pytest.mark.parametrize("order2", ["C", "F", "A"]) @pytest.mark.parametrize("order", ["C", "F", "K", "A"]) @pytest.mark.parametrize( "shape1, shape2", [ + ((4,), (4, 3)), ((2, 4), (4, 3)), ((4, 2, 3), (4, 3, 5)), ((6, 7, 4, 3), (6, 7, 3, 5)), ], - ids=[ - "((2, 4), (4, 3))", - "((4, 2, 3), (4, 3, 5))", - "((6, 7, 4, 3), (6, 7, 3, 5))", - ], + ids=["gemv", "gemm", "gemm_batch1", "gemm_batch2"], ) - def test_order(self, order1, order2, order, shape1, shape2): - a = numpy.arange(numpy.prod(shape1)).reshape(shape1, order=order1) - b = numpy.arange(numpy.prod(shape2)).reshape(shape2, order=order2) + def test_order(self, dtype, order1, order2, order, shape1, shape2): + a = numpy.arange(numpy.prod(shape1), dtype=dtype) + b = numpy.arange(numpy.prod(shape2), dtype=dtype) + a = a.reshape(shape1, order=order1) + b = b.reshape(shape2, order=order2) ia, ib = dpnp.array(a), dpnp.array(b) result = dpnp.matmul(ia, ib, order=order) @@ -913,16 +904,17 @@ def test_order(self, order1, order2, order, shape1, shape2): assert result.flags.f_contiguous == expected.flags.f_contiguous assert_dtype_allclose(result, expected) + @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32]) @pytest.mark.parametrize( "stride", [(-2, -2, -2, -2), (2, 2, 2, 2), (-2, 2, -2, 2), (2, -2, 2, -2)], ids=["-2", "2", "(-2, 2)", "(2, -2)"], ) - def test_strided1(self, stride): + def test_strided1(self, dtype, stride): for dim in [1, 2, 3, 4]: shape = tuple(20 for _ in range(dim)) - A = numpy.random.rand(*shape) - iA = dpnp.asarray(A) + A = numpy.random.rand(*shape).astype(dtype) + iA = dpnp.array(A) slices = tuple(slice(None, None, stride[i]) for i in range(dim)) a = A[slices] ia = iA[slices] @@ -938,17 +930,18 @@ def test_strided1(self, stride): assert result is iout assert_dtype_allclose(result, expected) + @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32]) @pytest.mark.parametrize( "shape", [(10, 3, 3), (12, 10, 3, 3)], ids=["3D", "4D"] ) - @pytest.mark.parametrize("stride", [-1, -2, 2], ids=["-1", 
"-2", "2"]) - @pytest.mark.parametrize("transpose", [False, True], ids=["False", "True"]) - def test_strided2(self, shape, stride, transpose): + @pytest.mark.parametrize("stride", [-1, -2, 2]) + @pytest.mark.parametrize("transpose", [False, True]) + def test_strided2(self, dtype, shape, stride, transpose): # one dimension (axis=-3) is strided # if negative stride, copy is needed and the base becomes c-contiguous # otherwise the base remains the same as input in gemm_batch - A = numpy.random.rand(*shape) - iA = dpnp.asarray(A) + A = numpy.random.rand(*shape).astype(dtype) + iA = dpnp.array(A) if transpose: A = numpy.moveaxis(A, (-2, -1), (-1, -2)) iA = dpnp.moveaxis(iA, (-2, -1), (-1, -2)) @@ -967,20 +960,21 @@ def test_strided2(self, shape, stride, transpose): assert result is iout assert_dtype_allclose(result, expected) + @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32]) @pytest.mark.parametrize( "stride", [(-2, -2), (2, 2), (-2, 2), (2, -2)], ids=["(-2, -2)", "(2, 2)", "(-2, 2)", "(2, -2)"], ) @pytest.mark.parametrize("transpose", [False, True]) - def test_strided3(self, stride, transpose): + def test_strided3(self, dtype, stride, transpose): # 4D case, the 1st and 2nd dimensions are strided # For negative stride, copy is needed and the base becomes c-contiguous. # For positive stride, no copy but reshape makes the base c-contiguous. stride0, stride1 = stride shape = (12, 10, 3, 3) # 4D array - A = numpy.random.rand(*shape) - iA = dpnp.asarray(A) + A = numpy.random.rand(*shape).astype(dtype) + iA = dpnp.array(A) if transpose: A = numpy.moveaxis(A, (-2, -1), (-1, -2)) iA = dpnp.moveaxis(iA, (-2, -1), (-1, -2)) @@ -997,11 +991,12 @@ def test_strided3(self, stride, transpose): assert_dtype_allclose(result, expected) @testing.with_requires("numpy>=2.2") + @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32]) @pytest.mark.parametrize("func", ["matmul", "matvec"]) @pytest.mark.parametrize("incx", [-2, 2]) @pytest.mark.parametrize("incy", [-2, 2]) @pytest.mark.parametrize("transpose", [False, True]) - def test_strided_mat_vec(self, func, incx, incy, transpose): + def test_strided_mat_vec(self, dtype, func, incx, incy, transpose): # vector is strided shape = (8, 10) # 2D if transpose: @@ -1010,8 +1005,8 @@ def test_strided_mat_vec(self, func, incx, incy, transpose): else: s1 = shape[-1] s2 = shape[-2] - a = numpy.random.rand(*shape) - ia = dpnp.asarray(a) + a = numpy.random.rand(*shape).astype(dtype) + ia = dpnp.array(a) if transpose: a = numpy.moveaxis(a, (-2, -1), (-1, -2)) ia = dpnp.moveaxis(ia, (-2, -1), (-1, -2)) @@ -1032,11 +1027,12 @@ def test_strided_mat_vec(self, func, incx, incy, transpose): assert_dtype_allclose(result, expected) @testing.with_requires("numpy>=2.2") + @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32]) @pytest.mark.parametrize("func", ["matmul", "vecmat"]) @pytest.mark.parametrize("incx", [-2, 2]) @pytest.mark.parametrize("incy", [-2, 2]) @pytest.mark.parametrize("transpose", [False, True]) - def test_strided_vec_mat(self, func, incx, incy, transpose): + def test_strided_vec_mat(self, dtype, func, incx, incy, transpose): # vector is strided shape = (8, 10) # 2D if transpose: @@ -1045,8 +1041,8 @@ def test_strided_vec_mat(self, func, incx, incy, transpose): else: s1 = shape[-1] s2 = shape[-2] - a = numpy.random.rand(*shape) - ia = dpnp.asarray(a) + a = numpy.random.rand(*shape).astype(dtype) + ia = dpnp.array(a) if transpose: a = numpy.moveaxis(a, (-2, -1), (-1, -2)) ia = dpnp.moveaxis(ia, (-2, -1), (-1, -2)) @@ -1187,7 +1183,7 @@ 
def test_out_0D(self, out_shape): ((5000, 5000, 2, 2), (2, 2)), ], ) - def test_large(self, shape1, shape2): + def test_large_arrays(self, shape1, shape2): a = generate_random_numpy_array(shape1) b = generate_random_numpy_array(shape2) ia, ib = dpnp.array(a), dpnp.array(b) @@ -1196,6 +1192,18 @@ def test_large(self, shape1, shape2): expected = numpy.matmul(a, b) assert_dtype_allclose(result, expected, factor=24) + @pytest.mark.parametrize("dtype", [numpy.int64, numpy.float32]) + def test_large_values(self, dtype): + a = numpy.array( + [[[94906267, 94906267], [94906267, 94906267]]], dtype=dtype + ) + b = numpy.array([94906265, 94906265], dtype=dtype) + ia, ib = dpnp.array(a), dpnp.array(b) + + result = dpnp.matmul(ia, ib) + expected = numpy.matmul(a, b) + assert_dtype_allclose(result, expected) + @testing.with_requires("numpy>=2.0") def test_linalg_matmul(self): a = numpy.ones((3, 4)) @@ -1206,6 +1214,7 @@ def test_linalg_matmul(self): expected = numpy.linalg.matmul(a, b) assert_array_equal(result, expected) + @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32]) @pytest.mark.parametrize( "sh1, sh2", [ @@ -1214,9 +1223,9 @@ def test_linalg_matmul(self): ], ids=["gemm", "gemm_batch"], ) - def test_matmul_with_offsets(self, sh1, sh2): - a = generate_random_numpy_array(sh1) - b = generate_random_numpy_array(sh2) + def test_matmul_with_offsets(self, dtype, sh1, sh2): + a = generate_random_numpy_array(sh1, dtype) + b = generate_random_numpy_array(sh2, dtype) ia, ib = dpnp.array(a), dpnp.array(b) result = ia[1] @ ib[1] @@ -1978,10 +1987,12 @@ def test_axes_out_1D(self, axes, b_shape): expected = numpy.vecdot(a, b, axes=axes, out=out) assert_dtype_allclose(result, expected) + # Use both integer and float, to test both MKL and dpctl backends + @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32]) @pytest.mark.parametrize("stride", [2, -1, -2]) - def test_strided(self, stride): - a = numpy.arange(100).reshape(10, 10) - b = numpy.arange(100).reshape(10, 10) + def test_strided(self, dtype, stride): + a = numpy.arange(100, dtype=dtype).reshape(10, 10) + b = numpy.arange(100, dtype=dtype).reshape(10, 10) ia, ib = dpnp.array(a), dpnp.array(b) result = dpnp.vecdot(ia[::stride, ::stride], ib[::stride, ::stride]) @@ -1999,6 +2010,7 @@ def test_input_dtype_matrix(self, dtype1, dtype2): expected = numpy.vecdot(a, b) assert_dtype_allclose(result, expected) + @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32]) @pytest.mark.parametrize("order1", ["C", "F", "A"]) @pytest.mark.parametrize("order2", ["C", "F", "A"]) @pytest.mark.parametrize("order", ["C", "F", "K", "A"]) @@ -2007,9 +2019,11 @@ def test_input_dtype_matrix(self, dtype1, dtype2): [(4, 3), (4, 3, 5), (6, 7, 3, 5)], ids=["(4, 3)", "(4, 3, 5)", "(6, 7, 3, 5)"], ) - def test_order(self, order1, order2, order, shape): - a = numpy.arange(numpy.prod(shape)).reshape(shape, order=order1) - b = numpy.arange(numpy.prod(shape)).reshape(shape, order=order2) + def test_order(self, dtype, order1, order2, order, shape): + a = numpy.arange(numpy.prod(shape), dtype=dtype) + b = numpy.arange(numpy.prod(shape), dtype=dtype) + a = a.reshape(shape, order=order1) + b = b.reshape(shape, order=order2) ia, ib = dpnp.array(a), dpnp.array(b) result = dpnp.vecdot(ia, ib, order=order) From 1add60add198c1791e53dfe8ececb16329eb76a5 Mon Sep 17 00:00:00 2001 From: Vahid Tavanashad Date: Wed, 5 Feb 2025 14:29:16 -0800 Subject: [PATCH 2/7] increase linux test timeout --- .github/workflows/conda-package.yml | 2 +- 1 file changed, 1 insertion(+), 1 
deletion(-)

diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index 0e83675ba57..3ffe0c8418e 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -218,7 +218,7 @@ jobs:
         id: run_tests_linux
         uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0
         with:
-          timeout_minutes: 12
+          timeout_minutes: 15
           max_attempts: ${{ env.RUN_TESTS_MAX_ATTEMPTS }}
           retry_on: any
           command: |

From 9d2bc0029813059ceddb299e728c87e5ea369f61 Mon Sep 17 00:00:00 2001
From: Vahid Tavanashad
Date: Thu, 6 Feb 2025 08:04:22 -0800
Subject: [PATCH 3/7] address comments

---
 .github/workflows/check-mkl-interfaces.yaml |   2 +-
 .github/workflows/cron-run-tests.yaml       |   2 +-
 dpnp/backend/extensions/blas/blas_py.cpp    |   3 +
 dpnp/dpnp_utils/dpnp_utils_linearalgebra.py |  78 +++++--
 dpnp/tests/test_product.py                  | 241 +++++++++-----------
 5 files changed, 167 insertions(+), 159 deletions(-)

diff --git a/.github/workflows/check-mkl-interfaces.yaml b/.github/workflows/check-mkl-interfaces.yaml
index fe43dfd9235..65bc9d4002f 100644
--- a/.github/workflows/check-mkl-interfaces.yaml
+++ b/.github/workflows/check-mkl-interfaces.yaml
@@ -216,7 +216,7 @@ jobs:
         id: run_tests
         uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0
         with:
-          timeout_minutes: 12
+          timeout_minutes: 15
           max_attempts: ${{ env.RUN_TESTS_MAX_ATTEMPTS }}
           retry_on: any
           command: |
diff --git a/.github/workflows/cron-run-tests.yaml b/.github/workflows/cron-run-tests.yaml
index 1376a25bfa8..942977ac439 100644
--- a/.github/workflows/cron-run-tests.yaml
+++ b/.github/workflows/cron-run-tests.yaml
@@ -126,7 +126,7 @@ jobs:
         id: run_tests_linux
         uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0
         with:
-          timeout_minutes: 12
+          timeout_minutes: 15
           max_attempts: ${{ env.RUN_TESTS_MAX_ATTEMPTS }}
           retry_on: any
           command: |
diff --git a/dpnp/backend/extensions/blas/blas_py.cpp b/dpnp/backend/extensions/blas/blas_py.cpp
index 3b62ad4945e..38d0d5ce9fe 100644
--- a/dpnp/backend/extensions/blas/blas_py.cpp
+++ b/dpnp/backend/extensions/blas/blas_py.cpp
@@ -142,6 +142,9 @@ PYBIND11_MODULE(_blas_impl, m)
         py::arg("sycl_queue"), py::arg("matrixA"), py::arg("vectorX"),
         py::arg("vectorY"), py::arg("transpose"),
         py::arg("depends") = py::list());
+    }
+
+    {
         m.def(
             "_row_major_is_available",
             [](void) {
diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index ca57373a536..d49b63d4af6 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -50,20 +50,23 @@
 ]
 
 
-def _compute_res_dtype(*arrays, sycl_queue, dtype=None, casting="no"):
+def _compute_res_dtype(*arrays, sycl_queue, dtype=None, out=None, casting="no"):
     """
     Determines the output array data type.
 
-    If dtype is ``None``, the output array data type of the operation is
-    determined based on the Promotion Type Rule and device capabilities.
-    Otherwise, `dtype` is used as output array dtype, if input arrays
-    can cast to it according to the casting rule determined. If casting
-    cannot be done, a ``TypeError`` is raised.
+    If `dtype` and `out` are ``None``, the output array data type of the
+    operation is determined based on the Promotion Type Rule and device
+    capabilities. If `out` is given, its data type is used as the output
+    array dtype. Otherwise, `dtype` is used as output array dtype.
+    If input arrays cannot be cast to the determined output array dtype,
+    a ``TypeError`` is raised.
Parameters
     ----------
     arrays : {dpnp.ndarray, usm_ndarray}
         Input arrays.
     dtype : dtype
+        If not ``None`` and `out` is not defined, data type of the output array.
+    out : {dpnp.ndarray, usm_ndarray}
         If not ``None``, data type of the output array.
     casting : {"no", "equiv", "safe", "same_kind", "unsafe"}, optional
         Controls what kind of data casting may occur.
@@ -73,13 +76,23 @@ def _compute_res_dtype(*arrays, sycl_queue, dtype=None, casting="no"):
 
     Returns
     -------
     res_dtype :
-        `res_dtype` is the output data type. When the result is obtained, it is cast
-        to `res_dtype`.
+        `res_dtype` is the output data type. When the result is obtained,
+        it is cast to `res_dtype`.
 
     """
 
     res_dtype = dpnp.result_type(*arrays)
 
+    # If inputs are boolean and `out` is given and it is not boolean, the
+    # calculation should be performed in boolean and at the end the result
+    # is cast to out dtype. It is different from the general case, where the inputs
+    # are cast to out dtype and then calculation is performed. Even when inputs
+    # are boolean and `dtype` is given, the casting is done first and then the
+    # calculation is performed.
+    if out is not None and not dpnp.issubdtype(res_dtype, dpnp.bool):
+        # out dtype is prioritized over a given dtype
+        dtype = out.dtype
+
     if dtype is not None:
         if dpnp.can_cast(res_dtype, dtype, casting=casting):
             res_dtype = dtype
@@ -735,7 +748,9 @@ def dpnp_dot(a, b, /, out=None, *, casting="same_kind", conjugate=False):
     _validate_out_array(out, exec_q)
 
     # Determine the appropriate data types
-    res_dtype = _compute_res_dtype(a, b, sycl_queue=exec_q)
+    res_dtype = _compute_res_dtype(
+        a, b, out=out, casting=casting, sycl_queue=exec_q
+    )
 
     result = _create_result_array(
         a, b, out, (), res_dtype, res_usm_type, exec_q
@@ -886,7 +901,7 @@ def dpnp_multiplication(
 
     # Determine the appropriate data types
     res_dtype = _compute_res_dtype(
-        x1, x2, dtype=dtype, casting=casting, sycl_queue=exec_q
+        x1, x2, dtype=dtype, out=out, casting=casting, sycl_queue=exec_q
     )
 
     call_flag = None
@@ -1008,7 +1030,22 @@ def dpnp_multiplication(
     elif x1.size == 0 or x2.size == 0:
         result.fill(0)
     else:
-        if dpnp.issubdtype(res_dtype, dpnp.inexact):
+        # TODO: replace with dpnp.int8 when it is added
+        x1_is_int8 = dpnp.issubdtype(x1.dtype, numpy.int8)
+        x2_is_int8 = dpnp.issubdtype(x2.dtype, numpy.int8)
+        res_is_int32 = dpnp.issubdtype(res_dtype, dpnp.int32)
+        special_case = x1_is_int8 and x2_is_int8 and res_is_int32
+        special_case = special_case and call_flag == "gemm"
+        if special_case:
+            # OneMath supports this special case
+            x1 = _copy_array(
+                x1, copy_flag=not x1_contig_flag, order=res_order
+            )
+            x2 = _copy_array(
+                x2, copy_flag=not x2_contig_flag, order=res_order
+            )
+            result = _gemm_matmul(exec_q, x1, x2, result)
+        elif dpnp.issubdtype(res_dtype, dpnp.inexact):
             # copying is needed if dtypes of input arrays are different or
             # their base (last 2-dimensions) is not c-contiguous or f-contiguous
             x1 = _copy_array(
@@ -1029,23 +1059,21 @@ def dpnp_multiplication(
                 )
                 _manager.add_event_pair(ht_ev, gemv_ev)
             elif call_flag == "gemm":
-                result = _gemm_matmul(
-                    exec_q,
-                    x1,
-                    x2,
-                    result,
-                )
+                result = _gemm_matmul(exec_q, x1, x2, result)
             else:
                 assert call_flag == "gemm_batch"
-                result = _gemm_batch_matmul(
-                    exec_q,
-                    x1,
-                    x2,
-                    result,
-                )
+                result = _gemm_batch_matmul(exec_q, x1, x2, result)
         else:
             # oneapi::mkl::blas::gemm/gemv do not support integer dtypes,
             # so using dpctl.tensor.matmul instead
+
+            # `dpt.matmul` does not support `casting` kwarg. 
+            # We may need to change input dtypes based on given `casting`.
+            # The possibility of casting is already validated in
+            # `_compute_res_dtype`.
+            x1 = _copy_array(x1, dtype=res_dtype, order=res_order)
+            x2 = _copy_array(x2, dtype=res_dtype, order=res_order)
+
             x1_usm = dpnp.get_usm_ndarray(x1)
             x2_usm = dpnp.get_usm_ndarray(x2)
             out_usm = dpnp.get_usm_ndarray(result)
@@ -1199,7 +1227,7 @@ def dpnp_vecdot(
 
     # Determine the appropriate data types
     res_dtype = _compute_res_dtype(
-        x1, x2, dtype=dtype, casting=casting, sycl_queue=exec_q
+        x1, x2, dtype=dtype, out=out, casting=casting, sycl_queue=exec_q
     )
 
diff --git a/dpnp/tests/test_product.py b/dpnp/tests/test_product.py
index 0837ad7e4a8..c414e6fdf6e 100644
--- a/dpnp/tests/test_product.py
+++ b/dpnp/tests/test_product.py
@@ -3,7 +3,7 @@
 import pytest
 from dpctl.tensor._numpy_helper import AxisError
 from dpctl.utils import ExecutionPlacementError
-from numpy.testing import assert_array_equal, assert_raises
+from numpy.testing import assert_allclose, assert_array_equal, assert_raises
 
 import dpnp
 
@@ -15,6 +15,10 @@
 )
 from .third_party.cupy import testing
 
+# A list of selected dtypes including both integer and float dtypes
+# to test different backends: OneMath (for float) and dpctl (for integer)
+_selected_dtypes = [numpy.int64, numpy.float32]
+
 
 def _assert_selective_dtype_allclose(result, expected, dtype):
     # For numpy.dot, numpy.vdot, numpy.kron, numpy.inner, and numpy.tensordot,
@@ -271,8 +275,7 @@ def test_basic(self, dtype, shape1, shape2):
         expected = a.dot(b)
         assert_dtype_allclose(result, expected)
 
-    # Use both integer and float, to test both MKL and dpctl backends
-    @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32])
+    @pytest.mark.parametrize("dtype", _selected_dtypes)
     @pytest.mark.parametrize("stride", [3, -1, -2, -5])
     def test_strided(self, dtype, stride):
         a = numpy.arange(25, dtype=dtype)
@@ -599,8 +602,7 @@ class TestMatmul:
     def setup_method(self):
         numpy.random.seed(42)
 
-    # Use both integer and float, to test both MKL and dpctl backends
-    @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32])
+    @pytest.mark.parametrize("dtype", _selected_dtypes)
     @pytest.mark.parametrize(
         "order1, order2", [("C", "C"), ("C", "F"), ("F", "C"), ("F", "F")]
     )
@@ -678,8 +680,8 @@ def setup_method(self):
         ],
     )
     def test_basic(self, dtype, order1, order2, shape1, shape2):
-        a = numpy.arange(numpy.prod(shape1), dtype=dtype).reshape(shape1)
-        b = numpy.arange(numpy.prod(shape2), dtype=dtype).reshape(shape2)
+        a = generate_random_numpy_array(shape1, dtype=dtype).reshape(shape1)
+        b = generate_random_numpy_array(shape2, dtype=dtype).reshape(shape2)
         a = numpy.array(a, order=order1)
         b = numpy.array(b, order=order2)
         ia, ib = dpnp.array(a), dpnp.array(b)
@@ -688,26 +690,6 @@ def test_basic(self, dtype, order1, order2, shape1, shape2):
         expected = numpy.matmul(a, b)
         assert_dtype_allclose(result, expected)
 
-    @pytest.mark.parametrize(
-        "shape1, shape2",
-        [
-            ((4,), (4, 3)),
-            ((2, 4), (4, 3)),
-            ((4, 2, 3), (4, 3, 5)),
-            ((6, 7, 4, 3), (6, 7, 3, 5)),
-        ],
-        ids=["gemv", "gemm", "gemm_batch1", "gemm_batch2"],
-    )
-    def test_bool(self, shape1, shape2):
-        x = numpy.arange(2, dtype=numpy.bool_)
-        a = numpy.resize(x, numpy.prod(shape1)).reshape(shape1)
-        b = numpy.resize(x, numpy.prod(shape2)).reshape(shape2)
-        ia, ib = dpnp.array(a), dpnp.array(b)
-
-        result = dpnp.matmul(ia, ib)
-        expected = numpy.matmul(a, b)
-        assert_dtype_allclose(result, expected)
-
     @pytest.mark.parametrize(
         "axes",
         [
@@ -819,55 
+801,48 @@ def test_axes_out_1D(self, axes, b_shape, out_shape):
         expected = numpy.matmul(a, b, axes=axes, out=out)
         assert_dtype_allclose(result, expected)
 
-    @pytest.mark.parametrize("in_dt", get_all_dtypes(no_bool=True))
-    @pytest.mark.parametrize(
-        "out_dt", get_all_dtypes(no_bool=True, no_none=True)
-    )
+    @pytest.mark.parametrize("dt_in1", get_all_dtypes(no_none=True))
+    @pytest.mark.parametrize("dt_in2", get_all_dtypes(no_none=True))
+    @pytest.mark.parametrize("dt_out", get_all_dtypes(no_none=True))
     @pytest.mark.parametrize(
         "shape1, shape2",
         [
-            ((4,), (4, 3)),
+            ((1, 4), (4, 3)),
             ((2, 4), (4, 3)),
             ((4, 2, 3), (4, 3, 5)),
-            ((6, 7, 4, 3), (6, 7, 3, 5)),
         ],
-        ids=["gemv", "gemm", "gemm_batch1", "gemm_batch2"],
+        ids=["gemv", "gemm", "gemm_batch"],
     )
-    def test_dtype_matrix_inout(self, in_dt, out_dt, shape1, shape2):
-        a = generate_random_numpy_array(shape1, in_dt)
-        b = generate_random_numpy_array(shape2, in_dt)
+    def test_dtype_matrix(self, dt_in1, dt_in2, dt_out, shape1, shape2):
+        a = generate_random_numpy_array(shape1, dt_in1)
+        b = generate_random_numpy_array(shape2, dt_in2)
         ia, ib = dpnp.array(a), dpnp.array(b)
 
-        if dpnp.can_cast(dpnp.result_type(ia, ib), out_dt, casting="same_kind"):
-            result = dpnp.matmul(ia, ib, dtype=out_dt)
-            expected = numpy.matmul(a, b, dtype=out_dt)
+        out_shape = shape1[:-2] + (shape1[-2], shape2[-1])
+        out = numpy.empty(out_shape, dtype=dt_out)
+        iout = dpnp.array(out)
+
+        if dpnp.can_cast(dpnp.result_type(ia, ib), dt_out, casting="same_kind"):
+            result = dpnp.matmul(ia, ib, dtype=dt_out)
+            expected = numpy.matmul(a, b, dtype=dt_out)
             assert_dtype_allclose(result, expected)
-        else:
-            assert_raises(TypeError, dpnp.matmul, ia, ib, dtype=out_dt)
-            assert_raises(TypeError, numpy.matmul, a, b, dtype=out_dt)
 
-    @pytest.mark.parametrize("dtype1", get_all_dtypes(no_bool=True))
-    @pytest.mark.parametrize("dtype2", get_all_dtypes(no_bool=True))
-    @pytest.mark.parametrize(
-        "shape1, shape2",
-        [
-            ((4,), (4, 3)),
-            ((2, 4), (4, 3)),
-            ((4, 2, 3), (4, 3, 5)),
-            ((6, 7, 4, 3), (6, 7, 3, 5)),
-        ],
-        ids=["gemv", "gemm", "gemm_batch1", "gemm_batch2"],
-    )
-    def test_dtype_matrix_inputs(self, dtype1, dtype2, shape1, shape2):
-        a = generate_random_numpy_array(shape1, dtype1)
-        b = generate_random_numpy_array(shape2, dtype2)
-        ia, ib = dpnp.array(a), dpnp.array(b)
+            result = dpnp.matmul(ia, ib, out=iout)
+            assert result is iout
+            expected = numpy.matmul(a, b, out=out)
+            # dpnp gives the exact same result when `dtype` or `out` is specified
+            # while NumPy gives slightly different results. 
NumPy result obtained + # with `dtype` is much closer to dpnp (a smaller `tol`) while the + # result obtained with `out` needs a larger `tol` to match dpnp + assert_allclose(result, expected, rtol=1e-6, atol=1e-6) + else: + assert_raises(TypeError, dpnp.matmul, ia, ib, dtype=dt_out) + assert_raises(TypeError, numpy.matmul, a, b, dtype=dt_out) - result = dpnp.matmul(ia, ib) - expected = numpy.matmul(a, b) - assert_dtype_allclose(result, expected) + assert_raises(TypeError, dpnp.matmul, ia, ib, out=iout) + assert_raises(TypeError, numpy.matmul, a, b, out=out) - @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32]) + @pytest.mark.parametrize("dtype", _selected_dtypes) @pytest.mark.parametrize("order1", ["C", "F", "A"]) @pytest.mark.parametrize("order2", ["C", "F", "A"]) @pytest.mark.parametrize("order", ["C", "F", "K", "A"]) @@ -904,7 +879,7 @@ def test_order(self, dtype, order1, order2, order, shape1, shape2): assert result.flags.f_contiguous == expected.flags.f_contiguous assert_dtype_allclose(result, expected) - @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32]) + @pytest.mark.parametrize("dtype", _selected_dtypes) @pytest.mark.parametrize( "stride", [(-2, -2, -2, -2), (2, 2, 2, 2), (-2, 2, -2, 2), (2, -2, 2, -2)], @@ -930,7 +905,7 @@ def test_strided1(self, dtype, stride): assert result is iout assert_dtype_allclose(result, expected) - @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32]) + @pytest.mark.parametrize("dtype", _selected_dtypes) @pytest.mark.parametrize( "shape", [(10, 3, 3), (12, 10, 3, 3)], ids=["3D", "4D"] ) @@ -960,7 +935,7 @@ def test_strided2(self, dtype, shape, stride, transpose): assert result is iout assert_dtype_allclose(result, expected) - @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32]) + @pytest.mark.parametrize("dtype", _selected_dtypes) @pytest.mark.parametrize( "stride", [(-2, -2), (2, 2), (-2, 2), (2, -2)], @@ -991,7 +966,7 @@ def test_strided3(self, dtype, stride, transpose): assert_dtype_allclose(result, expected) @testing.with_requires("numpy>=2.2") - @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32]) + @pytest.mark.parametrize("dtype", _selected_dtypes) @pytest.mark.parametrize("func", ["matmul", "matvec"]) @pytest.mark.parametrize("incx", [-2, 2]) @pytest.mark.parametrize("incy", [-2, 2]) @@ -1027,7 +1002,7 @@ def test_strided_mat_vec(self, dtype, func, incx, incy, transpose): assert_dtype_allclose(result, expected) @testing.with_requires("numpy>=2.2") - @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32]) + @pytest.mark.parametrize("dtype", _selected_dtypes) @pytest.mark.parametrize("func", ["matmul", "vecmat"]) @pytest.mark.parametrize("incx", [-2, 2]) @pytest.mark.parametrize("incy", [-2, 2]) @@ -1075,75 +1050,52 @@ def test_strided_vec_mat(self, dtype, func, incx, incy, transpose): ("F", "F", "C"), ], ) - @pytest.mark.parametrize( - "dtype", get_all_dtypes(no_none=True, no_bool=True) - ) - def test_out1(self, order1, order2, out_order, dtype): + @pytest.mark.parametrize("dtype", _selected_dtypes) + def test_out_order1(self, order1, order2, out_order, dtype): # test gemm with out keyword - a1 = numpy.arange(20, dtype=dtype).reshape(5, 4, order=order1) - a2 = numpy.arange(28, dtype=dtype).reshape(4, 7, order=order2) - b1, b2 = dpnp.array(a1), dpnp.array(a2) - - iout = dpnp.empty((5, 7), dtype=dtype, order=out_order) - result = dpnp.matmul(b1, b2, out=iout) - assert result is iout - - out = numpy.empty((5, 7), dtype=dtype, order=out_order) - expected = numpy.matmul(a1, a2, out=out) 
- assert result.flags.c_contiguous == expected.flags.c_contiguous - assert result.flags.f_contiguous == expected.flags.f_contiguous - assert_dtype_allclose(result, expected) - - @pytest.mark.parametrize("trans", [True, False]) - @pytest.mark.parametrize( - "dtype", get_all_dtypes(no_none=True, no_bool=True) - ) - def test_out2(self, trans, dtype): - # test gemm_batch with out keyword - # the base of input arrays is c-contiguous - # the base of output array is c-contiguous or f-contiguous - a = numpy.arange(24, dtype=dtype).reshape(2, 3, 4) - b = numpy.arange(40, dtype=dtype).reshape(2, 4, 5) + a = generate_random_numpy_array((5, 4), dtype) + b = generate_random_numpy_array((4, 7), dtype) + a = numpy.array(a, order=order1) + b = numpy.array(b, order=order2) ia, ib = dpnp.array(a), dpnp.array(b) - if trans: - iout = dpnp.empty((2, 5, 3), dtype=dtype).transpose(0, 2, 1) - out = numpy.empty((2, 5, 3), dtype=dtype).transpose(0, 2, 1) - else: - iout = dpnp.empty((2, 3, 5), dtype=dtype) - out = numpy.empty((2, 3, 5), dtype=dtype) - + iout = dpnp.empty((5, 7), dtype=dtype, order=out_order) result = dpnp.matmul(ia, ib, out=iout) assert result is iout + out = numpy.empty((5, 7), dtype=dtype, order=out_order) expected = numpy.matmul(a, b, out=out) assert result.flags.c_contiguous == expected.flags.c_contiguous assert result.flags.f_contiguous == expected.flags.f_contiguous assert_dtype_allclose(result, expected) - @pytest.mark.parametrize("trans", [True, False]) - @pytest.mark.parametrize( - "dtype", get_all_dtypes(no_none=True, no_bool=True) - ) - def test_out3(self, trans, dtype): + @pytest.mark.parametrize("trans_in", [True, False]) + @pytest.mark.parametrize("trans_out", [True, False]) + @pytest.mark.parametrize("dtype", _selected_dtypes) + def test_out_order2(self, trans_in, trans_out, dtype): # test gemm_batch with out keyword - # the base of input arrays is f-contiguous # the base of output array is c-contiguous or f-contiguous - a = numpy.arange(24, dtype=dtype).reshape(2, 4, 3) - b = numpy.arange(40, dtype=dtype).reshape(2, 5, 4) - ia, ib = dpnp.array(a), dpnp.array(b) - - a = numpy.asarray(a).transpose(0, 2, 1) - b = numpy.asarray(b).transpose(0, 2, 1) - ia = ia.transpose(0, 2, 1) - ib = ib.transpose(0, 2, 1) + if trans_in: + # the base of input arrays is f-contiguous + a = generate_random_numpy_array((2, 4, 3), dtype) + b = generate_random_numpy_array((2, 5, 4), dtype) + ia, ib = dpnp.array(a), dpnp.array(b) + a, b = a.transpose(0, 2, 1), b.transpose(0, 2, 1) + ia, ib = ia.transpose(0, 2, 1), ib.transpose(0, 2, 1) + else: + # the base of input arrays is c-contiguous + a = generate_random_numpy_array((2, 3, 4), dtype) + b = generate_random_numpy_array((2, 4, 5), dtype) + ia, ib = dpnp.array(a), dpnp.array(b) - if trans: + if trans_out: + # the base of output array is f-contiguous iout = dpnp.empty((2, 5, 3), dtype=dtype).transpose(0, 2, 1) out = numpy.empty((2, 5, 3), dtype=dtype).transpose(0, 2, 1) else: - iout = dpnp.empty((2, 3, 5), dtype=dtype) + # the base of output array is c-contiguous out = numpy.empty((2, 3, 5), dtype=dtype) + iout = dpnp.array(out) result = dpnp.matmul(ia, ib, out=iout) assert result is iout @@ -1155,11 +1107,7 @@ def test_out3(self, trans, dtype): @pytest.mark.parametrize( "out_shape", - [ - ((4, 5)), - ((6,)), - ((4, 7, 2)), - ], + [((4, 5)), ((6,)), ((4, 7, 2))], ) def test_out_0D(self, out_shape): # for matmul of 1-D arrays, output is 0-D and if out keyword is given @@ -1192,11 +1140,10 @@ def test_large_arrays(self, shape1, shape2): expected = numpy.matmul(a, 
b)
         assert_dtype_allclose(result, expected, factor=24)
 
-    @pytest.mark.parametrize("dtype", [numpy.int64, numpy.float32])
+    @pytest.mark.parametrize("dtype", _selected_dtypes)
     def test_large_values(self, dtype):
-        a = numpy.array(
-            [[[94906267, 94906267], [94906267, 94906267]]], dtype=dtype
-        )
+        val = 94906267
+        a = numpy.array([[[val, val], [val, val]]], dtype=dtype)
         b = numpy.array([94906265, 94906265], dtype=dtype)
         ia, ib = dpnp.array(a), dpnp.array(b)
 
@@ -1204,6 +1151,37 @@ def test_large_values(self, dtype):
         expected = numpy.matmul(a, b)
         assert_dtype_allclose(result, expected)
 
+    def test_special_case(self):
+        # Although inputs are int, gemm will be used for calculation
+        a = numpy.ones((3, 4), dtype=numpy.int8)
+        b = numpy.ones((4, 5), dtype=numpy.int8)
+        ia, ib = dpnp.array(a), dpnp.array(b)
+
+        result = dpnp.matmul(ia, ib, dtype=numpy.int32)
+        expected = numpy.matmul(a, b, dtype=numpy.int32)
+        assert_dtype_allclose(result, expected)
+
+        iout = dpnp.empty((3, 5), dtype=numpy.int32)
+        result = dpnp.matmul(ia, ib, out=iout)
+        assert_dtype_allclose(result, expected)
+
+    def test_bool(self):
+        a = numpy.ones((3, 4), dtype=numpy.bool)
+        b = numpy.ones((4, 5), dtype=numpy.bool)
+        ia, ib = dpnp.array(a), dpnp.array(b)
+
+        # the output is (3, 5) array filled with 4
+        result = dpnp.matmul(ia, ib, dtype=numpy.int32)
+        expected = numpy.matmul(a, b, dtype=numpy.int32)
+        assert_dtype_allclose(result, expected)
+
+        out = numpy.empty((3, 5), dtype=numpy.int32)
+        iout = dpnp.array(out)
+        # the output is (3, 5) array filled with 1
+        result = dpnp.matmul(ia, ib, out=iout)
+        expected = numpy.matmul(a, b, out=out)
+        assert_dtype_allclose(result, expected)
+
     @testing.with_requires("numpy>=2.0")
     def test_linalg_matmul(self):
         a = numpy.ones((3, 4))
@@ -1214,7 +1192,7 @@ def test_linalg_matmul(self):
         expected = numpy.linalg.matmul(a, b)
         assert_array_equal(result, expected)
 
-    @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32])
+    @pytest.mark.parametrize("dtype", _selected_dtypes)
     @pytest.mark.parametrize(
         "sh1, sh2",
         [
@@ -1987,8 +1965,7 @@ def test_axes_out_1D(self, axes, b_shape):
         expected = numpy.vecdot(a, b, axes=axes, out=out)
         assert_dtype_allclose(result, expected)
 
-    # Use both integer and float, to test both MKL and dpctl backends
-    @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32])
+    @pytest.mark.parametrize("dtype", _selected_dtypes)
    @pytest.mark.parametrize("stride", [2, -1, -2])
     def test_strided(self, dtype, stride):
         a = numpy.arange(100, dtype=dtype).reshape(10, 10)
@@ -2010,7 +1987,7 @@ def test_input_dtype_matrix(self, dtype1, dtype2):
         expected = numpy.vecdot(a, b)
         assert_dtype_allclose(result, expected)
 
-    @pytest.mark.parametrize("dtype", [numpy.int32, numpy.float32])
+    @pytest.mark.parametrize("dtype", _selected_dtypes)
     @pytest.mark.parametrize("order1", ["C", "F", "A"])
     @pytest.mark.parametrize("order2", ["C", "F", "A"])
     @pytest.mark.parametrize("order", ["C", "F", "K", "A"])

From 161c617cc6b1b74da4abd2c3acfa7c9bd5f77f9b Mon Sep 17 00:00:00 2001
From: Vahid Tavanashad
Date: Thu, 6 Feb 2025 16:57:47 -0800
Subject: [PATCH 4/7] updates for onemkl interfaces

---
 dpnp/backend/extensions/blas/blas_py.cpp    | 14 ++++----
 dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 37 ++++++++++++++++-----
 dpnp/tests/test_product.py                  | 24 ++++++++-----
 3 files changed, 50 insertions(+), 25 deletions(-)

diff --git a/dpnp/backend/extensions/blas/blas_py.cpp b/dpnp/backend/extensions/blas/blas_py.cpp
index 38d0d5ce9fe..21792912025 100644
--- a/dpnp/backend/extensions/blas/blas_py.cpp
+++ 
b/dpnp/backend/extensions/blas/blas_py.cpp
@@ -146,14 +146,14 @@ PYBIND11_MODULE(_blas_impl, m)
     {
         m.def(
-            "_row_major_is_available",
-            [](void) {
-#if defined(USE_ONEMKL_CUBLAS)
-                return false;
-#else
+            "_using_onemkl_interfaces",
+            []() {
+#ifdef USE_ONEMKL_INTERFACES
                 return true;
-#endif // USE_ONEMKL_CUBLAS
+#else
+                return false;
+#endif
             },
-            "Check if the onemkl::blas::row_major can be used.");
+            "Check if the OneMKL interfaces are being used.");
     }
 }
diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index d49b63d4af6..30443a67159 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -503,6 +503,28 @@ def _gemm_matmul(exec_q, x1, x2, res):
     return res
 
 
+def _gemm_special_case(x1, x2, res_dtype, call_flag):
+    """
+    `gemm` and `gemm_batch` support these special cases of data types
+    while `gemv` does not.
+
+    """
+
+    # TODO: replace with dpnp.int8 when it is added
+    x1_is_int8 = dpnp.issubdtype(x1.dtype, numpy.int8)
+    x2_is_int8 = dpnp.issubdtype(x2.dtype, numpy.int8)
+    res_is_int32 = dpnp.issubdtype(res_dtype, dpnp.int32)
+    res_is_float32 = dpnp.issubdtype(res_dtype, dpnp.float32)
+
+    flag = x1_is_int8 and x2_is_int8 and (res_is_int32 or res_is_float32)
+    flag = flag and call_flag in ["gemm", "gemm_batch"]
+
+    # onemkl_interfaces does not support these data types
+    onemkl_interfaces = bi._using_onemkl_interfaces()
+
+    return flag and not onemkl_interfaces
+
+
 def _shape_error(shape1, shape2, func, err_msg):
     """Validate the shapes of input and output arrays."""
 
@@ -1030,21 +1030,18 @@ def dpnp_multiplication(
     elif x1.size == 0 or x2.size == 0:
         result.fill(0)
     else:
-        # TODO: replace with dpnp.int8 when it is added
-        x1_is_int8 = dpnp.issubdtype(x1.dtype, numpy.int8)
-        x2_is_int8 = dpnp.issubdtype(x2.dtype, numpy.int8)
-        res_is_int32 = dpnp.issubdtype(res_dtype, dpnp.int32)
-        special_case = x1_is_int8 and x2_is_int8 and res_is_int32
-        special_case = special_case and call_flag == "gemm"
-        if special_case:
-            # OneMath supports this special case
+        if _gemm_special_case(x1, x2, res_dtype, call_flag):
             x1 = _copy_array(
                 x1, copy_flag=not x1_contig_flag, order=res_order
             )
             x2 = _copy_array(
                 x2, copy_flag=not x2_contig_flag, order=res_order
             )
-            result = _gemm_matmul(exec_q, x1, x2, result)
+            if call_flag == "gemm":
+                result = _gemm_matmul(exec_q, x1, x2, result)
+            else:
+                assert call_flag == "gemm_batch"
+                result = _gemm_batch_matmul(exec_q, x1, x2, result)
         elif dpnp.issubdtype(res_dtype, dpnp.inexact):
             # copying is needed if dtypes of input arrays are different or
             # their base (last 2-dimensions) is not c-contiguous or f-contiguous
             x1 = _copy_array(
diff --git a/dpnp/tests/test_product.py b/dpnp/tests/test_product.py
index c414e6fdf6e..8ebedd141d1 100644
--- a/dpnp/tests/test_product.py
+++ b/dpnp/tests/test_product.py
@@ -834,7 +834,7 @@ def test_dtype_matrix(self, dt_in1, dt_in2, dt_out, shape1, shape2):
         # while NumPy gives slightly different results. 
NumPy result obtained
         # with `dtype` is much closer to dpnp (a smaller `tol`) while the
         # result obtained with `out` needs a larger `tol` to match dpnp
-        assert_allclose(result, expected, rtol=1e-6, atol=1e-6)
+        assert_allclose(result, expected, rtol=1e-5, atol=1e-5)
@@ -1151,23 +1151,29 @@ def test_large_values(self, dtype):
         expected = numpy.matmul(a, b)
         assert_dtype_allclose(result, expected)
 
-    def test_special_case(self):
+    @pytest.mark.parametrize("dt_out", [numpy.int32, numpy.float32])
+    @pytest.mark.parametrize(
+        "shape1, shape2",
+        [((2, 4), (4, 3)), ((4, 2, 3), (4, 3, 5))],
+        ids=["gemm", "gemm_batch"],
+    )
+    def test_special_case(self, dt_out, shape1, shape2):
         # Although inputs are int, gemm will be used for calculation
-        a = numpy.ones((3, 4), dtype=numpy.int8)
-        b = numpy.ones((4, 5), dtype=numpy.int8)
+        a = numpy.ones(shape1, dtype=numpy.int8)
+        b = numpy.ones(shape2, dtype=numpy.int8)
         ia, ib = dpnp.array(a), dpnp.array(b)
 
-        result = dpnp.matmul(ia, ib, dtype=numpy.int32)
-        expected = numpy.matmul(a, b, dtype=numpy.int32)
+        result = dpnp.matmul(ia, ib, dtype=dt_out)
+        expected = numpy.matmul(a, b, dtype=dt_out)
         assert_dtype_allclose(result, expected)
 
-        iout = dpnp.empty((3, 5), dtype=numpy.int32)
+        iout = dpnp.empty(result.shape, dtype=dt_out)
         result = dpnp.matmul(ia, ib, out=iout)
         assert_dtype_allclose(result, expected)
 
     def test_bool(self):
-        a = numpy.ones((3, 4), dtype=numpy.bool)
-        b = numpy.ones((4, 5), dtype=numpy.bool)
+        a = numpy.ones((3, 4), dtype=dpnp.bool)
+        b = numpy.ones((4, 5), dtype=dpnp.bool)
         ia, ib = dpnp.array(a), dpnp.array(b)
 
         # the output is (3, 5) array filled with 4

From 6cc6dee8c0cfe9ef74ca746ce4fd70644df057c5 Mon Sep 17 00:00:00 2001
From: Vahid Tavanashad
Date: Fri, 7 Feb 2025 07:47:01 -0800
Subject: [PATCH 5/7] address new comments

---
 dpnp/dpnp_utils/dpnp_utils_linearalgebra.py | 19 +++++++------------
 dpnp/tests/test_product.py                  | 14 +++++++-------
 2 files changed, 14 insertions(+), 19 deletions(-)

diff --git a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
index 30443a67159..5767fdd4e52 100644
--- a/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
+++ b/dpnp/dpnp_utils/dpnp_utils_linearalgebra.py
@@ -89,7 +89,7 @@ def _compute_res_dtype(*arrays, sycl_queue, dtype=None, out=None, casting="no"):
     # are cast to out dtype and then calculation is performed. Even when inputs
     # are boolean and `dtype` is given, the casting is done first and then the
     # calculation is performed.
-    if out is not None and not dpnp.issubdtype(res_dtype, dpnp.bool):
+    if out is not None and res_dtype != dpnp.bool:
         # out dtype is prioritized over a given dtype
         dtype = out.dtype
 
@@ -509,15 +509,10 @@ def _gemm_special_case(x1, x2, res_dtype, call_flag):
     while `gemv` does not. 
""" - # TODO: replace with dpnp.int8 when it is added - x1_is_int8 = dpnp.issubdtype(x1.dtype, numpy.int8) - x2_is_int8 = dpnp.issubdtype(x2.dtype, numpy.int8) - res_is_int32 = dpnp.issubdtype(res_dtype, dpnp.int32) - res_is_float32 = dpnp.issubdtype(res_dtype, dpnp.float32) - - flag = x1_is_int8 and x2_is_int8 and (res_is_int32 or res_is_float32) - flag = flag and call_flag in ["gemm", "gemm_batch"] + is_int8 = x1.dtype == numpy.int8 and x2.dtype == numpy.int8 + is_int32_or_f32 = res_dtype in [dpnp.int32, dpnp.float32] + flag = is_int8 and is_int32_or_f32 and call_flag in ["gemm", "gemm_batch"] # onemkl_interfaces does not support these data types onemkl_interfaces = bi._using_onemkl_interfaces() @@ -1084,7 +1079,8 @@ def dpnp_multiplication( result = _gemm_batch_matmul(exec_q, x1, x2, result) else: # oneapi::mkl::blas::gemm/gemv do not support integer dtypes, - # so using dpctl.tensor.matmul instead + # except for special cases determined in `_gemm_special_case`, + # use dpctl.tensor.matmul for unsupported cases # `dpt.matmul` does not support `casting` kwarg. # We may need to change input dtypes based on given `casting`. @@ -1096,10 +1092,9 @@ def dpnp_multiplication( x1_usm = dpnp.get_usm_ndarray(x1) x2_usm = dpnp.get_usm_ndarray(x2) out_usm = dpnp.get_usm_ndarray(result) - res_usm = dpt.matmul( + dpt.matmul( x1_usm, x2_usm, out=out_usm, dtype=dtype, order=order ) - result = dpnp_array._create_from_usm_ndarray(res_usm) if NumPy_special_case: result = dpnp.tile(result, out.shape) diff --git a/dpnp/tests/test_product.py b/dpnp/tests/test_product.py index 8ebedd141d1..99c211970bf 100644 --- a/dpnp/tests/test_product.py +++ b/dpnp/tests/test_product.py @@ -888,7 +888,7 @@ def test_order(self, dtype, order1, order2, order, shape1, shape2): def test_strided1(self, dtype, stride): for dim in [1, 2, 3, 4]: shape = tuple(20 for _ in range(dim)) - A = numpy.random.rand(*shape).astype(dtype) + A = generate_random_numpy_array(shape, dtype) iA = dpnp.array(A) slices = tuple(slice(None, None, stride[i]) for i in range(dim)) a = A[slices] @@ -897,13 +897,13 @@ def test_strided1(self, dtype, stride): # the 2D base is not c-contiguous nor f-contigous result = dpnp.matmul(ia, ia) expected = numpy.matmul(a, a) - assert_dtype_allclose(result, expected) + assert_dtype_allclose(result, expected, factor=16) iOUT = dpnp.empty(shape, dtype=result.dtype) iout = iOUT[slices] result = dpnp.matmul(ia, ia, out=iout) assert result is iout - assert_dtype_allclose(result, expected) + assert_dtype_allclose(result, expected, factor=16) @pytest.mark.parametrize("dtype", _selected_dtypes) @pytest.mark.parametrize( @@ -915,7 +915,7 @@ def test_strided2(self, dtype, shape, stride, transpose): # one dimension (axis=-3) is strided # if negative stride, copy is needed and the base becomes c-contiguous # otherwise the base remains the same as input in gemm_batch - A = numpy.random.rand(*shape).astype(dtype) + A = generate_random_numpy_array(shape, dtype) iA = dpnp.array(A) if transpose: A = numpy.moveaxis(A, (-2, -1), (-1, -2)) @@ -948,7 +948,7 @@ def test_strided3(self, dtype, stride, transpose): # For positive stride, no copy but reshape makes the base c-contiguous. 
stride0, stride1 = stride
         shape = (12, 10, 3, 3)  # 4D array
-        A = numpy.random.rand(*shape).astype(dtype)
+        A = generate_random_numpy_array(shape, dtype)
         iA = dpnp.array(A)
         if transpose:
             A = numpy.moveaxis(A, (-2, -1), (-1, -2))
             iA = dpnp.moveaxis(iA, (-2, -1), (-1, -2))
@@ -980,7 +980,7 @@ def test_strided_mat_vec(self, dtype, func, incx, incy, transpose):
         else:
             s1 = shape[-1]
             s2 = shape[-2]
-        a = numpy.random.rand(*shape).astype(dtype)
+        a = generate_random_numpy_array(shape, dtype)
         ia = dpnp.array(a)
         if transpose:
             a = numpy.moveaxis(a, (-2, -1), (-1, -2))
@@ -1016,7 +1016,7 @@ def test_strided_vec_mat(self, dtype, func, incx, incy, transpose):
         else:
             s1 = shape[-1]
             s2 = shape[-2]
-        a = numpy.random.rand(*shape).astype(dtype)
+        a = generate_random_numpy_array(shape, dtype)
         ia = dpnp.array(a)
         if transpose:
             a = numpy.moveaxis(a, (-2, -1), (-1, -2))

From 2d41e151d2ebfe06c2f226835043dfb382a5536b Mon Sep 17 00:00:00 2001
From: Vahid Tavanashad <120411540+vtavana@users.noreply.github.com>
Date: Fri, 7 Feb 2025 13:45:15 -0600
Subject: [PATCH 6/7] Apply suggestions from code review

Co-authored-by: Anton <100830759+antonwolfy@users.noreply.github.com>
---
 dpnp/tests/test_product.py | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/dpnp/tests/test_product.py b/dpnp/tests/test_product.py
index 99c211970bf..ba305540632 100644
--- a/dpnp/tests/test_product.py
+++ b/dpnp/tests/test_product.py
@@ -1159,8 +1159,8 @@ def test_large_values(self, dtype):
     )
     def test_special_case(self, dt_out, shape1, shape2):
         # Although inputs are int, gemm will be used for calculation
-        a = numpy.ones(shape1, dtype=numpy.int8)
-        b = numpy.ones(shape2, dtype=numpy.int8)
+        a = generate_random_numpy_array(shape1, dtype=numpy.int8)
+        b = generate_random_numpy_array(shape2, dtype=numpy.int8)
         ia, ib = dpnp.array(a), dpnp.array(b)
 
         result = dpnp.matmul(ia, ib, dtype=dt_out)
@@ -1172,8 +1172,8 @@ def test_special_case(self, dt_out, shape1, shape2):
         assert_dtype_allclose(result, expected)
 
     def test_bool(self):
-        a = numpy.ones((3, 4), dtype=dpnp.bool)
-        b = numpy.ones((4, 5), dtype=dpnp.bool)
+        a = generate_random_numpy_array((3, 4), dtype=dpnp.bool)
+        b = generate_random_numpy_array((4, 5), dtype=dpnp.bool)
         ia, ib = dpnp.array(a), dpnp.array(b)
 
         # the output is (3, 5) array filled with 4

From 1edddbe5af69b4665c02d8860e0301f3e095498b Mon Sep 17 00:00:00 2001
From: Vahid Tavanashad
Date: Fri, 7 Feb 2025 13:13:33 -0800
Subject: [PATCH 7/7] increase timeout for windows

---
 .github/workflows/conda-package.yml   | 2 +-
 .github/workflows/cron-run-tests.yaml | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/.github/workflows/conda-package.yml b/.github/workflows/conda-package.yml
index 3ffe0c8418e..39f0cf21fa5 100644
--- a/.github/workflows/conda-package.yml
+++ b/.github/workflows/conda-package.yml
@@ -355,7 +355,7 @@ jobs:
         id: run_tests_win
         uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0
         with:
-          timeout_minutes: 15
+          timeout_minutes: 17
           max_attempts: ${{ env.RUN_TESTS_MAX_ATTEMPTS }}
           retry_on: any
           command: |
diff --git a/.github/workflows/cron-run-tests.yaml b/.github/workflows/cron-run-tests.yaml
index 942977ac439..18ce2ca632e 100644
--- a/.github/workflows/cron-run-tests.yaml
+++ b/.github/workflows/cron-run-tests.yaml
@@ -143,7 +143,7 @@ jobs:
         id: run_tests_win
         uses: nick-fields/retry@7152eba30c6575329ac0576536151aca5a72780e # v3.0.0
         with:
-          timeout_minutes: 15
+          timeout_minutes: 17
           max_attempts: ${{ env.RUN_TESTS_MAX_ATTEMPTS }}
           retry_on: any
           command: |
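
A minimal sketch (not part of the patch series) of the boolean special case that `_compute_res_dtype` implements above: when both inputs are boolean and a non-boolean `out` is given, the product is computed in boolean and only cast to the `out` dtype at the end, whereas passing `dtype=` casts the inputs before computing. The array values mirror `test_bool`, and the behavior shown assumes a dpnp build with these patches applied.

import dpnp

# Boolean inputs: (3, 4) @ (4, 5) -> (3, 5)
a = dpnp.ones((3, 4), dtype=dpnp.bool)
b = dpnp.ones((4, 5), dtype=dpnp.bool)

# dtype=int32 casts the inputs first; each element is a sum of
# four 1 * 1 products, so the result is filled with 4.
r1 = dpnp.matmul(a, b, dtype=dpnp.int32)

# A non-boolean out= keeps the computation in boolean (True) and
# casts only at the end, so the result is filled with 1.
out = dpnp.empty((3, 5), dtype=dpnp.int32)
r2 = dpnp.matmul(a, b, out=out)

print(int(r1[0, 0]), int(r2[0, 0]))  # expected: 4 1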