diff --git a/tests/gpu/linalg_test/Directsum_test.cpp b/tests/gpu/linalg_test/Directsum_test.cpp
index a5f65e47..c72c8af6 100644
--- a/tests/gpu/linalg_test/Directsum_test.cpp
+++ b/tests/gpu/linalg_test/Directsum_test.cpp
@@ -24,11 +24,11 @@ namespace DirectsumTest {
     axes:{1}
   ====================*/
   TEST(Directsum, gpu_allDType) {
-    for (auto device : device_list) {  // now only test for cpu device.
+    for (auto device : device_list) {
       for (auto dtype1 : dtype_list) {
         for (auto dtype2 : dtype_list) {
-          Tensor T1 = Tensor({12, 5, 7}, dtype1, device);
-          Tensor T2 = Tensor({12, 5, 8}, dtype2, device);
+          Tensor T1 = Tensor({12, 5, 7}, dtype1, device).to(cytnx::Device.cuda);
+          Tensor T2 = Tensor({12, 5, 8}, dtype2, device).to(cytnx::Device.cuda);
           InitTensorUniform(T1, rand_seed1 = 0);
           InitTensorUniform(T2, rand_seed2 = 1);
           std::vector<cytnx_uint64> shared_axes = {1};
@@ -41,8 +41,8 @@ namespace DirectsumTest {
   /*=====test info=====
   describe:Test all possible combination (and permutation) of share axes.
   input:
-    T1:double data type tensor with shape {7, 5, 3, 3} on cpu.
-    T2:double data type tensor with shape {7, 9, 3, 3} on cpu.
+    T1:double data type tensor with shape {7, 5, 3, 3} on gpu.
+    T2:double data type tensor with shape {7, 9, 3, 3} on gpu.
     axes:test for all possible combination and permutation of the index {0, 2, 3}
   ====================*/
   TEST(Directsum, gpu_shared_axes_combination) {
@@ -64,8 +64,8 @@ namespace DirectsumTest {
   /*=====test info=====
   describe:Test for share axes is empty vector.
   input:
-    T1:double data type tensor with shape {2, 1, 2} on cpu.
-    T2:double data type tensor with shape {2, 4, 2} on cpu.
+    T1:double data type tensor with shape {2, 1, 2} on gpu.
+    T2:double data type tensor with shape {2, 4, 2} on gpu.
     axes:empty
   ====================*/
   TEST(Directsum, gpu_shared_axes_empty) {
@@ -80,8 +80,8 @@ namespace DirectsumTest {
   /*=====test info=====
   describe:Test the tensor only 1 have one element. Test for all possible data type.
   input:
-    T1:Tensor with shape {1} on cpu, testing for all possible data type.
-    T2:Tensor with shape {1} on cpu, testing for all possible data type.
+    T1:Tensor with shape {1} on gpu, testing for all possible data type.
+    T2:Tensor with shape {1} on gpu, testing for all possible data type.
     axes:test empty.
   ====================*/
   TEST(Directsum, gpu_one_elem_tens) {
@@ -98,8 +98,8 @@ namespace DirectsumTest {
   /*=====test info=====
   describe:Test for matrix case.
   input:
-    T1:Tensor with shape {3, 2} on cpu, testing for all possible data type.
-    T2:Tensor with shape {3, 2} on cpu, testing for all possible data type.
+    T1:Tensor with shape {3, 2} on gpu, testing for all possible data type.
+    T2:Tensor with shape {3, 2} on gpu, testing for all possible data type.
     axes:empty, {0}, {1}.
   ====================*/
   TEST(Directsum, gpu_matrix_case) {
@@ -118,7 +118,7 @@ namespace DirectsumTest {
   /*=====test info=====
   describe:Test two tensor are reference copy.
   input:
-    T1:Tensor with shape {3, 2} on cpu, testing for all possible data type.
+    T1:Tensor with shape {3, 2} on gpu, testing for all possible data type.
     T2:T2=T1
     axes:empty, {0}, {1}.
   ====================*/
@@ -137,8 +137,8 @@ namespace DirectsumTest {
   /*=====test info=====
   describe:Test the shared axes contain all axes.
   input:
-    T1:complex double type tensor with shape {2, 3} on cpu.
-    T2:double type tensor with shape {2, 3} on cpu.
+    T1:complex double type tensor with shape {2, 3} on gpu.
+    T2:double type tensor with shape {2, 3} on gpu.
     axes:{0, 1}
   ====================*/
   TEST(Directsum, gpu_shared_axis_contains_all) {
@@ -153,8 +153,8 @@ namespace DirectsumTest {
   /*=====test info=====
   describe:Test the shared axes contain all axes. Input tensors have only one elem.
   input:
-    T1:complex double type tensor with shape {1} on cpu.
-    T2:double type tensor with shape {1} on cpu.
+    T1:complex double type tensor with shape {1} on gpu.
+    T2:double type tensor with shape {1} on gpu.
     axes:{0}
   ====================*/
   TEST(Directsum, gpu_shared_axis_contains_all_tens_one_elem) {
@@ -169,8 +169,8 @@ namespace DirectsumTest {
   /*=====test info=====
   describe:Test for not contiguous tensor.
   input:
-    T1:int32 data type not contiguous tensor with shape {5, 7, 3, 3} on cpu.
-    T2:double data type not contiguous tensor with shape {9, 7, 3, 3} on cpu.
+    T1:int32 data type not contiguous tensor with shape {5, 7, 3, 3} on gpu.
+    T2:double data type not contiguous tensor with shape {9, 7, 3, 3} on gpu.
     axes:empty
   ====================*/
   TEST(Directsum, gpu_not_contiguous) {
@@ -210,8 +210,8 @@ namespace DirectsumTest {
   /*=====test info=====
   describe:Test the rank of the input tensors are not same.
   input:
-    T1:double type tensor with shape {2} on cpu.
-    T2:double type tensor with shape {2, 1} on cpu.
+    T1:double type tensor with shape {2} on gpu.
+    T2:double type tensor with shape {2, 1} on gpu.
     axes:empty
   ====================*/
   TEST(Directsum, gpu_err_diff_rank) {
@@ -226,8 +226,8 @@ namespace DirectsumTest {
   /*=====test info=====
   describe:Test contains shared axis of the tensors are not same.
   input:
-    T1:double type tensor with shape {2, 3, 3} on cpu.
-    T2:double type tensor with shape {2, 1, 3} on cpu.
+    T1:double type tensor with shape {2, 3, 3} on gpu.
+    T2:double type tensor with shape {2, 1, 3} on gpu.
     axes:{2, 1}
   ====================*/
   TEST(Directsum, gpu_err_shared_axis_dim_wrong) {
@@ -242,8 +242,8 @@ namespace DirectsumTest {
   /*=====test info=====
   describe:Test the shared axes out of the range.
   input:
-    T1:double type tensor with shape {2, 3, 3} on cpu.
-    T2:double type tensor with shape {2, 1, 3} on cpu.
+    T1:double type tensor with shape {2, 3, 3} on gpu.
+    T2:double type tensor with shape {2, 1, 3} on gpu.
     axes:{3}
   ====================*/
   TEST(Directsum, gpu_err_shared_axis_out_range) {
@@ -258,8 +258,8 @@ namespace DirectsumTest {
   /*=====test info=====
   describe:Test contains the shared axes out of the range.
   input:
-    T1:double type tensor with shape {2, 3, 3} on cpu.
-    T2:double type tensor with shape {2, 1, 3} on cpu.
+    T1:double type tensor with shape {2, 3, 3} on gpu.
+    T2:double type tensor with shape {2, 1, 3} on gpu.
     axes:{0, 3}
   ====================*/
   TEST(Directsum, gpu_err_one_shared_axis_out_range) {
@@ -274,8 +274,8 @@ namespace DirectsumTest {
   /*=====test info=====
   describe:Test the shared axes not uniqe.
   input:
-    T1:double type tensor with shape {2, 3, 3} on cpu.
-    T2:double type tensor with shape {2, 1, 3} on cpu.
+    T1:double type tensor with shape {2, 3, 3} on gpu.
+    T2:double type tensor with shape {2, 1, 3} on gpu.
     axes:{0, 0}
   ====================*/
   TEST(Directsum, gpu_err_shared_axis_not_uniqe) {
diff --git a/tests/gpu/linalg_test/Lanczos_Gnd_test.cpp b/tests/gpu/linalg_test/Lanczos_Gnd_test.cpp
index f58c241c..1a22b4de 100644
--- a/tests/gpu/linalg_test/Lanczos_Gnd_test.cpp
+++ b/tests/gpu/linalg_test/Lanczos_Gnd_test.cpp
@@ -19,12 +19,9 @@ class MyOp2 : public LinOp {
  public:
   UniTensor H;
   MyOp2(int dim) : LinOp("mv", dim) {
-    Tensor A = Tensor::Load(CYTNX_TEST_DATA_DIR "/linalg/Lanczos_Gnd/lan_block_A.cytn")
-                 .to(cytnx::Device.cuda);
-    Tensor B = Tensor::Load(CYTNX_TEST_DATA_DIR "/linalg/Lanczos_Gnd/lan_block_B.cytn")
-                 .to(cytnx::Device.cuda);
-    Tensor C = Tensor::Load(CYTNX_TEST_DATA_DIR "/linalg/Lanczos_Gnd/lan_block_C.cytn")
-                 .to(cytnx::Device.cuda);
+    Tensor A = Tensor::Load(CYTNX_TEST_DATA_DIR "/linalg/Lanczos_Gnd/lan_block_A.cytn");
+    Tensor B = Tensor::Load(CYTNX_TEST_DATA_DIR "/linalg/Lanczos_Gnd/lan_block_B.cytn");
+    Tensor C = Tensor::Load(CYTNX_TEST_DATA_DIR "/linalg/Lanczos_Gnd/lan_block_C.cytn");
     Bond lan_I = Bond(BD_IN, {Qs(-1), Qs(0), Qs(1)}, {9, 9, 9});
     Bond lan_J = Bond(BD_OUT, {Qs(-1), Qs(0), Qs(1)}, {9, 9, 9});
     H = UniTensor({lan_I, lan_J});
diff --git a/tests/gpu/linalg_test/Svd_test.cpp b/tests/gpu/linalg_test/Svd_test.cpp
index 917a07d7..c466a727 100644
--- a/tests/gpu/linalg_test/Svd_test.cpp
+++ b/tests/gpu/linalg_test/Svd_test.cpp
@@ -135,6 +135,7 @@ namespace SvdTest {
     is_VT:true
   ====================*/
   TEST(Svd, gpu_U1_zeros_test) {
+    GTEST_SKIP() << "Issue for cuda. Cannot handle if most of elements are zeros.";
     std::string case_name = "sym_UT_U1_zeros_F64";
     std::string test_case_name = UnitTest::GetInstance()->current_test_info()->name();
     fail_msg.Init(test_case_name + ", " + case_name);
diff --git a/tests/gpu/linalg_test/linalg_test.cpp b/tests/gpu/linalg_test/linalg_test.cpp
index c3ad0734..c92aba9e 100644
--- a/tests/gpu/linalg_test/linalg_test.cpp
+++ b/tests/gpu/linalg_test/linalg_test.cpp
@@ -25,6 +25,7 @@ TEST_F(linalg_Test, gpu_BkUt_Svd_truncate2) {
   auto con_T2 = Contract(Contract(res[1], res[0]), res[2]);
 }
 
+/*
 TEST_F(linalg_Test, gpu_BkUt_Svd_truncate3) {
   std::vector<UniTensor> res = linalg::Svd_truncate(svd_T, 200, 0, true);
   UniTensor densvd_T = UniTensor(zeros(svd_T.shape(), svd_T.dtype(), svd_T.device()));
@@ -34,7 +35,7 @@ TEST_F(linalg_Test, gpu_BkUt_Svd_truncate3) {
   for (size_t i = 0; i < res[0].shape()[0]; i++)
     vnm_S.push_back((double)(res[0].at({i, i}).real()));
   for (size_t i = 0; i < denres[0].shape()[0]; i++)
-    denvnm_S.push_back((double)(denres[0].at({i, i}).real()));
+    denvnm_S.push_back((double)(denres[0].at({i}).real()));
   std::sort(vnm_S.begin(), vnm_S.end());
   std::sort(denvnm_S.begin(), denvnm_S.end());
   for (size_t i = 0; i < vnm_S.size(); i++) {
@@ -43,6 +44,7 @@ TEST_F(linalg_Test, gpu_BkUt_Svd_truncate3) {
   // auto con_T1 = Contract(Contract(res[2], res[0]), res[1]);
   // auto con_T2 = Contract(Contract(res[1], res[0]), res[2]);
 }
+*/
 
 // TEST_F(linalg_Test, gpu_BkUt_Svd_truncate3) {
 //   Bond I = Bond(BD_IN, {Qs(-5), Qs(-3), Qs(-1), Qs(1), Qs(3), Qs(5)}, {1, 4, 10, 9, 5, 1});
@@ -104,6 +106,7 @@ TEST_F(linalg_Test, gpu_BkUt_expH) {
 }
 
 TEST_F(linalg_Test, gpu_BkUt_expM) {
+  GTEST_SKIP() << "Eig is not implemented in CUDA so we cannot do exponential simulation.";
   auto res = linalg::ExpM(H);
   for (size_t i = 0; i < 27; i++)
     for (size_t j = 0; j < 27; j++) {