From a70d17a75f4debb4470805e779ae1f11c23eb272 Mon Sep 17 00:00:00 2001
From: Alan Kaptanoglu <alanakaptanoglu@gmail.com>
Date: Tue, 21 May 2024 21:15:01 -0400
Subject: [PATCH] Made sad attempt to xsimd the A_matrix calculation.
 Unfortunately it requires calculation using ellipe and ellipk (and is much
 faster than the direct calculation with biot savart so cant replace it with
 direct). Probably need to write a c++ function to manually calculate it and
 then simd_t all the terms in there. not sure this is worth it.

---
 src/simsopt/geo/wp_grid.py |   7 +-
 src/simsoptpp/psc.cpp      | 273 +++++++++++++++++++++++++------------
 2 files changed, 188 insertions(+), 92 deletions(-)
diff --git a/src/simsopt/geo/wp_grid.py b/src/simsopt/geo/wp_grid.py
index 4bea48baa..a2b59ad2b 100644
--- a/src/simsopt/geo/wp_grid.py
+++ b/src/simsopt/geo/wp_grid.py
@@ -613,6 +613,7 @@ def setup_currents_and_fields(self):
         # Need to rotate and flip it
         nn = self.num_wp
         q = 0
+        t1 = time.time()
         for fp in range(self.nfp):
             for stell in self.stell_list:
                 xyz = self.grid_xyz_all[q * nn: (q + 1) * nn, :]
@@ -629,7 +630,7 @@ def setup_currents_and_fields(self):
                 #     xyz, 
                 #     self.alphas_total[q * nn: (q + 1) * nn],
                 #     self.deltas_total[q * nn: (q + 1) * nn])
-                A_matrix += 2 * sopp.A_matrix(
+                A_matrix += sopp.A_matrix(
                     contig(xyz),
                     contig(self.plasma_points),
                     contig(self.alphas_total[q * nn: (q + 1) * nn]),
@@ -640,8 +641,10 @@ def setup_currents_and_fields(self):
                 q = q + 1
                 # print(A_matrix2, A_matrix)
                 # assert np.allclose(A_matrix2, A_matrix)
+        t2 = time.time()
+        print('Time = ',t2 - t1)
         self.A_matrix = A_matrix
-        self.Bn_WP = (A_matrix @ self.I * 1e6).reshape(-1) # missing factor of fac
+        self.Bn_WP = 2 * (A_matrix @ self.I * 1e6).reshape(-1) # missing factor of fac
     
     def setup_curves(self):
         """ 
diff --git a/src/simsoptpp/psc.cpp b/src/simsoptpp/psc.cpp
index 37ca322a7..bca5df6ee 100644
--- a/src/simsoptpp/psc.cpp
+++ b/src/simsoptpp/psc.cpp
@@ -351,8 +351,191 @@ Array dpsi_dkappa(Array& I_TF, Array& dl_TF, Array& gamma_TF, Array& PSC_points,
     return dpsi * fac;
 }
 
+Array A_matrix(Array& points, Array& plasma_points, Array& alphas, Array& deltas, Array& plasma_normal, double R)
+{     
+    // points shape should be (num_coils, 3)
+    // plasma_normal shape should be (num_plasma_points, 3)
+    // plasma_points should be (num_plasma_points, 3)
+    int num_coils = alphas.shape(0);  // shape should be (num_coils)
+    int num_plasma_points = plasma_points.shape(0);
+    
+    // this variable is the A matrix in the least-squares term so A * I = Bn
+    Array A = xt::zeros<double>({num_plasma_points, num_coils});
+    constexpr int simd_size = xsimd::simd_type<double>::size;
+    double R2 = R * R;
+    double fac = 2.0e-7;
+    using namespace boost::math;
+    
+    double* alphas_ptr = &(alphas(0));
+    double* deltas_ptr = &(deltas(0));
+    double* points_ptr = &(points(0, 0));
+    double* plasma_points_ptr = &(plasma_points(0, 0));
+    double* plasma_normal_ptr = &(plasma_normal(0, 0));
+    
+    #pragma omp parallel for schedule(static)
+    for(int j = 0; j < num_coils; j += simd_size) {
+        auto point_j = Vec3dSimd();
+        int klimit = std::min(simd_size, num_coils - j);
+        simd_t aj, dj;
+        for(int k = 0; k < klimit; k++){
+            for (int d = 0; d < 3; ++d) {
+                point_j[d][k] = points_ptr[3 * (j + k) + d];
+            }
+            aj[k] = alphas_ptr[j + k];
+            dj[k] = deltas_ptr[j + k];
+        }
+        simd_t cdj = xsimd::cos(dj);
+        simd_t caj = xsimd::cos(aj);
+        simd_t sdj = xsimd::sin(dj);
+        simd_t saj = xsimd::sin(aj);
+        for (int i = 0; i < num_plasma_points; i++) {
+            auto xp = Vec3dSimd(plasma_points_ptr[3 * i], \
+                                plasma_points_ptr[3 * i + 1], \
+                                plasma_points_ptr[3 * i + 2]);
+            auto np = Vec3dSimd(plasma_normal_ptr[3 * i], \
+                                plasma_normal_ptr[3 * i + 1], \
+                                plasma_normal_ptr[3 * i + 2]);
+            Vec3dSimd x0 = xp - point_j;
+            simd_t nxx = cdj;
+            simd_t nxy = sdj * saj;
+            simd_t nxz = sdj * caj;
+            simd_t nyx = ((simd_t) 0.0);
+            simd_t nyy = caj;
+            simd_t nyz = -saj;
+            simd_t nzx = -sdj;
+            simd_t nzy = cdj * saj;
+            simd_t nzz = cdj * caj;
+            auto x0rot = Vec3dSimd(x0.x * nxx + x0.y * nyx + x0.z * nzx, \
+                                   x0.x * nxy + x0.y * nyy + x0.z * nzy, \
+                                   x0.x * nxz + x0.y * nyz + x0.z * nzz);
+            simd_t x = x0rot.x;
+            simd_t y = x0rot.y;
+            simd_t z = x0rot.z;
+            simd_t rho2 = x * x + y * y;
+            simd_t r2 = rho2 + z * z;
+            simd_t rho = xsimd::sqrt(rho2);
+            simd_t R2_r2 = R2 + r2;
+            simd_t gamma2 = R2_r2 - 2.0 * R * rho;
+            simd_t beta2 = R2_r2 + 2.0 * R * rho;
+            simd_t beta = xsimd::sqrt(beta2);
+            auto k2 = 1.0 - gamma2 / beta2;
+            simd_t beta_gamma2 = beta * gamma2;
+            auto k = sqrt(k2);
+            auto ellipe = ellint_2(k);
+            auto ellipk = ellint_1(k);
+            simd_t Eplus = R2_r2 * ellipe - gamma2 * ellipk;
+            simd_t Eminus = (R2 - r2) * ellipe + gamma2 * ellipk;
+            auto B = Vec3dSimd(x * z * Eplus / (rho2 * beta_gamma2), \
+                               y * z * Eplus / (rho2 * beta_gamma2), \
+                               Eminus / beta_gamma2);
+            // Need to rotate the vector
+            auto B_rot = Vec3dSimd(B.x * nxx + B.y * nxy + B.z * nxz, \
+                                   B.x * nyx + B.y * nyy + B.z * nyz, \
+                                   B.x * nzx + B.y * nzy + B.z * nzz);
+            simd_t inner_prod = inner(B_rot, np);
+            for(int kk = 0; kk < klimit; kk++){
+                A(i, j) += inner_prod[kk];
+            }
+        }
+    }
+    return A;
+}
+
 #else
 
+Array A_matrix(Array& points, Array& plasma_points, Array& alphas, Array& deltas, Array& plasma_normal, double R)
+{
+    // warning: row_major checks below do NOT throw an error correctly on a compute node on Cori
+    if(points.layout() != xt::layout_type::row_major)
+          throw std::runtime_error("points needs to be in row-major storage order");
+    if(alphas.layout() != xt::layout_type::row_major)
+          throw std::runtime_error("alphas needs to be in row-major storage order");
+    if(deltas.layout() != xt::layout_type::row_major)
+          throw std::runtime_error("deltas needs to be in row-major storage order");
+    if(plasma_points.layout() != xt::layout_type::row_major)
+          throw std::runtime_error("plasma_points needs to be in row-major storage order");
+    if(plasma_normal.layout() != xt::layout_type::row_major)
+          throw std::runtime_error("plasma_normal needs to be in row-major storage order");
+          
+    // points shape should be (num_coils, 3)
+    // plasma_normal shape should be (num_plasma_points, 3)
+    // plasma_points should be (num_plasma_points, 3)
+    int num_coils = alphas.shape(0);  // shape should be (num_coils)
+    int num_plasma_points = plasma_points.shape(0);
+    
+    // this variable is the A matrix in the least-squares term so A * I = Bn
+    Array A = xt::zeros<double>({num_plasma_points, num_coils});
+    double R2 = R * R;
+    double fac = 2.0e-7;
+    using namespace boost::math;
+    
+    double* alphas_ptr = &(alphas(0));
+    double* deltas_ptr = &(deltas(0));
+    double* points_ptr = &(points(0, 0));
+    double* plasma_points_ptr = &(plasma_points(0, 0));
+    double* plasma_normal_ptr = &(plasma_normal(0, 0));
+    
+    #pragma omp parallel for schedule(static)
+    for(int j = 0; j < num_coils; j++) {
+        auto cdj = cos(deltas_ptr[j]);
+        auto caj = cos(alphas_ptr[j]);
+        auto sdj = sin(deltas_ptr[j]);
+        auto saj = sin(alphas_ptr[j]);
+        auto xj = points_ptr[3 * j];
+        auto yj = points_ptr[3 * j + 1];
+        auto zj = points_ptr[3 * j + 2];
+        for (int i = 0; i < num_plasma_points; ++i) {
+            auto xp = plasma_points_ptr[3 * i];
+            auto yp = plasma_points_ptr[3 * i + 1];
+            auto zp = plasma_points_ptr[3 * i + 2];
+            auto nx = plasma_normal_ptr[3 * i];
+            auto ny = plasma_normal_ptr[3 * i + 1];
+            auto nz = plasma_normal_ptr[3 * i + 2];
+            auto x0 = (xp - xj);
+            auto y0 = (yp - yj);
+            auto z0 = (zp - zj);
+            auto nxx = cdj;
+            auto nxy = sdj * saj;
+            auto nxz = sdj * caj;
+            auto nyx = 0.0;
+            auto nyy = caj;
+            auto nyz = -saj;
+            auto nzx = -sdj;
+            auto nzy = cdj * saj;
+            auto nzz = cdj * caj;
+            auto x = x0 * nxx + y0 * nyx + z0 * nzx;
+            auto y = x0 * nxy + y0 * nyy + z0 * nzy;
+            auto z = x0 * nxz + y0 * nyz + z0 * nzz;
+            auto rho2 = x * x + y * y;
+            auto r2 = rho2 + z * z;
+            auto rho = sqrt(rho2);
+            auto R2_r2 = R2 + r2;
+            auto gamma2 = R2_r2 - 2.0 * R * rho;
+            auto beta2 = R2_r2 + 2.0 * R * rho;
+            auto beta = sqrt(beta2);
+            auto k2 = 1.0 - gamma2 / beta2;
+            auto beta_gamma2 = beta * gamma2;
+            auto k = sqrt(k2);
+            auto ellipe = ellint_2(k);
+            auto ellipk = ellint_1(k);
+            auto Eplus = R2_r2 * ellipe - gamma2 * ellipk;
+            auto Eminus = (R2 - r2) * ellipe + gamma2 * ellipk;
+            auto Bx = x * z * Eplus / (rho2 * beta_gamma2);
+            auto By = y * z * Eplus / (rho2 * beta_gamma2);
+            auto Bz = Eminus / beta_gamma2;
+            // Need to rotate the vector
+            auto Bx_rot = Bx * nxx + By * nxy + Bz * nxz;
+            auto By_rot = Bx * nyx + By * nyy + Bz * nyz;
+            auto Bz_rot = Bx * nzx + By * nzy + Bz * nzz;
+            A(i, j) = Bx_rot * nx + By_rot * ny + Bz_rot * nz;
+        }
+    }
+    return A;
+}
+
+
+// Todo: write your own C++ code for ellipe/ellipk so we can XSIMD it...? 
+
 Array dpsi_dkappa(Array& I_TF, Array& dl_TF, Array& gamma_TF, Array& PSC_points, Array& alphas, Array& deltas, Array& coil_normals, Array& rho, Array& phi, double R)
 {
     // warning: row_major checks below do NOT throw an error correctly on a compute node on Cori
@@ -780,96 +963,6 @@ Array flux_integration(Array& B, Array& rho, Array& normal)
     return Psi;
 }
 
-Array A_matrix(Array& points, Array& plasma_points, Array& alphas, Array& deltas, Array& plasma_normal, double R)
-{
-    // warning: row_major checks below do NOT throw an error correctly on a compute node on Cori
-    if(points.layout() != xt::layout_type::row_major)
-          throw std::runtime_error("points needs to be in row-major storage order");
-    if(alphas.layout() != xt::layout_type::row_major)
-          throw std::runtime_error("alphas needs to be in row-major storage order");
-    if(deltas.layout() != xt::layout_type::row_major)
-          throw std::runtime_error("deltas needs to be in row-major storage order");
-    if(plasma_points.layout() != xt::layout_type::row_major)
-          throw std::runtime_error("plasma_points needs to be in row-major storage order");
-    if(plasma_normal.layout() != xt::layout_type::row_major)
-          throw std::runtime_error("plasma_normal needs to be in row-major storage order");
-          
-    // points shape should be (num_coils, 3)
-    // plasma_normal shape should be (num_plasma_points, 3)
-    // plasma_points should be (num_plasma_points, 3)
-    int num_coils = alphas.shape(0);  // shape should be (num_coils)
-    int num_plasma_points = plasma_points.shape(0);
-    
-    // this variable is the A matrix in the least-squares term so A * I = Bn
-    Array A = xt::zeros<double>({num_plasma_points, num_coils});
-    double R2 = R * R;
-    double fac = 2.0e-7;
-    using namespace boost::math;
-    
-    double* alphas_ptr = &(alphas(0));
-    double* deltas_ptr = &(deltas(0));
-    double* points_ptr = &(points(0, 0));
-    double* plasma_points_ptr = &(plasma_points(0, 0));
-    double* plasma_normal_ptr = &(plasma_normal(0, 0));
-    
-    #pragma omp parallel for schedule(static)
-    for(int j = 0; j < num_coils; j++) {
-        auto cdj = cos(deltas_ptr[j]);
-        auto caj = cos(alphas_ptr[j]);
-        auto sdj = sin(deltas_ptr[j]);
-        auto saj = sin(alphas_ptr[j]);
-        auto xj = points_ptr[3 * j];
-        auto yj = points_ptr[3 * j + 1];
-        auto zj = points_ptr[3 * j + 2];
-        for (int i = 0; i < num_plasma_points; ++i) {
-            auto xp = plasma_points_ptr[3 * i];
-            auto yp = plasma_points_ptr[3 * i + 1];
-            auto zp = plasma_points_ptr[3 * i + 2];
-            auto nx = plasma_normal_ptr[3 * i];
-            auto ny = plasma_normal_ptr[3 * i + 1];
-            auto nz = plasma_normal_ptr[3 * i + 2];
-            auto x0 = (xp - xj);
-            auto y0 = (yp - yj);
-            auto z0 = (zp - zj);
-            auto nxx = cdj;
-            auto nxy = sdj * saj;
-            auto nxz = sdj * caj;
-            auto nyx = 0.0;
-            auto nyy = caj;
-            auto nyz = -saj;
-            auto nzx = -sdj;
-            auto nzy = cdj * saj;
-            auto nzz = cdj * caj;
-            auto x = x0 * nxx + y0 * nyx + z0 * nzx;
-            auto y = x0 * nxy + y0 * nyy + z0 * nzy;
-            auto z = x0 * nxz + y0 * nyz + z0 * nzz;
-            auto rho2 = x * x + y * y;
-            auto r2 = rho2 + z * z;
-            auto rho = sqrt(rho2);
-            auto R2_r2 = R2 + r2;
-            auto gamma2 = R2_r2 - 2.0 * R * rho;
-            auto beta2 = R2_r2 + 2.0 * R * rho;
-            auto beta = sqrt(beta2);
-            auto k2 = 1.0 - gamma2 / beta2;
-            auto beta_gamma2 = beta * gamma2;
-            auto k = sqrt(k2);
-            auto ellipe = ellint_2(k);
-            auto ellipk = ellint_1(k);
-            auto Eplus = R2_r2 * ellipe - gamma2 * ellipk;
-            auto Eminus = (R2 - r2) * ellipe + gamma2 * ellipk;
-            auto Bx = x * z * Eplus / (rho2 * beta_gamma2);
-            auto By = y * z * Eplus / (rho2 * beta_gamma2);
-            auto Bz = Eminus / beta_gamma2;
-            // Need to rotate the vector
-            auto Bx_rot = Bx * nxx + By * nxy + Bz * nxz;
-            auto By_rot = Bx * nyx + By * nyy + Bz * nyz;
-            auto Bz_rot = Bx * nzx + By * nzy + Bz * nzz;
-            A(i, j) = Bx_rot * nx + By_rot * ny + Bz_rot * nz;
-        }
-    }
-    return A;
-}
-
 Array B_PSC(Array& points, Array& plasma_points, Array& alphas, Array& deltas, Array& psc_currents, double R)
 {
     // warning: row_major checks below do NOT throw an error correctly on a compute node on Cori