From ffd9707e155d4096cfa2c187f3342c71c345ad75 Mon Sep 17 00:00:00 2001
From: Alan Kaptanoglu <alanakaptanoglu@gmail.com>
Date: Thu, 23 May 2024 12:13:30 -0400
Subject: [PATCH] Think I have the derivative calculated correctly, at least
 for the QA discrete symmetries. Unit tests passing and WP optimization
 running well. Jacobian calculation still more expensive than I would like
 since xsimd seems not to give speedup. Saving and switching to laptop to see
 if intel gives the speedup I would expect.

---
 examples/2_Intermediate/QA_psc_example.py |   2 +-
 examples/2_Intermediate/QA_wp_example.py  |  26 +-
 src/simsopt/geo/psc_grid.py               |  74 ++-
 src/simsopt/geo/wp_grid.py                |  18 +-
 src/simsoptpp/psc.cpp                     | 601 +++++++++----------
 src/simsoptpp/psc.h                       |   2 +
 src/simsoptpp/python.cpp                  |   1 +
 tests/geo/test_psc_grid.py                | 667 ++++++++++++----------
 8 files changed, 716 insertions(+), 675 deletions(-)

diff --git a/examples/2_Intermediate/QA_psc_example.py b/examples/2_Intermediate/QA_psc_example.py
index afe6616a8..aacc28e1b 100644
--- a/examples/2_Intermediate/QA_psc_example.py
+++ b/examples/2_Intermediate/QA_psc_example.py
@@ -227,7 +227,7 @@ def fun(dofs):
 make_Bnormal_plots(bs, s_plot, out_dir, "biot_savart_TF_optimized", B_axis)
 
 # Finally, initialize the psc class
-kwargs_geo = {"Nx": 5, "out_dir": out_str, 
+kwargs_geo = {"Nx": 4, "out_dir": out_str, 
               "random_initialization": True, "poff": poff,}
               # "interpolated_field": True} 
 psc_array = PSCgrid.geo_setup_between_toroidal_surfaces(
diff --git a/examples/2_Intermediate/QA_wp_example.py b/examples/2_Intermediate/QA_wp_example.py
index 08fe1a61b..06d178e2f 100644
--- a/examples/2_Intermediate/QA_wp_example.py
+++ b/examples/2_Intermediate/QA_wp_example.py
@@ -57,15 +57,15 @@ def coil_optimization_QA(s, bs, base_curves, curves, out_dir=''):
     ncoils = len(base_curves)
 
     # Weight on the curve lengths in the objective function:
-    LENGTH_WEIGHT = 1e-4
+    LENGTH_WEIGHT = 1e1
 
     # Threshold and weight for the coil-to-coil distance penalty in the objective function:
     CC_THRESHOLD = 0.2
     CC_WEIGHT = 1e1
 
     # Threshold and weight for the coil-to-surface distance penalty in the objective function:
-    CS_THRESHOLD = 0.5
-    CS_WEIGHT = 1
+    CS_THRESHOLD = 2.0
+    CS_WEIGHT = 1e-2
 
     # Threshold and weight for the curvature penalty in the objective function:
     CURVATURE_THRESHOLD = 10
@@ -144,14 +144,14 @@ def initialize_coils_qa(TEST_DIR, s, out_dir=''):
     from simsopt.geo import curves_to_vtk
     # generate planar TF coils
     ncoils = 1
-    R0 = 1.0
-    R1 = 0.6
+    R0 = s.get_rc(0, 0)
+    R1 = s.get_rc(1, 0) * 4
     order = 5
 
     # qa needs to be scaled to 0.1 T on-axis magnetic field strength
     from simsopt.mhd.vmec import Vmec
-    vmec_file = 'wout_LandremanPaul2021_QA_lowres.nc'
-    total_current = Vmec(TEST_DIR / vmec_file).external_current() / (2 * s.nfp) / 7.131
+    vmec_file = 'wout_LandremanPaul2021_QA_reactorScale_lowres_reference.nc'
+    total_current = Vmec(TEST_DIR / vmec_file).external_current() / (2 * s.nfp) / 7.131 * 6
     base_curves = create_equally_spaced_curves(ncoils, s.nfp, stellsym=True, R0=R0, R1=R1, order=order, numquadpoints=128)
     base_currents = [(Current(total_current / ncoils * 1e-5) * 1e5) for _ in range(ncoils)]
     base_currents[0].fix_all()
@@ -184,11 +184,11 @@ def initialize_coils_qa(TEST_DIR, s, out_dir=''):
     quadpoints_phi = np.linspace(0, 1, qphi, endpoint=True)
     quadpoints_theta = np.linspace(0, 1, ntheta * 2, endpoint=True)
 
-poff = 0.2  # WP grid will be offset 'poff' meters from the plasma surface
-coff = 0.1  # WP grid will be initialized between 1 m and 2 m from plasma
+poff = 1.0  # WP grid will be offset 'poff' meters from the plasma surface
+coff = 1.2  # WP grid will be initialized between 1 m and 2 m from plasma
 
 # Read in the plasma equilibrium file
-input_name = 'input.LandremanPaul2021_QA_lowres'
+input_name = 'input.LandremanPaul2021_QA_reactorScale_lowres'
 TEST_DIR = (Path(__file__).parent / ".." / ".." / "tests" / "test_files").resolve()
 surface_filename = TEST_DIR / input_name
 range_param = 'half period'
@@ -246,7 +246,7 @@ def initialize_coils_qa(TEST_DIR, s, out_dir=''):
 # fix all the coil shapes so only the currents are optimized
 # for i in range(ncoils):
 #     base_curves[i].fix_all()
-# bs = coil_optimization_QA(s, bs, base_curves, curves, out_dir)
+bs = coil_optimization_QA(s, bs, base_curves, curves, out_dir)
 curves_to_vtk(curves, out_dir / "TF_coils", close=True)
 bs.set_points(s.gamma().reshape((-1, 3)))
 Bnormal = np.sum(bs.B().reshape((nphi, ntheta, 3)) * s.unitnormal(), axis=2)
@@ -300,7 +300,7 @@ def initialize_coils_qa(TEST_DIR, s, out_dir=''):
 for i in range(wp_array.num_wp):
     opt_bounds.append((None, None))
 opt_bounds = tuple(opt_bounds)
-options = {"disp": True, "maxiter": 300, "iprint": 101}  #, "maxls": 100}
+options = {"disp": True, "maxiter": 500, "iprint": 101}  #, "maxls": 100}
 # print(opt_bounds)
 verbose = True
 
@@ -311,7 +311,7 @@ def initialize_coils_qa(TEST_DIR, s, out_dir=''):
                  "coils_TF" : coils
                  }
 t1 = time.time()
-I_threshold = 1e4
+I_threshold = 5e5
 STLSQ_max_iters = 10
 BdotN2_list = []
 num_wps = []
diff --git a/src/simsopt/geo/psc_grid.py b/src/simsopt/geo/psc_grid.py
index 9d344c3f1..2d7dbebe9 100644
--- a/src/simsopt/geo/psc_grid.py
+++ b/src/simsopt/geo/psc_grid.py
@@ -32,7 +32,7 @@ def __init__(self):
         self.BdotN2_list = []
         # Define a set of quadrature points and weights for the N point
         # Gaussian quadrature rule
-        num_quad = 80
+        num_quad = 20
         (quad_points_phi, 
           self.quad_weights_phi) = np.polynomial.legendre.leggauss(num_quad)
         self.quad_points_phi = quad_points_phi * np.pi + np.pi
@@ -312,6 +312,7 @@ def geo_setup_between_toroidal_surfaces(
         # Normalization of the ||A*Linv*psi - b||^2 objective 
         # representing Bnormal errors on the plasma surface
         psc_grid.normalization = B_axis ** 2 * psc_grid.plasma_boundary.area()
+        psc_grid.fac2_norm = psc_grid.fac ** 2 / psc_grid.normalization
         psc_grid.B_TF = B_TF
 
         # Random or B_TF aligned initialization of the coil orientations
@@ -460,7 +461,7 @@ def geo_setup_manual(
                                      (np.random.rand(
                                          psc_grid.num_psc) - 0.5) * 2 * np.pi
         )
-        N = 20  # Must be larger for some unit tests to converge
+        N = 50  # Must be larger for some unit tests to converge
         psc_grid.phi = np.linspace(0, 2 * np.pi, N, endpoint=False)
         psc_grid.dphi = psc_grid.phi[1] - psc_grid.phi[0]
         psc_grid.rho = np.linspace(0, R, N, endpoint=False)
@@ -512,18 +513,17 @@ def geo_setup_manual(
         # qa scaled to 0.1 T on-axis magnetic field strength?
         total_current = 1e6
         base_curves = create_equally_spaced_curves(
-            ncoils, psc_grid.plasma_boundary.nfp, 
-            stellsym=psc_grid.plasma_boundary.stellsym, 
+            ncoils, 1,  #psc_grid.plasma_boundary.nfp, 
+            stellsym=False,  #psc_grid.plasma_boundary.stellsym, 
             R0=R0, R1=R1, order=order, numquadpoints=128
         )
         base_currents = [(Current(total_current / ncoils * 1e-5) * 1e5) for _ in range(ncoils)]
-        I_TF_total = total_current
         total_current = Current(total_current)
         total_current.fix_all()
         default_coils = coils_via_symmetries(
             base_curves, base_currents, 
-            psc_grid.plasma_boundary.nfp, 
-            psc_grid.plasma_boundary.stellsym
+            1,  #psc_grid.plasma_boundary.nfp, 
+            False,  #psc_grid.plasma_boundary.stellsym
         )
         # fix all the coil shapes so only the currents are optimized
         for i in range(ncoils):
@@ -564,6 +564,7 @@ def geo_setup_manual(
         # Normalization of the ||A*Linv*psi - b||^2 objective 
         # representing Bnormal errors on the plasma surface
         psc_grid.normalization = B_axis ** 2 * psc_grid.plasma_boundary.area()
+        psc_grid.fac2_norm = psc_grid.fac ** 2 / psc_grid.normalization
         psc_grid.B_TF = B_TF
         
         # Order of the coils. For unit tests, needs > 400
@@ -634,10 +635,10 @@ def setup_currents_and_fields(self):
                     contig(self.deltas_total[q * nn: (q + 1) * nn]),
                     contig(self.plasma_unitnormals),
                     self.R,
-                )  # accounts for sign change of the currents
+                )  # accounts for sign change of the currents (does it?)
                 q = q + 1
         self.A_matrix = 2 * A_matrix
-        self.Bn_PSC = (self.A_matrix @ self.I).reshape(-1) 
+        self.Bn_PSC = (self.A_matrix @ self.I).reshape(-1) # need minus signs?
     
     def setup_curves(self):
         """ 
@@ -796,7 +797,7 @@ def least_squares(self, kappas, verbose=False):
         deltas = kappas[len(kappas) // 2:]
         self.setup_orientations(alphas, deltas)
         Ax_b = (self.Bn_PSC + self.b_opt) * self.grid_normalization
-        BdotN2 = 0.5 * Ax_b.T @ Ax_b / self.normalization * self.fac ** 2
+        BdotN2 = 0.5 * Ax_b.T @ Ax_b * self.fac2_norm
         # outstr = f"Normalized f_B = {BdotN2:.3e} "
         # for i in range(len(kappas)):
         #     outstr += f"kappas[{i:d}] = {kappas[i]:.2e} "
@@ -829,7 +830,7 @@ def least_squares_jacobian(self, kappas, verbose=False):
         deltas = kappas[len(kappas) // 2:]
         self.setup_orientations(alphas, deltas)
         # Two factors of grid normalization since it is not added to the gradients
-        Ax_b = (self.Bn_PSC + self.b_opt) / self.normalization * self.grid_normalization
+        Ax_b = (self.Bn_PSC + self.b_opt) * self.grid_normalization
         # t1 = time.time()
         A_deriv = self.A_deriv()
         Linv = self.L_inv[:self.num_psc, :self.num_psc]
@@ -855,13 +856,14 @@ def least_squares_jacobian(self, kappas, verbose=False):
         grad_alpha3 = self.A_matrix @ (Linv * psi_deriv[:self.num_psc])
         grad_delta3 = self.A_matrix @ (Linv * psi_deriv[self.num_psc:])
         grad_kappa3 = np.hstack((grad_alpha3, grad_delta3))
+        grad = self.grid_normalization[:, None] * (grad_kappa1 + grad_kappa2 + grad_kappa3)
         # t2 = time.time()
         # print('grad_A3 time = ', t2 - t1, grad_alpha3.shape, grad_kappa3.shape)
         # if verbose:
             # print('Jacobian = ', Ax_b @ (grad_kappa1 + grad_kappa2 + grad_kappa3))
         # minus sign in front because I = -Linv @ psi
-        jac = -Ax_b.T @ (grad_kappa1 + grad_kappa2 + grad_kappa3)
-        return jac
+        jac = -Ax_b.T @ grad
+        return jac * self.fac2_norm
     
     def A_deriv(self):
         """
@@ -880,7 +882,7 @@ def A_deriv(self):
         q = 0
         for fp in range(self.nfp):
             for stell in self.stell_list:
-                dA_dkappa += sopp.dA_dkappa(
+                dA = sopp.dA_dkappa(
                     contig(self.grid_xyz_all[q * nn: (q + 1) * nn, :]),
                     contig(self.plasma_points),
                     contig(self.alphas_total[q * nn: (q + 1) * nn]),
@@ -890,6 +892,14 @@ def A_deriv(self):
                     contig(self.quad_weights_phi),
                     self.R,
                 )
+                dA_dkappa[:, :self.num_psc] += dA[:, :self.num_psc] * (-1) ** fp
+                dA_dkappa[:, self.num_psc:] += dA[:, self.num_psc:] * (-1) ** fp * stell
+                # print(fp, stell, dA[0, 0], self.grid_xyz_all[q * nn: q * nn + 1, :])
+                # print(self.plasma_points[0, :], 
+                #       self.alphas_total[q * nn: q * nn + 1],
+                #       self.deltas_total[q * nn: q * nn + 1],
+                #       self.plasma_unitnormals[0, :],
+                #       )
                 q = q + 1
         return dA_dkappa * np.pi # rescale by pi for gauss-leg quadrature
     
@@ -940,7 +950,7 @@ def psi_deriv(self):
         q = 0
         for fp in range(self.nfp):
             for stell in self.stell_list:
-                psi_deriv = sopp.dpsi_dkappa(
+                psi_deriv += sopp.dpsi_dkappa(
                     contig(self.I_TF),
                     contig(self.dl_TF),
                     contig(self.gamma_TF),
@@ -1020,6 +1030,8 @@ def setup_orientations(self, alphas, deltas):
             contig(self.rho),
             contig(self.coil_normals),
         )
+        # print('B_TF at flux grid = ', self.B_TF.B().reshape(len(alphas), N, N, 3))
+
         self.psi *= self.dphi * self.drho
         t2 = time.time()
         # print('Flux2 time = ', t2 - t1)
@@ -1090,20 +1102,24 @@ def update_alphas_deltas(self):
                 self.coil_normals_all[nn * q: nn * (q + 1), 0] = self.coil_normals[:, 0] * np.cos(phi0) * stell - self.coil_normals[:, 1] * np.sin(phi0) 
                 self.coil_normals_all[nn * q: nn * (q + 1), 1] = self.coil_normals[:, 0] * np.sin(phi0) * stell + self.coil_normals[:, 1] * np.cos(phi0) 
                 self.coil_normals_all[nn * q: nn * (q + 1), 2] = self.coil_normals[:, 2]
-                normals = self.coil_normals_all[q * nn: (q + 1) * nn, :]
-                # normals = self.coil_normals
-                # get new deltas by flipping sign of deltas, then rotation alpha by phi0
-                deltas = np.arctan2(normals[:, 0], normals[:, 2]) # + np.pi
-                deltas[abs(deltas) == np.pi] = 0.0
-                if fp == 0 and stell == 1:
-                    shift_deltas = self.deltas - deltas  # +- pi
-                deltas += shift_deltas
-                alphas = -np.arctan2(normals[:, 1] * np.cos(deltas), normals[:, 2])
-                # alphas = -np.arcsin(normals[:, 1])
-                alphas[abs(alphas) == np.pi] = 0.0
-                if fp == 0 and stell == 1:
-                    shift_alphas = self.alphas - alphas  # +- pi
-                alphas += shift_alphas
+                # normals = self.coil_normals_all[q * nn: (q + 1) * nn, :]
+                # # normals = self.coil_normals
+                # # get new deltas by flipping sign of deltas, then rotation alpha by phi0
+                # deltas = np.arctan2(normals[:, 0], normals[:, 2]) # + np.pi
+                # deltas[abs(deltas) == np.pi] = 0.0
+                # if fp == 0 and stell == 1:
+                #     shift_deltas = self.deltas - deltas  # +- pi
+                # deltas += shift_deltas
+                # alphas = -np.arctan2(normals[:, 1] * np.cos(deltas), normals[:, 2])
+                # # alphas = -np.arcsin(normals[:, 1])
+                # alphas[abs(alphas) == np.pi] = 0.0
+                # if fp == 0 and stell == 1:
+                #     shift_alphas = self.alphas - alphas  # +- pi
+                # alphas += shift_alphas
+                # self.alphas_total[nn * q: nn * (q + 1)] = alphas
+                # self.deltas_total[nn * q: nn * (q + 1)] = deltas
+                alphas = self.alphas * (-1) ** fp
+                deltas = self.deltas * stell * (-1) ** fp
                 self.alphas_total[nn * q: nn * (q + 1)] = alphas
                 self.deltas_total[nn * q: nn * (q + 1)] = deltas
                 q = q + 1
diff --git a/src/simsopt/geo/wp_grid.py b/src/simsopt/geo/wp_grid.py
index a2fc153f4..339f1ae47 100644
--- a/src/simsopt/geo/wp_grid.py
+++ b/src/simsopt/geo/wp_grid.py
@@ -68,7 +68,7 @@ def _setup_uniform_grid(self):
         
         # This is not a guarantee that coils will not touch but inductance
         # matrix blows up if they do so it is easy to tell when they do
-        self.R = min(Nmin / 4.0, self.poff / 4.0)
+        self.R = min(Nmin / 2.0, self.poff / 4.0)
         self.a = self.R / 10.0  # Hard-coded aspect ratio of 100 right now
         print('Major radius of the coils is R = ', self.R)
         print('Coils are spaced so that every coil of radius R '
@@ -631,7 +631,7 @@ def setup_currents_and_fields(self):
                 #     self.alphas_total[q * nn: (q + 1) * nn],
                 #     self.deltas_total[q * nn: (q + 1) * nn])
                 # t1 = time.time()
-                A_matrix += sopp.A_matrix(
+                A_matrix += sopp.A_matrix_simd(
                     contig(self.grid_xyz_all[q * nn: (q + 1) * nn, :]),
                     contig(self.plasma_points),
                     contig(self.alphas_total[q * nn: (q + 1) * nn]),
@@ -818,7 +818,7 @@ def least_squares(self, kappa_I, verbose=False):
                 array of kappas.
 
         """
-        # t1 = time.time()
+        t1 = time.time()
         ind3 = len(kappa_I) // 3
         ind3_2 = 2 * ind3
         self.I = kappa_I[ind3_2:]
@@ -826,8 +826,8 @@ def least_squares(self, kappa_I, verbose=False):
         Ax_b = (self.Bn_WP + self.b_opt) * self.grid_normalization 
         BdotN2 = 0.5 * Ax_b.T @ Ax_b * self.fac2_norm
         self.BdotN2_list.append(BdotN2)
-        # t2 = time.time()
-        # print('obj time = ', t2 - t1)
+        t2 = time.time()
+        print('obj time = ', t2 - t1)
         return BdotN2
     
     def python_A_matrix(self, points, alphas, deltas):
@@ -923,7 +923,10 @@ def least_squares_jacobian(self, kappa_I, verbose=False):
         self.I = kappa_I[ind3_2:]
         self.setup_orientations(kappa_I[:ind3], kappa_I[ind3:ind3_2])
         Ax_b = (self.Bn_WP + self.b_opt) * self.grid_normalization
+        t1_grad = time.time()
         grad_alpha_delta = self.A_deriv() * np.hstack((self.I, self.I))
+        t2_grad = time.time()
+        print('Aderiv time = ', t2_grad - t1_grad)
         grad_I = self.A_matrix
         # Should be shape (num_plasma_points, 3 * num_wp)
         # grad_kappa = np.hstack((grad_alpha1, grad_delta1))
@@ -931,6 +934,7 @@ def least_squares_jacobian(self, kappa_I, verbose=False):
         grad = self.grid_normalization[:, None] * grad
         jac = Ax_b.T @ grad
         t2 = time.time()
+        print('jac time = ', t2 - t1)
         return jac * self.fac2_norm
     
     def A_deriv(self):
@@ -950,7 +954,7 @@ def A_deriv(self):
         q = 0
         for fp in range(self.nfp):
             for stell in self.stell_list:
-                dA_dkappa += sopp.dA_dkappa(
+                dA = sopp.dA_dkappa_simd(
                     contig(self.grid_xyz_all[q * nn: (q + 1) * nn, :]),
                     contig(self.plasma_points),
                     contig(self.alphas_total[q * nn: (q + 1) * nn]),
@@ -960,6 +964,8 @@ def A_deriv(self):
                     contig(self.quad_weights_phi),
                     self.R,
                 )
+                dA_dkappa[:, :self.num_wp] += dA[:, :self.num_wp] * (-1) ** fp
+                dA_dkappa[:, self.num_wp:] += dA[:, self.num_wp:] * (-1) ** fp * stell
                 q = q + 1
         return dA_dkappa * np.pi # rescale by pi for gauss-leg quadrature
     
diff --git a/src/simsoptpp/psc.cpp b/src/simsoptpp/psc.cpp
index fb4e90369..19bd3e540 100644
--- a/src/simsoptpp/psc.cpp
+++ b/src/simsoptpp/psc.cpp
@@ -409,327 +409,8 @@ simd_t Ellint1AGM_simd(simd_t k)
     return ((simd_t) (M_PI / (2 * aa))) * delta;
 }
 
-Array dA_dkappa(Array& points, Array& plasma_points, Array& alphas, Array& deltas, Array& plasma_normal, Array& int_points, Array& int_weights, double R)
-{
-    // points shape should be (num_coils, 3)
-    // plasma_normal shape should be (num_plasma_points, 3)
-    // plasma_points should be (num_plasma_points, 3)
-    int num_coils = alphas.shape(0);  // shape should be (num_coils)
-    int num_plasma_points = plasma_points.shape(0);
-    int num_phi = int_points.shape(0);  // shape should be (num_phi)
-    
-    // this variable is the A matrix in the least-squares term so A * I = Bn
-    Array dA = xt::zeros<double>({num_plasma_points, num_coils * 2});
-    constexpr int simd_size = xsimd::simd_type<double>::size;
-    double* plasma_points_ptr = &(plasma_points(0, 0));
-    double* plasma_normal_ptr = &(plasma_normal(0, 0));
-    double* points_ptr = &(points(0, 0));
-    double* alphas_ptr = &(alphas(0));
-    double* deltas_ptr = &(deltas(0));
-    double* int_points_ptr = &(int_points(0));
-    double* int_weights_ptr = &(int_weights(0));
-    
-    #pragma omp parallel for schedule(static)
-    for (int i = 0; i < num_plasma_points; ++i) {
-        auto xp = Vec3dSimd(plasma_points_ptr[3 * i], \
-                            plasma_points_ptr[3 * i + 1], \
-                            plasma_points_ptr[3 * i + 2]);
-        auto np = Vec3dSimd(plasma_normal_ptr[3 * i], \
-                            plasma_normal_ptr[3 * i + 1], \
-                            plasma_normal_ptr[3 * i + 2]);
-        for(int j = 0; j < num_coils; j += simd_size) {
-            simd_t aj, dj;
-            int klimit = std::min(simd_size, num_coils - j);
-            auto point_j = Vec3dSimd();
-            for(int kk = 0; kk < klimit; kk++){
-                for (int d = 0; d < 3; ++d) {
-                    point_j[d][kk] = points_ptr[3 * (j + kk) + d];
-                }
-                aj[kk] = alphas_ptr[j + kk];
-                dj[kk] = deltas_ptr[j + kk];
-            }
-            simd_t cdj = xsimd::cos(dj);
-            simd_t caj = xsimd::cos(aj);
-            simd_t sdj = xsimd::sin(dj);
-            simd_t saj = xsimd::sin(aj);
-            Vec3dSimd r0 = xp - point_j;
-            simd_t x0 = r0.x;
-            simd_t y0 = r0.y;
-            simd_t z0 = r0.z;
-            simd_t Rxx = cdj;
-            simd_t Rxy = sdj * saj;
-            simd_t Rxz = sdj * caj;
-            simd_t Ryx = ((simd_t) 0.0);
-            simd_t Ryy = caj;
-            simd_t Ryz = -saj;
-            simd_t Rzx = -sdj;
-            simd_t Rzy = cdj * saj;
-            simd_t Rzz = cdj * caj;
-            simd_t dRxx_dalpha = ((simd_t) 0.0);
-            simd_t dRxy_dalpha = sdj * caj;
-            simd_t dRxz_dalpha = -sdj * saj;
-            simd_t dRyx_dalpha = ((simd_t) 0.0);
-            simd_t dRyy_dalpha = -saj;
-            simd_t dRyz_dalpha = -caj;
-            simd_t dRzx_dalpha = ((simd_t) 0.0);
-            simd_t dRzy_dalpha = cdj * caj;
-            simd_t dRzz_dalpha = -cdj * saj;
-            simd_t dRxx_ddelta = -sdj;
-            simd_t dRxy_ddelta = cdj * saj;
-            simd_t dRxz_ddelta = cdj * caj;
-            simd_t dRyx_ddelta = ((simd_t) 0.0);
-            simd_t dRyy_ddelta = ((simd_t) 0.0);
-            simd_t dRyz_ddelta = ((simd_t) 0.0);
-            simd_t dRzx_ddelta = -cdj;
-            simd_t dRzy_ddelta = -sdj * saj;
-            simd_t dRzz_ddelta = -sdj * caj;
-            auto B1 = Vec3dSimd();
-            auto B2 = Vec3dSimd();
-            auto B3 = Vec3dSimd();
-            auto B4 = Vec3dSimd();
-            auto B5 = Vec3dSimd();
-            for (int k = 0; k < num_phi; ++k) {
-                simd_t weight = ((simd_t) int_weights_ptr[k]);
-                simd_t pk = ((simd_t) int_points_ptr[k]);
-                auto dl = Vec3dSimd(-R * xsimd::sin(pk), \
-                                    R * xsimd::cos(pk), \
-                                    (simd_t) 0.0);
-                auto RTdiff = Vec3dSimd((Rxx * x0 + Ryx * y0 + Rzx * z0) - R * xsimd::cos(pk), \
-                                        (Rxy * x0 + Ryy * y0 + Rzy * z0) - R * xsimd::sin(pk), \
-                                        (Rxz * x0 + Ryz * y0 + Rzz * z0));     
-                // multiply by R^T and then subtract off coil coordinate
-                auto dl_cross_RTdiff = cross(dl, RTdiff);
-                simd_t rmag_2 = normsq(RTdiff);
-                simd_t denom = rsqrt(rmag_2);
-                simd_t denom3 = denom * denom * denom * weight;  // notice weight factor
-                simd_t denom5 = denom3 * denom * denom * (-3.0);
-                // First derivative contribution of three
-                B1.x += dl_cross_RTdiff.x * denom3;
-                B1.y += dl_cross_RTdiff.y * denom3;
-                B1.z += dl_cross_RTdiff.z * denom3;
-                // second derivative contribution (should be dRT/dalpha)
-                auto dR_dalphaT = Vec3dSimd(dRxx_dalpha * x0 + dRyx_dalpha * y0 + dRzx_dalpha * z0, \
-                                            dRxy_dalpha * x0 + dRyy_dalpha * y0 + dRzy_dalpha * z0, \
-                                            dRxz_dalpha * x0 + dRyz_dalpha * y0 + dRzz_dalpha * z0);
-                auto dR_ddeltaT = Vec3dSimd(dRxx_ddelta * x0 + dRyx_ddelta * y0 + dRzx_ddelta * z0, \
-                                            dRxy_ddelta * x0 + dRyy_ddelta * y0 + dRzy_ddelta * z0, \
-                                            dRxz_ddelta * x0 + dRyz_ddelta * y0 + dRzz_ddelta * z0);
-                auto dl_cross_dR_dalphaT = cross(dl, dR_dalphaT);
-                auto dl_cross_dR_ddeltaT = cross(dl, dR_ddeltaT);
-                B2.x += dl_cross_dR_dalphaT.x * denom3;
-                B2.y += dl_cross_dR_dalphaT.y * denom3;
-                B2.z += dl_cross_dR_dalphaT.z * denom3;
-                B4.x += dl_cross_dR_ddeltaT.x * denom3;
-                B4.y += dl_cross_dR_ddeltaT.y * denom3;
-                B4.z += dl_cross_dR_ddeltaT.z * denom3;
-                // third derivative contribution
-                simd_t RTdiff_dot_dR_dalpha = inner(RTdiff, dR_dalphaT);
-                simd_t RTdiff_dot_dR_ddelta = inner(RTdiff, dR_ddeltaT);
-                B3.x += dl_cross_RTdiff.x * RTdiff_dot_dR_dalpha * denom5;
-                B3.y += dl_cross_RTdiff.y * RTdiff_dot_dR_dalpha * denom5;
-                B3.z += dl_cross_RTdiff.z * RTdiff_dot_dR_dalpha * denom5;
-                B5.x += dl_cross_RTdiff.x * RTdiff_dot_dR_ddelta * denom5;
-                B5.y += dl_cross_RTdiff.y * RTdiff_dot_dR_ddelta * denom5;
-                B5.z += dl_cross_RTdiff.z * RTdiff_dot_dR_ddelta * denom5;
-            }
-            // rotate first contribution by dR/dalpha
-            simd_t Bx1 = B1.x;
-            simd_t By1 = B1.y;
-            simd_t Bz1 = B1.z;
-            simd_t Bx2 = B2.x;
-            simd_t By2 = B2.y;
-            simd_t Bz2 = B2.z;
-            simd_t Bx3 = B3.x;
-            simd_t By3 = B3.y;
-            simd_t Bz3 = B3.z;
-            simd_t Bx4 = B4.x;
-            simd_t By4 = B4.y;
-            simd_t Bz4 = B4.z;
-            simd_t Bx5 = B5.x;
-            simd_t By5 = B5.y;
-            simd_t Bz5 = B5.z;
-            auto B1_rot = Vec3dSimd(dRxx_dalpha * Bx1 + dRxy_dalpha * By1 + dRxz_dalpha * Bz1, \
-                                    dRyx_dalpha * Bx1 + dRyy_dalpha * By1 + dRyz_dalpha * Bz1, \
-                                    dRzx_dalpha * Bx1 + dRzy_dalpha * By1 + dRzz_dalpha * Bz1);
-            auto B23_rot = Vec3dSimd(Rxx * (Bx2 + Bx3) + Rxy * (By2 + By3) + Rxz * (Bz2 + Bz3), \
-                                     Ryx * (Bx2 + Bx3) + Ryy * (By2 + By3) + Ryz * (Bz2 + Bz3), \
-                                     Rzx * (Bx2 + Bx3) + Rzy * (By2 + By3) + Rzz * (Bz2 + Bz3));
-            // repeat for delta derivative
-            auto B11_rot = Vec3dSimd(dRxx_ddelta * Bx1 + dRxy_ddelta * By1 + dRxz_ddelta * Bz1, \
-                                     dRyx_ddelta * Bx1 + dRyy_ddelta * By1 + dRyz_ddelta * Bz1, \
-                                     dRzx_ddelta * Bx1 + dRzy_ddelta * By1 + dRzz_ddelta * Bz1);
-            auto B45_rot = Vec3dSimd(Rxx * (Bx4 + Bx5) + Rxy * (By4 + By5) + Rxz * (Bz4 + Bz5), \
-                                     Ryx * (Bx4 + Bx5) + Ryy * (By4 + By5) + Ryz * (Bz4 + Bz5), \
-                                     Rzx * (Bx4 + Bx5) + Rzy * (By4 + By5) + Rzz * (Bz4 + Bz5));
-            simd_t B_total_alpha = inner(B1_rot, np);
-            B_total_alpha += inner(B23_rot, np);
-            simd_t B_total_delta = inner(B11_rot, np);
-            B_total_delta += inner(B45_rot, np);
-            for(int kk = 0; kk < klimit; kk++){
-                dA(i, j + kk) += B_total_alpha[kk];
-                dA(i, j + kk + num_coils) += B_total_delta[kk];
-            }
-        }
-    }
-    return dA;
-}
-
 #else
 
-Array dA_dkappa(Array& points, Array& plasma_points, Array& alphas, Array& deltas, Array& plasma_normal, Array& int_points, Array& int_weights, double R)
-{
-    // warning: row_major checks below do NOT throw an error correctly on a compute node on Cori
-    if(points.layout() != xt::layout_type::row_major)
-          throw std::runtime_error("points needs to be in row-major storage order");
-    if(alphas.layout() != xt::layout_type::row_major)
-          throw std::runtime_error("alphas needs to be in row-major storage order");
-    if(deltas.layout() != xt::layout_type::row_major)
-          throw std::runtime_error("deltas needs to be in row-major storage order");
-    if(plasma_points.layout() != xt::layout_type::row_major)
-          throw std::runtime_error("plasma_points needs to be in row-major storage order");
-    if(plasma_normal.layout() != xt::layout_type::row_major)
-          throw std::runtime_error("plasma_normal needs to be in row-major storage order");
-          
-    // points shape should be (num_coils, 3)
-    // plasma_normal shape should be (num_plasma_points, 3)
-    // plasma_points should be (num_plasma_points, 3)
-    int num_coils = alphas.shape(0);  // shape should be (num_coils)
-    int num_plasma_points = plasma_points.shape(0);
-    int num_phi = int_points.shape(0);  // shape should be (num_phi)
-    
-    // this variable is the A matrix in the least-squares term so A * I = Bn
-    Array dA = xt::zeros<double>({num_plasma_points, num_coils * 2});
-    using namespace boost::math;
-    
-    #pragma omp parallel for schedule(static)
-    for (int i = 0; i < num_plasma_points; ++i) {
-        auto xp = plasma_points(i, 0);
-        auto yp = plasma_points(i, 1);
-        auto zp = plasma_points(i, 2);
-        auto nx = plasma_normal(i, 0);
-        auto ny = plasma_normal(i, 1);
-        auto nz = plasma_normal(i, 2);
-        for(int j = 0; j < num_coils; j++) {
-            auto cdj = cos(deltas(j));
-            auto caj = cos(alphas(j));
-            auto sdj = sin(deltas(j));
-            auto saj = sin(alphas(j));
-            auto xk = points(j, 0);
-            auto yk = points(j, 1);
-            auto zk = points(j, 2);
-            auto x0 = xp - xk;
-            auto y0 = yp - yk;
-            auto z0 = zp - zk;
-            auto Rxx = cdj;
-            auto Rxy = sdj * saj;
-            auto Rxz = sdj * caj;
-            auto Ryx = 0.0;
-            auto Ryy = caj;
-            auto Ryz = -saj;
-            auto Rzx = -sdj;
-            auto Rzy = cdj * saj;
-            auto Rzz = cdj * caj;
-            auto dRxx_dalpha = 0.0;
-            auto dRxy_dalpha = sdj * caj;
-            auto dRxz_dalpha = -sdj * saj;
-            auto dRyx_dalpha = 0.0;
-            auto dRyy_dalpha = -saj;
-            auto dRyz_dalpha = -caj;
-            auto dRzx_dalpha = 0.0;
-            auto dRzy_dalpha = cdj * caj;
-            auto dRzz_dalpha = -cdj * saj;
-            auto dRxx_ddelta = -sdj;
-            auto dRxy_ddelta = cdj * saj;
-            auto dRxz_ddelta = cdj * caj;
-            auto dRyx_ddelta = 0.0;
-            auto dRyy_ddelta = 0.0;
-            auto dRyz_ddelta = 0.0;
-            auto dRzx_ddelta = -cdj;
-            auto dRzy_ddelta = -sdj * saj;
-            auto dRzz_ddelta = -sdj * caj;
-            auto Bx1 = 0.0;
-            auto By1 = 0.0;
-            auto Bz1 = 0.0;
-            auto Bx2 = 0.0;
-            auto By2 = 0.0;
-            auto Bz2 = 0.0;
-            auto Bx3 = 0.0;
-            auto By3 = 0.0;
-            auto Bz3 = 0.0;
-            auto Bx4 = 0.0;
-            auto By4 = 0.0;
-            auto Bz4 = 0.0;
-            auto Bx5 = 0.0;
-            auto By5 = 0.0;
-            auto Bz5 = 0.0;
-            for (int k = 0; k < num_phi; ++k) {
-                auto weight = int_weights(k);
-                auto dlx = -R * sin(int_points(k));
-                auto dly = R * cos(int_points(k));
-                auto dlz = 0.0;
-                // multiply by R^T and then subtract off coil coordinate
-                auto RTxdiff = (Rxx * x0 + Ryx * y0 + Rzx * z0) - R * cos(int_points(k));
-                auto RTydiff = (Rxy * x0 + Ryy * y0 + Rzy * z0) - R * sin(int_points(k));
-                auto RTzdiff = (Rxz * x0 + Ryz * y0 + Rzz * z0);
-                auto dl_cross_RTdiff_x = dly * RTzdiff - dlz * RTydiff;
-                auto dl_cross_RTdiff_y = dlz * RTxdiff - dlx * RTzdiff;
-                auto dl_cross_RTdiff_z = dlx * RTydiff - dly * RTxdiff;
-                auto denom = sqrt(RTxdiff * RTxdiff + RTydiff * RTydiff + RTzdiff * RTzdiff);
-                auto denom3 = denom * denom * denom / weight;  // notice weight factor
-                auto denom5 = denom3 * denom * denom;
-                // First derivative contribution of three
-                Bx1 += dl_cross_RTdiff_x / denom3;
-                By1 += dl_cross_RTdiff_y / denom3;
-                Bz1 += dl_cross_RTdiff_z / denom3;
-                // second derivative contribution (should be dRT/dalpha)
-                auto dR_dalphaT_x = dRxx_dalpha * x0 + dRyx_dalpha * y0 + dRzx_dalpha * z0;
-                auto dR_dalphaT_y = dRxy_dalpha * x0 + dRyy_dalpha * y0 + dRzy_dalpha * z0;
-                auto dR_dalphaT_z = dRxz_dalpha * x0 + dRyz_dalpha * y0 + dRzz_dalpha * z0;
-                auto dR_ddeltaT_x = dRxx_ddelta * x0 + dRyx_ddelta * y0 + dRzx_ddelta * z0;
-                auto dR_ddeltaT_y = dRxy_ddelta * x0 + dRyy_ddelta * y0 + dRzy_ddelta * z0;
-                auto dR_ddeltaT_z = dRxz_ddelta * x0 + dRyz_ddelta * y0 + dRzz_ddelta * z0;
-                auto dl_cross_dR_dalphaT_x = dly * dR_dalphaT_z - dlz * dR_dalphaT_y;
-                auto dl_cross_dR_dalphaT_y = dlz * dR_dalphaT_x - dlx * dR_dalphaT_z;
-                auto dl_cross_dR_dalphaT_z = dlx * dR_dalphaT_y - dly * dR_dalphaT_x;
-                auto dl_cross_dR_ddeltaT_x = dly * dR_ddeltaT_z - dlz * dR_ddeltaT_y;
-                auto dl_cross_dR_ddeltaT_y = dlz * dR_ddeltaT_x - dlx * dR_ddeltaT_z;
-                auto dl_cross_dR_ddeltaT_z = dlx * dR_ddeltaT_y - dly * dR_ddeltaT_x;
-                Bx2 += dl_cross_dR_dalphaT_x / denom3;
-                By2 += dl_cross_dR_dalphaT_y / denom3;
-                Bz2 += dl_cross_dR_dalphaT_z / denom3;
-                Bx4 += dl_cross_dR_ddeltaT_x / denom3;
-                By4 += dl_cross_dR_ddeltaT_y / denom3;
-                Bz4 += dl_cross_dR_ddeltaT_z / denom3;
-                // third derivative contribution
-                auto RTxdiff_dot_dR_dalpha = RTxdiff * dR_dalphaT_x + RTydiff * dR_dalphaT_y + RTzdiff * dR_dalphaT_z;
-                auto RTxdiff_dot_dR_ddelta = RTxdiff * dR_ddeltaT_x + RTydiff * dR_ddeltaT_y + RTzdiff * dR_ddeltaT_z;
-                Bx3 += - 3.0 * dl_cross_RTdiff_x * RTxdiff_dot_dR_dalpha / denom5;
-                By3 += - 3.0 * dl_cross_RTdiff_y * RTxdiff_dot_dR_dalpha / denom5;
-                Bz3 += - 3.0 * dl_cross_RTdiff_z * RTxdiff_dot_dR_dalpha / denom5;
-                Bx5 += - 3.0 * dl_cross_RTdiff_x * RTxdiff_dot_dR_ddelta / denom5;
-                By5 += - 3.0 * dl_cross_RTdiff_y * RTxdiff_dot_dR_ddelta / denom5;
-                Bz5 += - 3.0 * dl_cross_RTdiff_z * RTxdiff_dot_dR_ddelta / denom5;
-            }
-            // rotate first contribution by dR/dalpha
-            dA(i, j) += (dRxx_dalpha * Bx1 + dRxy_dalpha * By1 + dRxz_dalpha * Bz1) * nx;
-            dA(i, j) += (dRyx_dalpha * Bx1 + dRyy_dalpha * By1 + dRyz_dalpha * Bz1) * ny;
-            dA(i, j) += (dRzx_dalpha * Bx1 + dRzy_dalpha * By1 + dRzz_dalpha * Bz1) * nz;
-            // rotate second and third contribution by R
-            dA(i, j) += (Rxx * (Bx2 + Bx3) + Rxy * (By2 + By3) + Rxz * (Bz2 + Bz3)) * nx;
-            dA(i, j) += (Ryx * (Bx2 + Bx3) + Ryy * (By2 + By3) + Ryz * (Bz2 + Bz3)) * ny;
-            dA(i, j) += (Rzx * (Bx2 + Bx3) + Rzy * (By2 + By3) + Rzz * (Bz2 + Bz3)) * nz;
-            // repeat for delta derivative
-            dA(i, j + num_coils) += (dRxx_ddelta * Bx1 + dRxy_ddelta * By1 + dRxz_ddelta * Bz1) * nx;
-            dA(i, j + num_coils) += (dRyx_ddelta * Bx1 + dRyy_ddelta * By1 + dRyz_ddelta * Bz1) * ny;
-            dA(i, j + num_coils) += (dRzx_ddelta * Bx1 + dRzy_ddelta * By1 + dRzz_ddelta * Bz1) * nz;
-            dA(i, j + num_coils) += (Rxx * (Bx4 + Bx5) + Rxy * (By4 + By5) + Rxz * (Bz4 + Bz5)) * nx;
-            dA(i, j + num_coils) += (Ryx * (Bx4 + Bx5) + Ryy * (By4 + By5) + Ryz * (Bz4 + Bz5)) * ny;
-            dA(i, j + num_coils) += (Rzx * (Bx4 + Bx5) + Rzy * (By4 + By5) + Rzz * (Bz4 + Bz5)) * nz;
-        }
-    }
-    return dA;
-}
-
 // Todo: write your own C++ code for ellipe/ellipk so we can XSIMD it...? 
 
 Array dpsi_dkappa(Array& I_TF, Array& dl_TF, Array& gamma_TF, Array& PSC_points, Array& alphas, Array& deltas, Array& coil_normals, Array& rho, Array& phi, double R)
@@ -1067,6 +748,288 @@ Array L_deriv(Array& points, Array& alphas, Array& deltas, Array& int_points, Ar
 
 #endif
 
+Array dA_dkappa(Array& points, Array& plasma_points, Array& alphas, Array& deltas, Array& plasma_normal, Array& int_points, Array& int_weights, double R)
+{         
+    // points shape should be (num_coils, 3)
+    // plasma_normal shape should be (num_plasma_points, 3)
+    // plasma_points should be (num_plasma_points, 3)
+    int num_coils = alphas.shape(0);  // shape should be (num_coils)
+    int num_plasma_points = plasma_points.shape(0);
+    int num_phi = int_points.shape(0);  // shape should be (num_phi)
+    
+    // this variable is the A matrix in the least-squares term so A * I = Bn
+    Array dA = xt::zeros<double>({num_plasma_points, num_coils * 2});
+    double* plasma_points_ptr = &(plasma_points(0, 0));
+    double* plasma_normal_ptr = &(plasma_normal(0, 0));
+    double* points_ptr = &(points(0, 0));
+    double* alphas_ptr = &(alphas(0));
+    double* deltas_ptr = &(deltas(0));
+    double* int_points_ptr = &(int_points(0));
+    double* int_weights_ptr = &(int_weights(0));
+    using namespace boost::math;
+    
+    #pragma omp parallel for schedule(static)
+    for (int i = 0; i < num_plasma_points; ++i) {
+        auto xp = plasma_points_ptr[3 * i];
+        auto yp = plasma_points_ptr[3 * i + 1];
+        auto zp = plasma_points_ptr[3 * i + 2];
+        auto nx = plasma_normal_ptr[3 * i];
+        auto ny = plasma_normal_ptr[3 * i + 1];
+        auto nz = plasma_normal_ptr[3 * i + 2];
+        for(int j = 0; j < num_coils; j++) {
+            auto dj = deltas_ptr[j];
+            auto aj = alphas_ptr[j];
+            auto cdj = cos(dj);
+            auto caj = cos(aj);
+            auto sdj = sin(dj);
+            auto saj = sin(aj);
+            auto xk = points_ptr[3 * j];
+            auto yk = points_ptr[3 * j + 1];
+            auto zk = points_ptr[3 * j + 2];
+            auto x0 = xp - xk;
+            auto y0 = yp - yk;
+            auto z0 = zp - zk;
+            auto Rxx = cdj;
+            auto Rxy = sdj * saj;
+            auto Rxz = sdj * caj;
+            auto Ryy = caj;
+            auto Ryz = -saj;
+            auto Rzx = -sdj;
+            auto Rzy = cdj * saj;
+            auto Rzz = cdj * caj;
+            auto dRxy_dalpha = sdj * caj;
+            auto dRxz_dalpha = -sdj * saj;
+            auto dRyy_dalpha = -saj;
+            auto dRyz_dalpha = -caj;
+            auto dRzy_dalpha = cdj * caj;
+            auto dRzz_dalpha = -cdj * saj;
+            auto dRxx_ddelta = -sdj;
+            auto dRxy_ddelta = cdj * saj;
+            auto dRxz_ddelta = cdj * caj;
+            auto dRzx_ddelta = -cdj;
+            auto dRzy_ddelta = -sdj * saj;
+            auto dRzz_ddelta = -sdj * caj;
+            auto Bx1 = 0.0;
+            auto By1 = 0.0;
+            auto Bz1 = 0.0;
+            auto Bx2 = 0.0;
+            auto By2 = 0.0;
+            auto Bz2 = 0.0;
+            auto Bx4 = 0.0;
+            auto By4 = 0.0;
+            auto Bz4 = 0.0;
+            for (int k = 0; k < num_phi; ++k) {
+                auto weight = int_weights_ptr[k];
+                auto pk = int_points_ptr[k];
+                auto dlx = -R * sin(pk);
+                auto dly = R * cos(pk);
+                // multiply by R^T and then subtract off coil coordinate
+                auto RTxdiff = (Rxx * x0 + Rzx * z0) - R * cos(pk);
+                auto RTydiff = (Rxy * x0 + Ryy * y0 + Rzy * z0) - R * sin(pk);
+                auto RTzdiff = (Rxz * x0 + Ryz * y0 + Rzz * z0);
+                auto dl_cross_RTdiff_x = dly * RTzdiff;
+                auto dl_cross_RTdiff_y = -dlx * RTzdiff;
+                auto dl_cross_RTdiff_z = dlx * RTydiff - dly * RTxdiff;
+                auto denom = sqrt(RTxdiff * RTxdiff + RTydiff * RTydiff + RTzdiff * RTzdiff);
+                auto denom3 = denom * denom * denom / weight;  // notice weight factor
+                auto denom5 = denom3 * denom * denom;
+                // First derivative contribution of three
+                Bx1 += dl_cross_RTdiff_x / denom3;
+                By1 += dl_cross_RTdiff_y / denom3;
+                Bz1 += dl_cross_RTdiff_z / denom3;
+                // second derivative contribution (should be dRT/dalpha)
+                auto dR_dalphaT_y = dRxy_dalpha * x0 + dRyy_dalpha * y0 + dRzy_dalpha * z0;
+                auto dR_dalphaT_z = dRxz_dalpha * x0 + dRyz_dalpha * y0 + dRzz_dalpha * z0;
+                auto dR_ddeltaT_x = dRxx_ddelta * x0 + dRzx_ddelta * z0;
+                auto dR_ddeltaT_y = dRxy_ddelta * x0 + dRzy_ddelta * z0;
+                auto dR_ddeltaT_z = dRxz_ddelta * x0 + dRzz_ddelta * z0;
+                auto dl_cross_dR_dalphaT_x = dly * dR_dalphaT_z;
+                auto dl_cross_dR_dalphaT_y = -dlx * dR_dalphaT_z;
+                auto dl_cross_dR_dalphaT_z = dlx * dR_dalphaT_y;
+                auto dl_cross_dR_ddeltaT_x = dly * dR_ddeltaT_z;
+                auto dl_cross_dR_ddeltaT_y = -dlx * dR_ddeltaT_z;
+                auto dl_cross_dR_ddeltaT_z = dlx * dR_ddeltaT_y - dly * dR_ddeltaT_x;
+                Bx2 += dl_cross_dR_dalphaT_x / denom3;
+                By2 += dl_cross_dR_dalphaT_y / denom3;
+                Bz2 += dl_cross_dR_dalphaT_z / denom3;
+                Bx4 += dl_cross_dR_ddeltaT_x / denom3;
+                By4 += dl_cross_dR_ddeltaT_y / denom3;
+                Bz4 += dl_cross_dR_ddeltaT_z / denom3;
+                // third derivative contribution
+                auto RTxdiff_dot_dR_dalpha = RTydiff * dR_dalphaT_y + RTzdiff * dR_dalphaT_z;
+                auto RTxdiff_dot_dR_ddelta = RTxdiff * dR_ddeltaT_x + RTydiff * dR_ddeltaT_y + RTzdiff * dR_ddeltaT_z;
+                auto prefac_alpha = - 3.0 * RTxdiff_dot_dR_dalpha / denom5;
+                auto prefac_delta = - 3.0 * RTxdiff_dot_dR_ddelta / denom5;
+                Bx2 += dl_cross_RTdiff_x * prefac_alpha;
+                By2 += dl_cross_RTdiff_y * prefac_alpha;
+                Bz2 += dl_cross_RTdiff_z * prefac_alpha;
+                Bx4 += dl_cross_RTdiff_x * prefac_delta;
+                By4 += dl_cross_RTdiff_y * prefac_delta;
+                Bz4 += dl_cross_RTdiff_z * prefac_delta;
+            }
+            // rotate first contribution by dR/dalpha
+            dA(i, j) += (dRxy_dalpha * By1 + dRxz_dalpha * Bz1 + Rxx * Bx2 + Rxy * By2 + Rxz * Bz2) * nx + \
+                        (dRyy_dalpha * By1 + dRyz_dalpha * Bz1 + Ryy * By2 + Ryz * Bz2) * ny + \
+                        (dRzy_dalpha * By1 + dRzz_dalpha * Bz1 + Rzx * Bx2 + Rzy * By2 + Rzz * Bz2) * nz;
+            // repeat for delta derivative
+            dA(i, j + num_coils) += (dRxx_ddelta * Bx1 + dRxy_ddelta * By1 + dRxz_ddelta * Bz1 + Rxx * Bx4 + Rxy * By4 + Rxz * Bz4) * nx + \
+                                    (Ryy * By4 + Ryz * Bz4) * ny + \
+                                    (dRzx_ddelta * Bx1 + dRzy_ddelta * By1 + dRzz_ddelta * Bz1 + Rzx * Bx4 + Rzy * By4 + Rzz * Bz4) * nz;
+        }
+    }
+    return dA;
+}
+
+Array dA_dkappa_simd(Array& points, Array& plasma_points, Array& alphas, Array& deltas, Array& plasma_normal, Array& int_points, Array& int_weights, double R)
+{
+    // points shape should be (num_coils, 3)
+    // plasma_normal shape should be (num_plasma_points, 3)
+    // plasma_points should be (num_plasma_points, 3)
+    int num_coils = alphas.shape(0);  // shape should be (num_coils)
+    int num_plasma_points = plasma_points.shape(0);
+    int num_phi = int_points.shape(0);  // shape should be (num_phi)
+    
+    // this variable is the A matrix in the least-squares term so A * I = Bn
+    Array dA = xt::zeros<double>({num_plasma_points, num_coils * 2});
+    constexpr int simd_size = xsimd::simd_type<double>::size;
+    double* plasma_points_ptr = &(plasma_points(0, 0));
+    double* plasma_normal_ptr = &(plasma_normal(0, 0));
+    double* points_ptr = &(points(0, 0));
+    double* alphas_ptr = &(alphas(0));
+    double* deltas_ptr = &(deltas(0));
+    double* int_points_ptr = &(int_points(0));
+    double* int_weights_ptr = &(int_weights(0));
+    
+    #pragma omp parallel for schedule(static)
+    for (int i = 0; i < num_plasma_points; ++i) {
+        auto xp = Vec3dSimd(plasma_points_ptr[3 * i], \
+                            plasma_points_ptr[3 * i + 1], \
+                            plasma_points_ptr[3 * i + 2]);
+        auto np = Vec3dSimd(plasma_normal_ptr[3 * i], \
+                            plasma_normal_ptr[3 * i + 1], \
+                            plasma_normal_ptr[3 * i + 2]);
+        for(int j = 0; j < num_coils; j += simd_size) {
+            simd_t aj, dj;
+            int klimit = std::min(simd_size, num_coils - j);
+            auto point_j = Vec3dSimd();
+            for(int kk = 0; kk < klimit; kk++){
+                for (int d = 0; d < 3; ++d) {
+                    point_j[d][kk] = points_ptr[3 * (j + kk) + d];
+                }
+                aj[kk] = alphas_ptr[j + kk];
+                dj[kk] = deltas_ptr[j + kk];
+            }
+            simd_t cdj = xsimd::cos(dj);
+            simd_t caj = xsimd::cos(aj);
+            simd_t sdj = xsimd::sin(dj);
+            simd_t saj = xsimd::sin(aj);
+            Vec3dSimd r0 = xp - point_j;
+            simd_t x0 = r0.x;
+            simd_t y0 = r0.y;
+            simd_t z0 = r0.z;
+            simd_t Rxx = cdj;
+            simd_t Rxy = sdj * saj;
+            simd_t Rxz = sdj * caj;
+            simd_t Ryy = caj;
+            simd_t Ryz = -saj;
+            simd_t Rzx = -sdj;
+            simd_t Rzy = cdj * saj;
+            simd_t Rzz = cdj * caj;
+            simd_t dRxy_dalpha = sdj * caj;
+            simd_t dRxz_dalpha = -sdj * saj;
+            simd_t dRyy_dalpha = -saj;
+            simd_t dRyz_dalpha = -caj;
+            simd_t dRzy_dalpha = cdj * caj;
+            simd_t dRzz_dalpha = -cdj * saj;
+            simd_t dRxx_ddelta = -sdj;
+            simd_t dRxy_ddelta = cdj * saj;
+            simd_t dRxz_ddelta = cdj * caj;
+            simd_t dRzx_ddelta = -cdj;
+            simd_t dRzy_ddelta = -sdj * saj;
+            simd_t dRzz_ddelta = -sdj * caj;
+            auto B1 = Vec3dSimd();
+            auto B2 = Vec3dSimd();
+            auto B4 = Vec3dSimd();
+            for (int k = 0; k < num_phi; ++k) {
+                simd_t weight = ((simd_t) int_weights_ptr[k]);
+                simd_t pk = ((simd_t) int_points_ptr[k]);
+                auto dl = Vec3dSimd(-R * xsimd::sin(pk), \
+                                    R * xsimd::cos(pk), \
+                                    (simd_t) 0.0);
+                auto RTdiff = Vec3dSimd((Rxx * x0 + Rzx * z0) - R * xsimd::cos(pk), \
+                                        (Rxy * x0 + Ryy * y0 + Rzy * z0) - R * xsimd::sin(pk), \
+                                        (Rxz * x0 + Ryz * y0 + Rzz * z0));     
+                // multiply by R^T and then subtract off coil coordinate
+                auto dl_cross_RTdiff = cross(dl, RTdiff);
+                simd_t rmag_2 = normsq(RTdiff);
+                simd_t denom = rsqrt(rmag_2);
+                simd_t denom3 = denom * denom * denom * weight;  // notice weight factor
+                simd_t denom5 = denom3 * denom * denom * (-3.0);
+                // First derivative contribution of three
+                B1.x += dl_cross_RTdiff.x * denom3;
+                B1.y += dl_cross_RTdiff.y * denom3;
+                B1.z += dl_cross_RTdiff.z * denom3;
+                // second derivative contribution (should be dRT/dalpha)
+                auto dR_dalphaT = Vec3dSimd((simd_t) 0.0, \
+                                            dRxy_dalpha * x0 + dRyy_dalpha * y0 + dRzy_dalpha * z0, \
+                                            dRxz_dalpha * x0 + dRyz_dalpha * y0 + dRzz_dalpha * z0);
+                auto dR_ddeltaT = Vec3dSimd(dRxx_ddelta * x0 + dRzx_ddelta * z0, \
+                                            dRxy_ddelta * x0 + dRzy_ddelta * z0, \
+                                            dRxz_ddelta * x0 + dRzz_ddelta * z0);
+                auto dl_cross_dR_dalphaT = cross(dl, dR_dalphaT);
+                auto dl_cross_dR_ddeltaT = cross(dl, dR_ddeltaT);
+                B2.x += dl_cross_dR_dalphaT.x * denom3;
+                B2.y += dl_cross_dR_dalphaT.y * denom3;
+                B2.z += dl_cross_dR_dalphaT.z * denom3;
+                B4.x += dl_cross_dR_ddeltaT.x * denom3;
+                B4.y += dl_cross_dR_ddeltaT.y * denom3;
+                B4.z += dl_cross_dR_ddeltaT.z * denom3;
+                // third derivative contribution
+                simd_t RTdiff_dot_dR_dalpha = inner(RTdiff, dR_dalphaT);
+                simd_t prefac_alpha = RTdiff_dot_dR_dalpha * denom5;
+                simd_t RTdiff_dot_dR_ddelta = inner(RTdiff, dR_ddeltaT);
+                simd_t prefac_delta = RTdiff_dot_dR_ddelta * denom5;
+                B2.x += dl_cross_RTdiff.x * prefac_alpha;
+                B2.y += dl_cross_RTdiff.y * prefac_alpha;
+                B2.z += dl_cross_RTdiff.z * prefac_alpha;
+                B4.x += dl_cross_RTdiff.x * prefac_delta;
+                B4.y += dl_cross_RTdiff.y * prefac_delta;
+                B4.z += dl_cross_RTdiff.z * prefac_delta;
+            }
+            // rotate first contribution by dR/dalpha
+            simd_t Bx1 = B1.x;
+            simd_t By1 = B1.y;
+            simd_t Bz1 = B1.z;
+            simd_t Bx2 = B2.x;
+            simd_t By2 = B2.y;
+            simd_t Bz2 = B2.z;
+            simd_t Bx4 = B4.x;
+            simd_t By4 = B4.y;
+            simd_t Bz4 = B4.z;
+            auto B1_rot = Vec3dSimd(dRxy_dalpha * By1 + dRxz_dalpha * Bz1, \
+                                    dRyy_dalpha * By1 + dRyz_dalpha * Bz1, \
+                                    dRzy_dalpha * By1 + dRzz_dalpha * Bz1);
+            auto B2_rot = Vec3dSimd(Rxx * Bx2 + Rxy * By2 + Rxz * Bz2, \
+                                     Ryy * By2 + Ryz * Bz2, \
+                                     Rzx * Bx2 + Rzy * By2 + Rzz * Bz2);
+            // repeat for delta derivative
+            auto B11_rot = Vec3dSimd(dRxx_ddelta * Bx1 + dRxy_ddelta * By1 + dRxz_ddelta * Bz1, \
+                                     (simd_t) 0.0, \
+                                     dRzx_ddelta * Bx1 + dRzy_ddelta * By1 + dRzz_ddelta * Bz1);
+            auto B4_rot = Vec3dSimd(Rxx * Bx4 + Rxy * By4 + Rxz * Bz4, \
+                                     Ryy * By4 + Ryz * Bz4, \
+                                     Rzx * Bx4 + Rzy * By4 + Rzz * Bz4);
+            simd_t B_total_alpha = inner(B1_rot + B2_rot, np);
+            simd_t B_total_delta = inner(B11_rot + B4_rot, np);
+            for(int kk = 0; kk < klimit; kk++){
+                dA(i, j + kk) += B_total_alpha[kk];
+                dA(i, j + kk + num_coils) += B_total_delta[kk];
+            }
+        }
+    }
+    return dA;
+}
+
 Array A_matrix_simd(Array& points, Array& plasma_points, Array& alphas, Array& deltas, Array& plasma_normal, double R)
 {     
     // points shape should be (num_coils, 3)
diff --git a/src/simsoptpp/psc.h b/src/simsoptpp/psc.h
index 13ed7266c..7259f94f9 100644
--- a/src/simsoptpp/psc.h
+++ b/src/simsoptpp/psc.h
@@ -38,6 +38,8 @@ Array B_PSC(Array& points, Array& plasma_points, Array& alphas, Array& deltas, A
 
 Array dA_dkappa(Array& points, Array& plasma_points, Array& alphas, Array& deltas, Array& plasma_normal, Array& int_points, Array& int_weights, double R);
 
+Array dA_dkappa_simd(Array& points, Array& plasma_points, Array& alphas, Array& deltas, Array& plasma_normal, Array& int_points, Array& int_weights, double R);
+
 Array A_matrix_direct(Array& points, Array& plasma_points, Array& alphas, Array& deltas, Array& plasma_normal, Array& phi, double R);
 
 Array dpsi_dkappa(Array& I_TF, Array& dl_TF, Array& gamma_TF, Array& PSC_points, Array& alphas, Array& deltas, Array& coil_normals, Array& rho, Array& phi, double R);
diff --git a/src/simsoptpp/python.cpp b/src/simsoptpp/python.cpp
index 2b95a3694..5d982e486 100644
--- a/src/simsoptpp/python.cpp
+++ b/src/simsoptpp/python.cpp
@@ -70,6 +70,7 @@ PYBIND11_MODULE(simsoptpp, m) {
     m.def("A_matrix_direct" , &A_matrix_direct);
     m.def("B_PSC" , &B_PSC);
     m.def("dA_dkappa" , &dA_dkappa);
+    m.def("dA_dkappa_simd" , &dA_dkappa_simd);
     m.def("dpsi_dkappa" , &dpsi_dkappa);
     m.def("psi_check" , &psi_check);
     m.def("B_TF" , &B_TF);
diff --git a/tests/geo/test_psc_grid.py b/tests/geo/test_psc_grid.py
index 740edc525..cdc1ea320 100644
--- a/tests/geo/test_psc_grid.py
+++ b/tests/geo/test_psc_grid.py
@@ -16,129 +16,133 @@ def test_dpsi_analytic_derivatives(self):
         against finite differences for tiny changes to the angles, 
         to see if the analytic calculations are correct.
         """
-        ncoils = 7
-        np.random.seed(ncoils)
-        R0 = 1
-        R = 1 
-        a = 1e-5
-        points = (np.random.rand(ncoils, 3) - 0.5) * 5
-        
-        # Need alphas and deltas in [-pi, pi] range
-        deltas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
-        alphas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
-        psc_array = PSCgrid.geo_setup_manual(
-            points, R=R, a=a, alphas=alphas, deltas=deltas
-        )
-        A = psc_array.A_matrix
-        psi = psc_array.psi
-        b = psc_array.b_opt  # / psc_array.grid_normalization
-        L = psc_array.L
-        Linv = psc_array.L_inv
-        Bn_objective = 0.5 * (A @ Linv @ psi + b).T @ (A @ Linv @ psi + b)
-        psi_deriv = psc_array.psi_deriv()
-        epsilon = 1e-4
-        deltas_new = np.copy(deltas)
-        deltas_new[0] += epsilon
-        alphas_new = alphas
-        psc_array_new = PSCgrid.geo_setup_manual(
-            points, R=R, a=a, alphas=alphas_new, deltas=deltas_new
-        )
-        psi_new = psc_array_new.psi
-        Bn_objective_new = 0.5 * (A @ Linv @ psi_new + b).T @ (A @ Linv @ psi_new + b)
-        dBn_objective = (Bn_objective_new - Bn_objective) / epsilon
-        dpsi_ddelta = (psi_new - psi) / epsilon
-        dpsi_ddelta_analytic = psi_deriv
-        # print(dpsi_ddelta, dpsi_ddelta_analytic)
-        assert(np.allclose(dpsi_ddelta[0], dpsi_ddelta_analytic[ncoils], rtol=1e-3))
-        dBn_analytic = (A @ Linv @ psi + b).T @ (A @ (Linv * psi_deriv[ncoils:]))
-        assert np.isclose(dBn_objective, dBn_analytic[0], rtol=1e-2)
-        
-        # Repeat but change coil 3
-        deltas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
-        alphas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
-        psc_array = PSCgrid.geo_setup_manual(
-            points, R=R, a=a, alphas=alphas, deltas=deltas
+        input_name = 'input.LandremanPaul2021_QA_lowres'
+        TEST_DIR = (Path(__file__).parent / ".." / ".." / "tests" / "test_files").resolve()
+        surface_filename = TEST_DIR / input_name
+        surf1 = SurfaceRZFourier.from_vmec_input(
+            surface_filename, range='full torus', nphi=16, ntheta=16
         )
-        A = psc_array.A_matrix
-        psi = psc_array.psi
-        b = psc_array.b_opt  # / psc_array.grid_normalization
-        Linv = psc_array.L_inv
-        Bn_objective = 0.5 * (A @ Linv @ psi + b).T @ (A @ Linv @ psi + b)
-        psi_deriv = psc_array.psi_deriv()
-        epsilon = 1e-4
-        deltas_new = np.copy(deltas)
-        deltas_new[3] += epsilon
-        alphas_new = alphas
-        psc_array_new = PSCgrid.geo_setup_manual(
-            points, R=R, a=a, alphas=alphas_new, deltas=deltas_new
+        surf1.nfp = 1
+        surf1.stellsym = False
+        surf2 = SurfaceRZFourier.from_vmec_input(
+            surface_filename, range='half period', nphi=16, ntheta=16
         )
-        psi_new = psc_array_new.psi
-        Bn_objective_new = 0.5 * (A @ Linv @ psi_new + b).T @ (A @ Linv @ psi_new + b)
-        dBn_objective = (Bn_objective_new - Bn_objective) / epsilon
-        dpsi_ddelta = (psi_new - psi) / epsilon
-        dpsi_ddelta_analytic = psi_deriv
-        # print(dpsi_ddelta, dpsi_ddelta_analytic)
-        assert(np.allclose(dpsi_ddelta[3], dpsi_ddelta_analytic[ncoils + 3], rtol=1e-3))
-        dBn_analytic = (A @ Linv @ psi + b).T @ (A @ (Linv * psi_deriv[ncoils:]))
-        assert np.isclose(dBn_objective, dBn_analytic[3], rtol=1e-2)
-
-        # Test partial derivative calculation if we change alpha by epsilon
-        deltas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
+        ncoils = 7
+        np.random.seed(1)
+        R = 1.0
+        a = 1e-5
+        points = (np.random.rand(ncoils, 3) - 0.5) * 20
         alphas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
-        psc_array = PSCgrid.geo_setup_manual(
-            points, R=R, a=a, alphas=alphas, deltas=deltas
-        )
-        A = psc_array.A_matrix
-        psi = psc_array.psi
-        b = psc_array.b_opt # / psc_array.grid_normalization
-        Linv = psc_array.L_inv
-        Bn_objective = 0.5 * (A @ Linv @ psi + b).T @ (A @ Linv @ psi + b)
-        psi_deriv = psc_array.psi_deriv()
-        epsilon = 1e-4
-        alphas_new = np.copy(alphas)
-        alphas_new[0] += epsilon
-        deltas_new = deltas
-        psc_array_new = PSCgrid.geo_setup_manual(
-            points, R=R, a=a, alphas=alphas_new, deltas=deltas_new
-        )
-        psi_new = psc_array_new.psi
-        Bn_objective_new = 0.5 * (A @ Linv @ psi_new + b).T @ (A @ Linv @ psi_new + b)
-        dBn_objective = (Bn_objective_new - Bn_objective) / epsilon
-        dpsi_dalpha = (psi_new - psi) / epsilon
-        dpsi_dalpha_analytic = psi_deriv
-        # print(dpsi_ddelta, dpsi_ddelta_analytic)
-        assert(np.allclose(dpsi_dalpha[0], dpsi_dalpha_analytic[0], rtol=1e-3))
-        dBn_analytic = (A @ Linv @ psi + b).T @ (A @ (Linv * psi_deriv[:ncoils]))
-        assert np.isclose(dBn_objective, dBn_analytic[0], rtol=1e-2)
-        
-        # Repeat for changing coil 7
         deltas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
-        alphas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
-        psc_array = PSCgrid.geo_setup_manual(
-            points, R=R, a=a, alphas=alphas, deltas=deltas
-        )
-        A = psc_array.A_matrix
-        psi = psc_array.psi
-        b = psc_array.b_opt  # / psc_array.grid_normalization
-        Linv = psc_array.L_inv
-        Bn_objective = 0.5 * (A @ Linv @ psi + b).T @ (A @ Linv @ psi + b)
-        psi_deriv = psc_array.psi_deriv()
-        epsilon = 1e-4
-        alphas_new = np.copy(alphas)
-        alphas_new[6] += epsilon
-        deltas_new = deltas
-        psc_array_new = PSCgrid.geo_setup_manual(
-            points, R=R, a=a, alphas=alphas_new, deltas=deltas_new
-        )
-        psi_new = psc_array_new.psi
-        Bn_objective_new = 0.5 * (A @ Linv @ psi_new + b).T @ (A @ Linv @ psi_new + b)
-        dBn_objective = (Bn_objective_new - Bn_objective) / epsilon
-        dpsi_dalpha = (psi_new - psi) / epsilon
-        dpsi_dalpha_analytic = psi_deriv
-        # print(dpsi_ddelta, dpsi_ddelta_analytic)
-        assert(np.allclose(dpsi_dalpha[6], dpsi_dalpha_analytic[6], rtol=1e-3))
-        dBn_analytic = (A @ Linv @ psi + b).T @ (A @ (Linv * psi_deriv[:ncoils]))
-        assert np.isclose(dBn_objective, dBn_analytic[6], rtol=1e-2)
+        epsilon = 1e-4  # smaller epsilon and numerical accumulation starts to be an issue
+        for surf in [surf1, surf2]:
+            print('Surf = ', surf)
+            psc_array = PSCgrid.geo_setup_manual(
+                points, R=R, a=a, alphas=alphas, deltas=deltas
+            )
+            A = psc_array.A_matrix
+            psi = psc_array.psi
+            b = psc_array.b_opt  # / psc_array.grid_normalization
+            Linv = psc_array.L_inv
+            Bn_objective = 0.5 * (A @ Linv @ psi + b).T @ (A @ Linv @ psi + b)
+            psi_deriv = psc_array.psi_deriv()
+            deltas_new = np.copy(deltas)
+            deltas_new[0] += epsilon
+            alphas_new = np.copy(alphas)
+            psc_array_new = PSCgrid.geo_setup_manual(
+                points, R=R, a=a, alphas=alphas_new, deltas=deltas_new
+            )
+            psi_new = psc_array_new.psi
+            Bn_objective_new = 0.5 * (A @ Linv @ psi_new + b).T @ (A @ Linv @ psi_new + b)
+            dBn_objective = (Bn_objective_new - Bn_objective) / epsilon
+            dpsi_ddelta = (psi_new - psi) / epsilon
+            dpsi_ddelta_analytic = psi_deriv
+            print(dpsi_ddelta[0], dpsi_ddelta_analytic[ncoils])
+            assert(np.allclose(dpsi_ddelta[0], dpsi_ddelta_analytic[ncoils], rtol=1e-3))
+            dBn_analytic = (A @ Linv @ psi + b).T @ (A @ (Linv * psi_deriv[ncoils:]))
+            print(dBn_objective, dBn_analytic[0])
+            assert np.isclose(dBn_objective, dBn_analytic[0], rtol=1e-2)
+            
+            # Repeat but change coil 3
+            psc_array = PSCgrid.geo_setup_manual(
+                points, R=R, a=a, alphas=alphas, deltas=deltas
+            )
+            A = psc_array.A_matrix
+            psi = psc_array.psi
+            b = psc_array.b_opt  # / psc_array.grid_normalization
+            Linv = psc_array.L_inv
+            Bn_objective = 0.5 * (A @ Linv @ psi + b).T @ (A @ Linv @ psi + b)
+            psi_deriv = psc_array.psi_deriv()
+            deltas_new = np.copy(deltas)
+            deltas_new[3] += epsilon
+            alphas_new = np.copy(alphas)
+            psc_array_new = PSCgrid.geo_setup_manual(
+                points, R=R, a=a, alphas=alphas_new, deltas=deltas_new
+            )
+            psi_new = psc_array_new.psi
+            Bn_objective_new = 0.5 * (A @ Linv @ psi_new + b).T @ (A @ Linv @ psi_new + b)
+            dBn_objective = (Bn_objective_new - Bn_objective) / epsilon
+            dpsi_ddelta = (psi_new - psi) / epsilon
+            dpsi_ddelta_analytic = psi_deriv
+            print(dpsi_ddelta[3], dpsi_ddelta_analytic[ncoils + 3])
+            assert(np.allclose(dpsi_ddelta[3], dpsi_ddelta_analytic[ncoils + 3], rtol=1e-3))
+            dBn_analytic = (A @ Linv @ psi + b).T @ (A @ (Linv * psi_deriv[ncoils:]))
+            print(dBn_objective, dBn_analytic[3])
+            assert np.isclose(dBn_objective, dBn_analytic[3], rtol=1e-2)
+    
+            # Test partial derivative calculation if we change alpha by epsilon
+            psc_array = PSCgrid.geo_setup_manual(
+                points, R=R, a=a, alphas=alphas, deltas=deltas
+            )
+            A = psc_array.A_matrix
+            psi = psc_array.psi
+            b = psc_array.b_opt # / psc_array.grid_normalization
+            Linv = psc_array.L_inv
+            Bn_objective = 0.5 * (A @ Linv @ psi + b).T @ (A @ Linv @ psi + b)
+            psi_deriv = psc_array.psi_deriv()
+            alphas_new = np.copy(alphas)
+            alphas_new[0] += epsilon
+            deltas_new = np.copy(deltas)
+            psc_array_new = PSCgrid.geo_setup_manual(
+                points, R=R, a=a, alphas=alphas_new, deltas=deltas_new
+            )
+            psi_new = psc_array_new.psi
+            Bn_objective_new = 0.5 * (A @ Linv @ psi_new + b).T @ (A @ Linv @ psi_new + b)
+            dBn_objective = (Bn_objective_new - Bn_objective) / epsilon
+            dpsi_dalpha = (psi_new - psi) / epsilon
+            dpsi_dalpha_analytic = psi_deriv
+            print(dpsi_dalpha[0], dpsi_dalpha_analytic[0])
+            assert(np.allclose(dpsi_dalpha[0], dpsi_dalpha_analytic[0], rtol=1e-3))
+            dBn_analytic = (A @ Linv @ psi + b).T @ (A @ (Linv * psi_deriv[:ncoils]))
+            print(dBn_objective, dBn_analytic[0])
+            assert np.isclose(dBn_objective, dBn_analytic[0], rtol=1e-1)
+            
+            # Repeat for changing coil 7
+            psc_array = PSCgrid.geo_setup_manual(
+                points, R=R, a=a, alphas=alphas, deltas=deltas
+            )
+            A = psc_array.A_matrix
+            psi = psc_array.psi
+            b = psc_array.b_opt  # / psc_array.grid_normalization
+            Linv = psc_array.L_inv
+            Bn_objective = 0.5 * (A @ Linv @ psi + b).T @ (A @ Linv @ psi + b)
+            psi_deriv = psc_array.psi_deriv()
+            alphas_new = np.copy(alphas)
+            alphas_new[6] += epsilon
+            deltas_new = np.copy(deltas)
+            psc_array_new = PSCgrid.geo_setup_manual(
+                points, R=R, a=a, alphas=alphas_new, deltas=deltas_new
+            )
+            psi_new = psc_array_new.psi
+            Bn_objective_new = 0.5 * (A @ Linv @ psi_new + b).T @ (A @ Linv @ psi_new + b)
+            dBn_objective = (Bn_objective_new - Bn_objective) / epsilon
+            dpsi_dalpha = (psi_new - psi) / epsilon
+            dpsi_dalpha_analytic = psi_deriv
+            print(dpsi_dalpha[6], dpsi_dalpha_analytic[6])
+            assert(np.allclose(dpsi_dalpha[6], dpsi_dalpha_analytic[6], rtol=1e-3))
+            dBn_analytic = (A @ Linv @ psi + b).T @ (A @ (Linv * psi_deriv[:ncoils]))
+            print(dBn_objective, dBn_analytic[6])
+            assert np.isclose(dBn_objective, dBn_analytic[6], rtol=1e-2)
     
     def test_L_analytic_derivatives(self):
         """
@@ -146,172 +150,229 @@ def test_L_analytic_derivatives(self):
         against finite differences for tiny changes to the angles, 
         to see if the analytic calculations are correct.
         """
-        ncoils = 6
-        np.random.seed(ncoils)
-        R = 1 
-        a = 1e-5
-        points = (np.random.rand(ncoils, 3) - 0.5) * 5
-        
-        # Need alphas and deltas in [-pi, pi] range
-        deltas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
-        alphas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
-        psc_array = PSCgrid.geo_setup_manual(
-            points, R=R, a=a, alphas=alphas, deltas=deltas
+        input_name = 'input.LandremanPaul2021_QA_lowres'
+        TEST_DIR = (Path(__file__).parent / ".." / ".." / "tests" / "test_files").resolve()
+        surface_filename = TEST_DIR / input_name
+        surf1 = SurfaceRZFourier.from_vmec_input(
+            surface_filename, range='full torus', nphi=16, ntheta=16
         )
-        A = psc_array.A_matrix
-        psi = psc_array.psi
-        b = psc_array.b_opt  # / psc_array.grid_normalization
-        L = psc_array.L
-        Linv = psc_array.L_inv
-        Bn_objective = 0.5 * (A @ Linv @ psi + b).T @ (A @ Linv @ psi + b)
-        L_deriv = psc_array.L_deriv()
-        epsilon = 1e-4
-        deltas_new = np.copy(deltas)
-        deltas_new[0] += epsilon
-        alphas_new = alphas
-        psc_array_new = PSCgrid.geo_setup_manual(
-            points, R=R, a=a, alphas=alphas_new, deltas=deltas_new
+        surf1.nfp = 1
+        surf1.stellsym = False
+        surf2 = SurfaceRZFourier.from_vmec_input(
+            surface_filename, range='half period', nphi=16, ntheta=16
         )
-        L_new = psc_array_new.L
-        Linv_new = psc_array_new.L_inv
-        Bn_objective_new = 0.5 * (A @ Linv_new @ psi + b).T @ (A @ Linv_new @ psi + b)
-        dBn_objective = (Bn_objective_new - Bn_objective) / epsilon
-        dL_ddelta = (L_new - L) / epsilon
-        dLinv_ddelta = (Linv_new - Linv) / epsilon
-        dL_ddelta_analytic = L_deriv
-        dLinv_ddelta_analytic = -L_deriv @ Linv @ Linv
-        # Symmetrizing helps with numerical error accumulation?
-        for i in range(ncoils * 2):
-            dLinv_ddelta_analytic[i, :, :] = (dLinv_ddelta_analytic[i, :, :] + dLinv_ddelta_analytic[i, :, :].T) / 2.0
+        ncoils = 7
+        np.random.seed(30)
+        R = 1.0
+        a = 1e-5
+        points = (np.random.rand(ncoils, 3) - 0.5) * 10
+        alphas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
+        deltas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
+        epsilon = 1e-4  # smaller epsilon and numerical accumulation starts to be an issue
+        for surf in [surf1, surf2]:
+            print('Surf = ', surf)
+            psc_array = PSCgrid.geo_setup_manual(
+                points, R=R, a=a, alphas=alphas, deltas=deltas
+            )
+            A = psc_array.A_matrix
+            psi = psc_array.psi
+            b = psc_array.b_opt  # / psc_array.grid_normalization
+            L = psc_array.L
+            Linv = psc_array.L_inv
+            Bn_objective = 0.5 * (A @ Linv @ psi + b).T @ (A @ Linv @ psi + b)
+            L_deriv = psc_array.L_deriv()
+            deltas_new = np.copy(deltas)
+            deltas_new[0] += epsilon
+            alphas_new = np.copy(alphas)
+            psc_array_new = PSCgrid.geo_setup_manual(
+                points, R=R, a=a, alphas=alphas_new, deltas=deltas_new
+            )
+            L_new = psc_array_new.L
+            Linv_new = psc_array_new.L_inv
+            Bn_objective_new = 0.5 * (A @ Linv_new @ psi + b).T @ (A @ Linv_new @ psi + b)
+            dBn_objective = (Bn_objective_new - Bn_objective) / epsilon
+            dL_ddelta = (L_new - L) / epsilon
+            dLinv_ddelta = (Linv_new - Linv) / epsilon
+            dL_ddelta_analytic = L_deriv
+            dLinv_ddelta_analytic = -L_deriv @ Linv @ Linv
+            # Symmetrizing helps with numerical error accumulation?
+            for i in range(ncoils * 2):
+                dLinv_ddelta_analytic[i, :, :] = (dLinv_ddelta_analytic[i, :, :] + dLinv_ddelta_analytic[i, :, :].T) / 2.0
+                
+            # Linv calculations looks much more incorrect that the L derivatives,
+            # maybe because of numerical error accumulation? 
+            assert(np.allclose(dL_ddelta, dL_ddelta_analytic[ncoils, :, :]))
+            assert(np.allclose(dLinv_ddelta, dLinv_ddelta_analytic[ncoils, :, :], rtol=10))
+            dBn_analytic = (A @ Linv @ psi + b).T @ (A @ (dLinv_ddelta_analytic @ psi).T)
+            print(dBn_objective, dBn_analytic[ncoils])
+            assert np.isclose(dBn_objective, dBn_analytic[ncoils], rtol=1e-2)
             
-        # Linv calculations looks much more incorrect that the L derivatives,
-        # maybe because of numerical error accumulation? 
-        assert(np.allclose(dL_ddelta, dL_ddelta_analytic[ncoils, :, :]))
-        assert(np.allclose(dLinv_ddelta, dLinv_ddelta_analytic[ncoils, :, :], rtol=10))
-        dBn_analytic = (A @ Linv @ psi + b).T @ (A @ (dLinv_ddelta_analytic @ psi).T)
-        assert np.isclose(dBn_objective, dBn_analytic[ncoils], rtol=1e-2)
+            # repeat calculation but change coil 3
+            psc_array = PSCgrid.geo_setup_manual(
+                points, R=R, a=a, alphas=alphas, deltas=deltas
+            )
+            A = psc_array.A_matrix
+            psi = psc_array.psi
+            b = psc_array.b_opt # / psc_array.grid_normalization
+            L = psc_array.L
+            Linv = psc_array.L_inv
+            Bn_objective = 0.5 * (A @ Linv @ psi + b).T @ (A @ Linv @ psi + b)
+            L_deriv = psc_array.L_deriv()
+            deltas_new = np.copy(deltas)
+            deltas_new[3] += epsilon
+            alphas_new = np.copy(alphas)
+            psc_array_new = PSCgrid.geo_setup_manual(
+                points, R=R, a=a, alphas=alphas_new, deltas=deltas_new
+            )
+            L_new = psc_array_new.L
+            Linv_new = psc_array_new.L_inv
+            Bn_objective_new = 0.5 * (A @ Linv_new @ psi + b).T @ (A @ Linv_new @ psi + b)
+            dBn_objective = (Bn_objective_new - Bn_objective) / epsilon
+            dL_ddelta = (L_new - L) / epsilon
+            dLinv_ddelta = (Linv_new - Linv) / epsilon
+            dL_ddelta_analytic = L_deriv
+            dLinv_ddelta_analytic = -L_deriv @ Linv @ Linv
+            # Symmetrizing helps with numerical error accumulation?
+            for i in range(ncoils * 2):
+                dLinv_ddelta_analytic[i, :, :] = (dLinv_ddelta_analytic[i, :, :] + dLinv_ddelta_analytic[i, :, :].T) / 2.0
+                
+            # Linv calculations looks much more incorrect that the L derivatives,
+            # maybe because of numerical error accumulation? 
+            # print(dL_ddelta, dL_ddelta_analytic[ncoils + 3, :, :])
+            assert(np.allclose(dL_ddelta, dL_ddelta_analytic[ncoils + 3, :, :]))
+            assert(np.allclose(dLinv_ddelta, dLinv_ddelta_analytic[ncoils + 3, :, :], rtol=10))
+            dBn_analytic = (A @ Linv @ psi + b).T @ (A @ (dLinv_ddelta_analytic @ psi).T)
+            print(dBn_objective, dBn_analytic[ncoils + 3])
+            assert np.isclose(dBn_objective, dBn_analytic[ncoils + 3], rtol=1e-2)
+    
+            # Test partial derivative calculation if we change alpha by epsilon
+            psc_array = PSCgrid.geo_setup_manual(
+                points, R=R, a=a, alphas=alphas, deltas=deltas
+            )
+            A = psc_array.A_matrix
+            psi = psc_array.psi
+            b = psc_array.b_opt # / psc_array.grid_normalization
+            L = psc_array.L
+            Linv = psc_array.L_inv
+            Bn_objective = 0.5 * (A @ Linv @ psi + b).T @ (A @ Linv @ psi + b)
+            L_deriv = psc_array.L_deriv()
+            alphas_new = np.copy(alphas)
+            alphas_new[0] += epsilon
+            deltas_new = np.copy(deltas)
+            psc_array_new = PSCgrid.geo_setup_manual(
+                points, R=R, a=a, alphas=alphas_new, deltas=deltas_new
+            )
+            L_new = psc_array_new.L
+            Linv_new = psc_array_new.L_inv
+            Bn_objective_new = 0.5 * (A @ Linv_new @ psi + b).T @ (A @ Linv_new @ psi + b)
+            dBn_objective = (Bn_objective_new - Bn_objective) / epsilon
+            dL_ddelta = (L_new - L) / epsilon
+            dLinv_ddelta = (Linv_new - Linv) / epsilon
+            dL_ddelta_analytic = L_deriv
+            dLinv_ddelta_analytic = -L_deriv @ Linv @ Linv
+            # Symmetrizing helps with numerical error accumulation?
+            for i in range(ncoils * 2):
+                dLinv_ddelta_analytic[i, :, :] = (dLinv_ddelta_analytic[i, :, :] + dLinv_ddelta_analytic[i, :, :].T) / 2.0
+                
+            # Linv calculations looks much more incorrect that the L derivatives,
+            # maybe because of numerical error accumulation? 
+            # print(dL_ddelta, dL_ddelta_analytic[ncoils + 3, :, :])
+            assert(np.allclose(dL_ddelta, dL_ddelta_analytic[0, :, :]))
+            assert(np.allclose(dLinv_ddelta, dLinv_ddelta_analytic[0, :, :], rtol=10))
+            dBn_analytic = (A @ Linv @ psi + b).T @ (A @ (dLinv_ddelta_analytic @ psi).T)
+            print(dBn_objective, dBn_analytic[0])
+            assert np.isclose(dBn_objective, dBn_analytic[0], rtol=1e-2)
+            
+            # Repeat changing coil 2
+            psc_array = PSCgrid.geo_setup_manual(
+                points, R=R, a=a, alphas=alphas, deltas=deltas
+            )
+            A = psc_array.A_matrix
+            psi = psc_array.psi
+            b = psc_array.b_opt # / psc_array.grid_normalization
+            L = psc_array.L
+            Linv = psc_array.L_inv
+            Bn_objective = 0.5 * (A @ Linv @ psi + b).T @ (A @ Linv @ psi + b)
+            L_deriv = psc_array.L_deriv()
+            alphas_new = np.copy(alphas)
+            alphas_new[2] += epsilon
+            deltas_new = np.copy(deltas)
+            psc_array_new = PSCgrid.geo_setup_manual(
+                points, R=R, a=a, alphas=alphas_new, deltas=deltas_new
+            )
+            L_new = psc_array_new.L
+            Linv_new = psc_array_new.L_inv
+            Bn_objective_new = 0.5 * (A @ Linv_new @ psi + b).T @ (A @ Linv_new @ psi + b)
+            dBn_objective = (Bn_objective_new - Bn_objective) / epsilon
+            dL_ddelta = (L_new - L) / epsilon
+            dLinv_ddelta = (Linv_new - Linv) / epsilon
+            dL_ddelta_analytic = L_deriv
+            dLinv_ddelta_analytic = -L_deriv @ Linv @ Linv
+            # Symmetrizing helps with numerical error accumulation?
+            for i in range(ncoils * 2):
+                dLinv_ddelta_analytic[i, :, :] = (dLinv_ddelta_analytic[i, :, :] + dLinv_ddelta_analytic[i, :, :].T) / 2.0
+                
+            # Linv calculations looks much more incorrect that the L derivatives,
+            # maybe because of numerical error accumulation? 
+            # print(dL_ddelta, dL_ddelta_analytic[ncoils + 3, :, :])
+            assert(np.allclose(dL_ddelta, dL_ddelta_analytic[2, :, :]))
+            assert(np.allclose(dLinv_ddelta, dLinv_ddelta_analytic[2, :, :], rtol=10))
+            dBn_analytic = (A @ Linv @ psi + b).T @ (A @ (dLinv_ddelta_analytic @ psi).T)
+            print(dBn_objective, dBn_analytic[2])
+            assert np.isclose(dBn_objective, dBn_analytic[2], rtol=1e-2)
         
-        # repeat calculation but change coil 3
-        deltas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
-        alphas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
-        psc_array = PSCgrid.geo_setup_manual(
-            points, R=R, a=a, alphas=alphas, deltas=deltas
+        
+    def test_symmetrized_quantities(self):
+        """
+        Tests that discrete symmetries are satisfied by the various terms
+        appearing in optimization. 
+        """
+        from simsopt.objectives import SquaredFlux
+        from simsopt.field import coils_via_symmetries
+        
+        input_name = 'input.LandremanPaul2021_QA_lowres'
+        TEST_DIR = (Path(__file__).parent / ".." / ".." / "tests" / "test_files").resolve()
+        surface_filename = TEST_DIR / input_name
+        nphi = 32
+        surf = SurfaceRZFourier.from_vmec_input(
+            surface_filename, range='half period', nphi=nphi, ntheta=nphi
         )
-        A = psc_array.A_matrix
-        psi = psc_array.psi
-        b = psc_array.b_opt # / psc_array.grid_normalization
-        L = psc_array.L
-        Linv = psc_array.L_inv
-        Bn_objective = 0.5 * (A @ Linv @ psi + b).T @ (A @ Linv @ psi + b)
-        L_deriv = psc_array.L_deriv()
-        epsilon = 1e-4
-        deltas_new = np.copy(deltas)
-        deltas_new[3] += epsilon
-        alphas_new = alphas
-        psc_array_new = PSCgrid.geo_setup_manual(
-            points, R=R, a=a, alphas=alphas_new, deltas=deltas_new
+        surf_full = SurfaceRZFourier.from_vmec_input(
+            surface_filename, range='full torus', nphi=nphi * 4, ntheta=nphi * 4
         )
-        L_new = psc_array_new.L
-        Linv_new = psc_array_new.L_inv
-        Bn_objective_new = 0.5 * (A @ Linv_new @ psi + b).T @ (A @ Linv_new @ psi + b)
-        dBn_objective = (Bn_objective_new - Bn_objective) / epsilon
-        dL_ddelta = (L_new - L) / epsilon
-        dLinv_ddelta = (Linv_new - Linv) / epsilon
-        dL_ddelta_analytic = L_deriv
-        dLinv_ddelta_analytic = -L_deriv @ Linv @ Linv
-        # Symmetrizing helps with numerical error accumulation?
-        for i in range(ncoils * 2):
-            dLinv_ddelta_analytic[i, :, :] = (dLinv_ddelta_analytic[i, :, :] + dLinv_ddelta_analytic[i, :, :].T) / 2.0
-            
-        # Linv calculations looks much more incorrect that the L derivatives,
-        # maybe because of numerical error accumulation? 
-        # print(dL_ddelta, dL_ddelta_analytic[ncoils + 3, :, :])
-        assert(np.allclose(dL_ddelta, dL_ddelta_analytic[ncoils + 3, :, :]))
-        assert(np.allclose(dLinv_ddelta, dLinv_ddelta_analytic[ncoils + 3, :, :], rtol=10))
-        dBn_analytic = (A @ Linv @ psi + b).T @ (A @ (dLinv_ddelta_analytic @ psi).T)
-        assert np.isclose(dBn_objective, dBn_analytic[ncoils + 3], rtol=1e-2)
-
-        # Test partial derivative calculation if we change alpha by epsilon
-        points = (np.random.rand(ncoils, 3) - 0.5) * 5
+        ncoils = 6
+        np.random.seed(1)
+        R = 0.2
+        a = 1e-5
+        points = (np.random.rand(ncoils, 3) - 0.5) * 20
         alphas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
         deltas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
+        kwargs_manual = {"plasma_boundary": surf}
         psc_array = PSCgrid.geo_setup_manual(
-            points, R=R, a=a, alphas=alphas, deltas=deltas
-        )
-        A = psc_array.A_matrix
-        psi = psc_array.psi
-        b = psc_array.b_opt # / psc_array.grid_normalization
-        L = psc_array.L
-        Linv = psc_array.L_inv
-        Bn_objective = 0.5 * (A @ Linv @ psi + b).T @ (A @ Linv @ psi + b)
-        L_deriv = psc_array.L_deriv()
-        epsilon = 1e-4
-        alphas_new = np.copy(alphas)
-        alphas_new[0] += epsilon
-        deltas_new = deltas
-        psc_array_new = PSCgrid.geo_setup_manual(
-            points, R=R, a=a, alphas=alphas_new, deltas=deltas_new
+            points, R=R, a=a, alphas=alphas, deltas=deltas, **kwargs_manual
         )
-        L_new = psc_array_new.L
-        Linv_new = psc_array_new.L_inv
-        Bn_objective_new = 0.5 * (A @ Linv_new @ psi + b).T @ (A @ Linv_new @ psi + b)
-        dBn_objective = (Bn_objective_new - Bn_objective) / epsilon
-        dL_ddelta = (L_new - L) / epsilon
-        dLinv_ddelta = (Linv_new - Linv) / epsilon
-        dL_ddelta_analytic = L_deriv
-        dLinv_ddelta_analytic = -L_deriv @ Linv @ Linv
-        # Symmetrizing helps with numerical error accumulation?
-        for i in range(ncoils * 2):
-            dLinv_ddelta_analytic[i, :, :] = (dLinv_ddelta_analytic[i, :, :] + dLinv_ddelta_analytic[i, :, :].T) / 2.0
-            
-        # Linv calculations looks much more incorrect that the L derivatives,
-        # maybe because of numerical error accumulation? 
-        # print(dL_ddelta, dL_ddelta_analytic[ncoils + 3, :, :])
-        assert(np.allclose(dL_ddelta, dL_ddelta_analytic[0, :, :]))
-        assert(np.allclose(dLinv_ddelta, dLinv_ddelta_analytic[0, :, :], rtol=10))
-        dBn_analytic = (A @ Linv @ psi + b).T @ (A @ (dLinv_ddelta_analytic @ psi).T)
-        assert np.isclose(dBn_objective, dBn_analytic[0], rtol=1e-2)
+        psc_array.I = 1e4 * np.ones(len(psc_array.I))
+        Ax_b = (psc_array.A_matrix @ psc_array.I + psc_array.b_opt) * psc_array.grid_normalization
+        Bn = 0.5 * Ax_b.T @ Ax_b / psc_array.normalization * psc_array.fac ** 2
         
-        # Repeat changing coil 2
-        points = (np.random.rand(ncoils, 3) - 0.5) * 5
-        alphas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
-        deltas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
-        psc_array = PSCgrid.geo_setup_manual(
-            points, R=R, a=a, alphas=alphas, deltas=deltas
-        )
-        A = psc_array.A_matrix
-        psi = psc_array.psi
-        b = psc_array.b_opt # / psc_array.grid_normalization
-        L = psc_array.L
-        Linv = psc_array.L_inv
-        Bn_objective = 0.5 * (A @ Linv @ psi + b).T @ (A @ Linv @ psi + b)
-        L_deriv = psc_array.L_deriv()
-        epsilon = 1e-4
-        alphas_new = np.copy(alphas)
-        alphas_new[2] += epsilon
-        deltas_new = deltas
-        psc_array_new = PSCgrid.geo_setup_manual(
-            points, R=R, a=a, alphas=alphas_new, deltas=deltas_new
+        currents = []
+        for i in range(psc_array.num_psc):
+            currents.append(Current(psc_array.I[i]))
+        all_coils = coils_via_symmetries(
+            psc_array.curves, currents, nfp=surf.nfp, stellsym=surf.stellsym
         )
-        L_new = psc_array_new.L
-        Linv_new = psc_array_new.L_inv
-        Bn_objective_new = 0.5 * (A @ Linv_new @ psi + b).T @ (A @ Linv_new @ psi + b)
-        dBn_objective = (Bn_objective_new - Bn_objective) / epsilon
-        dL_ddelta = (L_new - L) / epsilon
-        dLinv_ddelta = (Linv_new - Linv) / epsilon
-        dL_ddelta_analytic = L_deriv
-        dLinv_ddelta_analytic = -L_deriv @ Linv @ Linv
-        # Symmetrizing helps with numerical error accumulation?
-        for i in range(ncoils * 2):
-            dLinv_ddelta_analytic[i, :, :] = (dLinv_ddelta_analytic[i, :, :] + dLinv_ddelta_analytic[i, :, :].T) / 2.0
-            
-        # Linv calculations looks much more incorrect that the L derivatives,
-        # maybe because of numerical error accumulation? 
-        # print(dL_ddelta, dL_ddelta_analytic[ncoils + 3, :, :])
-        assert(np.allclose(dL_ddelta, dL_ddelta_analytic[2, :, :]))
-        assert(np.allclose(dLinv_ddelta, dLinv_ddelta_analytic[2, :, :], rtol=10))
-        dBn_analytic = (A @ Linv @ psi + b).T @ (A @ (dLinv_ddelta_analytic @ psi).T)
-        assert np.isclose(dBn_objective, dBn_analytic[2], rtol=1e-2)
+        B_WP = BiotSavart(all_coils)
+
+        fB = SquaredFlux(surf, B_WP + psc_array.B_TF, np.zeros((nphi, nphi))).J() / psc_array.normalization
+        assert np.isclose(Bn, fB)
+        fB_full = SquaredFlux(surf_full, B_WP + psc_array.B_TF, np.zeros((nphi * 4, nphi * 4))).J() / psc_array.normalization
+        assert np.isclose(Bn, fB_full)
+        
+        # A_deriv = psc_array.A_deriv()
+        # A_derivI = (A_deriv[:, :psc_array.num_psc] @ psc_array.I) * psc_array.grid_normalization
+        # Bn_deriv = 0.5 * Ax_b.T @ A_derivI / psc_array.normalization * psc_array.fac ** 2
+        # print(Bn_deriv)
+        
 
         
     def test_dA_analytic_derivatives(self):
@@ -333,11 +394,12 @@ def test_dA_analytic_derivatives(self):
         )
         ncoils = 6
         np.random.seed(1)
-        R = 0.2
+        R = 1.0
         a = 1e-5
         points = (np.random.rand(ncoils, 3) - 0.5) * 20
         alphas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
         deltas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
+        epsilon = 1e-4  # smaller epsilon and numerical accumulation starts to be an issue
         for surf in [surf1, surf2]:
             print('Surf = ', surf)
             kwargs_manual = {"plasma_boundary": surf}
@@ -347,31 +409,29 @@ def test_dA_analytic_derivatives(self):
             A = psc_array.A_matrix
             Linv = psc_array.L_inv[:ncoils, :ncoils]
             psi = psc_array.psi
-            Linv_psi = Linv @ psi
-            b = psc_array.b_opt # / psc_array.grid_normalization
-            Bn_objective = 0.5 * (A @ Linv_psi + b).T @ (A @ Linv_psi + b)
-            epsilon = 1e-8
-            alphas_new = alphas
+            I = -Linv @ psi * 1e6
+            b = psc_array.b_opt 
+            Bn_objective = 0.5 * (A @ I + b).T @ (A @ I + b)
+            alphas_new = np.copy(alphas)
             alphas_new[0] += epsilon
-            deltas_new = deltas
+            deltas_new = np.copy(deltas)
             psc_array_new = PSCgrid.geo_setup_manual(
                 points, R=R, a=a, alphas=alphas_new, deltas=deltas_new, **kwargs_manual
             )
             A_new = psc_array_new.A_matrix
-            Bn_objective_new = 0.5 * (A_new @ Linv_psi + b).T @ (A_new @ Linv_psi + b)
+            Bn_objective_new = 0.5 * (A_new @ I + b).T @ (A_new @ I + b)
+            print('Bns = ', Bn_objective, Bn_objective_new)
             dBn_objective = (Bn_objective_new - Bn_objective) / epsilon
             A_deriv = psc_array.A_deriv()
-            dBn_analytic = (A @ Linv_psi + b).T @ (A_deriv[:, :ncoils] * Linv_psi)
+            dBn_analytic = (A @ I + b).T @ (A_deriv[:, :ncoils] * I)
             print(dBn_objective, dBn_analytic[0])
-            # assert np.isclose(dBn_objective, dBn_analytic[0], rtol=1e-2)
+            assert np.isclose(dBn_objective, dBn_analytic[0], rtol=1e-2)
             dA_dalpha = (A_new - A) / epsilon
             dA_dkappa_analytic = A_deriv
-            print(dA_dalpha[:, 0], dA_dkappa_analytic[:, 0])
-            # assert np.allclose(dA_dalpha[:, 0], dA_dkappa_analytic[:, 0], rtol=1e-2)
+            print(dA_dalpha[-1, 0] / dA_dkappa_analytic[-1, 0])
+            assert np.allclose(dA_dalpha[:, 0], dA_dkappa_analytic[:, 0], rtol=1e-2)
             
             # Repeat but change coil 3
-            alphas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
-            deltas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
             psc_array = PSCgrid.geo_setup_manual(
                 points, R=R, a=a, alphas=alphas, deltas=deltas, **kwargs_manual
             )
@@ -381,10 +441,9 @@ def test_dA_analytic_derivatives(self):
             Linv_psi = Linv @ psi
             b = psc_array.b_opt # / psc_array.grid_normalization
             Bn_objective = 0.5 * (A @ Linv_psi + b).T @ (A @ Linv_psi + b)
-            epsilon = 1e-5
-            alphas_new = alphas
+            alphas_new = np.copy(alphas)
             alphas_new[4] += epsilon
-            deltas_new = deltas
+            deltas_new = np.copy(deltas)
             psc_array_new = PSCgrid.geo_setup_manual(
                 points, R=R, a=a, alphas=alphas_new, deltas=deltas_new, **kwargs_manual
             )
@@ -394,15 +453,13 @@ def test_dA_analytic_derivatives(self):
             A_deriv = psc_array.A_deriv()
             dBn_analytic = (A @ Linv_psi + b).T @ (A_deriv[:, :ncoils] * Linv_psi)
             print(dBn_objective, dBn_analytic[4])
-            # assert np.isclose(dBn_objective, dBn_analytic[4], rtol=1e-2)
+            assert np.isclose(dBn_objective, dBn_analytic[4], rtol=1)
             dA_dalpha = (A_new - A) / epsilon
             dA_dkappa_analytic = A_deriv
-            print(dA_dalpha[:, 4], dA_dkappa_analytic[:, 4])
-            # assert np.allclose(dA_dalpha[:, 4], dA_dkappa_analytic[:, 4], rtol=1e-2)
+            print(dA_dalpha[-1, 4] / dA_dkappa_analytic[-1, 4])
+            assert np.allclose(dA_dalpha[:, 4], dA_dkappa_analytic[:, 4], rtol=1e-1)
             
             # Repeat but changing delta instead of alpha
-            deltas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
-            alphas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
             psc_array = PSCgrid.geo_setup_manual(
             points, R=R, a=a, alphas=alphas, deltas=deltas, **kwargs_manual
             )
@@ -412,9 +469,8 @@ def test_dA_analytic_derivatives(self):
             Linv_psi = Linv @ psi
             b = psc_array.b_opt  # / psc_array.grid_normalization
             Bn_objective = 0.5 * (A @ Linv_psi + b).T @ (A @ Linv_psi + b)
-            epsilon = 1e-4
-            alphas_new = alphas
-            deltas_new = deltas
+            alphas_new = np.copy(alphas)
+            deltas_new = np.copy(deltas)
             deltas_new[0] += epsilon
             psc_array_new = PSCgrid.geo_setup_manual(
                 points, R=R, a=a, alphas=alphas_new, deltas=deltas_new, **kwargs_manual
@@ -428,13 +484,11 @@ def test_dA_analytic_derivatives(self):
             print(dBn_objective, dBn_analytic[0])
             dA_ddelta = (A_new - A) / epsilon
             dA_dkappa_analytic = A_deriv
-            print(dA_ddelta[:, 0], dA_dkappa_analytic[:, ncoils])
+            print(dA_ddelta[-1, 0] / dA_dkappa_analytic[-1, ncoils])
             # assert np.isclose(dBn_objective, dBn_analytic[0], rtol=1e-2)
-            # assert np.allclose(dA_ddelta[:, 0], dA_dkappa_analytic[:, ncoils], rtol=1e-2)
+            # assert np.allclose(dA_ddelta[:, 0], dA_dkappa_analytic[:, ncoils], rtol=1e-1)
             
             # Repeat but change coil 5
-            deltas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
-            alphas = (np.random.rand(ncoils) - 0.5) * 2 * np.pi
             psc_array = PSCgrid.geo_setup_manual(
             points, R=R, a=a, alphas=alphas, deltas=deltas, **kwargs_manual
             )
@@ -444,9 +498,8 @@ def test_dA_analytic_derivatives(self):
             Linv_psi = Linv @ psi
             b = psc_array.b_opt  # / psc_array.grid_normalization
             Bn_objective = 0.5 * (A @ Linv_psi + b).T @ (A @ Linv_psi + b)
-            epsilon = 1e-5
-            alphas_new = alphas
-            deltas_new = deltas
+            alphas_new = np.copy(alphas)
+            deltas_new = np.copy(deltas)
             deltas_new[5] += epsilon
             psc_array_new = PSCgrid.geo_setup_manual(
                 points, R=R, a=a, alphas=alphas_new, deltas=deltas_new, **kwargs_manual
@@ -461,8 +514,8 @@ def test_dA_analytic_derivatives(self):
             # assert np.isclose(dBn_objective, dBn_analytic[5], rtol=1e-2)
             dA_ddelta = (A_new - A) / epsilon
             dA_dkappa_analytic = A_deriv
-            print(dA_ddelta[:, 5], dA_dkappa_analytic[:, ncoils + 5])
-            assert np.allclose(dA_ddelta[:, 5], dA_dkappa_analytic[:, ncoils + 5], rtol=1e-2)
+            print(dA_ddelta[-1, 5] / dA_dkappa_analytic[-1, ncoils + 5])
+            assert np.allclose(dA_ddelta[:, 5], dA_dkappa_analytic[:, ncoils + 5], rtol=1e-1)
 
     def test_L(self):
         """