diff --git a/MakefileHPC b/MakefileHPC
new file mode 100644
index 0000000..c6bef7b
--- /dev/null
+++ b/MakefileHPC
@@ -0,0 +1,251 @@
+# to find path to openmpi, type mpiexec --test , the error messege reveals path
+MKL_HOME = /apps/cent7/intel/compilers_and_libraries_2017.1.132/linux/mkl
+BROWN_MPI_HOME = /apps/brown/openmpi/2.1.0/gcc-5.2.0
+SNYDER_MPI_HOME = /apps/snyder7/openmpi/2.1.0_gcc-5.2.0
+MPI_HOME = $(SNYDER_MPI_HOME)
+
+BLAS_LIB = -L$(MKL_HOME)/lib/intel64 -Wl,-rpath=$(MKL_HOME)/lib/intel64 -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp
+LAPACK_LIB = -L$(MKL_HOME)/lib/intel64 -Wl,-rpath=$(MKL_HOME)/lib/intel64 -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core
+
+EXTRA_FLAGS = '-Wl,-rpath=$(MKL_HOME)/lib/intel64' ,  '-Wl,-rpath=$(MPI_HOME)/lib'
+
+EXTRA_DIRS = '$(MKL_HOME)/lib/intel64' , '$(MPI_HOME)/lib'
+
+#BLAS_LIB = -lblas
+#LAPACK_LIB = -llapack
+
+# OPTIONAL
+# Typically if installed,
+#  FFTW3_INC can be left empty
+#  FFTW3_LIB = -lfftw3 
+#  or, if Fedora and/or fftw is version 3 but named fftw rather than fftw3
+#  FTW3_LIB = -lfftw 
+#  May need to link libraries properly as with blas and lapack above
+FFTW3_INC = -I$(MLK_HOME)/include/fftw/
+FFTW3_LIB = $(BLAS_LIB)
+
+# Typically,
+#  PTHREAD_INC = -DHAVE_UNISTD_H
+#  PTHREAD_LIB = -lpthread
+PTHREAD_INC = -DHAVE_UNISTD_H
+PTHREAD_LIB = -lpthread
+
+# OPTIONAL
+# If not installed:
+# Fedora: dnf install libsuitsparse-devel
+# Typically, if installed:
+#CHOLMOD_INC = -I/usr/include/suitesparse
+#CHOLMOD_LIB = -lcholmod -lamd -lcolamd -lcamd -lccolamd
+#CHOLMOD_INC = -I/usr/include/suitesparse
+#CHOLMOD_LIB = -lcholmod -lamd -lcolamd -lcamd -lccolamd
+
+# Specify the MPI library
+# For example, on Fedora: dnf  install openmpi-devel
+#MPI_INC = -I/usr/include/openmpi-x86_64/openmpi/ompi
+#MPI_LIB = -lmpi
+# or, explicitly link to the library with -L, example below
+MPI_LIB = -L$(MPI_HOME)/lib -Wl,-rpath=$(MPI_HOME)/lib -lmpi
+MPI_INC = -I$(MPI_HOME)/include
+#MPI_LIB = -L/usr/lib64/openmpi/lib/libmpi.so
+
+# Specify custom compilers if needed
+CXX = $(MPI_HOME)/bin/mpic++
+CC  = $(MPI_HOME)/bin/mpicc
+
+#CFLAGS += -O3 -fPIC
+CFLAGS = -O3 -msse3 -msse2 -msse -fPIC
+
+# options for Sampler module
+OPTFLAGS = -O3
+
+OBJDIR = ./build
+S4_BINNAME = $(OBJDIR)/S4
+S4_LIBNAME = $(OBJDIR)/libS4.a
+S4r_LIBNAME = $(OBJDIR)/libS4r.a
+
+##################### DO NOT EDIT BELOW THIS LINE #####################
+
+#### Set the compilation flags
+
+CPPFLAGS = -I. -IS4 -IS4/RNP -IS4/kiss_fft
+
+ifdef BLAS_LIB
+CPPFLAGS += -DHAVE_BLAS
+endif
+
+ifdef LAPACK_LIB
+CPPFLAGS += -DHAVE_LAPACK
+endif
+
+ifdef FFTW3_LIB
+CPPFLAGS += -DHAVE_FFTW3 $(FFTW3_INC)
+endif
+
+ifdef PTHREAD_LIB
+CPPFLAGS += -DHAVE_LIBPTHREAD $(PTHREAD_INC)
+endif
+
+ifdef CHOLMOD_LIB
+CPPFLAGS += -DHAVE_LIBCHOLMOD $(CHOLMOD_INC)
+endif
+
+ifdef MPI_LIB
+CPPFLAGS += -DHAVE_MPI $(MPI_INC)
+endif
+
+LIBS = $(BLAS_LIB) $(LAPACK_LIB) $(FFTW3_LIB) $(PTHREAD_LIB) $(CHOLMOD_LIB) $(MPI_LIB)
+
+#### Compilation targets
+
+all: $(S4_LIBNAME)
+
+objdir:
+	mkdir -p $(OBJDIR)
+	mkdir -p $(OBJDIR)/S4k
+	mkdir -p $(OBJDIR)/S4r
+	mkdir -p $(OBJDIR)/modules
+	
+S4_LIBOBJS = \
+	$(OBJDIR)/S4k/S4.o \
+	$(OBJDIR)/S4k/rcwa.o \
+	$(OBJDIR)/S4k/fmm_common.o \
+	$(OBJDIR)/S4k/fmm_FFT.o \
+	$(OBJDIR)/S4k/fmm_kottke.o \
+	$(OBJDIR)/S4k/fmm_closed.o \
+	$(OBJDIR)/S4k/fmm_PolBasisNV.o \
+	$(OBJDIR)/S4k/fmm_PolBasisVL.o \
+	$(OBJDIR)/S4k/fmm_PolBasisJones.o \
+	$(OBJDIR)/S4k/fmm_experimental.o \
+	$(OBJDIR)/S4k/fft_iface.o \
+	$(OBJDIR)/S4k/pattern.o \
+	$(OBJDIR)/S4k/intersection.o \
+	$(OBJDIR)/S4k/predicates.o \
+	$(OBJDIR)/S4k/numalloc.o \
+	$(OBJDIR)/S4k/gsel.o \
+	$(OBJDIR)/S4k/sort.o \
+	$(OBJDIR)/S4k/kiss_fft.o \
+	$(OBJDIR)/S4k/kiss_fftnd.o \
+	$(OBJDIR)/S4k/SpectrumSampler.o \
+	$(OBJDIR)/S4k/cubature.o \
+	$(OBJDIR)/S4k/Interpolator.o \
+	$(OBJDIR)/S4k/convert.o
+
+S4r_LIBOBJS = \
+	$(OBJDIR)/S4r/Material.o \
+	$(OBJDIR)/S4r/LatticeGridRect.o \
+	$(OBJDIR)/S4r/LatticeGridArb.o \
+	$(OBJDIR)/S4r/POFF2Mesh.o \
+	$(OBJDIR)/S4r/PeriodicMesh.o \
+	$(OBJDIR)/S4r/Shape.o \
+	$(OBJDIR)/S4r/Simulation.o \
+	$(OBJDIR)/S4r/Layer.o \
+	$(OBJDIR)/S4r/Pseudoinverse.o \
+	$(OBJDIR)/S4r/Eigensystems.o \
+	$(OBJDIR)/S4r/IRA.o \
+	$(OBJDIR)/S4r/intersection.o \
+	$(OBJDIR)/S4r/predicates.o \
+	$(OBJDIR)/S4r/periodic_off2.o
+
+ifndef LAPACK_LIB
+  S4_LIBOBJS += $(OBJDIR)/S4k/Eigensystems.o
+endif
+
+$(S4_LIBNAME): objdir $(S4_LIBOBJS)
+	$(AR) crvs $@ $(S4_LIBOBJS)
+$(S4r_LIBNAME): objdir $(S4r_LIBOBJS)
+	$(AR) crvs $@ $(S4r_LIBOBJS)
+
+$(OBJDIR)/S4k/S4.o: S4/S4.cpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/rcwa.o: S4/rcwa.cpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/fmm_common.o: S4/fmm/fmm_common.cpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/fmm_FFT.o: S4/fmm/fmm_FFT.cpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/fmm_kottke.o: S4/fmm/fmm_kottke.cpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/fmm_closed.o: S4/fmm/fmm_closed.cpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/fmm_PolBasisNV.o: S4/fmm/fmm_PolBasisNV.cpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/fmm_PolBasisVL.o: S4/fmm/fmm_PolBasisVL.cpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/fmm_PolBasisJones.o: S4/fmm/fmm_PolBasisJones.cpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/fmm_experimental.o: S4/fmm/fmm_experimental.cpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/fft_iface.o: S4/fmm/fft_iface.cpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/pattern.o: S4/pattern/pattern.c
+	$(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/intersection.o: S4/pattern/intersection.c
+	$(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/predicates.o: S4/pattern/predicates.c
+	$(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/numalloc.o: S4/numalloc.c
+	$(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/gsel.o: S4/gsel.c
+	$(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/sort.o: S4/sort.c
+	$(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/kiss_fft.o: S4/kiss_fft/kiss_fft.c
+	$(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/kiss_fftnd.o: S4/kiss_fft/tools/kiss_fftnd.c
+	$(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/SpectrumSampler.o: S4/SpectrumSampler.c
+	$(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/cubature.o: S4/cubature.c
+	$(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/Interpolator.o: S4/Interpolator.c
+	$(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/convert.o: S4/convert.c
+	$(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4k/Eigensystems.o: S4/RNP/Eigensystems.cpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+
+
+$(OBJDIR)/S4r/Material.o: S4r/Material.cpp S4r/Material.hpp S4r/Types.hpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@
+$(OBJDIR)/S4r/LatticeGridRect.o: S4r/LatticeGridRect.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@
+$(OBJDIR)/S4r/LatticeGridArb.o: S4r/LatticeGridArb.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@
+$(OBJDIR)/S4r/POFF2Mesh.o: S4r/POFF2Mesh.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@
+$(OBJDIR)/S4r/PeriodicMesh.o: S4r/PeriodicMesh.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@
+$(OBJDIR)/S4r/Shape.o: S4r/Shape.cpp S4r/Shape.hpp S4r/Types.hpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@
+$(OBJDIR)/S4r/Simulation.o: S4r/Simulation.cpp S4r/Simulation.hpp S4r/StarProduct.hpp S4r/Types.hpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@
+$(OBJDIR)/S4r/Layer.o: S4r/Layer.cpp S4r/Layer.hpp S4r/Types.hpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@
+$(OBJDIR)/S4r/Pseudoinverse.o: S4r/Pseudoinverse.cpp S4r/Pseudoinverse.hpp S4r/Types.hpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@
+$(OBJDIR)/S4r/Eigensystems.o: S4r/Eigensystems.cpp S4r/Eigensystems.hpp S4r/Types.hpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@
+$(OBJDIR)/S4r/IRA.o: S4r/IRA.cpp S4r/IRA.hpp S4r/Types.hpp
+	$(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@
+$(OBJDIR)/S4r/intersection.o: S4r/intersection.c S4r/intersection.h
+	$(CC) -c -O3 $< -o $@
+$(OBJDIR)/S4r/periodic_off2.o: S4r/periodic_off2.c S4r/periodic_off2.h 
+	$(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@
+$(OBJDIR)/S4r/predicates.o: S4r/predicates.c
+	$(CC) -c -O3 $< -o $@
+	
+#### Python extension
+
+check:
+	@echo $(LIBS)
+
+S4_pyext: objdir $(S4_LIBNAME)
+	echo "Linking...."
+	echo $(EXTRA_FLAGS) $(EXTRA_DIRS)
+	sh gensetupHPC.py.sh $(OBJDIR) $(S4_LIBNAME) "$(LIBS)" "$(EXTRA_FLAGS)" "$(EXTRA_DIRS)"
+	python setup.py build
+	#pip3 install --upgrade ./
+
+clean:
+	rm S4.so
+	rm -rf $(OBJDIR)
diff --git a/PythonTest.py b/PythonTest.py
new file mode 100644
index 0000000..681d145
--- /dev/null
+++ b/PythonTest.py
@@ -0,0 +1,122 @@
+
+# all lengths normalized in terms of a 
+# frequency normalized in terms of 2*pi*c/a
+
+# this test file plots a frequency spectrum and field plot for the python extension
+
+import S4
+import numpy as np
+import matplotlib.pyplot as plt
+
+a = 1e-6; #1nm
+c_const = 3e8;
+print('Bontempi, Kivshar et al. \"Highly sensitive biosensors based on all-dielectric nanoresonator\"')
+p = 0.96
+d = 0.73
+h = 0.22
+period = [p,p]
+S = S4.New(Lattice=((period[0],0),(0,period[1])),NumBasis=20)
+S.SetMaterial(Name = 'SiO2',Epsilon = (1.45 + 0.0j)**2)
+S.SetMaterial(Name = 'Si',Epsilon = (3.487+0.0j)**2)
+S.SetMaterial(Name = 'Vacuum',Epsilon = (1 + 0j)**2)
+
+S.AddLayer(Name = 'AirAbove',Thickness = 0, Material = 'Vacuum')
+S.AddLayer(Name = 'AirDisp',Thickness = 1, Material = 'Vacuum')
+S.AddLayer(Name = 'Si_disks',Thickness = h,Material = 'Vacuum')
+
+S.SetRegionCircle(
+    Layer = 'Si_disks',
+    Material = 'Si',
+    Center = (0,0),
+    Radius = d/2
+)
+
+S.AddLayer(Name = 'Glass_Below',Thickness = 2, Material = 'SiO2')
+#S.AddLayerCopy(Name = 'AirBelow',Thickness = 0, Layer = 'AirAbove')
+S.AddLayer(Name = 'AirBelow',Thickness = 0,Material = 'Vacuum')
+
+S.SetExcitationPlanewave(
+    IncidenceAngles=(
+        0,# polar angle in [0,180)
+        0 # azimuthal angle in [0,360)
+    ),
+    sAmplitude = 0,
+    pAmplitude = 1,
+    Order = 0
+)
+
+S.SetOptions(
+    PolarizationDecomposition = True
+)
+
+wavelength_um = float(wavelength_p_nm)/1000; # [8.0,14.0]
+freq = 1 / float(wavelength_um);  
+S.SetFrequency(freq)  #unit 2*pi*c_const / a
+forward,backward = S.GetPowerFlux(Layer = 'AirAbove', zOffset = 0) # reflected power
+forward = S.GetPowerFlux(Layer = 'AirBelow',zOffset = 0)
+print('f='+str(1/freq)+', Power_forward= '+str(forward[0].real)+', Power_backward='+str(backward.real))
+
+# get fields at points
+
+
+
+
+# frequency sweep
+wavelength_space = np.linspace(1.3, 1.7, 200)
+
+R = wavelength_space * 0
+T=R
+i=0
+for lam in wavelength_space:
+    f = 1 / float(lam)
+    S.SetFrequency(f)
+    (forw,back) = S.GetPowerFlux(Layer = 'AirAbove', zOffset = 0)
+    forw = S.GetPowerFlux(Layer = 'AirBelow', zOffset = 0)
+    R[i] = np.abs(forw[0])
+    #T[i] = np.abs(back)
+    i +=1
+
+plt.plot(wavelength_space, R)
+#plt.hold
+#plt.plot(wavelength_space, T)
+plt.xlabel('Wavelength (um)')
+plt.ylabel('T')
+plt.title('Figure 3(a)')
+
+#field map
+
+wavelength_p_nm = 1400;
+wavelength_um = float(wavelength_p_nm)/1000; # [8.0,14.0]
+freq = 1 / float(wavelength_um);  
+S.SetFrequency(freq)  #unit 2*pi*c_const / a
+x_space = np.linspace(-p/2,p/2,300)
+z_space = np.linspace(0.8,1.4,300)
+
+X,Z = np.meshgrid(x_space,z_space)
+Ey = np.zeros((X.shape[0], X.shape[1]))
+xc = 0
+zc = 0
+
+for x in x_space:
+    zc=0
+    #print('x='+str(x)+'\n')
+    for z in z_space:
+        #print('z='+str(z)+'\tzc='+str(zc)+'\n')
+        E,H = S.GetFields(x,0,z)
+        Ey[zc,xc] = np.abs(H[1]) ** 2 + np.abs(H[2]) ** 2 + np.abs(H[0]) ** 2
+        #print(str(x)+'\t'+str(z)+'\t'+str(abs(E[1])))
+        zc += 1
+    xc += 1
+    
+fig = plt.figure()
+cm = np.linspace(np.min(Ey),np.max(Ey), 100)
+plt.contourf(x_space, z_space, Ey, cm)
+plt.plot([-p/2, p/2], [1, 1], color='w', linestyle='-', linewidth=2)
+plt.plot([-d/2, d/2], [1+h, 1+h], color='w', linestyle='-', linewidth=2)
+plt.plot([-d/2, -d/2], [1, 1+h], color='w', linestyle='-', linewidth=2)
+plt.plot([d/2, d/2], [1, 1+h], color='w', linestyle='-', linewidth=2)
+plt.title('Figure 3(c) - |H|^2 - lam = ' + str(wavelength_p_nm))
+plt.show(block=True)
+
+
+print('If you can read this, the configuration work is finished!')
diff --git a/README b/README
index 5969182..a04acad 100644
--- a/README
+++ b/README
@@ -8,3 +8,14 @@ See the S4 manual, in doc/index.html, for a complete
 description of the package and its user interface, as well as
 installation instructions, the license and copyright, contact
 addresses, and other important information.
+
+Sajidmc:
+This fork focuses on HPC compilation, especially on Purdue University clusters.
+We are only using the python extension. 
+
+Note: you must edit the MakefileHPC and manually check/add the paths to each of the libraries
+
+Usage: 
+
+make -f MakefileHPC clean
+make -f MakefileHPC S4_pyext
diff --git a/S4/rcwa.cpp b/S4/rcwa.cpp
index 685f450..5e1a6e3 100644
--- a/S4/rcwa.cpp
+++ b/S4/rcwa.cpp
@@ -44,7 +44,8 @@
 //# define DUMP_STREAM (omega.real() > 1.91637 ? std::cerr : std::cout)
 # include <IO.h>
 #endif
-#include <cstdio>
+# include <IO.h>
+# include <cstdio>
 
 #include <numalloc.h>
 
@@ -79,165 +80,22 @@ static void SingularLinearSolve(
 	integer lwork = (integer)dummy.real();
 	std::complex<double> *work = (std::complex<double>*)rcwa_malloc(sizeof(std::complex<double>) * lwork);
 	double *rwork = (double*)rcwa_malloc(sizeof(double) * 6*m);
-
+	
 	zgelss_(m, n, nRHS, a, lda, b, ldb, rwork, rcond, &rank, work, lwork, rwork+m, &info);
-
+	
 	rcwa_free(rwork);
 	rcwa_free(work);
 #else
 	RNP::LinearSolve<'N'>(n, nRHS, a, lda, b, ldb);
 #endif
 }
-
-
-template <typename T>
-static void Swap(int n, T* x, int incx, T* y, int incy){
-	while(n --> 0){
-		std::swap(*x, *y);
-		x += incx; y += incy;
-	}
-}
-
-template <typename A, typename T>
-static void Axpy(int n, const A &alpha, const T *x, int incx, T *y, int incy){
-	while(n --> 0){
-		*y += alpha*(*x);
-		x += incx;
-		y += incy;
-	}
-}
-template <typename A, typename T>
-static void Axpy(int m, int n, const A &alpha, const T *a, int lda, T *b, int ldb){
-	for(int j = 0; j < n; ++j){
-		Axpy(m, alpha, a, 1, b, 1);
-		a += lda;
-		b += ldb;
-	}
-}
-template <typename S, typename T>
-static void Scale(int n, const S &s, T *v, int incv = 1){
-	while(n --> 0){
-		*v *= s;
-		v += incv;
-	}
-}
-
-template <typename T>
-static void Copy(int n, const T *src, int incsrc, T *dst, int incdst){
-	if(1 == incsrc && 1 == incdst){
-		memcpy(dst, src, sizeof(T)*n);
-	}else{
-		for(int j = 0; j < n; ++j){
-			*dst = *src;
-			src += incsrc;
-			dst += incdst;
-		}
-	}
-}
-template <typename T>
-static void Copy(int m, int n, const T *src, int ldsrc, T *dst, int lddst){
-	if(m == ldsrc && m == lddst){
-		memcpy(dst, src, sizeof(T)*m*n);
-	}else{
-		for(int j = 0; j < n; ++j){
-			memcpy(dst, src, sizeof(T)*m);
-			src += ldsrc;
-			dst += lddst;
-		}
-	}
-}
-template <typename A, typename T>
-static void SetMatrix(int m, int n, const A &diag, const A &offdiag, T *a, int lda){
-	for(int j = 0; j < n; ++j){
-		int i;
-		T *p = a;
-		for(i = 0; i < j; ++i){
-			*p++ = offdiag;
-		}
-		*p++ = diag;
-		for(i++ ; i < m; ++i){
-			*p++ = offdiag;
-		}
-		a += lda;
-	}
-}
-template <typename T>
-static void ZeroMatrix(int m, int n, T *a, int lda){
-	if(m == lda){
-		memset(a, 0, sizeof(T)*m*n);
-	}else{
-		for(int j = 0; j < n; ++j){
-			memset(a, 0, sizeof(T)*m);
-			a += lda;
-		}
-	}
-}
-extern "C" void zgemv_(
-	const char *trans, const int &m, const int &n, const std::complex<double> &alpha,
-	const std::complex<double> *a, const int &lda, const std::complex<double> *x, const int &incx,
-	const std::complex<double> &beta, std::complex<double> *y, const int &incy
-);
-extern "C" void zgemm_(
-	const char *transa, const char *transb, const int &m, const int &n, const int &k,
-	const std::complex<double> &alpha, const std::complex<double> *a, const int &lda,
-	const std::complex<double> *b, const int &ldb,
-	const std::complex<double> &beta, std::complex<double> *c, const int &ldc
-);
-extern "C" void zgetrf_(
-	const int &m, const int &n, std::complex<double> *a, const int &lda, int *ipiv, int *info
-);
-extern "C" void zgetrs_(
-	const char *trans, const int &n, const int &nrhs, const std::complex<double> *a, const int &lda,
-	const int *ipiv, std::complex<double> *b, const int &ldb, int *info
-);
-extern "C" void zgetri_(
-	const int &n, const std::complex<double> *a, const int &lda,
-	const int *ipiv, std::complex<double> *work, const int &lwork, int *info
-);
-static int Invert(size_t n, std::complex<double> *a, size_t lda, std::complex<double> *work, size_t lwork, size_t *iwork){
-	int info;
-	zgetrf_(n, n, a, lda, (int*)iwork, &info);
-	zgetri_(n, a, lda, (int*)iwork, work, lwork, &info);
-	return info;
-}
-
-static void Mult(size_t n, const double &alpha, const std::complex<double> *a, size_t lda, std::complex<double> *b, size_t ldb, const double &beta, std::complex<double> *c, size_t ldc){
-	zgemm_("N","N", n, n, n, alpha, a, lda, b, ldb, beta, c, ldc);
-}
-static void Mult(size_t n, size_t m, const double &alpha, const std::complex<double> *a, size_t lda, std::complex<double> *b, const double &beta, std::complex<double> *c){
-	zgemv_("N", n, m, alpha, a, lda, b, 1, beta, c, 1);
-}
-static int LUFactor(size_t n, std::complex<double> *a, size_t lda, size_t *ipiv){
-	int info = 0;
-	zgetrf_(n, n, a, lda, (int*)ipiv, &info); // A = P*L*U
-	return info;
-}
-static int LUSolve(size_t n, size_t nRHS, const std::complex<double> *a, size_t lda, const size_t *ipiv, std::complex<double> *b, size_t ldb){
-	int info;
-	zgetrs_("N", n, nRHS, a, lda, (const int*)ipiv, b, ldb, &info);
-	return info;
-}
-static void PrintMatrix(const char *name, size_t m, size_t n, const std::complex<double> *a, size_t lda){
-	printf("%s = [\n", name);
-	for(size_t i = 0; i < m; ++i){
-		for(size_t j = 0; j < n; ++j){
-			const std::complex<double> &val = a[i+j*lda];
-			if(val.imag() < 0){
-				printf(" % 18.16e-% 18.16ei", val.real(), -val.imag());
-			}else{
-				printf(" % 18.16e+% 18.16ei", val.real(), val.imag());
-			}
-		}
-		printf("\n");
-	}
-	printf("];\n");
-}
+	
 
 // kp = k-parallel matrix
 // kp = omega^2 - Kappa = omega^2 - [  ky*epsinv*ky -ky*epsinv*kx ]
 //                                  [ -kx*epsinv*ky  kx*epsinv*kx ]
 //    = omega^2 + [ -ky ] [ epsinv ] [ ky -kx ]
-//
+//    
 
 static void MakeKPMatrix(
 	std::complex<double> omega,
@@ -255,11 +113,11 @@ static void MakeKPMatrix(
 		RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp_existing,n2, kp,ldkp);
 		return;
 	}
-
+	
 	const std::complex<double> omega2 = omega*omega;
 	if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){
 		RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., kp,ldkp);
-
+		
 		for(size_t i = 0; i < n; ++i){
 			kp[(0+i)+(0+i)*ldkp] = omega2 - ky[i]*Epsilon_inv[0]*ky[i];
 			kp[(n+i)+(0+i)*ldkp] = kx[i]*Epsilon_inv[0]*ky[i];
@@ -273,7 +131,7 @@ static void MakeKPMatrix(
 		RNP::TBLAS::CopyMatrix<'A'>(n,n, Epsilon_inv, n, &kp[n+0*ldkp], ldkp);
 		RNP::TBLAS::CopyMatrix<'A'>(n,n, Epsilon_inv, n, &kp[0+n*ldkp], ldkp);
 		RNP::TBLAS::CopyMatrix<'A'>(n,n, Epsilon_inv, n, &kp[n+n*ldkp], ldkp);
-
+		
 		for(size_t i = 0; i < n; ++i){
 			RNP::TBLAS::Scale(n2, -ky[i], &kp[0+i*n2], 1);
 		}
@@ -292,50 +150,6 @@ static void MakeKPMatrix(
 		}
 	}
 }
-static void MakeKPMatrix_real(
-	double omega,
-	size_t n,
-	const double *kx,
-	const double *ky,
-	const std::complex<double> *Epsilon_inv,
-	double *kp_existing,
-	double *kp,
-	const size_t ldkp // leading dimension of kp, >= 2*n
-){
-	const size_t n2 = 2*n;
-	if(NULL != kp_existing){
-		RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp_existing,n2, kp,ldkp);
-		return;
-	}
-
-	const double omega2 = omega*omega;
-
-	for(size_t j = 0; j < n; ++j){
-		for(size_t i = 0; i < n; ++i){
-			kp[i+j*ldkp] = Epsilon_inv[i+j*n].real();
-		}
-	}
-
-	RNP::TBLAS::CopyMatrix<'A'>(n,n, kp, ldkp, &kp[n+0*ldkp], ldkp);
-	RNP::TBLAS::CopyMatrix<'A'>(n,n, kp, ldkp, &kp[0+n*ldkp], ldkp);
-	RNP::TBLAS::CopyMatrix<'A'>(n,n, kp, ldkp, &kp[n+n*ldkp], ldkp);
-
-	for(size_t i = 0; i < n; ++i){
-		RNP::TBLAS::Scale(n2, -ky[i], &kp[0+i*n2], 1);
-	}
-	for(size_t i = 0; i < n; ++i){
-		RNP::TBLAS::Scale(n2, kx[i], &kp[0+(i+n)*n2], 1);
-	}
-	for(size_t i = 0; i < n; ++i){
-		RNP::TBLAS::Scale(n2, ky[i], &kp[i+0*n2], ldkp);
-	}
-	for(size_t i = 0; i < n; ++i){
-		RNP::TBLAS::Scale(n2, -kx[i], &kp[i+n+0*n2], ldkp);
-	}
-	for(size_t i = 0; i < n2; ++i){
-		kp[i+i*ldkp] += omega2;
-	}
-}
 
 // kp = k-parallel matrix
 // kp = omega^2 - Kappa = omega^2 - [  ky*epsinv*ky -ky*epsinv*kx ]
@@ -359,7 +173,7 @@ void MultKPMatrix(
 	const size_t ldy
 ){
 	const size_t n2 = 2*n;
-
+	
 	if(NULL != kp){
 		if('N' == trans[0]){
 			if(ncols > 1){
@@ -376,7 +190,7 @@ void MultKPMatrix(
 		}
 		return;
 	}
-
+	
 	const std::complex<double> omega2 = omega*omega;
 	if(EPSILON2_TYPE_BLKDIAG1_SCALAR == Epsilon2_type || EPSILON2_TYPE_BLKDIAG2_SCALAR == Epsilon2_type){
 		const std::complex<double> epsinv(
@@ -409,7 +223,7 @@ void MultKPMatrix(
 				RNP::TBLAS::MultMV<'C'>(n,n, 1.,Epsilon_inv,n, &y[0+0*ldy],1, 0.,&y[n+0*ldy],1);
 			}
 		}
-
+		
 		for(size_t j = 0; j < ncols; ++j){
 			for(size_t i = 0; i < n; ++i){
 				y[(0+i)+j*ldy] = omega2 * a[(0+i)+j*lda] - ky[i]*y[n+i+j*ldy];
@@ -453,17 +267,9 @@ void SolveLayerEigensystem_uniform(
 
 		// Set the \hat{q} vector (diagonal matrix) while we're at it
 		if(0 == omega.imag()){ // Not bandsolving
-			if(std::abs(q[i].imag()) <= 4*DBL_EPSILON*std::abs(q[i].real())){
-				if(q[i].real() >= 0){
-					q[i] = std::sqrt(q[i].real());
-				}else{
-					q[i] = std::complex<double>(0, std::sqrt(-q[i].real()));
-				}
-			}else{
-				q[i] = std::sqrt(q[i]);
-				if(q[i].imag() < 0){
-					q[i] = -q[i];
-				}
+			q[i] = std::sqrt(q[i]);
+			if(q[i].imag() < 0){
+				q[i] = -q[i];
 			}
 		}else{ // performing some kind of bandsolving, need to choose the appropriate branch
 			if(q[i].real() < 0){
@@ -475,13 +281,13 @@ void SolveLayerEigensystem_uniform(
 				q[i] = std::sqrt(q[i]);
 			}
 		}
-
+		
 		/*
 		if(std::complex<double>(0) == q[i]){
 			q[i] = DBL_EPSILON*omega;
 		}
 		*/
-
+		
 		q[i+n] = q[i];
 	}
 #ifdef DUMP_MATRICES
@@ -501,14 +307,14 @@ void SolveLayerEigensystem_uniform(
 	}
 }
 
-#ifdef HAVE_LAPACK
-void SolveLayerEigensystem_real(
-	double omega,
+void SolveLayerEigensystem(
+	std::complex<double> omega,
 	size_t n,
 	const double *kx,
 	const double *ky,
 	const std::complex<double> *Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix
 	const std::complex<double> *Epsilon2, // size (2*glist.n)^2 (dielectric/normal-field matrix)
+	int epstype,
 	std::complex<double> *q, // length 2*glist.n
 	std::complex<double> *kp, // size (2*glist.n)^2 (k-parallel matrix)
 	std::complex<double> *phi, // size (2*glist.n)^2
@@ -519,27 +325,35 @@ void SolveLayerEigensystem_real(
 	const size_t n2 = 2*n;
 
 	if((size_t)-1 == lwork){
-		double tmp;
-		RNP::Eigensystem_real(n2, NULL, n2, q, NULL, 1, phi, n2, &tmp, lwork);
-		work_[0] = tmp + n2*n2;
-		return;
+     //M.P. (Michael Povolotskyi) Bellow I have to mimic the instructions from RNP::Eigensystem in file Eigensystems.cpp
+     // because MKL lapack cannot handle NULL pointer for the matrix
+     //	RNP::Eigensystem(n2, NULL, n2, q, NULL, 1, phi, n2, work_, NULL, lwork); //M.P
+     work_[0] = double(2*n2); //M.P.
+     work_[0] += n2*n2;
+     return;
 	}else if(0 == lwork){
 		lwork = n2*n2+2*n2;
 	}
 
-	double *work = (double*)work_;
+	std::complex<double> *work = work_;
 	size_t eigenlwork;
-	if(NULL == work_ || lwork < n2*n2+4*n2){
+	if(NULL == work_ || lwork < n2*n2+2*n2){
 		lwork = (size_t)-1;
-		double tmp;
-		RNP::Eigensystem_real(n2, NULL, n2, q, NULL, 1, phi, n2, &tmp, lwork);
-		eigenlwork = (size_t)tmp;
-		work = (double*)rcwa_malloc(sizeof(double)*(eigenlwork + n2*n2));
+      //M.P. Same as above 
+      //	RNP::Eigensystem(n2, NULL, n2, q, NULL, 1, phi, n2, q, NULL, lwork); //M.P.
+      q[0] = (double)(2*n2); //M.P
+		eigenlwork = (size_t)q[0].real();
+		work = (std::complex<double>*)rcwa_malloc(sizeof(std::complex<double>)*(eigenlwork + n2*n2));
 	}else{
 		eigenlwork = lwork - n2*n2;
 	}
-	double *op = work;
-	double *eigenwork = op + n2*n2;
+	std::complex<double> *op = work;
+	std::complex<double> *eigenwork = op + n2*n2;
+	
+	double *rwork = rwork_;
+	if(NULL == rwork_){
+		rwork = (double*)rcwa_malloc(sizeof(double)*2*n2);
+	}
 
 #ifdef DUMP_MATRICES
 	DUMP_STREAM << "kx:" << std::endl;
@@ -558,9 +372,9 @@ void SolveLayerEigensystem_real(
 #endif
 
 	// Fill kp
-	double *kp_use = (double*)(NULL != kp ? kp : phi);
-
-	MakeKPMatrix_real(omega, n, kx, ky, Epsilon_inv, NULL, kp_use, n2);
+	std::complex<double> *kp_use = (NULL != kp ? kp : phi);
+	
+	MakeKPMatrix(omega, n, kx, ky, Epsilon_inv, epstype, NULL, kp_use, n2);
 #ifdef DUMP_MATRICES
 	DUMP_STREAM << "kp:" << std::endl;
 # ifdef DUMP_MATRICES_LARGE
@@ -569,7 +383,7 @@ void SolveLayerEigensystem_real(
 	RNP::IO::PrintVector(n2,&kp_use[0+(n2-1)*n2],1, DUMP_STREAM) << std::endl << std::endl;
 # endif
 #endif
-
+	
 
 #ifdef DUMP_MATRICES
 	DUMP_STREAM << "Epsilon2:" << std::endl;
@@ -581,18 +395,9 @@ void SolveLayerEigensystem_real(
 #endif
 
 	// Make the eigenoperator Epsilon2*kp - [kxkx, kxky; kykx, kyky]
-	for(size_t j = 0; j < n2; ++j){
-		for(size_t i = 0; i < n2; ++i){
-	//		double sum = 0;
-	//		for(size_t k = 0; k < n2; ++k){
-	//			sum += Epsilon2[i+k*n2]*kp_use[k+j*n2];
-	//		}
-			op[i+j*n2] = RNP::TBLAS::Dot(n2, (double*)(&Epsilon2[i]), 2*n2, &kp_use[j*n2], 1);
-		}
-	}
-	//RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., op,n2);
-	//RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,Epsilon2,n2, kp_use,n2, 0.,op,n2);
-
+	RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., op,n2);
+	RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,Epsilon2,n2, kp_use,n2, 0.,op,n2);
+	
 	for(size_t i = 0; i < n; ++i){
 		op[i+i*n2] -= kx[i]*kx[i];
 	}
@@ -617,18 +422,14 @@ void SolveLayerEigensystem_real(
 
 #ifdef DUMP_MATRICES
 # ifdef DUMP_MATRICES_LARGE
-	std::complex<double> *op_save = (std::complex<double>*)rcwa_malloc(sizeof(std::complex<double>) * 2*n2*n2);
+	std::complex<double> *op_save = (std::complex<double> *)rcwa_malloc(sizeof(std::complex<double>) * 2*n2*n2);
 	std::complex<double> *op_temp = op_save + n2*n2;
-	for(size_t j = 0; j < n2; ++j){
-		for(size_t i = 0; i < n2; ++i){
-			op_save[i+j*n2] = op[i+j*n2];
-		}
-	}
+	RNP::TBLAS::CopyMatrix<'A'>(n2,n2, op,n2, op_save,n2);
 # endif
 #endif
-	int info = RNP::Eigensystem_real(n2, op, n2, q, NULL, 1, phi, n2, eigenwork, eigenlwork);
+	int info = RNP::Eigensystem(n2, op, n2, q, NULL, 1, phi, n2, eigenwork, rwork, eigenlwork);
 	if(0 != info){
-		fprintf(stderr, "Layer eigensystem returned info = %d\n", info);
+		std::cerr << "Layer eigensystem returned info = " << info << std::endl;
 	}
 #ifdef DUMP_MATRICES
 	DUMP_STREAM << "eigen info = " << info << std::endl;
@@ -640,24 +441,26 @@ void SolveLayerEigensystem_real(
 	}
 	DUMP_STREAM << "eigensystem residual:" << std::endl;
 	RNP::IO::PrintMatrix(n2,n2,op_temp,n2, DUMP_STREAM) << std::endl << std::endl;
-	{
-		for(size_t j = 0; j < n2; ++j){
-			double sum = 0;
-			for(size_t i = 0; i < n2; ++i){
-				sum += fabs(op_temp[i+j*n2].real()) + fabs(op_temp[i+j*n2].imag());
-			}
-			DUMP_STREAM << " residual sum " << j << ": " << sum << std::endl;
-		}
-	}
 	rcwa_free(op_save);
 # endif
 #endif
 
 	for(size_t i = 0; i < n2; ++i){
 		// Set the \hat{q} vector (diagonal matrix) while we're at it
-		q[i] = std::sqrt(q[i]);
-		if(q[i].imag() < 0){
-			q[i] = -q[i];
+		if(0 == omega.imag()){ // Not bandsolving
+			q[i] = std::sqrt(q[i]);
+			if(q[i].imag() < 0){
+				q[i] = -q[i];
+			}
+		}else{ // performing some kind of bandsolving, need to choose the appropriate branch
+			if(q[i].real() < 0){
+				// branch cut should be just below positive real axis
+				q[i] = std::complex<double>(0,1) * std::sqrt(-q[i]);
+			}else{
+				// branch cut should be just below negative real axis
+				// This is the default behavior for sqrt(std::complex)
+				q[i] = std::sqrt(q[i]);
+			}
 		}
 	}
 #ifdef DUMP_MATRICES
@@ -670,593 +473,71 @@ void SolveLayerEigensystem_real(
 	RNP::IO::PrintVector(n2,phi,1, DUMP_STREAM) << std::endl << std::endl;
 # endif
 #endif
-
-	if(NULL == work_ || lwork < n2*n2+4*n2){
+	
+	if(NULL == work_ || lwork < n2*n2+2*n2){
 		rcwa_free(work);
 	}
-	
-	// Re-make kp in complex arithmetic for future use
-	std::complex<double> *kp_use_actual = (NULL != kp ? kp : phi);
-	MakeKPMatrix(omega, n, kx, ky, Epsilon_inv, EPSILON2_TYPE_FULL, NULL, kp_use_actual, n2);
+	if(NULL == rwork_){
+		rcwa_free(rwork);
+	}
 }
-#endif // HAVE_LAPACK
 
-void SolveLayerEigensystem(
-	std::complex<double> omega,
+
+void InitSMatrix(
 	size_t n,
-	const double *kx,
-	const double *ky,
-	const std::complex<double> *Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix
-	const std::complex<double> *Epsilon2, // size (2*glist.n)^2 (dielectric/normal-field matrix)
-	int epstype,
-	std::complex<double> *q, // length 2*glist.n
-	std::complex<double> *kp, // size (2*glist.n)^2 (k-parallel matrix)
-	std::complex<double> *phi, // size (2*glist.n)^2
+	std::complex<double> *S // size (4*n)^2
+){
+	const size_t n4 = 4*n;
+	RNP::TBLAS::SetMatrix<'A'>(n4,n4, 0.,1., S, n4);
+}
+void GetSMatrix(
+	size_t nlayers,
+	size_t n, // glist.n
+	const double *kx, const double *ky,
+	std::complex<double> omega,
+	const double *thickness, // list of thicknesses
+	const std::complex<double> **q, // list of q vectors
+	const std::complex<double> **Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix
+	int *epstype,
+	const std::complex<double> **kp,
+	const std::complex<double> **phi,
+	std::complex<double> *S, // size (4*n)^2
 	std::complex<double> *work_,
-	double *rwork_,
+	size_t *iwork,
 	size_t lwork
 ){
+	if(0 == nlayers){ return; }
 	const size_t n2 = 2*n;
-	
-#ifdef HAVE_LAPACK
-	bool isreal = (n > 10);
-	for(size_t j = 0; j < n && isreal; ++j){
-		for(size_t i = 0; i < n; ++i){
-			if(0 != Epsilon_inv[i+j*n].imag()){
-				isreal = false;
-				break;
-			}
-		}
-	}
-	for(size_t j = 0; j < n2 && isreal; ++j){
-		for(size_t i = 0; i < n2; ++i){
-			if(0 != Epsilon2[i+j*n2].imag()){
-				isreal = false;
-				break;
-			}
-		}
-	}
-	if(isreal && 0 == omega.imag() && EPSILON2_TYPE_FULL == epstype){
-		SolveLayerEigensystem_real(
-			omega.real(), n, kx, ky, Epsilon_inv, Epsilon2,
-			q, kp, phi, work_, rwork_, lwork
-		);
-		return;
-	}
-#endif
+	const size_t n4 = 2*n2;
 
 	if((size_t)-1 == lwork){
-		double dum;
-		RNP::Eigensystem(n2, NULL, n2, q, NULL, 1, phi, n2, work_, &dum, lwork);
-		work_[0] += n2*n2;
-		return;
-	}else if(0 == lwork){
-		lwork = n2*n2+2*n2;
-	}
-
-	std::complex<double> *work = work_;
-	size_t eigenlwork;
-	if(NULL == work_ || lwork < n2*n2+2*n2){
-		lwork = (size_t)-1;
-		RNP::Eigensystem(n2, NULL, n2, q, NULL, 1, phi, n2, q, NULL, lwork);
-		eigenlwork = (size_t)q[0].real();
-		work = (std::complex<double>*)rcwa_malloc(sizeof(std::complex<double>)*(eigenlwork + n2*n2));
-	}else{
-		eigenlwork = lwork - n2*n2;
-	}
-	std::complex<double> *op = work;
-	std::complex<double> *eigenwork = op + n2*n2;
-
-	double *rwork = rwork_;
-	if(NULL == rwork_){
-		rwork = (double*)rcwa_malloc(sizeof(double)*2*n2);
-	}
-
-#ifdef DUMP_MATRICES
-	DUMP_STREAM << "kx:" << std::endl;
-	RNP::IO::PrintVector(n,kx,1, DUMP_STREAM) << std::endl;
-	DUMP_STREAM << "ky:" << std::endl;
-	RNP::IO::PrintVector(n,ky,1, DUMP_STREAM) << std::endl << std::endl;
-#endif
-
-#ifdef DUMP_MATRICES
-	DUMP_STREAM << "Epsilon_inv:" << std::endl;
-# ifdef DUMP_MATRICES_LARGE
-	RNP::IO::PrintMatrix(n,n,Epsilon_inv,n, DUMP_STREAM) << std::endl << std::endl;
-# else
-	RNP::IO::PrintVector(n,Epsilon_inv,1, DUMP_STREAM) << std::endl << std::endl;
-# endif
-#endif
-
-	// Fill kp
-	std::complex<double> *kp_use = (NULL != kp ? kp : phi);
-
-	MakeKPMatrix(omega, n, kx, ky, Epsilon_inv, epstype, NULL, kp_use, n2);
-#ifdef DUMP_MATRICES
-	DUMP_STREAM << "kp:" << std::endl;
-# ifdef DUMP_MATRICES_LARGE
-	RNP::IO::PrintMatrix(n2,n2,kp_use,n2, DUMP_STREAM) << std::endl << std::endl;
-# else
-	RNP::IO::PrintVector(n2,&kp_use[0+(n2-1)*n2],1, DUMP_STREAM) << std::endl << std::endl;
-# endif
-#endif
-
-
-#ifdef DUMP_MATRICES
-	DUMP_STREAM << "Epsilon2:" << std::endl;
-# ifdef DUMP_MATRICES_LARGE
-	RNP::IO::PrintMatrix(n2,n2,Epsilon2,n2, DUMP_STREAM) << std::endl << std::endl;
-# else
-	RNP::IO::PrintVector(n2,Epsilon2,1, DUMP_STREAM) << std::endl << std::endl;
-# endif
-#endif
-
-	// Make the eigenoperator Epsilon2*kp - [kxkx, kxky; kykx, kyky]
-	RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., op,n2);
-	RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,Epsilon2,n2, kp_use,n2, 0.,op,n2);
-
-	for(size_t i = 0; i < n; ++i){
-		op[i+i*n2] -= kx[i]*kx[i];
-	}
-	for(size_t i = 0; i < n; ++i){
-		op[i+n+i*n2] -= ky[i]*kx[i];
-	}
-	for(size_t i = 0; i < n; ++i){
-		op[i+(i+n)*n2] -= kx[i]*ky[i];
-	}
-	for(size_t i = 0; i < n; ++i){
-		op[i+n+(i+n)*n2] -= ky[i]*ky[i];
-	}
-#ifdef DUMP_MATRICES
-	DUMP_STREAM << "op:" << std::endl;
-# ifdef DUMP_MATRICES_LARGE
-	RNP::IO::PrintMatrix(n2,n2,op,n2, DUMP_STREAM) << std::endl << std::endl;
-# else
-	RNP::IO::PrintMatrix(n2,n2,op,n2, DUMP_STREAM) << std::endl << std::endl;
-	//RNP::IO::PrintVector(n2,&op[0+(n2-1)*n2],1, DUMP_STREAM) << std::endl << std::endl;
-# endif
-#endif
-
-#ifdef DUMP_MATRICES
-# ifdef DUMP_MATRICES_LARGE
-	std::complex<double> *op_save = (std::complex<double> *)rcwa_malloc(sizeof(std::complex<double>) * 2*n2*n2);
-	std::complex<double> *op_temp = op_save + n2*n2;
-	RNP::TBLAS::CopyMatrix<'A'>(n2,n2, op,n2, op_save,n2);
-# endif
-#endif
-	int info = RNP::Eigensystem(n2, op, n2, q, NULL, 1, phi, n2, eigenwork, rwork, eigenlwork);
-	if(0 != info){
-		fprintf(stderr, "Layer eigensystem returned info = %d\n", info);
-	}
-#ifdef DUMP_MATRICES
-	DUMP_STREAM << "eigen info = " << info << std::endl;
-# ifdef DUMP_MATRICES_LARGE
-	RNP::TBLAS::Fill(n2*n2, 0., op_temp,1);
-	RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,op_save,n2, phi,n2, 0.,op_temp,n2);
-	for(size_t i = 0; i < n2; ++i){
-		RNP::TBLAS::Axpy(n2, -q[i], &phi[0+i*n2],1, &op_temp[0+i*n2],1);
-	}
-	DUMP_STREAM << "eigensystem residual:" << std::endl;
-	RNP::IO::PrintMatrix(n2,n2,op_temp,n2, DUMP_STREAM) << std::endl << std::endl;
-	rcwa_free(op_save);
-# endif
-#endif
-
-	for(size_t i = 0; i < n2; ++i){
-		// Set the \hat{q} vector (diagonal matrix) while we're at it
-		if(0 == omega.imag()){ // Not bandsolving
-			q[i] = std::sqrt(q[i]);
-			if(q[i].imag() < 0){
-				q[i] = -q[i];
-			}
-		}else{ // performing some kind of bandsolving, need to choose the appropriate branch
-			if(q[i].real() < 0){
-				// branch cut should be just below positive real axis
-				q[i] = std::complex<double>(0,1) * std::sqrt(-q[i]);
-			}else{
-				// branch cut should be just below negative real axis
-				// This is the default behavior for sqrt(std::complex)
-				q[i] = std::sqrt(q[i]);
-			}
-		}
-	}
-#ifdef DUMP_MATRICES
-	DUMP_STREAM << "q:" << std::endl;
-	RNP::IO::PrintVector(n2,q,1, DUMP_STREAM) << std::endl << std::endl;
-	DUMP_STREAM << "phi:" << std::endl;
-# ifdef DUMP_MATRICES_LARGE
-	RNP::IO::PrintMatrix(n2,n2,phi,n2, DUMP_STREAM) << std::endl << std::endl;
-# else
-	RNP::IO::PrintVector(n2,phi,1, DUMP_STREAM) << std::endl << std::endl;
-# endif
-#endif
-
-	if(NULL == work_ || lwork < n2*n2+2*n2){
-		rcwa_free(work);
-	}
-	if(NULL == rwork_){
-		rcwa_free(rwork);
-	}
-}
-
-
-void InitSMatrix(
-	size_t n,
-	std::complex<double> *S // size (4*n)^2
-){
-	const size_t n4 = 4*n;
-	RNP::TBLAS::SetMatrix<'A'>(n4,n4, 0.,1., S, n4);
-}
-void GetSMatrix(
-	size_t nlayers,
-	size_t n, // glist.n
-	const double *kx, const double *ky,
-	std::complex<double> omega,
-	const double *thickness, // list of thicknesses
-	const std::complex<double> **q, // list of q vectors
-	const std::complex<double> **Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix
-	int *epstype,
-	const std::complex<double> **kp,
-	const std::complex<double> **phi,
-	std::complex<double> *S, // size (4*n)^2
-	std::complex<double> *work_,
-	size_t *iwork,
-	size_t lwork
-){
-	if(0 == nlayers){ return; }
-	const size_t n2 = 2*n;
-	const size_t n4 = 2*n2;
-
-	if((size_t)-1 == lwork){
-		work_[0] = n4*(n4+1);
-		return;
-	}
-	std::complex<double> *work = work_;
-	if(NULL == work_ || lwork < n4*(n4+1)){
-		work = (std::complex<double>*)rcwa_malloc(sizeof(std::complex<double>)*(n4*(n4+1)));
-	}
-	size_t *pivots = iwork;
-	if(NULL == iwork){
-		pivots = (size_t*)rcwa_malloc(sizeof(size_t)*n4);
-	}
-
-	RNP::TBLAS::SetMatrix<'A'>(n4,n4, 0.,1., S, n4);
-
-	std::complex<double> *t1 = work;
-	std::complex<double> *t2 = t1 + n2*n2;
-	std::complex<double> *in1 = t2 + n2*n2;
-	std::complex<double> *in2 = in1 + n2*n2;
-	std::complex<double> *d1 = in2 + n2*n2;
-	std::complex<double> *d2 = d1 + n2;
-
-	for(size_t l = 0; l < nlayers-1; ++l){
-		size_t lp1 = l+1;
-		if(lp1 >= nlayers){ lp1 = l; }
-
-		// Make the interface matrices
-		if((lp1 == l) || (q[l] == q[lp1] && ((NULL != kp[l] && kp[l] == kp[lp1]) || Epsilon_inv[l] == Epsilon_inv[lp1]) && phi[l] == phi[lp1])){
-			// This is a trivial interface, set to identity
-			RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., in1, n2);
-			RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., in2, n2);
-		}else{
-			// The interface matrix is the inverse of the mode-to-field matrix of layer l
-			// times the mode-to-field matrix of layer l+1 (lp1).
-			// The mode-to-field matrix is of the form
-			// [ B -B ] where A = phi
-			// [ A  A ] where B = kp*phi*inv(diag(q)) = G*A/q
-			// So we want
-			// 0.5 * [  iBl  iAl ] [ Blp1 -Blp1 ]
-			//       [ -iBl  iAl ] [ Alp1  Alp1 ]
-			// Multiplying out gives
-			// 0.5 * [ P+Q P-Q ] // where P = iAl*Alp1, and i in front means inverse
-			//       [ P-Q P+Q ] // where Q = iBl*Blp1
-			// Making P is easy, since A is a single matrix.
-			// Making Q is as follows:
-			// Q = iBl*Blp1
-			//   = ql*iAl*iGl * Gl*Alp1*iqlp1
-			// We will only store I11 and I21
-			/*
-			{
-				std::complex<double> *Ml = (std::complex<double>*)rcwa_malloc(sizeof(std::complex<double>)*n4*n4);
-				std::complex<double> *Mlp1 = (std::complex<double>*)rcwa_malloc(sizeof(std::complex<double>)*n4*n4);
-
-				RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., &Ml[0+0*n4],n4);
-				RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, &Ml[n2+0*n4],n4);
-				RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, &Ml[n2+n2*n4],n4);
-				RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, &Ml[0+n2*n4],n4);
-				for(size_t i = 0; i < n2; ++i){
-					RNP::TBLAS::Scale(n2, 1./q[l][i], &Ml[0+(i+n2)*n4], 1);
-				}
-				RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,kp[l],n2, &Ml[0+n2*n4],n4, 0.,&Ml[0+0*n4],n4);
-				RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &Ml[0+0*n4],n4, &Ml[0+n2*n4],n4);
-				for(size_t i = 0; i < n2; ++i){
-					RNP::TBLAS::Scale(n2, -1., &Ml[0+(i+n2)*n4], 1);
-				}
-
-				RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., &Mlp1[0+0*n4],n4);
-				RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, &Mlp1[n2+0*n4],n4);
-				RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, &Mlp1[n2+n2*n4],n4);
-				RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, &Mlp1[0+n2*n4],n4);
-				for(size_t i = 0; i < n2; ++i){
-					RNP::TBLAS::Scale(n2, 1./q[lp1][i], &Mlp1[0+(i+n2)*n4], 1);
-				}
-				RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,kp[lp1],n2, &Mlp1[0+n2*n4],n4, 0.,&Mlp1[0+0*n4],n4);
-				RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &Mlp1[0+0*n4],n4, &Mlp1[0+n2*n4],n4);
-				for(size_t i = 0; i < n2; ++i){
-					RNP::TBLAS::Scale(n2, -1., &Mlp1[0+(i+n2)*n4], 1);
-				}
-
-				//RNP::LinearSolve<'N'>(n4, n4, Ml, n4, Mlp1, n4, NULL, pivots);
-#ifdef DUMP_MATRICES
-			DUMP_STREAM << "M(" << l << ") = " << std::endl;
-# ifdef DUMP_MATRICES_LARGE
-			RNP::IO::PrintMatrix(n4,n4,Ml,n4, DUMP_STREAM) << std::endl << std::endl;
-# else
-			RNP::IO::PrintVector(n4,Ml,1, DUMP_STREAM) << std::endl << std::endl;
-# endif
-			DUMP_STREAM << "M(" << lp1 << ") = " << std::endl;
-# ifdef DUMP_MATRICES_LARGE
-			RNP::IO::PrintMatrix(n4,n4,Mlp1,n4, DUMP_STREAM) << std::endl << std::endl;
-# else
-			RNP::IO::PrintVector(n4,Mlp1,1, DUMP_STREAM) << std::endl << std::endl;
-# endif
-#endif
-
-				rcwa_free(Mlp1);
-				rcwa_free(Ml);
-			}
-			*/
-			// Make Bl in t1
-			RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., t1,n2);
-			{
-				if(NULL == phi[l]){
-					if(NULL == kp[l]){
-						MakeKPMatrix(omega, n, kx, ky, Epsilon_inv[l], epstype[l], kp[l], t1,n2);
-					}else{
-						RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp[l],n2, t1,n2);
-					}
-				}else{
-					MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv[l], epstype[l], kp[l], n2, phi[l],n2, t1,n2);
-				}
-				//for(size_t i = 0; i < n2; ++i){
-				//	RNP::TBLAS::Scale(n2, 1./q[l][i], &t1[0+i*n2], 1);
-				//}
-			}
-#ifdef DUMP_MATRICES
-			DUMP_STREAM << "Bl(" << l << ") = " << std::endl;
-# ifdef DUMP_MATRICES_LARGE
-			RNP::IO::PrintMatrix(n2,n2,t1,n2, DUMP_STREAM) << std::endl << std::endl;
-# else
-			RNP::IO::PrintVector(n2,t1,1, DUMP_STREAM) << std::endl << std::endl;
-# endif
-#endif
-			// Make Blp1 in in1
-			RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., in1,n2);
-			{
-				if(NULL == phi[lp1]){
-					if(NULL == kp[lp1]){
-						MakeKPMatrix(omega, n, kx, ky, Epsilon_inv[lp1], epstype[lp1], kp[lp1], in1,n2);
-					}else{
-						RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp[lp1],n2, in1,n2);
-					}
-				}else{
-					MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv[lp1], epstype[lp1], kp[lp1], n2, phi[lp1],n2, in1,n2);
-				}
-				//for(size_t i = 0; i < n2; ++i){
-				//	RNP::TBLAS::Scale(n2, 1./q[lp1][i], &in1[0+i*n2], 1);
-				//}
-			}
-#ifdef DUMP_MATRICES
-		DUMP_STREAM << "Bl(" << l+1 << ") = " << std::endl;
-# ifdef DUMP_MATRICES_LARGE
-		RNP::IO::PrintMatrix(n2,n2,in1,n2, DUMP_STREAM) << std::endl << std::endl;
-# else
-		RNP::IO::PrintVector(n2,in1,1, DUMP_STREAM) << std::endl << std::endl;
-# endif
-#endif
-			int solve_info;
-			// Make Q in in1
-			//RNP::LinearSolve<'N'>(n2, n2, t1, n2, in1, n2, &solve_info, pivots);
-			SingularLinearSolve(n2,n2,n2, t1,n2, in1,n2, DBL_EPSILON);
-			// Now perform the diagonal scalings
-			for(size_t i = 0; i < n2; ++i){
-				RNP::TBLAS::Scale(n2, q[l][i], &in1[i+0*n2], n2);
-			}
-			{
-				double maxel = 0;
-				for(size_t i = 0; i < n2; ++i){
-					double el = std::abs(q[lp1][i]);
-					if(el > maxel){ maxel = el; }
-				}
-				for(size_t i = 0; i < n2; ++i){
-					double el = std::abs(q[lp1][i]);
-					if(el < DBL_EPSILON * maxel){
-						RNP::TBLAS::Scale(n2, 0., &in1[0+i*n2], 1);
-					}else{
-						RNP::TBLAS::Scale(n2, 1./q[lp1][i], &in1[0+i*n2], 1);
-					}
-				}
-			}
-
-			// Make P in in2
-			if(NULL == phi[lp1]){
-				RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., in2,n2);
-			}else{
-				RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, in2,n2);
-			}
-			if(NULL != phi[l]){
-				RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, t1,n2);
-				RNP::LinearSolve<'N'>(n2, n2, t1, n2, in2, n2, &solve_info, pivots);
-			}
-
-			RNP::TBLAS::CopyMatrix<'A'>(n2,n2, in2,n2, t1,n2); // in2 = P, t1 = P, in1 = Q
-			RNP::TBLAS::Axpy(n2*n2, -1., in1,1, in2,1); // in2 = P-Q, t1 = P, in1 = Q
-			RNP::TBLAS::Axpy(n2*n2, 1., t1,1, in1,1); // in2 = P+Q, t1 = P, in1 = P+Q
-			RNP::TBLAS::Scale(n2*n2, 0.5, in1,1);
-			RNP::TBLAS::Scale(n2*n2, 0.5, in2,1);
-		}
-#ifdef DUMP_MATRICES
-		DUMP_STREAM << "Interface1(" << l+1 << ") = " << std::endl;
-# ifdef DUMP_MATRICES_LARGE
-		RNP::IO::PrintMatrix(n2,n2,in1,n2, DUMP_STREAM) << std::endl << std::endl;
-# else
-		RNP::IO::PrintVector(n2,in1,1, DUMP_STREAM) << std::endl << std::endl;
-# endif
-		DUMP_STREAM << "Interface2(" << l+1 << ") = " << std::endl;
-# ifdef DUMP_MATRICES_LARGE
-		RNP::IO::PrintMatrix(n2,n2,in2,n2, DUMP_STREAM) << std::endl << std::endl;
-# else
-		RNP::IO::PrintVector(n2,in2,1, DUMP_STREAM) << std::endl << std::endl;
-# endif
-#endif
-
-		for(size_t i = 0; i < n2; ++i){
-			d1[i] = std::exp(q[l  ][i] * std::complex<double>(0,thickness[l  ]));
-			d2[i] = std::exp(q[lp1][i] * std::complex<double>(0,thickness[lp1]));
-		}
-
-		// Make S11
-		RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, -1.,&S[0+n2*n4],n4, in2,n2, 0.,t1,n2); // t1 = -S12 I21
-		for(size_t i = 0; i < n2; ++i){ // t1 = -f_l S12 I21
-			RNP::TBLAS::Scale(n2, d1[i], &t1[i+0*n2], n2);
-		}
-		RNP::TBLAS::Axpy(n2*n2, 1., in1,1, t1,1); // t1 = (I11 - f_l S12 I21)
-
-		RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., t2,n2);
-		int solve_info;
-		RNP::LinearSolve<'N'>(n2, n2, t1, n2, t2, n2, &solve_info, pivots); // t2 = (I11 - f_l S12 I21)^{-1}
-
-		RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &S[0+0*n4],n4, t1,n2);
-		for(size_t i = 0; i < n2; ++i){ // t1 = f_l S11
-			RNP::TBLAS::Scale(n2, d1[i], &t1[i+0*n2], n2);
-		}
-		RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t2,n2, t1,n2, 0.,&S[0+0*n4],n4);
-		// S11 is done, and we need to hold on to t2 = (I11 - f_l S12 I21)^{-1}
-
-		RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,&S[0+n2*n4],n4, in1,n2, 0.,t1,n2); // t1 = S12 I22
-		for(size_t i = 0; i < n2; ++i){ // t1 = f_l S12 I22
-			RNP::TBLAS::Scale(n2, d1[i], &t1[i+0*n2], n2);
-		}
-		RNP::TBLAS::Axpy(n2*n2, -1., in2,1, t1,1); // t1 = f_l S12 I22 - I12
-		for(size_t i = 0; i < n2; ++i){ // t1 = (f_l S12 I22 - I12) f_{l+1}
-			RNP::TBLAS::Scale(n2, d2[i], &t1[0+i*n2], 1);
-		}
-		RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t2,n2, t1,n2, 0.,&S[0+n2*n4],n4);
-		// S12 done, and t2 can be reused
-
-		RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,&S[n2+n2*n4],n4, in2,n2, 0.,t1,n2); // t1 = S22 I21
-		RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t1,n2, &S[0+0*n4],n4, 1.,&S[n2+0*n4],n4);
-		// S21 done, need to keep t1 = S22 I21
-
-		RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,&S[n2+n2*n4],n4, in1,n2, 0.,t2,n2); // t2 = S22 I22
-		for(size_t i = 0; i < n2; ++i){ // t2 = S22 I22 f_{l+1}
-			RNP::TBLAS::Scale(n2, d2[i], &t2[0+i*n2], 1);
-		}
-		RNP::TBLAS::CopyMatrix<'A'>(n2,n2, t2,n2, &S[n2+n2*n4],n4);
-		RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t1,n2, &S[0+n2*n4],n4, 1.,&S[n2+n2*n4],n4);
-
-#ifdef DUMP_MATRICES
-		DUMP_STREAM << "S(1," << l+2 << ") = " << std::endl;
-# ifdef DUMP_MATRICES_LARGE
-		RNP::IO::PrintMatrix(n4,n4,S,n4, DUMP_STREAM) << std::endl << std::endl;
-# else
-		RNP::IO::PrintVector(n4,S,1, DUMP_STREAM) << std::endl << std::endl;
-# endif
-#endif
-	}
-	if(NULL == work_ || lwork < n4*(n4+1)){
-		rcwa_free(work);
-	}
-	if(NULL == iwork){
-		rcwa_free(pivots);
-	}
-}
-
-
-int SolveAll(
-	size_t nlayers,
-	size_t n, // glist.n
-	const double *kx, const double *ky,
-	std::complex<double> omega,
-	const double *thickness, // list of thicknesses
-	const std::complex<double> **q, // list of q vectors
-	const std::complex<double> **Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix
-	int *epstype,
-	const std::complex<double> **kp,
-	const std::complex<double> **phi,
-	std::complex<double> *ab, // length 2*n*nlayers
-	std::complex<double> *work_, // length lwork
-	size_t *iwork, // length n2*nlayers
-	size_t lwork // set to -1 for query into iwork[0], at least 6*n2^2*nlayers
-){
-	const size_t n2 = 2*n;
-	const size_t n4 = 2*n2;
-	const size_t n22 = n2*n2;
-	const size_t minwork = 6*n22*nlayers;
-	if((size_t)-1 == lwork){
-		iwork[0] = minwork;
-		return 0;
+		work_[0] = n4*(n4+1);
+		return;
 	}
-	typedef std::complex<double> doublecomplex;
-	doublecomplex *work = work_;
-	if(0 == lwork && NULL == work_){
-		work = (doublecomplex*)rcwa_malloc(sizeof(doublecomplex) * minwork);
-		if(NULL == work){
-printf("Could not allocate %d\n", (int)minwork); fflush(stdout);
-			return 1;
-		}
-	}else if(lwork < minwork){
-		// error: not enough workspace
-		return -15;
+	std::complex<double> *work = work_;
+	if(NULL == work_ || lwork < n4*(n4+1)){
+		work = (std::complex<double>*)rcwa_malloc(sizeof(std::complex<double>)*(n4*(n4+1)));
+	}
+	size_t *pivots = iwork;
+	if(NULL == iwork){
+		pivots = (size_t*)rcwa_malloc(sizeof(size_t)*n4);
 	}
 	
-	// Set up temporaries
-	doublecomplex *t1 = work;
-	doublecomplex *t2 = t1 + n22;
-	doublecomplex *in1 = t2 + n22;
-	doublecomplex *in2 = in1 + n22;
-	
-	for(size_t ip = 0, iq = 1; iq < nlayers; ip=iq++){
-		// Set pointers to current row of matrices
-		doublecomplex *row = work+6*n22*iq;
-		
-		// Precondition for current loop iteration:
-		//   Qprev from previous iteration was formed correctly
-		
-		/*
-		// Assemble interface T-matrix
-		//   Assemble mode-to-field operator for p in row
-		if(0 == p){
-			// TODO: Generate mode-to-field operator for p
-			
-			// Factor and invert P
-			if(layer p is uniform){
-			}else if(layer p is z-uncoupled){
-			}else{
-				LUFactor(n4, n4, row, n4, ipiv);
-				LUInvert(n4, n4, row, n4, ipiv);
-			}
-		}else{ // Mode-to-field operator for p was generated and factored in previous iteration
-			// Only invert P (in-place)
-			if(layer p is uniform){
-			}else if(layer p is z-uncoupled){
-			}else{
-				LUInvert(n4, n4, row, n4, ipiv);
-			}
-		}
-		//   Assemble mode-to-field operator for q in rownext
-		// TODO: Generate mode-to-field operator for q
-		if(layer q is uniform){
-		}else if(layer q is z-uncoupled){
-		}else{
-			LUFactor(n4, n4, rownext, n4, ipiv);
-		}
-		// In-place multiply q into p
-		LUMultRight(n4, n4, rownext, n4, ipiv);
-		*/
+	RNP::TBLAS::SetMatrix<'A'>(n4,n4, 0.,1., S, n4);
+
+	std::complex<double> *t1 = work;
+	std::complex<double> *t2 = t1 + n2*n2;
+	std::complex<double> *in1 = t2 + n2*n2;
+	std::complex<double> *in2 = in1 + n2*n2;
+	std::complex<double> *d1 = in2 + n2*n2;
+	std::complex<double> *d2 = d1 + n2;
+
+	for(size_t l = 0; l < nlayers-1; ++l){
+		size_t lp1 = l+1;
+		if(lp1 >= nlayers){ lp1 = l; }
 		
 		// Make the interface matrices
-		if((ip == iq) || (q[ip] == q[iq] && ((NULL != kp[ip] && kp[ip] == kp[iq]) || Epsilon_inv[ip] == Epsilon_inv[iq]) && phi[ip] == phi[iq])){
+		if((lp1 == l) || (q[l] == q[lp1] && ((NULL != kp[l] && kp[l] == kp[lp1]) || Epsilon_inv[l] == Epsilon_inv[lp1]) && phi[l] == phi[lp1])){
 			// This is a trivial interface, set to identity
 			RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., in1, n2);
 			RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., in2, n2);
@@ -1281,7 +562,7 @@ printf("Could not allocate %d\n", (int)minwork); fflush(stdout);
 			{
 				std::complex<double> *Ml = (std::complex<double>*)rcwa_malloc(sizeof(std::complex<double>)*n4*n4);
 				std::complex<double> *Mlp1 = (std::complex<double>*)rcwa_malloc(sizeof(std::complex<double>)*n4*n4);
-
+				
 				RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., &Ml[0+0*n4],n4);
 				RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, &Ml[n2+0*n4],n4);
 				RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, &Ml[n2+n2*n4],n4);
@@ -1294,7 +575,7 @@ printf("Could not allocate %d\n", (int)minwork); fflush(stdout);
 				for(size_t i = 0; i < n2; ++i){
 					RNP::TBLAS::Scale(n2, -1., &Ml[0+(i+n2)*n4], 1);
 				}
-
+				
 				RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., &Mlp1[0+0*n4],n4);
 				RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, &Mlp1[n2+0*n4],n4);
 				RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, &Mlp1[n2+n2*n4],n4);
@@ -1307,7 +588,7 @@ printf("Could not allocate %d\n", (int)minwork); fflush(stdout);
 				for(size_t i = 0; i < n2; ++i){
 					RNP::TBLAS::Scale(n2, -1., &Mlp1[0+(i+n2)*n4], 1);
 				}
-
+				
 				//RNP::LinearSolve<'N'>(n4, n4, Ml, n4, Mlp1, n4, NULL, pivots);
 #ifdef DUMP_MATRICES
 			DUMP_STREAM << "M(" << l << ") = " << std::endl;
@@ -1323,28 +604,29 @@ printf("Could not allocate %d\n", (int)minwork); fflush(stdout);
 			RNP::IO::PrintVector(n4,Mlp1,1, DUMP_STREAM) << std::endl << std::endl;
 # endif
 #endif
-
+				
 				rcwa_free(Mlp1);
 				rcwa_free(Ml);
 			}
 			*/
-			
-			//// Begin copy from GetSMatrix
 			// Make Bl in t1
 			RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., t1,n2);
 			{
-				if(NULL == phi[ip]){
-					if(NULL == kp[ip]){
-						MakeKPMatrix(omega, n, kx, ky, Epsilon_inv[ip], epstype[ip], kp[ip], t1,n2);
+				if(NULL == phi[l]){
+					if(NULL == kp[l]){
+						MakeKPMatrix(omega, n, kx, ky, Epsilon_inv[l], epstype[l], kp[l], t1,n2);
 					}else{
-						RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp[ip],n2, t1,n2);
+						RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp[l],n2, t1,n2);
 					}
 				}else{
-					MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv[ip], epstype[ip], kp[ip], n2, phi[ip],n2, t1,n2);
+					MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv[l], epstype[l], kp[l], n2, phi[l],n2, t1,n2);
 				}
+				//for(size_t i = 0; i < n2; ++i){
+				//	RNP::TBLAS::Scale(n2, 1./q[l][i], &t1[0+i*n2], 1);
+				//}
 			}
 #ifdef DUMP_MATRICES
-			DUMP_STREAM << "Bl(" << ip << ") = " << std::endl;
+			DUMP_STREAM << "Bl(" << l << ") = " << std::endl;
 # ifdef DUMP_MATRICES_LARGE
 			RNP::IO::PrintMatrix(n2,n2,t1,n2, DUMP_STREAM) << std::endl << std::endl;
 # else
@@ -1354,18 +636,21 @@ printf("Could not allocate %d\n", (int)minwork); fflush(stdout);
 			// Make Blp1 in in1
 			RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., in1,n2);
 			{
-				if(NULL == phi[iq]){
-					if(NULL == kp[iq]){
-						MakeKPMatrix(omega, n, kx, ky, Epsilon_inv[iq], epstype[iq], kp[iq], in1,n2);
+				if(NULL == phi[lp1]){
+					if(NULL == kp[lp1]){
+						MakeKPMatrix(omega, n, kx, ky, Epsilon_inv[lp1], epstype[lp1], kp[lp1], in1,n2);
 					}else{
-						RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp[iq],n2, in1,n2);
+						RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp[lp1],n2, in1,n2);
 					}
 				}else{
-					MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv[iq], epstype[iq], kp[iq], n2, phi[iq],n2, in1,n2);
+					MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv[lp1], epstype[lp1], kp[lp1], n2, phi[lp1],n2, in1,n2);
 				}
+				//for(size_t i = 0; i < n2; ++i){
+				//	RNP::TBLAS::Scale(n2, 1./q[lp1][i], &in1[0+i*n2], 1);
+				//}
 			}
 #ifdef DUMP_MATRICES
-		DUMP_STREAM << "Bl(" << iq << ") = " << std::endl;
+		DUMP_STREAM << "Bl(" << l+1 << ") = " << std::endl;
 # ifdef DUMP_MATRICES_LARGE
 		RNP::IO::PrintMatrix(n2,n2,in1,n2, DUMP_STREAM) << std::endl << std::endl;
 # else
@@ -1374,39 +659,39 @@ printf("Could not allocate %d\n", (int)minwork); fflush(stdout);
 #endif
 			int solve_info;
 			// Make Q in in1
-			RNP::LinearSolve<'N'>(n2, n2, t1, n2, in1, n2, &solve_info, iwork);
-			//SingularLinearSolve(n2,n2,n2, t1,n2, in1,n2, DBL_EPSILON);
+			//RNP::LinearSolve<'N'>(n2, n2, t1, n2, in1, n2, &solve_info, pivots);
+			SingularLinearSolve(n2,n2,n2, t1,n2, in1,n2, DBL_EPSILON);
 			// Now perform the diagonal scalings
 			for(size_t i = 0; i < n2; ++i){
-				RNP::TBLAS::Scale(n2, q[ip][i], &in1[i+0*n2], n2);
+				RNP::TBLAS::Scale(n2, q[l][i], &in1[i+0*n2], n2);
 			}
 			{
 				double maxel = 0;
 				for(size_t i = 0; i < n2; ++i){
-					double el = std::abs(q[iq][i]);
+					double el = std::abs(q[lp1][i]);
 					if(el > maxel){ maxel = el; }
 				}
 				for(size_t i = 0; i < n2; ++i){
-					double el = std::abs(q[iq][i]);
+					double el = std::abs(q[lp1][i]);
 					if(el < DBL_EPSILON * maxel){
 						RNP::TBLAS::Scale(n2, 0., &in1[0+i*n2], 1);
 					}else{
-						RNP::TBLAS::Scale(n2, 1./q[iq][i], &in1[0+i*n2], 1);
+						RNP::TBLAS::Scale(n2, 1./q[lp1][i], &in1[0+i*n2], 1);
 					}
 				}
 			}
-
+			
 			// Make P in in2
-			if(NULL == phi[iq]){
+			if(NULL == phi[lp1]){
 				RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., in2,n2);
 			}else{
-				RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[iq],n2, in2,n2);
+				RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, in2,n2);
 			}
-			if(NULL != phi[ip]){
-				RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[ip],n2, t1,n2);
-				RNP::LinearSolve<'N'>(n2, n2, t1, n2, in2, n2, &solve_info, iwork);
+			if(NULL != phi[l]){
+				RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, t1,n2);
+				RNP::LinearSolve<'N'>(n2, n2, t1, n2, in2, n2, &solve_info, pivots);
 			}
-
+			
 			RNP::TBLAS::CopyMatrix<'A'>(n2,n2, in2,n2, t1,n2); // in2 = P, t1 = P, in1 = Q
 			RNP::TBLAS::Axpy(n2*n2, -1., in1,1, in2,1); // in2 = P-Q, t1 = P, in1 = Q
 			RNP::TBLAS::Axpy(n2*n2, 1., t1,1, in1,1); // in2 = P+Q, t1 = P, in1 = P+Q
@@ -1414,260 +699,80 @@ printf("Could not allocate %d\n", (int)minwork); fflush(stdout);
 			RNP::TBLAS::Scale(n2*n2, 0.5, in2,1);
 		}
 #ifdef DUMP_MATRICES
-		DUMP_STREAM << "Interface1(" << iq << ") = " << std::endl;
+		DUMP_STREAM << "Interface1(" << l+1 << ") = " << std::endl;
 # ifdef DUMP_MATRICES_LARGE
 		RNP::IO::PrintMatrix(n2,n2,in1,n2, DUMP_STREAM) << std::endl << std::endl;
 # else
 		RNP::IO::PrintVector(n2,in1,1, DUMP_STREAM) << std::endl << std::endl;
 # endif
-		DUMP_STREAM << "Interface2(" << iq << ") = " << std::endl;
+		DUMP_STREAM << "Interface2(" << l+1 << ") = " << std::endl;
 # ifdef DUMP_MATRICES_LARGE
 		RNP::IO::PrintMatrix(n2,n2,in2,n2, DUMP_STREAM) << std::endl << std::endl;
 # else
 		RNP::IO::PrintVector(n2,in2,1, DUMP_STREAM) << std::endl << std::endl;
 # endif
 #endif
-
-		//PrintMatrix("in1", n2, n2, in1, n2);
-		//PrintMatrix("in2", n2, n2, in2, n2);
-
-		doublecomplex *Sba = row;
-		doublecomplex *Saa = Sba+n2;
-		doublecomplex *Sbb = row+n2*n4;
-		doublecomplex *Sab = Sbb+n2;
 		
-//printf("Preparing to exchange T to S\n"); fflush(stdout);
-
-		// Exchange to interface S-matrix, and also swap block rows
-		Copy(n2,n2, in1,n2, Saa,n4);
-		Invert(n2, Saa,n4, Sbb, n22, iwork); // Use Sbb as workspace
-		Mult(n2,  1., in2,n2, Saa,n4, 0., Sba,n4);
-		Mult(n2, -1., Saa,n4, in2,n2, 0., Sab,n4);
-		Copy(n2,n2, in1,n2, Sbb,n4);
-		Mult(n2,  1., in2,n2, Sab,n4, 1., Sbb,n4);
-
-//printf("Exchanged T to S\n"); fflush(stdout);
-
-//		PrintMatrix("Saa", n2, n2, Saa, n4);
-//		PrintMatrix("Sab", n2, n2, Sab, n4);
-//		PrintMatrix("Sba", n2, n2, Sba, n4);
-//		PrintMatrix("Sbb", n2, n2, Sbb, n4);
-		
-		// Fold in propagation phases into the S-matrix
-		for(size_t j = 0; j < n2; ++j){
-			doublecomplex sp = std::exp(q[ip][j] * std::complex<double>(0,thickness[ip]));
-			doublecomplex sq = std::exp(q[iq][j] * std::complex<double>(0,thickness[iq]));
-//printf("Scale factor ip j = %f + %f i\n", sp.real(), sp.imag());
-//printf("Scale factor iq j = %f + %f i\n", sq.real(), sq.imag());
-			Scale(n4, sp, &row[0+j*n4]);
-			Scale(n4, sq, &row[0+(j+n2)*n4]);
+		for(size_t i = 0; i < n2; ++i){
+			d1[i] = std::exp(q[l  ][i] * std::complex<double>(0,thickness[l  ]));
+			d2[i] = std::exp(q[lp1][i] * std::complex<double>(0,thickness[lp1]));
 		}
 
-//		PrintMatrix("Saa with phase", n2, n2, Saa, n4);
-//		PrintMatrix("Sab with phase", n2, n2, Sab, n4);
-//		PrintMatrix("Sba with phase", n2, n2, Sba, n4);
-//		PrintMatrix("Sbb with phase", n2, n2, Sbb, n4);
+		// Make S11
+		RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, -1.,&S[0+n2*n4],n4, in2,n2, 0.,t1,n2); // t1 = -S12 I21
+		for(size_t i = 0; i < n2; ++i){ // t1 = -f_l S12 I21
+			RNP::TBLAS::Scale(n2, d1[i], &t1[i+0*n2], n2);
+		}
+		RNP::TBLAS::Axpy(n2*n2, 1., in1,1, t1,1); // t1 = (I11 - f_l S12 I21)
 
-//printf("Applied phases\n"); fflush(stdout);
-	}
-	///////////// End copy from GetSMatrix
-	
-	// At this point, the workspace is divided into portions of length 6*n22
-	// for each layer. The interface scattering matrix between layers i and i+1
-	// begins at offset 6*n22*(i+1) in the workspace. The contents of each block
-	// has the form:
-	//   [ Sba Sbb ]
-	//   [ Saa Sab ]
-	// The system of equations has instead a block row of the form:
-	//   [ -Sba  I   0  -Sbb ]
-	//   [ -Saa  0   I  -Sab ]
-	// Assembling all such rows, one obtains a rectangular matrix of size
-	// 4*n*(m-1) by 4*n*m where m is the number of layers.
-	// The labeling of the rows is [ b0, a1, b1, a2, b2, ... am ]
-	// The labeling of the columns is [ a0, b0, a1, b1, a2, b2, ... am, bm ]
-	// The first and last block columns correspond to boundary conditions and
-	// hence must be moved to the RHS. This produces in the end a square matrix.
-	
-	// Prepare the RHS
-	Mult(n4, n2, 1., &work[6*n22], n4, &ab[0], 0, t1);
-	Copy(n4, t1, 1, &ab[n2], 1);
-	Mult(n4, n2, 1., &work[6*n22*(nlayers-1)+n2*n4], n4, &ab[(nlayers-1)*n4+n2], 0, t1);
-	Copy(n4, t1, 1, &ab[(nlayers-1)*n4-n2], 1);
-	
-//	PrintMatrix("RHS0", n4,nlayers, ab, n4);
-	
-	// At this point, we can compute the quantities involved in the LDU decomposition
-	// of the system matrix. The initial diagonal pivot block is
-	//   [ I    0  | -Sbb  0 ]
-	//   [ 0    I  | -Sab  0 ]
-	//   [ --------+-------- ]
-	//   [ 0  -Sba |         ]
-	//   [ 0  -Saa |         ]
-	// Notice that the upper and lower diagonal blocks come from different interfaces.
-	// The first LDU factorization step is trivial, and there are m-1 in total.
-	// Therefore, we need only iterate through the m-2 inner layers.
-	
-	for(size_t i = 1; i < nlayers; i++){
-//printf("i = %d\n", (int)i);
-		// Set pointers to current row of matrices
-		doublecomplex *row = work+6*n22*i;
-		doublecomplex *P = row + 4*n22;
-		doublecomplex *Q = P + n22;
-		doublecomplex *Pprev = P - 6*n22;
-		doublecomplex *Qprev = Pprev + n22;
-			
-		const doublecomplex *Sbb = row-6*n22+n2*n4;
-		const doublecomplex *Sab = Sbb+n2;
-		
-		const doublecomplex *Sba = row;
-		const doublecomplex *Saa = Sba+n2;
-		size_t *ipivP = iwork + n2*i;
+		RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., t2,n2);
+		int solve_info;
+		RNP::LinearSolve<'N'>(n2, n2, t1, n2, t2, n2, &solve_info, pivots); // t2 = (I11 - f_l S12 I21)^{-1}
 		
-		// Perform LDU factorization step
-		if(1 == i){ // First step is trivial
-			SetMatrix(n2, n2, 1., 0., P, n2);
-			ZeroMatrix(n2, n2, Q, n2);
-		}else{
-			Copy(n2, n2, Sab, n4, t1, n2); // t1 = Sab
-			Copy(n2, n2, Sbb, n4, Q, n2); // Q = Sbb, t1 = Sab
-			LUSolve(n2, n2, Pprev, n2, ipivP-n2, Q, n2); // Q = inv(P) Sbb, t1 = Sab
-//	PrintMatrix("t1(Sab)", n2,n2, Sab, n4);
-//	PrintMatrix("         Sbb", n2,n2, Sbb, n4);
-//	PrintMatrix("invPprev Sbb", n2,n2, Q, n2);
-//	PrintMatrix("Qprev", n2,n2, Qprev, n2);
-			Mult(n2, 1., Qprev, n2, Q, n2, -1., t1, n2); // t1 = Q*inv(P)*Sbb - Sab
-//	PrintMatrix("Qprev invPprev Sbb", n2,n2, t1, n2);
-			Mult(n2, 1., Saa, n4, t1, n2, 0., Q, n2);
-	//PrintMatrix("Saa", n2,n2, Saa, n4);
-	//PrintMatrix("Saa Qprev invPprev Sbb", n2,n2, Q, n2);
-			
-			Mult(n2, 1., Sba, n4, t1, n2, 0., P, n2);
-			for(size_t j = 0; j < n2; ++j){
-				P[j+j*n2] += 1.;
-			}
-			
-	//		PrintMatrix("Q", n2,n2, Q, n2);
-	//		PrintMatrix("P", n2,n2, P, n2);
+		RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &S[0+0*n4],n4, t1,n2);
+		for(size_t i = 0; i < n2; ++i){ // t1 = f_l S11
+			RNP::TBLAS::Scale(n2, d1[i], &t1[i+0*n2], n2);
 		}
-//	PrintMatrix("P", n2,n2, P, n2);
-		LUFactor(n2, P, n2, ipivP);
-//	PrintMatrix("Pfactored", n2,n2, P, n2);
-//	for(size_t i = 0; i < n2; ++i){
-//		printf(" %d", ((int*)ipivP)[i]);
-//	} printf("\n");
+		RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t2,n2, t1,n2, 0.,&S[0+0*n4],n4);
+		// S11 is done, and we need to hold on to t2 = (I11 - f_l S12 I21)^{-1}
 
-//printf("LDU step completed\n"); fflush(stdout);
-	}
-	
-	// Now use LDU factorization to solve
-	//   L D U x = b
-	//   x = U \ (D \ (L \ b))
-	//
-	// Forward pass for D \ (L \ b)
-	for(size_t j = 1; j+1 < nlayers; ++j){
-		// Set pointers to current row of matrices
-		doublecomplex *row = work+6*n22*j;
-		doublecomplex *P = row + 4*n22;
-		doublecomplex *Q = P + n22;
-		doublecomplex *Pprev = P - 6*n22;
-		doublecomplex *Qprev = Pprev + n22;
-			
-		doublecomplex *Sbb = row+n2*n4;
-		doublecomplex *Sab = Sbb+n2;
-		
-		doublecomplex *Sba = row+6*n22;
-		doublecomplex *Saa = Sba+n2;
-		
-		doublecomplex *aj = &ab[j*n4];
-		doublecomplex *bjm1 = aj-n2;
-		doublecomplex *bjp1 = aj+n2;
-		size_t *ipivP = iwork + n2*j;
-		
-		// sub-diagonal block:
-		//   [     P               |  ]
-		//   [     Q            I  |  ]
-		//   [ --------------------+- ]
-		//   [ Sba Q inv(P)   -Sba |  ]
-		//   [ Saa Q inv(P)   -Saa |  ]
-		Copy(n4, bjm1, 1, t1, 1);
-//PrintMatrix("bjm1", n4,1, bjm1, n4);
-//PrintMatrix("Applied matrix", n4,n2, Sba, n4);
-		Mult(n4, n2, 1., Sba, n4, &t1[n2], 1., bjp1);
-		if(j > 1){ // Second iteration should use the fact that Q = 0
-			LUSolve(n2, 1, P, n2, ipivP, t1, n2);
-			Mult(n2, n2, -1., Q, n2, t1, 0., &t1[n2]);
-			Mult(n4, n2,  1., Sba, n4, &t1[n2], 1., bjp1);
+		RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,&S[0+n2*n4],n4, in1,n2, 0.,t1,n2); // t1 = S12 I22
+		for(size_t i = 0; i < n2; ++i){ // t1 = f_l S12 I22
+			RNP::TBLAS::Scale(n2, d1[i], &t1[i+0*n2], n2);
 		}
-	}
-//printf("Forward pass completed\n"); fflush(stdout);
-//PrintMatrix("RHS1", n4,nlayers, ab, n4);
-	
-	// Diagonal pass
-	for(size_t j = 2; j < nlayers; ++j){
-		// Set pointers to current row of matrices
-		const doublecomplex *row = work+6*n22*j;
-		const doublecomplex *P = row + 4*n22;
-		const doublecomplex *Q = P + n22;
-		size_t *ipivP = iwork + n2*j;
-		doublecomplex *aj = &ab[j*n4];
-		doublecomplex *bjm1 = aj-n2;
-		// Diaonal block:
-		//   [ P     0 ]
-		//   [ Q     I ]
-		// Inverse:
-		//   [    inv(P)   0 ] [ bjm1 ]
-		//   [ -Q inv(P)   I ] [ aj   ]
-		LUSolve(n2, 1, P, n2, ipivP, bjm1, n2);
-//PrintMatrix("bjm1", n2,1, bjm1, n2);
-		Mult(n2, n2, -1., Q, n2, bjm1, 1., aj);
-//PrintMatrix("Q", n2,n2, Q, n2);
-	}
-//printf("Diagonal pass completed\n"); fflush(stdout);
-//PrintMatrix("RHS2", n4,nlayers, ab, n4);
-	
-	// Backward pass for U \ ...
-	for(size_t j = nlayers-2; j > 0; --j){
-		// Set pointers to current row of matrices
-		doublecomplex *row = work+6*n22*j;
-		doublecomplex *P = row + 4*n22;
-		doublecomplex *Q = P + n22;
-		doublecomplex *Pprev = P - 6*n22;
-		doublecomplex *Qprev = Pprev + n22;
-			
-		doublecomplex *Sbb = row+n2*n4;
-		doublecomplex *Sab = Sbb+n2;
-		
-		doublecomplex *Sba = row+6*n22;
-		doublecomplex *Saa = Sba+n2;
+		RNP::TBLAS::Axpy(n2*n2, -1., in2,1, t1,1); // t1 = f_l S12 I22 - I12
+		for(size_t i = 0; i < n2; ++i){ // t1 = (f_l S12 I22 - I12) f_{l+1}
+			RNP::TBLAS::Scale(n2, d2[i], &t1[0+i*n2], 1);
+		}
+		RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t2,n2, t1,n2, 0.,&S[0+n2*n4],n4);
+		// S12 done, and t2 can be reused
 		
-		doublecomplex *aj = &ab[j*n4];
-		doublecomplex *bjm1 = aj-n2;
-		doublecomplex *bjp1 = aj+n2;
-		size_t *ipivP = iwork + n2*j;
+		RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,&S[n2+n2*n4],n4, in2,n2, 0.,t1,n2); // t1 = S22 I21
+		RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t1,n2, &S[0+0*n4],n4, 1.,&S[n2+0*n4],n4);
+		// S21 done, need to keep t1 = S22 I21
 		
-		// super-diagonal block:
-		//   [ P   0 | -inv(P) Sbb            0 ]
-		//   [ Q   I | Q inv(P) Sbb - Sab     0 ]
-		//   [ ------+------------------------- ]
-		//   [       |                          ]
-		Mult(n2, n2, 1., Sbb, n4, bjp1, 0., t1);
-		if(j > 0){
-			LUSolve(n2, 1, P, n2, ipivP, t1, n2);
-		}
-		Axpy(n2, 1., t1, 1, bjm1, 1);
-		if(j > 0){
-			Mult(n2, n2, -1., Q, n2, t1, 1., aj);
+		RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,&S[n2+n2*n4],n4, in1,n2, 0.,t2,n2); // t2 = S22 I22
+		for(size_t i = 0; i < n2; ++i){ // t2 = S22 I22 f_{l+1}
+			RNP::TBLAS::Scale(n2, d2[i], &t2[0+i*n2], 1);
 		}
-		Mult(n2, n2,  1., Sab, n4, bjp1, 1., aj);
+		RNP::TBLAS::CopyMatrix<'A'>(n2,n2, t2,n2, &S[n2+n2*n4],n4);
+		RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t1,n2, &S[0+n2*n4],n4, 1.,&S[n2+n2*n4],n4);
+
+#ifdef DUMP_MATRICES
+		DUMP_STREAM << "S(1," << l+2 << ") = " << std::endl;
+# ifdef DUMP_MATRICES_LARGE
+		RNP::IO::PrintMatrix(n4,n4,S,n4, DUMP_STREAM) << std::endl << std::endl;
+# else
+		RNP::IO::PrintVector(n4,S,1, DUMP_STREAM) << std::endl << std::endl;
+# endif
+#endif
 	}
-	
-//printf("Backward pass completed\n"); fflush(stdout);
-//PrintMatrix("RHS3", n4,nlayers, ab, n4);
-	
-	if(NULL == work_){
+	if(NULL == work_ || lwork < n4*(n4+1)){
 		rcwa_free(work);
 	}
-	return 0;
+	if(NULL == iwork){
+		rcwa_free(pivots);
+	}
 }
 
 int SolveInterior(
@@ -1691,7 +796,7 @@ int SolveInterior(
 ){
 	if(0 == nlayers){ return-1; }
 	if(which_layer >= nlayers){ return -2; }
-
+	
 	const size_t n2 = 2*n;
 	const size_t n4 = 2*n2;
 	const size_t lwork_GetSMatrix = 4*n*(4*n+1);
@@ -1719,7 +824,7 @@ int SolveInterior(
 	std::complex<double> *bl = al+n2;
 	std::complex<double> *temp = S0l;
 	size_t ldtemp = n4;
-
+	
 	int info;
 
 	GetSMatrix(which_layer+1, n, kx, ky, omega,
@@ -1729,16 +834,6 @@ int SolveInterior(
 		thickness+which_layer, q+which_layer, Epsilon_inv+which_layer, epstype+which_layer, kp+which_layer, phi+which_layer,
 		SlN, work_GetSMatrix, pivots, lwork_GetSMatrix);
 
-#ifdef DUMP_MATRICES
-	if(NULL != a0){
-		DUMP_STREAM << "a0 = " << std::endl;
-		RNP::IO::PrintVector(n2,a0,1, DUMP_STREAM) << std::endl << std::endl;
-	}
-	if(NULL != bN){
-		DUMP_STREAM << "bN = " << std::endl;
-		RNP::IO::PrintVector(n2,bN,1, DUMP_STREAM) << std::endl << std::endl;
-	}
-#endif
 #ifdef DUMP_MATRICES
 	DUMP_STREAM << "S0l(0," << which_layer << ") = " << std::endl;
 # ifdef DUMP_MATRICES_LARGE
@@ -1769,10 +864,10 @@ int SolveInterior(
 	}else{
 		RNP::TBLAS::Fill(n2, 0., S22bN, 1);
 	}
-
+	
 	// We overwrite the upper left submatrix S11(0,l) since it's not needed anymore
 	// temp is set to S0l for this reason.
-
+	
 	// Compute -S_12(0,l)S_21(l,N)
 	RNP::TBLAS::MultMM<'N','N'>(n2, n2, n2, std::complex<double>(-1.0), &S0l[0+n2*n4], n4,
 		&SlN[n2+0*n4], n4,
@@ -1858,7 +953,7 @@ void GetZPoyntingFlux(
 	std::complex<double> *work
 ){
 	const size_t n2 = 2*n;
-
+	
 	std::complex<double> *a2 = work;
 	if(NULL == work){
 		a2 = (std::complex<double> *)rcwa_malloc(sizeof(std::complex<double>) * 4*n2);
@@ -1866,7 +961,7 @@ void GetZPoyntingFlux(
 	std::complex<double> *b2 = a2 + n2;
 	std::complex<double> *a3 = b2 + n2;
 	std::complex<double> *b3 = a3 + n2;
-
+	
 	memcpy(a2, ab, sizeof(std::complex<double>) * 2*n2);
 	for(size_t i = 0; i < n2; ++i){ a2[i] /= (omega*q[i]); }
 	for(size_t i = 0; i < n2; ++i){ b2[i] /= (omega*q[i]); }
@@ -1881,16 +976,16 @@ void GetZPoyntingFlux(
 	}else{
 		RNP::TBLAS::MultMM<'C','N'>(n2,2,n2, std::complex<double>(1.),phi,n2, a2,n2, std::complex<double>(0.), a3,n2);
 	}
-
+	
 	// At this point a3[n2] is alpha and b3[n2] is beta,
 	// ab[n2] is a and (ab+n2)[n2] is b
-
+	
 	*forward  =  RNP::TBLAS::ConjugateDot(n2, ab     ,1, a3,1).real();
 	*backward = -RNP::TBLAS::ConjugateDot(n2, &ab[n2],1, b3,1).real();
 	std::complex<double> diff = 0.5*(RNP::TBLAS::ConjugateDot(n2, &ab[n2],1, a3,1) - RNP::TBLAS::ConjugateDot(n2, b3,1, ab,1));
 	*forward  += diff;
 	*backward += std::conj(diff);
-
+	
 	if(NULL == work){
 		rcwa_free(a2);
 	}
@@ -1911,7 +1006,7 @@ void GetZPoyntingFluxComponents(
 	std::complex<double> *work
 ){
 	const size_t n2 = 2*n;
-
+	
 	std::complex<double> *a2 = work;
 	if(NULL == work){
 		a2 = (std::complex<double> *)rcwa_malloc(sizeof(std::complex<double>) * 4*n2);
@@ -1919,7 +1014,7 @@ void GetZPoyntingFluxComponents(
 	std::complex<double> *b2 = a2 + n2;
 	std::complex<double> *a3 = b2 + n2;
 	std::complex<double> *b3 = a3 + n2;
-
+	
 	memcpy(a2, ab, sizeof(std::complex<double>) * 2*n2);
 	for(size_t i = 0; i < n2; ++i){ a2[i] /= (omega*q[i]); }
 	for(size_t i = 0; i < n2; ++i){ b2[i] /= (omega*q[i]); }
@@ -1930,14 +1025,14 @@ void GetZPoyntingFluxComponents(
 	}
 	MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv, epstype, kp, 2, a3,n2, a2,n2);
 	// At this point, a2[n2] contains alpha_e, b2[n2] contains beta_e
-
+	
 	if(NULL == phi){
 		RNP::TBLAS::CopyMatrix<'A'>(n2,2, ab,n2, a3,n2);
 	}else{
 		RNP::TBLAS::MultMM<'N','N'>(n2,2,n2, std::complex<double>(1.),phi,n2, ab,n2, std::complex<double>(0.), a3,n2);
 	}
 	// At this point, a3[n2] contains alpha_h, b3[n2] contains beta_h
-
+	
 	for(size_t i = 0; i < n; ++i){
 		forward[i] = 0;
 		backward[i] = 0;
@@ -1950,7 +1045,7 @@ void GetZPoyntingFluxComponents(
 			backward[i] += std::conj(diff);
 		}
 	}
-
+	
 	if(NULL == work){
 		rcwa_free(a2);
 	}
@@ -1970,7 +1065,7 @@ static void GetInPlaneFieldVector(
 ){
 	const size_t n2 = 2*n;
 	const size_t n4 = 2*n2;
-
+	
 	for(size_t i = 0; i < n2; ++i){
 		eh[4*n2+i] = ab[i]/(omega*q[i]);
 		eh[5*n2+i] = -ab[i+n2]/(omega*q[i]);
@@ -2013,18 +1108,18 @@ void GetFieldAtPoint(
 	const std::complex<double> z_zero(0.);
 	const std::complex<double> z_one(1.);
 	const size_t n2 = 2*n;
-
+	
 	std::complex<double> *eh = work;
 	if(NULL == work){
 		eh = (std::complex<double>*)rcwa_malloc(sizeof(std::complex<double>) * 8*n2);
 	}
-
+	
 	GetInPlaneFieldVector(n, kx, ky, omega, q, epsilon_inv, epstype, kp, phi, ab, eh);
 	const std::complex<double> *hx  = &eh[3*n2+0];
 	const std::complex<double> *hy  = &eh[3*n2+n];
 	const std::complex<double> *ney = &eh[4*n2+0];
 	const std::complex<double> *ex  = &eh[4*n2+n];
-
+	
 	std::complex<double> fE[3], fH[3];
 	fE[0] = 0;
 	fE[1] = 0;
@@ -2032,7 +1127,7 @@ void GetFieldAtPoint(
 	fH[0] = 0;
 	fH[1] = 0;
 	fH[2] = 0;
-
+	
 	if(NULL != efield && NULL != epsilon_inv){
 		for(size_t i = 0; i < n; ++i){
 			eh[i] = (ky[i]*hx[i] - kx[i]*hy[i]);
@@ -2044,7 +1139,7 @@ void GetFieldAtPoint(
 			RNP::TBLAS::MultMV<'N'>(n,n, z_one,epsilon_inv,n, eh,1, z_zero,&eh[n],1);
 		}
 	}
-
+	
 	for(size_t i = 0; i < n; ++i){
 		const double theta = (kx[i]*r[0] + ky[i]*r[1]);
 
@@ -2056,7 +1151,7 @@ void GetFieldAtPoint(
 		fH[2] += (kx[i] * -ney[i] - ky[i] * ex[i]) * (phase / omega);
 		fE[2] += eh[n+i] * (phase / omega);
 	}
-
+	
 	if(NULL != efield && NULL != epsilon_inv){
 		efield[0] = fE[0];
 		efield[1] = fE[1];
@@ -2067,7 +1162,7 @@ void GetFieldAtPoint(
 		hfield[1] = fH[1];
 		hfield[2] = fH[2];
 	}
-
+	
 	if(NULL == work){
 		rcwa_free(eh);
 	}
@@ -2084,7 +1179,6 @@ void GetFieldOnGrid(
 	int epstype,
 	const std::complex<double> *ab, // length 4*glist.n
 	const size_t nxy[2], // number of points per lattice direction
-	const double *xy0,
 	std::complex<double> *efield,
 	std::complex<double> *hfield
 ){
@@ -2095,9 +1189,9 @@ void GetFieldOnGrid(
 	const int nxyoff[2] = { (int)(nxy[0]/2), (int)(nxy[1]/2) };
 	int inxy[2] = { (int)nxy[0], (int)nxy[1] };
 	int inxy_rev[2] = { (int)nxy[1], (int)nxy[0] };
-
+	
 	std::complex<double> *eh = (std::complex<double>*)rcwa_malloc(sizeof(std::complex<double>) * 8*n2);
-
+	
 	GetInPlaneFieldVector(n, kx, ky, omega, q, epsilon_inv, epstype, kp, phi, ab, eh);
 	const std::complex<double> *hx  = &eh[3*n2+0];
 	const std::complex<double> *hy  = &eh[3*n2+n];
@@ -2113,15 +1207,15 @@ void GetFieldOnGrid(
 		memset(from[i], 0, sizeof(std::complex<double>) * N);
 		plan[i] = fft_plan_dft_2d(inxy_rev, from[i], to[i], 1);
 	}
-
+	
 	for(size_t i = 0; i < n; ++i){
 		eh[i] = (ky[i]*hx[i] - kx[i]*hy[i]);
-	}
-	if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){
-		RNP::TBLAS::Scale(n, epsilon_inv[0], eh,1);
-		RNP::TBLAS::Copy(n, eh,1, &eh[n], 1);
-	}else{
-		RNP::TBLAS::MultMV<'N'>(n,n, z_one,epsilon_inv,n, eh,1, z_zero,&eh[n],1);
+		if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){
+			RNP::TBLAS::Scale(n, epsilon_inv[0], eh,1);
+			RNP::TBLAS::Copy(n, eh,1, &eh[n], 1);
+		}else{
+			RNP::TBLAS::MultMV<'N'>(n,n, z_one,epsilon_inv,n, eh,1, z_zero,&eh[n],1);
+		}
 	}
 	for(size_t i = 0; i < n; ++i){
 		const int iu = G[2*i+0];
@@ -2130,7 +1224,6 @@ void GetFieldOnGrid(
 			(nxyoff[0] - (int)nxy[0] < iu && iu <= nxyoff[0]) &&
 			(nxyoff[1] - (int)nxy[1] < iv && iv <= nxyoff[1])
 		){
-			//todo: const std::complex<double> shift_phase(-i 2pi Lk.G.xy0);
 			const int ii = (iu >= 0 ? iu : iu + nxy[0]);
 			const int jj = (iv >= 0 ? iv : iv + nxy[1]);
 			from[0][ii+jj*nxy[0]] = hx[i];
@@ -2141,11 +1234,11 @@ void GetFieldOnGrid(
 			from[5][ii+jj*nxy[0]] = eh[n+i] / omega;
 		}
 	}
-
+	
 	for(unsigned i = 0; i < 6; ++i){
 		fft_plan_exec(plan[i]);
 	}
-
+	
 	for(size_t j = 0; j < nxy[1]; ++j){
 		for(size_t i = 0; i < nxy[0]; ++i){
 			hfield[3*(i+j*nxy[0])+0] = to[0][i+j*nxy[0]];
@@ -2156,7 +1249,7 @@ void GetFieldOnGrid(
 			efield[3*(i+j*nxy[0])+2] = to[5][i+j*nxy[0]];
 		}
 	}
-
+	
 	for(unsigned i = 0; i < 6; ++i){
 		fft_plan_destroy(plan[i]);
 		fft_free(to[i]);
@@ -2183,12 +1276,12 @@ void GetZStressTensorIntegral(
 	const size_t n2 = 2*n;
 	const std::complex<double> z_zero(0.);
 	const std::complex<double> z_one(1.);
-
+	
 	std::complex<double> *eh = work;
 	if(NULL == work){
 		eh = (std::complex<double>*)rcwa_malloc(sizeof(std::complex<double>) * 8*n2);
 	}
-
+	
 	GetInPlaneFieldVector(n, kx, ky, omega, q, epsilon_inv, epstype, kp, phi, ab, eh);
 	std::complex<double> *ez  = &eh[1*n2+n];
 	std::complex<double> *dz  = &eh[2*n2+0];
@@ -2199,11 +1292,11 @@ void GetZStressTensorIntegral(
 	const std::complex<double> *ex  = &eh[4*n2+n];
 	std::complex<double> *ndy = &eh[5*n2+0];
 	std::complex<double> *dx  = &eh[5*n2+n];
-
+	
 	integral[0] = 0;
 	integral[1] = 0;
 	integral[2] = 0;
-
+	
 	for(size_t i = 0; i < n; ++i){
 		hz[i] = (kx[i] * -ney[i] - ky[i] * ex[i]) / omega;
 		dz[i] = (ky[i]*hx[i] - kx[i]*hy[i]) / omega;
@@ -2215,7 +1308,7 @@ void GetZStressTensorIntegral(
 		RNP::TBLAS::MultMV<'N'>(n,n, z_one,epsilon_inv,n, dz,1, z_zero,ez,1);
 	}
 	RNP::TBLAS::MultMV<'N'>(n2,n2, z_one,epsilon2,n2, ney,1, z_zero,ndy,1);
-
+	
 	for(size_t i = 0; i < n; ++i){
 		std::complex<double> cdz = std::conj(dz[i]);
 		std::complex<double> cbz = std::conj(hz[i]);
@@ -2231,7 +1324,7 @@ void GetZStressTensorIntegral(
 		integral[2] -= 0.5*hy[i]*std::conj(hy[i]);
 		integral[2] -= 0.5*hx[i]*std::conj(hx[i]);
 	}
-
+	
 	if(NULL == work){
 		rcwa_free(eh);
 	}
@@ -2273,7 +1366,7 @@ void GetLayerVolumeIntegral(
 	}
 	const std::complex<double> z_zero(0.);
 	const std::complex<double> z_one(1.);
-
+	
 	std::complex<double> *Q = work;
 	if(NULL == work){
 		Q = (std::complex<double>*)rcwa_malloc(sizeof(std::complex<double>)*n4*n4);
@@ -2336,7 +1429,7 @@ void GetLayerVolumeIntegral(
 	// Q = M^H * CsC * M = [  S+T -S+T ], S = fie^H * A * fie
 	//                     [ -S+T  S+T ]  T = phi^H * B * phi
 	// integral = int_0^thickness [x(z)^H * Q * x(z)] dz, x(z) = [a*exp(iqz);b*exp(iq(d-z))]
-
+	
 	// Make T
 	if(NULL != phi){
 		RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, z_one,&Q[n2+n2*n4],n4, phi,n2, z_zero,&Q[0+n2*n4],n4);
@@ -2391,7 +1484,7 @@ void GetLayerVolumeIntegral(
 	// \frac{e^{id/2 (q-q^*)} }{i(s_j q - s_i q^*)} 2 i sin[ d/2 (s_j q - s_i q^*) ]
 	// \frac{d e^{id/2 (q-q^*)} }{d/2 (s_j q - s_i q^*)} sin[ d/2 (s_j q - s_i q^*) ]
 	// d e^{id/2 (q-q^*)} sinc[ d/2 (s_j q - s_i q^*) ], where sinc(x) = sin(x)/x
-
+	
 	*integral = 0;
 	const double d_2 = 0.5*thickness;
 	for(size_t j = 0; j < n4; ++j){
@@ -2434,16 +1527,16 @@ void GetLayerZIntegral(
 	const size_t n2 = 2*n;
 	const size_t n4 = 2*n2;
 	const std::complex<double> iomega(1./omega);
-
+	
 	std::complex<double> *f = work;
 	if(NULL == work){
 		f = (std::complex<double>*)rcwa_malloc(sizeof(std::complex<double>)*12*n4);
 	}
 	std::complex<double> *g = f + 6*n4;
 	RNP::TBLAS::Fill(6*n4, 0., f, 1);
-
+	
 	// We will simultaneously generate f for all 6 components
-
+	
 	// Generate the Fourier phase factors
 	for(size_t i = 0; i < n; ++i){ // this is for ex
 		double theta = (kx[i]*r[0] + ky[i]*r[1]);
@@ -2471,7 +1564,7 @@ void GetLayerZIntegral(
 	for(size_t i = 0; i < n; ++i){ // -hz
 		f[5*n4+1*n+i] = f[n4+i] * iomega * ky[i];
 	}
-
+	
 	// Form M^H C^H * f
 	// M = [ kp.phi.inv(q)   -kp.phi.inv(q) ]
 	//     [     phi                phi     ]
@@ -2493,11 +1586,11 @@ void GetLayerZIntegral(
 			g[i+n2+j*n4] -= t;
 		}
 	}
-
+	
 	for(size_t i = 0; i < 6; ++i){
 		integral[i] = 0;
 	}
-
+	
 	// At this point, each column of g contains a vector v such that
 	// v v^H is a rank-1 matrix that must be integrated over.
 	const double d_2 = 0.5*thickness;
@@ -2515,7 +1608,7 @@ void GetLayerZIntegral(
 			}
 		}
 	}
-
+	
 	if(NULL == work){
 		rcwa_free(f);
 	}
diff --git a/gensetupHPC.py.sh b/gensetupHPC.py.sh
new file mode 100644
index 0000000..6cf65b2
--- /dev/null
+++ b/gensetupHPC.py.sh
@@ -0,0 +1,32 @@
+#!/bin/bash
+
+OBJDIR="$1"
+LIBFILE="$2"
+LIBS="$3"
+x1="$4"
+x2="$5"
+
+cat <<SETUPPY > setup.py
+from distutils.core import setup, Extension
+
+libs = ['S4', 'stdc++']
+libs.extend( [lib[2::] for lib in '$LIBS'.split()])
+
+S4module = Extension('S4',
+	sources = [
+		'S4/main_python.c'
+	],
+	libraries = libs,
+	library_dirs = ['$OBJDIR', $x2],
+	extra_link_args = [
+		'$LIBFILE', $x1
+	],
+	extra_compile_args=['-std=gnu99'] 
+)
+
+setup(name = 'S4',
+	version = '1.1',
+	description = 'Stanford Stratified Structure Solver (S4): Fourier Modal Method',
+	ext_modules = [S4module]
+)
+SETUPPY