diff --git a/MakefileHPC b/MakefileHPC new file mode 100644 index 0000000..c6bef7b --- /dev/null +++ b/MakefileHPC @@ -0,0 +1,251 @@ +# to find path to openmpi, type mpiexec --test , the error messege reveals path +MKL_HOME = /apps/cent7/intel/compilers_and_libraries_2017.1.132/linux/mkl +BROWN_MPI_HOME = /apps/brown/openmpi/2.1.0/gcc-5.2.0 +SNYDER_MPI_HOME = /apps/snyder7/openmpi/2.1.0_gcc-5.2.0 +MPI_HOME = $(SNYDER_MPI_HOME) + +BLAS_LIB = -L$(MKL_HOME)/lib/intel64 -Wl,-rpath=$(MKL_HOME)/lib/intel64 -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core -lgomp +LAPACK_LIB = -L$(MKL_HOME)/lib/intel64 -Wl,-rpath=$(MKL_HOME)/lib/intel64 -lmkl_intel_lp64 -lmkl_gnu_thread -lmkl_core + +EXTRA_FLAGS = '-Wl,-rpath=$(MKL_HOME)/lib/intel64' , '-Wl,-rpath=$(MPI_HOME)/lib' + +EXTRA_DIRS = '$(MKL_HOME)/lib/intel64' , '$(MPI_HOME)/lib' + +#BLAS_LIB = -lblas +#LAPACK_LIB = -llapack + +# OPTIONAL +# Typically if installed, +# FFTW3_INC can be left empty +# FFTW3_LIB = -lfftw3 +# or, if Fedora and/or fftw is version 3 but named fftw rather than fftw3 +# FTW3_LIB = -lfftw +# May need to link libraries properly as with blas and lapack above +FFTW3_INC = -I$(MLK_HOME)/include/fftw/ +FFTW3_LIB = $(BLAS_LIB) + +# Typically, +# PTHREAD_INC = -DHAVE_UNISTD_H +# PTHREAD_LIB = -lpthread +PTHREAD_INC = -DHAVE_UNISTD_H +PTHREAD_LIB = -lpthread + +# OPTIONAL +# If not installed: +# Fedora: dnf install libsuitsparse-devel +# Typically, if installed: +#CHOLMOD_INC = -I/usr/include/suitesparse +#CHOLMOD_LIB = -lcholmod -lamd -lcolamd -lcamd -lccolamd +#CHOLMOD_INC = -I/usr/include/suitesparse +#CHOLMOD_LIB = -lcholmod -lamd -lcolamd -lcamd -lccolamd + +# Specify the MPI library +# For example, on Fedora: dnf install openmpi-devel +#MPI_INC = -I/usr/include/openmpi-x86_64/openmpi/ompi +#MPI_LIB = -lmpi +# or, explicitly link to the library with -L, example below +MPI_LIB = -L$(MPI_HOME)/lib -Wl,-rpath=$(MPI_HOME)/lib -lmpi +MPI_INC = -I$(MPI_HOME)/include +#MPI_LIB = -L/usr/lib64/openmpi/lib/libmpi.so + +# Specify custom compilers if needed +CXX = $(MPI_HOME)/bin/mpic++ +CC = $(MPI_HOME)/bin/mpicc + +#CFLAGS += -O3 -fPIC +CFLAGS = -O3 -msse3 -msse2 -msse -fPIC + +# options for Sampler module +OPTFLAGS = -O3 + +OBJDIR = ./build +S4_BINNAME = $(OBJDIR)/S4 +S4_LIBNAME = $(OBJDIR)/libS4.a +S4r_LIBNAME = $(OBJDIR)/libS4r.a + +##################### DO NOT EDIT BELOW THIS LINE ##################### + +#### Set the compilation flags + +CPPFLAGS = -I. -IS4 -IS4/RNP -IS4/kiss_fft + +ifdef BLAS_LIB +CPPFLAGS += -DHAVE_BLAS +endif + +ifdef LAPACK_LIB +CPPFLAGS += -DHAVE_LAPACK +endif + +ifdef FFTW3_LIB +CPPFLAGS += -DHAVE_FFTW3 $(FFTW3_INC) +endif + +ifdef PTHREAD_LIB +CPPFLAGS += -DHAVE_LIBPTHREAD $(PTHREAD_INC) +endif + +ifdef CHOLMOD_LIB +CPPFLAGS += -DHAVE_LIBCHOLMOD $(CHOLMOD_INC) +endif + +ifdef MPI_LIB +CPPFLAGS += -DHAVE_MPI $(MPI_INC) +endif + +LIBS = $(BLAS_LIB) $(LAPACK_LIB) $(FFTW3_LIB) $(PTHREAD_LIB) $(CHOLMOD_LIB) $(MPI_LIB) + +#### Compilation targets + +all: $(S4_LIBNAME) + +objdir: + mkdir -p $(OBJDIR) + mkdir -p $(OBJDIR)/S4k + mkdir -p $(OBJDIR)/S4r + mkdir -p $(OBJDIR)/modules + +S4_LIBOBJS = \ + $(OBJDIR)/S4k/S4.o \ + $(OBJDIR)/S4k/rcwa.o \ + $(OBJDIR)/S4k/fmm_common.o \ + $(OBJDIR)/S4k/fmm_FFT.o \ + $(OBJDIR)/S4k/fmm_kottke.o \ + $(OBJDIR)/S4k/fmm_closed.o \ + $(OBJDIR)/S4k/fmm_PolBasisNV.o \ + $(OBJDIR)/S4k/fmm_PolBasisVL.o \ + $(OBJDIR)/S4k/fmm_PolBasisJones.o \ + $(OBJDIR)/S4k/fmm_experimental.o \ + $(OBJDIR)/S4k/fft_iface.o \ + $(OBJDIR)/S4k/pattern.o \ + $(OBJDIR)/S4k/intersection.o \ + $(OBJDIR)/S4k/predicates.o \ + $(OBJDIR)/S4k/numalloc.o \ + $(OBJDIR)/S4k/gsel.o \ + $(OBJDIR)/S4k/sort.o \ + $(OBJDIR)/S4k/kiss_fft.o \ + $(OBJDIR)/S4k/kiss_fftnd.o \ + $(OBJDIR)/S4k/SpectrumSampler.o \ + $(OBJDIR)/S4k/cubature.o \ + $(OBJDIR)/S4k/Interpolator.o \ + $(OBJDIR)/S4k/convert.o + +S4r_LIBOBJS = \ + $(OBJDIR)/S4r/Material.o \ + $(OBJDIR)/S4r/LatticeGridRect.o \ + $(OBJDIR)/S4r/LatticeGridArb.o \ + $(OBJDIR)/S4r/POFF2Mesh.o \ + $(OBJDIR)/S4r/PeriodicMesh.o \ + $(OBJDIR)/S4r/Shape.o \ + $(OBJDIR)/S4r/Simulation.o \ + $(OBJDIR)/S4r/Layer.o \ + $(OBJDIR)/S4r/Pseudoinverse.o \ + $(OBJDIR)/S4r/Eigensystems.o \ + $(OBJDIR)/S4r/IRA.o \ + $(OBJDIR)/S4r/intersection.o \ + $(OBJDIR)/S4r/predicates.o \ + $(OBJDIR)/S4r/periodic_off2.o + +ifndef LAPACK_LIB + S4_LIBOBJS += $(OBJDIR)/S4k/Eigensystems.o +endif + +$(S4_LIBNAME): objdir $(S4_LIBOBJS) + $(AR) crvs $@ $(S4_LIBOBJS) +$(S4r_LIBNAME): objdir $(S4r_LIBOBJS) + $(AR) crvs $@ $(S4r_LIBOBJS) + +$(OBJDIR)/S4k/S4.o: S4/S4.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/rcwa.o: S4/rcwa.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_common.o: S4/fmm/fmm_common.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_FFT.o: S4/fmm/fmm_FFT.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_kottke.o: S4/fmm/fmm_kottke.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_closed.o: S4/fmm/fmm_closed.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_PolBasisNV.o: S4/fmm/fmm_PolBasisNV.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_PolBasisVL.o: S4/fmm/fmm_PolBasisVL.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_PolBasisJones.o: S4/fmm/fmm_PolBasisJones.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fmm_experimental.o: S4/fmm/fmm_experimental.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/fft_iface.o: S4/fmm/fft_iface.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/pattern.o: S4/pattern/pattern.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/intersection.o: S4/pattern/intersection.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/predicates.o: S4/pattern/predicates.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/numalloc.o: S4/numalloc.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/gsel.o: S4/gsel.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/sort.o: S4/sort.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/kiss_fft.o: S4/kiss_fft/kiss_fft.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/kiss_fftnd.o: S4/kiss_fft/tools/kiss_fftnd.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/SpectrumSampler.o: S4/SpectrumSampler.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/cubature.o: S4/cubature.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/Interpolator.o: S4/Interpolator.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/convert.o: S4/convert.c + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4k/Eigensystems.o: S4/RNP/Eigensystems.cpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ + + +$(OBJDIR)/S4r/Material.o: S4r/Material.cpp S4r/Material.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/LatticeGridRect.o: S4r/LatticeGridRect.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/LatticeGridArb.o: S4r/LatticeGridArb.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/POFF2Mesh.o: S4r/POFF2Mesh.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/PeriodicMesh.o: S4r/PeriodicMesh.cpp S4r/PeriodicMesh.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Shape.o: S4r/Shape.cpp S4r/Shape.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Simulation.o: S4r/Simulation.cpp S4r/Simulation.hpp S4r/StarProduct.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Layer.o: S4r/Layer.cpp S4r/Layer.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Pseudoinverse.o: S4r/Pseudoinverse.cpp S4r/Pseudoinverse.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/Eigensystems.o: S4r/Eigensystems.cpp S4r/Eigensystems.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/IRA.o: S4r/IRA.cpp S4r/IRA.hpp S4r/Types.hpp + $(CXX) -c $(CFLAGS) $(CPPFLAGS) -I. $< -o $@ +$(OBJDIR)/S4r/intersection.o: S4r/intersection.c S4r/intersection.h + $(CC) -c -O3 $< -o $@ +$(OBJDIR)/S4r/periodic_off2.o: S4r/periodic_off2.c S4r/periodic_off2.h + $(CC) -c $(CFLAGS) $(CPPFLAGS) $< -o $@ +$(OBJDIR)/S4r/predicates.o: S4r/predicates.c + $(CC) -c -O3 $< -o $@ + +#### Python extension + +check: + @echo $(LIBS) + +S4_pyext: objdir $(S4_LIBNAME) + echo "Linking...." + echo $(EXTRA_FLAGS) $(EXTRA_DIRS) + sh gensetupHPC.py.sh $(OBJDIR) $(S4_LIBNAME) "$(LIBS)" "$(EXTRA_FLAGS)" "$(EXTRA_DIRS)" + python setup.py build + #pip3 install --upgrade ./ + +clean: + rm S4.so + rm -rf $(OBJDIR) diff --git a/PythonTest.py b/PythonTest.py new file mode 100644 index 0000000..681d145 --- /dev/null +++ b/PythonTest.py @@ -0,0 +1,122 @@ + +# all lengths normalized in terms of a +# frequency normalized in terms of 2*pi*c/a + +# this test file plots a frequency spectrum and field plot for the python extension + +import S4 +import numpy as np +import matplotlib.pyplot as plt + +a = 1e-6; #1nm +c_const = 3e8; +print('Bontempi, Kivshar et al. \"Highly sensitive biosensors based on all-dielectric nanoresonator\"') +p = 0.96 +d = 0.73 +h = 0.22 +period = [p,p] +S = S4.New(Lattice=((period[0],0),(0,period[1])),NumBasis=20) +S.SetMaterial(Name = 'SiO2',Epsilon = (1.45 + 0.0j)**2) +S.SetMaterial(Name = 'Si',Epsilon = (3.487+0.0j)**2) +S.SetMaterial(Name = 'Vacuum',Epsilon = (1 + 0j)**2) + +S.AddLayer(Name = 'AirAbove',Thickness = 0, Material = 'Vacuum') +S.AddLayer(Name = 'AirDisp',Thickness = 1, Material = 'Vacuum') +S.AddLayer(Name = 'Si_disks',Thickness = h,Material = 'Vacuum') + +S.SetRegionCircle( + Layer = 'Si_disks', + Material = 'Si', + Center = (0,0), + Radius = d/2 +) + +S.AddLayer(Name = 'Glass_Below',Thickness = 2, Material = 'SiO2') +#S.AddLayerCopy(Name = 'AirBelow',Thickness = 0, Layer = 'AirAbove') +S.AddLayer(Name = 'AirBelow',Thickness = 0,Material = 'Vacuum') + +S.SetExcitationPlanewave( + IncidenceAngles=( + 0,# polar angle in [0,180) + 0 # azimuthal angle in [0,360) + ), + sAmplitude = 0, + pAmplitude = 1, + Order = 0 +) + +S.SetOptions( + PolarizationDecomposition = True +) + +wavelength_um = float(wavelength_p_nm)/1000; # [8.0,14.0] +freq = 1 / float(wavelength_um); +S.SetFrequency(freq) #unit 2*pi*c_const / a +forward,backward = S.GetPowerFlux(Layer = 'AirAbove', zOffset = 0) # reflected power +forward = S.GetPowerFlux(Layer = 'AirBelow',zOffset = 0) +print('f='+str(1/freq)+', Power_forward= '+str(forward[0].real)+', Power_backward='+str(backward.real)) + +# get fields at points + + + + +# frequency sweep +wavelength_space = np.linspace(1.3, 1.7, 200) + +R = wavelength_space * 0 +T=R +i=0 +for lam in wavelength_space: + f = 1 / float(lam) + S.SetFrequency(f) + (forw,back) = S.GetPowerFlux(Layer = 'AirAbove', zOffset = 0) + forw = S.GetPowerFlux(Layer = 'AirBelow', zOffset = 0) + R[i] = np.abs(forw[0]) + #T[i] = np.abs(back) + i +=1 + +plt.plot(wavelength_space, R) +#plt.hold +#plt.plot(wavelength_space, T) +plt.xlabel('Wavelength (um)') +plt.ylabel('T') +plt.title('Figure 3(a)') + +#field map + +wavelength_p_nm = 1400; +wavelength_um = float(wavelength_p_nm)/1000; # [8.0,14.0] +freq = 1 / float(wavelength_um); +S.SetFrequency(freq) #unit 2*pi*c_const / a +x_space = np.linspace(-p/2,p/2,300) +z_space = np.linspace(0.8,1.4,300) + +X,Z = np.meshgrid(x_space,z_space) +Ey = np.zeros((X.shape[0], X.shape[1])) +xc = 0 +zc = 0 + +for x in x_space: + zc=0 + #print('x='+str(x)+'\n') + for z in z_space: + #print('z='+str(z)+'\tzc='+str(zc)+'\n') + E,H = S.GetFields(x,0,z) + Ey[zc,xc] = np.abs(H[1]) ** 2 + np.abs(H[2]) ** 2 + np.abs(H[0]) ** 2 + #print(str(x)+'\t'+str(z)+'\t'+str(abs(E[1]))) + zc += 1 + xc += 1 + +fig = plt.figure() +cm = np.linspace(np.min(Ey),np.max(Ey), 100) +plt.contourf(x_space, z_space, Ey, cm) +plt.plot([-p/2, p/2], [1, 1], color='w', linestyle='-', linewidth=2) +plt.plot([-d/2, d/2], [1+h, 1+h], color='w', linestyle='-', linewidth=2) +plt.plot([-d/2, -d/2], [1, 1+h], color='w', linestyle='-', linewidth=2) +plt.plot([d/2, d/2], [1, 1+h], color='w', linestyle='-', linewidth=2) +plt.title('Figure 3(c) - |H|^2 - lam = ' + str(wavelength_p_nm)) +plt.show(block=True) + + +print('If you can read this, the configuration work is finished!') diff --git a/README b/README index 5969182..a04acad 100644 --- a/README +++ b/README @@ -8,3 +8,14 @@ See the S4 manual, in doc/index.html, for a complete description of the package and its user interface, as well as installation instructions, the license and copyright, contact addresses, and other important information. + +Sajidmc: +This fork focuses on HPC compilation, especially on Purdue University clusters. +We are only using the python extension. + +Note: you must edit the MakefileHPC and manually check/add the paths to each of the libraries + +Usage: + +make -f MakefileHPC clean +make -f MakefileHPC S4_pyext diff --git a/S4/rcwa.cpp b/S4/rcwa.cpp index 685f450..5e1a6e3 100644 --- a/S4/rcwa.cpp +++ b/S4/rcwa.cpp @@ -44,7 +44,8 @@ //# define DUMP_STREAM (omega.real() > 1.91637 ? std::cerr : std::cout) # include #endif -#include +# include +# include #include @@ -79,165 +80,22 @@ static void SingularLinearSolve( integer lwork = (integer)dummy.real(); std::complex *work = (std::complex*)rcwa_malloc(sizeof(std::complex) * lwork); double *rwork = (double*)rcwa_malloc(sizeof(double) * 6*m); - + zgelss_(m, n, nRHS, a, lda, b, ldb, rwork, rcond, &rank, work, lwork, rwork+m, &info); - + rcwa_free(rwork); rcwa_free(work); #else RNP::LinearSolve<'N'>(n, nRHS, a, lda, b, ldb); #endif } - - -template -static void Swap(int n, T* x, int incx, T* y, int incy){ - while(n --> 0){ - std::swap(*x, *y); - x += incx; y += incy; - } -} - -template -static void Axpy(int n, const A &alpha, const T *x, int incx, T *y, int incy){ - while(n --> 0){ - *y += alpha*(*x); - x += incx; - y += incy; - } -} -template -static void Axpy(int m, int n, const A &alpha, const T *a, int lda, T *b, int ldb){ - for(int j = 0; j < n; ++j){ - Axpy(m, alpha, a, 1, b, 1); - a += lda; - b += ldb; - } -} -template -static void Scale(int n, const S &s, T *v, int incv = 1){ - while(n --> 0){ - *v *= s; - v += incv; - } -} - -template -static void Copy(int n, const T *src, int incsrc, T *dst, int incdst){ - if(1 == incsrc && 1 == incdst){ - memcpy(dst, src, sizeof(T)*n); - }else{ - for(int j = 0; j < n; ++j){ - *dst = *src; - src += incsrc; - dst += incdst; - } - } -} -template -static void Copy(int m, int n, const T *src, int ldsrc, T *dst, int lddst){ - if(m == ldsrc && m == lddst){ - memcpy(dst, src, sizeof(T)*m*n); - }else{ - for(int j = 0; j < n; ++j){ - memcpy(dst, src, sizeof(T)*m); - src += ldsrc; - dst += lddst; - } - } -} -template -static void SetMatrix(int m, int n, const A &diag, const A &offdiag, T *a, int lda){ - for(int j = 0; j < n; ++j){ - int i; - T *p = a; - for(i = 0; i < j; ++i){ - *p++ = offdiag; - } - *p++ = diag; - for(i++ ; i < m; ++i){ - *p++ = offdiag; - } - a += lda; - } -} -template -static void ZeroMatrix(int m, int n, T *a, int lda){ - if(m == lda){ - memset(a, 0, sizeof(T)*m*n); - }else{ - for(int j = 0; j < n; ++j){ - memset(a, 0, sizeof(T)*m); - a += lda; - } - } -} -extern "C" void zgemv_( - const char *trans, const int &m, const int &n, const std::complex &alpha, - const std::complex *a, const int &lda, const std::complex *x, const int &incx, - const std::complex &beta, std::complex *y, const int &incy -); -extern "C" void zgemm_( - const char *transa, const char *transb, const int &m, const int &n, const int &k, - const std::complex &alpha, const std::complex *a, const int &lda, - const std::complex *b, const int &ldb, - const std::complex &beta, std::complex *c, const int &ldc -); -extern "C" void zgetrf_( - const int &m, const int &n, std::complex *a, const int &lda, int *ipiv, int *info -); -extern "C" void zgetrs_( - const char *trans, const int &n, const int &nrhs, const std::complex *a, const int &lda, - const int *ipiv, std::complex *b, const int &ldb, int *info -); -extern "C" void zgetri_( - const int &n, const std::complex *a, const int &lda, - const int *ipiv, std::complex *work, const int &lwork, int *info -); -static int Invert(size_t n, std::complex *a, size_t lda, std::complex *work, size_t lwork, size_t *iwork){ - int info; - zgetrf_(n, n, a, lda, (int*)iwork, &info); - zgetri_(n, a, lda, (int*)iwork, work, lwork, &info); - return info; -} - -static void Mult(size_t n, const double &alpha, const std::complex *a, size_t lda, std::complex *b, size_t ldb, const double &beta, std::complex *c, size_t ldc){ - zgemm_("N","N", n, n, n, alpha, a, lda, b, ldb, beta, c, ldc); -} -static void Mult(size_t n, size_t m, const double &alpha, const std::complex *a, size_t lda, std::complex *b, const double &beta, std::complex *c){ - zgemv_("N", n, m, alpha, a, lda, b, 1, beta, c, 1); -} -static int LUFactor(size_t n, std::complex *a, size_t lda, size_t *ipiv){ - int info = 0; - zgetrf_(n, n, a, lda, (int*)ipiv, &info); // A = P*L*U - return info; -} -static int LUSolve(size_t n, size_t nRHS, const std::complex *a, size_t lda, const size_t *ipiv, std::complex *b, size_t ldb){ - int info; - zgetrs_("N", n, nRHS, a, lda, (const int*)ipiv, b, ldb, &info); - return info; -} -static void PrintMatrix(const char *name, size_t m, size_t n, const std::complex *a, size_t lda){ - printf("%s = [\n", name); - for(size_t i = 0; i < m; ++i){ - for(size_t j = 0; j < n; ++j){ - const std::complex &val = a[i+j*lda]; - if(val.imag() < 0){ - printf(" % 18.16e-% 18.16ei", val.real(), -val.imag()); - }else{ - printf(" % 18.16e+% 18.16ei", val.real(), val.imag()); - } - } - printf("\n"); - } - printf("];\n"); -} + // kp = k-parallel matrix // kp = omega^2 - Kappa = omega^2 - [ ky*epsinv*ky -ky*epsinv*kx ] // [ -kx*epsinv*ky kx*epsinv*kx ] // = omega^2 + [ -ky ] [ epsinv ] [ ky -kx ] -// +// static void MakeKPMatrix( std::complex omega, @@ -255,11 +113,11 @@ static void MakeKPMatrix( RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp_existing,n2, kp,ldkp); return; } - + const std::complex omega2 = omega*omega; if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., kp,ldkp); - + for(size_t i = 0; i < n; ++i){ kp[(0+i)+(0+i)*ldkp] = omega2 - ky[i]*Epsilon_inv[0]*ky[i]; kp[(n+i)+(0+i)*ldkp] = kx[i]*Epsilon_inv[0]*ky[i]; @@ -273,7 +131,7 @@ static void MakeKPMatrix( RNP::TBLAS::CopyMatrix<'A'>(n,n, Epsilon_inv, n, &kp[n+0*ldkp], ldkp); RNP::TBLAS::CopyMatrix<'A'>(n,n, Epsilon_inv, n, &kp[0+n*ldkp], ldkp); RNP::TBLAS::CopyMatrix<'A'>(n,n, Epsilon_inv, n, &kp[n+n*ldkp], ldkp); - + for(size_t i = 0; i < n; ++i){ RNP::TBLAS::Scale(n2, -ky[i], &kp[0+i*n2], 1); } @@ -292,50 +150,6 @@ static void MakeKPMatrix( } } } -static void MakeKPMatrix_real( - double omega, - size_t n, - const double *kx, - const double *ky, - const std::complex *Epsilon_inv, - double *kp_existing, - double *kp, - const size_t ldkp // leading dimension of kp, >= 2*n -){ - const size_t n2 = 2*n; - if(NULL != kp_existing){ - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp_existing,n2, kp,ldkp); - return; - } - - const double omega2 = omega*omega; - - for(size_t j = 0; j < n; ++j){ - for(size_t i = 0; i < n; ++i){ - kp[i+j*ldkp] = Epsilon_inv[i+j*n].real(); - } - } - - RNP::TBLAS::CopyMatrix<'A'>(n,n, kp, ldkp, &kp[n+0*ldkp], ldkp); - RNP::TBLAS::CopyMatrix<'A'>(n,n, kp, ldkp, &kp[0+n*ldkp], ldkp); - RNP::TBLAS::CopyMatrix<'A'>(n,n, kp, ldkp, &kp[n+n*ldkp], ldkp); - - for(size_t i = 0; i < n; ++i){ - RNP::TBLAS::Scale(n2, -ky[i], &kp[0+i*n2], 1); - } - for(size_t i = 0; i < n; ++i){ - RNP::TBLAS::Scale(n2, kx[i], &kp[0+(i+n)*n2], 1); - } - for(size_t i = 0; i < n; ++i){ - RNP::TBLAS::Scale(n2, ky[i], &kp[i+0*n2], ldkp); - } - for(size_t i = 0; i < n; ++i){ - RNP::TBLAS::Scale(n2, -kx[i], &kp[i+n+0*n2], ldkp); - } - for(size_t i = 0; i < n2; ++i){ - kp[i+i*ldkp] += omega2; - } -} // kp = k-parallel matrix // kp = omega^2 - Kappa = omega^2 - [ ky*epsinv*ky -ky*epsinv*kx ] @@ -359,7 +173,7 @@ void MultKPMatrix( const size_t ldy ){ const size_t n2 = 2*n; - + if(NULL != kp){ if('N' == trans[0]){ if(ncols > 1){ @@ -376,7 +190,7 @@ void MultKPMatrix( } return; } - + const std::complex omega2 = omega*omega; if(EPSILON2_TYPE_BLKDIAG1_SCALAR == Epsilon2_type || EPSILON2_TYPE_BLKDIAG2_SCALAR == Epsilon2_type){ const std::complex epsinv( @@ -409,7 +223,7 @@ void MultKPMatrix( RNP::TBLAS::MultMV<'C'>(n,n, 1.,Epsilon_inv,n, &y[0+0*ldy],1, 0.,&y[n+0*ldy],1); } } - + for(size_t j = 0; j < ncols; ++j){ for(size_t i = 0; i < n; ++i){ y[(0+i)+j*ldy] = omega2 * a[(0+i)+j*lda] - ky[i]*y[n+i+j*ldy]; @@ -453,17 +267,9 @@ void SolveLayerEigensystem_uniform( // Set the \hat{q} vector (diagonal matrix) while we're at it if(0 == omega.imag()){ // Not bandsolving - if(std::abs(q[i].imag()) <= 4*DBL_EPSILON*std::abs(q[i].real())){ - if(q[i].real() >= 0){ - q[i] = std::sqrt(q[i].real()); - }else{ - q[i] = std::complex(0, std::sqrt(-q[i].real())); - } - }else{ - q[i] = std::sqrt(q[i]); - if(q[i].imag() < 0){ - q[i] = -q[i]; - } + q[i] = std::sqrt(q[i]); + if(q[i].imag() < 0){ + q[i] = -q[i]; } }else{ // performing some kind of bandsolving, need to choose the appropriate branch if(q[i].real() < 0){ @@ -475,13 +281,13 @@ void SolveLayerEigensystem_uniform( q[i] = std::sqrt(q[i]); } } - + /* if(std::complex(0) == q[i]){ q[i] = DBL_EPSILON*omega; } */ - + q[i+n] = q[i]; } #ifdef DUMP_MATRICES @@ -501,14 +307,14 @@ void SolveLayerEigensystem_uniform( } } -#ifdef HAVE_LAPACK -void SolveLayerEigensystem_real( - double omega, +void SolveLayerEigensystem( + std::complex omega, size_t n, const double *kx, const double *ky, const std::complex *Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix const std::complex *Epsilon2, // size (2*glist.n)^2 (dielectric/normal-field matrix) + int epstype, std::complex *q, // length 2*glist.n std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) std::complex *phi, // size (2*glist.n)^2 @@ -519,27 +325,35 @@ void SolveLayerEigensystem_real( const size_t n2 = 2*n; if((size_t)-1 == lwork){ - double tmp; - RNP::Eigensystem_real(n2, NULL, n2, q, NULL, 1, phi, n2, &tmp, lwork); - work_[0] = tmp + n2*n2; - return; + //M.P. (Michael Povolotskyi) Bellow I have to mimic the instructions from RNP::Eigensystem in file Eigensystems.cpp + // because MKL lapack cannot handle NULL pointer for the matrix + // RNP::Eigensystem(n2, NULL, n2, q, NULL, 1, phi, n2, work_, NULL, lwork); //M.P + work_[0] = double(2*n2); //M.P. + work_[0] += n2*n2; + return; }else if(0 == lwork){ lwork = n2*n2+2*n2; } - double *work = (double*)work_; + std::complex *work = work_; size_t eigenlwork; - if(NULL == work_ || lwork < n2*n2+4*n2){ + if(NULL == work_ || lwork < n2*n2+2*n2){ lwork = (size_t)-1; - double tmp; - RNP::Eigensystem_real(n2, NULL, n2, q, NULL, 1, phi, n2, &tmp, lwork); - eigenlwork = (size_t)tmp; - work = (double*)rcwa_malloc(sizeof(double)*(eigenlwork + n2*n2)); + //M.P. Same as above + // RNP::Eigensystem(n2, NULL, n2, q, NULL, 1, phi, n2, q, NULL, lwork); //M.P. + q[0] = (double)(2*n2); //M.P + eigenlwork = (size_t)q[0].real(); + work = (std::complex*)rcwa_malloc(sizeof(std::complex)*(eigenlwork + n2*n2)); }else{ eigenlwork = lwork - n2*n2; } - double *op = work; - double *eigenwork = op + n2*n2; + std::complex *op = work; + std::complex *eigenwork = op + n2*n2; + + double *rwork = rwork_; + if(NULL == rwork_){ + rwork = (double*)rcwa_malloc(sizeof(double)*2*n2); + } #ifdef DUMP_MATRICES DUMP_STREAM << "kx:" << std::endl; @@ -558,9 +372,9 @@ void SolveLayerEigensystem_real( #endif // Fill kp - double *kp_use = (double*)(NULL != kp ? kp : phi); - - MakeKPMatrix_real(omega, n, kx, ky, Epsilon_inv, NULL, kp_use, n2); + std::complex *kp_use = (NULL != kp ? kp : phi); + + MakeKPMatrix(omega, n, kx, ky, Epsilon_inv, epstype, NULL, kp_use, n2); #ifdef DUMP_MATRICES DUMP_STREAM << "kp:" << std::endl; # ifdef DUMP_MATRICES_LARGE @@ -569,7 +383,7 @@ void SolveLayerEigensystem_real( RNP::IO::PrintVector(n2,&kp_use[0+(n2-1)*n2],1, DUMP_STREAM) << std::endl << std::endl; # endif #endif - + #ifdef DUMP_MATRICES DUMP_STREAM << "Epsilon2:" << std::endl; @@ -581,18 +395,9 @@ void SolveLayerEigensystem_real( #endif // Make the eigenoperator Epsilon2*kp - [kxkx, kxky; kykx, kyky] - for(size_t j = 0; j < n2; ++j){ - for(size_t i = 0; i < n2; ++i){ - // double sum = 0; - // for(size_t k = 0; k < n2; ++k){ - // sum += Epsilon2[i+k*n2]*kp_use[k+j*n2]; - // } - op[i+j*n2] = RNP::TBLAS::Dot(n2, (double*)(&Epsilon2[i]), 2*n2, &kp_use[j*n2], 1); - } - } - //RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., op,n2); - //RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,Epsilon2,n2, kp_use,n2, 0.,op,n2); - + RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., op,n2); + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,Epsilon2,n2, kp_use,n2, 0.,op,n2); + for(size_t i = 0; i < n; ++i){ op[i+i*n2] -= kx[i]*kx[i]; } @@ -617,18 +422,14 @@ void SolveLayerEigensystem_real( #ifdef DUMP_MATRICES # ifdef DUMP_MATRICES_LARGE - std::complex *op_save = (std::complex*)rcwa_malloc(sizeof(std::complex) * 2*n2*n2); + std::complex *op_save = (std::complex *)rcwa_malloc(sizeof(std::complex) * 2*n2*n2); std::complex *op_temp = op_save + n2*n2; - for(size_t j = 0; j < n2; ++j){ - for(size_t i = 0; i < n2; ++i){ - op_save[i+j*n2] = op[i+j*n2]; - } - } + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, op,n2, op_save,n2); # endif #endif - int info = RNP::Eigensystem_real(n2, op, n2, q, NULL, 1, phi, n2, eigenwork, eigenlwork); + int info = RNP::Eigensystem(n2, op, n2, q, NULL, 1, phi, n2, eigenwork, rwork, eigenlwork); if(0 != info){ - fprintf(stderr, "Layer eigensystem returned info = %d\n", info); + std::cerr << "Layer eigensystem returned info = " << info << std::endl; } #ifdef DUMP_MATRICES DUMP_STREAM << "eigen info = " << info << std::endl; @@ -640,24 +441,26 @@ void SolveLayerEigensystem_real( } DUMP_STREAM << "eigensystem residual:" << std::endl; RNP::IO::PrintMatrix(n2,n2,op_temp,n2, DUMP_STREAM) << std::endl << std::endl; - { - for(size_t j = 0; j < n2; ++j){ - double sum = 0; - for(size_t i = 0; i < n2; ++i){ - sum += fabs(op_temp[i+j*n2].real()) + fabs(op_temp[i+j*n2].imag()); - } - DUMP_STREAM << " residual sum " << j << ": " << sum << std::endl; - } - } rcwa_free(op_save); # endif #endif for(size_t i = 0; i < n2; ++i){ // Set the \hat{q} vector (diagonal matrix) while we're at it - q[i] = std::sqrt(q[i]); - if(q[i].imag() < 0){ - q[i] = -q[i]; + if(0 == omega.imag()){ // Not bandsolving + q[i] = std::sqrt(q[i]); + if(q[i].imag() < 0){ + q[i] = -q[i]; + } + }else{ // performing some kind of bandsolving, need to choose the appropriate branch + if(q[i].real() < 0){ + // branch cut should be just below positive real axis + q[i] = std::complex(0,1) * std::sqrt(-q[i]); + }else{ + // branch cut should be just below negative real axis + // This is the default behavior for sqrt(std::complex) + q[i] = std::sqrt(q[i]); + } } } #ifdef DUMP_MATRICES @@ -670,593 +473,71 @@ void SolveLayerEigensystem_real( RNP::IO::PrintVector(n2,phi,1, DUMP_STREAM) << std::endl << std::endl; # endif #endif - - if(NULL == work_ || lwork < n2*n2+4*n2){ + + if(NULL == work_ || lwork < n2*n2+2*n2){ rcwa_free(work); } - - // Re-make kp in complex arithmetic for future use - std::complex *kp_use_actual = (NULL != kp ? kp : phi); - MakeKPMatrix(omega, n, kx, ky, Epsilon_inv, EPSILON2_TYPE_FULL, NULL, kp_use_actual, n2); + if(NULL == rwork_){ + rcwa_free(rwork); + } } -#endif // HAVE_LAPACK -void SolveLayerEigensystem( - std::complex omega, + +void InitSMatrix( size_t n, - const double *kx, - const double *ky, - const std::complex *Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix - const std::complex *Epsilon2, // size (2*glist.n)^2 (dielectric/normal-field matrix) - int epstype, - std::complex *q, // length 2*glist.n - std::complex *kp, // size (2*glist.n)^2 (k-parallel matrix) - std::complex *phi, // size (2*glist.n)^2 + std::complex *S // size (4*n)^2 +){ + const size_t n4 = 4*n; + RNP::TBLAS::SetMatrix<'A'>(n4,n4, 0.,1., S, n4); +} +void GetSMatrix( + size_t nlayers, + size_t n, // glist.n + const double *kx, const double *ky, + std::complex omega, + const double *thickness, // list of thicknesses + const std::complex **q, // list of q vectors + const std::complex **Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix + int *epstype, + const std::complex **kp, + const std::complex **phi, + std::complex *S, // size (4*n)^2 std::complex *work_, - double *rwork_, + size_t *iwork, size_t lwork ){ + if(0 == nlayers){ return; } const size_t n2 = 2*n; - -#ifdef HAVE_LAPACK - bool isreal = (n > 10); - for(size_t j = 0; j < n && isreal; ++j){ - for(size_t i = 0; i < n; ++i){ - if(0 != Epsilon_inv[i+j*n].imag()){ - isreal = false; - break; - } - } - } - for(size_t j = 0; j < n2 && isreal; ++j){ - for(size_t i = 0; i < n2; ++i){ - if(0 != Epsilon2[i+j*n2].imag()){ - isreal = false; - break; - } - } - } - if(isreal && 0 == omega.imag() && EPSILON2_TYPE_FULL == epstype){ - SolveLayerEigensystem_real( - omega.real(), n, kx, ky, Epsilon_inv, Epsilon2, - q, kp, phi, work_, rwork_, lwork - ); - return; - } -#endif + const size_t n4 = 2*n2; if((size_t)-1 == lwork){ - double dum; - RNP::Eigensystem(n2, NULL, n2, q, NULL, 1, phi, n2, work_, &dum, lwork); - work_[0] += n2*n2; - return; - }else if(0 == lwork){ - lwork = n2*n2+2*n2; - } - - std::complex *work = work_; - size_t eigenlwork; - if(NULL == work_ || lwork < n2*n2+2*n2){ - lwork = (size_t)-1; - RNP::Eigensystem(n2, NULL, n2, q, NULL, 1, phi, n2, q, NULL, lwork); - eigenlwork = (size_t)q[0].real(); - work = (std::complex*)rcwa_malloc(sizeof(std::complex)*(eigenlwork + n2*n2)); - }else{ - eigenlwork = lwork - n2*n2; - } - std::complex *op = work; - std::complex *eigenwork = op + n2*n2; - - double *rwork = rwork_; - if(NULL == rwork_){ - rwork = (double*)rcwa_malloc(sizeof(double)*2*n2); - } - -#ifdef DUMP_MATRICES - DUMP_STREAM << "kx:" << std::endl; - RNP::IO::PrintVector(n,kx,1, DUMP_STREAM) << std::endl; - DUMP_STREAM << "ky:" << std::endl; - RNP::IO::PrintVector(n,ky,1, DUMP_STREAM) << std::endl << std::endl; -#endif - -#ifdef DUMP_MATRICES - DUMP_STREAM << "Epsilon_inv:" << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n,n,Epsilon_inv,n, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n,Epsilon_inv,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - - // Fill kp - std::complex *kp_use = (NULL != kp ? kp : phi); - - MakeKPMatrix(omega, n, kx, ky, Epsilon_inv, epstype, NULL, kp_use, n2); -#ifdef DUMP_MATRICES - DUMP_STREAM << "kp:" << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n2,n2,kp_use,n2, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n2,&kp_use[0+(n2-1)*n2],1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - - -#ifdef DUMP_MATRICES - DUMP_STREAM << "Epsilon2:" << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n2,n2,Epsilon2,n2, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n2,Epsilon2,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - - // Make the eigenoperator Epsilon2*kp - [kxkx, kxky; kykx, kyky] - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., op,n2); - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,Epsilon2,n2, kp_use,n2, 0.,op,n2); - - for(size_t i = 0; i < n; ++i){ - op[i+i*n2] -= kx[i]*kx[i]; - } - for(size_t i = 0; i < n; ++i){ - op[i+n+i*n2] -= ky[i]*kx[i]; - } - for(size_t i = 0; i < n; ++i){ - op[i+(i+n)*n2] -= kx[i]*ky[i]; - } - for(size_t i = 0; i < n; ++i){ - op[i+n+(i+n)*n2] -= ky[i]*ky[i]; - } -#ifdef DUMP_MATRICES - DUMP_STREAM << "op:" << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n2,n2,op,n2, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintMatrix(n2,n2,op,n2, DUMP_STREAM) << std::endl << std::endl; - //RNP::IO::PrintVector(n2,&op[0+(n2-1)*n2],1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - -#ifdef DUMP_MATRICES -# ifdef DUMP_MATRICES_LARGE - std::complex *op_save = (std::complex *)rcwa_malloc(sizeof(std::complex) * 2*n2*n2); - std::complex *op_temp = op_save + n2*n2; - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, op,n2, op_save,n2); -# endif -#endif - int info = RNP::Eigensystem(n2, op, n2, q, NULL, 1, phi, n2, eigenwork, rwork, eigenlwork); - if(0 != info){ - fprintf(stderr, "Layer eigensystem returned info = %d\n", info); - } -#ifdef DUMP_MATRICES - DUMP_STREAM << "eigen info = " << info << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::TBLAS::Fill(n2*n2, 0., op_temp,1); - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,op_save,n2, phi,n2, 0.,op_temp,n2); - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Axpy(n2, -q[i], &phi[0+i*n2],1, &op_temp[0+i*n2],1); - } - DUMP_STREAM << "eigensystem residual:" << std::endl; - RNP::IO::PrintMatrix(n2,n2,op_temp,n2, DUMP_STREAM) << std::endl << std::endl; - rcwa_free(op_save); -# endif -#endif - - for(size_t i = 0; i < n2; ++i){ - // Set the \hat{q} vector (diagonal matrix) while we're at it - if(0 == omega.imag()){ // Not bandsolving - q[i] = std::sqrt(q[i]); - if(q[i].imag() < 0){ - q[i] = -q[i]; - } - }else{ // performing some kind of bandsolving, need to choose the appropriate branch - if(q[i].real() < 0){ - // branch cut should be just below positive real axis - q[i] = std::complex(0,1) * std::sqrt(-q[i]); - }else{ - // branch cut should be just below negative real axis - // This is the default behavior for sqrt(std::complex) - q[i] = std::sqrt(q[i]); - } - } - } -#ifdef DUMP_MATRICES - DUMP_STREAM << "q:" << std::endl; - RNP::IO::PrintVector(n2,q,1, DUMP_STREAM) << std::endl << std::endl; - DUMP_STREAM << "phi:" << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n2,n2,phi,n2, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n2,phi,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - - if(NULL == work_ || lwork < n2*n2+2*n2){ - rcwa_free(work); - } - if(NULL == rwork_){ - rcwa_free(rwork); - } -} - - -void InitSMatrix( - size_t n, - std::complex *S // size (4*n)^2 -){ - const size_t n4 = 4*n; - RNP::TBLAS::SetMatrix<'A'>(n4,n4, 0.,1., S, n4); -} -void GetSMatrix( - size_t nlayers, - size_t n, // glist.n - const double *kx, const double *ky, - std::complex omega, - const double *thickness, // list of thicknesses - const std::complex **q, // list of q vectors - const std::complex **Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix - int *epstype, - const std::complex **kp, - const std::complex **phi, - std::complex *S, // size (4*n)^2 - std::complex *work_, - size_t *iwork, - size_t lwork -){ - if(0 == nlayers){ return; } - const size_t n2 = 2*n; - const size_t n4 = 2*n2; - - if((size_t)-1 == lwork){ - work_[0] = n4*(n4+1); - return; - } - std::complex *work = work_; - if(NULL == work_ || lwork < n4*(n4+1)){ - work = (std::complex*)rcwa_malloc(sizeof(std::complex)*(n4*(n4+1))); - } - size_t *pivots = iwork; - if(NULL == iwork){ - pivots = (size_t*)rcwa_malloc(sizeof(size_t)*n4); - } - - RNP::TBLAS::SetMatrix<'A'>(n4,n4, 0.,1., S, n4); - - std::complex *t1 = work; - std::complex *t2 = t1 + n2*n2; - std::complex *in1 = t2 + n2*n2; - std::complex *in2 = in1 + n2*n2; - std::complex *d1 = in2 + n2*n2; - std::complex *d2 = d1 + n2; - - for(size_t l = 0; l < nlayers-1; ++l){ - size_t lp1 = l+1; - if(lp1 >= nlayers){ lp1 = l; } - - // Make the interface matrices - if((lp1 == l) || (q[l] == q[lp1] && ((NULL != kp[l] && kp[l] == kp[lp1]) || Epsilon_inv[l] == Epsilon_inv[lp1]) && phi[l] == phi[lp1])){ - // This is a trivial interface, set to identity - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., in1, n2); - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., in2, n2); - }else{ - // The interface matrix is the inverse of the mode-to-field matrix of layer l - // times the mode-to-field matrix of layer l+1 (lp1). - // The mode-to-field matrix is of the form - // [ B -B ] where A = phi - // [ A A ] where B = kp*phi*inv(diag(q)) = G*A/q - // So we want - // 0.5 * [ iBl iAl ] [ Blp1 -Blp1 ] - // [ -iBl iAl ] [ Alp1 Alp1 ] - // Multiplying out gives - // 0.5 * [ P+Q P-Q ] // where P = iAl*Alp1, and i in front means inverse - // [ P-Q P+Q ] // where Q = iBl*Blp1 - // Making P is easy, since A is a single matrix. - // Making Q is as follows: - // Q = iBl*Blp1 - // = ql*iAl*iGl * Gl*Alp1*iqlp1 - // We will only store I11 and I21 - /* - { - std::complex *Ml = (std::complex*)rcwa_malloc(sizeof(std::complex)*n4*n4); - std::complex *Mlp1 = (std::complex*)rcwa_malloc(sizeof(std::complex)*n4*n4); - - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., &Ml[0+0*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, &Ml[n2+0*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, &Ml[n2+n2*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, &Ml[0+n2*n4],n4); - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Scale(n2, 1./q[l][i], &Ml[0+(i+n2)*n4], 1); - } - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,kp[l],n2, &Ml[0+n2*n4],n4, 0.,&Ml[0+0*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &Ml[0+0*n4],n4, &Ml[0+n2*n4],n4); - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Scale(n2, -1., &Ml[0+(i+n2)*n4], 1); - } - - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., &Mlp1[0+0*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, &Mlp1[n2+0*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, &Mlp1[n2+n2*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, &Mlp1[0+n2*n4],n4); - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Scale(n2, 1./q[lp1][i], &Mlp1[0+(i+n2)*n4], 1); - } - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,kp[lp1],n2, &Mlp1[0+n2*n4],n4, 0.,&Mlp1[0+0*n4],n4); - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &Mlp1[0+0*n4],n4, &Mlp1[0+n2*n4],n4); - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Scale(n2, -1., &Mlp1[0+(i+n2)*n4], 1); - } - - //RNP::LinearSolve<'N'>(n4, n4, Ml, n4, Mlp1, n4, NULL, pivots); -#ifdef DUMP_MATRICES - DUMP_STREAM << "M(" << l << ") = " << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n4,n4,Ml,n4, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n4,Ml,1, DUMP_STREAM) << std::endl << std::endl; -# endif - DUMP_STREAM << "M(" << lp1 << ") = " << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n4,n4,Mlp1,n4, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n4,Mlp1,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - - rcwa_free(Mlp1); - rcwa_free(Ml); - } - */ - // Make Bl in t1 - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., t1,n2); - { - if(NULL == phi[l]){ - if(NULL == kp[l]){ - MakeKPMatrix(omega, n, kx, ky, Epsilon_inv[l], epstype[l], kp[l], t1,n2); - }else{ - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp[l],n2, t1,n2); - } - }else{ - MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv[l], epstype[l], kp[l], n2, phi[l],n2, t1,n2); - } - //for(size_t i = 0; i < n2; ++i){ - // RNP::TBLAS::Scale(n2, 1./q[l][i], &t1[0+i*n2], 1); - //} - } -#ifdef DUMP_MATRICES - DUMP_STREAM << "Bl(" << l << ") = " << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n2,n2,t1,n2, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n2,t1,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - // Make Blp1 in in1 - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., in1,n2); - { - if(NULL == phi[lp1]){ - if(NULL == kp[lp1]){ - MakeKPMatrix(omega, n, kx, ky, Epsilon_inv[lp1], epstype[lp1], kp[lp1], in1,n2); - }else{ - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp[lp1],n2, in1,n2); - } - }else{ - MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv[lp1], epstype[lp1], kp[lp1], n2, phi[lp1],n2, in1,n2); - } - //for(size_t i = 0; i < n2; ++i){ - // RNP::TBLAS::Scale(n2, 1./q[lp1][i], &in1[0+i*n2], 1); - //} - } -#ifdef DUMP_MATRICES - DUMP_STREAM << "Bl(" << l+1 << ") = " << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n2,n2,in1,n2, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n2,in1,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - int solve_info; - // Make Q in in1 - //RNP::LinearSolve<'N'>(n2, n2, t1, n2, in1, n2, &solve_info, pivots); - SingularLinearSolve(n2,n2,n2, t1,n2, in1,n2, DBL_EPSILON); - // Now perform the diagonal scalings - for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Scale(n2, q[l][i], &in1[i+0*n2], n2); - } - { - double maxel = 0; - for(size_t i = 0; i < n2; ++i){ - double el = std::abs(q[lp1][i]); - if(el > maxel){ maxel = el; } - } - for(size_t i = 0; i < n2; ++i){ - double el = std::abs(q[lp1][i]); - if(el < DBL_EPSILON * maxel){ - RNP::TBLAS::Scale(n2, 0., &in1[0+i*n2], 1); - }else{ - RNP::TBLAS::Scale(n2, 1./q[lp1][i], &in1[0+i*n2], 1); - } - } - } - - // Make P in in2 - if(NULL == phi[lp1]){ - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., in2,n2); - }else{ - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, in2,n2); - } - if(NULL != phi[l]){ - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, t1,n2); - RNP::LinearSolve<'N'>(n2, n2, t1, n2, in2, n2, &solve_info, pivots); - } - - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, in2,n2, t1,n2); // in2 = P, t1 = P, in1 = Q - RNP::TBLAS::Axpy(n2*n2, -1., in1,1, in2,1); // in2 = P-Q, t1 = P, in1 = Q - RNP::TBLAS::Axpy(n2*n2, 1., t1,1, in1,1); // in2 = P+Q, t1 = P, in1 = P+Q - RNP::TBLAS::Scale(n2*n2, 0.5, in1,1); - RNP::TBLAS::Scale(n2*n2, 0.5, in2,1); - } -#ifdef DUMP_MATRICES - DUMP_STREAM << "Interface1(" << l+1 << ") = " << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n2,n2,in1,n2, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n2,in1,1, DUMP_STREAM) << std::endl << std::endl; -# endif - DUMP_STREAM << "Interface2(" << l+1 << ") = " << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n2,n2,in2,n2, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n2,in2,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - - for(size_t i = 0; i < n2; ++i){ - d1[i] = std::exp(q[l ][i] * std::complex(0,thickness[l ])); - d2[i] = std::exp(q[lp1][i] * std::complex(0,thickness[lp1])); - } - - // Make S11 - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, -1.,&S[0+n2*n4],n4, in2,n2, 0.,t1,n2); // t1 = -S12 I21 - for(size_t i = 0; i < n2; ++i){ // t1 = -f_l S12 I21 - RNP::TBLAS::Scale(n2, d1[i], &t1[i+0*n2], n2); - } - RNP::TBLAS::Axpy(n2*n2, 1., in1,1, t1,1); // t1 = (I11 - f_l S12 I21) - - RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., t2,n2); - int solve_info; - RNP::LinearSolve<'N'>(n2, n2, t1, n2, t2, n2, &solve_info, pivots); // t2 = (I11 - f_l S12 I21)^{-1} - - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &S[0+0*n4],n4, t1,n2); - for(size_t i = 0; i < n2; ++i){ // t1 = f_l S11 - RNP::TBLAS::Scale(n2, d1[i], &t1[i+0*n2], n2); - } - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t2,n2, t1,n2, 0.,&S[0+0*n4],n4); - // S11 is done, and we need to hold on to t2 = (I11 - f_l S12 I21)^{-1} - - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,&S[0+n2*n4],n4, in1,n2, 0.,t1,n2); // t1 = S12 I22 - for(size_t i = 0; i < n2; ++i){ // t1 = f_l S12 I22 - RNP::TBLAS::Scale(n2, d1[i], &t1[i+0*n2], n2); - } - RNP::TBLAS::Axpy(n2*n2, -1., in2,1, t1,1); // t1 = f_l S12 I22 - I12 - for(size_t i = 0; i < n2; ++i){ // t1 = (f_l S12 I22 - I12) f_{l+1} - RNP::TBLAS::Scale(n2, d2[i], &t1[0+i*n2], 1); - } - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t2,n2, t1,n2, 0.,&S[0+n2*n4],n4); - // S12 done, and t2 can be reused - - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,&S[n2+n2*n4],n4, in2,n2, 0.,t1,n2); // t1 = S22 I21 - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t1,n2, &S[0+0*n4],n4, 1.,&S[n2+0*n4],n4); - // S21 done, need to keep t1 = S22 I21 - - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,&S[n2+n2*n4],n4, in1,n2, 0.,t2,n2); // t2 = S22 I22 - for(size_t i = 0; i < n2; ++i){ // t2 = S22 I22 f_{l+1} - RNP::TBLAS::Scale(n2, d2[i], &t2[0+i*n2], 1); - } - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, t2,n2, &S[n2+n2*n4],n4); - RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t1,n2, &S[0+n2*n4],n4, 1.,&S[n2+n2*n4],n4); - -#ifdef DUMP_MATRICES - DUMP_STREAM << "S(1," << l+2 << ") = " << std::endl; -# ifdef DUMP_MATRICES_LARGE - RNP::IO::PrintMatrix(n4,n4,S,n4, DUMP_STREAM) << std::endl << std::endl; -# else - RNP::IO::PrintVector(n4,S,1, DUMP_STREAM) << std::endl << std::endl; -# endif -#endif - } - if(NULL == work_ || lwork < n4*(n4+1)){ - rcwa_free(work); - } - if(NULL == iwork){ - rcwa_free(pivots); - } -} - - -int SolveAll( - size_t nlayers, - size_t n, // glist.n - const double *kx, const double *ky, - std::complex omega, - const double *thickness, // list of thicknesses - const std::complex **q, // list of q vectors - const std::complex **Epsilon_inv, // size (glist.n)^2; inv of usual dielectric Fourier coupling matrix - int *epstype, - const std::complex **kp, - const std::complex **phi, - std::complex *ab, // length 2*n*nlayers - std::complex *work_, // length lwork - size_t *iwork, // length n2*nlayers - size_t lwork // set to -1 for query into iwork[0], at least 6*n2^2*nlayers -){ - const size_t n2 = 2*n; - const size_t n4 = 2*n2; - const size_t n22 = n2*n2; - const size_t minwork = 6*n22*nlayers; - if((size_t)-1 == lwork){ - iwork[0] = minwork; - return 0; + work_[0] = n4*(n4+1); + return; } - typedef std::complex doublecomplex; - doublecomplex *work = work_; - if(0 == lwork && NULL == work_){ - work = (doublecomplex*)rcwa_malloc(sizeof(doublecomplex) * minwork); - if(NULL == work){ -printf("Could not allocate %d\n", (int)minwork); fflush(stdout); - return 1; - } - }else if(lwork < minwork){ - // error: not enough workspace - return -15; + std::complex *work = work_; + if(NULL == work_ || lwork < n4*(n4+1)){ + work = (std::complex*)rcwa_malloc(sizeof(std::complex)*(n4*(n4+1))); + } + size_t *pivots = iwork; + if(NULL == iwork){ + pivots = (size_t*)rcwa_malloc(sizeof(size_t)*n4); } - // Set up temporaries - doublecomplex *t1 = work; - doublecomplex *t2 = t1 + n22; - doublecomplex *in1 = t2 + n22; - doublecomplex *in2 = in1 + n22; - - for(size_t ip = 0, iq = 1; iq < nlayers; ip=iq++){ - // Set pointers to current row of matrices - doublecomplex *row = work+6*n22*iq; - - // Precondition for current loop iteration: - // Qprev from previous iteration was formed correctly - - /* - // Assemble interface T-matrix - // Assemble mode-to-field operator for p in row - if(0 == p){ - // TODO: Generate mode-to-field operator for p - - // Factor and invert P - if(layer p is uniform){ - }else if(layer p is z-uncoupled){ - }else{ - LUFactor(n4, n4, row, n4, ipiv); - LUInvert(n4, n4, row, n4, ipiv); - } - }else{ // Mode-to-field operator for p was generated and factored in previous iteration - // Only invert P (in-place) - if(layer p is uniform){ - }else if(layer p is z-uncoupled){ - }else{ - LUInvert(n4, n4, row, n4, ipiv); - } - } - // Assemble mode-to-field operator for q in rownext - // TODO: Generate mode-to-field operator for q - if(layer q is uniform){ - }else if(layer q is z-uncoupled){ - }else{ - LUFactor(n4, n4, rownext, n4, ipiv); - } - // In-place multiply q into p - LUMultRight(n4, n4, rownext, n4, ipiv); - */ + RNP::TBLAS::SetMatrix<'A'>(n4,n4, 0.,1., S, n4); + + std::complex *t1 = work; + std::complex *t2 = t1 + n2*n2; + std::complex *in1 = t2 + n2*n2; + std::complex *in2 = in1 + n2*n2; + std::complex *d1 = in2 + n2*n2; + std::complex *d2 = d1 + n2; + + for(size_t l = 0; l < nlayers-1; ++l){ + size_t lp1 = l+1; + if(lp1 >= nlayers){ lp1 = l; } // Make the interface matrices - if((ip == iq) || (q[ip] == q[iq] && ((NULL != kp[ip] && kp[ip] == kp[iq]) || Epsilon_inv[ip] == Epsilon_inv[iq]) && phi[ip] == phi[iq])){ + if((lp1 == l) || (q[l] == q[lp1] && ((NULL != kp[l] && kp[l] == kp[lp1]) || Epsilon_inv[l] == Epsilon_inv[lp1]) && phi[l] == phi[lp1])){ // This is a trivial interface, set to identity RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., in1, n2); RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., in2, n2); @@ -1281,7 +562,7 @@ printf("Could not allocate %d\n", (int)minwork); fflush(stdout); { std::complex *Ml = (std::complex*)rcwa_malloc(sizeof(std::complex)*n4*n4); std::complex *Mlp1 = (std::complex*)rcwa_malloc(sizeof(std::complex)*n4*n4); - + RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., &Ml[0+0*n4],n4); RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, &Ml[n2+0*n4],n4); RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, &Ml[n2+n2*n4],n4); @@ -1294,7 +575,7 @@ printf("Could not allocate %d\n", (int)minwork); fflush(stdout); for(size_t i = 0; i < n2; ++i){ RNP::TBLAS::Scale(n2, -1., &Ml[0+(i+n2)*n4], 1); } - + RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., &Mlp1[0+0*n4],n4); RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, &Mlp1[n2+0*n4],n4); RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, &Mlp1[n2+n2*n4],n4); @@ -1307,7 +588,7 @@ printf("Could not allocate %d\n", (int)minwork); fflush(stdout); for(size_t i = 0; i < n2; ++i){ RNP::TBLAS::Scale(n2, -1., &Mlp1[0+(i+n2)*n4], 1); } - + //RNP::LinearSolve<'N'>(n4, n4, Ml, n4, Mlp1, n4, NULL, pivots); #ifdef DUMP_MATRICES DUMP_STREAM << "M(" << l << ") = " << std::endl; @@ -1323,28 +604,29 @@ printf("Could not allocate %d\n", (int)minwork); fflush(stdout); RNP::IO::PrintVector(n4,Mlp1,1, DUMP_STREAM) << std::endl << std::endl; # endif #endif - + rcwa_free(Mlp1); rcwa_free(Ml); } */ - - //// Begin copy from GetSMatrix // Make Bl in t1 RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., t1,n2); { - if(NULL == phi[ip]){ - if(NULL == kp[ip]){ - MakeKPMatrix(omega, n, kx, ky, Epsilon_inv[ip], epstype[ip], kp[ip], t1,n2); + if(NULL == phi[l]){ + if(NULL == kp[l]){ + MakeKPMatrix(omega, n, kx, ky, Epsilon_inv[l], epstype[l], kp[l], t1,n2); }else{ - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp[ip],n2, t1,n2); + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp[l],n2, t1,n2); } }else{ - MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv[ip], epstype[ip], kp[ip], n2, phi[ip],n2, t1,n2); + MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv[l], epstype[l], kp[l], n2, phi[l],n2, t1,n2); } + //for(size_t i = 0; i < n2; ++i){ + // RNP::TBLAS::Scale(n2, 1./q[l][i], &t1[0+i*n2], 1); + //} } #ifdef DUMP_MATRICES - DUMP_STREAM << "Bl(" << ip << ") = " << std::endl; + DUMP_STREAM << "Bl(" << l << ") = " << std::endl; # ifdef DUMP_MATRICES_LARGE RNP::IO::PrintMatrix(n2,n2,t1,n2, DUMP_STREAM) << std::endl << std::endl; # else @@ -1354,18 +636,21 @@ printf("Could not allocate %d\n", (int)minwork); fflush(stdout); // Make Blp1 in in1 RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,0., in1,n2); { - if(NULL == phi[iq]){ - if(NULL == kp[iq]){ - MakeKPMatrix(omega, n, kx, ky, Epsilon_inv[iq], epstype[iq], kp[iq], in1,n2); + if(NULL == phi[lp1]){ + if(NULL == kp[lp1]){ + MakeKPMatrix(omega, n, kx, ky, Epsilon_inv[lp1], epstype[lp1], kp[lp1], in1,n2); }else{ - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp[iq],n2, in1,n2); + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, kp[lp1],n2, in1,n2); } }else{ - MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv[iq], epstype[iq], kp[iq], n2, phi[iq],n2, in1,n2); + MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv[lp1], epstype[lp1], kp[lp1], n2, phi[lp1],n2, in1,n2); } + //for(size_t i = 0; i < n2; ++i){ + // RNP::TBLAS::Scale(n2, 1./q[lp1][i], &in1[0+i*n2], 1); + //} } #ifdef DUMP_MATRICES - DUMP_STREAM << "Bl(" << iq << ") = " << std::endl; + DUMP_STREAM << "Bl(" << l+1 << ") = " << std::endl; # ifdef DUMP_MATRICES_LARGE RNP::IO::PrintMatrix(n2,n2,in1,n2, DUMP_STREAM) << std::endl << std::endl; # else @@ -1374,39 +659,39 @@ printf("Could not allocate %d\n", (int)minwork); fflush(stdout); #endif int solve_info; // Make Q in in1 - RNP::LinearSolve<'N'>(n2, n2, t1, n2, in1, n2, &solve_info, iwork); - //SingularLinearSolve(n2,n2,n2, t1,n2, in1,n2, DBL_EPSILON); + //RNP::LinearSolve<'N'>(n2, n2, t1, n2, in1, n2, &solve_info, pivots); + SingularLinearSolve(n2,n2,n2, t1,n2, in1,n2, DBL_EPSILON); // Now perform the diagonal scalings for(size_t i = 0; i < n2; ++i){ - RNP::TBLAS::Scale(n2, q[ip][i], &in1[i+0*n2], n2); + RNP::TBLAS::Scale(n2, q[l][i], &in1[i+0*n2], n2); } { double maxel = 0; for(size_t i = 0; i < n2; ++i){ - double el = std::abs(q[iq][i]); + double el = std::abs(q[lp1][i]); if(el > maxel){ maxel = el; } } for(size_t i = 0; i < n2; ++i){ - double el = std::abs(q[iq][i]); + double el = std::abs(q[lp1][i]); if(el < DBL_EPSILON * maxel){ RNP::TBLAS::Scale(n2, 0., &in1[0+i*n2], 1); }else{ - RNP::TBLAS::Scale(n2, 1./q[iq][i], &in1[0+i*n2], 1); + RNP::TBLAS::Scale(n2, 1./q[lp1][i], &in1[0+i*n2], 1); } } } - + // Make P in in2 - if(NULL == phi[iq]){ + if(NULL == phi[lp1]){ RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., in2,n2); }else{ - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[iq],n2, in2,n2); + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[lp1],n2, in2,n2); } - if(NULL != phi[ip]){ - RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[ip],n2, t1,n2); - RNP::LinearSolve<'N'>(n2, n2, t1, n2, in2, n2, &solve_info, iwork); + if(NULL != phi[l]){ + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, phi[l],n2, t1,n2); + RNP::LinearSolve<'N'>(n2, n2, t1, n2, in2, n2, &solve_info, pivots); } - + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, in2,n2, t1,n2); // in2 = P, t1 = P, in1 = Q RNP::TBLAS::Axpy(n2*n2, -1., in1,1, in2,1); // in2 = P-Q, t1 = P, in1 = Q RNP::TBLAS::Axpy(n2*n2, 1., t1,1, in1,1); // in2 = P+Q, t1 = P, in1 = P+Q @@ -1414,260 +699,80 @@ printf("Could not allocate %d\n", (int)minwork); fflush(stdout); RNP::TBLAS::Scale(n2*n2, 0.5, in2,1); } #ifdef DUMP_MATRICES - DUMP_STREAM << "Interface1(" << iq << ") = " << std::endl; + DUMP_STREAM << "Interface1(" << l+1 << ") = " << std::endl; # ifdef DUMP_MATRICES_LARGE RNP::IO::PrintMatrix(n2,n2,in1,n2, DUMP_STREAM) << std::endl << std::endl; # else RNP::IO::PrintVector(n2,in1,1, DUMP_STREAM) << std::endl << std::endl; # endif - DUMP_STREAM << "Interface2(" << iq << ") = " << std::endl; + DUMP_STREAM << "Interface2(" << l+1 << ") = " << std::endl; # ifdef DUMP_MATRICES_LARGE RNP::IO::PrintMatrix(n2,n2,in2,n2, DUMP_STREAM) << std::endl << std::endl; # else RNP::IO::PrintVector(n2,in2,1, DUMP_STREAM) << std::endl << std::endl; # endif #endif - - //PrintMatrix("in1", n2, n2, in1, n2); - //PrintMatrix("in2", n2, n2, in2, n2); - - doublecomplex *Sba = row; - doublecomplex *Saa = Sba+n2; - doublecomplex *Sbb = row+n2*n4; - doublecomplex *Sab = Sbb+n2; -//printf("Preparing to exchange T to S\n"); fflush(stdout); - - // Exchange to interface S-matrix, and also swap block rows - Copy(n2,n2, in1,n2, Saa,n4); - Invert(n2, Saa,n4, Sbb, n22, iwork); // Use Sbb as workspace - Mult(n2, 1., in2,n2, Saa,n4, 0., Sba,n4); - Mult(n2, -1., Saa,n4, in2,n2, 0., Sab,n4); - Copy(n2,n2, in1,n2, Sbb,n4); - Mult(n2, 1., in2,n2, Sab,n4, 1., Sbb,n4); - -//printf("Exchanged T to S\n"); fflush(stdout); - -// PrintMatrix("Saa", n2, n2, Saa, n4); -// PrintMatrix("Sab", n2, n2, Sab, n4); -// PrintMatrix("Sba", n2, n2, Sba, n4); -// PrintMatrix("Sbb", n2, n2, Sbb, n4); - - // Fold in propagation phases into the S-matrix - for(size_t j = 0; j < n2; ++j){ - doublecomplex sp = std::exp(q[ip][j] * std::complex(0,thickness[ip])); - doublecomplex sq = std::exp(q[iq][j] * std::complex(0,thickness[iq])); -//printf("Scale factor ip j = %f + %f i\n", sp.real(), sp.imag()); -//printf("Scale factor iq j = %f + %f i\n", sq.real(), sq.imag()); - Scale(n4, sp, &row[0+j*n4]); - Scale(n4, sq, &row[0+(j+n2)*n4]); + for(size_t i = 0; i < n2; ++i){ + d1[i] = std::exp(q[l ][i] * std::complex(0,thickness[l ])); + d2[i] = std::exp(q[lp1][i] * std::complex(0,thickness[lp1])); } -// PrintMatrix("Saa with phase", n2, n2, Saa, n4); -// PrintMatrix("Sab with phase", n2, n2, Sab, n4); -// PrintMatrix("Sba with phase", n2, n2, Sba, n4); -// PrintMatrix("Sbb with phase", n2, n2, Sbb, n4); + // Make S11 + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, -1.,&S[0+n2*n4],n4, in2,n2, 0.,t1,n2); // t1 = -S12 I21 + for(size_t i = 0; i < n2; ++i){ // t1 = -f_l S12 I21 + RNP::TBLAS::Scale(n2, d1[i], &t1[i+0*n2], n2); + } + RNP::TBLAS::Axpy(n2*n2, 1., in1,1, t1,1); // t1 = (I11 - f_l S12 I21) -//printf("Applied phases\n"); fflush(stdout); - } - ///////////// End copy from GetSMatrix - - // At this point, the workspace is divided into portions of length 6*n22 - // for each layer. The interface scattering matrix between layers i and i+1 - // begins at offset 6*n22*(i+1) in the workspace. The contents of each block - // has the form: - // [ Sba Sbb ] - // [ Saa Sab ] - // The system of equations has instead a block row of the form: - // [ -Sba I 0 -Sbb ] - // [ -Saa 0 I -Sab ] - // Assembling all such rows, one obtains a rectangular matrix of size - // 4*n*(m-1) by 4*n*m where m is the number of layers. - // The labeling of the rows is [ b0, a1, b1, a2, b2, ... am ] - // The labeling of the columns is [ a0, b0, a1, b1, a2, b2, ... am, bm ] - // The first and last block columns correspond to boundary conditions and - // hence must be moved to the RHS. This produces in the end a square matrix. - - // Prepare the RHS - Mult(n4, n2, 1., &work[6*n22], n4, &ab[0], 0, t1); - Copy(n4, t1, 1, &ab[n2], 1); - Mult(n4, n2, 1., &work[6*n22*(nlayers-1)+n2*n4], n4, &ab[(nlayers-1)*n4+n2], 0, t1); - Copy(n4, t1, 1, &ab[(nlayers-1)*n4-n2], 1); - -// PrintMatrix("RHS0", n4,nlayers, ab, n4); - - // At this point, we can compute the quantities involved in the LDU decomposition - // of the system matrix. The initial diagonal pivot block is - // [ I 0 | -Sbb 0 ] - // [ 0 I | -Sab 0 ] - // [ --------+-------- ] - // [ 0 -Sba | ] - // [ 0 -Saa | ] - // Notice that the upper and lower diagonal blocks come from different interfaces. - // The first LDU factorization step is trivial, and there are m-1 in total. - // Therefore, we need only iterate through the m-2 inner layers. - - for(size_t i = 1; i < nlayers; i++){ -//printf("i = %d\n", (int)i); - // Set pointers to current row of matrices - doublecomplex *row = work+6*n22*i; - doublecomplex *P = row + 4*n22; - doublecomplex *Q = P + n22; - doublecomplex *Pprev = P - 6*n22; - doublecomplex *Qprev = Pprev + n22; - - const doublecomplex *Sbb = row-6*n22+n2*n4; - const doublecomplex *Sab = Sbb+n2; - - const doublecomplex *Sba = row; - const doublecomplex *Saa = Sba+n2; - size_t *ipivP = iwork + n2*i; + RNP::TBLAS::SetMatrix<'A'>(n2,n2, 0.,1., t2,n2); + int solve_info; + RNP::LinearSolve<'N'>(n2, n2, t1, n2, t2, n2, &solve_info, pivots); // t2 = (I11 - f_l S12 I21)^{-1} - // Perform LDU factorization step - if(1 == i){ // First step is trivial - SetMatrix(n2, n2, 1., 0., P, n2); - ZeroMatrix(n2, n2, Q, n2); - }else{ - Copy(n2, n2, Sab, n4, t1, n2); // t1 = Sab - Copy(n2, n2, Sbb, n4, Q, n2); // Q = Sbb, t1 = Sab - LUSolve(n2, n2, Pprev, n2, ipivP-n2, Q, n2); // Q = inv(P) Sbb, t1 = Sab -// PrintMatrix("t1(Sab)", n2,n2, Sab, n4); -// PrintMatrix(" Sbb", n2,n2, Sbb, n4); -// PrintMatrix("invPprev Sbb", n2,n2, Q, n2); -// PrintMatrix("Qprev", n2,n2, Qprev, n2); - Mult(n2, 1., Qprev, n2, Q, n2, -1., t1, n2); // t1 = Q*inv(P)*Sbb - Sab -// PrintMatrix("Qprev invPprev Sbb", n2,n2, t1, n2); - Mult(n2, 1., Saa, n4, t1, n2, 0., Q, n2); - //PrintMatrix("Saa", n2,n2, Saa, n4); - //PrintMatrix("Saa Qprev invPprev Sbb", n2,n2, Q, n2); - - Mult(n2, 1., Sba, n4, t1, n2, 0., P, n2); - for(size_t j = 0; j < n2; ++j){ - P[j+j*n2] += 1.; - } - - // PrintMatrix("Q", n2,n2, Q, n2); - // PrintMatrix("P", n2,n2, P, n2); + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, &S[0+0*n4],n4, t1,n2); + for(size_t i = 0; i < n2; ++i){ // t1 = f_l S11 + RNP::TBLAS::Scale(n2, d1[i], &t1[i+0*n2], n2); } -// PrintMatrix("P", n2,n2, P, n2); - LUFactor(n2, P, n2, ipivP); -// PrintMatrix("Pfactored", n2,n2, P, n2); -// for(size_t i = 0; i < n2; ++i){ -// printf(" %d", ((int*)ipivP)[i]); -// } printf("\n"); + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t2,n2, t1,n2, 0.,&S[0+0*n4],n4); + // S11 is done, and we need to hold on to t2 = (I11 - f_l S12 I21)^{-1} -//printf("LDU step completed\n"); fflush(stdout); - } - - // Now use LDU factorization to solve - // L D U x = b - // x = U \ (D \ (L \ b)) - // - // Forward pass for D \ (L \ b) - for(size_t j = 1; j+1 < nlayers; ++j){ - // Set pointers to current row of matrices - doublecomplex *row = work+6*n22*j; - doublecomplex *P = row + 4*n22; - doublecomplex *Q = P + n22; - doublecomplex *Pprev = P - 6*n22; - doublecomplex *Qprev = Pprev + n22; - - doublecomplex *Sbb = row+n2*n4; - doublecomplex *Sab = Sbb+n2; - - doublecomplex *Sba = row+6*n22; - doublecomplex *Saa = Sba+n2; - - doublecomplex *aj = &ab[j*n4]; - doublecomplex *bjm1 = aj-n2; - doublecomplex *bjp1 = aj+n2; - size_t *ipivP = iwork + n2*j; - - // sub-diagonal block: - // [ P | ] - // [ Q I | ] - // [ --------------------+- ] - // [ Sba Q inv(P) -Sba | ] - // [ Saa Q inv(P) -Saa | ] - Copy(n4, bjm1, 1, t1, 1); -//PrintMatrix("bjm1", n4,1, bjm1, n4); -//PrintMatrix("Applied matrix", n4,n2, Sba, n4); - Mult(n4, n2, 1., Sba, n4, &t1[n2], 1., bjp1); - if(j > 1){ // Second iteration should use the fact that Q = 0 - LUSolve(n2, 1, P, n2, ipivP, t1, n2); - Mult(n2, n2, -1., Q, n2, t1, 0., &t1[n2]); - Mult(n4, n2, 1., Sba, n4, &t1[n2], 1., bjp1); + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,&S[0+n2*n4],n4, in1,n2, 0.,t1,n2); // t1 = S12 I22 + for(size_t i = 0; i < n2; ++i){ // t1 = f_l S12 I22 + RNP::TBLAS::Scale(n2, d1[i], &t1[i+0*n2], n2); } - } -//printf("Forward pass completed\n"); fflush(stdout); -//PrintMatrix("RHS1", n4,nlayers, ab, n4); - - // Diagonal pass - for(size_t j = 2; j < nlayers; ++j){ - // Set pointers to current row of matrices - const doublecomplex *row = work+6*n22*j; - const doublecomplex *P = row + 4*n22; - const doublecomplex *Q = P + n22; - size_t *ipivP = iwork + n2*j; - doublecomplex *aj = &ab[j*n4]; - doublecomplex *bjm1 = aj-n2; - // Diaonal block: - // [ P 0 ] - // [ Q I ] - // Inverse: - // [ inv(P) 0 ] [ bjm1 ] - // [ -Q inv(P) I ] [ aj ] - LUSolve(n2, 1, P, n2, ipivP, bjm1, n2); -//PrintMatrix("bjm1", n2,1, bjm1, n2); - Mult(n2, n2, -1., Q, n2, bjm1, 1., aj); -//PrintMatrix("Q", n2,n2, Q, n2); - } -//printf("Diagonal pass completed\n"); fflush(stdout); -//PrintMatrix("RHS2", n4,nlayers, ab, n4); - - // Backward pass for U \ ... - for(size_t j = nlayers-2; j > 0; --j){ - // Set pointers to current row of matrices - doublecomplex *row = work+6*n22*j; - doublecomplex *P = row + 4*n22; - doublecomplex *Q = P + n22; - doublecomplex *Pprev = P - 6*n22; - doublecomplex *Qprev = Pprev + n22; - - doublecomplex *Sbb = row+n2*n4; - doublecomplex *Sab = Sbb+n2; - - doublecomplex *Sba = row+6*n22; - doublecomplex *Saa = Sba+n2; + RNP::TBLAS::Axpy(n2*n2, -1., in2,1, t1,1); // t1 = f_l S12 I22 - I12 + for(size_t i = 0; i < n2; ++i){ // t1 = (f_l S12 I22 - I12) f_{l+1} + RNP::TBLAS::Scale(n2, d2[i], &t1[0+i*n2], 1); + } + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t2,n2, t1,n2, 0.,&S[0+n2*n4],n4); + // S12 done, and t2 can be reused - doublecomplex *aj = &ab[j*n4]; - doublecomplex *bjm1 = aj-n2; - doublecomplex *bjp1 = aj+n2; - size_t *ipivP = iwork + n2*j; + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,&S[n2+n2*n4],n4, in2,n2, 0.,t1,n2); // t1 = S22 I21 + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t1,n2, &S[0+0*n4],n4, 1.,&S[n2+0*n4],n4); + // S21 done, need to keep t1 = S22 I21 - // super-diagonal block: - // [ P 0 | -inv(P) Sbb 0 ] - // [ Q I | Q inv(P) Sbb - Sab 0 ] - // [ ------+------------------------- ] - // [ | ] - Mult(n2, n2, 1., Sbb, n4, bjp1, 0., t1); - if(j > 0){ - LUSolve(n2, 1, P, n2, ipivP, t1, n2); - } - Axpy(n2, 1., t1, 1, bjm1, 1); - if(j > 0){ - Mult(n2, n2, -1., Q, n2, t1, 1., aj); + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,&S[n2+n2*n4],n4, in1,n2, 0.,t2,n2); // t2 = S22 I22 + for(size_t i = 0; i < n2; ++i){ // t2 = S22 I22 f_{l+1} + RNP::TBLAS::Scale(n2, d2[i], &t2[0+i*n2], 1); } - Mult(n2, n2, 1., Sab, n4, bjp1, 1., aj); + RNP::TBLAS::CopyMatrix<'A'>(n2,n2, t2,n2, &S[n2+n2*n4],n4); + RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, 1.,t1,n2, &S[0+n2*n4],n4, 1.,&S[n2+n2*n4],n4); + +#ifdef DUMP_MATRICES + DUMP_STREAM << "S(1," << l+2 << ") = " << std::endl; +# ifdef DUMP_MATRICES_LARGE + RNP::IO::PrintMatrix(n4,n4,S,n4, DUMP_STREAM) << std::endl << std::endl; +# else + RNP::IO::PrintVector(n4,S,1, DUMP_STREAM) << std::endl << std::endl; +# endif +#endif } - -//printf("Backward pass completed\n"); fflush(stdout); -//PrintMatrix("RHS3", n4,nlayers, ab, n4); - - if(NULL == work_){ + if(NULL == work_ || lwork < n4*(n4+1)){ rcwa_free(work); } - return 0; + if(NULL == iwork){ + rcwa_free(pivots); + } } int SolveInterior( @@ -1691,7 +796,7 @@ int SolveInterior( ){ if(0 == nlayers){ return-1; } if(which_layer >= nlayers){ return -2; } - + const size_t n2 = 2*n; const size_t n4 = 2*n2; const size_t lwork_GetSMatrix = 4*n*(4*n+1); @@ -1719,7 +824,7 @@ int SolveInterior( std::complex *bl = al+n2; std::complex *temp = S0l; size_t ldtemp = n4; - + int info; GetSMatrix(which_layer+1, n, kx, ky, omega, @@ -1729,16 +834,6 @@ int SolveInterior( thickness+which_layer, q+which_layer, Epsilon_inv+which_layer, epstype+which_layer, kp+which_layer, phi+which_layer, SlN, work_GetSMatrix, pivots, lwork_GetSMatrix); -#ifdef DUMP_MATRICES - if(NULL != a0){ - DUMP_STREAM << "a0 = " << std::endl; - RNP::IO::PrintVector(n2,a0,1, DUMP_STREAM) << std::endl << std::endl; - } - if(NULL != bN){ - DUMP_STREAM << "bN = " << std::endl; - RNP::IO::PrintVector(n2,bN,1, DUMP_STREAM) << std::endl << std::endl; - } -#endif #ifdef DUMP_MATRICES DUMP_STREAM << "S0l(0," << which_layer << ") = " << std::endl; # ifdef DUMP_MATRICES_LARGE @@ -1769,10 +864,10 @@ int SolveInterior( }else{ RNP::TBLAS::Fill(n2, 0., S22bN, 1); } - + // We overwrite the upper left submatrix S11(0,l) since it's not needed anymore // temp is set to S0l for this reason. - + // Compute -S_12(0,l)S_21(l,N) RNP::TBLAS::MultMM<'N','N'>(n2, n2, n2, std::complex(-1.0), &S0l[0+n2*n4], n4, &SlN[n2+0*n4], n4, @@ -1858,7 +953,7 @@ void GetZPoyntingFlux( std::complex *work ){ const size_t n2 = 2*n; - + std::complex *a2 = work; if(NULL == work){ a2 = (std::complex *)rcwa_malloc(sizeof(std::complex) * 4*n2); @@ -1866,7 +961,7 @@ void GetZPoyntingFlux( std::complex *b2 = a2 + n2; std::complex *a3 = b2 + n2; std::complex *b3 = a3 + n2; - + memcpy(a2, ab, sizeof(std::complex) * 2*n2); for(size_t i = 0; i < n2; ++i){ a2[i] /= (omega*q[i]); } for(size_t i = 0; i < n2; ++i){ b2[i] /= (omega*q[i]); } @@ -1881,16 +976,16 @@ void GetZPoyntingFlux( }else{ RNP::TBLAS::MultMM<'C','N'>(n2,2,n2, std::complex(1.),phi,n2, a2,n2, std::complex(0.), a3,n2); } - + // At this point a3[n2] is alpha and b3[n2] is beta, // ab[n2] is a and (ab+n2)[n2] is b - + *forward = RNP::TBLAS::ConjugateDot(n2, ab ,1, a3,1).real(); *backward = -RNP::TBLAS::ConjugateDot(n2, &ab[n2],1, b3,1).real(); std::complex diff = 0.5*(RNP::TBLAS::ConjugateDot(n2, &ab[n2],1, a3,1) - RNP::TBLAS::ConjugateDot(n2, b3,1, ab,1)); *forward += diff; *backward += std::conj(diff); - + if(NULL == work){ rcwa_free(a2); } @@ -1911,7 +1006,7 @@ void GetZPoyntingFluxComponents( std::complex *work ){ const size_t n2 = 2*n; - + std::complex *a2 = work; if(NULL == work){ a2 = (std::complex *)rcwa_malloc(sizeof(std::complex) * 4*n2); @@ -1919,7 +1014,7 @@ void GetZPoyntingFluxComponents( std::complex *b2 = a2 + n2; std::complex *a3 = b2 + n2; std::complex *b3 = a3 + n2; - + memcpy(a2, ab, sizeof(std::complex) * 2*n2); for(size_t i = 0; i < n2; ++i){ a2[i] /= (omega*q[i]); } for(size_t i = 0; i < n2; ++i){ b2[i] /= (omega*q[i]); } @@ -1930,14 +1025,14 @@ void GetZPoyntingFluxComponents( } MultKPMatrix("N", omega, n, kx, ky, Epsilon_inv, epstype, kp, 2, a3,n2, a2,n2); // At this point, a2[n2] contains alpha_e, b2[n2] contains beta_e - + if(NULL == phi){ RNP::TBLAS::CopyMatrix<'A'>(n2,2, ab,n2, a3,n2); }else{ RNP::TBLAS::MultMM<'N','N'>(n2,2,n2, std::complex(1.),phi,n2, ab,n2, std::complex(0.), a3,n2); } // At this point, a3[n2] contains alpha_h, b3[n2] contains beta_h - + for(size_t i = 0; i < n; ++i){ forward[i] = 0; backward[i] = 0; @@ -1950,7 +1045,7 @@ void GetZPoyntingFluxComponents( backward[i] += std::conj(diff); } } - + if(NULL == work){ rcwa_free(a2); } @@ -1970,7 +1065,7 @@ static void GetInPlaneFieldVector( ){ const size_t n2 = 2*n; const size_t n4 = 2*n2; - + for(size_t i = 0; i < n2; ++i){ eh[4*n2+i] = ab[i]/(omega*q[i]); eh[5*n2+i] = -ab[i+n2]/(omega*q[i]); @@ -2013,18 +1108,18 @@ void GetFieldAtPoint( const std::complex z_zero(0.); const std::complex z_one(1.); const size_t n2 = 2*n; - + std::complex *eh = work; if(NULL == work){ eh = (std::complex*)rcwa_malloc(sizeof(std::complex) * 8*n2); } - + GetInPlaneFieldVector(n, kx, ky, omega, q, epsilon_inv, epstype, kp, phi, ab, eh); const std::complex *hx = &eh[3*n2+0]; const std::complex *hy = &eh[3*n2+n]; const std::complex *ney = &eh[4*n2+0]; const std::complex *ex = &eh[4*n2+n]; - + std::complex fE[3], fH[3]; fE[0] = 0; fE[1] = 0; @@ -2032,7 +1127,7 @@ void GetFieldAtPoint( fH[0] = 0; fH[1] = 0; fH[2] = 0; - + if(NULL != efield && NULL != epsilon_inv){ for(size_t i = 0; i < n; ++i){ eh[i] = (ky[i]*hx[i] - kx[i]*hy[i]); @@ -2044,7 +1139,7 @@ void GetFieldAtPoint( RNP::TBLAS::MultMV<'N'>(n,n, z_one,epsilon_inv,n, eh,1, z_zero,&eh[n],1); } } - + for(size_t i = 0; i < n; ++i){ const double theta = (kx[i]*r[0] + ky[i]*r[1]); @@ -2056,7 +1151,7 @@ void GetFieldAtPoint( fH[2] += (kx[i] * -ney[i] - ky[i] * ex[i]) * (phase / omega); fE[2] += eh[n+i] * (phase / omega); } - + if(NULL != efield && NULL != epsilon_inv){ efield[0] = fE[0]; efield[1] = fE[1]; @@ -2067,7 +1162,7 @@ void GetFieldAtPoint( hfield[1] = fH[1]; hfield[2] = fH[2]; } - + if(NULL == work){ rcwa_free(eh); } @@ -2084,7 +1179,6 @@ void GetFieldOnGrid( int epstype, const std::complex *ab, // length 4*glist.n const size_t nxy[2], // number of points per lattice direction - const double *xy0, std::complex *efield, std::complex *hfield ){ @@ -2095,9 +1189,9 @@ void GetFieldOnGrid( const int nxyoff[2] = { (int)(nxy[0]/2), (int)(nxy[1]/2) }; int inxy[2] = { (int)nxy[0], (int)nxy[1] }; int inxy_rev[2] = { (int)nxy[1], (int)nxy[0] }; - + std::complex *eh = (std::complex*)rcwa_malloc(sizeof(std::complex) * 8*n2); - + GetInPlaneFieldVector(n, kx, ky, omega, q, epsilon_inv, epstype, kp, phi, ab, eh); const std::complex *hx = &eh[3*n2+0]; const std::complex *hy = &eh[3*n2+n]; @@ -2113,15 +1207,15 @@ void GetFieldOnGrid( memset(from[i], 0, sizeof(std::complex) * N); plan[i] = fft_plan_dft_2d(inxy_rev, from[i], to[i], 1); } - + for(size_t i = 0; i < n; ++i){ eh[i] = (ky[i]*hx[i] - kx[i]*hy[i]); - } - if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ - RNP::TBLAS::Scale(n, epsilon_inv[0], eh,1); - RNP::TBLAS::Copy(n, eh,1, &eh[n], 1); - }else{ - RNP::TBLAS::MultMV<'N'>(n,n, z_one,epsilon_inv,n, eh,1, z_zero,&eh[n],1); + if(EPSILON2_TYPE_BLKDIAG1_SCALAR == epstype || EPSILON2_TYPE_BLKDIAG2_SCALAR == epstype){ + RNP::TBLAS::Scale(n, epsilon_inv[0], eh,1); + RNP::TBLAS::Copy(n, eh,1, &eh[n], 1); + }else{ + RNP::TBLAS::MultMV<'N'>(n,n, z_one,epsilon_inv,n, eh,1, z_zero,&eh[n],1); + } } for(size_t i = 0; i < n; ++i){ const int iu = G[2*i+0]; @@ -2130,7 +1224,6 @@ void GetFieldOnGrid( (nxyoff[0] - (int)nxy[0] < iu && iu <= nxyoff[0]) && (nxyoff[1] - (int)nxy[1] < iv && iv <= nxyoff[1]) ){ - //todo: const std::complex shift_phase(-i 2pi Lk.G.xy0); const int ii = (iu >= 0 ? iu : iu + nxy[0]); const int jj = (iv >= 0 ? iv : iv + nxy[1]); from[0][ii+jj*nxy[0]] = hx[i]; @@ -2141,11 +1234,11 @@ void GetFieldOnGrid( from[5][ii+jj*nxy[0]] = eh[n+i] / omega; } } - + for(unsigned i = 0; i < 6; ++i){ fft_plan_exec(plan[i]); } - + for(size_t j = 0; j < nxy[1]; ++j){ for(size_t i = 0; i < nxy[0]; ++i){ hfield[3*(i+j*nxy[0])+0] = to[0][i+j*nxy[0]]; @@ -2156,7 +1249,7 @@ void GetFieldOnGrid( efield[3*(i+j*nxy[0])+2] = to[5][i+j*nxy[0]]; } } - + for(unsigned i = 0; i < 6; ++i){ fft_plan_destroy(plan[i]); fft_free(to[i]); @@ -2183,12 +1276,12 @@ void GetZStressTensorIntegral( const size_t n2 = 2*n; const std::complex z_zero(0.); const std::complex z_one(1.); - + std::complex *eh = work; if(NULL == work){ eh = (std::complex*)rcwa_malloc(sizeof(std::complex) * 8*n2); } - + GetInPlaneFieldVector(n, kx, ky, omega, q, epsilon_inv, epstype, kp, phi, ab, eh); std::complex *ez = &eh[1*n2+n]; std::complex *dz = &eh[2*n2+0]; @@ -2199,11 +1292,11 @@ void GetZStressTensorIntegral( const std::complex *ex = &eh[4*n2+n]; std::complex *ndy = &eh[5*n2+0]; std::complex *dx = &eh[5*n2+n]; - + integral[0] = 0; integral[1] = 0; integral[2] = 0; - + for(size_t i = 0; i < n; ++i){ hz[i] = (kx[i] * -ney[i] - ky[i] * ex[i]) / omega; dz[i] = (ky[i]*hx[i] - kx[i]*hy[i]) / omega; @@ -2215,7 +1308,7 @@ void GetZStressTensorIntegral( RNP::TBLAS::MultMV<'N'>(n,n, z_one,epsilon_inv,n, dz,1, z_zero,ez,1); } RNP::TBLAS::MultMV<'N'>(n2,n2, z_one,epsilon2,n2, ney,1, z_zero,ndy,1); - + for(size_t i = 0; i < n; ++i){ std::complex cdz = std::conj(dz[i]); std::complex cbz = std::conj(hz[i]); @@ -2231,7 +1324,7 @@ void GetZStressTensorIntegral( integral[2] -= 0.5*hy[i]*std::conj(hy[i]); integral[2] -= 0.5*hx[i]*std::conj(hx[i]); } - + if(NULL == work){ rcwa_free(eh); } @@ -2273,7 +1366,7 @@ void GetLayerVolumeIntegral( } const std::complex z_zero(0.); const std::complex z_one(1.); - + std::complex *Q = work; if(NULL == work){ Q = (std::complex*)rcwa_malloc(sizeof(std::complex)*n4*n4); @@ -2336,7 +1429,7 @@ void GetLayerVolumeIntegral( // Q = M^H * CsC * M = [ S+T -S+T ], S = fie^H * A * fie // [ -S+T S+T ] T = phi^H * B * phi // integral = int_0^thickness [x(z)^H * Q * x(z)] dz, x(z) = [a*exp(iqz);b*exp(iq(d-z))] - + // Make T if(NULL != phi){ RNP::TBLAS::MultMM<'N','N'>(n2,n2,n2, z_one,&Q[n2+n2*n4],n4, phi,n2, z_zero,&Q[0+n2*n4],n4); @@ -2391,7 +1484,7 @@ void GetLayerVolumeIntegral( // \frac{e^{id/2 (q-q^*)} }{i(s_j q - s_i q^*)} 2 i sin[ d/2 (s_j q - s_i q^*) ] // \frac{d e^{id/2 (q-q^*)} }{d/2 (s_j q - s_i q^*)} sin[ d/2 (s_j q - s_i q^*) ] // d e^{id/2 (q-q^*)} sinc[ d/2 (s_j q - s_i q^*) ], where sinc(x) = sin(x)/x - + *integral = 0; const double d_2 = 0.5*thickness; for(size_t j = 0; j < n4; ++j){ @@ -2434,16 +1527,16 @@ void GetLayerZIntegral( const size_t n2 = 2*n; const size_t n4 = 2*n2; const std::complex iomega(1./omega); - + std::complex *f = work; if(NULL == work){ f = (std::complex*)rcwa_malloc(sizeof(std::complex)*12*n4); } std::complex *g = f + 6*n4; RNP::TBLAS::Fill(6*n4, 0., f, 1); - + // We will simultaneously generate f for all 6 components - + // Generate the Fourier phase factors for(size_t i = 0; i < n; ++i){ // this is for ex double theta = (kx[i]*r[0] + ky[i]*r[1]); @@ -2471,7 +1564,7 @@ void GetLayerZIntegral( for(size_t i = 0; i < n; ++i){ // -hz f[5*n4+1*n+i] = f[n4+i] * iomega * ky[i]; } - + // Form M^H C^H * f // M = [ kp.phi.inv(q) -kp.phi.inv(q) ] // [ phi phi ] @@ -2493,11 +1586,11 @@ void GetLayerZIntegral( g[i+n2+j*n4] -= t; } } - + for(size_t i = 0; i < 6; ++i){ integral[i] = 0; } - + // At this point, each column of g contains a vector v such that // v v^H is a rank-1 matrix that must be integrated over. const double d_2 = 0.5*thickness; @@ -2515,7 +1608,7 @@ void GetLayerZIntegral( } } } - + if(NULL == work){ rcwa_free(f); } diff --git a/gensetupHPC.py.sh b/gensetupHPC.py.sh new file mode 100644 index 0000000..6cf65b2 --- /dev/null +++ b/gensetupHPC.py.sh @@ -0,0 +1,32 @@ +#!/bin/bash + +OBJDIR="$1" +LIBFILE="$2" +LIBS="$3" +x1="$4" +x2="$5" + +cat < setup.py +from distutils.core import setup, Extension + +libs = ['S4', 'stdc++'] +libs.extend( [lib[2::] for lib in '$LIBS'.split()]) + +S4module = Extension('S4', + sources = [ + 'S4/main_python.c' + ], + libraries = libs, + library_dirs = ['$OBJDIR', $x2], + extra_link_args = [ + '$LIBFILE', $x1 + ], + extra_compile_args=['-std=gnu99'] +) + +setup(name = 'S4', + version = '1.1', + description = 'Stanford Stratified Structure Solver (S4): Fourier Modal Method', + ext_modules = [S4module] +) +SETUPPY