From 5184dda5a196614b1025d058e58214db12a98807 Mon Sep 17 00:00:00 2001
From: Nick Papior <nickpapior@gmail.com>
Date: Fri, 2 Jul 2021 19:02:10 +0200
Subject: [PATCH] mnt: added optimize1d.c

Also benchmarks should not rely on convergence, rather a fixed
number of iterations is more appropriate.

The problem of inconsistent iterations between the C and Fortran versions
was also fixed because of an indexing error when calculating the cell
coordinates (C is 0-based, F is 1-based). So they now run the same number
of iterations, regardless of M.

Moved timings about to make them more comparable.

Ensured that all examples now use allocate/malloc which is much more
realistic. Even though it does not have the same performance it
is much more appropriate for benchmark cases.

Lastly, made calls more similar, C and F both
call a function for the iteration step.
---
 poisson2d/README.md                      |   2 +
 poisson2d/optimized.f90                  |  96 ++++++++++----------
 poisson2d/optimized1d.c                  | 106 +++++++++++++++++++++++
 poisson2d/{optimized.c => optimized2d.c} |  12 ++-
 4 files changed, 168 insertions(+), 48 deletions(-)
 create mode 100644 poisson2d/optimized1d.c
 rename poisson2d/{optimized.c => optimized2d.c} (95%)

diff --git a/poisson2d/README.md b/poisson2d/README.md
index 14ae309..82915a8 100644
--- a/poisson2d/README.md
+++ b/poisson2d/README.md
@@ -29,3 +29,5 @@ compared to the rest, for some reason. The timings are on AMD Ryzen 5 3600 @3.6G
 Some thoughts on the code at https://runningcrocodile.fi/articles/pythonperformance.html . Also, there was
 discussion about this problem at Discourse: https://fortran-lang.discourse.group/t/performance-c-vs-fortran/1461 
 with important contributions from numerous users (Mason, Beliavsky, septc, implicitall, nncarlson, han190 and pmk)
+
+The codes were rewritten for more realistic cases by zerothi.
diff --git a/poisson2d/optimized.f90 b/poisson2d/optimized.f90
index 3b9d997..536ed00 100644
--- a/poisson2d/optimized.f90
+++ b/poisson2d/optimized.f90
@@ -1,44 +1,26 @@
-module rhofunc
-  implicit none
-  public
-  integer, parameter :: dp=kind(0.d0)
-  
-contains
-  
-  pure real(dp) function rho(x,y)
-    real(dp), intent(in) :: x,y
-    if ( 0.6_dp < x .and. x < 0.8_dp .and. 0.6_dp < y .and. y < 0.8_dp ) then
-      rho = 1.0_dp
-    else if ( 0.2_dp < x .and. x < 0.4_dp .and. 0.2_dp < y .and. y < 0.4_dp ) then
-      rho = -1.0_dp
-    else
-      rho = 0.0_dp
-    end if
-  end function rho
-
-end module rhofunc
-
 program poisson
-  use rhofunc, only: rho
+
   implicit none
+
+  integer, parameter :: M = 1000
   integer, parameter :: dp=kind(0.d0)
-  integer, parameter :: M = 300
+  integer, parameter :: N_ITER = 10000
   real(dp),parameter :: target=1E-1_dp
   real(dp),parameter :: a=0.01_dp
-
-  integer            :: i,j, iter
   real(dp),parameter :: epsilon0=8.85E-12_dp
-  real(dp)           :: delta, b, e, a2
+
+  integer :: i,j, iter
+  real(dp) :: delta, b, e, a2
 
   real(dp), allocatable :: rhoarr(:,:)
   real(dp), allocatable, target :: phiprime(:,:), phi(:,:)
-  real(dp), pointer :: p(:,:), pnew(:,:)
+  real(dp), pointer :: p(:,:), pnew(:,:), tmp(:,:)
+
+  call cpu_time(b)
 
   allocate(phiprime(M,M), phi(M,M))
   allocate(rhoarr(M,M))
 
-  call cpu_time(b)
-
   ! Fortran doesn't care too much about pow
   ! since it figures out if the exponent is an integer or not
   a2 = a*a / epsilon0
@@ -48,21 +30,43 @@ program poisson
     end do
   end do
 
-  ! We only need to initialize 1 array
   phi(:,:) = 0.0_dp
+  phiprime(:,:) = 0.0_dp
+  p => phi
+  pnew => phiprime
   iter = 0
   delta = target + 1._dp
-  do while ( delta > target )
+  do while ( iter < N_ITER )
     iter = iter + 1
 
+    call iterate(rhoarr, p, pnew, delta)
+
     ! Easer to swap pointer than copying stuff around
-    if ( mod(iter, 2) == 1 ) then
-      p => phi
-      pnew => phiprime
-    else
-      p => phiprime
-      pnew => phi
-    end if
+    ! In fortran we could even use pointers
+    ! But we can just call
+    tmp => p
+    p => pnew
+    pnew => tmp
+
+  end do
+  
+  deallocate(phi, phiprime, rhoarr)
+
+  call cpu_time(e)
+
+  write(*,'(a)') 'fortran version'
+  write(*,'(a,f20.10)') 'delta = ',delta
+  write(*,'(a,i0)') 'Iterations = ',iter
+  write(*,'(a,f20.10)') 'Time = ',e - b
+
+contains
+
+  subroutine iterate(rhoarr, p, pnew, delta)
+    real(dp), intent(in) :: p(:,:), rhoarr(:,:)
+    real(dp), intent(out) :: pnew(:,:)
+    real(dp), intent(out) :: delta
+
+    integer :: i, j
 
     delta = 0._dp
     do j=2, M-1
@@ -72,13 +76,17 @@ program poisson
       end do
     end do
 
-  end do
-  
-  call cpu_time(e)
+  end subroutine iterate
 
-  deallocate(phi, phiprime, rhoarr)
-  
-  write(*,'(a,i0)') 'Iterations = ',iter
-  write(*,'(a,f20.10)') 'Time = ',e - b
+  pure real(dp) function rho(x,y)
+    real(dp), intent(in) :: x,y
+    if ( 0.6_dp < x .and. x < 0.8_dp .and. 0.6_dp < y .and. y < 0.8_dp ) then
+      rho = 1.0_dp
+    else if ( 0.2_dp < x .and. x < 0.4_dp .and. 0.2_dp < y .and. y < 0.4_dp ) then
+      rho = -1.0_dp
+    else
+      rho = 0.0_dp
+    end if
+  end function rho
 
 end program poisson
diff --git a/poisson2d/optimized1d.c b/poisson2d/optimized1d.c
new file mode 100644
index 0000000..adbb7c9
--- /dev/null
+++ b/poisson2d/optimized1d.c
@@ -0,0 +1,106 @@
+#include <stdio.h>
+#include <stdlib.h>
+#include <string.h>
+#include <math.h>
+#include <time.h>
+#define M 1000
+#define N_ITER 10000
+
+#define IDX(i,j) (i)*M+j
+
+
+double rho(const double x, const double y) {
+  const double s1 = 0.6;
+  const double e1 = 0.8;
+  const double s2 = 0.2;
+  const double e2 = 0.4;
+
+  if (s1 < x && x < e1 && s1 < y && y < e1) {
+    return 1.0;
+  } else if ( s2 < x && x < e2 && s2 < y && y < e2 ) {
+    return -1.0;
+  } else {
+    return 0.0;
+  }
+}
+
+double iterate(double *restrict phi, double *restrict phinew, double *restrict rhoa) {
+  double delta = 0, err;
+  for (int i=1; i < M-1; i++) {
+    for (int j=1; j < M-1; j++) {
+      phinew[IDX(i,j)] = (phi[IDX(i+1,j)] + phi[IDX(i-1,j)] + phi[IDX(i,j+1)] + phi[IDX(i,j-1)] + rhoa[IDX(i,j)])*0.25;
+      err = fabs(phinew[IDX(i,j)] - phi[IDX(i,j)]);
+      if ( err > delta ) delta = err;
+    }
+  }
+  return delta;
+}
+
+void init_rho(double *restrict rhoa, const double epsilon, const double a) {
+  const double a2 = a * a / epsilon;
+  for (int i=1; i<M-1; i++)  {
+    for (int j=1; j<M-1; j++) {
+      rhoa[IDX(i,j)] = rho(i*a,j*a) * a2;
+    }
+  }
+}
+
+
+void run(const double toler, const double a)
+{
+  double epsilon0 = 8.85e-12;
+
+  double *phi;
+  double *phip;
+  double *rhoa;
+  double *tmp;
+
+  // A real world program will definitely use malloc
+  phi = malloc(M*M*sizeof(double));
+  phip = malloc(M*M*sizeof(double));
+  rhoa = malloc(M*M*sizeof(double));
+
+  // Only need to initialize one of them!
+  for (int i = 0 ; i < M*M ; i++ )
+    phi[i] = 0.;
+  for (int i = 0 ; i < M*M ; i++ )
+    phip[i] = 0.;
+
+  // In C one tries to avoid using pow because
+  // it assumes floating point powers (integers are faster)
+  // So better do it directly
+  init_rho(rhoa, epsilon0, a);
+  
+  int iter = 0;
+  double delta;
+  while ( iter < N_ITER ) {
+    iter += 1;
+
+    delta = iterate(phi, phip, rhoa);
+
+    // swap pointers (no copies)
+    tmp = phi;
+    phi = phip;
+    phip = tmp;
+  }
+
+  free(phi);
+  free(phip);
+  free(rhoa);
+
+  printf("delta = %20.10f\n",delta);
+  printf("Iterations = %d\n", iter);
+}
+
+int main(int argc, char *argv[])
+{
+  const double target = 1e-1;
+  const double a = 0.01;
+
+  clock_t start = clock();
+  printf("c version [1d]\n");
+  run(target, a);
+  clock_t end = clock();
+  double total = ((double)(end - start)) / CLOCKS_PER_SEC;
+  printf("Time = %20.10f\n",total);
+}
diff --git a/poisson2d/optimized.c b/poisson2d/optimized2d.c
similarity index 95%
rename from poisson2d/optimized.c
rename to poisson2d/optimized2d.c
index df18e21..791c94a 100644
--- a/poisson2d/optimized.c
+++ b/poisson2d/optimized2d.c
@@ -3,7 +3,8 @@
 #include <string.h>
 #include <math.h>
 #include <time.h>
-#define M 300
+#define M 1000
+#define N_ITER 10000
 
 
 double ** malloc_2d(int m, int n) {
@@ -99,7 +100,7 @@ void run(double toler, double a)
   
   int iter = 0;
   double delta;
-  do {
+  while ( iter < N_ITER ) {
     iter += 1;
 
     delta = iterate(phi, phip, rhoa);
@@ -108,20 +109,23 @@ void run(double toler, double a)
     tmp = phi;
     phi = phip;
     phip = tmp;
-  } while ( delta > toler );
+  }
 
   free_2d(phi);
   free_2d(phip);
   free_2d(rhoa);
 
+  printf("delta = %20.10f\n",delta);
   printf("Iterations = %d\n", iter);
 }
 
 int main(int argc, char *argv[])
 {
-  clock_t start = clock();
   const double target = 1e-1;
   const double a = 0.01;
+
+  clock_t start = clock();
+  printf("c version [2d]\n");
   run(target, a);
   clock_t end = clock();
   double total = ((double)(end - start)) / CLOCKS_PER_SEC;