BerkeleyLab · rouson · Dec 28, 2023 · Dec 31, 2023 · Jan 1, 2024 · Jan 1, 2024
diff --git a/app/main.f90 b/app/main.f90
@@ -9,14 +9,14 @@ program main
   associate(input => input_t())
     output = output_t(input, matcha(input))
     block
-      double precision, allocatable :: simulated_distribution(:,:)
+      double precision, allocatable :: simulated_distribution(:,:), frequency_distribution(:)
       integer, parameter :: freq=2
       integer num_cells
 
       num_cells = output%my_num_cells()
       simulated_distribution = output%simulated_distribution()
-      simulated_distribution(:,freq) = num_cells*simulated_distribution(:,freq)
-      call co_sum(simulated_distribution(:,freq), result_image=1)
+      frequency_distribution =  num_cells*simulated_distribution(:,freq) ! copy to work around nagfor bug
+      call co_sum(frequency_distribution, result_image=1)
       call co_sum(num_cells, result_image=1)
       if (this_image()==1) simulated_distribution(:,freq) = simulated_distribution(:,freq)/dble(num_cells)
     end block

diff --git a/example/heat-equation.f90 b/example/heat-equation.f90
@@ -76,13 +76,14 @@ module subroutine exchange_halo(self)
 
   end interface
 
+  real, allocatable :: halo_x(:,:)[:]
+
 end module
 
 submodule(subdomain_2D_m) subdomain_2D_s
   use assertions_m, only : assert
   implicit none
 
-  real, allocatable :: halo_x(:,:)[:]
   real dx_, dy_
   integer, parameter :: west=1, east=2
   integer my_nx, nx, ny, me, num_subdomains, my_internal_west, my_internal_east
@@ -114,7 +115,8 @@ module subroutine exchange_halo(self)
     self%s_(my_nx, 2:ny-1) = merge(boundary_val, internal_val, me==num_subdomains) ! east subdomain boundary
 
     if (allocated(halo_x)) deallocate(halo_x)
-    allocate(halo_x(west:east, ny)[*])
+    !allocate(halo_x(west:east, ny)[*])
+    allocate(halo_x(2, ny)[*])
     call self%exchange_halo
   end procedure
 
@@ -131,7 +133,7 @@ module subroutine exchange_halo(self)
     integer i, j
     real, allocatable :: halo_west(:), halo_east(:)
 
-    call assert(allocated(rhs%s_), "subdomain_2D_t%laplacian: allocated(rhs%s_)")
+    !call assert(allocated(rhs%s_), "subdomain_2D_t%laplacian: allocated(rhs%s_)")
     call assert(allocated(halo_x), "subdomain_2D_t%laplacian: allocated(halo_x)")
 
     allocate(laplacian_rhs(my_nx, ny))
@@ -168,8 +170,8 @@ module subroutine exchange_halo(self)
   end procedure
 
   module procedure exchange_halo
-    if (me>1) halo_x(east,:)[me-1] = self%s_(1,:)
-    if (me<num_subdomains) halo_x(west,:)[me+1] = self%s_(my_nx,:)
+    !if (me>1) halo_x(east,:)[me-1] = self%s_(1,:)
+    !if (me<num_subdomains) halo_x(west,:)[me+1] = self%s_(my_nx,:)
   end procedure
 
   module procedure values

diff --git a/example/time-paradigm.f90 b/example/time-paradigm.f90
@@ -2,7 +2,7 @@
 ! Terms of use are as specified in LICENSE.txt
 program time_paradigm_m
   !! Time various alternative programming paradigms
-  use subdomain_m, only : subdomain_t
+  use subdomain_m, only : subdomain_t, march
   use assert_m, only : assert
   use sourcery_m, only : string_t, file_t, command_line_t, bin_t, csv 
   use iso_fortran_env, only : int64
@@ -12,7 +12,7 @@ program time_paradigm_m
   character(len=:), allocatable :: steps_string, resolution_string
   type(command_line_t) command_line
   integer(int64) counter_start, counter_end, clock_rate
-  integer :: steps=200, resolution=64
+  integer :: steps=300, resolution=128
 
   associate(me => this_image())
     if (command_line%argument_present(["--help"])) then
@@ -52,20 +52,22 @@ function functional_programming_time() result(system_time)
     integer(int64) t_start_functional, t_end_functional, clock_rate
     integer step
     real system_time
-    type(subdomain_t) T
+    type(subdomain_t), save :: T[*]
 
     call T%define(side=1., boundary_val=T_boundary, internal_val=T_internal_initial, n=resolution)
 
-    call system_clock(t_start_functional)
-
     associate(dt => T%dx()*T%dy()/(4*alpha))
+      call system_clock(t_start_functional)
+
       functional_programming: &
       do step = 1, steps
+        sync all
         T =  T + dt * alpha * .laplacian. T
       end do functional_programming
+
+      call system_clock(t_end_functional, clock_rate)
     end associate
 
-    call system_clock(t_end_functional, clock_rate)
     system_time = real(t_end_functional - t_start_functional)/real(clock_rate)
 
     associate(L_infinity_norm => maxval(abs(T%values() - T_steady)))
@@ -78,18 +80,22 @@ function procedural_programming_time() result(system_time)
     integer(int64) t_start_procedural, t_end_procedural, clock_rate
     integer step
     real system_time
-    type(subdomain_t) T
+    type(subdomain_t), save :: T[*]
+
+    call T%define(side=1., boundary_val=0., internal_val=1., n=resolution)
 
     associate(dt => T%dx()*T%dy()/(4*alpha))
-      call T%define(side=1., boundary_val=0., internal_val=1., n=resolution)
       call system_clock(t_start_procedural)
+
       procedural_programming: &
       do step = 1, steps
-        call T%step(alpha*dt)
+        sync all
+        call march(alpha*dt, T)
       end do procedural_programming
+
+      call system_clock(t_end_procedural, clock_rate)
     end associate
 
-    call system_clock(t_end_procedural, clock_rate)
     system_time = real(t_end_procedural - t_start_procedural)/real(clock_rate)
 
     associate(L_infinity_norm => maxval(abs(T%values() - T_steady)))

diff --git a/src/matcha/distribution_s.F90 → src/matcha/distribution_s.f90 b/src/matcha/distribution_s.F90 → src/matcha/distribution_s.f90
@@ -47,8 +47,11 @@ pure function monotonically_increasing(f) result(monotonic)
       "distribution_t%cumulative_distribution: allocated(cumulative_distribution_)")
     call assert(allocated(self%vel_), "distribution_t%cumulative_distribution: allocated(vel_)")
 
-     ! Sample from the distribution
-     call do_concurrent_sampled_speeds(speeds, self%vel_, self%cumulative_distribution(), sampled_speeds)
+    ! Sample from the distribution
+    associate(ncells => size(speeds,1), nsteps => size(speeds,2))
+      allocate(sampled_speeds(ncells,nsteps))
+      call do_concurrent_sampled_speeds(speeds, self%vel_, self%cumulative_distribution(), sampled_speeds)
+    end associate
 
      associate(nsteps => size(speeds,2))
 
@@ -63,6 +66,9 @@ pure function monotonically_increasing(f) result(monotonic)
          end associate
        end associate
 
+       if(allocated(my_velocities)) deallocate(my_velocities)
+       allocate(my_velocities, mold=dir)
+
        call do_concurrent_my_velocities(nsteps, dir, sampled_speeds, my_velocities)
 
      end associate

diff --git a/src/matcha/do_concurrent_m.f90 b/src/matcha/do_concurrent_m.f90
@@ -1,6 +1,6 @@
 module do_concurrent_m
   use iso_c_binding, only : c_double, c_int
-  use t_cell_collection_m, only : t_cell_collection_t, t_cell_collection_bind_C_t
+  use t_cell_collection_m, only : t_cell_collection_bind_C_t
   implicit none
   private
   public :: do_concurrent_sampled_speeds, do_concurrent_my_velocities, do_concurrent_k, do_concurrent_speeds
@@ -11,34 +11,34 @@ module do_concurrent_m
     pure module subroutine do_concurrent_sampled_speeds(speeds, vel, cumulative_distribution, sampled_speeds) bind(C)
       implicit none
       real(c_double), intent(in) :: speeds(:,:), vel(:), cumulative_distribution(:)
-      real(c_double), intent(out), allocatable :: sampled_speeds(:,:)
+      real(c_double), intent(out) :: sampled_speeds(:,:)
     end subroutine
 
     pure module subroutine do_concurrent_my_velocities(nsteps, dir, sampled_speeds, my_velocities) bind(C)
       implicit none
       integer(c_int), intent(in) :: nsteps
       real(c_double), intent(in) :: dir(:,:,:), sampled_speeds(:,:)
-      real(c_double), intent(out), allocatable :: my_velocities(:,:,:)
+      real(c_double), intent(out) :: my_velocities(:,:,:)
     end subroutine
 
     pure module subroutine do_concurrent_k(speeds, vel, k) bind(C)
       implicit none
       real(c_double), intent(in) :: speeds(:), vel(:)
-      integer(c_int), intent(out), allocatable :: k(:)
+      integer(c_int), intent(out) :: k(:)
     end subroutine
 
-    pure module subroutine &
-      do_concurrent_output_distribution(nintervals, speed, freq, emp_distribution, k, output_distribution) bind(C)
+    pure module subroutine do_concurrent_output_distribution(speed, freq, emp_distribution, k, output_distribution) &
+      bind(C)
       implicit none
-      integer(c_int), intent(in) :: nintervals, speed, freq, k(:)
+      integer(c_int), intent(in) :: speed, freq, k(:)
       real(c_double), intent(in) :: emp_distribution(:,:)
-      real(c_double), intent(out), allocatable :: output_distribution(:,:)
+      real(c_double), intent(out) :: output_distribution(:,:)
     end subroutine
 
     module subroutine do_concurrent_speeds(history, speeds) bind(C)
       implicit none
       type(t_cell_collection_bind_C_t), intent(in) :: history(:)
-      real(c_double), intent(out), allocatable :: speeds(:)
+      real(c_double), intent(out) :: speeds(:)
     end subroutine
 
   end interface

diff --git a/src/matcha/do_concurrent_s.f90 b/src/matcha/do_concurrent_s.f90
@@ -1,68 +1,72 @@
 submodule(do_concurrent_m) do_concurrent_s
+  use assert_m, only : assert
   use iso_c_binding, only : c_f_pointer
   implicit none
 
 contains
 
-  module procedure do_concurrent_sampled_speeds
-
+  pure module subroutine do_concurrent_sampled_speeds(speeds, vel, cumulative_distribution, sampled_speeds) bind(C)
+    real(c_double), intent(in) :: speeds(:,:), vel(:), cumulative_distribution(:)
+    real(c_double), intent(out) :: sampled_speeds(:,:)
     integer cell, step
+
+    call assert(all(shape(sampled_speeds)==shape(speeds)), "do_concurrent_sampled_speeds: {sampled_,}speeds shape match")
 
     associate(ncells => size(speeds,1), nsteps => size(speeds,2))
-      allocate(sampled_speeds(ncells,nsteps))
       do concurrent(cell = 1:ncells, step = 1:nsteps)
         associate(k => findloc(speeds(cell,step) >= cumulative_distribution, value=.false., dim=1)-1)
           sampled_speeds(cell,step) = vel(k)
         end associate
       end do
     end associate
 
-  end procedure
-
-  module procedure do_concurrent_my_velocities
-
+  end subroutine  
+
+  pure module subroutine do_concurrent_my_velocities(nsteps, dir, sampled_speeds, my_velocities) bind(C)
+    integer(c_int), intent(in) :: nsteps
+    real(c_double), intent(in) :: dir(:,:,:), sampled_speeds(:,:)
+    real(c_double), intent(out) :: my_velocities(:,:,:)
     integer step
 
-    if(allocated(my_velocities)) deallocate(my_velocities)
-    allocate(my_velocities, mold=dir)
-
+    call assert(all([size(my_velocities,1),size(sampled_speeds,2)] == shape(sampled_speeds)), &
+      "do_concurrent_my_velocities: argument size match")
+    call assert(all(size(my_velocities,1)==shape(dir)), "do_concurrent_my_velocities: argument shape match")
+
     do concurrent(step=1:nsteps)
       my_velocities(:,step,1) = sampled_speeds(:,step)*dir(:,step,1)
       my_velocities(:,step,2) = sampled_speeds(:,step)*dir(:,step,2)
       my_velocities(:,step,3) = sampled_speeds(:,step)*dir(:,step,3)
     end do
-
-  end procedure
-
-  module procedure do_concurrent_k
+  end subroutine  
 
-  integer i
+  pure module subroutine do_concurrent_k(speeds, vel, k) bind(C)
+    real(c_double), intent(in) :: speeds(:), vel(:)
+    integer(c_int), intent(out) :: k(:)  
+    integer i
 
     associate(nspeeds => size(speeds))
-      if(allocated(k)) deallocate(k)
-      allocate(k(nspeeds))
         do concurrent(i = 1:nspeeds)
           k(i) = findloc(speeds(i) >= vel, value=.false., dim=1)-1
         end do
     end associate
-  end procedure
-
-  module procedure do_concurrent_output_distribution
+  end subroutine
 
+  pure module subroutine do_concurrent_output_distribution(speed, freq, emp_distribution, k, output_distribution) bind(C)
+    integer(c_int), intent(in) :: speed, freq, k(:)
+    real(c_double), intent(in) :: emp_distribution(:,:)
+    real(c_double), intent(out) :: output_distribution(:,:)
     integer i
 
-    if(allocated(output_distribution)) deallocate(output_distribution)
-    allocate(output_distribution(nintervals,2))
     output_distribution(:,freq) = 0.d0
     output_distribution(:,speed) = emp_distribution(:,speed)
     do concurrent(i = 1:size(output_distribution,1))
       output_distribution(i,freq) = count(k==i)
     end do
-
-  end procedure
-
-  module procedure do_concurrent_speeds
+  end subroutine
 
+  module subroutine do_concurrent_speeds(history, speeds) bind(C)
+    type(t_cell_collection_bind_C_t), intent(in) :: history(:)
+    real(c_double), intent(out) :: speeds(:)  
     integer i, j, k
     integer, parameter :: nspacedims=3
 
@@ -78,19 +82,16 @@
          x(i,:,:) = positions
       end do
 
-      associate(t => history%time)
-        allocate(speeds(ncells*(npositions-1)))
-        do concurrent(i = 1:npositions-1, j = 1:ncells)
-          associate( &
-            u => (x(i+1,j,:) - x(i,j,:))/(t(i+1) - t(i)), &
-            ij => i + (j-1)*(npositions-1) &
-           )   
-            speeds(ij) = sqrt(sum([(u(k)**2, k=1,nspacedims)]))
-          end associate
-        end do
-      end associate
+      do concurrent(i = 1:npositions-1, j = 1:ncells)
+        associate( &
+          u => (x(i+1,j,:) - x(i,j,:))/(history(i+1)%time - history(i)%time), &
+          ij => i + (j-1)*(npositions-1) &
+         )   
+          speeds(ij) = sqrt(sum([(u(k)**2, k=1,nspacedims)]))
+        end associate
+      end do
     end associate
 
-  end procedure
+  end subroutine
 
-end submodule do_concurrent_s
+end submodule do_concurrent_s
diff --git a/src/matcha/output_s.f90 b/src/matcha/output_s.f90
@@ -3,7 +3,7 @@
 submodule(output_m) output_s
   use do_concurrent_m, only : do_concurrent_k, do_concurrent_output_distribution, do_concurrent_speeds
   use t_cell_collection_m, only : t_cell_collection_bind_C_t
-  use iso_c_binding, only : c_loc, c_double
+  use iso_c_binding, only : c_double
   implicit none
 
 contains
@@ -24,16 +24,26 @@
 
     integer, parameter :: speed=1, freq=2 ! subscripts for speeds and frequencies
 
+    associate(npositions => size(self%history_))
+      allocate(speeds(self%my_num_cells()*(npositions-1)))
+    end associate
     call do_concurrent_speeds(t_cell_collection_bind_C_t(self%history_), speeds)
 
-    associate(emp_distribution => self%input_%sample_distribution())
+    block
+      real(c_double), allocatable :: emp_distribution(:,:)
+
+      emp_distribution = self%input_%sample_distribution()
       associate(nintervals => size(emp_distribution(:,1)), dvel_half => (emp_distribution(2,speed)-emp_distribution(1,speed))/2.d0)
         vel = [emp_distribution(1,speed) - dvel_half, [(emp_distribution(i,speed) + dvel_half, i=1,nintervals)]]
+        if (allocated(k)) deallocate(k)
+        allocate(k(size(speeds)))
         call do_concurrent_k(speeds, vel, k)
-        call do_concurrent_output_distribution(nintervals, speed, freq, emp_distribution, k, output_distribution)
+        if(allocated(output_distribution)) deallocate(output_distribution)
+        allocate(output_distribution(nintervals,2))
+        call do_concurrent_output_distribution(speed, freq, emp_distribution, k, output_distribution)
         output_distribution(:,freq) = output_distribution(:,freq)/sum(output_distribution(:,freq))
       end associate
-    end associate
+    end block
 
   end procedure