
Commit

Merge pull request #28 from numericalEFT/thread_new
Add thread support
kunyuan authored Nov 9, 2022
2 parents a1b0017 + 74708ba commit a58cd3b
Showing 19 changed files with 733 additions and 152 deletions.
25 changes: 25 additions & 0 deletions .github/workflows/CI.yml
@@ -34,7 +34,32 @@ jobs:
${{ runner.os }}-test-
${{ runner.os }}-
- uses: julia-actions/julia-buildpkg@v1
- name: Install MPI dependencies
shell: bash
run: |
julia -e '
using Pkg; Pkg.add("MPI"); using MPI; MPI.install_mpiexecjl()
'
- uses: julia-actions/julia-runtest@v1
env:
JULIA_NUM_THREADS: 4
# - name: Execute MPI-parallel tests
# run: |
# julia --project -e '
# using Pkg; Pkg.build(); Pkg.precompile()
# Pkg.add("MPI"); using MPI; MPI.install_mpiexecjl()
# Pkg.test(; test_args=["quick"])
# '
# $HOME/.julia/bin/mpiexecjl -np 8 julia --check-bounds=yes --depwarn=yes --project --color=yes -e 'using Pkg; Pkg.test(coverage=true)'
# if: ${{ matrix.payload == 'mpi' }}
# continue-on-error: ${{ matrix.version == 'nightly' }}

- uses: julia-actions/julia-processcoverage@v1
- uses: codecov/codecov-action@v2
with:
file: lcov.info

- uses: julia-actions/julia-processcoverage@v1
- uses: codecov/codecov-action@v1
with:
8 changes: 3 additions & 5 deletions Project.toml
@@ -1,14 +1,13 @@
name = "MCIntegration"
uuid = "ea1e2de9-7db7-4b42-91ee-0cd1bf6df167"
authors = ["Kun Chen", "Xiansheng Cai", "Pengcheng Hou"]
version = "0.3.1"
version = "0.3.2"

[deps]
Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"
Graphs = "86223c79-3864-5bf0-83f7-82e725a168b6"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
MPI = "da04e1cc-30fd-572f-bb4f-1f8673147195"
Measurements = "eff96d63-e80a-5855-80a2-b1b0885c5ab7"
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
@@ -17,11 +16,10 @@ Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[compat]
Graphs = "1"
MPI = "0.16, 0.19"
MPI = "0.16, 0.19, 0.20"
ProgressMeter = "1"
StaticArrays = "1"
julia = "1.6"
Measurements = "2"
ProgressMeter = "1"

[extras]
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
36 changes: 34 additions & 2 deletions README.md
@@ -168,13 +168,45 @@ The packed variables will be sampled all together in the Markov-chain based solv
Moreover, packed variables usually indicate nontrivial correlations between their distributions. In the future, it will be interesting to learn such correlation so that one can sample the packed variables more efficiently.
# Parallelization
MCIntegration supports both MPI and multi-thread parallelization. You can even mix them if necessary.
MCIntegration supports MPI parallelization. To run your code in MPI mode, simply use the command
## MPI
To run your code in MPI mode, simply use the command,
```bash
mpiexec julia -n #NCPU ./your_script.jl
mpiexec -n #NCPU julia ./your_script.jl
```
where `#NCPU` is the number of workers. Internally, the MC sampler will send the blocks (controlled by the argument `Nblock`, see the example code above) to different workers, then collect the estimates on the root node.
Note that you need to install the package [MPI.jl](https://github.com/JuliaParallel/MPI.jl) to use the MPI mode. See this [link](https://juliaparallel.github.io/MPI.jl/stable/configuration/) for instructions on the configuration.
The user essentially doesn't need to write additional code to support the parallelization. The only tricky part is the output: only the call to `MCIntegration.integrate` on the root node returns meaningful estimates, while the other workers simply return `nothing`.
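For illustration, a minimal MPI-ready script might look like the sketch below. The integrand and the `print` setting are just placeholders, and the explicit MPI initialization is shown defensively; it may already be handled internally by `integrate`.
```julia
# your_script.jl -- a minimal sketch; assumes MPI.jl is installed and configured
using MCIntegration
using MPI

MPI.Initialized() || MPI.Init()   # make sure MPI is initialized in this process

result = integrate((x, c) -> x[1]^2, print=-2)

# Only the root rank receives the estimates; the other workers get `nothing`.
if !isnothing(result)
    println(result)
end
```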
## Multi-threading
MCIntegration supports multi-threading with or without MPI. To run your code with multiple threads, start Julia with
```bash
julia -t #NCPU ./your_script.jl
```
Note that all threads share the same memory. The user-defined `integrand` and `measure` functions should therefore be thread-safe (for example, be very careful about reading any data that another thread might write to). We recommend that the user read Julia's official [documentation](https://docs.julialang.org/en/v1/manual/multi-threading/) on multi-threading.
There are two different ways to parallelize your code with multiple threads.
1. If you need to evaluate multiple integrals, each thread can call `MCIntegration.integrate` to compute one of them. In the following example, three threads evaluate three integrals at once. Note that only three threads will be used even if you start Julia with more than three.
```julia
julia> Threads.@threads for i = 1:3
println("Thread $(Threads.threadid()) returns ", integrate((x, c) -> x[1]^i, print=-2))
end
Thread 2 returns Integral 1 = 0.24995156136254149 ± 6.945088534643841e-5 (chi2/dof = 2.95)
Thread 3 returns Integral 1 = 0.3334287563137184 ± 9.452648803649706e-5 (chi2/dof = 1.35)
Thread 1 returns Integral 1 = 0.5000251243601586 ± 0.00013482206569391864 (chi2/dof = 1.58)
```
2. Only the main thread calls the function `MCIntegration.integrate`, which then parallelizes the internal blocks over multiple threads. To do that, call `MCIntegration.integrate` with the keyword argument `parallel = :thread`. This approach will utilize all Julia threads. For example,
```julia
julia> for i = 1:3
println("Thread $(Threads.threadid()) return ", integrate((x, c) -> x[1]^i, print=-2, parallel=:thread))
end
Thread 1 return Integral 1 = 0.5001880440214347 ± 0.00015058935731086765 (chi2/dof = 0.397)
Thread 1 return Integral 1 = 0.33341068551139696 ± 0.00010109649819894601 (chi2/dof = 1.94)
Thread 1 return Integral 1 = 0.24983868976137244 ± 8.546009018501706e-5 (chi2/dof = 1.54)
```
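As mentioned above, MPI and multi-threading can be combined: launch several MPI workers, give each of them a few Julia threads, and call `integrate` with `parallel=:thread` inside the script. A hedged sketch of the launch command (the worker and thread counts are just examples):
```bash
# 4 MPI workers, each with 2 Julia threads; inside the script, call
# integrate(...; parallel=:thread) so every worker spreads its blocks over its own threads.
mpiexec -n 4 julia -t 2 ./your_script.jl
```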
38 changes: 35 additions & 3 deletions docs/src/index.md
@@ -197,13 +197,45 @@ Moreover, packed variables usually indicate nontrivial correlations between thei
over the infinite interval ``(-\infty, \infty)`` is zero.
# Parallelization
MCIntegration supports both MPI and multi-thread parallelization. You can even mix them if necessary.
MCIntegration supports MPI parallelization. To run your code in MPI mode, simply use the command
## MPI
To run your code in MPI mode, simply use the command,
```bash
mpiexec julia -n #NCPU ./your_script.jl
mpiexec -n #NCPU julia ./your_script.jl
```
where `#NCPU` is the number of workers. Internally, the MC sampler will send the blocks (controlled by the argument `Nblock`, see the example code above) to different workers, then collect the estimates on the root node.
Note that you need to install the package [MPI.jl](https://github.com/JuliaParallel/MPI.jl) to use the MPI mode. See this [link](https://juliaparallel.github.io/MPI.jl/stable/configuration/) for instructions on the configuration.
The user essentially doesn't need to write additional code to support the parallelization. The only tricky part is the output: only the call to `MCIntegration.integrate` on the root node returns meaningful estimates, while the other workers simply return `nothing`.
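If a system-wide `mpiexec` is not available, MPI.jl ships a wrapper script that can be installed once and used instead (a sketch; the worker count is an arbitrary example, and the install location depends on your Julia depot, typically `~/.julia/bin`):
```julia
julia> using MPI; MPI.install_mpiexecjl()   # installs the mpiexecjl wrapper script
```
```bash
~/.julia/bin/mpiexecjl -np 4 julia ./your_script.jl
```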
## Multi-threading
MCIntegration supports multi-threading with or without MPI. To run your code with multiple threads, start Julia with
```bash
julia -t #NCPU ./your_script.jl
```
Note that all threads share the same memory. The user-defined `integrand` and `measure` functions should therefore be thread-safe (for example, be very careful about reading any data that another thread might write to). We recommend that the user read Julia's official [documentation](https://docs.julialang.org/en/v1/manual/multi-threading/) on multi-threading.
There are two different ways to parallelize your code with multiple threads.
1. If you need to evaluate multiple integrals, each thread can call `MCIntegration.integrate` to compute one of them. In the following example, three threads evaluate three integrals at once. Note that only three threads will be used even if you start Julia with more than three. A variant that collects the results into an array instead of printing them is sketched after this list.
```julia
julia> Threads.@threads for i = 1:3
println("Thread $(Threads.threadid()) returns ", integrate((x, c) -> x[1]^i, print=-2))
end
Thread 2 returns Integral 1 = 0.24995156136254149 ± 6.945088534643841e-5 (chi2/dof = 2.95)
Thread 3 returns Integral 1 = 0.3334287563137184 ± 9.452648803649706e-5 (chi2/dof = 1.35)
Thread 1 returns Integral 1 = 0.5000251243601586 ± 0.00013482206569391864 (chi2/dof = 1.58)
```
2. Only the main thread calls the function `MCIntegration.integrate`, which then parallelizes the internal blocks over multiple threads. To do that, call `MCIntegration.integrate` with the keyword argument `parallel = :thread`. This approach will utilize all Julia threads. For example,
```julia
julia> for i = 1:3
println("Thread $(Threads.threadid()) return ", integrate((x, c) -> x[1]^i, print=-2, parallel=:thread))
end
Thread 1 return Integral 1 = 0.5001880440214347 ± 0.00015058935731086765 (chi2/dof = 0.397)
Thread 1 return Integral 1 = 0.33341068551139696 ± 0.00010109649819894601 (chi2/dof = 1.94)
Thread 1 return Integral 1 = 0.24983868976137244 ± 8.546009018501706e-5 (chi2/dof = 1.54)
```
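For the first approach above, printing from several threads works but the output can interleave. A common alternative (a minimal sketch; the integrands are illustrative) is to collect the results into a pre-allocated array, so each thread writes only its own slot and no locking is needed:
```julia
using MCIntegration

results = Vector{Any}(undef, 3)      # one slot per integral
Threads.@threads for i = 1:3
    # each thread writes a distinct index, so the loop body stays thread-safe
    results[i] = integrate((x, c) -> x[1]^i, print=-2)
end
println(results)
```
Writing to distinct elements of an array from different threads is safe, which satisfies the thread-safety requirement noted above.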
2 changes: 1 addition & 1 deletion src/MCIntegration.jl
@@ -5,7 +5,7 @@ using Random
using Graphs
using Test
using ProgressMeter
using Measurements
# using Measurements

const RNG = Random.GLOBAL_RNG
const TINY = eps(Float64(0)) * 1e50 # 4.940656458412466e-274
83 changes: 68 additions & 15 deletions src/configuration.jl
@@ -167,6 +167,11 @@ function Configuration(;
)
end

function reset_seed!(cfg::Configuration, seed::Int)
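    # Reassign the configuration's seed and rebuild its RNG (a MersenneTwister) from it.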
cfg.seed = seed
cfg.rng = MersenneTwister(seed)
end

function _neighbor(neighbor, Nd)
if isnothing(neighbor)
# By default, only the order-1 and order+1 diagrams are considered to be the neighbors
Expand Down Expand Up @@ -230,37 +235,85 @@ function addConfig!(c::Configuration, ic::Configuration)
end
end

function MPIreduceConfig!(c::Configuration, root, comm)
function MPIreduceConfig!(c::Configuration, root=0, comm=MPI.COMM_WORLD)
# Need to reduce from workers:
# neval
# var.histogram
# visited, propose, accept
# normalization, observable

function histogram_reduce!(var::Variable)
if var isa Dist.CompositeVar
for v in var.vars
histogram_reduce!(v)
end
else
histogram = MPI.Reduce(var.histogram, MPI.SUM, root, comm)
if MPI.Comm_rank(comm) == root
var.histogram = histogram
end
MCUtility.MPIreduce!(var.histogram)
end
end

########## variable that could be a number ##############
neval = MPI.Reduce(c.neval, MPI.SUM, root, comm)
normalization = MPI.Reduce(c.normalization, MPI.SUM, root, comm)
observable = [MPI.Reduce(c.observable[o], MPI.SUM, root, comm) for o in eachindex(c.observable)]
if MPI.Comm_rank(comm) == root
c.neval = neval
c.normalization = normalization
c.observable = observable
c.neval = MCUtility.MPIreduce(c.neval) # reduce the amount of communication
c.normalization = MCUtility.MPIreduce(c.normalization) # reduce the amount of communication
for o in eachindex(c.observable)
if c.observable[o] isa AbstractArray
MCUtility.MPIreduce!(c.observable[o]) # avoid memory allocation
else
c.observable[o] = MCUtility.MPIreduce(c.observable[o])
end
end
for v in c.var
histogram_reduce!(v)
end

########## variable that are vectors ##############
MPI.Reduce!(c.visited, MPI.SUM, root, comm)
MPI.Reduce!(c.propose, MPI.SUM, root, comm)
MPI.Reduce!(c.accept, MPI.SUM, root, comm)
MCUtility.MPIreduce!(c.visited)
MCUtility.MPIreduce!(c.propose)
MCUtility.MPIreduce!(c.accept)
end

function MPIbcastConfig!(c::Configuration, root=0, comm=MPI.COMM_WORLD)
# need to broadcast from root to workers:
# reweight
# var.histogram
function histogram_bcast!(var::Variable)
if var isa Dist.CompositeVar
for v in var.vars
histogram_bcast!(v)
end
else
MCUtility.MPIbcast!(var.histogram)
end
end

########## variable that could be a number ##############
MCUtility.MPIbcast(c.reweight)

for v in c.var
histogram_bcast!(v)
end
end

function bcastConfig!(dest::Configuration, src::Configuration)
# need to broadcast from root to workers:
# reweight
# var.histogram
########## variable that could be a number ##############
dest.reweight .= src.reweight

function histogram_bcast!(dest::Variable, src::Variable)
if dest isa Dist.CompositeVar
for i in 1:length(dest.vars)
histogram_bcast!(dest.vars[i], src.vars[i])
end
else
dest.histogram .= src.histogram
end
end

for i in 1:length(dest.var)
histogram_bcast!(dest.var[i], src.var[i])
end
end

function report(config::Configuration, total_neval=nothing)
1 change: 0 additions & 1 deletion src/distribution/variable.jl
@@ -147,7 +147,6 @@ function accumulate!(T::Continuous, idx::Int, weight=1.0)
end
end


"""
Vegas adaptive map
"""

2 comments on commit a58cd3b

@kunyuan (Member, Author) commented on a58cd3b Nov 9, 2022

@JuliaRegistrator

Registration pull request created: JuliaRegistries/General/71946

After the above pull request is merged, it is recommended that a tag is created on this repository for the registered package version.

This will be done automatically if the Julia TagBot GitHub Action is installed, or can be done manually through the github interface, or via:

git tag -a v0.3.2 -m "<description of version>" a58cd3b129b0b0facd1c8044a223e26ac3b08b32
git push origin v0.3.2
