From 9f065a14d6e7470147bcdee10bd70763e8afb565 Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Wed, 7 Jun 2023 18:18:15 +0200
Subject: [PATCH 01/19] feat: deterministic block name and pair ordering

---
 src/PanGraph.jl |  2 ++
 src/align.jl    | 33 +++++++++++++++++++++++----------
 2 files changed, 25 insertions(+), 10 deletions(-)

diff --git a/src/PanGraph.jl b/src/PanGraph.jl
index 923e6557..361b16f9 100644
--- a/src/PanGraph.jl
+++ b/src/PanGraph.jl
@@ -163,6 +163,8 @@ function main(args)
 end
 
 function julia_main()::Cint
+    # initialize random seed for reproducibility
+    seed!(0)
     try
         main(ARGS)
     catch
diff --git a/src/align.jl b/src/align.jl
index 209dfb87..f989a5fd 100644
--- a/src/align.jl
+++ b/src/align.jl
@@ -3,6 +3,7 @@ module Align
 using Rematch, Dates
 using LinearAlgebra
 using ProgressMeter
+using Random
 
 using Base.Threads: @spawn, @threads
 
@@ -26,6 +27,7 @@ end
 # ------------------------------------------------------------------------
 # guide tree for order of pairwise comparison for multiple alignments
 
+
 # TODO: distance?
 """
 	mutable struct Clade
@@ -33,7 +35,7 @@ end
 		parent :: Union{Clade,Nothing}
 		left   :: Union{Clade,Nothing}
 		right  :: Union{Clade,Nothing}
-		graph  :: Channel{Graph}
+		graph  :: Channel{Tuple{Graph,Int}}
 	end
 
 Clade is a node (internal or leaf) of a binary guide tree used to order pairwise alignments
@@ -41,13 +43,16 @@ associated to a multiple genome alignment in progress.
 `name` is only non-empty for leaf nodes.
 `parent` is `nothing` for the root node.
 `graph` is a 0-sized channel that is used as a message passing primitive in alignment.
+    It contains the graph and an index used to decide the order of items in a pair in
+    pairwise graph merge.
 """
+Message=Tuple{Graph,Int}
 mutable struct Clade
     name   :: String
     parent :: Union{Clade,Nothing}
     left   :: Union{Clade,Nothing}
     right  :: Union{Clade,Nothing}
-    graph  :: Channel{Graph}
+    graph  :: Channel{Message}
 end
 
 # ---------------------------
@@ -58,20 +63,20 @@ end
 
 Generate an empty, disconnected clade.
 """
-Clade()     = Clade("",nothing,nothing,nothing,Channel{Graph}(2))
+Clade() = Clade("", nothing, nothing, nothing, Channel{Message}(2))
 """
 	Clade(name)
 
 Generate an empty, disconnected clade with name `name`.
 """
-Clade(name) = Clade(name,nothing,nothing,nothing,Channel{Graph}(2))
+Clade(name) = Clade(name, nothing, nothing, nothing, Channel{Message}(2))
 """
 	Clade(left::Clade, right::Clade)
 
 Generate an nameless clade with `left` and `right` children.
 """
 function Clade(left::Clade, right::Clade)
-    parent = Clade("",nothing,left,right,Channel{Graph}(2))
+    parent = Clade("", nothing, left, right, Channel{Message}(2))
 
     left.parent = parent
     right.parent = parent
@@ -655,15 +660,22 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h
     meter_lock = ReentrantLock()
 
     G = nothing
-    for clade ∈ postorder(tree)
+    for (n_clade, clade) ∈ enumerate(postorder(tree))
         @spawn try
+            Random.seed!(n_clade)
             if isleaf(clade)
                 close(clade.graph)
-                put!(clade.parent.graph, tips[clade.name])
+                msg = (tips[clade.name], n_clade)
+                put!(clade.parent.graph, msg)
             else
-                Gₗ = take!(clade.graph)
-                Gᵣ = take!(clade.graph)
+                Gₗ, Pₗ = take!(clade.graph)
+                Gᵣ, Pᵣ = take!(clade.graph)
                 close(clade.graph)
+                # ensure a consistent ordering of the two graphs,
+                # irrespective of which process is sending the message first.
+                if Pₗ > Pᵣ
+                    Gₗ, Gᵣ = Gᵣ, Gₗ
+                end
 
                 # the lock ensures that at most N=Threads.nthreads() processes are
                 # spawning run(`cmd`) instances at the same time
@@ -678,7 +690,8 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h
                 end
 
                 if clade.parent !== nothing
-                    put!(clade.parent.graph, G₀)
+                    msg = (G₀, n_clade)
+                    put!(clade.parent.graph, msg)
                 else
                     G = G₀
                     close(error_channel)

From 450e987b40f4a7f16c9346cc4050dd71fd0ff4c6 Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Fri, 9 Jun 2023 16:49:34 +0200
Subject: [PATCH 02/19] feat: deterministic graph construction

---
 Manifest.toml |  4 +--
 Project.toml  |  3 +-
 src/align.jl  | 41 +++++++++++++++++++++++++-
 src/block.jl  | 79 ++++++++++++++++++++++++++++++---------------------
 4 files changed, 91 insertions(+), 36 deletions(-)

diff --git a/Manifest.toml b/Manifest.toml
index fd8baaae..23770584 100644
--- a/Manifest.toml
+++ b/Manifest.toml
@@ -247,9 +247,9 @@ uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e"
 version = "0.5.5+0"
 
 [[deps.OrderedCollections]]
-git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c"
+git-tree-sha1 = "d321bf2de576bf25ec4d3e4360faca399afca282"
 uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
-version = "1.4.1"
+version = "1.6.0"
 
 [[deps.PDMats]]
 deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse"]
diff --git a/Project.toml b/Project.toml
index 97e29456..f8d7b2f9 100644
--- a/Project.toml
+++ b/Project.toml
@@ -12,6 +12,7 @@ JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
 Logging = "56ddb016-857b-54e1-b83d-db4d58db5568"
+OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d"
 PackageCompiler = "9b87118b-4619-50d2-8e1e-99f35a4d4d9d"
 Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
@@ -23,4 +24,4 @@ TreeTools = "62f0eae3-8c0e-4032-a621-7756092209e5"
 minimap2_jll = "d341526d-637d-5003-8fc4-9c6812cd2b55"
 
 [compat]
-TreeTools = ">= 0.6.2"
\ No newline at end of file
+TreeTools = ">= 0.6.2"
diff --git a/src/align.jl b/src/align.jl
index f989a5fd..67855824 100644
--- a/src/align.jl
+++ b/src/align.jl
@@ -356,6 +356,31 @@ function preprocess(hits, skip, energy, blocks!)
     return hits
 end
 
+# DEBUG
+function log_alignment(G₁::Graph, G₂::Graph, hits, fname::String)
+    open(fname, "w") do io
+        for G in (G₁, G₂)
+            write(io, "------------ G ------------\n")
+            PC = pancontigs(G)
+            for (name, seq) in zip(PC.name, PC.sequence)
+                write(io, ">$name\n")
+                write(io, seq, "\n")
+            end
+        end
+        write(io, "------------ hits ------------\n")
+        for h in hits
+            write(io, """
+            .........................
+            qry -> $(h.qry.name) | $(h.qry.start) -> $(h.qry.stop) | $(h.qry.length)
+            ref -> $(h.ref.name) | $(h.ref.start) -> $(h.ref.stop) | $(h.ref.length)
+            len -> $(h.length)
+            strand -> $(h.orientation)
+            cigar -> $(h.cigar)
+            """)
+        end
+    end
+end
+
 function do_align(G₁::Graph, G₂::Graph, energy::Function, aligner::Function)
     hits = if G₁ == G₂
         self = pancontigs(G₁)
@@ -364,6 +389,9 @@ function do_align(G₁::Graph, G₂::Graph, energy::Function, aligner::Function)
         aligner(pancontigs(G₁), pancontigs(G₂))
     end
     sort!(hits; by=energy)
+    
+    # DEBUG
+    # log_alignment(G₁, G₂, hits, "issue/minimap/$(randstring(10)).log")
 
     return hits
 end
@@ -662,7 +690,11 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h
     G = nothing
     for (n_clade, clade) ∈ enumerate(postorder(tree))
         @spawn try
+
+            # random seed for the thread - to ensure deterministic reproducibility
+            # in block names
             Random.seed!(n_clade)
+
             if isleaf(clade)
                 close(clade.graph)
                 msg = (tips[clade.name], n_clade)
@@ -676,7 +708,7 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h
                 if Pₗ > Pᵣ
                     Gₗ, Gᵣ = Gᵣ, Gₗ
                 end
-
+                
                 # the lock ensures that at most N=Threads.nthreads() processes are
                 # spawning run(`cmd`) instances at the same time
                 G₀ = lock_semaphore(s) do
@@ -684,6 +716,13 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h
                     align_self(G₀, energy, minblock, aligner, verify, false)
                 end
 
+                # DEBUG : save graph at each iteration in a file
+                # open("issue/comp/graph_iteration_$(n_clade).json", "w") do io
+                #     finalize!(G₀)
+                #     marshal(io, G₀; fmt=:json)
+                # end
+
+
                 # advance progress bar in a thread-safe way
                 lock(meter_lock) do
                     next!(meter)
diff --git a/src/block.jl b/src/block.jl
index f81a302d..2b66f081 100644
--- a/src/block.jl
+++ b/src/block.jl
@@ -4,6 +4,7 @@ using Rematch
 
 import Base:
     show, length, append!, keys, merge!, pop!
+import OrderedCollections: OrderedDict, OrderedSet
 
 # internal modules
 using ..Intervals
@@ -28,11 +29,17 @@ export depth, combine, swap!, check  # operators
 export assertequivalent
 export alignment, diversity
 
-const AlleleMaps{T} = Union{Dict{Node{T},SNPMap},Dict{Node{T},InsMap},Dict{Node{T},DelMap}} 
+const AlleleMaps{T} = Union{OrderedDict{Node{T},SNPMap},OrderedDict{Node{T},InsMap},OrderedDict{Node{T},DelMap}} 
 
 # ------------------------------------------------------------------------
 # utility functions
 
+# DEBUG
+function log(msg...)
+    println(stderr, msg...)
+    flush(stderr)
+end
+
 """
 	applyalleles(seq::Array{UInt8}, mutate::SNPMap, insert::InsMap, delete::DelMap)
 
@@ -270,9 +277,9 @@ end
         uuid     :: String
         sequence :: Array{UInt8}
         gaps     :: Dict{Int,Int}
-        mutate   :: Dict{Node{Block},SNPMap}
-        insert   :: Dict{Node{Block},InsMap}
-        delete   :: Dict{Node{Block},DelMap}
+        mutate   :: OrderedDict{Node{Block},SNPMap}
+        insert   :: OrderedDict{Node{Block},InsMap}
+        delete   :: OrderedDict{Node{Block},DelMap}
     end
 
 Store a multiple sequence alignment of contiguous DNA related by homology.
@@ -286,10 +293,13 @@ mutable struct Block
     uuid     :: String
     sequence :: Array{UInt8}
     gaps     :: Dict{Int,Int}
-    mutate   :: Dict{Node{Block},SNPMap}
-    insert   :: Dict{Node{Block},InsMap}
-    delete   :: Dict{Node{Block},DelMap}
+    mutate   :: OrderedDict{Node{Block},SNPMap}
+    insert   :: OrderedDict{Node{Block},InsMap}
+    delete   :: OrderedDict{Node{Block},DelMap}
 end
+SNPsDict = OrderedDict{Node{Block},SNPMap}
+InsDict  = OrderedDict{Node{Block},InsMap}
+DelDict  = OrderedDict{Node{Block},DelMap}
 
 function show(io::IO, m::Dict{Node{Block}, T}) where T <: Union{SNPMap, InsMap, DelMap}
     print(io, "{\n")
@@ -317,13 +327,13 @@ Block(sequence,gaps,mutate,insert,delete) = Block(random_id(),sequence,gaps,muta
 Construct a block with a unique `uuid` with fixed `sequence` and `gaps`.
 No individuals and thus polymorphisms are initialized.
 """
-Block(sequence,gaps) = Block(sequence,gaps,Dict{Node{Block},SNPMap}(),Dict{Node{Block},InsMap}(),Dict{Node{Block},DelMap}())
+Block(sequence,gaps) = Block(sequence,gaps,SNPsDict(),InsDict(),DelDict())
 """
 	Block(sequence,gaps)
 
 Construct a block with a unique `uuid` with fixed `sequence`.
 """
-Block(sequence) = Block(sequence,Dict{Int,Int}(),Dict{Node{Block},SNPMap}(),Dict{Node{Block},InsMap}(),Dict{Node{Block},DelMap}())
+Block(sequence) = Block(sequence,Dict{Int,Int}(),SNPsDict(),InsDict(),DelDict())
 """
 	Block(sequence,gaps)
 
@@ -333,20 +343,20 @@ Block()         = Block(UInt8[])
 
 # move alleles
 translate(d::Dict{Int,Int}, δ) = Dict(x+δ => v for (x,v) ∈ d) # gaps
-translate(d::Dict{Node{Block},InsMap}, δ) = Dict{Node{Block},InsMap}(n => Dict((x+δ,Δ) => v for ((x,Δ),v) ∈ val) for (n,val) ∈ d) # insertions 
-translate(dict::T, δ) where T <: AlleleMaps{Block} = Dict(key=>Dict(x+δ=>v for (x,v) in val) for (key,val) in dict)
+translate(d::InsDict, δ) = InsDict(n => Dict((x+δ,Δ) => v for ((x,Δ),v) ∈ val) for (n,val) ∈ d) # insertions 
+translate(dict::T, δ) where T <: Union{SNPsDict,DelDict} = T(key=>Dict(x+δ=>v for (x,v) in val) for (key,val) in dict)
 
 # select alleles within window
-lociwithin(dict::Dict{Node{Block},SNPMap}, i) =
-Dict{Node{Block},SNPMap}(
+lociwithin(dict::SNPsDict, i) =
+SNPsDict(
     node => SNPMap(
         locus => allele for (locus,allele) in subdict if i.start ≤ locus ≤ i.stop
     ) for (node, subdict) ∈ dict
 )
 
 # XXX: have to deal with left case i.e. you have a deletion that starts before i.start but continues onwards
-lociwithin(dict::Dict{Node{Block},DelMap}, i) = 
-Dict{Node{Block},DelMap}(
+lociwithin(dict::DelDict, i) = 
+DelDict(
     node => DelMap(
        let
            k = max(locus,i.start) 
@@ -357,9 +367,9 @@ Dict{Node{Block},DelMap}(
 )
 
 # XXX: have to special case the left edge
-function lociwithin(dict::Dict{Node{Block},InsMap}, i) 
+function lociwithin(dict::InsDict, i) 
     i = (i.start == 1) ? (0:i.stop) : i
-    return Dict{Node{Block},InsMap}(
+    return InsDict(
         node => InsMap(
             locus => insert for (locus, insert) in subdict if i.start ≤ first(locus) ≤ i.stop
         ) for (node, subdict) ∈ dict
@@ -372,12 +382,12 @@ function lociwithin(dict::Dict{Int,Int}, i)
 end
 
 # copy dictionary
-Base.copy(dict::T) where T <: AlleleMaps{Block} = Dict(node=>Base.copy(subdict) for (node,subdict) in dict)
+Base.copy(dict::T) where T <: AlleleMaps{Block} = T(node=>Base.copy(subdict) for (node,subdict) in dict)
 
 # merge alleles (recursively)
 function merge!(base::T, others::T...) where T <: AlleleMaps{Block}
     # keys not found in base
-    for node ∈ Set(k for other in others for k ∈ keys(other) if k ∉ keys(base))
+    for node ∈ OrderedSet(k for other in others for k ∈ keys(other) if k ∉ keys(base))
         base[node] = merge((other[node] for other in others if node ∈ keys(other))...)
     end
 
@@ -390,7 +400,7 @@ function merge!(base::T, others::T...) where T <: AlleleMaps{Block}
 end
 
 function merge_cat!(base::InsMap, gaps::Dict{Int,Int}, others::InsMap...)
-    new    = Set(locus for other in others for locus in first.(keys(other)))
+    new    = OrderedSet(locus for other in others for locus in first.(keys(other)))
     shared = intersect(Set(keys(gaps)), new)
 
     length(shared) == 0 && return merge!(base, others...)
@@ -407,10 +417,10 @@ function merge_cat!(base::InsMap, gaps::Dict{Int,Int}, others::InsMap...)
     end
 end
 
-function merge_cat!(base::Dict{Node{Block},InsMap}, others::Dict{Node{Block},InsMap}...)
-    Ks = Set(keys(base))
+function merge_cat!(base::InsDict, others::InsDict...)
+    Ks = OrderedSet(keys(base))
     # keys not found in base
-    for node ∈ Set(k for other in others for k ∈ keys(other) if k ∉ Ks)
+    for node ∈ OrderedSet(k for other in others for k ∈ keys(other) if k ∉ Ks)
         base[node] = merge((other[node] for other in others if node ∈ keys(other))...)
     end
 
@@ -588,9 +598,9 @@ function reverse_complement(b::Block; keepid=false)
     revcmpl(dict::DelMap) = Dict(len-(locus+del-1)+1 => del for (locus,del) in dict)
     revcmpl(dict::InsMap) = Dict((len-locus,b.gaps[locus]-length(ins)-off) => reverse_complement(ins) for ((locus,off),ins) in dict)
 
-    mutate = Dict(node => revcmpl(snp) for (node, snp) in b.mutate)
-    insert = Dict(node => revcmpl(ins) for (node, ins) in b.insert)
-    delete = Dict(node => revcmpl(del) for (node, del) in b.delete)
+    mutate = SNPsDict(node => revcmpl(snp) for (node, snp) in b.mutate)
+    insert = InsDict(node => revcmpl(ins) for (node, ins) in b.insert)
+    delete = DelDict(node => revcmpl(del) for (node, del) in b.delete)
     gaps   = Dict(len-locus => gap  for (locus, gap) in b.gaps)
 
     return keepid ? Block(b.uuid, seq,gaps,mutate,insert,delete) : Block(seq,gaps,mutate,insert,delete)
@@ -776,7 +786,7 @@ function sequence(b::Block, node::Node{Block}; gaps=false, debug=false, forward=
     return (node.strand || forward) ? seq : reverse_complement(seq)
 end
 
-function gapconsensus(insert::Dict{Node{Block},InsMap}, len::Int, x::Int)
+function gapconsensus(insert::InsDict, len::Int, x::Int)
     num = sum(1 for ins in values(insert) for locus in keys(ins) if first(locus) == x; init=0)
     @assert num > 0
 
@@ -968,6 +978,11 @@ function rereference(qry::Block, ref::Block, segments)
         delete = Base.copy(ref.delete),
     )
 
+    # DEBUG
+    # log("type of combined mutate", typeof(combined.mutate))
+    # log("type of combined insert", typeof(combined.insert))
+    # log("type of combined delete", typeof(combined.delete))
+
     map(dict, from, to) = translate(lociwithin(dict, from), to.start-from.start)
 
     x = (
@@ -1048,7 +1063,7 @@ function rereference(qry::Block, ref::Block, segments)
                         end
 
                         for I in unmatched
-                            merge!(combined.delete, Dict(node => Dict(I.lo=>length(I))))
+                            merge!(combined.delete, DelDict(node => Dict(I.lo=>length(I))))
                         end
                     end
 
@@ -1061,7 +1076,7 @@ function rereference(qry::Block, ref::Block, segments)
 
                     newgap = nothing
                 else
-                    newdeletes = Dict(node => Dict(x.ref=>Δ.stop-Δ.start+1) for node ∈ keys(qry))
+                    newdeletes = DelDict(node => Dict(x.ref=>Δ.stop-Δ.start+1) for node ∈ keys(qry))
                     merge!(combined.delete, newdeletes)
                 end
 
@@ -1082,7 +1097,7 @@ function rereference(qry::Block, ref::Block, segments)
 
                 newgap = (x.ref-1, 0)
 
-                newinserts = Dict(
+                newinserts = InsDict(
                     let
                         seq = applyalleles(qry.sequence[Δ], mutate[node], insert[node], delete[node])
                         if length(seq) > 0
@@ -1107,7 +1122,7 @@ function rereference(qry::Block, ref::Block, segments)
             end
             (Δq, Δr) => let # simple translation of alleles of qry -> ref
                 # carry over mutations to qry sequences as long as its different from new reference
-                muts = Dict(
+                muts = SNPsDict(
                     node => Dict(
                          x => nuc for (x,nuc) in subdict if nuc != ref.sequence[x]
                     ) for (node, subdict) in map(qry.mutate,Δq,Δr)
@@ -1117,7 +1132,7 @@ function rereference(qry::Block, ref::Block, segments)
                 # apply mutations to all qry sequences where qry ≠ ref that are not deleted or already mutated
                 qrysnps = findall(qry.sequence[Δq] .!= ref.sequence[Δr])
 
-                newmuts = Dict(
+                newmuts = SNPsDict(
                     node => Dict(
                         Δr.start+(x-1) => qry.sequence[Δq.start+(x-1)] for x in qrysnps if ((Δq.start+(x-1)) ∉ keys(qry.mutate[node])) 
                    ) for node ∈ keys(qry)

From ed8366c26efa899a6b397353bb48764206c5bbc2 Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Mon, 19 Jun 2023 10:08:10 +0200
Subject: [PATCH 03/19] fix: ins-map consistency

---
 src/block.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/block.jl b/src/block.jl
index 2b66f081..6b8216fd 100644
--- a/src/block.jl
+++ b/src/block.jl
@@ -1104,7 +1104,7 @@ function rereference(qry::Block, ref::Block, segments)
                             if (δ+length(seq)) > last(newgap)
                                 newgap = (first(newgap),length(seq)+δ)
                             end
-                            node => Dict((x.ref-1,δ) => seq) 
+                            node => InsMap((x.ref-1,δ) => seq) 
                         else
                             node => InsMap()
                         end

From af5f80473478b4d857215f8bfe029b4760217201 Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Mon, 19 Jun 2023 14:42:09 +0200
Subject: [PATCH 04/19] fix for missing insertions

---
 src/block.jl | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/src/block.jl b/src/block.jl
index 6b8216fd..7ce1aa7c 100644
--- a/src/block.jl
+++ b/src/block.jl
@@ -1068,7 +1068,12 @@ function rereference(qry::Block, ref::Block, segments)
                     end
 
                     delete!(qry.gaps, x.qry-1)
-                    delete!(newgaps, Δ.start-1)
+                    # DEBUG delete only if no more insertions are present here
+                    if Δ.start-1 ∉ [
+                        locus for (node, ins) ∈ combined.insert for (locus, δ) ∈ keys(ins)
+                    ]
+                        delete!(newgaps, Δ.start-1)
+                    end
 
                     if last(newgap) > 0
                         newgaps[newgap[1]] = newgap[2]

From 95a984a19b9dc19c7a4ac0469067aceae0be258b Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Mon, 19 Jun 2023 15:50:01 +0200
Subject: [PATCH 05/19] chore: modified changelog

---
 CHANGELOG.md | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 40834316..58e67687 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,8 +1,10 @@
 # PanGraph Changelog
 
-## v0.6.4 (draft)
+## v0.7.0 (draft)
 
 - fasta input files are checked for duplicated records, and white lines between records are tolerated, see [#55](https://github.com/neherlab/pangraph/pull/55).
+- PanGraph execution is now deterministic, and same input files always produce the same output.
+- Fixed [#56](https://github.com/neherlab/pangraph/issues/56)
 
 ## v0.6.3
 

From e4e4fa8677364e23b2417009bac13af6d6e74a86 Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Mon, 19 Jun 2023 16:39:45 +0200
Subject: [PATCH 06/19] feat: added flag to enforce consistency checks

---
 docs/src/cli/build.md |  25 ++--
 src/align.jl          |  22 ++-
 src/block.jl          |   2 +-
 src/build.jl          | 302 ++++++++++++++++++++++--------------------
 4 files changed, 192 insertions(+), 159 deletions(-)

diff --git a/docs/src/cli/build.md b/docs/src/cli/build.md
index fd4a7f3b..a3e3e5f8 100644
--- a/docs/src/cli/build.md
+++ b/docs/src/cli/build.md
@@ -4,18 +4,19 @@
 Build a multiple sequence alignment pangraph.
 
 ## Options
-| Name                 | Type    | Short Flag | Long Flag        | Description                                                                                               |
-| :------------------- | :------ | :--------- | :--------------- | :-------------------------------------------------------------------------------------------------------- |
-| minimum length       | Integer | l          | len              | minimum block size for alignment graph (in nucleotides)                                                   |
-| block junction cost  | Float   | a          | alpha            | energy cost for introducing block partitions due to alignment merger                                      |
-| block diversity cost | Float   | b          | beta             | energy cost for interblock diversity due to alignment merger                                              |
-| circular genomes     | Boolean | c          | circular         | toggle if input genomes are circular                                                                      |
-| pairwise sensitivity | String  | s          | sensitivity      | controls the pairwise genome alignment sensitivity of minimap 2. Currently only accepts "5", "10" or "20" |
-| maximum self-maps    | Integer | x          | max-self-map     | maximum number of iterations to perform block self maps per pairwise graph merger                         |
-| enforce uppercase    | Boolean | u          | upper-case       | toggle to force genomes to uppercase characters                                                           |
-| distance calculator  | String  | d          | distance-backend | only accepts "native" or "mash"                                                                           |
-| alignment kernel     | String  | k          | alignment-kernel | only accepts "minimap2" or "mmseqs"                                                                       |
-| kmer length (mmseqs) | Integer | K          | kmer-length      | kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default.            |
+| Name                 | Type    | Short Flag | Long Flag        | Description                                                                                                   |
+| :------------------- | :------ | :--------- | :--------------- | :------------------------------------------------------------------------------------------------------------ |
+| minimum length       | Integer | l          | len              | minimum block size for alignment graph (in nucleotides)                                                       |
+| block junction cost  | Float   | a          | alpha            | energy cost for introducing block partitions due to alignment merger                                          |
+| block diversity cost | Float   | b          | beta             | energy cost for interblock diversity due to alignment merger                                                  |
+| circular genomes     | Boolean | c          | circular         | toggle if input genomes are circular                                                                          |
+| pairwise sensitivity | String  | s          | sensitivity      | controls the pairwise genome alignment sensitivity of minimap 2. Currently only accepts "5", "10" or "20"     |
+| maximum self-maps    | Integer | x          | max-self-map     | maximum number of iterations to perform block self maps per pairwise graph merger                             |
+| enforce uppercase    | Boolean | u          | upper-case       | toggle to force genomes to uppercase characters                                                               |
+| distance calculator  | String  | d          | distance-backend | only accepts "native" or "mash"                                                                               |
+| alignment kernel     | String  | k          | alignment-kernel | only accepts "minimap2" or "mmseqs"                                                                           |
+| kmer length (mmseqs) | Integer | K          | kmer-length      | kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default.                |
+| consistency check    | Boolean | v          | verify           | toggle to activate consistency check: verifies that input genomes can be exactly reconstructed from the graph |
 
 ## Arguments
 Expects one or more fasta files.
diff --git a/src/align.jl b/src/align.jl
index 67855824..ba3abf0f 100644
--- a/src/align.jl
+++ b/src/align.jl
@@ -431,7 +431,7 @@ The _lower_ the score, the _better_ the alignment. Only negative energies are co
 `minblock` is the minimum size block that will be produced from the algorithm.
 `maxiter` is maximum number of duplications that will be considered during this alignment.
 """
-function align_self(G₁::Graph, energy::Function, minblock::Int, aligner::Function, verify::Function, verbose::Bool; maxiter=100, sensitivity="asm10")
+function align_self(G₁::Graph, energy::Function, minblock::Int, aligner::Function, verify::Function, verbose::Bool; maxiter=100)
     G₀ = G₁
 
     for niter in 1:maxiter
@@ -473,6 +473,9 @@ function align_self(G₁::Graph, energy::Function, minblock::Int, aligner::Funct
         detransitive!(G₀)
         purge!(G₀)
         prune!(G₀)
+
+        # verify that isolates are correctly reconstructed (-v flag)
+        verify(G₀, msg="verify align-self $niter")
     end
 
     return G₀
@@ -606,6 +609,9 @@ function align_pair(G₁::Graph, G₂::Graph, energy::Function, minblock::Int, a
     purge!(G)
     prune!(G)
 
+    # verify that isolates are correctly reconstructed (-v flag)
+    verify(G, msg="verify align-pair")
+
     return G
 end
 
@@ -625,8 +631,8 @@ The _lower_ the score, the _better_ the alignment. Only negative energies are co
 
 `compare` is the function to be used to generate pairwise distances that generate the internal guide tree.
 """
-function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100)
-    function verify(graph, msg="")
+function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false)
+    function verify(graph; msg="")
         if reference !== nothing
             for (name,path) ∈ graph.sequence
                 seq = sequence(path)
@@ -663,7 +669,7 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h
                     println("--> insert:           $(path.node[i].block.insert[path.node[i]])")
                     println("--> delete:           $(path.node[i].block.delete[path.node[i]])")
 
-                    error("--> isolate '$name' incorrectly reconstructed")
+                    error("$msg\n--> isolate '$name' incorrectly reconstructed")
                 end
             end
         end
@@ -712,8 +718,12 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h
                 # the lock ensures that at most N=Threads.nthreads() processes are
                 # spawning run(`cmd`) instances at the same time
                 G₀ = lock_semaphore(s) do
-                    G₀ = align_pair(Gₗ, Gᵣ, energy, minblock, aligner, verify, false)
-                    align_self(G₀, energy, minblock, aligner, verify, false)
+                    verbose && log("--> align-pair for clade n. $n_clade")
+                    G₀ = align_pair(Gₗ, Gᵣ, energy, minblock, aligner, verify, verbose)
+                    verbose && log("--> align-self for clade n. $n_clade")
+                    G₀ = align_self(G₀, energy, minblock, aligner, verify, verbose, maxiter=maxiter)
+                    verbose && log("--> graph merging for clade n. $n_clade completed")
+                    G₀
                 end
 
                 # DEBUG : save graph at each iteration in a file
diff --git a/src/block.jl b/src/block.jl
index 7ce1aa7c..130596ab 100644
--- a/src/block.jl
+++ b/src/block.jl
@@ -1068,7 +1068,7 @@ function rereference(qry::Block, ref::Block, segments)
                     end
 
                     delete!(qry.gaps, x.qry-1)
-                    # DEBUG delete only if no more insertions are present here
+                    # DEBUG delete only if no more insertions are present in this gap
                     if Δ.start-1 ∉ [
                         locus for (node, ins) ∈ combined.insert for (locus, δ) ∈ keys(ins)
                     ]
diff --git a/src/build.jl b/src/build.jl
index 30c6ef42..87064bba 100644
--- a/src/build.jl
+++ b/src/build.jl
@@ -1,162 +1,184 @@
 Build = Command(
-   "build",
-   "pangraph build <options> [arguments]",
-   "align genomes into a multiple sequence alignment graph",
-   """zero, one or more fasta files.
-      files can be optionally gzipped.
-      multiple records within one file are treated as seperate genomes.
-      if no file is given, reads fasta record from stdin""",
-   [
-    Arg(
-        Int,
-        "minimum length",
-        (short="-l", long="--len"),
-        "minimum block size for alignment graph (in nucleotides)",
-        100,
-    ),
-    Arg(
-        Float64,
-        "block junction cost",
-        (short="-a", long="--alpha"),
-        "energy cost for introducing junction due to alignment merger",
-        100,
-    ),
-    Arg(
-        Float64,
-        "block diversity cost",
-        (short="-b", long="--beta"),
-        "energy cost for interblock diversity due to alignment merger",
-        10,
-    ),
-    Arg(
-        Bool,
-        "circular genomes",
-        (short="-c", long="--circular"),
-        "toggle if input genomes are circular",
-        false,
-    ),
-    Arg(
-        Bool,
-        "enforce uppercase",
-        (short="-u", long="--upper-case"),
-        "transforms all sequence to upper case",
-        false,
-    ),
-    Arg(
-        String,
-        "pairwise sensitivity",
-        (short="-s", long="--sensitivity"),
-        "used to set pairwise alignment sensitivity\n\trecognized options: [5, 10, 20]",
-        "10",
-    ),
-    Arg(
-        Int,
-        "maximum self maps",
-        (short="-x", long="--max-self-map"),
-        "maximum number of self mappings to consider per pairwise graph merger",
-        100,
-    ),
-    Arg(
-        String,
-        "distance calculator",
-        (short="-d", long="--distance-backend"),
-        "backend to use to estimate pairwise distance for guide tree\n\trecognized options: [native, mash]",
-        "native",
-    ),
-    Arg(
-        String,
-        "alignment kernel",
-        (short="-k", long="--alignment-kernel"),
-        "backend to use for pairwise genome alignment\n\trecognized options: [minimap2, mmseqs]",
-        "minimap2",
-    ),
-    Arg(
-        Int,
-        "k-mer length",
-        (short="-K", long="--kmer-length"),
-        "kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default.",
-        0,
-    )
-   ],
+    "build",
+    "pangraph build <options> [arguments]",
+    "align genomes into a multiple sequence alignment graph",
+    """zero, one or more fasta files.
+       files can be optionally gzipped.
+       multiple records within one file are treated as seperate genomes.
+       if no file is given, reads fasta record from stdin""",
+    [
+        Arg(
+            Int,
+            "minimum length",
+            (short = "-l", long = "--len"),
+            "minimum block size for alignment graph (in nucleotides)",
+            100,
+        ),
+        Arg(
+            Float64,
+            "block junction cost",
+            (short = "-a", long = "--alpha"),
+            "energy cost for introducing junction due to alignment merger",
+            100,
+        ),
+        Arg(
+            Float64,
+            "block diversity cost",
+            (short = "-b", long = "--beta"),
+            "energy cost for interblock diversity due to alignment merger",
+            10,
+        ),
+        Arg(
+            Bool,
+            "circular genomes",
+            (short = "-c", long = "--circular"),
+            "toggle if input genomes are circular",
+            false,
+        ),
+        Arg(
+            Bool,
+            "enforce uppercase",
+            (short = "-u", long = "--upper-case"),
+            "transforms all sequence to upper case",
+            false,
+        ),
+        Arg(
+            String,
+            "pairwise sensitivity",
+            (short = "-s", long = "--sensitivity"),
+            "used to set pairwise alignment sensitivity\n\trecognized options: [5, 10, 20]",
+            "10",
+        ),
+        Arg(
+            Int,
+            "maximum self maps",
+            (short = "-x", long = "--max-self-map"),
+            "maximum number of self mappings to consider per pairwise graph merger",
+            100,
+        ),
+        Arg(
+            String,
+            "distance calculator",
+            (short = "-d", long = "--distance-backend"),
+            "backend to use to estimate pairwise distance for guide tree\n\trecognized options: [native, mash]",
+            "native",
+        ),
+        Arg(
+            String,
+            "alignment kernel",
+            (short = "-k", long = "--alignment-kernel"),
+            "backend to use for pairwise genome alignment\n\trecognized options: [minimap2, mmseqs]",
+            "minimap2",
+        ),
+        Arg(
+            Int,
+            "k-mer length",
+            (short = "-K", long = "--kmer-length"),
+            "kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default.",
+            0,
+        ),
+        Arg(
+            Bool,
+            "verify that input genomes can be reconstructed from output",
+            (short = "-v", long = "--verify"),
+            "toggle to activate consistency check at each step of the graph merging process",
+            false,
+        ),
+    ],
+    (args) -> let
+        files = parse(Build, args)
+        files = if files === nothing || length(files) == 0
+            ["/dev/stdin"]
+        else
+            files
+        end
 
-   (args) -> let
-       files = parse(Build, args)
-       files = if files === nothing || length(files) == 0
-           ["/dev/stdin"]
-       else
-           files
-       end
+        minblock = arg(Build, "-l")
+        circular = arg(Build, "-c")
+        uppercase = arg(Build, "-u")
 
-       minblock  = arg(Build, "-l")
-       circular  = arg(Build, "-c")
-       uppercase = arg(Build, "-u")
+        α = arg(Build, "-a")
+        β = arg(Build, "-b")
 
-       α = arg(Build, "-a")
-       β = arg(Build, "-b")
+        energy = function (aln)
+            len = aln.length
+            len < minblock && return Inf
 
-       energy = function(aln)
-           len = aln.length
-           len < minblock && return Inf
+            cuts(hit) = (hit.start > minblock) + ((hit.length - hit.stop) > minblock)
 
-           cuts(hit) = (hit.start > minblock) + ((hit.length-hit.stop) > minblock)
+            ncuts = cuts(aln.qry) + cuts(aln.ref)
+            nmuts = aln.divergence * aln.length
 
-           ncuts = cuts(aln.qry)+cuts(aln.ref)
-           nmuts = aln.divergence*aln.length
+            return -len + α * ncuts + β * nmuts
+        end
 
-           return -len + α*ncuts + β*nmuts
-       end
-
-       maxiter = arg(Build, "-x")
-       sensitivity = @match arg(Build, "-s") begin
-           "5"  => "asm5"
-           "10" => "asm10"
-           "20" => "asm20"
-            _   => begin
+        maxiter = arg(Build, "-x")
+        sensitivity = @match arg(Build, "-s") begin
+            "5" => "asm5"
+            "10" => "asm10"
+            "20" => "asm20"
+            _ => begin
                 usage(Build)
                 exit(1)
             end
-       end
+        end
 
-       graph(io) = graphs(io; circular=circular, upper=uppercase)
-       isolates  = (G for file in files for G ∈ open(graph,file))
+        graph(io) = graphs(io; circular = circular, upper = uppercase)
+        isolates = (G for file in files for G ∈ open(graph, file))
 
-       compare = @match arg(Build, "-d") begin
-           "native" => Graphs.Mash.distance
-           "mash"   => begin
-               if !Graphs.havecommand("mash")
-                   panic("external command mash not found. either install or use native backend\n")
-               end
-               Graphs.mash
-           end
-           _        => begin
+        compare = @match arg(Build, "-d") begin
+            "native" => Graphs.Mash.distance
+            "mash" => begin
+                if !Graphs.havecommand("mash")
+                    panic(
+                        "external command mash not found. either install or use native backend\n",
+                    )
+                end
+                Graphs.mash
+            end
+            _ => begin
                 # XXX: hacky...
                 Build.arg[8].value = "native"
 
                 usage(Build)
                 exit(1)
-           end
-       end
+            end
+        end
+
+        aligner(contigs₁, contigs₂) = @match arg(Build, "-k") begin
+            "minimap2" => Minimap.align(contigs₁, contigs₂, minblock, sensitivity)
+            "mmseqs" => let
+                if !Shell.havecommand("mmseqs")
+                    panic(
+                        "external command mmseqs not found. please install before running build step with mmseqs backend\n",
+                    )
+                end
+                kmer_len = arg(Build, "-K")
+                MMseqs.align(contigs₁, contigs₂, kmer_len)
+            end
+            _ => error("unrecognized alignment kernel")
+        end
 
-       aligner(contigs₁, contigs₂) = @match arg(Build, "-k") begin
-           "minimap2" => Minimap.align(contigs₁, contigs₂, minblock, sensitivity)
-           "mmseqs"   => let
-               if !Shell.havecommand("mmseqs")
-                   panic("external command mmseqs not found. please install before running build step with mmseqs backend\n")
-               end
-               kmer_len = arg(Build, "-K")
-               MMseqs.align(contigs₁, contigs₂, kmer_len)
-           end
-                    _ => error("unrecognized alignment kernel")
-       end
+        reference =
+            arg(Build, "-v") ?
+            Dict(begin
+                    key = collect(keys(G.sequence))[1]
+                    seq = sequence(collect(values(G.sequence))[1])
+                    key => seq
+                end for G in isolates) : nothing
 
-       graph = Graphs.align(aligner, isolates...;
-            compare     = compare,
-            energy      = energy,
-            minblock    = minblock,
-            maxiter     = maxiter,
-       )
-       finalize!(graph)
+        graph = Graphs.align(
+            aligner,
+            isolates...;
+            compare = compare,
+            energy = energy,
+            minblock = minblock,
+            maxiter = maxiter,
+            reference = reference,
+            verbose = false,
+        )
+        finalize!(graph)
 
-       marshal(stdout, graph; fmt=:json)
-   end
+        marshal(stdout, graph; fmt = :json)
+    end,
 )

From f7c0ae200a6e4433b136ddc3c9b5821963780379 Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Mon, 19 Jun 2023 16:41:46 +0200
Subject: [PATCH 07/19] feat: activate consistency check at trace

---
 trace.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/trace.jl b/trace.jl
index a5295b86..90032cc6 100644
--- a/trace.jl
+++ b/trace.jl
@@ -15,9 +15,9 @@ PanGraph.main(["help", "version"])		   # version usage
 PanGraph.main(["help", "help"])		   # help usage
 
 # build (native - mmseqs)
-PanGraph.main(["build", "-c", "-u", "-b", "0", "-a", "0", "$root/test.fa"])
+PanGraph.main(["build", "-v", "-c", "-u", "-b", "0", "-a", "0", "$root/test.fa"])
 # PanGraph.main(["build", "-c", "-u", "-d", "mash", "$root/test.fa"])
-PanGraph.main(["build", "-c", "-u", "-b", "0", "-a", "0", "-k", "mmseqs", "-K", "8", "$root/test.fa"])
+PanGraph.main(["build", "-v", "-c", "-u", "-b", "0", "-a", "0", "-k", "mmseqs", "-K", "8", "$root/test.fa"])
 
 # export
 PanGraph.main(["export", "-o", "$root/export", "$root/test.json"])

From 10ba8e6b6968087acfba670eab78baef193e1f8f Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Mon, 19 Jun 2023 16:52:01 +0200
Subject: [PATCH 08/19] chore: updated version and changelog

---
 CHANGELOG.md | 3 ++-
 Project.toml | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 58e67687..ae974c9b 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,8 @@
 ## v0.7.0 (draft)
 
 - fasta input files are checked for duplicated records, and white lines between records are tolerated, see [#55](https://github.com/neherlab/pangraph/pull/55).
-- PanGraph execution is now deterministic, and same input files always produce the same output.
+- PanGraph execution is now deterministic, and same input files always produce the same output, see [#57](https://github.com/neherlab/pangraph/pull/57).
+- introduced the `-v` flag in the `build` command. This activates consistency checks at each mergers. In this check we verify that the input genomes can be exactly reconstructed from the graph. See [#57](https://github.com/neherlab/pangraph/pull/57).
 - Fixed [#56](https://github.com/neherlab/pangraph/issues/56)
 
 ## v0.6.3
diff --git a/Project.toml b/Project.toml
index f8d7b2f9..171bc211 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "PanGraph"
 uuid = "0f9f61ca-f32c-45e1-b3bc-00138f4f8814"
 authors = ["Nicholas Noll <nbnoll@eml.cc>"]
-version = "0.6.3"
+version = "0.7.0"
 
 [deps]
 Dates = "ade2ca70-3891-5945-98fb-dc099432e06a"

From 59112a538e7b9829b73739a755979a6e0f021141 Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Mon, 19 Jun 2023 17:04:33 +0200
Subject: [PATCH 09/19] chore: added -v flag to cli-tests

---
 CHANGELOG.md           | 2 +-
 tests/run-cli-tests.sh | 6 +++---
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index ae974c9b..d72dde7f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,6 +1,6 @@
 # PanGraph Changelog
 
-## v0.7.0 (draft)
+## v0.7.0
 
 - fasta input files are checked for duplicated records, and white lines between records are tolerated, see [#55](https://github.com/neherlab/pangraph/pull/55).
 - PanGraph execution is now deterministic, and same input files always produce the same output, see [#57](https://github.com/neherlab/pangraph/pull/57).
diff --git a/tests/run-cli-tests.sh b/tests/run-cli-tests.sh
index fc171a6b..991ef2cc 100644
--- a/tests/run-cli-tests.sh
+++ b/tests/run-cli-tests.sh
@@ -49,14 +49,14 @@ pangraph generate -s 100 -r 1e-1 -m 1e-3 -d 5e-2 -i 1e-2 -t 5 "$TESTDIR/randseqs
 
 echo "Test pangraph build - minimap asm20 no energy"
 export JULIA_NUM_THREADS=4
-pangraph build -c -k minimap2 -s 20 -a 0 -b 0 "$TESTDIR/input.fa" > "$TESTDIR/test1.json"
+pangraph build -v -c -k minimap2 -s 20 -a 0 -b 0 "$TESTDIR/input.fa" > "$TESTDIR/test1.json"
 
 echo "Test pangraph build - mash"
-pangraph build -c -d mash "$TESTDIR/input.fa" > "$TESTDIR/test2.json"
+pangraph build -v -c -d mash "$TESTDIR/input.fa" > "$TESTDIR/test2.json"
 
 echo "Test pangraph build - mmseqs"
 export JULIA_NUM_THREADS=1
-pangraph build -c -k mmseqs -K 8 "$TESTDIR/input.fa" > "$TESTDIR/test3.json"
+pangraph build -v -c -k mmseqs -K 8 "$TESTDIR/input.fa" > "$TESTDIR/test3.json"
 
 echo "Test pangraph polish"
 pangraph polish -c -l 10000 "$TESTDIR/test1.json" > "$TESTDIR/polished.json"

From a2fe79efbdcfff664ece5f2394b176917491cf36 Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Mon, 19 Jun 2023 18:42:03 +0200
Subject: [PATCH 10/19] feat: OrderedDict in unmarshall

---
 src/graph.jl | 8 +++++---
 1 file changed, 5 insertions(+), 3 deletions(-)

diff --git a/src/graph.jl b/src/graph.jl
index 13664c6d..171aeef1 100644
--- a/src/graph.jl
+++ b/src/graph.jl
@@ -101,6 +101,8 @@ using .Intervals
 import .Shell: mash, mafft, havecommand
 import ..PanGraph: PanContigs
 
+using OrderedCollections: OrderedDict
+
 export Graph
 export Shell, Blocks, Nodes, Utility
 
@@ -508,9 +510,9 @@ function unmarshal(io)
             b.sequence,
             b.gaps,
             # empty until we build the required node{block} objects
-            Dict{Node{Block},SNPMap}(),
-            Dict{Node{Block},InsMap}(),
-            Dict{Node{Block},DelMap}(),
+            OrderedDict{Node{Block},SNPMap}(),
+            OrderedDict{Node{Block},InsMap}(),
+            OrderedDict{Node{Block},DelMap}(),
         )
     end)
 

From 88b1411933026c2d40440aa68b34ddda2367635a Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Tue, 20 Jun 2023 17:25:22 +0200
Subject: [PATCH 11/19] feat: implement deep copy for graphs

---
 src/graph.jl       | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 src/marginalize.jl |  2 +-
 2 files changed, 47 insertions(+), 1 deletion(-)

diff --git a/src/graph.jl b/src/graph.jl
index 171aeef1..ddee8683 100644
--- a/src/graph.jl
+++ b/src/graph.jl
@@ -156,6 +156,52 @@ function Graph(name::String, sequence::Array{UInt8}; circular=false)
     )
 end
 
+"""
+    copy(G::Graph)
+
+Returns a deep copy of `G`.
+"""
+function copy(G::Graph)
+    blocks = deepcopy(G.block)
+    paths = deepcopy(G.sequence)
+    nodes_match = Dict{Node, Node}()
+    for (name, path) in G.sequence
+        new_nodes = Node[]
+        for node in path.node
+            if node ∉ keys(nodes_match)
+                nodes_match[node] = Node(blocks[node.block.uuid], node.strand)
+            end
+            push!(new_nodes, nodes_match[node])
+        end
+        paths[name].node = new_nodes
+    end
+    for (uuid, block) in G.block
+
+        new_ins = OrderedDict{Node, InsMap}()
+        for (k, v) in block.insert
+            new_node = nodes_match[k]
+            new_ins[new_node] = deepcopy(v)
+        end
+        blocks[uuid].insert = new_ins
+
+        new_del = OrderedDict{Node, DelMap}()
+        for (k, v) in block.delete
+            new_node = nodes_match[k]
+            new_del[new_node] = deepcopy(v)
+        end
+        blocks[uuid].delete = new_del
+
+        new_snp = OrderedDict{Node, SNPMap}()
+        for (k, v) in block.mutate
+            new_node = nodes_match[k]
+            new_snp[new_node] = deepcopy(v)
+        end
+        blocks[uuid].mutate = new_snp
+
+    end
+    return Graph(blocks, paths)
+end
+
 
 """
 Utility function that raises an error if the list of records has entries with duplicated
diff --git a/src/marginalize.jl b/src/marginalize.jl
index 3cb73f1d..7ee6353f 100644
--- a/src/marginalize.jl
+++ b/src/marginalize.jl
@@ -47,7 +47,7 @@ Marginalize = Command(
            pairs  = [ (n₁,n₂) for n₁ in names for n₂ in names if n₁ < n₂ ]
 
            Threads.@threads for (name₁, name₂) in pairs
-               G = deepcopy(graph)
+               G = Graphs.copy(graph)
                Graphs.keeponly!(G, name₁, name₂)
                if reduce
                    changed = true

From a9439fba35d572f2d460ff03dabedf9bce735410 Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Tue, 20 Jun 2023 17:50:01 +0200
Subject: [PATCH 12/19] feat: added consistency check in marginalize

---
 docs/src/cli/marginalize.md |   1 +
 src/graph.jl                |   4 +-
 src/marginalize.jl          | 201 +++++++++++++++++++++---------------
 3 files changed, 119 insertions(+), 87 deletions(-)

diff --git a/docs/src/cli/marginalize.md b/docs/src/cli/marginalize.md
index 60256e6d..f8604293 100644
--- a/docs/src/cli/marginalize.md
+++ b/docs/src/cli/marginalize.md
@@ -9,6 +9,7 @@ Compute all pairwise marginalizations of a multiple sequence alignment pangraph.
 | Output path        | String  | o          | output-path    | Path to direcotry where the output of all pairwise mariginalizations will be stored if supplied                                        |
 | Reduce paralogs    | Boolean | r          | reduce-paralog | Collapses coparallel paths through duplicated blocks.                                                                                  |
 | Projection strains | String  | s          | Strains        | Collapses the graph structure to only blocks and edges contained by the paths of the supplied strain names. comma seperated, no spaces |
+| Consistency check  | Boolean | v          | verify         | toggle to activate consistency check: verifies that output genomes are exactly equal to input genomes                                  |
 
 ## Arguments
 Zero or one pangraph file which must be formatted as a JSON.
diff --git a/src/graph.jl b/src/graph.jl
index ddee8683..ad3814b7 100644
--- a/src/graph.jl
+++ b/src/graph.jl
@@ -157,11 +157,11 @@ function Graph(name::String, sequence::Array{UInt8}; circular=false)
 end
 
 """
-    copy(G::Graph)
+    copy_graph(G::Graph)
 
 Returns a deep copy of `G`.
 """
-function copy(G::Graph)
+function copy_graph(G::Graph)
     blocks = deepcopy(G.block)
     paths = deepcopy(G.sequence)
     nodes_match = Dict{Node, Node}()
diff --git a/src/marginalize.jl b/src/marginalize.jl
index 7ee6353f..3d133d5c 100644
--- a/src/marginalize.jl
+++ b/src/marginalize.jl
@@ -1,94 +1,125 @@
 Marginalize = Command(
-   "marginalize",
-   "pangraph marginalize <options> [arguments]",
-   "computes all pairwise marginalizations of a multiple sequence alignment graph",
-   """multiple sequence alignment accepted in formats: [json]""",
-   [
-    Arg(
-        String,
-        "output path",
-        (short="-o", long="--output-path"),
-        "path to directory where all pairwise marginalizations will be stored\n\tif empty, will skip this computation",
-        ""
-    ),
-    Arg(
-        Bool,
-        "reduce paralog paths",
-        (short="-r", long="--reduce-paralog"),
-        "collapse coparallel paths through duplications",
-        false,
-    ),
-    Arg(
-        String,
-        "isolates to project onto",
-        (short="-s", long="--strains"),
-        "collapse the graph to only blocks contained by paths of the given isolates.\n\tcomma seperated list, no spaces",
-        "",
-    ),
-   ],
-   (args) -> let
-       path = parse(Marginalize, args)
-       path = if (path === nothing || length(path) == 0)
-           nothing
-       elseif length(path) == 1
-           path
-       else
-           return 2
-       end
+    "marginalize",
+    "pangraph marginalize <options> [arguments]",
+    "computes all pairwise marginalizations of a multiple sequence alignment graph",
+    """multiple sequence alignment accepted in formats: [json]""",
+    [
+        Arg(
+            String,
+            "output path",
+            (short = "-o", long = "--output-path"),
+            "path to directory where all pairwise marginalizations will be stored\n\tif empty, will skip this computation",
+            "",
+        ),
+        Arg(
+            Bool,
+            "reduce paralog paths",
+            (short = "-r", long = "--reduce-paralog"),
+            "collapse coparallel paths through duplications",
+            false,
+        ),
+        Arg(
+            String,
+            "isolates to project onto",
+            (short = "-s", long = "--strains"),
+            "collapse the graph to only blocks contained by paths of the given isolates.\n\tcomma seperated list, no spaces",
+            "",
+        ),
+        Arg(
+            Bool,
+            "check that genomes are preserved",
+            (short = "-v", long = "--verify"),
+            "sanity check that verifies that genomes in the output graphs are identical\n\tto genomes in the input graphs.",
+            false,
+        ),
+    ],
+    (args) -> let
+        path = parse(Marginalize, args)
+        path = if (path === nothing || length(path) == 0)
+            nothing
+        elseif length(path) == 1
+            path
+        else
+            return 2
+        end
 
-       graph  = load(path, Marginalize)
-       names  = collect(keys(graph.sequence))
+        graph = load(path, Marginalize)
+        names = collect(keys(graph.sequence))
 
-       reduce = arg(Marginalize, "-r")
-       output = arg(Marginalize, "-o")
+        reduce = arg(Marginalize, "-r")
+        output = arg(Marginalize, "-o")
+        verify_flag = arg(Marginalize, "-v")
 
-       if length(output) > 0
-           isdir(output) || mkpath(output)
-           pairs  = [ (n₁,n₂) for n₁ in names for n₂ in names if n₁ < n₂ ]
+        function verify(G₀, G₋)
+            # verifies that after the marginalization all genomes from
+            # G₋ are exactly equal to their initial value in in G₀
+            for (name, path) in G₋.sequence
+                seq₀ = sequence(G₀.sequence[name], shift = true)
+                seq₋ = sequence(path, shift = true)
+                @assert seq₀ == seq₋
+            end
+        end
 
-           Threads.@threads for (name₁, name₂) in pairs
-               G = Graphs.copy(graph)
-               Graphs.keeponly!(G, name₁, name₂)
-               if reduce
-                   changed = true
-                   while changed
-                       l = length(G.block)
-                       Graphs.deparalog!(G)
-                       Graphs.detransitive!(G)
-                       changed = l != length(G.block)
-                   end
-               else
-                   Graphs.detransitive!(G)
-               end
+        function log(msg...)
+            println(stderr, msg...)
+            flush(stderr)
+        end
 
-               # recompute positions
-               Graphs.finalize!(G)
-               open("$(output)/$(name₁)-$(name₂).json", "w") do io
-                   marshal(io, G; fmt=:json)
-               end
-           end
-       end
+        if length(output) > 0
+            isdir(output) || mkpath(output)
+            pairs = [(n₁, n₂) for n₁ in names for n₂ in names if n₁ < n₂]
 
-       isolates = arg(Marginalize, "-s")
-       if length(isolates) > 0
-           names = split(isolates,',')
-           Graphs.keeponly!(graph, names...)
+            Threads.@threads for (name₁, name₂) in pairs
+                G = Graphs.copy_graph(graph)
+                Graphs.keeponly!(G, name₁, name₂)
+                if reduce
+                    changed = true
+                    while changed
+                        l = length(G.block)
+                        Graphs.deparalog!(G)
+                        Graphs.detransitive!(G)
+                        changed = l != length(G.block)
+                    end
+                else
+                    Graphs.detransitive!(G)
+                end
 
-           if reduce
-               changed = true
-               while changed
-                   l = length(graph.block)
-                   Graphs.deparalog!(graph)
-                   Graphs.detransitive!(graph)
-                   changed = l != length(graph.block)
-               end
-           else
-               Graphs.detransitive!(graph)
-           end
-           
-           # recompute positions
-           Graphs.finalize!(graph)
-           marshal(stdout, graph; fmt=:json)
-       end
-   end
+                # recompute positions
+                Graphs.finalize!(G)
+
+                verify_flag && verify(graph, G)
+
+                open("$(output)/$(name₁)-$(name₂).json", "w") do io
+                    marshal(io, G; fmt = :json)
+                end
+            end
+        end
+
+        isolates = arg(Marginalize, "-s")
+        if length(isolates) > 0
+
+            G = Graphs.copy(graph)
+            names = split(isolates, ',')
+            Graphs.keeponly!(G, names...)
+
+            if reduce
+                changed = true
+                while changed
+                    l = length(G.block)
+                    Graphs.deparalog!(G)
+                    Graphs.detransitive!(G)
+                    changed = l != length(G.block)
+                end
+            else
+                Graphs.detransitive!(G)
+            end
+
+            # recompute positions
+            Graphs.finalize!(G)
+
+            verify_flag && verify(graph, G)
+
+            marshal(stdout, G; fmt = :json)
+        end
+    end,
 )

From 067789fa478ff6ce0dbbe1055bb617b126f2cc96 Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Tue, 20 Jun 2023 17:52:47 +0200
Subject: [PATCH 13/19] fix: copy -> copy_graph

---
 src/marginalize.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/marginalize.jl b/src/marginalize.jl
index 3d133d5c..b65ea80c 100644
--- a/src/marginalize.jl
+++ b/src/marginalize.jl
@@ -98,7 +98,7 @@ Marginalize = Command(
         isolates = arg(Marginalize, "-s")
         if length(isolates) > 0
 
-            G = Graphs.copy(graph)
+            G = Graphs.copy_graph(graph)
             names = split(isolates, ',')
             Graphs.keeponly!(G, names...)
 

From 08ef580db3c7c59165337f908f4ff42724d091d5 Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Tue, 20 Jun 2023 18:01:37 +0200
Subject: [PATCH 14/19] chore: changelog and cli-tests

---
 CHANGELOG.md           | 2 +-
 tests/run-cli-tests.sh | 2 +-
 trace.jl               | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index d72dde7f..cf04ecba 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,7 +4,7 @@
 
 - fasta input files are checked for duplicated records, and white lines between records are tolerated, see [#55](https://github.com/neherlab/pangraph/pull/55).
 - PanGraph execution is now deterministic, and same input files always produce the same output, see [#57](https://github.com/neherlab/pangraph/pull/57).
-- introduced the `-v` flag in the `build` command. This activates consistency checks at each mergers. In this check we verify that the input genomes can be exactly reconstructed from the graph. See [#57](https://github.com/neherlab/pangraph/pull/57).
+- introduced the `-v` flag in the `build` and `merge` command. This activates consistency checks to verify that the input genomes can be exactly reconstructed. See [#57](https://github.com/neherlab/pangraph/pull/57).
 - Fixed [#56](https://github.com/neherlab/pangraph/issues/56)
 
 ## v0.6.3
diff --git a/tests/run-cli-tests.sh b/tests/run-cli-tests.sh
index 991ef2cc..f34673d8 100644
--- a/tests/run-cli-tests.sh
+++ b/tests/run-cli-tests.sh
@@ -68,7 +68,7 @@ echo "Test pangraph PanX export"
 pangraph export -ng -pX -o "$TESTDIR/export" "$TESTDIR/test1.json"
 
 echo "Test pangraph marginalize"
-pangraph marginalize -o "$TESTDIR/marginalize" "$TESTDIR/test1.json"
+pangraph marginalize -v -o "$TESTDIR/marginalize" "$TESTDIR/test1.json"
 
 # remove generated files
 rm -r $TESTDIR
diff --git a/trace.jl b/trace.jl
index 90032cc6..0f1c8d3a 100644
--- a/trace.jl
+++ b/trace.jl
@@ -26,7 +26,7 @@ PanGraph.main(["export", "-o", "$root/export", "$root/test.json"])
 PanGraph.main(["generate", "-o", "$root/export.json", "$root/test.fa"])
 
 # marginalize
-PanGraph.main(["marginalize", "-o", "$root/pairs", "$root/test.json"])
+PanGraph.main(["marginalize", "-v", "-o", "$root/pairs", "$root/test.json"])
 
 # polish
 PanGraph.main(["polish", "-c", "-l", "10000", "$root/test.json"])
\ No newline at end of file

From 53bc71bd1b14cd1978413b85d882f0955a088f8f Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Sat, 24 Jun 2023 13:40:15 +0200
Subject: [PATCH 15/19] feat: add gap positions consistency check

---
 src/build.jl |  4 ++++
 src/graph.jl | 30 ++++++++++++++++++++++++++++++
 2 files changed, 34 insertions(+)

diff --git a/src/build.jl b/src/build.jl
index 87064bba..e88be8d5 100644
--- a/src/build.jl
+++ b/src/build.jl
@@ -159,6 +159,7 @@ Build = Command(
             _ => error("unrecognized alignment kernel")
         end
 
+        # if testing is set to true, collect dictionary of reference sequences for comparisons
         reference =
             arg(Build, "-v") ?
             Dict(begin
@@ -179,6 +180,9 @@ Build = Command(
         )
         finalize!(graph)
 
+        # test graph consistency
+        arg(Build, "-v") && Graphs.consistency_check(graph)
+
         marshal(stdout, graph; fmt = :json)
     end,
 )
diff --git a/src/graph.jl b/src/graph.jl
index ad3814b7..538b0b98 100644
--- a/src/graph.jl
+++ b/src/graph.jl
@@ -678,6 +678,36 @@ function finalize!(g)
     end
 end
 
+"""
+    consistency_check(G::Graph)
+
+performs final consistency checks on the graph. Implemented checks for now are:
+- check 1-1 correspondence between gaps and insertion positions in block alignments.
+"""
+function consistency_check(graph)
+
+    # for each block, check that each insertion corresponds to a gap
+    # and each gap has at least one insertion
+    for (id, b) in graph.block
+        
+        # collect gap positions
+        gaps = b.gaps |> keys |> collect
+
+        # collect insertion positions
+        ins_pos = b.insert |> values .|> keys
+        ins_pos = [first(i) for ip in ins_pos for i in ip if length(i) == 2] |> unique
+
+        # check 1-1 correspondence
+        if Set(gaps) != Set(ins_pos)
+            msg = "Consistency check failed in block $id:\n"
+            msg *= "no 1-1 correspondence between\n"
+            msg *= "gaps -> $gaps\n"
+            msg *= "insertion positions -> $ins_pos\n"
+            error(msg)
+        end
+    end
+end
+
 # ------------------------------------------------------------------------
 # main point of entry
 

From c461450885dabd068baf3e9fedc1c4b8c52280e3 Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Sat, 24 Jun 2023 13:43:48 +0200
Subject: [PATCH 16/19] chore: consistency check flag v -> t

---
 docs/src/cli/build.md       |  2 +-
 docs/src/cli/marginalize.md |  2 +-
 src/build.jl                |  6 +++---
 src/marginalize.jl          | 10 +++++-----
 tests/run-cli-tests.sh      |  8 ++++----
 trace.jl                    |  6 +++---
 6 files changed, 17 insertions(+), 17 deletions(-)

diff --git a/docs/src/cli/build.md b/docs/src/cli/build.md
index a3e3e5f8..19c3c278 100644
--- a/docs/src/cli/build.md
+++ b/docs/src/cli/build.md
@@ -16,7 +16,7 @@ Build a multiple sequence alignment pangraph.
 | distance calculator  | String  | d          | distance-backend | only accepts "native" or "mash"                                                                               |
 | alignment kernel     | String  | k          | alignment-kernel | only accepts "minimap2" or "mmseqs"                                                                           |
 | kmer length (mmseqs) | Integer | K          | kmer-length      | kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default.                |
-| consistency check    | Boolean | v          | verify           | toggle to activate consistency check: verifies that input genomes can be exactly reconstructed from the graph |
+| consistency check    | Boolean | t          | test             | toggle to activate consistency check: verifies that input genomes can be exactly reconstructed from the graph |
 
 ## Arguments
 Expects one or more fasta files.
diff --git a/docs/src/cli/marginalize.md b/docs/src/cli/marginalize.md
index f8604293..864e1559 100644
--- a/docs/src/cli/marginalize.md
+++ b/docs/src/cli/marginalize.md
@@ -9,7 +9,7 @@ Compute all pairwise marginalizations of a multiple sequence alignment pangraph.
 | Output path        | String  | o          | output-path    | Path to direcotry where the output of all pairwise mariginalizations will be stored if supplied                                        |
 | Reduce paralogs    | Boolean | r          | reduce-paralog | Collapses coparallel paths through duplicated blocks.                                                                                  |
 | Projection strains | String  | s          | Strains        | Collapses the graph structure to only blocks and edges contained by the paths of the supplied strain names. comma seperated, no spaces |
-| Consistency check  | Boolean | v          | verify         | toggle to activate consistency check: verifies that output genomes are exactly equal to input genomes                                  |
+| Consistency check  | Boolean | t          | test           | toggle to activate consistency check: verifies that output genomes are exactly equal to input genomes                                  |
 
 ## Arguments
 Zero or one pangraph file which must be formatted as a JSON.
diff --git a/src/build.jl b/src/build.jl
index e88be8d5..318e3af4 100644
--- a/src/build.jl
+++ b/src/build.jl
@@ -80,7 +80,7 @@ Build = Command(
         Arg(
             Bool,
             "verify that input genomes can be reconstructed from output",
-            (short = "-v", long = "--verify"),
+            (short = "-t", long = "--test"),
             "toggle to activate consistency check at each step of the graph merging process",
             false,
         ),
@@ -161,7 +161,7 @@ Build = Command(
 
         # if testing is set to true, collect dictionary of reference sequences for comparisons
         reference =
-            arg(Build, "-v") ?
+            arg(Build, "-t") ?
             Dict(begin
                     key = collect(keys(G.sequence))[1]
                     seq = sequence(collect(values(G.sequence))[1])
@@ -181,7 +181,7 @@ Build = Command(
         finalize!(graph)
 
         # test graph consistency
-        arg(Build, "-v") && Graphs.consistency_check(graph)
+        arg(Build, "-t") && Graphs.consistency_check(graph)
 
         marshal(stdout, graph; fmt = :json)
     end,
diff --git a/src/marginalize.jl b/src/marginalize.jl
index b65ea80c..cc7f6132 100644
--- a/src/marginalize.jl
+++ b/src/marginalize.jl
@@ -28,8 +28,8 @@ Marginalize = Command(
         Arg(
             Bool,
             "check that genomes are preserved",
-            (short = "-v", long = "--verify"),
-            "sanity check that verifies that genomes in the output graphs are identical\n\tto genomes in the input graphs.",
+            (short = "-t", long = "--test"),
+            "consistency check that verifies that genomes in the output graphs are identical\n\tto genomes in the input graphs.",
             false,
         ),
     ],
@@ -48,7 +48,7 @@ Marginalize = Command(
 
         reduce = arg(Marginalize, "-r")
         output = arg(Marginalize, "-o")
-        verify_flag = arg(Marginalize, "-v")
+        test_flag = arg(Marginalize, "-t")
 
         function verify(G₀, G₋)
             # verifies that after the marginalization all genomes from
@@ -87,7 +87,7 @@ Marginalize = Command(
                 # recompute positions
                 Graphs.finalize!(G)
 
-                verify_flag && verify(graph, G)
+                test_flag && verify(graph, G)
 
                 open("$(output)/$(name₁)-$(name₂).json", "w") do io
                     marshal(io, G; fmt = :json)
@@ -117,7 +117,7 @@ Marginalize = Command(
             # recompute positions
             Graphs.finalize!(G)
 
-            verify_flag && verify(graph, G)
+            test_flag && verify(graph, G)
 
             marshal(stdout, G; fmt = :json)
         end
diff --git a/tests/run-cli-tests.sh b/tests/run-cli-tests.sh
index f34673d8..55cbbcc5 100644
--- a/tests/run-cli-tests.sh
+++ b/tests/run-cli-tests.sh
@@ -49,14 +49,14 @@ pangraph generate -s 100 -r 1e-1 -m 1e-3 -d 5e-2 -i 1e-2 -t 5 "$TESTDIR/randseqs
 
 echo "Test pangraph build - minimap asm20 no energy"
 export JULIA_NUM_THREADS=4
-pangraph build -v -c -k minimap2 -s 20 -a 0 -b 0 "$TESTDIR/input.fa" > "$TESTDIR/test1.json"
+pangraph build -t -c -k minimap2 -s 20 -a 0 -b 0 "$TESTDIR/input.fa" > "$TESTDIR/test1.json"
 
 echo "Test pangraph build - mash"
-pangraph build -v -c -d mash "$TESTDIR/input.fa" > "$TESTDIR/test2.json"
+pangraph build -t -c -d mash "$TESTDIR/input.fa" > "$TESTDIR/test2.json"
 
 echo "Test pangraph build - mmseqs"
 export JULIA_NUM_THREADS=1
-pangraph build -v -c -k mmseqs -K 8 "$TESTDIR/input.fa" > "$TESTDIR/test3.json"
+pangraph build -t -c -k mmseqs -K 8 "$TESTDIR/input.fa" > "$TESTDIR/test3.json"
 
 echo "Test pangraph polish"
 pangraph polish -c -l 10000 "$TESTDIR/test1.json" > "$TESTDIR/polished.json"
@@ -68,7 +68,7 @@ echo "Test pangraph PanX export"
 pangraph export -ng -pX -o "$TESTDIR/export" "$TESTDIR/test1.json"
 
 echo "Test pangraph marginalize"
-pangraph marginalize -v -o "$TESTDIR/marginalize" "$TESTDIR/test1.json"
+pangraph marginalize -t -o "$TESTDIR/marginalize" "$TESTDIR/test1.json"
 
 # remove generated files
 rm -r $TESTDIR
diff --git a/trace.jl b/trace.jl
index 0f1c8d3a..f34339e1 100644
--- a/trace.jl
+++ b/trace.jl
@@ -15,9 +15,9 @@ PanGraph.main(["help", "version"])		   # version usage
 PanGraph.main(["help", "help"])		   # help usage
 
 # build (native - mmseqs)
-PanGraph.main(["build", "-v", "-c", "-u", "-b", "0", "-a", "0", "$root/test.fa"])
+PanGraph.main(["build", "-t", "-c", "-u", "-b", "0", "-a", "0", "$root/test.fa"])
 # PanGraph.main(["build", "-c", "-u", "-d", "mash", "$root/test.fa"])
-PanGraph.main(["build", "-v", "-c", "-u", "-b", "0", "-a", "0", "-k", "mmseqs", "-K", "8", "$root/test.fa"])
+PanGraph.main(["build", "-t", "-c", "-u", "-b", "0", "-a", "0", "-k", "mmseqs", "-K", "8", "$root/test.fa"])
 
 # export
 PanGraph.main(["export", "-o", "$root/export", "$root/test.json"])
@@ -26,7 +26,7 @@ PanGraph.main(["export", "-o", "$root/export", "$root/test.json"])
 PanGraph.main(["generate", "-o", "$root/export.json", "$root/test.fa"])
 
 # marginalize
-PanGraph.main(["marginalize", "-v", "-o", "$root/pairs", "$root/test.json"])
+PanGraph.main(["marginalize", "-t", "-o", "$root/pairs", "$root/test.json"])
 
 # polish
 PanGraph.main(["polish", "-c", "-l", "10000", "$root/test.json"])
\ No newline at end of file

From 9f7f3c1dcf8b852f1de6164509087ccda9c54700 Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Tue, 27 Jun 2023 16:14:25 +0200
Subject: [PATCH 17/19] feat: added random seed in CLI

---
 CHANGELOG.md          |  2 +-
 docs/src/cli/build.md |  1 +
 src/align.jl          |  4 ++--
 src/build.jl          | 11 +++++++++++
 4 files changed, 15 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cf04ecba..cdd8e02a 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,7 +3,7 @@
 ## v0.7.0
 
 - fasta input files are checked for duplicated records, and white lines between records are tolerated, see [#55](https://github.com/neherlab/pangraph/pull/55).
-- PanGraph execution is now deterministic, and same input files always produce the same output, see [#57](https://github.com/neherlab/pangraph/pull/57).
+- PanGraph execution is now deterministic, and same input files always produce the same output, see [#57](https://github.com/neherlab/pangraph/pull/57). For the build command, a random seed can be set with the `-r` flag.
 - introduced the `-v` flag in the `build` and `merge` command. This activates consistency checks to verify that the input genomes can be exactly reconstructed. See [#57](https://github.com/neherlab/pangraph/pull/57).
 - Fixed [#56](https://github.com/neherlab/pangraph/issues/56)
 
diff --git a/docs/src/cli/build.md b/docs/src/cli/build.md
index 19c3c278..2a4dbd61 100644
--- a/docs/src/cli/build.md
+++ b/docs/src/cli/build.md
@@ -17,6 +17,7 @@ Build a multiple sequence alignment pangraph.
 | alignment kernel     | String  | k          | alignment-kernel | only accepts "minimap2" or "mmseqs"                                                                           |
 | kmer length (mmseqs) | Integer | K          | kmer-length      | kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default.                |
 | consistency check    | Boolean | t          | test             | toggle to activate consistency check: verifies that input genomes can be exactly reconstructed from the graph |
+| random seed          | Int     | r          | random-seed      | controls the random naming of blocks                                                                          |
 
 ## Arguments
 Expects one or more fasta files.
diff --git a/src/align.jl b/src/align.jl
index ba3abf0f..888f087e 100644
--- a/src/align.jl
+++ b/src/align.jl
@@ -631,7 +631,7 @@ The _lower_ the score, the _better_ the alignment. Only negative energies are co
 
 `compare` is the function to be used to generate pairwise distances that generate the internal guide tree.
 """
-function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false)
+function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false, rand_seed=0)
     function verify(graph; msg="")
         if reference !== nothing
             for (name,path) ∈ graph.sequence
@@ -699,7 +699,7 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h
 
             # random seed for the thread - to ensure deterministic reproducibility
             # in block names
-            Random.seed!(n_clade)
+            Random.seed!(rand_seed+n_clade)
 
             if isleaf(clade)
                 close(clade.graph)
diff --git a/src/build.jl b/src/build.jl
index 318e3af4..77233be6 100644
--- a/src/build.jl
+++ b/src/build.jl
@@ -84,6 +84,13 @@ Build = Command(
             "toggle to activate consistency check at each step of the graph merging process",
             false,
         ),
+        Arg(
+            Int,
+            "random seed",
+            (short = "-r", long = "--random-seed"),
+            "random seed. Controls block random naming.",
+            0,
+        ),
     ],
     (args) -> let
         files = parse(Build, args)
@@ -123,6 +130,9 @@ Build = Command(
             end
         end
 
+        r_seed = arg(Build, "-r")
+        seed!(r_seed)
+
         graph(io) = graphs(io; circular = circular, upper = uppercase)
         isolates = (G for file in files for G ∈ open(graph, file))
 
@@ -177,6 +187,7 @@ Build = Command(
             maxiter = maxiter,
             reference = reference,
             verbose = false,
+            rand_seed = r_seed,
         )
         finalize!(graph)
 

From e1c5775f090c523d9e8f29b6416758b7f053c013 Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Tue, 27 Jun 2023 16:22:52 +0200
Subject: [PATCH 18/19] chore: changed docs

---
 docs/src/cli/build.md | 2 +-
 src/build.jl          | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/docs/src/cli/build.md b/docs/src/cli/build.md
index 2a4dbd61..ee6ae020 100644
--- a/docs/src/cli/build.md
+++ b/docs/src/cli/build.md
@@ -17,7 +17,7 @@ Build a multiple sequence alignment pangraph.
 | alignment kernel     | String  | k          | alignment-kernel | only accepts "minimap2" or "mmseqs"                                                                           |
 | kmer length (mmseqs) | Integer | K          | kmer-length      | kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default.                |
 | consistency check    | Boolean | t          | test             | toggle to activate consistency check: verifies that input genomes can be exactly reconstructed from the graph |
-| random seed          | Int     | r          | random-seed      | controls the random naming of blocks                                                                          |
+| random seed          | Int     | r          | random-seed      | random seed for pangraph construction.                                                                        |
 
 ## Arguments
 Expects one or more fasta files.
diff --git a/src/build.jl b/src/build.jl
index 77233be6..c7e9210b 100644
--- a/src/build.jl
+++ b/src/build.jl
@@ -88,7 +88,7 @@ Build = Command(
             Int,
             "random seed",
             (short = "-r", long = "--random-seed"),
-            "random seed. Controls block random naming.",
+            "random seed for pangraph construction.",
             0,
         ),
     ],

From 9c0af7ba1d5fd746e73989270b22d4675d21ff74 Mon Sep 17 00:00:00 2001
From: Marco Molari <mmolari91@gmail.com>
Date: Tue, 27 Jun 2023 17:01:34 +0200
Subject: [PATCH 19/19] chore: small changelog correction

---
 CHANGELOG.md | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index cdd8e02a..330eef6f 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -4,7 +4,7 @@
 
 - fasta input files are checked for duplicated records, and white lines between records are tolerated, see [#55](https://github.com/neherlab/pangraph/pull/55).
 - PanGraph execution is now deterministic, and same input files always produce the same output, see [#57](https://github.com/neherlab/pangraph/pull/57). For the build command, a random seed can be set with the `-r` flag.
-- introduced the `-v` flag in the `build` and `merge` command. This activates consistency checks to verify that the input genomes can be exactly reconstructed. See [#57](https://github.com/neherlab/pangraph/pull/57).
+- introduced the `-t` flag in the `build` and `merge` command. This activates consistency checks to verify that the input genomes can be exactly reconstructed. See [#57](https://github.com/neherlab/pangraph/pull/57).
 - Fixed [#56](https://github.com/neherlab/pangraph/issues/56)
 
 ## v0.6.3