From 9f065a14d6e7470147bcdee10bd70763e8afb565 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Wed, 7 Jun 2023 18:18:15 +0200 Subject: [PATCH 01/19] feat: deterministic block name and pair ordering --- src/PanGraph.jl | 2 ++ src/align.jl | 33 +++++++++++++++++++++++---------- 2 files changed, 25 insertions(+), 10 deletions(-) diff --git a/src/PanGraph.jl b/src/PanGraph.jl index 923e6557..361b16f9 100644 --- a/src/PanGraph.jl +++ b/src/PanGraph.jl @@ -163,6 +163,8 @@ function main(args) end function julia_main()::Cint + # initialize random seed for reproducibility + seed!(0) try main(ARGS) catch diff --git a/src/align.jl b/src/align.jl index 209dfb87..f989a5fd 100644 --- a/src/align.jl +++ b/src/align.jl @@ -3,6 +3,7 @@ module Align using Rematch, Dates using LinearAlgebra using ProgressMeter +using Random using Base.Threads: @spawn, @threads @@ -26,6 +27,7 @@ end # ------------------------------------------------------------------------ # guide tree for order of pairwise comparison for multiple alignments + # TODO: distance? """ mutable struct Clade @@ -33,7 +35,7 @@ end parent :: Union{Clade,Nothing} left :: Union{Clade,Nothing} right :: Union{Clade,Nothing} - graph :: Channel{Graph} + graph :: Channel{Tuple{Graph,Int}} end Clade is a node (internal or leaf) of a binary guide tree used to order pairwise alignments @@ -41,13 +43,16 @@ associated to a multiple genome alignment in progress. `name` is only non-empty for leaf nodes. `parent` is `nothing` for the root node. `graph` is a 0-sized channel that is used as a message passing primitive in alignment. + It contains the graph and an index used to decide the order of items in a pair in + pairwise graph merge. """ +Message=Tuple{Graph,Int} mutable struct Clade name :: String parent :: Union{Clade,Nothing} left :: Union{Clade,Nothing} right :: Union{Clade,Nothing} - graph :: Channel{Graph} + graph :: Channel{Message} end # --------------------------- @@ -58,20 +63,20 @@ end Generate an empty, disconnected clade. """ -Clade() = Clade("",nothing,nothing,nothing,Channel{Graph}(2)) +Clade() = Clade("", nothing, nothing, nothing, Channel{Message}(2)) """ Clade(name) Generate an empty, disconnected clade with name `name`. """ -Clade(name) = Clade(name,nothing,nothing,nothing,Channel{Graph}(2)) +Clade(name) = Clade(name, nothing, nothing, nothing, Channel{Message}(2)) """ Clade(left::Clade, right::Clade) Generate an nameless clade with `left` and `right` children. """ function Clade(left::Clade, right::Clade) - parent = Clade("",nothing,left,right,Channel{Graph}(2)) + parent = Clade("", nothing, left, right, Channel{Message}(2)) left.parent = parent right.parent = parent @@ -655,15 +660,22 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h meter_lock = ReentrantLock() G = nothing - for clade ∈ postorder(tree) + for (n_clade, clade) ∈ enumerate(postorder(tree)) @spawn try + Random.seed!(n_clade) if isleaf(clade) close(clade.graph) - put!(clade.parent.graph, tips[clade.name]) + msg = (tips[clade.name], n_clade) + put!(clade.parent.graph, msg) else - Gₗ = take!(clade.graph) - Gᵣ = take!(clade.graph) + Gₗ, Pₗ = take!(clade.graph) + Gᵣ, Pᵣ = take!(clade.graph) close(clade.graph) + # ensure a consistent ordering of the two graphs, + # irrespective of which process is sending the message first. + if Pₗ > Pᵣ + Gₗ, Gᵣ = Gᵣ, Gₗ + end # the lock ensures that at most N=Threads.nthreads() processes are # spawning run(`cmd`) instances at the same time @@ -678,7 +690,8 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h end if clade.parent !== nothing - put!(clade.parent.graph, G₀) + msg = (G₀, n_clade) + put!(clade.parent.graph, msg) else G = G₀ close(error_channel) From 450e987b40f4a7f16c9346cc4050dd71fd0ff4c6 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Fri, 9 Jun 2023 16:49:34 +0200 Subject: [PATCH 02/19] feat: deterministic graph construction --- Manifest.toml | 4 +-- Project.toml | 3 +- src/align.jl | 41 +++++++++++++++++++++++++- src/block.jl | 79 ++++++++++++++++++++++++++++++--------------------- 4 files changed, 91 insertions(+), 36 deletions(-) diff --git a/Manifest.toml b/Manifest.toml index fd8baaae..23770584 100644 --- a/Manifest.toml +++ b/Manifest.toml @@ -247,9 +247,9 @@ uuid = "efe28fd5-8261-553b-a9e1-b2916fc3738e" version = "0.5.5+0" [[deps.OrderedCollections]] -git-tree-sha1 = "85f8e6578bf1f9ee0d11e7bb1b1456435479d47c" +git-tree-sha1 = "d321bf2de576bf25ec4d3e4360faca399afca282" uuid = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" -version = "1.4.1" +version = "1.6.0" [[deps.PDMats]] deps = ["LinearAlgebra", "SparseArrays", "SuiteSparse"] diff --git a/Project.toml b/Project.toml index 97e29456..f8d7b2f9 100644 --- a/Project.toml +++ b/Project.toml @@ -12,6 +12,7 @@ JLD2 = "033835bb-8acc-5ee8-8aae-3f567f8a3819" JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" Logging = "56ddb016-857b-54e1-b83d-db4d58db5568" +OrderedCollections = "bac558e1-5e72-5ebc-8fee-abe8a469f55d" PackageCompiler = "9b87118b-4619-50d2-8e1e-99f35a4d4d9d" Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" @@ -23,4 +24,4 @@ TreeTools = "62f0eae3-8c0e-4032-a621-7756092209e5" minimap2_jll = "d341526d-637d-5003-8fc4-9c6812cd2b55" [compat] -TreeTools = ">= 0.6.2" \ No newline at end of file +TreeTools = ">= 0.6.2" diff --git a/src/align.jl b/src/align.jl index f989a5fd..67855824 100644 --- a/src/align.jl +++ b/src/align.jl @@ -356,6 +356,31 @@ function preprocess(hits, skip, energy, blocks!) return hits end +# DEBUG +function log_alignment(G₁::Graph, G₂::Graph, hits, fname::String) + open(fname, "w") do io + for G in (G₁, G₂) + write(io, "------------ G ------------\n") + PC = pancontigs(G) + for (name, seq) in zip(PC.name, PC.sequence) + write(io, ">$name\n") + write(io, seq, "\n") + end + end + write(io, "------------ hits ------------\n") + for h in hits + write(io, """ + ......................... + qry -> $(h.qry.name) | $(h.qry.start) -> $(h.qry.stop) | $(h.qry.length) + ref -> $(h.ref.name) | $(h.ref.start) -> $(h.ref.stop) | $(h.ref.length) + len -> $(h.length) + strand -> $(h.orientation) + cigar -> $(h.cigar) + """) + end + end +end + function do_align(G₁::Graph, G₂::Graph, energy::Function, aligner::Function) hits = if G₁ == G₂ self = pancontigs(G₁) @@ -364,6 +389,9 @@ function do_align(G₁::Graph, G₂::Graph, energy::Function, aligner::Function) aligner(pancontigs(G₁), pancontigs(G₂)) end sort!(hits; by=energy) + + # DEBUG + # log_alignment(G₁, G₂, hits, "issue/minimap/$(randstring(10)).log") return hits end @@ -662,7 +690,11 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h G = nothing for (n_clade, clade) ∈ enumerate(postorder(tree)) @spawn try + + # random seed for the thread - to ensure deterministic reproducibility + # in block names Random.seed!(n_clade) + if isleaf(clade) close(clade.graph) msg = (tips[clade.name], n_clade) @@ -676,7 +708,7 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h if Pₗ > Pᵣ Gₗ, Gᵣ = Gᵣ, Gₗ end - + # the lock ensures that at most N=Threads.nthreads() processes are # spawning run(`cmd`) instances at the same time G₀ = lock_semaphore(s) do @@ -684,6 +716,13 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h align_self(G₀, energy, minblock, aligner, verify, false) end + # DEBUG : save graph at each iteration in a file + # open("issue/comp/graph_iteration_$(n_clade).json", "w") do io + # finalize!(G₀) + # marshal(io, G₀; fmt=:json) + # end + + # advance progress bar in a thread-safe way lock(meter_lock) do next!(meter) diff --git a/src/block.jl b/src/block.jl index f81a302d..2b66f081 100644 --- a/src/block.jl +++ b/src/block.jl @@ -4,6 +4,7 @@ using Rematch import Base: show, length, append!, keys, merge!, pop! +import OrderedCollections: OrderedDict, OrderedSet # internal modules using ..Intervals @@ -28,11 +29,17 @@ export depth, combine, swap!, check # operators export assertequivalent export alignment, diversity -const AlleleMaps{T} = Union{Dict{Node{T},SNPMap},Dict{Node{T},InsMap},Dict{Node{T},DelMap}} +const AlleleMaps{T} = Union{OrderedDict{Node{T},SNPMap},OrderedDict{Node{T},InsMap},OrderedDict{Node{T},DelMap}} # ------------------------------------------------------------------------ # utility functions +# DEBUG +function log(msg...) + println(stderr, msg...) + flush(stderr) +end + """ applyalleles(seq::Array{UInt8}, mutate::SNPMap, insert::InsMap, delete::DelMap) @@ -270,9 +277,9 @@ end uuid :: String sequence :: Array{UInt8} gaps :: Dict{Int,Int} - mutate :: Dict{Node{Block},SNPMap} - insert :: Dict{Node{Block},InsMap} - delete :: Dict{Node{Block},DelMap} + mutate :: OrderedDict{Node{Block},SNPMap} + insert :: OrderedDict{Node{Block},InsMap} + delete :: OrderedDict{Node{Block},DelMap} end Store a multiple sequence alignment of contiguous DNA related by homology. @@ -286,10 +293,13 @@ mutable struct Block uuid :: String sequence :: Array{UInt8} gaps :: Dict{Int,Int} - mutate :: Dict{Node{Block},SNPMap} - insert :: Dict{Node{Block},InsMap} - delete :: Dict{Node{Block},DelMap} + mutate :: OrderedDict{Node{Block},SNPMap} + insert :: OrderedDict{Node{Block},InsMap} + delete :: OrderedDict{Node{Block},DelMap} end +SNPsDict = OrderedDict{Node{Block},SNPMap} +InsDict = OrderedDict{Node{Block},InsMap} +DelDict = OrderedDict{Node{Block},DelMap} function show(io::IO, m::Dict{Node{Block}, T}) where T <: Union{SNPMap, InsMap, DelMap} print(io, "{\n") @@ -317,13 +327,13 @@ Block(sequence,gaps,mutate,insert,delete) = Block(random_id(),sequence,gaps,muta Construct a block with a unique `uuid` with fixed `sequence` and `gaps`. No individuals and thus polymorphisms are initialized. """ -Block(sequence,gaps) = Block(sequence,gaps,Dict{Node{Block},SNPMap}(),Dict{Node{Block},InsMap}(),Dict{Node{Block},DelMap}()) +Block(sequence,gaps) = Block(sequence,gaps,SNPsDict(),InsDict(),DelDict()) """ Block(sequence,gaps) Construct a block with a unique `uuid` with fixed `sequence`. """ -Block(sequence) = Block(sequence,Dict{Int,Int}(),Dict{Node{Block},SNPMap}(),Dict{Node{Block},InsMap}(),Dict{Node{Block},DelMap}()) +Block(sequence) = Block(sequence,Dict{Int,Int}(),SNPsDict(),InsDict(),DelDict()) """ Block(sequence,gaps) @@ -333,20 +343,20 @@ Block() = Block(UInt8[]) # move alleles translate(d::Dict{Int,Int}, δ) = Dict(x+δ => v for (x,v) ∈ d) # gaps -translate(d::Dict{Node{Block},InsMap}, δ) = Dict{Node{Block},InsMap}(n => Dict((x+δ,Δ) => v for ((x,Δ),v) ∈ val) for (n,val) ∈ d) # insertions -translate(dict::T, δ) where T <: AlleleMaps{Block} = Dict(key=>Dict(x+δ=>v for (x,v) in val) for (key,val) in dict) +translate(d::InsDict, δ) = InsDict(n => Dict((x+δ,Δ) => v for ((x,Δ),v) ∈ val) for (n,val) ∈ d) # insertions +translate(dict::T, δ) where T <: Union{SNPsDict,DelDict} = T(key=>Dict(x+δ=>v for (x,v) in val) for (key,val) in dict) # select alleles within window -lociwithin(dict::Dict{Node{Block},SNPMap}, i) = -Dict{Node{Block},SNPMap}( +lociwithin(dict::SNPsDict, i) = +SNPsDict( node => SNPMap( locus => allele for (locus,allele) in subdict if i.start ≤ locus ≤ i.stop ) for (node, subdict) ∈ dict ) # XXX: have to deal with left case i.e. you have a deletion that starts before i.start but continues onwards -lociwithin(dict::Dict{Node{Block},DelMap}, i) = -Dict{Node{Block},DelMap}( +lociwithin(dict::DelDict, i) = +DelDict( node => DelMap( let k = max(locus,i.start) @@ -357,9 +367,9 @@ Dict{Node{Block},DelMap}( ) # XXX: have to special case the left edge -function lociwithin(dict::Dict{Node{Block},InsMap}, i) +function lociwithin(dict::InsDict, i) i = (i.start == 1) ? (0:i.stop) : i - return Dict{Node{Block},InsMap}( + return InsDict( node => InsMap( locus => insert for (locus, insert) in subdict if i.start ≤ first(locus) ≤ i.stop ) for (node, subdict) ∈ dict @@ -372,12 +382,12 @@ function lociwithin(dict::Dict{Int,Int}, i) end # copy dictionary -Base.copy(dict::T) where T <: AlleleMaps{Block} = Dict(node=>Base.copy(subdict) for (node,subdict) in dict) +Base.copy(dict::T) where T <: AlleleMaps{Block} = T(node=>Base.copy(subdict) for (node,subdict) in dict) # merge alleles (recursively) function merge!(base::T, others::T...) where T <: AlleleMaps{Block} # keys not found in base - for node ∈ Set(k for other in others for k ∈ keys(other) if k ∉ keys(base)) + for node ∈ OrderedSet(k for other in others for k ∈ keys(other) if k ∉ keys(base)) base[node] = merge((other[node] for other in others if node ∈ keys(other))...) end @@ -390,7 +400,7 @@ function merge!(base::T, others::T...) where T <: AlleleMaps{Block} end function merge_cat!(base::InsMap, gaps::Dict{Int,Int}, others::InsMap...) - new = Set(locus for other in others for locus in first.(keys(other))) + new = OrderedSet(locus for other in others for locus in first.(keys(other))) shared = intersect(Set(keys(gaps)), new) length(shared) == 0 && return merge!(base, others...) @@ -407,10 +417,10 @@ function merge_cat!(base::InsMap, gaps::Dict{Int,Int}, others::InsMap...) end end -function merge_cat!(base::Dict{Node{Block},InsMap}, others::Dict{Node{Block},InsMap}...) - Ks = Set(keys(base)) +function merge_cat!(base::InsDict, others::InsDict...) + Ks = OrderedSet(keys(base)) # keys not found in base - for node ∈ Set(k for other in others for k ∈ keys(other) if k ∉ Ks) + for node ∈ OrderedSet(k for other in others for k ∈ keys(other) if k ∉ Ks) base[node] = merge((other[node] for other in others if node ∈ keys(other))...) end @@ -588,9 +598,9 @@ function reverse_complement(b::Block; keepid=false) revcmpl(dict::DelMap) = Dict(len-(locus+del-1)+1 => del for (locus,del) in dict) revcmpl(dict::InsMap) = Dict((len-locus,b.gaps[locus]-length(ins)-off) => reverse_complement(ins) for ((locus,off),ins) in dict) - mutate = Dict(node => revcmpl(snp) for (node, snp) in b.mutate) - insert = Dict(node => revcmpl(ins) for (node, ins) in b.insert) - delete = Dict(node => revcmpl(del) for (node, del) in b.delete) + mutate = SNPsDict(node => revcmpl(snp) for (node, snp) in b.mutate) + insert = InsDict(node => revcmpl(ins) for (node, ins) in b.insert) + delete = DelDict(node => revcmpl(del) for (node, del) in b.delete) gaps = Dict(len-locus => gap for (locus, gap) in b.gaps) return keepid ? Block(b.uuid, seq,gaps,mutate,insert,delete) : Block(seq,gaps,mutate,insert,delete) @@ -776,7 +786,7 @@ function sequence(b::Block, node::Node{Block}; gaps=false, debug=false, forward= return (node.strand || forward) ? seq : reverse_complement(seq) end -function gapconsensus(insert::Dict{Node{Block},InsMap}, len::Int, x::Int) +function gapconsensus(insert::InsDict, len::Int, x::Int) num = sum(1 for ins in values(insert) for locus in keys(ins) if first(locus) == x; init=0) @assert num > 0 @@ -968,6 +978,11 @@ function rereference(qry::Block, ref::Block, segments) delete = Base.copy(ref.delete), ) + # DEBUG + # log("type of combined mutate", typeof(combined.mutate)) + # log("type of combined insert", typeof(combined.insert)) + # log("type of combined delete", typeof(combined.delete)) + map(dict, from, to) = translate(lociwithin(dict, from), to.start-from.start) x = ( @@ -1048,7 +1063,7 @@ function rereference(qry::Block, ref::Block, segments) end for I in unmatched - merge!(combined.delete, Dict(node => Dict(I.lo=>length(I)))) + merge!(combined.delete, DelDict(node => Dict(I.lo=>length(I)))) end end @@ -1061,7 +1076,7 @@ function rereference(qry::Block, ref::Block, segments) newgap = nothing else - newdeletes = Dict(node => Dict(x.ref=>Δ.stop-Δ.start+1) for node ∈ keys(qry)) + newdeletes = DelDict(node => Dict(x.ref=>Δ.stop-Δ.start+1) for node ∈ keys(qry)) merge!(combined.delete, newdeletes) end @@ -1082,7 +1097,7 @@ function rereference(qry::Block, ref::Block, segments) newgap = (x.ref-1, 0) - newinserts = Dict( + newinserts = InsDict( let seq = applyalleles(qry.sequence[Δ], mutate[node], insert[node], delete[node]) if length(seq) > 0 @@ -1107,7 +1122,7 @@ function rereference(qry::Block, ref::Block, segments) end (Δq, Δr) => let # simple translation of alleles of qry -> ref # carry over mutations to qry sequences as long as its different from new reference - muts = Dict( + muts = SNPsDict( node => Dict( x => nuc for (x,nuc) in subdict if nuc != ref.sequence[x] ) for (node, subdict) in map(qry.mutate,Δq,Δr) @@ -1117,7 +1132,7 @@ function rereference(qry::Block, ref::Block, segments) # apply mutations to all qry sequences where qry ≠ ref that are not deleted or already mutated qrysnps = findall(qry.sequence[Δq] .!= ref.sequence[Δr]) - newmuts = Dict( + newmuts = SNPsDict( node => Dict( Δr.start+(x-1) => qry.sequence[Δq.start+(x-1)] for x in qrysnps if ((Δq.start+(x-1)) ∉ keys(qry.mutate[node])) ) for node ∈ keys(qry) From ed8366c26efa899a6b397353bb48764206c5bbc2 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Mon, 19 Jun 2023 10:08:10 +0200 Subject: [PATCH 03/19] fix: ins-map consistency --- src/block.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/block.jl b/src/block.jl index 2b66f081..6b8216fd 100644 --- a/src/block.jl +++ b/src/block.jl @@ -1104,7 +1104,7 @@ function rereference(qry::Block, ref::Block, segments) if (δ+length(seq)) > last(newgap) newgap = (first(newgap),length(seq)+δ) end - node => Dict((x.ref-1,δ) => seq) + node => InsMap((x.ref-1,δ) => seq) else node => InsMap() end From af5f80473478b4d857215f8bfe029b4760217201 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Mon, 19 Jun 2023 14:42:09 +0200 Subject: [PATCH 04/19] fix for missing insertions --- src/block.jl | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/src/block.jl b/src/block.jl index 6b8216fd..7ce1aa7c 100644 --- a/src/block.jl +++ b/src/block.jl @@ -1068,7 +1068,12 @@ function rereference(qry::Block, ref::Block, segments) end delete!(qry.gaps, x.qry-1) - delete!(newgaps, Δ.start-1) + # DEBUG delete only if no more insertions are present here + if Δ.start-1 ∉ [ + locus for (node, ins) ∈ combined.insert for (locus, δ) ∈ keys(ins) + ] + delete!(newgaps, Δ.start-1) + end if last(newgap) > 0 newgaps[newgap[1]] = newgap[2] From 95a984a19b9dc19c7a4ac0469067aceae0be258b Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Mon, 19 Jun 2023 15:50:01 +0200 Subject: [PATCH 05/19] chore: modified changelog --- CHANGELOG.md | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 40834316..58e67687 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,8 +1,10 @@ # PanGraph Changelog -## v0.6.4 (draft) +## v0.7.0 (draft) - fasta input files are checked for duplicated records, and white lines between records are tolerated, see [#55](https://github.com/neherlab/pangraph/pull/55). +- PanGraph execution is now deterministic, and same input files always produce the same output. +- Fixed [#56](https://github.com/neherlab/pangraph/issues/56) ## v0.6.3 From e4e4fa8677364e23b2417009bac13af6d6e74a86 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Mon, 19 Jun 2023 16:39:45 +0200 Subject: [PATCH 06/19] feat: added flag to enforce consistency checks --- docs/src/cli/build.md | 25 ++-- src/align.jl | 22 ++- src/block.jl | 2 +- src/build.jl | 302 ++++++++++++++++++++++-------------------- 4 files changed, 192 insertions(+), 159 deletions(-) diff --git a/docs/src/cli/build.md b/docs/src/cli/build.md index fd4a7f3b..a3e3e5f8 100644 --- a/docs/src/cli/build.md +++ b/docs/src/cli/build.md @@ -4,18 +4,19 @@ Build a multiple sequence alignment pangraph. ## Options -| Name | Type | Short Flag | Long Flag | Description | -| :------------------- | :------ | :--------- | :--------------- | :-------------------------------------------------------------------------------------------------------- | -| minimum length | Integer | l | len | minimum block size for alignment graph (in nucleotides) | -| block junction cost | Float | a | alpha | energy cost for introducing block partitions due to alignment merger | -| block diversity cost | Float | b | beta | energy cost for interblock diversity due to alignment merger | -| circular genomes | Boolean | c | circular | toggle if input genomes are circular | -| pairwise sensitivity | String | s | sensitivity | controls the pairwise genome alignment sensitivity of minimap 2. Currently only accepts "5", "10" or "20" | -| maximum self-maps | Integer | x | max-self-map | maximum number of iterations to perform block self maps per pairwise graph merger | -| enforce uppercase | Boolean | u | upper-case | toggle to force genomes to uppercase characters | -| distance calculator | String | d | distance-backend | only accepts "native" or "mash" | -| alignment kernel | String | k | alignment-kernel | only accepts "minimap2" or "mmseqs" | -| kmer length (mmseqs) | Integer | K | kmer-length | kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default. | +| Name | Type | Short Flag | Long Flag | Description | +| :------------------- | :------ | :--------- | :--------------- | :------------------------------------------------------------------------------------------------------------ | +| minimum length | Integer | l | len | minimum block size for alignment graph (in nucleotides) | +| block junction cost | Float | a | alpha | energy cost for introducing block partitions due to alignment merger | +| block diversity cost | Float | b | beta | energy cost for interblock diversity due to alignment merger | +| circular genomes | Boolean | c | circular | toggle if input genomes are circular | +| pairwise sensitivity | String | s | sensitivity | controls the pairwise genome alignment sensitivity of minimap 2. Currently only accepts "5", "10" or "20" | +| maximum self-maps | Integer | x | max-self-map | maximum number of iterations to perform block self maps per pairwise graph merger | +| enforce uppercase | Boolean | u | upper-case | toggle to force genomes to uppercase characters | +| distance calculator | String | d | distance-backend | only accepts "native" or "mash" | +| alignment kernel | String | k | alignment-kernel | only accepts "minimap2" or "mmseqs" | +| kmer length (mmseqs) | Integer | K | kmer-length | kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default. | +| consistency check | Boolean | v | verify | toggle to activate consistency check: verifies that input genomes can be exactly reconstructed from the graph | ## Arguments Expects one or more fasta files. diff --git a/src/align.jl b/src/align.jl index 67855824..ba3abf0f 100644 --- a/src/align.jl +++ b/src/align.jl @@ -431,7 +431,7 @@ The _lower_ the score, the _better_ the alignment. Only negative energies are co `minblock` is the minimum size block that will be produced from the algorithm. `maxiter` is maximum number of duplications that will be considered during this alignment. """ -function align_self(G₁::Graph, energy::Function, minblock::Int, aligner::Function, verify::Function, verbose::Bool; maxiter=100, sensitivity="asm10") +function align_self(G₁::Graph, energy::Function, minblock::Int, aligner::Function, verify::Function, verbose::Bool; maxiter=100) G₀ = G₁ for niter in 1:maxiter @@ -473,6 +473,9 @@ function align_self(G₁::Graph, energy::Function, minblock::Int, aligner::Funct detransitive!(G₀) purge!(G₀) prune!(G₀) + + # verify that isolates are correctly reconstructed (-v flag) + verify(G₀, msg="verify align-self $niter") end return G₀ @@ -606,6 +609,9 @@ function align_pair(G₁::Graph, G₂::Graph, energy::Function, minblock::Int, a purge!(G) prune!(G) + # verify that isolates are correctly reconstructed (-v flag) + verify(G, msg="verify align-pair") + return G end @@ -625,8 +631,8 @@ The _lower_ the score, the _better_ the alignment. Only negative energies are co `compare` is the function to be used to generate pairwise distances that generate the internal guide tree. """ -function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100) - function verify(graph, msg="") +function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false) + function verify(graph; msg="") if reference !== nothing for (name,path) ∈ graph.sequence seq = sequence(path) @@ -663,7 +669,7 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h println("--> insert: $(path.node[i].block.insert[path.node[i]])") println("--> delete: $(path.node[i].block.delete[path.node[i]])") - error("--> isolate '$name' incorrectly reconstructed") + error("$msg\n--> isolate '$name' incorrectly reconstructed") end end end @@ -712,8 +718,12 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h # the lock ensures that at most N=Threads.nthreads() processes are # spawning run(`cmd`) instances at the same time G₀ = lock_semaphore(s) do - G₀ = align_pair(Gₗ, Gᵣ, energy, minblock, aligner, verify, false) - align_self(G₀, energy, minblock, aligner, verify, false) + verbose && log("--> align-pair for clade n. $n_clade") + G₀ = align_pair(Gₗ, Gᵣ, energy, minblock, aligner, verify, verbose) + verbose && log("--> align-self for clade n. $n_clade") + G₀ = align_self(G₀, energy, minblock, aligner, verify, verbose, maxiter=maxiter) + verbose && log("--> graph merging for clade n. $n_clade completed") + G₀ end # DEBUG : save graph at each iteration in a file diff --git a/src/block.jl b/src/block.jl index 7ce1aa7c..130596ab 100644 --- a/src/block.jl +++ b/src/block.jl @@ -1068,7 +1068,7 @@ function rereference(qry::Block, ref::Block, segments) end delete!(qry.gaps, x.qry-1) - # DEBUG delete only if no more insertions are present here + # DEBUG delete only if no more insertions are present in this gap if Δ.start-1 ∉ [ locus for (node, ins) ∈ combined.insert for (locus, δ) ∈ keys(ins) ] diff --git a/src/build.jl b/src/build.jl index 30c6ef42..87064bba 100644 --- a/src/build.jl +++ b/src/build.jl @@ -1,162 +1,184 @@ Build = Command( - "build", - "pangraph build [arguments]", - "align genomes into a multiple sequence alignment graph", - """zero, one or more fasta files. - files can be optionally gzipped. - multiple records within one file are treated as seperate genomes. - if no file is given, reads fasta record from stdin""", - [ - Arg( - Int, - "minimum length", - (short="-l", long="--len"), - "minimum block size for alignment graph (in nucleotides)", - 100, - ), - Arg( - Float64, - "block junction cost", - (short="-a", long="--alpha"), - "energy cost for introducing junction due to alignment merger", - 100, - ), - Arg( - Float64, - "block diversity cost", - (short="-b", long="--beta"), - "energy cost for interblock diversity due to alignment merger", - 10, - ), - Arg( - Bool, - "circular genomes", - (short="-c", long="--circular"), - "toggle if input genomes are circular", - false, - ), - Arg( - Bool, - "enforce uppercase", - (short="-u", long="--upper-case"), - "transforms all sequence to upper case", - false, - ), - Arg( - String, - "pairwise sensitivity", - (short="-s", long="--sensitivity"), - "used to set pairwise alignment sensitivity\n\trecognized options: [5, 10, 20]", - "10", - ), - Arg( - Int, - "maximum self maps", - (short="-x", long="--max-self-map"), - "maximum number of self mappings to consider per pairwise graph merger", - 100, - ), - Arg( - String, - "distance calculator", - (short="-d", long="--distance-backend"), - "backend to use to estimate pairwise distance for guide tree\n\trecognized options: [native, mash]", - "native", - ), - Arg( - String, - "alignment kernel", - (short="-k", long="--alignment-kernel"), - "backend to use for pairwise genome alignment\n\trecognized options: [minimap2, mmseqs]", - "minimap2", - ), - Arg( - Int, - "k-mer length", - (short="-K", long="--kmer-length"), - "kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default.", - 0, - ) - ], + "build", + "pangraph build [arguments]", + "align genomes into a multiple sequence alignment graph", + """zero, one or more fasta files. + files can be optionally gzipped. + multiple records within one file are treated as seperate genomes. + if no file is given, reads fasta record from stdin""", + [ + Arg( + Int, + "minimum length", + (short = "-l", long = "--len"), + "minimum block size for alignment graph (in nucleotides)", + 100, + ), + Arg( + Float64, + "block junction cost", + (short = "-a", long = "--alpha"), + "energy cost for introducing junction due to alignment merger", + 100, + ), + Arg( + Float64, + "block diversity cost", + (short = "-b", long = "--beta"), + "energy cost for interblock diversity due to alignment merger", + 10, + ), + Arg( + Bool, + "circular genomes", + (short = "-c", long = "--circular"), + "toggle if input genomes are circular", + false, + ), + Arg( + Bool, + "enforce uppercase", + (short = "-u", long = "--upper-case"), + "transforms all sequence to upper case", + false, + ), + Arg( + String, + "pairwise sensitivity", + (short = "-s", long = "--sensitivity"), + "used to set pairwise alignment sensitivity\n\trecognized options: [5, 10, 20]", + "10", + ), + Arg( + Int, + "maximum self maps", + (short = "-x", long = "--max-self-map"), + "maximum number of self mappings to consider per pairwise graph merger", + 100, + ), + Arg( + String, + "distance calculator", + (short = "-d", long = "--distance-backend"), + "backend to use to estimate pairwise distance for guide tree\n\trecognized options: [native, mash]", + "native", + ), + Arg( + String, + "alignment kernel", + (short = "-k", long = "--alignment-kernel"), + "backend to use for pairwise genome alignment\n\trecognized options: [minimap2, mmseqs]", + "minimap2", + ), + Arg( + Int, + "k-mer length", + (short = "-K", long = "--kmer-length"), + "kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default.", + 0, + ), + Arg( + Bool, + "verify that input genomes can be reconstructed from output", + (short = "-v", long = "--verify"), + "toggle to activate consistency check at each step of the graph merging process", + false, + ), + ], + (args) -> let + files = parse(Build, args) + files = if files === nothing || length(files) == 0 + ["/dev/stdin"] + else + files + end - (args) -> let - files = parse(Build, args) - files = if files === nothing || length(files) == 0 - ["/dev/stdin"] - else - files - end + minblock = arg(Build, "-l") + circular = arg(Build, "-c") + uppercase = arg(Build, "-u") - minblock = arg(Build, "-l") - circular = arg(Build, "-c") - uppercase = arg(Build, "-u") + α = arg(Build, "-a") + β = arg(Build, "-b") - α = arg(Build, "-a") - β = arg(Build, "-b") + energy = function (aln) + len = aln.length + len < minblock && return Inf - energy = function(aln) - len = aln.length - len < minblock && return Inf + cuts(hit) = (hit.start > minblock) + ((hit.length - hit.stop) > minblock) - cuts(hit) = (hit.start > minblock) + ((hit.length-hit.stop) > minblock) + ncuts = cuts(aln.qry) + cuts(aln.ref) + nmuts = aln.divergence * aln.length - ncuts = cuts(aln.qry)+cuts(aln.ref) - nmuts = aln.divergence*aln.length + return -len + α * ncuts + β * nmuts + end - return -len + α*ncuts + β*nmuts - end - - maxiter = arg(Build, "-x") - sensitivity = @match arg(Build, "-s") begin - "5" => "asm5" - "10" => "asm10" - "20" => "asm20" - _ => begin + maxiter = arg(Build, "-x") + sensitivity = @match arg(Build, "-s") begin + "5" => "asm5" + "10" => "asm10" + "20" => "asm20" + _ => begin usage(Build) exit(1) end - end + end - graph(io) = graphs(io; circular=circular, upper=uppercase) - isolates = (G for file in files for G ∈ open(graph,file)) + graph(io) = graphs(io; circular = circular, upper = uppercase) + isolates = (G for file in files for G ∈ open(graph, file)) - compare = @match arg(Build, "-d") begin - "native" => Graphs.Mash.distance - "mash" => begin - if !Graphs.havecommand("mash") - panic("external command mash not found. either install or use native backend\n") - end - Graphs.mash - end - _ => begin + compare = @match arg(Build, "-d") begin + "native" => Graphs.Mash.distance + "mash" => begin + if !Graphs.havecommand("mash") + panic( + "external command mash not found. either install or use native backend\n", + ) + end + Graphs.mash + end + _ => begin # XXX: hacky... Build.arg[8].value = "native" usage(Build) exit(1) - end - end + end + end + + aligner(contigs₁, contigs₂) = @match arg(Build, "-k") begin + "minimap2" => Minimap.align(contigs₁, contigs₂, minblock, sensitivity) + "mmseqs" => let + if !Shell.havecommand("mmseqs") + panic( + "external command mmseqs not found. please install before running build step with mmseqs backend\n", + ) + end + kmer_len = arg(Build, "-K") + MMseqs.align(contigs₁, contigs₂, kmer_len) + end + _ => error("unrecognized alignment kernel") + end - aligner(contigs₁, contigs₂) = @match arg(Build, "-k") begin - "minimap2" => Minimap.align(contigs₁, contigs₂, minblock, sensitivity) - "mmseqs" => let - if !Shell.havecommand("mmseqs") - panic("external command mmseqs not found. please install before running build step with mmseqs backend\n") - end - kmer_len = arg(Build, "-K") - MMseqs.align(contigs₁, contigs₂, kmer_len) - end - _ => error("unrecognized alignment kernel") - end + reference = + arg(Build, "-v") ? + Dict(begin + key = collect(keys(G.sequence))[1] + seq = sequence(collect(values(G.sequence))[1]) + key => seq + end for G in isolates) : nothing - graph = Graphs.align(aligner, isolates...; - compare = compare, - energy = energy, - minblock = minblock, - maxiter = maxiter, - ) - finalize!(graph) + graph = Graphs.align( + aligner, + isolates...; + compare = compare, + energy = energy, + minblock = minblock, + maxiter = maxiter, + reference = reference, + verbose = false, + ) + finalize!(graph) - marshal(stdout, graph; fmt=:json) - end + marshal(stdout, graph; fmt = :json) + end, ) From f7c0ae200a6e4433b136ddc3c9b5821963780379 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Mon, 19 Jun 2023 16:41:46 +0200 Subject: [PATCH 07/19] feat: activate consistency check at trace --- trace.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/trace.jl b/trace.jl index a5295b86..90032cc6 100644 --- a/trace.jl +++ b/trace.jl @@ -15,9 +15,9 @@ PanGraph.main(["help", "version"]) # version usage PanGraph.main(["help", "help"]) # help usage # build (native - mmseqs) -PanGraph.main(["build", "-c", "-u", "-b", "0", "-a", "0", "$root/test.fa"]) +PanGraph.main(["build", "-v", "-c", "-u", "-b", "0", "-a", "0", "$root/test.fa"]) # PanGraph.main(["build", "-c", "-u", "-d", "mash", "$root/test.fa"]) -PanGraph.main(["build", "-c", "-u", "-b", "0", "-a", "0", "-k", "mmseqs", "-K", "8", "$root/test.fa"]) +PanGraph.main(["build", "-v", "-c", "-u", "-b", "0", "-a", "0", "-k", "mmseqs", "-K", "8", "$root/test.fa"]) # export PanGraph.main(["export", "-o", "$root/export", "$root/test.json"]) From 10ba8e6b6968087acfba670eab78baef193e1f8f Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Mon, 19 Jun 2023 16:52:01 +0200 Subject: [PATCH 08/19] chore: updated version and changelog --- CHANGELOG.md | 3 ++- Project.toml | 2 +- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index 58e67687..ae974c9b 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,8 @@ ## v0.7.0 (draft) - fasta input files are checked for duplicated records, and white lines between records are tolerated, see [#55](https://github.com/neherlab/pangraph/pull/55). -- PanGraph execution is now deterministic, and same input files always produce the same output. +- PanGraph execution is now deterministic, and same input files always produce the same output, see [#57](https://github.com/neherlab/pangraph/pull/57). +- introduced the `-v` flag in the `build` command. This activates consistency checks at each mergers. In this check we verify that the input genomes can be exactly reconstructed from the graph. See [#57](https://github.com/neherlab/pangraph/pull/57). - Fixed [#56](https://github.com/neherlab/pangraph/issues/56) ## v0.6.3 diff --git a/Project.toml b/Project.toml index f8d7b2f9..171bc211 100644 --- a/Project.toml +++ b/Project.toml @@ -1,7 +1,7 @@ name = "PanGraph" uuid = "0f9f61ca-f32c-45e1-b3bc-00138f4f8814" authors = ["Nicholas Noll "] -version = "0.6.3" +version = "0.7.0" [deps] Dates = "ade2ca70-3891-5945-98fb-dc099432e06a" From 59112a538e7b9829b73739a755979a6e0f021141 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Mon, 19 Jun 2023 17:04:33 +0200 Subject: [PATCH 09/19] chore: added -v flag to cli-tests --- CHANGELOG.md | 2 +- tests/run-cli-tests.sh | 6 +++--- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ae974c9b..d72dde7f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,6 @@ # PanGraph Changelog -## v0.7.0 (draft) +## v0.7.0 - fasta input files are checked for duplicated records, and white lines between records are tolerated, see [#55](https://github.com/neherlab/pangraph/pull/55). - PanGraph execution is now deterministic, and same input files always produce the same output, see [#57](https://github.com/neherlab/pangraph/pull/57). diff --git a/tests/run-cli-tests.sh b/tests/run-cli-tests.sh index fc171a6b..991ef2cc 100644 --- a/tests/run-cli-tests.sh +++ b/tests/run-cli-tests.sh @@ -49,14 +49,14 @@ pangraph generate -s 100 -r 1e-1 -m 1e-3 -d 5e-2 -i 1e-2 -t 5 "$TESTDIR/randseqs echo "Test pangraph build - minimap asm20 no energy" export JULIA_NUM_THREADS=4 -pangraph build -c -k minimap2 -s 20 -a 0 -b 0 "$TESTDIR/input.fa" > "$TESTDIR/test1.json" +pangraph build -v -c -k minimap2 -s 20 -a 0 -b 0 "$TESTDIR/input.fa" > "$TESTDIR/test1.json" echo "Test pangraph build - mash" -pangraph build -c -d mash "$TESTDIR/input.fa" > "$TESTDIR/test2.json" +pangraph build -v -c -d mash "$TESTDIR/input.fa" > "$TESTDIR/test2.json" echo "Test pangraph build - mmseqs" export JULIA_NUM_THREADS=1 -pangraph build -c -k mmseqs -K 8 "$TESTDIR/input.fa" > "$TESTDIR/test3.json" +pangraph build -v -c -k mmseqs -K 8 "$TESTDIR/input.fa" > "$TESTDIR/test3.json" echo "Test pangraph polish" pangraph polish -c -l 10000 "$TESTDIR/test1.json" > "$TESTDIR/polished.json" From a2fe79efbdcfff664ece5f2394b176917491cf36 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Mon, 19 Jun 2023 18:42:03 +0200 Subject: [PATCH 10/19] feat: OrderedDict in unmarshall --- src/graph.jl | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/src/graph.jl b/src/graph.jl index 13664c6d..171aeef1 100644 --- a/src/graph.jl +++ b/src/graph.jl @@ -101,6 +101,8 @@ using .Intervals import .Shell: mash, mafft, havecommand import ..PanGraph: PanContigs +using OrderedCollections: OrderedDict + export Graph export Shell, Blocks, Nodes, Utility @@ -508,9 +510,9 @@ function unmarshal(io) b.sequence, b.gaps, # empty until we build the required node{block} objects - Dict{Node{Block},SNPMap}(), - Dict{Node{Block},InsMap}(), - Dict{Node{Block},DelMap}(), + OrderedDict{Node{Block},SNPMap}(), + OrderedDict{Node{Block},InsMap}(), + OrderedDict{Node{Block},DelMap}(), ) end) From 88b1411933026c2d40440aa68b34ddda2367635a Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Tue, 20 Jun 2023 17:25:22 +0200 Subject: [PATCH 11/19] feat: implement deep copy for graphs --- src/graph.jl | 46 ++++++++++++++++++++++++++++++++++++++++++++++ src/marginalize.jl | 2 +- 2 files changed, 47 insertions(+), 1 deletion(-) diff --git a/src/graph.jl b/src/graph.jl index 171aeef1..ddee8683 100644 --- a/src/graph.jl +++ b/src/graph.jl @@ -156,6 +156,52 @@ function Graph(name::String, sequence::Array{UInt8}; circular=false) ) end +""" + copy(G::Graph) + +Returns a deep copy of `G`. +""" +function copy(G::Graph) + blocks = deepcopy(G.block) + paths = deepcopy(G.sequence) + nodes_match = Dict{Node, Node}() + for (name, path) in G.sequence + new_nodes = Node[] + for node in path.node + if node ∉ keys(nodes_match) + nodes_match[node] = Node(blocks[node.block.uuid], node.strand) + end + push!(new_nodes, nodes_match[node]) + end + paths[name].node = new_nodes + end + for (uuid, block) in G.block + + new_ins = OrderedDict{Node, InsMap}() + for (k, v) in block.insert + new_node = nodes_match[k] + new_ins[new_node] = deepcopy(v) + end + blocks[uuid].insert = new_ins + + new_del = OrderedDict{Node, DelMap}() + for (k, v) in block.delete + new_node = nodes_match[k] + new_del[new_node] = deepcopy(v) + end + blocks[uuid].delete = new_del + + new_snp = OrderedDict{Node, SNPMap}() + for (k, v) in block.mutate + new_node = nodes_match[k] + new_snp[new_node] = deepcopy(v) + end + blocks[uuid].mutate = new_snp + + end + return Graph(blocks, paths) +end + """ Utility function that raises an error if the list of records has entries with duplicated diff --git a/src/marginalize.jl b/src/marginalize.jl index 3cb73f1d..7ee6353f 100644 --- a/src/marginalize.jl +++ b/src/marginalize.jl @@ -47,7 +47,7 @@ Marginalize = Command( pairs = [ (n₁,n₂) for n₁ in names for n₂ in names if n₁ < n₂ ] Threads.@threads for (name₁, name₂) in pairs - G = deepcopy(graph) + G = Graphs.copy(graph) Graphs.keeponly!(G, name₁, name₂) if reduce changed = true From a9439fba35d572f2d460ff03dabedf9bce735410 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Tue, 20 Jun 2023 17:50:01 +0200 Subject: [PATCH 12/19] feat: added consistency check in marginalize --- docs/src/cli/marginalize.md | 1 + src/graph.jl | 4 +- src/marginalize.jl | 201 +++++++++++++++++++++--------------- 3 files changed, 119 insertions(+), 87 deletions(-) diff --git a/docs/src/cli/marginalize.md b/docs/src/cli/marginalize.md index 60256e6d..f8604293 100644 --- a/docs/src/cli/marginalize.md +++ b/docs/src/cli/marginalize.md @@ -9,6 +9,7 @@ Compute all pairwise marginalizations of a multiple sequence alignment pangraph. | Output path | String | o | output-path | Path to direcotry where the output of all pairwise mariginalizations will be stored if supplied | | Reduce paralogs | Boolean | r | reduce-paralog | Collapses coparallel paths through duplicated blocks. | | Projection strains | String | s | Strains | Collapses the graph structure to only blocks and edges contained by the paths of the supplied strain names. comma seperated, no spaces | +| Consistency check | Boolean | v | verify | toggle to activate consistency check: verifies that output genomes are exactly equal to input genomes | ## Arguments Zero or one pangraph file which must be formatted as a JSON. diff --git a/src/graph.jl b/src/graph.jl index ddee8683..ad3814b7 100644 --- a/src/graph.jl +++ b/src/graph.jl @@ -157,11 +157,11 @@ function Graph(name::String, sequence::Array{UInt8}; circular=false) end """ - copy(G::Graph) + copy_graph(G::Graph) Returns a deep copy of `G`. """ -function copy(G::Graph) +function copy_graph(G::Graph) blocks = deepcopy(G.block) paths = deepcopy(G.sequence) nodes_match = Dict{Node, Node}() diff --git a/src/marginalize.jl b/src/marginalize.jl index 7ee6353f..3d133d5c 100644 --- a/src/marginalize.jl +++ b/src/marginalize.jl @@ -1,94 +1,125 @@ Marginalize = Command( - "marginalize", - "pangraph marginalize [arguments]", - "computes all pairwise marginalizations of a multiple sequence alignment graph", - """multiple sequence alignment accepted in formats: [json]""", - [ - Arg( - String, - "output path", - (short="-o", long="--output-path"), - "path to directory where all pairwise marginalizations will be stored\n\tif empty, will skip this computation", - "" - ), - Arg( - Bool, - "reduce paralog paths", - (short="-r", long="--reduce-paralog"), - "collapse coparallel paths through duplications", - false, - ), - Arg( - String, - "isolates to project onto", - (short="-s", long="--strains"), - "collapse the graph to only blocks contained by paths of the given isolates.\n\tcomma seperated list, no spaces", - "", - ), - ], - (args) -> let - path = parse(Marginalize, args) - path = if (path === nothing || length(path) == 0) - nothing - elseif length(path) == 1 - path - else - return 2 - end + "marginalize", + "pangraph marginalize [arguments]", + "computes all pairwise marginalizations of a multiple sequence alignment graph", + """multiple sequence alignment accepted in formats: [json]""", + [ + Arg( + String, + "output path", + (short = "-o", long = "--output-path"), + "path to directory where all pairwise marginalizations will be stored\n\tif empty, will skip this computation", + "", + ), + Arg( + Bool, + "reduce paralog paths", + (short = "-r", long = "--reduce-paralog"), + "collapse coparallel paths through duplications", + false, + ), + Arg( + String, + "isolates to project onto", + (short = "-s", long = "--strains"), + "collapse the graph to only blocks contained by paths of the given isolates.\n\tcomma seperated list, no spaces", + "", + ), + Arg( + Bool, + "check that genomes are preserved", + (short = "-v", long = "--verify"), + "sanity check that verifies that genomes in the output graphs are identical\n\tto genomes in the input graphs.", + false, + ), + ], + (args) -> let + path = parse(Marginalize, args) + path = if (path === nothing || length(path) == 0) + nothing + elseif length(path) == 1 + path + else + return 2 + end - graph = load(path, Marginalize) - names = collect(keys(graph.sequence)) + graph = load(path, Marginalize) + names = collect(keys(graph.sequence)) - reduce = arg(Marginalize, "-r") - output = arg(Marginalize, "-o") + reduce = arg(Marginalize, "-r") + output = arg(Marginalize, "-o") + verify_flag = arg(Marginalize, "-v") - if length(output) > 0 - isdir(output) || mkpath(output) - pairs = [ (n₁,n₂) for n₁ in names for n₂ in names if n₁ < n₂ ] + function verify(G₀, G₋) + # verifies that after the marginalization all genomes from + # G₋ are exactly equal to their initial value in in G₀ + for (name, path) in G₋.sequence + seq₀ = sequence(G₀.sequence[name], shift = true) + seq₋ = sequence(path, shift = true) + @assert seq₀ == seq₋ + end + end - Threads.@threads for (name₁, name₂) in pairs - G = Graphs.copy(graph) - Graphs.keeponly!(G, name₁, name₂) - if reduce - changed = true - while changed - l = length(G.block) - Graphs.deparalog!(G) - Graphs.detransitive!(G) - changed = l != length(G.block) - end - else - Graphs.detransitive!(G) - end + function log(msg...) + println(stderr, msg...) + flush(stderr) + end - # recompute positions - Graphs.finalize!(G) - open("$(output)/$(name₁)-$(name₂).json", "w") do io - marshal(io, G; fmt=:json) - end - end - end + if length(output) > 0 + isdir(output) || mkpath(output) + pairs = [(n₁, n₂) for n₁ in names for n₂ in names if n₁ < n₂] - isolates = arg(Marginalize, "-s") - if length(isolates) > 0 - names = split(isolates,',') - Graphs.keeponly!(graph, names...) + Threads.@threads for (name₁, name₂) in pairs + G = Graphs.copy_graph(graph) + Graphs.keeponly!(G, name₁, name₂) + if reduce + changed = true + while changed + l = length(G.block) + Graphs.deparalog!(G) + Graphs.detransitive!(G) + changed = l != length(G.block) + end + else + Graphs.detransitive!(G) + end - if reduce - changed = true - while changed - l = length(graph.block) - Graphs.deparalog!(graph) - Graphs.detransitive!(graph) - changed = l != length(graph.block) - end - else - Graphs.detransitive!(graph) - end - - # recompute positions - Graphs.finalize!(graph) - marshal(stdout, graph; fmt=:json) - end - end + # recompute positions + Graphs.finalize!(G) + + verify_flag && verify(graph, G) + + open("$(output)/$(name₁)-$(name₂).json", "w") do io + marshal(io, G; fmt = :json) + end + end + end + + isolates = arg(Marginalize, "-s") + if length(isolates) > 0 + + G = Graphs.copy(graph) + names = split(isolates, ',') + Graphs.keeponly!(G, names...) + + if reduce + changed = true + while changed + l = length(G.block) + Graphs.deparalog!(G) + Graphs.detransitive!(G) + changed = l != length(G.block) + end + else + Graphs.detransitive!(G) + end + + # recompute positions + Graphs.finalize!(G) + + verify_flag && verify(graph, G) + + marshal(stdout, G; fmt = :json) + end + end, ) From 067789fa478ff6ce0dbbe1055bb617b126f2cc96 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Tue, 20 Jun 2023 17:52:47 +0200 Subject: [PATCH 13/19] fix: copy -> copy_graph --- src/marginalize.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/marginalize.jl b/src/marginalize.jl index 3d133d5c..b65ea80c 100644 --- a/src/marginalize.jl +++ b/src/marginalize.jl @@ -98,7 +98,7 @@ Marginalize = Command( isolates = arg(Marginalize, "-s") if length(isolates) > 0 - G = Graphs.copy(graph) + G = Graphs.copy_graph(graph) names = split(isolates, ',') Graphs.keeponly!(G, names...) From 08ef580db3c7c59165337f908f4ff42724d091d5 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Tue, 20 Jun 2023 18:01:37 +0200 Subject: [PATCH 14/19] chore: changelog and cli-tests --- CHANGELOG.md | 2 +- tests/run-cli-tests.sh | 2 +- trace.jl | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index d72dde7f..cf04ecba 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ - fasta input files are checked for duplicated records, and white lines between records are tolerated, see [#55](https://github.com/neherlab/pangraph/pull/55). - PanGraph execution is now deterministic, and same input files always produce the same output, see [#57](https://github.com/neherlab/pangraph/pull/57). -- introduced the `-v` flag in the `build` command. This activates consistency checks at each mergers. In this check we verify that the input genomes can be exactly reconstructed from the graph. See [#57](https://github.com/neherlab/pangraph/pull/57). +- introduced the `-v` flag in the `build` and `merge` command. This activates consistency checks to verify that the input genomes can be exactly reconstructed. See [#57](https://github.com/neherlab/pangraph/pull/57). - Fixed [#56](https://github.com/neherlab/pangraph/issues/56) ## v0.6.3 diff --git a/tests/run-cli-tests.sh b/tests/run-cli-tests.sh index 991ef2cc..f34673d8 100644 --- a/tests/run-cli-tests.sh +++ b/tests/run-cli-tests.sh @@ -68,7 +68,7 @@ echo "Test pangraph PanX export" pangraph export -ng -pX -o "$TESTDIR/export" "$TESTDIR/test1.json" echo "Test pangraph marginalize" -pangraph marginalize -o "$TESTDIR/marginalize" "$TESTDIR/test1.json" +pangraph marginalize -v -o "$TESTDIR/marginalize" "$TESTDIR/test1.json" # remove generated files rm -r $TESTDIR diff --git a/trace.jl b/trace.jl index 90032cc6..0f1c8d3a 100644 --- a/trace.jl +++ b/trace.jl @@ -26,7 +26,7 @@ PanGraph.main(["export", "-o", "$root/export", "$root/test.json"]) PanGraph.main(["generate", "-o", "$root/export.json", "$root/test.fa"]) # marginalize -PanGraph.main(["marginalize", "-o", "$root/pairs", "$root/test.json"]) +PanGraph.main(["marginalize", "-v", "-o", "$root/pairs", "$root/test.json"]) # polish PanGraph.main(["polish", "-c", "-l", "10000", "$root/test.json"]) \ No newline at end of file From 53bc71bd1b14cd1978413b85d882f0955a088f8f Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Sat, 24 Jun 2023 13:40:15 +0200 Subject: [PATCH 15/19] feat: add gap positions consistency check --- src/build.jl | 4 ++++ src/graph.jl | 30 ++++++++++++++++++++++++++++++ 2 files changed, 34 insertions(+) diff --git a/src/build.jl b/src/build.jl index 87064bba..e88be8d5 100644 --- a/src/build.jl +++ b/src/build.jl @@ -159,6 +159,7 @@ Build = Command( _ => error("unrecognized alignment kernel") end + # if testing is set to true, collect dictionary of reference sequences for comparisons reference = arg(Build, "-v") ? Dict(begin @@ -179,6 +180,9 @@ Build = Command( ) finalize!(graph) + # test graph consistency + arg(Build, "-v") && Graphs.consistency_check(graph) + marshal(stdout, graph; fmt = :json) end, ) diff --git a/src/graph.jl b/src/graph.jl index ad3814b7..538b0b98 100644 --- a/src/graph.jl +++ b/src/graph.jl @@ -678,6 +678,36 @@ function finalize!(g) end end +""" + consistency_check(G::Graph) + +performs final consistency checks on the graph. Implemented checks for now are: +- check 1-1 correspondence between gaps and insertion positions in block alignments. +""" +function consistency_check(graph) + + # for each block, check that each insertion corresponds to a gap + # and each gap has at least one insertion + for (id, b) in graph.block + + # collect gap positions + gaps = b.gaps |> keys |> collect + + # collect insertion positions + ins_pos = b.insert |> values .|> keys + ins_pos = [first(i) for ip in ins_pos for i in ip if length(i) == 2] |> unique + + # check 1-1 correspondence + if Set(gaps) != Set(ins_pos) + msg = "Consistency check failed in block $id:\n" + msg *= "no 1-1 correspondence between\n" + msg *= "gaps -> $gaps\n" + msg *= "insertion positions -> $ins_pos\n" + error(msg) + end + end +end + # ------------------------------------------------------------------------ # main point of entry From c461450885dabd068baf3e9fedc1c4b8c52280e3 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Sat, 24 Jun 2023 13:43:48 +0200 Subject: [PATCH 16/19] chore: consistency check flag v -> t --- docs/src/cli/build.md | 2 +- docs/src/cli/marginalize.md | 2 +- src/build.jl | 6 +++--- src/marginalize.jl | 10 +++++----- tests/run-cli-tests.sh | 8 ++++---- trace.jl | 6 +++--- 6 files changed, 17 insertions(+), 17 deletions(-) diff --git a/docs/src/cli/build.md b/docs/src/cli/build.md index a3e3e5f8..19c3c278 100644 --- a/docs/src/cli/build.md +++ b/docs/src/cli/build.md @@ -16,7 +16,7 @@ Build a multiple sequence alignment pangraph. | distance calculator | String | d | distance-backend | only accepts "native" or "mash" | | alignment kernel | String | k | alignment-kernel | only accepts "minimap2" or "mmseqs" | | kmer length (mmseqs) | Integer | K | kmer-length | kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default. | -| consistency check | Boolean | v | verify | toggle to activate consistency check: verifies that input genomes can be exactly reconstructed from the graph | +| consistency check | Boolean | t | test | toggle to activate consistency check: verifies that input genomes can be exactly reconstructed from the graph | ## Arguments Expects one or more fasta files. diff --git a/docs/src/cli/marginalize.md b/docs/src/cli/marginalize.md index f8604293..864e1559 100644 --- a/docs/src/cli/marginalize.md +++ b/docs/src/cli/marginalize.md @@ -9,7 +9,7 @@ Compute all pairwise marginalizations of a multiple sequence alignment pangraph. | Output path | String | o | output-path | Path to direcotry where the output of all pairwise mariginalizations will be stored if supplied | | Reduce paralogs | Boolean | r | reduce-paralog | Collapses coparallel paths through duplicated blocks. | | Projection strains | String | s | Strains | Collapses the graph structure to only blocks and edges contained by the paths of the supplied strain names. comma seperated, no spaces | -| Consistency check | Boolean | v | verify | toggle to activate consistency check: verifies that output genomes are exactly equal to input genomes | +| Consistency check | Boolean | t | test | toggle to activate consistency check: verifies that output genomes are exactly equal to input genomes | ## Arguments Zero or one pangraph file which must be formatted as a JSON. diff --git a/src/build.jl b/src/build.jl index e88be8d5..318e3af4 100644 --- a/src/build.jl +++ b/src/build.jl @@ -80,7 +80,7 @@ Build = Command( Arg( Bool, "verify that input genomes can be reconstructed from output", - (short = "-v", long = "--verify"), + (short = "-t", long = "--test"), "toggle to activate consistency check at each step of the graph merging process", false, ), @@ -161,7 +161,7 @@ Build = Command( # if testing is set to true, collect dictionary of reference sequences for comparisons reference = - arg(Build, "-v") ? + arg(Build, "-t") ? Dict(begin key = collect(keys(G.sequence))[1] seq = sequence(collect(values(G.sequence))[1]) @@ -181,7 +181,7 @@ Build = Command( finalize!(graph) # test graph consistency - arg(Build, "-v") && Graphs.consistency_check(graph) + arg(Build, "-t") && Graphs.consistency_check(graph) marshal(stdout, graph; fmt = :json) end, diff --git a/src/marginalize.jl b/src/marginalize.jl index b65ea80c..cc7f6132 100644 --- a/src/marginalize.jl +++ b/src/marginalize.jl @@ -28,8 +28,8 @@ Marginalize = Command( Arg( Bool, "check that genomes are preserved", - (short = "-v", long = "--verify"), - "sanity check that verifies that genomes in the output graphs are identical\n\tto genomes in the input graphs.", + (short = "-t", long = "--test"), + "consistency check that verifies that genomes in the output graphs are identical\n\tto genomes in the input graphs.", false, ), ], @@ -48,7 +48,7 @@ Marginalize = Command( reduce = arg(Marginalize, "-r") output = arg(Marginalize, "-o") - verify_flag = arg(Marginalize, "-v") + test_flag = arg(Marginalize, "-t") function verify(G₀, G₋) # verifies that after the marginalization all genomes from @@ -87,7 +87,7 @@ Marginalize = Command( # recompute positions Graphs.finalize!(G) - verify_flag && verify(graph, G) + test_flag && verify(graph, G) open("$(output)/$(name₁)-$(name₂).json", "w") do io marshal(io, G; fmt = :json) @@ -117,7 +117,7 @@ Marginalize = Command( # recompute positions Graphs.finalize!(G) - verify_flag && verify(graph, G) + test_flag && verify(graph, G) marshal(stdout, G; fmt = :json) end diff --git a/tests/run-cli-tests.sh b/tests/run-cli-tests.sh index f34673d8..55cbbcc5 100644 --- a/tests/run-cli-tests.sh +++ b/tests/run-cli-tests.sh @@ -49,14 +49,14 @@ pangraph generate -s 100 -r 1e-1 -m 1e-3 -d 5e-2 -i 1e-2 -t 5 "$TESTDIR/randseqs echo "Test pangraph build - minimap asm20 no energy" export JULIA_NUM_THREADS=4 -pangraph build -v -c -k minimap2 -s 20 -a 0 -b 0 "$TESTDIR/input.fa" > "$TESTDIR/test1.json" +pangraph build -t -c -k minimap2 -s 20 -a 0 -b 0 "$TESTDIR/input.fa" > "$TESTDIR/test1.json" echo "Test pangraph build - mash" -pangraph build -v -c -d mash "$TESTDIR/input.fa" > "$TESTDIR/test2.json" +pangraph build -t -c -d mash "$TESTDIR/input.fa" > "$TESTDIR/test2.json" echo "Test pangraph build - mmseqs" export JULIA_NUM_THREADS=1 -pangraph build -v -c -k mmseqs -K 8 "$TESTDIR/input.fa" > "$TESTDIR/test3.json" +pangraph build -t -c -k mmseqs -K 8 "$TESTDIR/input.fa" > "$TESTDIR/test3.json" echo "Test pangraph polish" pangraph polish -c -l 10000 "$TESTDIR/test1.json" > "$TESTDIR/polished.json" @@ -68,7 +68,7 @@ echo "Test pangraph PanX export" pangraph export -ng -pX -o "$TESTDIR/export" "$TESTDIR/test1.json" echo "Test pangraph marginalize" -pangraph marginalize -v -o "$TESTDIR/marginalize" "$TESTDIR/test1.json" +pangraph marginalize -t -o "$TESTDIR/marginalize" "$TESTDIR/test1.json" # remove generated files rm -r $TESTDIR diff --git a/trace.jl b/trace.jl index 0f1c8d3a..f34339e1 100644 --- a/trace.jl +++ b/trace.jl @@ -15,9 +15,9 @@ PanGraph.main(["help", "version"]) # version usage PanGraph.main(["help", "help"]) # help usage # build (native - mmseqs) -PanGraph.main(["build", "-v", "-c", "-u", "-b", "0", "-a", "0", "$root/test.fa"]) +PanGraph.main(["build", "-t", "-c", "-u", "-b", "0", "-a", "0", "$root/test.fa"]) # PanGraph.main(["build", "-c", "-u", "-d", "mash", "$root/test.fa"]) -PanGraph.main(["build", "-v", "-c", "-u", "-b", "0", "-a", "0", "-k", "mmseqs", "-K", "8", "$root/test.fa"]) +PanGraph.main(["build", "-t", "-c", "-u", "-b", "0", "-a", "0", "-k", "mmseqs", "-K", "8", "$root/test.fa"]) # export PanGraph.main(["export", "-o", "$root/export", "$root/test.json"]) @@ -26,7 +26,7 @@ PanGraph.main(["export", "-o", "$root/export", "$root/test.json"]) PanGraph.main(["generate", "-o", "$root/export.json", "$root/test.fa"]) # marginalize -PanGraph.main(["marginalize", "-v", "-o", "$root/pairs", "$root/test.json"]) +PanGraph.main(["marginalize", "-t", "-o", "$root/pairs", "$root/test.json"]) # polish PanGraph.main(["polish", "-c", "-l", "10000", "$root/test.json"]) \ No newline at end of file From 9f7f3c1dcf8b852f1de6164509087ccda9c54700 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Tue, 27 Jun 2023 16:14:25 +0200 Subject: [PATCH 17/19] feat: added random seed in CLI --- CHANGELOG.md | 2 +- docs/src/cli/build.md | 1 + src/align.jl | 4 ++-- src/build.jl | 11 +++++++++++ 4 files changed, 15 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cf04ecba..cdd8e02a 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,7 +3,7 @@ ## v0.7.0 - fasta input files are checked for duplicated records, and white lines between records are tolerated, see [#55](https://github.com/neherlab/pangraph/pull/55). -- PanGraph execution is now deterministic, and same input files always produce the same output, see [#57](https://github.com/neherlab/pangraph/pull/57). +- PanGraph execution is now deterministic, and same input files always produce the same output, see [#57](https://github.com/neherlab/pangraph/pull/57). For the build command, a random seed can be set with the `-r` flag. - introduced the `-v` flag in the `build` and `merge` command. This activates consistency checks to verify that the input genomes can be exactly reconstructed. See [#57](https://github.com/neherlab/pangraph/pull/57). - Fixed [#56](https://github.com/neherlab/pangraph/issues/56) diff --git a/docs/src/cli/build.md b/docs/src/cli/build.md index 19c3c278..2a4dbd61 100644 --- a/docs/src/cli/build.md +++ b/docs/src/cli/build.md @@ -17,6 +17,7 @@ Build a multiple sequence alignment pangraph. | alignment kernel | String | k | alignment-kernel | only accepts "minimap2" or "mmseqs" | | kmer length (mmseqs) | Integer | K | kmer-length | kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default. | | consistency check | Boolean | t | test | toggle to activate consistency check: verifies that input genomes can be exactly reconstructed from the graph | +| random seed | Int | r | random-seed | controls the random naming of blocks | ## Arguments Expects one or more fasta files. diff --git a/src/align.jl b/src/align.jl index ba3abf0f..888f087e 100644 --- a/src/align.jl +++ b/src/align.jl @@ -631,7 +631,7 @@ The _lower_ the score, the _better_ the alignment. Only negative energies are co `compare` is the function to be used to generate pairwise distances that generate the internal guide tree. """ -function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false) +function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(hit)->(-Inf), minblock=100, reference=nothing, maxiter=100, verbose=false, rand_seed=0) function verify(graph; msg="") if reference !== nothing for (name,path) ∈ graph.sequence @@ -699,7 +699,7 @@ function align(aligner::Function, Gs::Graph...; compare=Mash.distance, energy=(h # random seed for the thread - to ensure deterministic reproducibility # in block names - Random.seed!(n_clade) + Random.seed!(rand_seed+n_clade) if isleaf(clade) close(clade.graph) diff --git a/src/build.jl b/src/build.jl index 318e3af4..77233be6 100644 --- a/src/build.jl +++ b/src/build.jl @@ -84,6 +84,13 @@ Build = Command( "toggle to activate consistency check at each step of the graph merging process", false, ), + Arg( + Int, + "random seed", + (short = "-r", long = "--random-seed"), + "random seed. Controls block random naming.", + 0, + ), ], (args) -> let files = parse(Build, args) @@ -123,6 +130,9 @@ Build = Command( end end + r_seed = arg(Build, "-r") + seed!(r_seed) + graph(io) = graphs(io; circular = circular, upper = uppercase) isolates = (G for file in files for G ∈ open(graph, file)) @@ -177,6 +187,7 @@ Build = Command( maxiter = maxiter, reference = reference, verbose = false, + rand_seed = r_seed, ) finalize!(graph) From e1c5775f090c523d9e8f29b6416758b7f053c013 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Tue, 27 Jun 2023 16:22:52 +0200 Subject: [PATCH 18/19] chore: changed docs --- docs/src/cli/build.md | 2 +- src/build.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/docs/src/cli/build.md b/docs/src/cli/build.md index 2a4dbd61..ee6ae020 100644 --- a/docs/src/cli/build.md +++ b/docs/src/cli/build.md @@ -17,7 +17,7 @@ Build a multiple sequence alignment pangraph. | alignment kernel | String | k | alignment-kernel | only accepts "minimap2" or "mmseqs" | | kmer length (mmseqs) | Integer | K | kmer-length | kmer length, only used for mmseqs2 alignment kernel. If not specified will use mmseqs default. | | consistency check | Boolean | t | test | toggle to activate consistency check: verifies that input genomes can be exactly reconstructed from the graph | -| random seed | Int | r | random-seed | controls the random naming of blocks | +| random seed | Int | r | random-seed | random seed for pangraph construction. | ## Arguments Expects one or more fasta files. diff --git a/src/build.jl b/src/build.jl index 77233be6..c7e9210b 100644 --- a/src/build.jl +++ b/src/build.jl @@ -88,7 +88,7 @@ Build = Command( Int, "random seed", (short = "-r", long = "--random-seed"), - "random seed. Controls block random naming.", + "random seed for pangraph construction.", 0, ), ], From 9c0af7ba1d5fd746e73989270b22d4675d21ff74 Mon Sep 17 00:00:00 2001 From: Marco Molari Date: Tue, 27 Jun 2023 17:01:34 +0200 Subject: [PATCH 19/19] chore: small changelog correction --- CHANGELOG.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index cdd8e02a..330eef6f 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -4,7 +4,7 @@ - fasta input files are checked for duplicated records, and white lines between records are tolerated, see [#55](https://github.com/neherlab/pangraph/pull/55). - PanGraph execution is now deterministic, and same input files always produce the same output, see [#57](https://github.com/neherlab/pangraph/pull/57). For the build command, a random seed can be set with the `-r` flag. -- introduced the `-v` flag in the `build` and `merge` command. This activates consistency checks to verify that the input genomes can be exactly reconstructed. See [#57](https://github.com/neherlab/pangraph/pull/57). +- introduced the `-t` flag in the `build` and `merge` command. This activates consistency checks to verify that the input genomes can be exactly reconstructed. See [#57](https://github.com/neherlab/pangraph/pull/57). - Fixed [#56](https://github.com/neherlab/pangraph/issues/56) ## v0.6.3