From 0e2e042fb5bb8515ca41977a1968d0db0756f782 Mon Sep 17 00:00:00 2001 From: Kevin Klein <38384885+0x002A@users.noreply.github.com> Date: Fri, 19 Nov 2021 11:31:20 +0100 Subject: [PATCH] Prepare second release (#6) --- README.md | 38 +++- include/ms/BlastFileReader.h | 2 +- include/ms/Kernel.h | 17 +- include/ms/{Lb.fwd.h => MS.fwd.h} | 0 include/ms/SequenceAccessor.h | 2 +- include/ms/graph/Edge.h | 2 +- include/ms/graph/Graph.h | 2 +- include/ms/matching/Id2OverlapMap.h | 64 ++---- include/ms/matching/MatchMap.h | 2 +- include/ms/types/Toggle.h | 2 +- libms/src/MS.fwd.cpp | 24 ++ libms/src/kernel/ap.cpp | 2 +- libms/src/kernel/dg.cpp | 8 +- libms/src/kernel/{co.cpp => ol.cpp} | 22 +- pipeline/pipeline.sh | 56 +++++ pipeline/scrubber_bfs.py | 252 +++++++++++++++++++++ pipeline/setAbundanceThresholdFromHisto.py | 36 +++ pipeline/unitig_filter.py | 185 +++++++++++++++ src/main.cpp | 15 +- 19 files changed, 645 insertions(+), 86 deletions(-) rename include/ms/{Lb.fwd.h => MS.fwd.h} (100%) create mode 100644 libms/src/MS.fwd.cpp rename libms/src/kernel/{co.cpp => ol.cpp} (83%) create mode 100644 pipeline/pipeline.sh create mode 100644 pipeline/scrubber_bfs.py create mode 100644 pipeline/setAbundanceThresholdFromHisto.py create mode 100644 pipeline/unitig_filter.py diff --git a/README.md b/README.md index 43435bd..8a5d987 100644 --- a/README.md +++ b/README.md @@ -3,18 +3,40 @@ **Mu**lti-**C**ore **H**ybrid **S**hort- **A**nd **L**ong-read **S**equence **A**ssembler Based on: -Gatter, Thomas, et al. "Economic genome assembly from low coverage Illumina and Nanopore data." 20th International -Workshop on Algorithms in Bioinformatics (WABI 2020). Schloss Dagstuhl-Leibniz-Zentrum für Informatik, 2020. +Gatter, T., von Löhneysen, S., Fallmann, J. et al. LazyB: fast and cheap genome assembly. Algorithms Mol Biol 16, 8 ( +2021). -# Binary +# Prerequisites -A statically linked binary for x86-64 can be downloaded from the releases page on GitHub. +`MuCHSALSA` requires the following tools to be available in the `PATH`: + +- jellyfish 2 +- bbduk +- Abyss 2 (using 'abyss-pe') +- minimap2 +- Python 3.6.9 (at least) + +# Download + +A zip file containing a statically linked binary for x86-64 alongside with the assembly pipeline and associated scripts +can be downloaded from the releases page on GitHub. # Usage -In contrast to [LazyB](https://github.com/TGatter/LazyB) this tool has an additional parameter to control the level of -parallelization. Simply plug it into the LazyB-pipeline and set the additional parameter. -**ATTENTION**: If no value is given all available cores will be used. +The pipeline can be run using the following command: + +```bash +sh pipeline.sh \[k-mer-size-filter\] \[k-mer-size-assembly\] \[name\] \[illumina-inputfile-1\] \[illumina-inputfile-2\]] \[nanopore-inputfile\] \[output-folder\] +``` + +**Note**: The level of parallelization used for parts (default value: 8) of the pipeline is set via a variable +in `pipeline.sh`. + +[k-mer-size-filter] specifies the k-mer size for k-mer counting in raw illumina data. Reads with highly abundant k-mers +are removed from the data. Starting at k=50 is recommended. + +[k-mer-size-assembly] specifies the k-mer size during illumina assembly (here using Abyss). Starting at k=90 is +recommended. # Building @@ -59,5 +81,5 @@ find . -regex '.*\.\(cpp\|h\)' -exec clang-format-12 -style=file -i {} \; # Standard Library -This project uses _libc++_ as standard library. For information on building _libc++_ +This project uses `libc++-12` as standard library. For information on building `libc++` see [here](https://libcxx.llvm.org/docs/BuildingLibcxx.html). \ No newline at end of file diff --git a/include/ms/BlastFileReader.h b/include/ms/BlastFileReader.h index 206d77d..e63c729 100644 --- a/include/ms/BlastFileReader.h +++ b/include/ms/BlastFileReader.h @@ -26,7 +26,7 @@ #include -#include "Lb.fwd.h" +#include "MS.fwd.h" namespace muchsalsa { diff --git a/include/ms/Kernel.h b/include/ms/Kernel.h index f500cc0..2e8f682 100644 --- a/include/ms/Kernel.h +++ b/include/ms/Kernel.h @@ -19,8 +19,8 @@ // //===---------------------------------------------------------------------------------------------------------------==// -#ifndef INCLUDED_MUCHSALSA_PROKRASTINATOR -#define INCLUDED_MUCHSALSA_PROKRASTINATOR +#ifndef INCLUDED_MUCHSALSA_KERNEL +#define INCLUDED_MUCHSALSA_KERNEL #pragma once @@ -31,7 +31,7 @@ #include #include -#include "Lb.fwd.h" +#include "MS.fwd.h" #include "graph/Edge.h" namespace muchsalsa { @@ -40,9 +40,8 @@ namespace muchsalsa { // KERNEL // ===================================================================================================================== -std::optional computeOverlap(matching::MatchMap const &matches, std::vector &ids, - graph::Edge const &edge, bool direction, std::size_t score, - bool isPrimary); +std::optional getOverlap(matching::MatchMap const &matches, std::vector &ids, + graph::Edge const &edge, bool direction, std::size_t score, bool isPrimary); std::vector, std::size_t, bool>> getMaxPairwisePaths(matching::MatchMap const &matches, graph::Edge const &edge, @@ -55,8 +54,8 @@ graph::Graph getMaxSpanTree(graph::Graph const &graph); std::vector> getConnectedComponents(graph::Graph const &graph); -graph::DiGraph getDirectionGraph(gsl::not_null pMatchMap, graph::Graph const &graph, - graph::Graph const &connectedComponent, graph::Vertex const &startNode); +graph::DiGraph getDirectedGraph(gsl::not_null pMatchMap, graph::Graph const &graph, + graph::Graph const &connectedComponent, graph::Vertex const &startNode); std::vector> linearizeGraph(gsl::not_null pDiGraph); @@ -68,6 +67,6 @@ void assemblePath( } // namespace muchsalsa -#endif // INCLUDED_MUCHSALSA_PROKRASTINATOR +#endif // INCLUDED_MUCHSALSA_KERNEL // ---------------------------------------------------- END-OF-FILE ---------------------------------------------------- diff --git a/include/ms/Lb.fwd.h b/include/ms/MS.fwd.h similarity index 100% rename from include/ms/Lb.fwd.h rename to include/ms/MS.fwd.h diff --git a/include/ms/SequenceAccessor.h b/include/ms/SequenceAccessor.h index dcaaaa6..4104a7f 100644 --- a/include/ms/SequenceAccessor.h +++ b/include/ms/SequenceAccessor.h @@ -34,7 +34,7 @@ #include #include -#include "Lb.fwd.h" +#include "MS.fwd.h" namespace muchsalsa { diff --git a/include/ms/graph/Edge.h b/include/ms/graph/Edge.h index b4e9f60..1012ca0 100644 --- a/include/ms/graph/Edge.h +++ b/include/ms/graph/Edge.h @@ -29,7 +29,7 @@ #include #include -#include "Lb.fwd.h" +#include "MS.fwd.h" #include "types/Direction.h" #include "types/Toggle.h" diff --git a/include/ms/graph/Graph.h b/include/ms/graph/Graph.h index 0948e09..2e7dbb5 100644 --- a/include/ms/graph/Graph.h +++ b/include/ms/graph/Graph.h @@ -38,7 +38,7 @@ #include #include -#include "Lb.fwd.h" +#include "MS.fwd.h" #include "Util.h" #include "graph/Vertex.h" diff --git a/include/ms/matching/Id2OverlapMap.h b/include/ms/matching/Id2OverlapMap.h index 0756953..aad6f26 100644 --- a/include/ms/matching/Id2OverlapMap.h +++ b/include/ms/matching/Id2OverlapMap.h @@ -58,77 +58,61 @@ struct KeyEqual : public std::binary_function { // class Id2OverlapMap // ------------------- +/** + * Class representing the mapping of pairs of a illumina id and a clique index to overlaps. + */ class Id2OverlapMap { public: - /** - * Constructor. - */ - Id2OverlapMap() = default; - - /** - * Destructor. - */ - ~Id2OverlapMap() = default; - - /** - * Moving is disallowed. - */ - Id2OverlapMap(Id2OverlapMap const &) = delete; - - /** - * Copying is disallowed. - */ - Id2OverlapMap(Id2OverlapMap &&) = delete; - - /** - * Move assignment is disallowed. - */ - Id2OverlapMap &operator=(Id2OverlapMap &&) = delete; - - /** - * Copy assignment is disallowed. - */ - Id2OverlapMap &operator=(Id2OverlapMap const &) = delete; - /** * Assigment operator. * - * @param key - * @return + * @param key a const reference to a std::pair of an unsigned int and a std::size_t representing the illumina id and + * the clique index which the overlap is or should become mapped to + * @return A reference to the mapped overlap, performing an insertion if no element with a key equivalent to key exist */ std::pair &operator[](detail::key_t const &key); /** * Access operator. * - * @param key - * @return + * @param key a const reference to a std::pair of an unsigned int and a std::size_t representing the illumina id and + * the clique index which the overlap is mapped to + * @return A const reference to the mapped overlap + * @throws std::out_of_range */ std::pair const &at(detail::key_t const &key) const; /** * Access operator. * - * @param key - * @return + * @param key a const reference to a std::pair of an unsigned int and a std::size_t representing the illumina id and + * the clique index which the overlap is mapped to + * @return A reference to the mapped overlap + * @throws std::out_of_range */ std::pair &at(detail::key_t const &key); - std::size_t getSize() const; - private: std::unordered_map, detail::KeyHash, detail::KeyEqual> m_map; /*!< std::unordered_map storing the mapping */ }; +// ===================================================================================================================== +// INLINE DEFINITIONS +// ===================================================================================================================== + +// ------------------- +// class Id2OverlapMap +// ------------------- + +// PUBLIC CLASS METHODS + inline std::pair &Id2OverlapMap::operator[](detail::key_t const &key) { return m_map[key]; } inline std::pair &Id2OverlapMap::at(detail::key_t const &key) { return m_map.at(key); } inline std::pair const &Id2OverlapMap::at(detail::key_t const &key) const { return m_map.at(key); } -inline std::size_t Id2OverlapMap::getSize() const { return m_map.size(); } - } // namespace muchsalsa::matching #endif // INCLUDED_MUCHSALSA_ID2OVERLAPMAP diff --git a/include/ms/matching/MatchMap.h b/include/ms/matching/MatchMap.h index 88328ce..7a9eb4c 100644 --- a/include/ms/matching/MatchMap.h +++ b/include/ms/matching/MatchMap.h @@ -31,7 +31,7 @@ #include #include -#include "Lb.fwd.h" +#include "MS.fwd.h" #include "graph/Graph.h" #include "types/Toggle.h" diff --git a/include/ms/types/Toggle.h b/include/ms/types/Toggle.h index 378c1fb..463a6ba 100644 --- a/include/ms/types/Toggle.h +++ b/include/ms/types/Toggle.h @@ -37,7 +37,7 @@ namespace muchsalsa { /** * Class representing a toggle which can reach two possible states. * - * A toggle has the size of a bool, but supports operations like an unsigned integer. + * A toggle has the size of a bool, but supports operations like an integer. * * Example: * diff --git a/libms/src/MS.fwd.cpp b/libms/src/MS.fwd.cpp new file mode 100644 index 0000000..03e194f --- /dev/null +++ b/libms/src/MS.fwd.cpp @@ -0,0 +1,24 @@ +// -*- C++ -*- +//===---------------------------------------------------------------------------------------------------------------==// +// +// Copyright (C) 2021 Kevin Klein +// This file is part of MuCHSALSA . +// +// MuCHSALSA is free software: you can redistribute it and/or modify it under the terms of the GNU General +// Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any +// later version. +// +// MuCHSALSA is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the +// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for more +// details. +// +// You should have received a copy of the GNU General Public License along with MuCHSALSA. +// If not, see . +// +// SPDX-License-Identifier: GPL-3.0-or-later +// +//===---------------------------------------------------------------------------------------------------------------==// + +#include "MS.fwd.h" + +// ---------------------------------------------------- END-OF-FILE ---------------------------------------------------- \ No newline at end of file diff --git a/libms/src/kernel/ap.cpp b/libms/src/kernel/ap.cpp index 6dab7a2..ea6edd3 100644 --- a/libms/src/kernel/ap.cpp +++ b/libms/src/kernel/ap.cpp @@ -1090,7 +1090,7 @@ void muchsalsa::assemblePath( auto globalLeftMostPosition = globalPos1 * -1; auto const targetName = [&]() { - std::string targetName = ">Prokrastinator_"; + std::string targetName = ">muchsalsa_"; targetName.append(std::to_string(asmIdx)); return targetName; diff --git a/libms/src/kernel/dg.cpp b/libms/src/kernel/dg.cpp index 25cd6b0..7a57dec 100644 --- a/libms/src/kernel/dg.cpp +++ b/libms/src/kernel/dg.cpp @@ -32,10 +32,10 @@ #include "types/Direction.h" #include "types/Toggle.h" -muchsalsa::graph::DiGraph muchsalsa::getDirectionGraph(gsl::not_null pMatchMap, - muchsalsa::graph::Graph const & graph, - muchsalsa::graph::Graph const & connectedComponent, - muchsalsa::graph::Vertex const &startNode) { +muchsalsa::graph::DiGraph muchsalsa::getDirectedGraph(gsl::not_null pMatchMap, + muchsalsa::graph::Graph const &graph, + muchsalsa::graph::Graph const &connectedComponent, + muchsalsa::graph::Vertex const &startNode) { std::stack> stack; stack.push(std::make_tuple(&startNode, true)); diff --git a/libms/src/kernel/co.cpp b/libms/src/kernel/ol.cpp similarity index 83% rename from libms/src/kernel/co.cpp rename to libms/src/kernel/ol.cpp index 746f62a..6e94040 100644 --- a/libms/src/kernel/co.cpp +++ b/libms/src/kernel/ol.cpp @@ -29,9 +29,9 @@ namespace { -std::pair computeOverhangs(muchsalsa::matching::MatchMap const & matches, - muchsalsa::graph::Vertex const *const pVertex, - muchsalsa::graph::Edge const &edge, unsigned int illuminaId) { +std::pair getOverhangs(muchsalsa::matching::MatchMap const &matches, + muchsalsa::graph::Vertex const *const pVertex, + muchsalsa::graph::Edge const &edge, unsigned int illuminaId) { auto const pVertexMatch = gsl::make_not_null(matches.getVertexMatch(pVertex->getId(), illuminaId)); auto const pEdgeMatch = gsl::make_not_null(matches.getEdgeMatch(&edge, illuminaId)); @@ -52,19 +52,19 @@ std::pair computeOverhangs(muchsalsa::matching::MatchMap const & } // unnamed namespace -std::optional muchsalsa::computeOverlap(muchsalsa::matching::MatchMap const &matches, - std::vector & ids, - muchsalsa::graph::Edge const &edge, bool direction, - std::size_t score, bool isPrimary) { +std::optional muchsalsa::getOverlap(muchsalsa::matching::MatchMap const &matches, + std::vector &ids, + muchsalsa::graph::Edge const &edge, bool direction, + std::size_t score, bool isPrimary) { auto const &firstId = ids.front(); auto const &lastId = ids.back(); auto const vertices = edge.getVertices(); - auto const overhangsFirstIdFirstVertex = computeOverhangs(matches, vertices.first, edge, firstId); - auto const overhangsLastIdFirstVertex = computeOverhangs(matches, vertices.first, edge, lastId); - auto const overhangsFirstIdSecondVertex = computeOverhangs(matches, vertices.second, edge, firstId); - auto const overhangsLastIdSecondVertex = computeOverhangs(matches, vertices.second, edge, lastId); + auto const overhangsFirstIdFirstVertex = getOverhangs(matches, vertices.first, edge, firstId); + auto const overhangsLastIdFirstVertex = getOverhangs(matches, vertices.first, edge, lastId); + auto const overhangsFirstIdSecondVertex = getOverhangs(matches, vertices.second, edge, firstId); + auto const overhangsLastIdSecondVertex = getOverhangs(matches, vertices.second, edge, lastId); auto const leftOverhangFirstVertex = overhangsFirstIdFirstVertex.first; auto const rightOverhangFirstVertex = overhangsLastIdFirstVertex.second; diff --git a/pipeline/pipeline.sh b/pipeline/pipeline.sh new file mode 100644 index 0000000..7326d85 --- /dev/null +++ b/pipeline/pipeline.sh @@ -0,0 +1,56 @@ +#!/bin/bash +# This is an adapted version of the LazyB pipeline from +# https://github.com/TGatter/LazyB + +SCRIPT=$(readlink -f "$0") +SCRIPTPATH=$(dirname "$SCRIPT") + +K_MER_JELLY=$1 +K_MER_ABYSS=$2 +NAME=$3 +ILLUMINA_RAW_1=$4 +ILLUMINA_RAW_2=$5 +NANO=$6 +OUT_=$7 + +CORES=8 #number of cores can be set manually here +MINLENGTH=500 +ABYSS_MODE=unitigs + +case $OUT_ in + /*) OUT=$OUT_;; + *) OUT=$PWD/$OUT_;; +esac + +mkdir -p $OUT #create output folder if it doesn't already exist +TMP=$(mktemp -d -p $OUT) #create a temporary folder - deleted in the end +BASE=$(basename $NANO .fastq) + +echo ">>>> K-mer Filtering of Illumina Reads" +jellyfish count -m $K_MER_JELLY -s 100M -t $CORES -C $ILLUMINA_RAW_1 $ILLUMINA_RAW_2 -o ${TMP}/"jelly_count_k${K_MER_JELLY}.jf" +jellyfish histo ${TMP}/"jelly_count_k${K_MER_JELLY}.jf" > ${TMP}/"jelly_histo_k${K_MER_JELLY}.histo" +TOTAL_NON_UNIQUE_KMERS=$(awk '{if($1 != "1") s += $2} END{print s}' ${TMP}/"jelly_histo_k${K_MER_JELLY}.histo") +ABUNDANCE_THRESHOLD=$($SCRIPTPATH/setAbundanceThresholdFromHisto.py ${TMP}/"jelly_histo_k${K_MER_JELLY}.histo" $TOTAL_NON_UNIQUE_KMERS) +echo "abundance threshold for k-mer filtering: " $ABUNDANCE_THRESHOLD > $OUT/report.txt +jellyfish dump -L $ABUNDANCE_THRESHOLD ${TMP}/"jelly_count_k${K_MER_JELLY}.jf" > ${TMP}/"filtered_kmers_${K_MER_JELLY}_${ABUNDANCE_THRESHOLD}.fa" +bbduk.sh in1=$ILLUMINA_RAW_1 in2=$ILLUMINA_RAW_2 out1=$TMP/illu_filtered.1.fastq out2=$TMP/illu_filtered.2.fastq ref=${TMP}/"filtered_kmers_${K_MER_JELLY}_${ABUNDANCE_THRESHOLD}.fa" k=$K_MER_JELLY hdist=0 + +echo ">>>> Illumina Assembly" +mkdir -p $OUT/ABYSS #create folder "ABYSS" for ABYSS results +abyss-pe -C $OUT/ABYSS np=$CORES name=$NAME k=$K_MER_ABYSS in="${TMP}/illu_filtered.1.fastq ${TMP}/illu_filtered.2.fastq" ${ABYSS_MODE} 2>&1 | tee $OUT/ABYSS/abyss.log +awk -v min="$MINLENGTH" 'BEGIN {RS = ">" ; ORS = ""} $2 >= min {print ">"$0}' $OUT/ABYSS/"${NAME}-${ABYSS_MODE}.fa" > $OUT/ABYSS/"${NAME}-${ABYSS_MODE}.l${MINLENGTH}.fa" + +echo ">>>> Unitig Filter" +minimap2 -k15 -DP --dual=yes --no-long-join -w5 -m100 -g10000 -r2000 --max-chain-skip 25 --split-prefix foo $NANO $OUT/ABYSS/"${NAME}-${ABYSS_MODE}.l${MINLENGTH}.fa" > $OUT/01_unitigs.to_$BASE.paf +$SCRIPTPATH/unitig_filter.py $OUT/01_unitigs.to_$BASE.paf $OUT/ABYSS/"${NAME}-${ABYSS_MODE}.l${MINLENGTH}.fa" $OUT/report.txt $TMP/unitigs_corrected.fa + +echo ">>>> Scrubbing" +minimap2 -k15 -DP --dual=yes --no-long-join -w5 -m100 -g10000 -r2000 --max-chain-skip 25 --split-prefix foo $NANO $TMP/unitigs_corrected.fa > $OUT/01_contigs_corrected.to_$BASE.paf +$SCRIPTPATH/scrubber_bfs.py $OUT/01_contigs_corrected.to_$BASE.paf $NANO $OUT/02_$BASE.scrubbed.fa $TMP + +echo ">>>> Anchor Mapping" +minimap2 -k15 -DP --dual=yes --no-long-join -w5 -m100 -g10000 -r2000 --max-chain-skip 25 --split-prefix foo $OUT/02_$BASE.scrubbed.fa $TMP/unitigs_corrected.fa > $OUT/02_contigs_corrected.to_$BASE.scrubbed.paf + +echo ">>>> MuCHSALSA" +$SCRIPTPATH/muchsalsa $OUT/02_contigs_corrected.to_$BASE.scrubbed.paf $TMP/unitigs_corrected.fa $OUT/02_$BASE.scrubbed.fa $TMP $CORES +cp $TMP/temp_1.target.fa $OUT/03.assembly.unpolished.fa \ No newline at end of file diff --git a/pipeline/scrubber_bfs.py b/pipeline/scrubber_bfs.py new file mode 100644 index 0000000..42e18f5 --- /dev/null +++ b/pipeline/scrubber_bfs.py @@ -0,0 +1,252 @@ +#!/usr/bin/env python +# This is an unmodified copy of scrubber_bfs.py taken from +# https://github.com/TGatter/LazyB + +import networkx as nx +import sys, os +from Bio import SeqIO +import random +import time + +start_time = time.time() + +paf_anchors = sys.argv[1] # anchors to nanopores +seq_file_base = sys.argv[2] # nanopore reads +output_file = sys.argv[3] # scrubbed nanopore reads +tmp_dir = sys.argv[4] +output = open(output_file, "w") + +subset_size = 60000 + +if seq_file_base.endswith("fa") or seq_file_base.endswith("fasta"): + seq_dict = SeqIO.index_db(seq_file_base + ".idx", seq_file_base, "fasta") +else: + seq_dict = SeqIO.index_db(seq_file_base + ".idx", seq_file_base, "fastq") + + +def find_sequence_r(seqid, start, end): + return seq_dict[seqid].seq[start:end + 1] + + +def find_sequence(seqid): + return seq_dict[seqid].seq + + +def reverse_complement(seq): + return seq.reverse_complement() + + +def write_sequence(f, seq): + for i in range(0, len(seq), 60): + f.write(str(seq[i:i + 60]) + "\n") + + +# set minimum length for mapping hits +threshold_length = 500 +# create graph +G = nx.Graph() + +prevHitId = "" # just to prevent the error at the first line +chunkNodes = {} +############## Read Blast in and build base graph + +for line in open(paf_anchors, "r"): + + # read line into a list + + # 0 qseqid string Query sequence name + # 1 qlen int Query sequence length + # 2 qstart int Query start (0-based) + # 3 qend int Query end (1-based) + # 4 strand char Relative strand: "+" or "-" + + # 5 sseqid string Target sequence name + # 6 slen int Target sequence length + # 7 sstart int Target start on original strand (0-based) + # 8 send int Target end on original strand (0-based) + # 9 int Number of residue matches + # 10 int Alignment block length + # 11 int Mapping quality (0-255; 255 for missing) + + linetemp = line.rstrip().split("\t") + if len(linetemp) == 1: + continue + + id_1, id_2 = linetemp[0], linetemp[5] # remember about 0-based counting + + len_1, len_2 = int(linetemp[1]), int(linetemp[6]) # remember about 0-based counting + + s_1, s_2 = int(linetemp[2]), int(linetemp[7]) + e_1, e_2 = int(linetemp[3]), int(linetemp[8]) + + if e_1 - s_1 < 500: # only hits >= 500 bp count + continue + + # add node to the graph if necessary + if not G.has_node(id_2): # if this node does not exist, then create it + G.add_node(id_2) # , print_index=node_index) + G.nodes[id_2]['length'] = len_2 # set length of nanopore read + G.nodes[id_2]['illu_to_ranges'] = {} # initialize empty matches to anchors + G.nodes[id_2]['seq_to_ranges'] = {} # initialize empty matches to other nanos + + # add the anchor hit range to the vertex information + # we exclude information of second hits of the same anchor-nanopore pair + if id_1 in G.nodes[id_2]['illu_to_ranges']: + continue + + G.nodes[id_2]['illu_to_ranges'][id_1] = (s_2, e_2) + + # are we still in the same scaffold? If not, let's start a new one + if id_1 != prevHitId: + chunkNodes = [] + prevHitId = id_1 + + # here are reads associated with the same anchor + # and now iterate only through these reads + for prevId in chunkNodes: + # add edge if necessary + if not G.has_edge(prevId, id_2): + G.add_edge(prevId, id_2) + chunkNodes.append(id_2) + +total_nodes = len(G.nodes()) +print("\n" + "# Nodes " + str(total_nodes)) +print("# Edges " + str(len(G.edges())) + "\n") + + +def print_corrected_read(cid, seq_to_ranges, nano_to_ranges, length): + join = [] + for sid in seq_to_ranges: + s, e, _ = seq_to_ranges[sid] + join.append((s, e)) + for sid in nano_to_ranges: + s, e = nano_to_ranges[sid] + join.append((s, e)) + + join.sort() + + covered = [] + for s, e in join: + if len(covered) == 0: + covered.append((s, e)) + continue + + cs = covered[-1][0] + ce = covered[-1][1] + if cs <= e and s <= ce: + covered[-1] = (min(s, cs), max(e, ce)) # join + else: + covered.append((s, e)) # add + + for i, (cs, ce) in enumerate(covered): + output.write(">" + cid + "_" + str(i) + "\n") + write_sequence(output, find_sequence_r(cid, max(cs, 200), min(ce, length - 200))) + + +## take arbitrary node of G (always the min for reproducability) as starting point of traversal with bredth-first search +## add passed nodes to subset until threshold size is reached +## the nodes of this connected subgraph are mapped all versus all +## all nodes from this subset that have all their neighbors in the set are scrubbed and then removed from G +## repeat until G is empty + +# set size threshold of connected subset of nodes +bfs_subset = set() +counter = 0 + +# start printing progress +print('0/' + str(total_nodes)) + +while (len(G) != 0): + if (len(bfs_subset) > 0): + possible_starts = set(G.nodes()).difference( + bfs_subset) # avoid to choose starting vertex that is already contained in subset + + else: + possible_starts = set(G.nodes()) + start_node = min(possible_starts) + + # breadth-first-search + bfs_edges = nx.bfs_edges(G, start_node, depth_limit=subset_size) + bfs_nodes = [start_node] + [v for u, v in bfs_edges] + + # add nodes starting from a random node to subset following bread-first-search until threshold is reached + for node in bfs_nodes: + if (len(bfs_subset) >= subset_size): + break + else: + bfs_subset.add(node) + + if (len(bfs_subset) < subset_size and len(G) > len( + bfs_subset)): # the conncected component is smaller than threshold size -> so merge it to next one + continue + + # copy the set and delete all nodes with neighbors outside the set + bfs_subset_center = bfs_subset.copy() + for u, v in nx.edge_boundary(G, bfs_subset): + bfs_subset_center.discard(u) + + # map nodes of this subset against each other + # write sequences to file + f1 = open(tmp_dir + "/temp_sequences.fa", "w") + for node in bfs_subset: + f1.write(">" + node + "\n") + write_sequence(f1, find_sequence(node)) + f1.close() + + os.system( + 'minimap2 -x ava-ont ' + tmp_dir + '/temp_sequences.fa ' + tmp_dir + '/temp_sequences.fa > ' + tmp_dir + '/temp_pwa.paf 2> /dev/null') + + # read information of paf file and write to node dictionary seq_to_ranges + for line in open(tmp_dir + '/temp_pwa.paf', "r"): + + linetemp = line.rstrip().split("\t") + if len(linetemp) == 1: + continue + + id_1, id_2 = linetemp[0], linetemp[5] # remember about 0-based counting + if id_1 == id_2: + continue + + s_1, s_2 = int(linetemp[2]), int(linetemp[7]) + e_1, e_2 = int(linetemp[3]), int(linetemp[8]) + + if e_1 - s_1 < 500: + continue + + direction = linetemp[4] + + # write information for id_1 to dict + if id_2 not in G.nodes[id_1]['seq_to_ranges']: + G.nodes[id_1]['seq_to_ranges'][id_2] = (s_1, e_1, direction) + else: + str_1 = G.nodes[id_1]['seq_to_ranges'][id_2][0] + str_2 = G.nodes[id_1]['seq_to_ranges'][id_2][1] + d = G.nodes[id_1]['seq_to_ranges'][id_2][2] + if direction == d and (abs(str_1 - e_1) < 500 or abs(s_1 - str_2) < 500): + G.nodes[id_1]['seq_to_ranges'][id_2] = (min(s_1, str_1), max(e_1, str_2), direction) + + # write information for id_2 to dict + if id_1 not in G.nodes[id_2]['seq_to_ranges']: + G.nodes[id_2]['seq_to_ranges'][id_1] = (s_2, e_2, direction) + else: + str_1 = G.nodes[id_2]['seq_to_ranges'][id_1][0] + str_2 = G.nodes[id_2]['seq_to_ranges'][id_1][1] + d = G.nodes[id_2]['seq_to_ranges'][id_1][2] + if direction == d and (abs(str_1 - e_2) < 500 or abs(s_2 - str_2) < 500): + G.nodes[id_2]['seq_to_ranges'][id_1] = (min(s_2, str_1), max(e_2, str_2), direction) + + # iterate over the nodes of the subset and write scrubbed sequences to output file + for node in bfs_subset_center: + print_corrected_read(node, G.nodes[node]['seq_to_ranges'], G.nodes[node]['illu_to_ranges'], + G.nodes[node]['length']) + + # remove nodes from G + G.remove_nodes_from(bfs_subset_center) + subset_length = len(bfs_subset_center) + bfs_subset.clear() + + # print progress + counter += subset_length + print(str(counter) + '/' + str(total_nodes)) + +print("--- %s seconds ---" % (time.time() - start_time)) diff --git a/pipeline/setAbundanceThresholdFromHisto.py b/pipeline/setAbundanceThresholdFromHisto.py new file mode 100644 index 0000000..02646e9 --- /dev/null +++ b/pipeline/setAbundanceThresholdFromHisto.py @@ -0,0 +1,36 @@ +#!/usr/bin/env python +# This is an unmodified copy of setAbundanceThresholdFromHisto.py taken from +# https://github.com/TGatter/LazyB + +## Script reads the histofile of k-mer counting +## upper abundance threshold is set + +import sys +import numpy as np + +histo = sys.argv[1] +total_non_unique_k_mers = int(sys.argv[2]) + +# calculate quartiles 1 and 3 +q1_th_value = round((total_non_unique_k_mers + 1) * 0.25) +q1 = 0 +q3_th_value = round((total_non_unique_k_mers + 1) * 0.75) +q3 = 0 + +current_value = 0 +with open(histo, 'r') as file: + for line in file: + split_line = line.rstrip().split() + abundance = int(split_line[0]) + frequency = int(split_line[1]) + if (abundance > 1): # exclude k-mers with frequency 1 == non_unique k-mers + current_value += frequency + if (q1 == 0 and current_value >= q1_th_value): + q1 = abundance + elif (q3 == 0 and current_value >= q3_th_value): + q3 = abundance + break + +iqr = q3 - q1 +upper_outlier = q3 + 2 * iqr +print(str(upper_outlier)) diff --git a/pipeline/unitig_filter.py b/pipeline/unitig_filter.py new file mode 100644 index 0000000..c35633c --- /dev/null +++ b/pipeline/unitig_filter.py @@ -0,0 +1,185 @@ +#!/usr/bin/env python +# This is an unmodified copy of unitig_filter.py taken from +# https://github.com/TGatter/LazyB + +import numpy as np +import sys +from Bio import SeqIO + +paf_file = sys.argv[1] +unitigs = sys.argv[2] +output_stats = sys.argv[3] +output_file = sys.argv[4] + +unitigs_corrected = open(output_file, 'w') +stats_file = open(output_stats, 'a') + +# open up sequence map for illumina +illumina_dict = SeqIO.index_db(unitigs + ".idx", unitigs, "fasta") + + +def find_sequence_illumina(seqid, start, end): + return illumina_dict[seqid].seq[start:end + 1] + + +def write_sequence(f, seq): + for i in range(0, len(seq), 60): + f.write(str(seq[i:i + 60]) + "\n") + + +############ read input paf file ####################################################################################### +mapping_freq = {} +nano_lengths = {} + +with open(paf_file, 'r') as input_file: + current_id = '' + current_length = 0 + current_nano_ids = [] + current_profile = [] + block_counter = 0 + + for line in input_file: + split_line = line.rstrip().split('\t') + illu_id = split_line[0] + illu_length = int(split_line[1]) + illuminaStart = int(split_line[2]) + illuminaEnd = int(split_line[3]) - 1 + nano_id = split_line[5] + nano_length = split_line[6] + + if nano_id not in nano_lengths: + nano_lengths[nano_id] = int(nano_length) + + if current_id == illu_id: # still in the same block + if nano_id not in current_nano_ids: + block_counter += 1 + current_nano_ids.append(nano_id) + for pos in range(illuminaStart, illuminaEnd + 1): + current_profile[pos] += 1 + + else: # new block + if current_id != '': # close previous block, avoid beginning + mapping_freq[current_id] = [block_counter, current_length, max(current_profile)] + + # start new block + current_profile = [0] * illu_length + for pos in range(illuminaStart, illuminaEnd + 1): + current_profile[pos] += 1 + + block_counter = 1 + current_id = illu_id + current_length = illu_length + del current_nano_ids[:] + current_nano_ids.append(nano_id) + + mapping_freq[current_id] = [block_counter, current_length, max(current_profile)] # last block + + +# print("finished reading input file") + + +############# define filter cut-off #################################################################################### + +def filter_cutoff(mapping_dict): + blocks_maxcov = [x[2] for x in list(mapping_dict.values())] + # define cut-off + q1 = np.percentile(blocks_maxcov, 25) + q3 = np.percentile(blocks_maxcov, 75) + iqr = q3 - q1 + upper_whisker = q3 + 1.5 * iqr + stats_file.write(">>> unitig filter \n") + stats_file.write("upper_outlier: " + str(upper_whisker) + '\n') + stats_file.write("Q3: " + str(q3) + '\n') + + return upper_whisker, blocks_maxcov, q3 + + +cutoff, blocks_maxcov, q3 = filter_cutoff(mapping_freq) + + +########### evaluate and cut outliers ################################################################################# + +def cut_peaks(unitig_id, coverage_profile, file): + subrange = (0, 0) + below_cutoff = False + unitig_fragments = [] + for i, cov in enumerate(coverage_profile): + if cov <= q3: # set q3 as cutoff + if not below_cutoff: # start new subrange below cutoff + subrange = (i, i) + else: # prolonging subrange + subrange = (subrange[0], i) + below_cutoff = True + + else: + if below_cutoff and subrange[1] - subrange[0] + 1 >= 500: + file.write('>' + unitig_id + '_' + str(len(unitig_fragments)) + ' ' + str(subrange[1] - subrange[0] + 1) + + ' ' + str(subrange[0]) + ' ' + str(subrange[1]) + '\n') + write_sequence(file, find_sequence_illumina(unitig_id, subrange[0], subrange[1])) + unitig_fragments.append(subrange) + subrange = (0, 0) + below_cutoff = False + if below_cutoff and subrange[1] - subrange[0] + 1 >= 500: + file.write('>' + unitig_id + '_' + str(len(unitig_fragments)) + ' ' + str(subrange[1] - subrange[0] + 1) + + ' ' + str(subrange[0]) + ' ' + str(subrange[1]) + '\n') + write_sequence(file, find_sequence_illumina(unitig_id, subrange[0], subrange[1])) + unitig_fragments.append(subrange) + return unitig_fragments + + +########### detect outlier and write new unitig file ########################################################### + +with open(paf_file, 'r') as input_file: + current_id = '' + current_profile = [] + all_counter = 0 + outlier_counter = 0 + rescued_outlier = 0 + total_rescued_length = 0 + for line in input_file: + split_line = line.rstrip().split('\t') + illu_id = split_line[0] + illu_length = int(split_line[1]) + illuminaStart = int(split_line[2]) + illuminaEnd = int(split_line[3]) - 1 + + if current_id == illu_id: # still in the same block + if mapping_freq[illu_id][2] > cutoff: # outlier unitig block + for pos in range(illuminaStart, illuminaEnd + 1): + current_profile[pos] += 1 + else: # normal unitig block + continue + + else: # new block + all_counter += 1 + if current_id != '' and mapping_freq[current_id][2] > cutoff: # close previous block if outlier + fragments = cut_peaks(current_id, current_profile, unitigs_corrected) + if len(fragments) > 0: + rescued_outlier += 1 + for entry in fragments: + total_rescued_length += (entry[1] - entry[0] + 1) + outlier_counter += 1 + + if mapping_freq[illu_id][2] > cutoff: # new outlier block + current_profile = [0] * illu_length + for pos in range(illuminaStart, illuminaEnd + 1): + current_profile[pos] += 1 + current_id = illu_id + + else: # new normal unitig + unitigs_corrected.write('>' + illumina_dict[illu_id].description + '\n') + write_sequence(unitigs_corrected, illumina_dict[illu_id].seq) + current_id = illu_id + + if current_id != '' and mapping_freq[current_id][2] > cutoff: # close last block if outlier + fragments = cut_peaks(current_id, current_profile, unitigs_corrected) + if len(fragments) > 0: + rescued_outlier += 1 + outlier_counter += 1 + +stats_file.write("#all unitigs: " + str(all_counter) + '\n') +stats_file.write("#outliers: " + str(outlier_counter) + '\n') +stats_file.write("#rescued outliers: " + str(rescued_outlier) + '\n') + +unitigs_corrected.close() +stats_file.close() diff --git a/src/main.cpp b/src/main.cpp index 1037d20..b10a04b 100644 --- a/src/main.cpp +++ b/src/main.cpp @@ -284,8 +284,9 @@ auto main(int const argc, char const *argv[]) -> int { TRACE("Starting assembly\n"); //// OUTPUT FILES //// - OutputWriter outputWriter(app.getOutputFilePath() + "/temp.query.fa", app.getOutputFilePath() + "/temp.align.paf", - app.getOutputFilePath() + "/temp.target.fa"); + OutputWriter outputWriter(app.getOutputFilePath() + "/temp_1.query.fa", + app.getOutputFilePath() + "/temp_1.align.paf", + app.getOutputFilePath() + "/temp_1.target.fa"); //// /OUTPUT FILES //// std::atomic assemblyIdx{-1}; @@ -386,16 +387,16 @@ void chainingAndOverlaps(gsl::not_null const pJob) { } std::for_each(std::begin(minusPaths), std::end(minusPaths), [=](auto &minusPath) { - auto const overlap = muchsalsa::computeOverlap(*pMatchMap, std::get<0>(minusPath), *pEdge, false, - std::get<1>(minusPath), std::get<2>(minusPath)); + auto const overlap = muchsalsa::getOverlap(*pMatchMap, std::get<0>(minusPath), *pEdge, false, + std::get<1>(minusPath), std::get<2>(minusPath)); if (overlap.has_value()) { pEdge->appendOrder(std::move(overlap.value())); } }); std::for_each(std::begin(plusPaths), std::end(plusPaths), [=](auto &plusPath) { - auto const order = muchsalsa::computeOverlap(*pMatchMap, std::get<0>(plusPath), *pEdge, true, std::get<1>(plusPath), - std::get<2>(plusPath)); + auto const order = muchsalsa::getOverlap(*pMatchMap, std::get<0>(plusPath), *pEdge, true, std::get<1>(plusPath), + std::get<2>(plusPath)); if (order.has_value()) { pEdge->appendOrder(std::move(order.value())); } @@ -618,7 +619,7 @@ void assemblePaths(gsl::not_null const pJob) { auto const pMatchMap = gsl::make_not_null(std::any_cast(pJob->getParam(2))); if (pMaxNplVertex != nullptr) { - auto diGraph = muchsalsa::getDirectionGraph(pMatchMap, *pGraph, subGraph, *pMaxNplVertex); + auto diGraph = muchsalsa::getDirectedGraph(pMatchMap, *pGraph, subGraph, *pMaxNplVertex); auto const paths = muchsalsa::linearizeGraph(&diGraph); Id2OverlapMap id2OverlapMap;