From 0e2e042fb5bb8515ca41977a1968d0db0756f782 Mon Sep 17 00:00:00 2001
From: Kevin Klein <38384885+0x002A@users.noreply.github.com>
Date: Fri, 19 Nov 2021 11:31:20 +0100
Subject: [PATCH] Prepare second release (#6)

---
 README.md                                  |  38 +++-
 include/ms/BlastFileReader.h               |   2 +-
 include/ms/Kernel.h                        |  17 +-
 include/ms/{Lb.fwd.h => MS.fwd.h}          |   0
 include/ms/SequenceAccessor.h              |   2 +-
 include/ms/graph/Edge.h                    |   2 +-
 include/ms/graph/Graph.h                   |   2 +-
 include/ms/matching/Id2OverlapMap.h        |  64 ++----
 include/ms/matching/MatchMap.h             |   2 +-
 include/ms/types/Toggle.h                  |   2 +-
 libms/src/MS.fwd.cpp                       |  24 ++
 libms/src/kernel/ap.cpp                    |   2 +-
 libms/src/kernel/dg.cpp                    |   8 +-
 libms/src/kernel/{co.cpp => ol.cpp}        |  22 +-
 pipeline/pipeline.sh                       |  56 +++++
 pipeline/scrubber_bfs.py                   | 252 +++++++++++++++++++++
 pipeline/setAbundanceThresholdFromHisto.py |  36 +++
 pipeline/unitig_filter.py                  | 185 +++++++++++++++
 src/main.cpp                               |  15 +-
 19 files changed, 645 insertions(+), 86 deletions(-)
 rename include/ms/{Lb.fwd.h => MS.fwd.h} (100%)
 create mode 100644 libms/src/MS.fwd.cpp
 rename libms/src/kernel/{co.cpp => ol.cpp} (83%)
 create mode 100644 pipeline/pipeline.sh
 create mode 100644 pipeline/scrubber_bfs.py
 create mode 100644 pipeline/setAbundanceThresholdFromHisto.py
 create mode 100644 pipeline/unitig_filter.py

diff --git a/README.md b/README.md
index 43435bd..8a5d987 100644
--- a/README.md
+++ b/README.md
@@ -3,18 +3,40 @@
 **Mu**lti-**C**ore **H**ybrid **S**hort- **A**nd **L**ong-read **S**equence **A**ssembler
 
 Based on:
-Gatter, Thomas, et al. "Economic genome assembly from low coverage Illumina and Nanopore data." 20th International
-Workshop on Algorithms in Bioinformatics (WABI 2020). Schloss Dagstuhl-Leibniz-Zentrum für Informatik, 2020.
+Gatter, T., von Löhneysen, S., Fallmann, J. et al. LazyB: fast and cheap genome assembly. Algorithms Mol Biol 16, 8 (
+2021).
 
-# Binary
+# Prerequisites
 
-A statically linked binary for x86-64 can be downloaded from the releases page on GitHub.
+`MuCHSALSA` requires the following tools to be available in the `PATH`:
+
+- jellyfish 2
+- bbduk
+- Abyss 2 (using 'abyss-pe')
+- minimap2
+- Python 3.6.9 (at least)
+
+# Download
+
+A zip file containing a statically linked binary for x86-64 alongside with the assembly pipeline and associated scripts
+can be downloaded from the releases page on GitHub.
 
 # Usage
 
-In contrast to [LazyB](https://github.com/TGatter/LazyB) this tool has an additional parameter to control the level of
-parallelization. Simply plug it into the LazyB-pipeline and set the additional parameter.
-**ATTENTION**: If no value is given all available cores will be used.
+The pipeline can be run using the following command:
+
+```bash
+sh pipeline.sh \[k-mer-size-filter\] \[k-mer-size-assembly\] \[name\] \[illumina-inputfile-1\] \[illumina-inputfile-2\]] \[nanopore-inputfile\] \[output-folder\]
+```
+
+**Note**: The level of parallelization used for parts (default value: 8) of the pipeline is set via a variable
+in `pipeline.sh`.
+
+[k-mer-size-filter] specifies the k-mer size for k-mer counting in raw illumina data. Reads with highly abundant k-mers
+are removed from the data. Starting at k=50 is recommended.
+
+[k-mer-size-assembly] specifies the k-mer size during illumina assembly (here using Abyss). Starting at k=90 is
+recommended.
 
 # Building
 
@@ -59,5 +81,5 @@ find . -regex '.*\.\(cpp\|h\)' -exec clang-format-12 -style=file -i {} \;
 
 # Standard Library
 
-This project uses _libc++_ as standard library. For information on building _libc++_
+This project uses `libc++-12` as standard library. For information on building `libc++`
 see [here](https://libcxx.llvm.org/docs/BuildingLibcxx.html).
\ No newline at end of file
diff --git a/include/ms/BlastFileReader.h b/include/ms/BlastFileReader.h
index 206d77d..e63c729 100644
--- a/include/ms/BlastFileReader.h
+++ b/include/ms/BlastFileReader.h
@@ -26,7 +26,7 @@
 
 #include <gsl/pointers>
 
-#include "Lb.fwd.h"
+#include "MS.fwd.h"
 
 namespace muchsalsa {
 
diff --git a/include/ms/Kernel.h b/include/ms/Kernel.h
index f500cc0..2e8f682 100644
--- a/include/ms/Kernel.h
+++ b/include/ms/Kernel.h
@@ -19,8 +19,8 @@
 //
 //===---------------------------------------------------------------------------------------------------------------==//
 
-#ifndef INCLUDED_MUCHSALSA_PROKRASTINATOR
-#define INCLUDED_MUCHSALSA_PROKRASTINATOR
+#ifndef INCLUDED_MUCHSALSA_KERNEL
+#define INCLUDED_MUCHSALSA_KERNEL
 
 #pragma once
 
@@ -31,7 +31,7 @@
 #include <unordered_map>
 #include <vector>
 
-#include "Lb.fwd.h"
+#include "MS.fwd.h"
 #include "graph/Edge.h"
 
 namespace muchsalsa {
@@ -40,9 +40,8 @@ namespace muchsalsa {
 //                                                        KERNEL
 // =====================================================================================================================
 
-std::optional<graph::EdgeOrder> computeOverlap(matching::MatchMap const &matches, std::vector<unsigned int> &ids,
-                                               graph::Edge const &edge, bool direction, std::size_t score,
-                                               bool isPrimary);
+std::optional<graph::EdgeOrder> getOverlap(matching::MatchMap const &matches, std::vector<unsigned int> &ids,
+                                           graph::Edge const &edge, bool direction, std::size_t score, bool isPrimary);
 
 std::vector<std::tuple<std::vector<unsigned int>, std::size_t, bool>>
 getMaxPairwisePaths(matching::MatchMap const &matches, graph::Edge const &edge,
@@ -55,8 +54,8 @@ graph::Graph getMaxSpanTree(graph::Graph const &graph);
 
 std::vector<std::vector<muchsalsa::graph::Vertex *>> getConnectedComponents(graph::Graph const &graph);
 
-graph::DiGraph getDirectionGraph(gsl::not_null<matching::MatchMap *> pMatchMap, graph::Graph const &graph,
-                                 graph::Graph const &connectedComponent, graph::Vertex const &startNode);
+graph::DiGraph getDirectedGraph(gsl::not_null<matching::MatchMap *> pMatchMap, graph::Graph const &graph,
+                                graph::Graph const &connectedComponent, graph::Vertex const &startNode);
 
 std::vector<std::vector<muchsalsa::graph::Vertex const *>> linearizeGraph(gsl::not_null<graph::DiGraph *> pDiGraph);
 
@@ -68,6 +67,6 @@ void assemblePath(
 
 } // namespace muchsalsa
 
-#endif // INCLUDED_MUCHSALSA_PROKRASTINATOR
+#endif // INCLUDED_MUCHSALSA_KERNEL
 
 // ---------------------------------------------------- END-OF-FILE ----------------------------------------------------
diff --git a/include/ms/Lb.fwd.h b/include/ms/MS.fwd.h
similarity index 100%
rename from include/ms/Lb.fwd.h
rename to include/ms/MS.fwd.h
diff --git a/include/ms/SequenceAccessor.h b/include/ms/SequenceAccessor.h
index dcaaaa6..4104a7f 100644
--- a/include/ms/SequenceAccessor.h
+++ b/include/ms/SequenceAccessor.h
@@ -34,7 +34,7 @@
 #include <unordered_map>
 #include <utility>
 
-#include "Lb.fwd.h"
+#include "MS.fwd.h"
 
 namespace muchsalsa {
 
diff --git a/include/ms/graph/Edge.h b/include/ms/graph/Edge.h
index b4e9f60..1012ca0 100644
--- a/include/ms/graph/Edge.h
+++ b/include/ms/graph/Edge.h
@@ -29,7 +29,7 @@
 #include <utility>
 #include <vector>
 
-#include "Lb.fwd.h"
+#include "MS.fwd.h"
 #include "types/Direction.h"
 #include "types/Toggle.h"
 
diff --git a/include/ms/graph/Graph.h b/include/ms/graph/Graph.h
index 0948e09..2e7dbb5 100644
--- a/include/ms/graph/Graph.h
+++ b/include/ms/graph/Graph.h
@@ -38,7 +38,7 @@
 #include <utility>
 #include <vector>
 
-#include "Lb.fwd.h"
+#include "MS.fwd.h"
 #include "Util.h"
 #include "graph/Vertex.h"
 
diff --git a/include/ms/matching/Id2OverlapMap.h b/include/ms/matching/Id2OverlapMap.h
index 0756953..aad6f26 100644
--- a/include/ms/matching/Id2OverlapMap.h
+++ b/include/ms/matching/Id2OverlapMap.h
@@ -58,77 +58,61 @@ struct KeyEqual : public std::binary_function<key_t, key_t, bool> {
 // class Id2OverlapMap
 // -------------------
 
+/**
+ * Class representing the mapping of pairs of a illumina id and a clique index to overlaps.
+ */
 class Id2OverlapMap {
 public:
-  /**
-   * Constructor.
-   */
-  Id2OverlapMap() = default;
-
-  /**
-   * Destructor.
-   */
-  ~Id2OverlapMap() = default;
-
-  /**
-   * Moving is disallowed.
-   */
-  Id2OverlapMap(Id2OverlapMap const &) = delete;
-
-  /**
-   * Copying is disallowed.
-   */
-  Id2OverlapMap(Id2OverlapMap &&) = delete;
-
-  /**
-   * Move assignment is disallowed.
-   */
-  Id2OverlapMap &operator=(Id2OverlapMap &&) = delete;
-
-  /**
-   * Copy assignment is disallowed.
-   */
-  Id2OverlapMap &operator=(Id2OverlapMap const &) = delete;
-
   /**
    * Assigment operator.
    *
-   * @param key
-   * @return
+   * @param key a const reference to a std::pair of an unsigned int and a std::size_t representing the illumina id and
+   *            the clique index which the overlap is or should become mapped to
+   * @return A reference to the mapped overlap, performing an insertion if no element with a key equivalent to key exist
    */
   std::pair<int, int> &operator[](detail::key_t const &key);
 
   /**
    * Access operator.
    *
-   * @param key
-   * @return
+   * @param key a const reference to a std::pair of an unsigned int and a std::size_t representing the illumina id and
+   *            the clique index which the overlap is mapped to
+   * @return A const reference to the mapped overlap
+   * @throws std::out_of_range
    */
   std::pair<int, int> const &at(detail::key_t const &key) const;
 
   /**
    * Access operator.
    *
-   * @param key
-   * @return
+   * @param key a const reference to a std::pair of an unsigned int and a std::size_t representing the illumina id and
+   *            the clique index which the overlap is mapped to
+   * @return A reference to the mapped overlap
+   * @throws std::out_of_range
    */
   std::pair<int, int> &at(detail::key_t const &key);
 
-  std::size_t getSize() const;
-
 private:
   std::unordered_map<detail::key_t, std::pair<int, int>, detail::KeyHash, detail::KeyEqual>
       m_map; /*!< std::unordered_map storing the mapping */
 };
 
+// =====================================================================================================================
+//                                                  INLINE DEFINITIONS
+// =====================================================================================================================
+
+// -------------------
+// class Id2OverlapMap
+// -------------------
+
+// PUBLIC CLASS METHODS
+
 inline std::pair<int, int> &Id2OverlapMap::operator[](detail::key_t const &key) { return m_map[key]; }
 
 inline std::pair<int, int> &Id2OverlapMap::at(detail::key_t const &key) { return m_map.at(key); }
 
 inline std::pair<int, int> const &Id2OverlapMap::at(detail::key_t const &key) const { return m_map.at(key); }
 
-inline std::size_t Id2OverlapMap::getSize() const { return m_map.size(); }
-
 } // namespace muchsalsa::matching
 
 #endif // INCLUDED_MUCHSALSA_ID2OVERLAPMAP
diff --git a/include/ms/matching/MatchMap.h b/include/ms/matching/MatchMap.h
index 88328ce..7a9eb4c 100644
--- a/include/ms/matching/MatchMap.h
+++ b/include/ms/matching/MatchMap.h
@@ -31,7 +31,7 @@
 #include <unordered_map>
 #include <utility>
 
-#include "Lb.fwd.h"
+#include "MS.fwd.h"
 #include "graph/Graph.h"
 #include "types/Toggle.h"
 
diff --git a/include/ms/types/Toggle.h b/include/ms/types/Toggle.h
index 378c1fb..463a6ba 100644
--- a/include/ms/types/Toggle.h
+++ b/include/ms/types/Toggle.h
@@ -37,7 +37,7 @@ namespace muchsalsa {
 /**
  * Class representing a toggle which can reach two possible states.
  *
- * A toggle has the size of a bool, but supports operations like an unsigned integer.
+ * A toggle has the size of a bool, but supports operations like an integer.
  *
  * Example:
  *
diff --git a/libms/src/MS.fwd.cpp b/libms/src/MS.fwd.cpp
new file mode 100644
index 0000000..03e194f
--- /dev/null
+++ b/libms/src/MS.fwd.cpp
@@ -0,0 +1,24 @@
+// -*- C++ -*-
+//===---------------------------------------------------------------------------------------------------------------==//
+//
+// Copyright (C) 2021 Kevin Klein
+// This file is part of MuCHSALSA <https://github.com/0x002A/MuCHSALSA>.
+//
+// MuCHSALSA is free software: you can redistribute it and/or modify it under the terms of the GNU General
+// Public License as published by the Free Software Foundation, either version 3 of the License, or (at your option) any
+// later version.
+//
+// MuCHSALSA is distributed in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even the
+// implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
+// details.
+//
+// You should have received a copy of the GNU General Public License along with MuCHSALSA.
+// If not, see <http://www.gnu.org/licenses/>.
+//
+// SPDX-License-Identifier: GPL-3.0-or-later
+//
+//===---------------------------------------------------------------------------------------------------------------==//
+
+#include "MS.fwd.h"
+
+// ---------------------------------------------------- END-OF-FILE ----------------------------------------------------
\ No newline at end of file
diff --git a/libms/src/kernel/ap.cpp b/libms/src/kernel/ap.cpp
index 6dab7a2..ea6edd3 100644
--- a/libms/src/kernel/ap.cpp
+++ b/libms/src/kernel/ap.cpp
@@ -1090,7 +1090,7 @@ void muchsalsa::assemblePath(
 
   auto       globalLeftMostPosition = globalPos1 * -1;
   auto const targetName             = [&]() {
-    std::string targetName = ">Prokrastinator_";
+    std::string targetName = ">muchsalsa_";
     targetName.append(std::to_string(asmIdx));
 
     return targetName;
diff --git a/libms/src/kernel/dg.cpp b/libms/src/kernel/dg.cpp
index 25cd6b0..7a57dec 100644
--- a/libms/src/kernel/dg.cpp
+++ b/libms/src/kernel/dg.cpp
@@ -32,10 +32,10 @@
 #include "types/Direction.h"
 #include "types/Toggle.h"
 
-muchsalsa::graph::DiGraph muchsalsa::getDirectionGraph(gsl::not_null<muchsalsa::matching::MatchMap *> pMatchMap,
-                                                       muchsalsa::graph::Graph const &                graph,
-                                                       muchsalsa::graph::Graph const & connectedComponent,
-                                                       muchsalsa::graph::Vertex const &startNode) {
+muchsalsa::graph::DiGraph muchsalsa::getDirectedGraph(gsl::not_null<muchsalsa::matching::MatchMap *> pMatchMap,
+                                                      muchsalsa::graph::Graph const                 &graph,
+                                                      muchsalsa::graph::Graph const                 &connectedComponent,
+                                                      muchsalsa::graph::Vertex const                &startNode) {
   std::stack<std::tuple<graph::Vertex const *, Toggle>> stack;
   stack.push(std::make_tuple(&startNode, true));
 
diff --git a/libms/src/kernel/co.cpp b/libms/src/kernel/ol.cpp
similarity index 83%
rename from libms/src/kernel/co.cpp
rename to libms/src/kernel/ol.cpp
index 746f62a..6e94040 100644
--- a/libms/src/kernel/co.cpp
+++ b/libms/src/kernel/ol.cpp
@@ -29,9 +29,9 @@
 
 namespace {
 
-std::pair<double, double> computeOverhangs(muchsalsa::matching::MatchMap const & matches,
-                                           muchsalsa::graph::Vertex const *const pVertex,
-                                           muchsalsa::graph::Edge const &edge, unsigned int illuminaId) {
+std::pair<double, double> getOverhangs(muchsalsa::matching::MatchMap const  &matches,
+                                       muchsalsa::graph::Vertex const *const pVertex,
+                                       muchsalsa::graph::Edge const &edge, unsigned int illuminaId) {
   auto const pVertexMatch = gsl::make_not_null(matches.getVertexMatch(pVertex->getId(), illuminaId));
   auto const pEdgeMatch   = gsl::make_not_null(matches.getEdgeMatch(&edge, illuminaId));
 
@@ -52,19 +52,19 @@ std::pair<double, double> computeOverhangs(muchsalsa::matching::MatchMap const &
 
 } // unnamed namespace
 
-std::optional<muchsalsa::graph::EdgeOrder> muchsalsa::computeOverlap(muchsalsa::matching::MatchMap const &matches,
-                                                                     std::vector<unsigned int> &          ids,
-                                                                     muchsalsa::graph::Edge const &edge, bool direction,
-                                                                     std::size_t score, bool isPrimary) {
+std::optional<muchsalsa::graph::EdgeOrder> muchsalsa::getOverlap(muchsalsa::matching::MatchMap const &matches,
+                                                                 std::vector<unsigned int>           &ids,
+                                                                 muchsalsa::graph::Edge const &edge, bool direction,
+                                                                 std::size_t score, bool isPrimary) {
   auto const &firstId = ids.front();
   auto const &lastId  = ids.back();
 
   auto const vertices = edge.getVertices();
 
-  auto const overhangsFirstIdFirstVertex  = computeOverhangs(matches, vertices.first, edge, firstId);
-  auto const overhangsLastIdFirstVertex   = computeOverhangs(matches, vertices.first, edge, lastId);
-  auto const overhangsFirstIdSecondVertex = computeOverhangs(matches, vertices.second, edge, firstId);
-  auto const overhangsLastIdSecondVertex  = computeOverhangs(matches, vertices.second, edge, lastId);
+  auto const overhangsFirstIdFirstVertex  = getOverhangs(matches, vertices.first, edge, firstId);
+  auto const overhangsLastIdFirstVertex   = getOverhangs(matches, vertices.first, edge, lastId);
+  auto const overhangsFirstIdSecondVertex = getOverhangs(matches, vertices.second, edge, firstId);
+  auto const overhangsLastIdSecondVertex  = getOverhangs(matches, vertices.second, edge, lastId);
 
   auto const leftOverhangFirstVertex  = overhangsFirstIdFirstVertex.first;
   auto const rightOverhangFirstVertex = overhangsLastIdFirstVertex.second;
diff --git a/pipeline/pipeline.sh b/pipeline/pipeline.sh
new file mode 100644
index 0000000..7326d85
--- /dev/null
+++ b/pipeline/pipeline.sh
@@ -0,0 +1,56 @@
+#!/bin/bash
+# This is an adapted version of the LazyB pipeline from
+# https://github.com/TGatter/LazyB
+
+SCRIPT=$(readlink -f "$0")
+SCRIPTPATH=$(dirname "$SCRIPT")
+
+K_MER_JELLY=$1
+K_MER_ABYSS=$2
+NAME=$3
+ILLUMINA_RAW_1=$4
+ILLUMINA_RAW_2=$5
+NANO=$6
+OUT_=$7
+
+CORES=8                     #number of cores can be set manually here
+MINLENGTH=500
+ABYSS_MODE=unitigs
+
+case $OUT_ in
+  /*) OUT=$OUT_;;
+  *) OUT=$PWD/$OUT_;;
+esac
+
+mkdir -p $OUT               #create output folder if it doesn't already exist
+TMP=$(mktemp -d -p $OUT)    #create a temporary folder - deleted in the end
+BASE=$(basename $NANO .fastq)
+
+echo ">>>> K-mer Filtering of Illumina Reads"
+jellyfish count -m $K_MER_JELLY -s 100M -t $CORES -C $ILLUMINA_RAW_1 $ILLUMINA_RAW_2 -o ${TMP}/"jelly_count_k${K_MER_JELLY}.jf"
+jellyfish histo ${TMP}/"jelly_count_k${K_MER_JELLY}.jf" > ${TMP}/"jelly_histo_k${K_MER_JELLY}.histo"
+TOTAL_NON_UNIQUE_KMERS=$(awk '{if($1 != "1") s += $2} END{print s}' ${TMP}/"jelly_histo_k${K_MER_JELLY}.histo")
+ABUNDANCE_THRESHOLD=$($SCRIPTPATH/setAbundanceThresholdFromHisto.py ${TMP}/"jelly_histo_k${K_MER_JELLY}.histo" $TOTAL_NON_UNIQUE_KMERS)
+echo "abundance threshold for k-mer filtering: " $ABUNDANCE_THRESHOLD > $OUT/report.txt
+jellyfish dump -L $ABUNDANCE_THRESHOLD ${TMP}/"jelly_count_k${K_MER_JELLY}.jf" >  ${TMP}/"filtered_kmers_${K_MER_JELLY}_${ABUNDANCE_THRESHOLD}.fa"
+bbduk.sh in1=$ILLUMINA_RAW_1 in2=$ILLUMINA_RAW_2 out1=$TMP/illu_filtered.1.fastq out2=$TMP/illu_filtered.2.fastq ref=${TMP}/"filtered_kmers_${K_MER_JELLY}_${ABUNDANCE_THRESHOLD}.fa" k=$K_MER_JELLY hdist=0
+
+echo ">>>> Illumina Assembly"
+mkdir -p $OUT/ABYSS   #create folder "ABYSS" for ABYSS results
+abyss-pe -C $OUT/ABYSS np=$CORES name=$NAME k=$K_MER_ABYSS in="${TMP}/illu_filtered.1.fastq ${TMP}/illu_filtered.2.fastq" ${ABYSS_MODE} 2>&1 | tee $OUT/ABYSS/abyss.log
+awk -v min="$MINLENGTH" 'BEGIN {RS = ">" ; ORS = ""} $2 >= min {print ">"$0}' $OUT/ABYSS/"${NAME}-${ABYSS_MODE}.fa"  > $OUT/ABYSS/"${NAME}-${ABYSS_MODE}.l${MINLENGTH}.fa"
+
+echo ">>>> Unitig Filter"
+minimap2 -k15 -DP --dual=yes --no-long-join -w5 -m100 -g10000 -r2000 --max-chain-skip 25 --split-prefix foo $NANO $OUT/ABYSS/"${NAME}-${ABYSS_MODE}.l${MINLENGTH}.fa" > $OUT/01_unitigs.to_$BASE.paf
+$SCRIPTPATH/unitig_filter.py $OUT/01_unitigs.to_$BASE.paf $OUT/ABYSS/"${NAME}-${ABYSS_MODE}.l${MINLENGTH}.fa" $OUT/report.txt $TMP/unitigs_corrected.fa
+
+echo ">>>> Scrubbing"
+minimap2 -k15 -DP --dual=yes --no-long-join -w5 -m100 -g10000 -r2000 --max-chain-skip 25 --split-prefix foo $NANO $TMP/unitigs_corrected.fa > $OUT/01_contigs_corrected.to_$BASE.paf
+$SCRIPTPATH/scrubber_bfs.py $OUT/01_contigs_corrected.to_$BASE.paf $NANO $OUT/02_$BASE.scrubbed.fa $TMP
+
+echo ">>>> Anchor Mapping"
+minimap2  -k15 -DP --dual=yes --no-long-join -w5 -m100 -g10000 -r2000 --max-chain-skip 25 --split-prefix foo $OUT/02_$BASE.scrubbed.fa $TMP/unitigs_corrected.fa > $OUT/02_contigs_corrected.to_$BASE.scrubbed.paf
+
+echo ">>>> MuCHSALSA"
+$SCRIPTPATH/muchsalsa $OUT/02_contigs_corrected.to_$BASE.scrubbed.paf $TMP/unitigs_corrected.fa $OUT/02_$BASE.scrubbed.fa $TMP $CORES
+cp $TMP/temp_1.target.fa $OUT/03.assembly.unpolished.fa
\ No newline at end of file
diff --git a/pipeline/scrubber_bfs.py b/pipeline/scrubber_bfs.py
new file mode 100644
index 0000000..42e18f5
--- /dev/null
+++ b/pipeline/scrubber_bfs.py
@@ -0,0 +1,252 @@
+#!/usr/bin/env python
+# This is an unmodified copy of scrubber_bfs.py taken from
+# https://github.com/TGatter/LazyB
+
+import networkx as nx
+import sys, os
+from Bio import SeqIO
+import random
+import time
+
+start_time = time.time()
+
+paf_anchors = sys.argv[1]  # anchors to nanopores
+seq_file_base = sys.argv[2]  # nanopore reads
+output_file = sys.argv[3]  # scrubbed nanopore reads
+tmp_dir = sys.argv[4]
+output = open(output_file, "w")
+
+subset_size = 60000
+
+if seq_file_base.endswith("fa") or seq_file_base.endswith("fasta"):
+    seq_dict = SeqIO.index_db(seq_file_base + ".idx", seq_file_base, "fasta")
+else:
+    seq_dict = SeqIO.index_db(seq_file_base + ".idx", seq_file_base, "fastq")
+
+
+def find_sequence_r(seqid, start, end):
+    return seq_dict[seqid].seq[start:end + 1]
+
+
+def find_sequence(seqid):
+    return seq_dict[seqid].seq
+
+
+def reverse_complement(seq):
+    return seq.reverse_complement()
+
+
+def write_sequence(f, seq):
+    for i in range(0, len(seq), 60):
+        f.write(str(seq[i:i + 60]) + "\n")
+
+
+# set minimum length for mapping hits
+threshold_length = 500
+# create graph
+G = nx.Graph()
+
+prevHitId = ""  # just to prevent the error at the first line
+chunkNodes = {}
+############## Read Blast in and build base graph
+
+for line in open(paf_anchors, "r"):
+
+    # read line into a list
+
+    # 0	qseqid	string 	Query sequence name
+    # 1	qlen 	int 	Query sequence length
+    # 2	qstart	int 	Query start (0-based)
+    # 3	qend	int 	Query end (1-based)
+    # 4 	strand	char 	Relative strand: "+" or "-"
+
+    # 5	sseqid 	string 	Target sequence name
+    # 6	slen 	int 	Target sequence length
+    # 7	sstart	int 	Target start on original strand (0-based)
+    # 8	send	int 	Target end on original strand (0-based)
+    # 9 	int 	Number of residue matches
+    # 10 	int 	Alignment block length
+    # 11 	int 	Mapping quality (0-255; 255 for missing)
+
+    linetemp = line.rstrip().split("\t")
+    if len(linetemp) == 1:
+        continue
+
+    id_1, id_2 = linetemp[0], linetemp[5]  # remember about 0-based counting
+
+    len_1, len_2 = int(linetemp[1]), int(linetemp[6])  # remember about 0-based counting
+
+    s_1, s_2 = int(linetemp[2]), int(linetemp[7])
+    e_1, e_2 = int(linetemp[3]), int(linetemp[8])
+
+    if e_1 - s_1 < 500:  # only hits >= 500 bp count
+        continue
+
+    # add node to the graph if necessary
+    if not G.has_node(id_2):  # if this node does not exist, then create it
+        G.add_node(id_2)  # , print_index=node_index)
+        G.nodes[id_2]['length'] = len_2  # set length of nanopore read
+        G.nodes[id_2]['illu_to_ranges'] = {}  # initialize empty matches to anchors
+        G.nodes[id_2]['seq_to_ranges'] = {}  # initialize empty matches to other nanos
+
+    # add the anchor hit range to the vertex information
+    # we exclude information of second hits of the same anchor-nanopore pair
+    if id_1 in G.nodes[id_2]['illu_to_ranges']:
+        continue
+
+    G.nodes[id_2]['illu_to_ranges'][id_1] = (s_2, e_2)
+
+    # are we still in the same scaffold? If not, let's start a new one
+    if id_1 != prevHitId:
+        chunkNodes = []
+        prevHitId = id_1
+
+    # here are reads associated with the same anchor
+    # and now iterate only through these reads
+    for prevId in chunkNodes:
+        # add edge if necessary
+        if not G.has_edge(prevId, id_2):
+            G.add_edge(prevId, id_2)
+    chunkNodes.append(id_2)
+
+total_nodes = len(G.nodes())
+print("\n" + "# Nodes " + str(total_nodes))
+print("# Edges " + str(len(G.edges())) + "\n")
+
+
+def print_corrected_read(cid, seq_to_ranges, nano_to_ranges, length):
+    join = []
+    for sid in seq_to_ranges:
+        s, e, _ = seq_to_ranges[sid]
+        join.append((s, e))
+    for sid in nano_to_ranges:
+        s, e = nano_to_ranges[sid]
+        join.append((s, e))
+
+    join.sort()
+
+    covered = []
+    for s, e in join:
+        if len(covered) == 0:
+            covered.append((s, e))
+            continue
+
+        cs = covered[-1][0]
+        ce = covered[-1][1]
+        if cs <= e and s <= ce:
+            covered[-1] = (min(s, cs), max(e, ce))  # join
+        else:
+            covered.append((s, e))  # add
+
+    for i, (cs, ce) in enumerate(covered):
+        output.write(">" + cid + "_" + str(i) + "\n")
+        write_sequence(output, find_sequence_r(cid, max(cs, 200), min(ce, length - 200)))
+
+
+## take arbitrary node of G (always the min for reproducability) as starting point of traversal with bredth-first search
+## add passed nodes to subset until threshold size is reached
+## the nodes of this connected subgraph are mapped all versus all
+## all nodes from this subset that have all their neighbors in the set are scrubbed and then removed from G
+## repeat until G is empty
+
+# set size threshold of connected subset of nodes
+bfs_subset = set()
+counter = 0
+
+# start printing progress
+print('0/' + str(total_nodes))
+
+while (len(G) != 0):
+    if (len(bfs_subset) > 0):
+        possible_starts = set(G.nodes()).difference(
+            bfs_subset)  # avoid to choose starting vertex that is already contained in subset
+
+    else:
+        possible_starts = set(G.nodes())
+    start_node = min(possible_starts)
+
+    # breadth-first-search
+    bfs_edges = nx.bfs_edges(G, start_node, depth_limit=subset_size)
+    bfs_nodes = [start_node] + [v for u, v in bfs_edges]
+
+    # add nodes starting from a random node to subset following bread-first-search until threshold is reached
+    for node in bfs_nodes:
+        if (len(bfs_subset) >= subset_size):
+            break
+        else:
+            bfs_subset.add(node)
+
+    if (len(bfs_subset) < subset_size and len(G) > len(
+            bfs_subset)):  # the conncected component is smaller than threshold size -> so merge it to next one
+        continue
+
+    # copy the set and delete all nodes with neighbors outside the set
+    bfs_subset_center = bfs_subset.copy()
+    for u, v in nx.edge_boundary(G, bfs_subset):
+        bfs_subset_center.discard(u)
+
+    # map nodes of this subset against each other
+    # write sequences to file
+    f1 = open(tmp_dir + "/temp_sequences.fa", "w")
+    for node in bfs_subset:
+        f1.write(">" + node + "\n")
+        write_sequence(f1, find_sequence(node))
+    f1.close()
+
+    os.system(
+        'minimap2 -x ava-ont ' + tmp_dir + '/temp_sequences.fa ' + tmp_dir + '/temp_sequences.fa > ' + tmp_dir + '/temp_pwa.paf 2> /dev/null')
+
+    # read information of paf file and write to node dictionary seq_to_ranges
+    for line in open(tmp_dir + '/temp_pwa.paf', "r"):
+
+        linetemp = line.rstrip().split("\t")
+        if len(linetemp) == 1:
+            continue
+
+        id_1, id_2 = linetemp[0], linetemp[5]  # remember about 0-based counting
+        if id_1 == id_2:
+            continue
+
+        s_1, s_2 = int(linetemp[2]), int(linetemp[7])
+        e_1, e_2 = int(linetemp[3]), int(linetemp[8])
+
+        if e_1 - s_1 < 500:
+            continue
+
+        direction = linetemp[4]
+
+        # write information for id_1 to dict
+        if id_2 not in G.nodes[id_1]['seq_to_ranges']:
+            G.nodes[id_1]['seq_to_ranges'][id_2] = (s_1, e_1, direction)
+        else:
+            str_1 = G.nodes[id_1]['seq_to_ranges'][id_2][0]
+            str_2 = G.nodes[id_1]['seq_to_ranges'][id_2][1]
+            d = G.nodes[id_1]['seq_to_ranges'][id_2][2]
+            if direction == d and (abs(str_1 - e_1) < 500 or abs(s_1 - str_2) < 500):
+                G.nodes[id_1]['seq_to_ranges'][id_2] = (min(s_1, str_1), max(e_1, str_2), direction)
+
+        # write information for id_2 to dict
+        if id_1 not in G.nodes[id_2]['seq_to_ranges']:
+            G.nodes[id_2]['seq_to_ranges'][id_1] = (s_2, e_2, direction)
+        else:
+            str_1 = G.nodes[id_2]['seq_to_ranges'][id_1][0]
+            str_2 = G.nodes[id_2]['seq_to_ranges'][id_1][1]
+            d = G.nodes[id_2]['seq_to_ranges'][id_1][2]
+            if direction == d and (abs(str_1 - e_2) < 500 or abs(s_2 - str_2) < 500):
+                G.nodes[id_2]['seq_to_ranges'][id_1] = (min(s_2, str_1), max(e_2, str_2), direction)
+
+    # iterate over the nodes of the subset and write scrubbed sequences to output file
+    for node in bfs_subset_center:
+        print_corrected_read(node, G.nodes[node]['seq_to_ranges'], G.nodes[node]['illu_to_ranges'],
+                             G.nodes[node]['length'])
+
+    # remove nodes from G
+    G.remove_nodes_from(bfs_subset_center)
+    subset_length = len(bfs_subset_center)
+    bfs_subset.clear()
+
+    # print progress
+    counter += subset_length
+    print(str(counter) + '/' + str(total_nodes))
+
+print("--- %s seconds ---" % (time.time() - start_time))
diff --git a/pipeline/setAbundanceThresholdFromHisto.py b/pipeline/setAbundanceThresholdFromHisto.py
new file mode 100644
index 0000000..02646e9
--- /dev/null
+++ b/pipeline/setAbundanceThresholdFromHisto.py
@@ -0,0 +1,36 @@
+#!/usr/bin/env python
+# This is an unmodified copy of setAbundanceThresholdFromHisto.py taken from
+# https://github.com/TGatter/LazyB
+
+## Script reads the histofile of k-mer counting
+## upper abundance threshold is set
+
+import sys
+import numpy as np
+
+histo = sys.argv[1]
+total_non_unique_k_mers = int(sys.argv[2])
+
+# calculate quartiles 1 and 3
+q1_th_value = round((total_non_unique_k_mers + 1) * 0.25)
+q1 = 0
+q3_th_value = round((total_non_unique_k_mers + 1) * 0.75)
+q3 = 0
+
+current_value = 0
+with open(histo, 'r') as file:
+    for line in file:
+        split_line = line.rstrip().split()
+        abundance = int(split_line[0])
+        frequency = int(split_line[1])
+        if (abundance > 1):  # exclude k-mers with frequency 1 == non_unique k-mers
+            current_value += frequency
+            if (q1 == 0 and current_value >= q1_th_value):
+                q1 = abundance
+            elif (q3 == 0 and current_value >= q3_th_value):
+                q3 = abundance
+                break
+
+iqr = q3 - q1
+upper_outlier = q3 + 2 * iqr
+print(str(upper_outlier))
diff --git a/pipeline/unitig_filter.py b/pipeline/unitig_filter.py
new file mode 100644
index 0000000..c35633c
--- /dev/null
+++ b/pipeline/unitig_filter.py
@@ -0,0 +1,185 @@
+#!/usr/bin/env python
+# This is an unmodified copy of unitig_filter.py taken from
+# https://github.com/TGatter/LazyB
+
+import numpy as np
+import sys
+from Bio import SeqIO
+
+paf_file = sys.argv[1]
+unitigs = sys.argv[2]
+output_stats = sys.argv[3]
+output_file = sys.argv[4]
+
+unitigs_corrected = open(output_file, 'w')
+stats_file = open(output_stats, 'a')
+
+# open up sequence map for illumina
+illumina_dict = SeqIO.index_db(unitigs + ".idx", unitigs, "fasta")
+
+
+def find_sequence_illumina(seqid, start, end):
+    return illumina_dict[seqid].seq[start:end + 1]
+
+
+def write_sequence(f, seq):
+    for i in range(0, len(seq), 60):
+        f.write(str(seq[i:i + 60]) + "\n")
+
+
+############ read input paf file #######################################################################################
+mapping_freq = {}
+nano_lengths = {}
+
+with open(paf_file, 'r') as input_file:
+    current_id = ''
+    current_length = 0
+    current_nano_ids = []
+    current_profile = []
+    block_counter = 0
+
+    for line in input_file:
+        split_line = line.rstrip().split('\t')
+        illu_id = split_line[0]
+        illu_length = int(split_line[1])
+        illuminaStart = int(split_line[2])
+        illuminaEnd = int(split_line[3]) - 1
+        nano_id = split_line[5]
+        nano_length = split_line[6]
+
+        if nano_id not in nano_lengths:
+            nano_lengths[nano_id] = int(nano_length)
+
+        if current_id == illu_id:  # still in the same block
+            if nano_id not in current_nano_ids:
+                block_counter += 1
+                current_nano_ids.append(nano_id)
+                for pos in range(illuminaStart, illuminaEnd + 1):
+                    current_profile[pos] += 1
+
+        else:  # new block
+            if current_id != '':  # close previous block, avoid beginning
+                mapping_freq[current_id] = [block_counter, current_length, max(current_profile)]
+
+            # start new block
+            current_profile = [0] * illu_length
+            for pos in range(illuminaStart, illuminaEnd + 1):
+                current_profile[pos] += 1
+
+            block_counter = 1
+            current_id = illu_id
+            current_length = illu_length
+            del current_nano_ids[:]
+            current_nano_ids.append(nano_id)
+
+    mapping_freq[current_id] = [block_counter, current_length, max(current_profile)]  # last block
+
+
+# print("finished reading input file")
+
+
+############# define filter cut-off ####################################################################################
+
+def filter_cutoff(mapping_dict):
+    blocks_maxcov = [x[2] for x in list(mapping_dict.values())]
+    # define cut-off
+    q1 = np.percentile(blocks_maxcov, 25)
+    q3 = np.percentile(blocks_maxcov, 75)
+    iqr = q3 - q1
+    upper_whisker = q3 + 1.5 * iqr
+    stats_file.write(">>> unitig filter \n")
+    stats_file.write("upper_outlier: " + str(upper_whisker) + '\n')
+    stats_file.write("Q3: " + str(q3) + '\n')
+
+    return upper_whisker, blocks_maxcov, q3
+
+
+cutoff, blocks_maxcov, q3 = filter_cutoff(mapping_freq)
+
+
+########### evaluate and cut outliers  #################################################################################
+
+def cut_peaks(unitig_id, coverage_profile, file):
+    subrange = (0, 0)
+    below_cutoff = False
+    unitig_fragments = []
+    for i, cov in enumerate(coverage_profile):
+        if cov <= q3:  # set q3 as cutoff
+            if not below_cutoff:  # start new subrange below cutoff
+                subrange = (i, i)
+            else:  # prolonging subrange
+                subrange = (subrange[0], i)
+            below_cutoff = True
+
+        else:
+            if below_cutoff and subrange[1] - subrange[0] + 1 >= 500:
+                file.write('>' + unitig_id + '_' + str(len(unitig_fragments)) + ' ' + str(subrange[1] - subrange[0] + 1)
+                           + ' ' + str(subrange[0]) + ' ' + str(subrange[1]) + '\n')
+                write_sequence(file, find_sequence_illumina(unitig_id, subrange[0], subrange[1]))
+                unitig_fragments.append(subrange)
+                subrange = (0, 0)
+            below_cutoff = False
+    if below_cutoff and subrange[1] - subrange[0] + 1 >= 500:
+        file.write('>' + unitig_id + '_' + str(len(unitig_fragments)) + ' ' + str(subrange[1] - subrange[0] + 1)
+                   + ' ' + str(subrange[0]) + ' ' + str(subrange[1]) + '\n')
+        write_sequence(file, find_sequence_illumina(unitig_id, subrange[0], subrange[1]))
+        unitig_fragments.append(subrange)
+    return unitig_fragments
+
+
+########### detect outlier and write new unitig file   ###########################################################
+
+with open(paf_file, 'r') as input_file:
+    current_id = ''
+    current_profile = []
+    all_counter = 0
+    outlier_counter = 0
+    rescued_outlier = 0
+    total_rescued_length = 0
+    for line in input_file:
+        split_line = line.rstrip().split('\t')
+        illu_id = split_line[0]
+        illu_length = int(split_line[1])
+        illuminaStart = int(split_line[2])
+        illuminaEnd = int(split_line[3]) - 1
+
+        if current_id == illu_id:  # still in the same block
+            if mapping_freq[illu_id][2] > cutoff:  # outlier unitig block
+                for pos in range(illuminaStart, illuminaEnd + 1):
+                    current_profile[pos] += 1
+            else:  # normal unitig block
+                continue
+
+        else:  # new block
+            all_counter += 1
+            if current_id != '' and mapping_freq[current_id][2] > cutoff:  # close previous block if outlier
+                fragments = cut_peaks(current_id, current_profile, unitigs_corrected)
+                if len(fragments) > 0:
+                    rescued_outlier += 1
+                    for entry in fragments:
+                        total_rescued_length += (entry[1] - entry[0] + 1)
+                outlier_counter += 1
+
+            if mapping_freq[illu_id][2] > cutoff:  # new outlier block
+                current_profile = [0] * illu_length
+                for pos in range(illuminaStart, illuminaEnd + 1):
+                    current_profile[pos] += 1
+                current_id = illu_id
+
+            else:  # new normal unitig
+                unitigs_corrected.write('>' + illumina_dict[illu_id].description + '\n')
+                write_sequence(unitigs_corrected, illumina_dict[illu_id].seq)
+                current_id = illu_id
+
+    if current_id != '' and mapping_freq[current_id][2] > cutoff:  # close last block if outlier
+        fragments = cut_peaks(current_id, current_profile, unitigs_corrected)
+        if len(fragments) > 0:
+            rescued_outlier += 1
+        outlier_counter += 1
+
+stats_file.write("#all unitigs: " + str(all_counter) + '\n')
+stats_file.write("#outliers: " + str(outlier_counter) + '\n')
+stats_file.write("#rescued outliers: " + str(rescued_outlier) + '\n')
+
+unitigs_corrected.close()
+stats_file.close()
diff --git a/src/main.cpp b/src/main.cpp
index 1037d20..b10a04b 100644
--- a/src/main.cpp
+++ b/src/main.cpp
@@ -284,8 +284,9 @@ auto main(int const argc, char const *argv[]) -> int {
     TRACE("Starting assembly\n");
 
     //// OUTPUT FILES ////
-    OutputWriter outputWriter(app.getOutputFilePath() + "/temp.query.fa", app.getOutputFilePath() + "/temp.align.paf",
-                              app.getOutputFilePath() + "/temp.target.fa");
+    OutputWriter outputWriter(app.getOutputFilePath() + "/temp_1.query.fa",
+                              app.getOutputFilePath() + "/temp_1.align.paf",
+                              app.getOutputFilePath() + "/temp_1.target.fa");
     //// /OUTPUT FILES ////
 
     std::atomic<int> assemblyIdx{-1};
@@ -386,16 +387,16 @@ void chainingAndOverlaps(gsl::not_null<Job const *> const pJob) {
   }
 
   std::for_each(std::begin(minusPaths), std::end(minusPaths), [=](auto &minusPath) {
-    auto const overlap = muchsalsa::computeOverlap(*pMatchMap, std::get<0>(minusPath), *pEdge, false,
-                                                   std::get<1>(minusPath), std::get<2>(minusPath));
+    auto const overlap = muchsalsa::getOverlap(*pMatchMap, std::get<0>(minusPath), *pEdge, false,
+                                               std::get<1>(minusPath), std::get<2>(minusPath));
     if (overlap.has_value()) {
       pEdge->appendOrder(std::move(overlap.value()));
     }
   });
 
   std::for_each(std::begin(plusPaths), std::end(plusPaths), [=](auto &plusPath) {
-    auto const order = muchsalsa::computeOverlap(*pMatchMap, std::get<0>(plusPath), *pEdge, true, std::get<1>(plusPath),
-                                                 std::get<2>(plusPath));
+    auto const order = muchsalsa::getOverlap(*pMatchMap, std::get<0>(plusPath), *pEdge, true, std::get<1>(plusPath),
+                                             std::get<2>(plusPath));
     if (order.has_value()) {
       pEdge->appendOrder(std::move(order.value()));
     }
@@ -618,7 +619,7 @@ void assemblePaths(gsl::not_null<Job const *> const pJob) {
   auto const        pMatchMap     = gsl::make_not_null(std::any_cast<MatchMap *>(pJob->getParam(2)));
 
   if (pMaxNplVertex != nullptr) {
-    auto       diGraph = muchsalsa::getDirectionGraph(pMatchMap, *pGraph, subGraph, *pMaxNplVertex);
+    auto       diGraph = muchsalsa::getDirectedGraph(pMatchMap, *pGraph, subGraph, *pMaxNplVertex);
     auto const paths   = muchsalsa::linearizeGraph(&diGraph);
 
     Id2OverlapMap id2OverlapMap;