diff --git a/include/btllib/indexlr.hpp b/include/btllib/indexlr.hpp index 1d64391c..40a77f2f 100644 --- a/include/btllib/indexlr.hpp +++ b/include/btllib/indexlr.hpp @@ -52,12 +52,18 @@ class Indexlr static const unsigned LONG_MODE = 64; /** Include read sequence Phred score along with minimizer information. */ static const unsigned QUAL = 128; + /** Exclude kmers with quality score below the threshold. */ + static const unsigned Q_DROP = 256; + /** Consider only 10% of the base quality scores for averaging. */ + static const unsigned PART_AVG = 512; }; bool output_id() const { return bool(~flags & Flag::NO_ID); } bool output_bx() const { return bool(flags & Flag::BX); } bool output_seq() const { return bool(flags & Flag::SEQ); } bool output_qual() const { return bool(flags & Flag::QUAL); } + bool q_drop() const { return bool(flags & Flag::Q_DROP); } + bool part_avg() const { return bool(flags & Flag::PART_AVG); } bool filter_in() const { return bool(flags & Flag::FILTER_IN); } bool filter_out() const { return bool(flags & Flag::FILTER_OUT); } bool short_mode() const { return bool(flags & Flag::SHORT_MODE); } @@ -100,6 +106,7 @@ class Indexlr bool forward = false; std::string seq; std::string qual; + bool valid = true; }; using HashedKmer = Minimizer; @@ -228,12 +235,16 @@ class Indexlr bool filter_in, bool filter_out, const BloomFilter& filter_in_bf, - const BloomFilter& filter_out_bf); + const BloomFilter& filter_out_bf, + bool drop); static void filter_kmer_qual(Indexlr::HashedKmer& hk, const std::string& kmer_qual, - size_t q); - static size_t calc_kmer_quality(const std::string& qual); + size_t q, + bool drop, + bool partial); + static size_t calc_kmer_quality(const std::string& qual, + bool partial = false); static void calc_minimizer( const std::vector& hashed_kmers_buffer, @@ -466,21 +477,34 @@ Indexlr::filter_hashed_kmer(Indexlr::HashedKmer& hk, bool filter_in, bool filter_out, const BloomFilter& filter_in_bf, - const BloomFilter& filter_out_bf) + const BloomFilter& filter_out_bf, + bool drop) { if (filter_in && filter_out) { std::vector tmp; tmp = { hk.min_hash }; if (!filter_in_bf.contains(tmp) || filter_out_bf.contains(tmp)) { - hk.min_hash = std::numeric_limits::max(); + if (drop) { + hk.valid = false; + } else { + hk.min_hash = std::numeric_limits::max(); + } } } else if (filter_in) { if (!filter_in_bf.contains({ hk.min_hash })) { - hk.min_hash = std::numeric_limits::max(); + if (drop) { + hk.valid = false; + } else { + hk.min_hash = std::numeric_limits::max(); + } } } else if (filter_out) { if (filter_out_bf.contains({ hk.min_hash })) { - hk.min_hash = std::numeric_limits::max(); + if (drop) { + hk.valid = false; + } else { + hk.min_hash = std::numeric_limits::max(); + } } } } @@ -488,29 +512,47 @@ Indexlr::filter_hashed_kmer(Indexlr::HashedKmer& hk, inline void Indexlr::filter_kmer_qual(Indexlr::HashedKmer& hk, const std::string& kmer_qual, - size_t q) + size_t q, + bool drop, + bool partial) { - if (calc_kmer_quality(kmer_qual) < q) { - hk.min_hash = std::numeric_limits::max(); + if (calc_kmer_quality(kmer_qual, partial) < q) { + if (drop) { + hk.valid = false; + } else { + hk.min_hash = std::numeric_limits::max(); + } } } inline size_t -Indexlr::calc_kmer_quality(const std::string& qual) +Indexlr::calc_kmer_quality(const std::string& qual, bool partial) { // convert the quality scores to integers std::vector qual_ints; const int thirty_three = 33; + const size_t ten = 10; qual_ints.reserve(qual.size()); for (auto c : qual) { qual_ints.push_back(c - thirty_three); } - // calculate the mean (potential improvement: use other statistics) + // sort the quality scores + std::sort(qual_ints.begin(), qual_ints.end()); + + // calculate the mean quality score size_t sum = 0; - for (auto q : qual_ints) { - sum += q; + size_t n = (partial ? qual_ints.size() / ten : qual_ints.size()); + if (n == 0) { + if (!qual_ints.empty()) { + n = 1; + } else { + return 0; + } + } + for (size_t i = 0; i < n; ++i) { + sum += qual_ints[i]; } - return (sum / qual_ints.size()); + return sum / n; } inline void @@ -546,7 +588,10 @@ Indexlr::calc_minimizer( if (ssize_t(min_current->pos) > min_pos_prev && min_current->min_hash != std::numeric_limits::max()) { min_pos_prev = ssize_t(min_current->pos); - minimizers.push_back(*min_current); + if (min_current->valid) { + // if the kmer is valid (not suppressed by filters) + minimizers.push_back(*min_current); + } } } @@ -572,11 +617,16 @@ Indexlr::minimize(const std::string& seq, const std::string& qual) const output_seq() ? seq.substr(nh.get_pos(), k) : "", output_qual() ? qual.substr(nh.get_pos(), k) : ""); - filter_hashed_kmer( - hk, filter_in(), filter_out(), filter_in_bf.get(), filter_out_bf.get()); + filter_hashed_kmer(hk, + filter_in(), + filter_out(), + filter_in_bf.get(), + filter_out_bf.get(), + q_drop()); if (q > 0) { - filter_kmer_qual(hk, qual.substr(nh.get_pos(), k), q); + filter_kmer_qual( + hk, qual.substr(nh.get_pos(), k), q, q_drop(), part_avg()); } if (idx + 1 >= w) { diff --git a/recipes/indexlr.cpp b/recipes/indexlr.cpp index f381952f..8fac8c49 100644 --- a/recipes/indexlr.cpp +++ b/recipes/indexlr.cpp @@ -43,12 +43,16 @@ print_usage() std::cerr << "Usage: " << PROGNAME << " -k K -w W [-q Q] [-r repeat_bf_path] [-s solid_bf_path] [--id] " - "[--bx] [--pos] [--seq] [--qual]" + "[--bx] [--pos] [--seq] [--qual] [--q-drop] [--part-avg]" "[-o FILE] FILE...\n\n" " -k K Use K as k-mer size.\n" " -w W Use W as sliding-window size.\n" " -q Q Filter kmers with average quality (Phred score) lower " "than Q [0]. \n" + " --q-drop Drop filtered kmers instead of disqualifying them" + "from generating minimizers. \n" + " --part-avg Consider only 1/10 of the (lowest value) base quality " + "scores for averaging. \n" " --id Include input sequence ids in the output. " "(Default if --bx is not provided)\n" " --bx Include input sequence barcodes in the output.\n" @@ -100,7 +104,7 @@ main(int argc, char* argv[]) bool k_set = false; bool q_set = false; int with_id = 0, with_bx = 0, with_len = 0, with_pos = 0, with_strand = 0, - with_seq = 0, with_qual = 0; + with_seq = 0, with_qual = 0, with_q_drop = 0, with_part_avg = 0; std::unique_ptr repeat_bf, solid_bf; bool with_repeat = false, with_solid = false; int long_mode = 0; @@ -113,6 +117,8 @@ main(int argc, char* argv[]) { "pos", no_argument, &with_pos, 1 }, { "strand", no_argument, &with_strand, 1 }, { "seq", no_argument, &with_seq, 1 }, + { "q-drop", no_argument, &with_q_drop, 1 }, + { "part-avg", no_argument, &with_part_avg, 1 }, { "qual", no_argument, &with_qual, 1 }, { "long", no_argument, &long_mode, 1 }, { "help", no_argument, &help, 1 }, @@ -231,6 +237,12 @@ main(int argc, char* argv[]) if (bool(with_qual)) { flags |= btllib::Indexlr::Flag::QUAL; } + if (bool(with_q_drop)) { + flags |= btllib::Indexlr::Flag::Q_DROP; + } + if (bool(with_part_avg)) { + flags |= btllib::Indexlr::Flag::PART_AVG; + } if (bool(long_mode)) { flags |= btllib::Indexlr::Flag::LONG_MODE; } else {