Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Indexlr mx filter #79

Open
wants to merge 12 commits into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
88 changes: 69 additions & 19 deletions include/btllib/indexlr.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -52,12 +52,18 @@ class Indexlr
static const unsigned LONG_MODE = 64;
/** Include read sequence Phred score along with minimizer information. */
static const unsigned QUAL = 128;
/** Exclude kmers with quality score below the threshold. */
static const unsigned Q_DROP = 256;
/** Consider only 10% of the base quality scores for averaging. */
static const unsigned PART_AVG = 512;
};

bool output_id() const { return bool(~flags & Flag::NO_ID); }
bool output_bx() const { return bool(flags & Flag::BX); }
bool output_seq() const { return bool(flags & Flag::SEQ); }
bool output_qual() const { return bool(flags & Flag::QUAL); }
bool q_drop() const { return bool(flags & Flag::Q_DROP); }
bool part_avg() const { return bool(flags & Flag::PART_AVG); }
bool filter_in() const { return bool(flags & Flag::FILTER_IN); }
bool filter_out() const { return bool(flags & Flag::FILTER_OUT); }
bool short_mode() const { return bool(flags & Flag::SHORT_MODE); }
Expand Down Expand Up @@ -100,6 +106,7 @@ class Indexlr
bool forward = false;
std::string seq;
std::string qual;
bool valid = true;
};

using HashedKmer = Minimizer;
Expand Down Expand Up @@ -228,12 +235,16 @@ class Indexlr
bool filter_in,
bool filter_out,
const BloomFilter& filter_in_bf,
const BloomFilter& filter_out_bf);
const BloomFilter& filter_out_bf,
bool drop);

static void filter_kmer_qual(Indexlr::HashedKmer& hk,
const std::string& kmer_qual,
size_t q);
static size_t calc_kmer_quality(const std::string& qual);
size_t q,
bool drop,
bool partial);
static size_t calc_kmer_quality(const std::string& qual,
bool partial = false);

static void calc_minimizer(
const std::vector<Indexlr::HashedKmer>& hashed_kmers_buffer,
Expand Down Expand Up @@ -466,51 +477,82 @@ Indexlr::filter_hashed_kmer(Indexlr::HashedKmer& hk,
bool filter_in,
bool filter_out,
const BloomFilter& filter_in_bf,
const BloomFilter& filter_out_bf)
const BloomFilter& filter_out_bf,
bool drop)
{
if (filter_in && filter_out) {
std::vector<uint64_t> tmp;
tmp = { hk.min_hash };
if (!filter_in_bf.contains(tmp) || filter_out_bf.contains(tmp)) {
hk.min_hash = std::numeric_limits<uint64_t>::max();
if (drop) {
hk.valid = false;
} else {
hk.min_hash = std::numeric_limits<uint64_t>::max();
}
}
} else if (filter_in) {
if (!filter_in_bf.contains({ hk.min_hash })) {
hk.min_hash = std::numeric_limits<uint64_t>::max();
if (drop) {
hk.valid = false;
} else {
hk.min_hash = std::numeric_limits<uint64_t>::max();
}
}
} else if (filter_out) {
if (filter_out_bf.contains({ hk.min_hash })) {
hk.min_hash = std::numeric_limits<uint64_t>::max();
if (drop) {
hk.valid = false;
} else {
hk.min_hash = std::numeric_limits<uint64_t>::max();
}
}
}
}

inline void
Indexlr::filter_kmer_qual(Indexlr::HashedKmer& hk,
const std::string& kmer_qual,
size_t q)
size_t q,
bool drop,
bool partial)
{
if (calc_kmer_quality(kmer_qual) < q) {
hk.min_hash = std::numeric_limits<uint64_t>::max();
if (calc_kmer_quality(kmer_qual, partial) < q) {
if (drop) {
hk.valid = false;
} else {
hk.min_hash = std::numeric_limits<uint64_t>::max();
}
}
}

inline size_t
Indexlr::calc_kmer_quality(const std::string& qual)
Indexlr::calc_kmer_quality(const std::string& qual, bool partial)
{
// convert the quality scores to integers
std::vector<int> qual_ints;
const int thirty_three = 33;
const size_t ten = 10;
qual_ints.reserve(qual.size());
for (auto c : qual) {
qual_ints.push_back(c - thirty_three);
}
// calculate the mean (potential improvement: use other statistics)
// sort the quality scores
std::sort(qual_ints.begin(), qual_ints.end());

// calculate the mean quality score
size_t sum = 0;
for (auto q : qual_ints) {
sum += q;
size_t n = (partial ? qual_ints.size() / ten : qual_ints.size());
if (n == 0) {
if (!qual_ints.empty()) {
n = 1;
} else {
return 0;
}
}
for (size_t i = 0; i < n; ++i) {
sum += qual_ints[i];
}
return (sum / qual_ints.size());
return sum / n;
}

inline void
Expand Down Expand Up @@ -546,7 +588,10 @@ Indexlr::calc_minimizer(
if (ssize_t(min_current->pos) > min_pos_prev &&
min_current->min_hash != std::numeric_limits<uint64_t>::max()) {
min_pos_prev = ssize_t(min_current->pos);
minimizers.push_back(*min_current);
if (min_current->valid) {
// if the kmer is valid (not suppressed by filters)
minimizers.push_back(*min_current);
}
}
}

Expand All @@ -572,11 +617,16 @@ Indexlr::minimize(const std::string& seq, const std::string& qual) const
output_seq() ? seq.substr(nh.get_pos(), k) : "",
output_qual() ? qual.substr(nh.get_pos(), k) : "");

filter_hashed_kmer(
hk, filter_in(), filter_out(), filter_in_bf.get(), filter_out_bf.get());
filter_hashed_kmer(hk,
filter_in(),
filter_out(),
filter_in_bf.get(),
filter_out_bf.get(),
q_drop());

if (q > 0) {
filter_kmer_qual(hk, qual.substr(nh.get_pos(), k), q);
filter_kmer_qual(
hk, qual.substr(nh.get_pos(), k), q, q_drop(), part_avg());
}

if (idx + 1 >= w) {
Expand Down
16 changes: 14 additions & 2 deletions recipes/indexlr.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -43,12 +43,16 @@ print_usage()
std::cerr
<< "Usage: " << PROGNAME
<< " -k K -w W [-q Q] [-r repeat_bf_path] [-s solid_bf_path] [--id] "
"[--bx] [--pos] [--seq] [--qual]"
"[--bx] [--pos] [--seq] [--qual] [--q-drop] [--part-avg]"
"[-o FILE] FILE...\n\n"
" -k K Use K as k-mer size.\n"
" -w W Use W as sliding-window size.\n"
" -q Q Filter kmers with average quality (Phred score) lower "
"than Q [0]. \n"
" --q-drop Drop filtered kmers instead of disqualifying them"
"from generating minimizers. \n"
" --part-avg Consider only 1/10 of the (lowest value) base quality "
"scores for averaging. \n"
" --id Include input sequence ids in the output. "
"(Default if --bx is not provided)\n"
" --bx Include input sequence barcodes in the output.\n"
Expand Down Expand Up @@ -100,7 +104,7 @@ main(int argc, char* argv[])
bool k_set = false;
bool q_set = false;
int with_id = 0, with_bx = 0, with_len = 0, with_pos = 0, with_strand = 0,
with_seq = 0, with_qual = 0;
with_seq = 0, with_qual = 0, with_q_drop = 0, with_part_avg = 0;
std::unique_ptr<btllib::KmerBloomFilter> repeat_bf, solid_bf;
bool with_repeat = false, with_solid = false;
int long_mode = 0;
Expand All @@ -113,6 +117,8 @@ main(int argc, char* argv[])
{ "pos", no_argument, &with_pos, 1 },
{ "strand", no_argument, &with_strand, 1 },
{ "seq", no_argument, &with_seq, 1 },
{ "q-drop", no_argument, &with_q_drop, 1 },
{ "part-avg", no_argument, &with_part_avg, 1 },
{ "qual", no_argument, &with_qual, 1 },
{ "long", no_argument, &long_mode, 1 },
{ "help", no_argument, &help, 1 },
Expand Down Expand Up @@ -231,6 +237,12 @@ main(int argc, char* argv[])
if (bool(with_qual)) {
flags |= btllib::Indexlr::Flag::QUAL;
}
if (bool(with_q_drop)) {
flags |= btllib::Indexlr::Flag::Q_DROP;
}
if (bool(with_part_avg)) {
flags |= btllib::Indexlr::Flag::PART_AVG;
}
if (bool(long_mode)) {
flags |= btllib::Indexlr::Flag::LONG_MODE;
} else {
Expand Down