From 55c2f99868a76ce18c2198cfa975b4c127424f6d Mon Sep 17 00:00:00 2001 From: Jeremiah Wala Date: Thu, 1 Sep 2016 16:27:50 -0400 Subject: [PATCH] command line update --- README.md | 19 +++++++--- SeqLib/BFC.h | 3 ++ SeqLib/BamRecord.h | 3 +- src/BFC.cpp | 27 ++++++++++++-- src/BamRecord.cpp | 4 +-- src/seqtools.cpp | 89 ++++++++++++++++++++++++++++------------------ 6 files changed, 100 insertions(+), 45 deletions(-) diff --git a/README.md b/README.md index 9ac7117cb..2256676d1 100644 --- a/README.md +++ b/README.md @@ -12,11 +12,13 @@ API Documentation Table of contents ================= - * [Installation](#gh-md-toc) + * [Installation](#installation) * [Integrating into build system](#integrating-into-build-system) * [Description](#description) - * [Examples](#examples) + * [Memory management](#memory-management) + * [Other C++ APIs](#other-c++-apis) * [Command line usage](#command-line-usage) + * [Examples](#examples) * [Attributions](#attributions) Installation @@ -76,7 +78,7 @@ main motivations behind SeqLib is that all access to sequencing reads, BWA, etc completely avoid ``malloc`` and ``free``. In SeqLib all the mallocs/frees are handled automatically in the constructors and destructors. -Note about BamTools, Gamgee and SeqAn +Other C++ APIs ------------------------------ There are overlaps between this project and the [BamTools][BT] project from Derek Barnett, the [Gamgee][gam] project from the Broad Institute, and the [SeqAn][seqan] library from Freie Universitat Berlin. These projects @@ -97,7 +99,16 @@ Command Line Usage ------------------ ```bash ## BFC correction (input mode -m is b (BAM/SAM/CRAM), output mode -w is SAM stream -samtools view $na1 -h 1:1,000,000-1,002,000 | bin/seqtools bfc -m b -w s | samtools sort - -m 4g -o corrected.bam +samtools view $na1 -h 1:1,000,000-1,002,000 | seqtools bfc - -G $REF | samtools sort - -m 4g -o corrected.bam + +## Without a pipe, write to BAM +seqtools bfc in.bam -G $REF -b > corrected.bam + +## Skip realignment, send to fasta +seqtools bfc in.bam -f > corrected.fasta + +## Input as fasta, send to aligned BAM +seqtools bfc -F in.fasta -G $REG -b > corrected.bam ``` Examples diff --git a/SeqLib/BFC.h b/SeqLib/BFC.h index 32eff2ca2..2d4cc3549 100644 --- a/SeqLib/BFC.h +++ b/SeqLib/BFC.h @@ -52,6 +52,9 @@ namespace SeqLib { bool Train(); + /** Add a sequence for either training or correction */ + bool AddSequence(const BamRecord& r); + /** Add a sequence for either training or correction */ bool AddSequence(const char* seq, const char* qual, const char* name); diff --git a/SeqLib/BamRecord.h b/SeqLib/BamRecord.h index 84ccc9b80..9a0c3d1a4 100644 --- a/SeqLib/BamRecord.h +++ b/SeqLib/BamRecord.h @@ -282,7 +282,6 @@ class BamRecord { assert(false); } - /** BamRecord is failed QC */ inline bool QCFailFlag() const { return b ? ((b->core.flag&BAM_FQCFAIL) != 0) : false; } @@ -433,7 +432,7 @@ class BamRecord { void SetQname(const std::string& n); //Set the quality scores - void SetQualities(const std::string& n); + void SetQualities(const std::string& n, int offset); /** Set the sequence name */ void SetSequence(const std::string& seq); diff --git a/src/BFC.cpp b/src/BFC.cpp index db86b5fad..a35b7f163 100644 --- a/src/BFC.cpp +++ b/src/BFC.cpp @@ -50,6 +50,28 @@ namespace SeqLib { return true; } + bool BFC::AddSequence(const BamRecord& r) { + + //char* s = strdup(r.Sequence().c_str()); + const char* q = bam_get_qname(r.raw()); + //uint8_t* l = bam_get_qual(r.raw()); + //char* qual = (char*)malloc(r.Length() + 1); + //if (l) + // for (size_t i = 0; i < r.Length(); ++i) + // qual[i] = l[i] + 33; + //qual[r.Length()] = '\0'; + + bool ret = AddSequence(r.Sequence().c_str(), r.Qualities().c_str(), q); + + //if (s) + // free(s); + //if (qual) + // free(qual); + + return ret; + + } + bool BFC::AddSequence(const char* seq, const char* qual, const char* name) { // do the intial allocation @@ -79,8 +101,9 @@ namespace SeqLib { s->seq = strdup(seq); s->qual = 0; - if (strlen(qual)) - s->qual = strdup(qual); + if (strlen(qual)) { + s->qual = strdup(qual); + } s->l_seq = strlen(seq); n_seqs++; diff --git a/src/BamRecord.cpp b/src/BamRecord.cpp index 369c1ad03..a94e05494 100644 --- a/src/BamRecord.cpp +++ b/src/BamRecord.cpp @@ -295,14 +295,14 @@ namespace SeqLib { b->m_data = b->l_data; } - void BamRecord::SetQualities(const std::string& n) { + void BamRecord::SetQualities(const std::string& n, int offset) { if (n.length() != b->core.l_qseq) throw std::invalid_argument("New quality score should be same as seq length"); char * q = strdup(n.data()); for (size_t i = 0; i < n.length(); ++i) - q[i] -= 33; + q[i] -= offset; memcpy(bam_get_qual(b), q, n.length()); // dont copy /0 terminator free(q); diff --git a/src/seqtools.cpp b/src/seqtools.cpp index ea7d8882a..d9378299c 100644 --- a/src/seqtools.cpp +++ b/src/seqtools.cpp @@ -25,12 +25,12 @@ static const char *BFC_USAGE_MESSAGE = "Contact: Jeremiah Wala [ jwala@broadinstitute.org ]\n" "Usage: seqtools bfc [options]\n\n" "Commands:\n" - //" --input, -i Input FASTA, BAM, CRAM, SAM. If not specified, reads from stdin\n" - //" --imode, -m Input mode. f: FASTA b: BAM/CRAM/SAM : stdin (sam/bam stream)\n" - //" --omode, -w Output stream mode. f: FASTA b: BAM s: SAM : stdin (sam/bam stream)\n" -" --fasta, -f Output stream is a fasta (no realignment)" -" --bam, -b, Output stream is BAM (not SAM)" -" --reference, -G Reference genome if using BWA-MEM realignment\n" +" --verbose, -v Set verbose output\n" +" --fasta, -f Output stream should be a FASTA (no realignment)\n" +" --bam, -b Output stream should be a BAM (not SAM)\n" +" --cram, -C Output stream should be a CRAM (not SAM)\n" +" --infasta, -F Input a FASTA insted of BAM/SAM/CRAM stream\n" +" --reference, -G Reference genome if using BWA-MEM realignment\n" "\nReport bugs to jwala@broadinstitute.org \n\n"; void runbfc(int argc, char** argv); @@ -38,18 +38,21 @@ void parseBfcOptions(int argc, char** argv); namespace opt { + static bool verbose = false; + static char mode = 's'; static std::string input; static std::string reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta"; - static char inputmode = 'f'; - static char outputmode = 's'; + static std::string fasta; // input is a fasta } -static const char* shortopts = "hi:m:w:G:"; +static const char* shortopts = "hbfvCG:F:"; static const struct option longopts[] = { { "help", no_argument, NULL, 'h' }, - { "input", required_argument, NULL, 'i' }, - { "imode", required_argument, NULL, 'm' }, - { "omode", required_argument, NULL, 'w' }, + { "verbose", no_argument, NULL, 'v' }, + { "bam", no_argument, NULL, 'b' }, + { "cram", no_argument, NULL, 'C' }, + { "fasta", no_argument, NULL, 'f' }, + { "infasta", required_argument, NULL, 'F' }, { "reference", required_argument, NULL, 'G' }, { NULL, 0, NULL, 0 } }; @@ -82,38 +85,44 @@ void runbfc(int argc, char** argv) { SeqLib::BFC b; - if (opt::inputmode == 'f') { + // is this a fasta file + + if (!opt::fasta.empty()) { // read in a fasta file - SeqLib::FastqReader f(opt::input); + SeqLib::FastqReader f(opt::fasta); std::string qn, seq; while (f.GetNextSequence(qn, seq)) { std::string e; assert(b.AddSequence(seq.c_str(), e.c_str(), qn.c_str())); } - } else if (opt::inputmode == '-' || opt::inputmode == 'b') { + } else { //if (opt::mode == 'b' || opt::mode == 's' || opt::mode == 'C') { SeqLib::BamReader br; - br.Open(opt::input.empty() ? "-" : opt::input); + br.Open(opt::input == "-" ? "-" : opt::input); SeqLib::BamRecord rec; while(br.GetNextRecord(rec)) { - b.AddSequence(rec.Sequence().c_str(), rec.Qualities().c_str(), rec.Qname().c_str()); + b.AddSequence(rec); //rec.Sequence().c_str(), rec.Qualities().c_str(), rec.Qname().c_str()); } - } else { - std::cerr << "Input mode: " << opt::inputmode << " not recognized " << std::endl; + } + + if (!b.Train()) { + std::cerr << "Training failed on " << b.NumSequences() << std::endl; + exit(EXIT_FAILURE); + } + if (!b.ErrorCorrect()) { + std::cerr << "Correction failed on " << b.NumSequences() << std::endl; exit(EXIT_FAILURE); } - - assert(b.Train()); - assert(b.ErrorCorrect()); SeqLib::UnalignedSequenceVector u; b.GetSequences(u); - std::cerr << "nseqs: " << u.size() - << " kcov: " << b.GetKCov() - << " kmer " << b.GetKMer() << std::endl; - + if (opt::verbose) + std::cerr << "nseqs: " << u.size() + << " kcov: " << b.GetKCov() + << " kmer: " << b.GetKMer() << std::endl; + - if (opt::outputmode == 'f') { + if (opt::mode == 'f') { for (SeqLib::UnalignedSequenceVector::const_iterator i = u.begin(); i != u.end(); ++i) { std::cout << ">" << i->Name << std::endl << i->Seq << std::endl; @@ -122,12 +131,16 @@ void runbfc(int argc, char** argv) { } SeqLib::BamWriter bw; - if (opt::outputmode == 'b') + if (opt::mode == 'b') bw = SeqLib::BamWriter(SeqLib::BAM); - else if (opt::outputmode == 's') + else if (opt::mode == 's') bw = SeqLib::BamWriter(SeqLib::SAM); + else if (opt::mode == 'C') { + bw = SeqLib::BamWriter(SeqLib::CRAM); + bw.SetCramReference(opt::reference); + } else { - std::cerr << "Unrecognized output stream mode " << opt::outputmode << std::endl; + std::cerr << "Unrecognized output stream mode " << opt::mode << std::endl; exit(EXIT_FAILURE); } @@ -151,7 +164,8 @@ void runbfc(int argc, char** argv) { bwa.AlignSequence(i->Seq, i->Name, brv, false, frac, 10); for (SeqLib::BamRecordVector::iterator r = brv.begin(); r != brv.end(); ++r) { - r->SetQualities(i->Qual); + if (!i->Qual.empty()) + r->SetQualities(i->Qual, 33); bw.WriteRecord(*r); } } @@ -164,18 +178,23 @@ void parseBfcOptions(int argc, char** argv) { bool die = false; bool help = false; + // get the first argument as input + if (argc > 1) + opt::input = std::string(argv[1]); + for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) { std::istringstream arg(optarg != NULL ? optarg : ""); switch (c) { - case 'i': arg >> opt::input; break; - case 'm': arg >> opt::inputmode; break; - case 'w': arg >> opt::outputmode; break; + case 'f': opt::mode = 'f'; break; + case 'F': arg >> opt::fasta; break; + case 'b': opt::mode = 'b'; break; + case 'C': opt::mode = 'C'; break; case 'G': arg >> opt::reference; break; default: die= true; } } - if (die || help) { + if (die || help || (opt::input.empty() && opt::fasta.empty())) { std::cerr << "\n" << BFC_USAGE_MESSAGE; if (die) exit(EXIT_FAILURE);