Skip to content

Commit

Permalink
command line update
Browse files Browse the repository at this point in the history
  • Loading branch information
Jeremiah Wala committed Sep 1, 2016
1 parent 6c1843e commit 55c2f99
Show file tree
Hide file tree
Showing 6 changed files with 100 additions and 45 deletions.
19 changes: 15 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -12,11 +12,13 @@ API Documentation
Table of contents
=================

* [Installation](#gh-md-toc)
* [Installation](#installation)
* [Integrating into build system](#integrating-into-build-system)
* [Description](#description)
* [Examples](#examples)
* [Memory management](#memory-management)
* [Other C++ APIs](#other-c++-apis)
* [Command line usage](#command-line-usage)
* [Examples](#examples)
* [Attributions](#attributions)

Installation
Expand Down Expand Up @@ -76,7 +78,7 @@ main motivations behind SeqLib is that all access to sequencing reads, BWA, etc
completely avoid ``malloc`` and ``free``. In SeqLib all the mallocs/frees are handled automatically in the constructors and
destructors.

Note about BamTools, Gamgee and SeqAn
Other C++ APIs
------------------------------
There are overlaps between this project and the [BamTools][BT] project from Derek Barnett, the [Gamgee][gam]
project from the Broad Institute, and the [SeqAn][seqan] library from Freie Universitat Berlin. These projects
Expand All @@ -97,7 +99,16 @@ Command Line Usage
------------------
```bash
## BFC correction (input mode -m is b (BAM/SAM/CRAM), output mode -w is SAM stream
samtools view $na1 -h 1:1,000,000-1,002,000 | bin/seqtools bfc -m b -w s | samtools sort - -m 4g -o corrected.bam
samtools view $na1 -h 1:1,000,000-1,002,000 | seqtools bfc - -G $REF | samtools sort - -m 4g -o corrected.bam

## Without a pipe, write to BAM
seqtools bfc in.bam -G $REF -b > corrected.bam

## Skip realignment, send to fasta
seqtools bfc in.bam -f > corrected.fasta

## Input as fasta, send to aligned BAM
seqtools bfc -F in.fasta -G $REG -b > corrected.bam
```

Examples
Expand Down
3 changes: 3 additions & 0 deletions SeqLib/BFC.h
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,9 @@ namespace SeqLib {

bool Train();

/** Add a sequence for either training or correction */
bool AddSequence(const BamRecord& r);

/** Add a sequence for either training or correction */
bool AddSequence(const char* seq, const char* qual, const char* name);

Expand Down
3 changes: 1 addition & 2 deletions SeqLib/BamRecord.h
Original file line number Diff line number Diff line change
Expand Up @@ -282,7 +282,6 @@ class BamRecord {
assert(false);
}


/** BamRecord is failed QC */
inline bool QCFailFlag() const { return b ? ((b->core.flag&BAM_FQCFAIL) != 0) : false; }

Expand Down Expand Up @@ -433,7 +432,7 @@ class BamRecord {
void SetQname(const std::string& n);

//Set the quality scores
void SetQualities(const std::string& n);
void SetQualities(const std::string& n, int offset);

/** Set the sequence name */
void SetSequence(const std::string& seq);
Expand Down
27 changes: 25 additions & 2 deletions src/BFC.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,6 +50,28 @@ namespace SeqLib {
return true;
}

bool BFC::AddSequence(const BamRecord& r) {

//char* s = strdup(r.Sequence().c_str());
const char* q = bam_get_qname(r.raw());
//uint8_t* l = bam_get_qual(r.raw());
//char* qual = (char*)malloc(r.Length() + 1);
//if (l)
// for (size_t i = 0; i < r.Length(); ++i)
// qual[i] = l[i] + 33;
//qual[r.Length()] = '\0';

bool ret = AddSequence(r.Sequence().c_str(), r.Qualities().c_str(), q);

//if (s)
// free(s);
//if (qual)
// free(qual);

return ret;

}

bool BFC::AddSequence(const char* seq, const char* qual, const char* name) {

// do the intial allocation
Expand Down Expand Up @@ -79,8 +101,9 @@ namespace SeqLib {

s->seq = strdup(seq);
s->qual = 0;
if (strlen(qual))
s->qual = strdup(qual);
if (strlen(qual)) {
s->qual = strdup(qual);
}

s->l_seq = strlen(seq);
n_seqs++;
Expand Down
4 changes: 2 additions & 2 deletions src/BamRecord.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -295,14 +295,14 @@ namespace SeqLib {
b->m_data = b->l_data;
}

void BamRecord::SetQualities(const std::string& n) {
void BamRecord::SetQualities(const std::string& n, int offset) {

if (n.length() != b->core.l_qseq)
throw std::invalid_argument("New quality score should be same as seq length");

char * q = strdup(n.data());
for (size_t i = 0; i < n.length(); ++i)
q[i] -= 33;
q[i] -= offset;
memcpy(bam_get_qual(b), q, n.length()); // dont copy /0 terminator
free(q);

Expand Down
89 changes: 54 additions & 35 deletions src/seqtools.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -25,31 +25,34 @@ static const char *BFC_USAGE_MESSAGE =
"Contact: Jeremiah Wala [ [email protected] ]\n"
"Usage: seqtools bfc [options]\n\n"
"Commands:\n"
//" --input, -i Input FASTA, BAM, CRAM, SAM. If not specified, reads from stdin\n"
//" --imode, -m Input mode. f: FASTA b: BAM/CRAM/SAM <none>: stdin (sam/bam stream)\n"
//" --omode, -w Output stream mode. f: FASTA b: BAM s: SAM <none>: stdin (sam/bam stream)\n"
" --fasta, -f Output stream is a fasta (no realignment)"
" --bam, -b, Output stream is BAM (not SAM)"
" --reference, -G Reference genome if using BWA-MEM realignment\n"
" --verbose, -v Set verbose output\n"
" --fasta, -f Output stream should be a FASTA (no realignment)\n"
" --bam, -b Output stream should be a BAM (not SAM)\n"
" --cram, -C Output stream should be a CRAM (not SAM)\n"
" --infasta, -F <file> Input a FASTA insted of BAM/SAM/CRAM stream\n"
" --reference, -G <file> Reference genome if using BWA-MEM realignment\n"
"\nReport bugs to [email protected] \n\n";

void runbfc(int argc, char** argv);
void parseBfcOptions(int argc, char** argv);

namespace opt {

static bool verbose = false;
static char mode = 's';
static std::string input;
static std::string reference = "/seq/references/Homo_sapiens_assembly19/v1/Homo_sapiens_assembly19.fasta";
static char inputmode = 'f';
static char outputmode = 's';
static std::string fasta; // input is a fasta
}

static const char* shortopts = "hi:m:w:G:";
static const char* shortopts = "hbfvCG:F:";
static const struct option longopts[] = {
{ "help", no_argument, NULL, 'h' },
{ "input", required_argument, NULL, 'i' },
{ "imode", required_argument, NULL, 'm' },
{ "omode", required_argument, NULL, 'w' },
{ "verbose", no_argument, NULL, 'v' },
{ "bam", no_argument, NULL, 'b' },
{ "cram", no_argument, NULL, 'C' },
{ "fasta", no_argument, NULL, 'f' },
{ "infasta", required_argument, NULL, 'F' },
{ "reference", required_argument, NULL, 'G' },
{ NULL, 0, NULL, 0 }
};
Expand Down Expand Up @@ -82,38 +85,44 @@ void runbfc(int argc, char** argv) {

SeqLib::BFC b;

if (opt::inputmode == 'f') {
// is this a fasta file

if (!opt::fasta.empty()) {
// read in a fasta file
SeqLib::FastqReader f(opt::input);
SeqLib::FastqReader f(opt::fasta);

std::string qn, seq;
while (f.GetNextSequence(qn, seq)) {
std::string e;
assert(b.AddSequence(seq.c_str(), e.c_str(), qn.c_str()));
}
} else if (opt::inputmode == '-' || opt::inputmode == 'b') {
} else { //if (opt::mode == 'b' || opt::mode == 's' || opt::mode == 'C') {
SeqLib::BamReader br;
br.Open(opt::input.empty() ? "-" : opt::input);
br.Open(opt::input == "-" ? "-" : opt::input);
SeqLib::BamRecord rec;
while(br.GetNextRecord(rec)) {
b.AddSequence(rec.Sequence().c_str(), rec.Qualities().c_str(), rec.Qname().c_str());
b.AddSequence(rec); //rec.Sequence().c_str(), rec.Qualities().c_str(), rec.Qname().c_str());
}
} else {
std::cerr << "Input mode: " << opt::inputmode << " not recognized " << std::endl;
}

if (!b.Train()) {
std::cerr << "Training failed on " << b.NumSequences() << std::endl;
exit(EXIT_FAILURE);
}
if (!b.ErrorCorrect()) {
std::cerr << "Correction failed on " << b.NumSequences() << std::endl;
exit(EXIT_FAILURE);
}

assert(b.Train());
assert(b.ErrorCorrect());

SeqLib::UnalignedSequenceVector u;
b.GetSequences(u);
std::cerr << "nseqs: " << u.size()
<< " kcov: " << b.GetKCov()
<< " kmer " << b.GetKMer() << std::endl;

if (opt::verbose)
std::cerr << "nseqs: " << u.size()
<< " kcov: " << b.GetKCov()
<< " kmer: " << b.GetKMer() << std::endl;


if (opt::outputmode == 'f') {
if (opt::mode == 'f') {
for (SeqLib::UnalignedSequenceVector::const_iterator i = u.begin();
i != u.end(); ++i) {
std::cout << ">" << i->Name << std::endl << i->Seq << std::endl;
Expand All @@ -122,12 +131,16 @@ void runbfc(int argc, char** argv) {
}

SeqLib::BamWriter bw;
if (opt::outputmode == 'b')
if (opt::mode == 'b')
bw = SeqLib::BamWriter(SeqLib::BAM);
else if (opt::outputmode == 's')
else if (opt::mode == 's')
bw = SeqLib::BamWriter(SeqLib::SAM);
else if (opt::mode == 'C') {
bw = SeqLib::BamWriter(SeqLib::CRAM);
bw.SetCramReference(opt::reference);
}
else {
std::cerr << "Unrecognized output stream mode " << opt::outputmode << std::endl;
std::cerr << "Unrecognized output stream mode " << opt::mode << std::endl;
exit(EXIT_FAILURE);
}

Expand All @@ -151,7 +164,8 @@ void runbfc(int argc, char** argv) {
bwa.AlignSequence(i->Seq, i->Name, brv, false, frac, 10);
for (SeqLib::BamRecordVector::iterator r = brv.begin();
r != brv.end(); ++r) {
r->SetQualities(i->Qual);
if (!i->Qual.empty())
r->SetQualities(i->Qual, 33);
bw.WriteRecord(*r);
}
}
Expand All @@ -164,18 +178,23 @@ void parseBfcOptions(int argc, char** argv) {
bool die = false;
bool help = false;

// get the first argument as input
if (argc > 1)
opt::input = std::string(argv[1]);

for (char c; (c = getopt_long(argc, argv, shortopts, longopts, NULL)) != -1;) {
std::istringstream arg(optarg != NULL ? optarg : "");
switch (c) {
case 'i': arg >> opt::input; break;
case 'm': arg >> opt::inputmode; break;
case 'w': arg >> opt::outputmode; break;
case 'f': opt::mode = 'f'; break;
case 'F': arg >> opt::fasta; break;
case 'b': opt::mode = 'b'; break;
case 'C': opt::mode = 'C'; break;
case 'G': arg >> opt::reference; break;
default: die= true;
}
}

if (die || help) {
if (die || help || (opt::input.empty() && opt::fasta.empty())) {
std::cerr << "\n" << BFC_USAGE_MESSAGE;
if (die)
exit(EXIT_FAILURE);
Expand Down

0 comments on commit 55c2f99

Please sign in to comment.