Skip to content

Commit

Permalink
feat: allow TrimFastq to specify a length per input FASTQ
Browse files Browse the repository at this point in the history
See: #927
  • Loading branch information
nh13 committed Aug 16, 2023
1 parent 5aded50 commit afa97e6
Show file tree
Hide file tree
Showing 2 changed files with 32 additions and 13 deletions.
11 changes: 7 additions & 4 deletions src/main/scala/com/fulcrumgenomics/fastq/TrimFastq.scala
Original file line number Diff line number Diff line change
Expand Up @@ -46,26 +46,29 @@ import com.fulcrumgenomics.sopt._
class TrimFastq
( @arg(flag='i', doc="One or more input fastq files.") val input: Seq[PathToFastq],
@arg(flag='o', doc="A matching number of output fastq files.") val output: Seq[PathToFastq],
@arg(flag='l', doc="Length to trim reads to.") val length: Int,
@arg(flag='l', doc="Length to trim reads to (either one per input fastq file, or one for all).") val length: Seq[Int],
@arg(flag='x', doc="Exclude reads below the trim length.") val exclude: Boolean = false
) extends FgBioTool with LazyLogging {

validate(input.size == output.size, "Number of input and output files must match.")
validate(length.size == 1 || input.size == length.size, "Number of lengths must be one or match the number of input files.")

override def execute(): Unit = {
var discarded: Long = 0
val progress = new ProgressLogger(this.logger, noun="records", verb="Wrote")

val lengths = if (this.length.size == 1) List.fill(this.input.size)(this.length.head) else this.length

val sources = input.map(FastqSource(_))
val writers = output.map(FastqWriter(_))
while (allHaveNext(sources)) {
val recs = sources.map(_.next())
if (exclude && recs.exists(_.length < length)) {
if (exclude && recs.zip(lengths).exists { case (rec, length) => rec.length < length }) {
discarded += 1
}
else {
writers.iterator.zip(recs.iterator).foreach { case(w, r) =>
w.write(r.trimmedTo(length))
writers.lazyZip(recs).lazyZip(lengths).foreach { case (w: FastqWriter, r: FastqRecord, l: Int) =>
w.write(r.trimmedTo(l))
progress.record()
}
}
Expand Down
34 changes: 25 additions & 9 deletions src/test/scala/com/fulcrumgenomics/fastq/TrimFastqTest.scala
Original file line number Diff line number Diff line change
Expand Up @@ -59,9 +59,9 @@ class TrimFastqTest extends UnitSpec {
}

"TrimFastq" should "trim a single file and not discard any records" in {
val (r1, r2) = fqFiles
val (r1, _) = fqFiles
val out = makeTempFile("trimmed.", ".fq")
new TrimFastq(input=Seq(r1), output=Seq(out), length=15, exclude=false).execute()
new TrimFastq(input=Seq(r1), output=Seq(out), length=Seq(15), exclude=false).execute()
val r1Map = FastqSource(out).map(r => r.name -> r).toMap
r1Map.size shouldBe 3
r1Map("10x10").length shouldBe 10
Expand All @@ -70,18 +70,18 @@ class TrimFastqTest extends UnitSpec {
}

it should "trim a single file and discard 2 records" in {
val (r1, r2) = fqFiles
val (r1, _) = fqFiles
val out = makeTempFile("trimmed.", ".fq")
new TrimFastq(input=Seq(r1), output=Seq(out), length=15, exclude=true).execute()
new TrimFastq(input=Seq(r1), output=Seq(out), length=Seq(15), exclude=true).execute()
val r1Map = FastqSource(out).map(r => r.name -> r).toMap
r1Map.size shouldBe 1
r1Map("20x20").length shouldBe 15
}

it should "trim a single file and discard 0 records because they are all long enough" in {
val (r1, r2) = fqFiles
val (r1, _) = fqFiles
val out = makeTempFile("trimmed.", ".fq")
new TrimFastq(input=Seq(r1), output=Seq(out), length=5, exclude=true).execute()
new TrimFastq(input=Seq(r1), output=Seq(out), length=Seq(5), exclude=true).execute()
val r1Map = FastqSource(out).map(r => r.name -> r).toMap
r1Map.size shouldBe 3
r1Map("10x10").length shouldBe 5
Expand All @@ -92,7 +92,7 @@ class TrimFastqTest extends UnitSpec {
it should "not trim or discard any reads" in {
val (r1, r2) = fqFiles
val (r1Out, r2Out) = (makeTempFile("r1out.", ".fq"), makeTempFile("r2out.", ".fq"))
new TrimFastq(input=Seq(r1, r2), output=Seq(r1Out, r2Out), length=25, exclude=false).execute()
new TrimFastq(input=Seq(r1, r2), output=Seq(r1Out, r2Out), length=Seq(25), exclude=false).execute()
val r1Map = FastqSource(r1Out).map(r => r.name -> r).toMap
val r2Map = FastqSource(r2Out).map(r => r.name -> r).toMap
r1Map.size shouldBe 3
Expand All @@ -108,7 +108,7 @@ class TrimFastqTest extends UnitSpec {
it should "trim but not discard some reads" in {
val (r1, r2) = fqFiles
val (r1Out, r2Out) = (makeTempFile("r1out.", ".fq"), makeTempFile("r2out.", ".fq"))
new TrimFastq(input=Seq(r1, r2), output=Seq(r1Out, r2Out), length=15, exclude=false).execute()
new TrimFastq(input=Seq(r1, r2), output=Seq(r1Out, r2Out), length=Seq(15), exclude=false).execute()
val r1Map = FastqSource(r1Out).map(r => r.name -> r).toMap
val r2Map = FastqSource(r2Out).map(r => r.name -> r).toMap
r1Map.size shouldBe 3
Expand All @@ -124,12 +124,28 @@ class TrimFastqTest extends UnitSpec {
it should "trim some reads and discard others by pair in" in {
val (r1, r2) = fqFiles
val (r1Out, r2Out) = (makeTempFile("r1out.", ".fq"), makeTempFile("r2out.", ".fq"))
new TrimFastq(input=Seq(r1, r2), output=Seq(r1Out, r2Out), length=15, exclude=true).execute()
new TrimFastq(input=Seq(r1, r2), output=Seq(r1Out, r2Out), length=Seq(15), exclude=true).execute()
val r1Map = FastqSource(r1Out).map(r => r.name -> r).toMap
val r2Map = FastqSource(r2Out).map(r => r.name -> r).toMap
r1Map.size shouldBe 1
r2Map.size shouldBe r1Map.size
r1Map("20x20").length shouldBe 15
r2Map("20x20").length shouldBe 15
}

// Verifies that a separate trim length can be supplied for each input FASTQ:
// the first length (10) is applied to the first input and the second (15) to the second input.
// NOTE(review): despite the test name, `exclude = false` here, so no records are discarded;
// the assertions below expect all 3 records in both outputs.
it should "trim some reads and discard others with FASTQ specific lengths" in {
val (r1, r2) = fqFiles
val (r1Out, r2Out) = (makeTempFile("r1out.", ".fq"), makeTempFile("r2out.", ".fq"))
// One length per input file: r1 -> 10, r2 -> 15.
new TrimFastq(input = Seq(r1, r2), output=Seq(r1Out, r2Out), length = Seq(10, 15), exclude = false).execute()
// Index the written records by name so each one can be checked individually.
val r1Map = FastqSource(r1Out).map(r => r.name -> r).toMap
val r2Map = FastqSource(r2Out).map(r => r.name -> r).toMap
// Nothing is excluded, so both outputs keep every record.
r1Map.size shouldBe 3
r2Map.size shouldBe r1Map.size
// Every read in the first output is expected to be exactly 10 bases after trimming to 10.
r1Map("10x10").length shouldBe 10
r1Map("10x20").length shouldBe 10
r1Map("20x20").length shouldBe 10
// In the second output (trim length 15) the "10x10" read stays at 10 while the others end up at 15.
// Presumably record names encode "<R1 length>x<R2 length>" in the fixture; verify against fqFiles.
r2Map("10x10").length shouldBe 10
r2Map("10x20").length shouldBe 15
r2Map("20x20").length shouldBe 15
}
}

0 comments on commit afa97e6

Please sign in to comment.