Skip to content

Commit

Permalink
add plasmidcall
Browse files Browse the repository at this point in the history
  • Loading branch information
Koeng101 committed Sep 11, 2024
1 parent cc83cb5 commit 6ef8cd8
Show file tree
Hide file tree
Showing 6 changed files with 311 additions and 24 deletions.
2 changes: 1 addition & 1 deletion external/minimap2/minimap2.go
Original file line number Diff line number Diff line change
Expand Up @@ -72,7 +72,7 @@ func Minimap2(templateFastas []fasta.Record, fastqInput io.Reader, w io.Writer)
tmpFile.Close() // Close the file as it's no longer needed

// Start minimap2 pointing to the temporary file and stdin for sequencing data
cmd := exec.Command("minimap2", "-K", "100", "-ax", "map-ont", tmpFile.Name(), "-")
cmd := exec.Command("minimap2", "--cs=short", "-K", "100", "-ax", "map-ont", tmpFile.Name(), "-")
cmd.Stdout = w
cmd.Stdin = fastqInput
if err := cmd.Start(); err != nil {
Expand Down
50 changes: 34 additions & 16 deletions lib/align/cs/cs.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,6 +34,8 @@ package cs
import (
"strconv"
"unicode"

"github.com/koeng101/dnadesign/lib/transform"
)

// CS is a struct format of each element of a cs string. For example, the cs
Expand Down Expand Up @@ -66,15 +68,19 @@ in a DigestedInsertion struct.
// DNA synthesis applications, this is mostly OK because insertions are very
// rare.
type DigestedCS struct {
Position uint64 // Position in the sequence with the change
Type uint8 // The change type. Can only be [A T G C * +].
Position uint64 // Position in the sequence with the change
Type uint8 // The change type. Can only be [. A T G C * +]
Qual byte // The byte of the quality
ReverseComplement bool
}

// DigestedInsertion separately stores inserts from the DigestedCS, in case
// this data is needed for analysis later.
type DigestedInsertion struct {
Position uint64 // Position in the sequence with an insertion
Insertion string // the insertion string
Position uint64 // Position in the sequence with an insertion
Insertion string // the insertion string
Qual string // The string of the quality
ReverseComplement bool
}

// toCS is a function that properly adds size to CS. We use a function here
Expand Down Expand Up @@ -120,38 +126,50 @@ func ParseCS(csString string) []CS {
}

// DigestCS takes a slice of CS structs and returns slices of DigestedCS and DigestedInsertion
func DigestCS(csSlice []CS) ([]DigestedCS, []DigestedInsertion) {
func DigestCS(csSlice []CS, quality string, reverseComplement bool) ([]DigestedCS, []DigestedInsertion) {
if reverseComplement {
quality = transform.ReverseString(quality)
}
var digestedCS []DigestedCS
var digestedInsertions []DigestedInsertion
position := uint64(0)
// We want the quality of the fragment to follow the query, not the
// reference sequence.
var qualityPosition int

for _, cs := range csSlice {
switch cs.Type {
case ':':
position += uint64(cs.Size)
for i := 0; i < cs.Size; i++ {
digestedCS = append(digestedCS, DigestedCS{Position: position, Type: '.', ReverseComplement: reverseComplement, Qual: quality[qualityPosition]})
position++
qualityPosition++
}
case '*':
digestedCS = append(digestedCS, DigestedCS{Position: position, Type: cs.Change[1]}) // *at we need t
digestedCS = append(digestedCS, DigestedCS{Position: position, Type: cs.Change[1], ReverseComplement: reverseComplement, Qual: quality[qualityPosition]}) // *at we need t
position++
qualityPosition++
case '-':
for i := 0; i < cs.Size; i++ {
digestedCS = append(digestedCS, DigestedCS{
Position: position,
Type: '*',
Position: position,
Type: '*',
ReverseComplement: reverseComplement,
// No quality for deletions mutations.
})
position++
}
case '+':
digestedCS = append(digestedCS, DigestedCS{
Position: position,
Type: '+',
})
// Insertions are positioned at where they are inserted. For example,
// if we have it between 18 and 19, the insertion "Position" would be 19
digestedInsertions = append(digestedInsertions, DigestedInsertion{
Position: position,
Insertion: cs.Change,
Position: position,
Insertion: cs.Change,
ReverseComplement: reverseComplement,
Qual: quality[qualityPosition : qualityPosition+len(cs.Change)],
})
// Don't increment position for insertions
// Don't increment position for insertions, but increment qualityPosition
qualityPosition += len(cs.Change)
}
}

Expand Down
36 changes: 29 additions & 7 deletions lib/align/cs/cs_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,18 +28,40 @@ func TestParseAndDigestCS(t *testing.T) {

// Test DigestCS
expectedDigestedCS := []DigestedCS{
{Position: 6, Type: '*'},
{Position: 7, Type: '*'},
{Position: 8, Type: '*'},
{Position: 19, Type: '+'},
{Position: 23, Type: 't'},
{Position: 0, Type: 46, Qual: 65, ReverseComplement: false},
{Position: 1, Type: 46, Qual: 66, ReverseComplement: false},
{Position: 2, Type: 46, Qual: 67, ReverseComplement: false},
{Position: 3, Type: 46, Qual: 68, ReverseComplement: false},
{Position: 4, Type: 46, Qual: 69, ReverseComplement: false},
{Position: 5, Type: 46, Qual: 70, ReverseComplement: false},
{Position: 6, Type: 42, Qual: 0, ReverseComplement: false},
{Position: 7, Type: 42, Qual: 0, ReverseComplement: false},
{Position: 8, Type: 42, Qual: 0, ReverseComplement: false},
{Position: 9, Type: 46, Qual: 71, ReverseComplement: false},
{Position: 10, Type: 46, Qual: 72, ReverseComplement: false},
{Position: 11, Type: 46, Qual: 73, ReverseComplement: false},
{Position: 12, Type: 46, Qual: 74, ReverseComplement: false},
{Position: 13, Type: 46, Qual: 75, ReverseComplement: false},
{Position: 14, Type: 46, Qual: 76, ReverseComplement: false},
{Position: 15, Type: 46, Qual: 77, ReverseComplement: false},
{Position: 16, Type: 46, Qual: 78, ReverseComplement: false},
{Position: 17, Type: 46, Qual: 79, ReverseComplement: false},
{Position: 18, Type: 46, Qual: 80, ReverseComplement: false},
{Position: 19, Type: 46, Qual: 84, ReverseComplement: false},
{Position: 20, Type: 46, Qual: 85, ReverseComplement: false},
{Position: 21, Type: 46, Qual: 86, ReverseComplement: false},
{Position: 22, Type: 46, Qual: 87, ReverseComplement: false},
{Position: 23, Type: 116, Qual: 88, ReverseComplement: false},
{Position: 24, Type: 46, Qual: 89, ReverseComplement: false},
{Position: 25, Type: 46, Qual: 90, ReverseComplement: false},
{Position: 26, Type: 46, Qual: 97, ReverseComplement: false},
}

expectedDigestedInsertions := []DigestedInsertion{
{Position: 19, Insertion: "gtc"},
{Position: 19, Insertion: "gtc", Qual: "QRS", ReverseComplement: false},
}

digestedCS, digestedInsertions := DigestCS(parsedCS)
digestedCS, digestedInsertions := DigestCS(parsedCS, "ABCDEFGHIJKLMNOPQRSTUVWXYZa", false)

if !reflect.DeepEqual(digestedCS, expectedDigestedCS) {
t.Errorf("DigestCS() digestedCS = %v, want %v", digestedCS, expectedDigestedCS)
Expand Down
217 changes: 217 additions & 0 deletions lib/sequencing/plasmidcall/plasmidcall.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,217 @@
/*
Package plasmidcall contains functions from calling whether or not a plasmid
is mutated or not given sequencing data.
In particular, it assesses a plasmid given cs alignments from minimap2. The
cs alignments are compiled, and at each position, mutations are assigned.
There are a couple kinds of mutations:
- DepthMutation. Not enough sequencing depth to make any call (at least 5x)
- NanoporeMutation. Occurs when 80% of mutations are only on 1 strand,
so long as we also have 5 examples of correct sequence on the other strand.
- CleanMutation. Occurs when over 80% of a given position is mutated to a
single mutation type.
- MixedMutation. Occurs when 20% to 80% of a given position are not correct.
*/
package plasmidcall

import (
"math"

"github.com/koeng101/dnadesign/lib/align/cs"
"github.com/koeng101/dnadesign/lib/bio/fastq"
)

// MutationType contains the potential mutation types, including
// depth, nanopore_error, point, indel, insertion, and noisy.
type MutationType string

const (
DepthError MutationType = "depth"
NanoporeError MutationType = "nanopore_error"
Point MutationType = "point"
Indel MutationType = "indel"
Insertion MutationType = "insertion"
)

// Mutation contains mutation data at a certain location.
type Mutation struct {
Position int
MutatioNString string
MutationType MutationType
Change string
CleanMutation bool // False if MixedMutation, True if CleanMutation
}

type CsAlignment struct {
Read fastq.Read
ReverseComplement bool
CS []cs.CS
}

// CallMutations calls mutations from a range of cs alignments.
func CallMutations(referenceSequence string, alignments []CsAlignment) ([]Mutation, error) {
totalDigestedCS := make(map[int][]cs.DigestedCS)
totalDigestedInsertions := make(map[int][]cs.DigestedInsertion)
for _, alignment := range alignments {
// Produce digestedCS and digestedInsertion for each alignment
digestedCS, digestedInsertions := cs.DigestCS(alignment.CS, alignment.Read.Quality, alignment.ReverseComplement)
for i := range digestedCS {
totalDigestedCS[i] = append(totalDigestedCS[i], digestedCS[i])
}
for i := range digestedInsertions {
totalDigestedInsertions[int(digestedInsertions[i].Position)] = append(totalDigestedInsertions[int(digestedInsertions[i].Position)], digestedInsertions[i])
}
}
var mutations []Mutation
// Now we have a map of all the potential mutations at each point in the sequence. Now, we iterate over them.
for i := range referenceSequence {
// i is the particular position of the reference sequence.
csAtPosition := totalDigestedCS[i]
insertionsAtPosition := totalDigestedInsertions[i]

// Check for depth mutation
if len(csAtPosition) < 5 {
mutations = append(mutations, Mutation{
Position: i,
MutationType: DepthError,
CleanMutation: true,
})
continue
}

// First, we check for insertions
if len(insertionsAtPosition) > 0 {
insertionPercentage := float64(len(insertionsAtPosition)) / float64(len(csAtPosition))
insertionCounts := make(map[string]int)
var maxCount int
var mostCommon string

for _, insertion := range insertionsAtPosition {
insertionCounts[insertion.Insertion]++
if insertionCounts[insertion.Insertion] > maxCount {
maxCount = insertionCounts[insertion.Insertion]
mostCommon = insertion.Insertion
}
}
if insertionPercentage > 0.8 {
mutations = append(mutations, Mutation{
Position: i,
MutationType: Insertion,
Change: mostCommon,
CleanMutation: true,
})
} else if insertionPercentage >= 0.25 {
mutations = append(mutations, Mutation{
Position: i,
MutationType: Insertion,
Change: mostCommon,
CleanMutation: false,
})
}

}

Check failure on line 113 in lib/sequencing/plasmidcall/plasmidcall.go

View workflow job for this annotation

GitHub Actions / lint

unnecessary trailing newline (whitespace)

// Count occurrences of each mutation type
mutationCounts := make(map[uint8]int) // used later
totalMutations := 0
forwardStrand := make(map[uint8]int)
reverseStrand := make(map[uint8]int)
for _, digestedCS := range csAtPosition {
if !digestedCS.ReverseComplement {
forwardStrand[digestedCS.Type]++
} else {
reverseStrand[digestedCS.Type]++
}

mutationCounts[digestedCS.Type]++
if digestedCS.Type != '.' {
totalMutations++
}
}
// Now, get the mutational ratios of forward vs reverse.
// These will be used to call nanopore errors. Essentially,
// if we see a mutation to correct ratio that only occurs on
// a single strand (80% difference), we will call it as a nanopore
// error. Practically speaking, this means if there is a 5x
// representation of mutations on only one strand, we believe this
// is a nanopore sequencing error, and treat it as such.
mutationalRatios := make(map[uint8]float64)
for k, _ := range forwardStrand {

Check failure on line 140 in lib/sequencing/plasmidcall/plasmidcall.go

View workflow job for this annotation

GitHub Actions / lint

File is not `gofmt`-ed with `-s` (gofmt)
mutationalRatios[k] = 0
}
for k, _ := range reverseStrand {
mutationalRatios[k] = 0
}
var foundNanoporeMutation bool
for k, _ := range mutationalRatios {
if k != '.' {
forwardRatio := forwardStrand[k] / forwardStrand['.']
reverseRatio := reverseStrand[k] / reverseStrand['.']
// This just runs the test where if a strands correct/incorrect ratio is 5x on one strand and has 5
// correct reads on the other strand, then we
if ((forwardRatio > reverseRatio*5) && (reverseStrand['.'] > 5)) || ((reverseRatio > forwardRatio*5) && (forwardStrand['.'] > 5)) {
mutations = append(mutations, Mutation{Position: i, MutationType: NanoporeError, CleanMutation: true})
foundNanoporeMutation = true
break
}
}
}
// Skip checking other mutations if we find a nanopore mutation
if foundNanoporeMutation {
continue
}

// Check for clean or mixed mutations
if totalMutations > 0 {
var mostCommonMutation uint8
var maxCount int
for mut, count := range mutationCounts {
if count > maxCount {
mostCommonMutation = mut
maxCount = count
}
}

mutationType := Point
if mostCommonMutation == '*' {
mutationType = Indel
}

// Check for a clean mutation
mutationPercentage := float64(maxCount) / float64(len(csAtPosition))
if mutationPercentage > 0.8 {
if mostCommonMutation != '.' {
mutations = append(mutations, Mutation{
Position: i,
MutationType: mutationType,
Change: string(mostCommonMutation),
CleanMutation: true,
})
continue
}
}

// Now, there might now a mixture of mutations
var foundMixedMutation bool
for mut, count := range mutationCounts {
if mut != '.' {
if (float64(count)/float64(mutationCounts['.']) + math.SmallestNonzeroFloat64) > .25 { // we add epsilon to prevent divide by zero
mutations = append(mutations, Mutation{
Position: i,
MutationType: mutationType,
Change: string(mostCommonMutation),
CleanMutation: false,
})
foundMixedMutation = true
break
}
}
}
if foundMixedMutation {
continue
}
}
}
return mutations, nil
}
21 changes: 21 additions & 0 deletions lib/transform/transform.go
Original file line number Diff line number Diff line change
Expand Up @@ -199,3 +199,24 @@ var complementTableRNA = [256]byte{
'y': 'r',
'x': 'x',
}

// ReverseString reverses a string. Mostly takes from a solution by Russ Cox
//
// http://groups.google.com/group/golang-nuts/browse_thread/thread/a0fb81698275eede
//
// This is mainly used for reverse quality scores on sequencing data.
func ReverseString(input string) string {
n := 0
rune := make([]rune, len(input))
for _, r := range input {
rune[n] = r
n++
}
rune = rune[0:n]
// Reverse
for i := 0; i < n/2; i++ {
rune[i], rune[n-1-i] = rune[n-1-i], rune[i]
}
// Convert back to UTF-8.
return string(rune)
}
Loading

0 comments on commit 6ef8cd8

Please sign in to comment.