add plasmidcall

Koeng101 · Sep 11, 2024 · 6ef8cd8 · 6ef8cd8
1 parent cc83cb5
commit 6ef8cd8
Show file tree

Hide file tree

Showing 6 changed files with 311 additions and 24 deletions.
diff --git a/external/minimap2/minimap2.go b/external/minimap2/minimap2.go
@@ -72,7 +72,7 @@ func Minimap2(templateFastas []fasta.Record, fastqInput io.Reader, w io.Writer)
 	tmpFile.Close() // Close the file as it's no longer needed
 
 	// Start minimap2 pointing to the temporary file and stdin for sequencing data
-	cmd := exec.Command("minimap2", "-K", "100", "-ax", "map-ont", tmpFile.Name(), "-")
+	cmd := exec.Command("minimap2", "--cs=short", "-K", "100", "-ax", "map-ont", tmpFile.Name(), "-")
 	cmd.Stdout = w
 	cmd.Stdin = fastqInput
 	if err := cmd.Start(); err != nil {

diff --git a/lib/align/cs/cs.go b/lib/align/cs/cs.go
@@ -34,6 +34,8 @@ package cs
 import (
 	"strconv"
 	"unicode"
+
+	"github.com/koeng101/dnadesign/lib/transform"
 )
 
 // CS is a struct format of each element of a cs string. For example, the cs
@@ -66,15 +68,19 @@ in a DigestedInsertion struct.
 // DNA synthesis applications, this is mostly OK because insertions are very
 // rare.
 type DigestedCS struct {
-	Position uint64 // Position in the sequence with the change
-	Type     uint8  // The change type. Can only be [A T G C * +].
+	Position          uint64 // Position in the sequence with the change
+	Type              uint8  // The change type. Can only be [. A T G C * +]
+	Qual              byte   // The byte of the quality
+	ReverseComplement bool
 }
 
 // DigestedInsertion separately stores inserts from the DigestedCS, in case
 // this data is needed for analysis later.
 type DigestedInsertion struct {
-	Position  uint64 // Position in the sequence with an insertion
-	Insertion string // the insertion string
+	Position          uint64 // Position in the sequence with an insertion
+	Insertion         string // the insertion string
+	Qual              string // The string of the quality
+	ReverseComplement bool
 }
 
 // toCS is a function that properly adds size to CS. We use a function here
@@ -120,38 +126,50 @@ func ParseCS(csString string) []CS {
 }
 
 // DigestCS takes a slice of CS structs and returns slices of DigestedCS and DigestedInsertion
-func DigestCS(csSlice []CS) ([]DigestedCS, []DigestedInsertion) {
+func DigestCS(csSlice []CS, quality string, reverseComplement bool) ([]DigestedCS, []DigestedInsertion) {
+	if reverseComplement {
+		quality = transform.ReverseString(quality)
+	}
 	var digestedCS []DigestedCS
 	var digestedInsertions []DigestedInsertion
 	position := uint64(0)
+	// We want the quality of the fragment to follow the query, not the
+	// reference sequence.
+	var qualityPosition int
 
 	for _, cs := range csSlice {
 		switch cs.Type {
 		case ':':
-			position += uint64(cs.Size)
+			for i := 0; i < cs.Size; i++ {
+				digestedCS = append(digestedCS, DigestedCS{Position: position, Type: '.', ReverseComplement: reverseComplement, Qual: quality[qualityPosition]})
+				position++
+				qualityPosition++
+			}
 		case '*':
-			digestedCS = append(digestedCS, DigestedCS{Position: position, Type: cs.Change[1]}) // *at we need t
+			digestedCS = append(digestedCS, DigestedCS{Position: position, Type: cs.Change[1], ReverseComplement: reverseComplement, Qual: quality[qualityPosition]}) // *at we need t
 			position++
+			qualityPosition++
 		case '-':
 			for i := 0; i < cs.Size; i++ {
 				digestedCS = append(digestedCS, DigestedCS{
-					Position: position,
-					Type:     '*',
+					Position:          position,
+					Type:              '*',
+					ReverseComplement: reverseComplement,
+					// No quality for deletions mutations.
 				})
 				position++
 			}
 		case '+':
-			digestedCS = append(digestedCS, DigestedCS{
-				Position: position,
-				Type:     '+',
-			})
 			// Insertions are positioned at where they are inserted. For example,
 			// if we have it between 18 and 19, the insertion "Position" would be 19
 			digestedInsertions = append(digestedInsertions, DigestedInsertion{
-				Position:  position,
-				Insertion: cs.Change,
+				Position:          position,
+				Insertion:         cs.Change,
+				ReverseComplement: reverseComplement,
+				Qual:              quality[qualityPosition : qualityPosition+len(cs.Change)],
 			})
-			// Don't increment position for insertions
+			// Don't increment position for insertions, but increment qualityPosition
+			qualityPosition += len(cs.Change)
 		}
 	}
 

diff --git a/lib/align/cs/cs_test.go b/lib/align/cs/cs_test.go
@@ -28,18 +28,40 @@ func TestParseAndDigestCS(t *testing.T) {
 
 	// Test DigestCS
 	expectedDigestedCS := []DigestedCS{
-		{Position: 6, Type: '*'},
-		{Position: 7, Type: '*'},
-		{Position: 8, Type: '*'},
-		{Position: 19, Type: '+'},
-		{Position: 23, Type: 't'},
+		{Position: 0, Type: 46, Qual: 65, ReverseComplement: false},
+		{Position: 1, Type: 46, Qual: 66, ReverseComplement: false},
+		{Position: 2, Type: 46, Qual: 67, ReverseComplement: false},
+		{Position: 3, Type: 46, Qual: 68, ReverseComplement: false},
+		{Position: 4, Type: 46, Qual: 69, ReverseComplement: false},
+		{Position: 5, Type: 46, Qual: 70, ReverseComplement: false},
+		{Position: 6, Type: 42, Qual: 0, ReverseComplement: false},
+		{Position: 7, Type: 42, Qual: 0, ReverseComplement: false},
+		{Position: 8, Type: 42, Qual: 0, ReverseComplement: false},
+		{Position: 9, Type: 46, Qual: 71, ReverseComplement: false},
+		{Position: 10, Type: 46, Qual: 72, ReverseComplement: false},
+		{Position: 11, Type: 46, Qual: 73, ReverseComplement: false},
+		{Position: 12, Type: 46, Qual: 74, ReverseComplement: false},
+		{Position: 13, Type: 46, Qual: 75, ReverseComplement: false},
+		{Position: 14, Type: 46, Qual: 76, ReverseComplement: false},
+		{Position: 15, Type: 46, Qual: 77, ReverseComplement: false},
+		{Position: 16, Type: 46, Qual: 78, ReverseComplement: false},
+		{Position: 17, Type: 46, Qual: 79, ReverseComplement: false},
+		{Position: 18, Type: 46, Qual: 80, ReverseComplement: false},
+		{Position: 19, Type: 46, Qual: 84, ReverseComplement: false},
+		{Position: 20, Type: 46, Qual: 85, ReverseComplement: false},
+		{Position: 21, Type: 46, Qual: 86, ReverseComplement: false},
+		{Position: 22, Type: 46, Qual: 87, ReverseComplement: false},
+		{Position: 23, Type: 116, Qual: 88, ReverseComplement: false},
+		{Position: 24, Type: 46, Qual: 89, ReverseComplement: false},
+		{Position: 25, Type: 46, Qual: 90, ReverseComplement: false},
+		{Position: 26, Type: 46, Qual: 97, ReverseComplement: false},
 	}
 
 	expectedDigestedInsertions := []DigestedInsertion{
-		{Position: 19, Insertion: "gtc"},
+		{Position: 19, Insertion: "gtc", Qual: "QRS", ReverseComplement: false},
 	}
 
-	digestedCS, digestedInsertions := DigestCS(parsedCS)
+	digestedCS, digestedInsertions := DigestCS(parsedCS, "ABCDEFGHIJKLMNOPQRSTUVWXYZa", false)
 
 	if !reflect.DeepEqual(digestedCS, expectedDigestedCS) {
 		t.Errorf("DigestCS() digestedCS = %v, want %v", digestedCS, expectedDigestedCS)

diff --git a/lib/sequencing/plasmidcall/plasmidcall.go b/lib/sequencing/plasmidcall/plasmidcall.go
@@ -0,0 +1,217 @@
+/*
+Package plasmidcall contains functions from calling whether or not a plasmid
+is mutated or not given sequencing data.
+
+In particular, it assesses a plasmid given cs alignments from minimap2. The
+cs alignments are compiled, and at each position, mutations are assigned.
+There are a couple kinds of mutations:
+
+  - DepthMutation. Not enough sequencing depth to make any call (at least 5x)
+  - NanoporeMutation. Occurs when 80% of mutations are only on 1 strand,
+    so long as we also have 5 examples of correct sequence on the other strand.
+  - CleanMutation. Occurs when over 80% of a given position is mutated to a
+    single mutation type.
+  - MixedMutation. Occurs when 20% to 80% of a given position are not correct.
+*/
+package plasmidcall
+
+import (
+	"math"
+
+	"github.com/koeng101/dnadesign/lib/align/cs"
+	"github.com/koeng101/dnadesign/lib/bio/fastq"
+)
+
+// MutationType contains the potential mutation types, including
+// depth, nanopore_error, point, indel, insertion, and noisy.
+type MutationType string
+
+const (
+	DepthError    MutationType = "depth"
+	NanoporeError MutationType = "nanopore_error"
+	Point         MutationType = "point"
+	Indel         MutationType = "indel"
+	Insertion     MutationType = "insertion"
+)
+
+// Mutation contains mutation data at a certain location.
+type Mutation struct {
+	Position       int
+	MutatioNString string
+	MutationType   MutationType
+	Change         string
+	CleanMutation  bool // False if MixedMutation, True if CleanMutation
+}
+
+type CsAlignment struct {
+	Read              fastq.Read
+	ReverseComplement bool
+	CS                []cs.CS
+}
+
+// CallMutations calls mutations from a range of cs alignments.
+func CallMutations(referenceSequence string, alignments []CsAlignment) ([]Mutation, error) {
+	totalDigestedCS := make(map[int][]cs.DigestedCS)
+	totalDigestedInsertions := make(map[int][]cs.DigestedInsertion)
+	for _, alignment := range alignments {
+		// Produce digestedCS and digestedInsertion for each alignment
+		digestedCS, digestedInsertions := cs.DigestCS(alignment.CS, alignment.Read.Quality, alignment.ReverseComplement)
+		for i := range digestedCS {
+			totalDigestedCS[i] = append(totalDigestedCS[i], digestedCS[i])
+		}
+		for i := range digestedInsertions {
+			totalDigestedInsertions[int(digestedInsertions[i].Position)] = append(totalDigestedInsertions[int(digestedInsertions[i].Position)], digestedInsertions[i])
+		}
+	}
+	var mutations []Mutation
+	// Now we have a map of all the potential mutations at each point in the sequence. Now, we iterate over them.
+	for i := range referenceSequence {
+		// i is the particular position of the reference sequence.
+		csAtPosition := totalDigestedCS[i]
+		insertionsAtPosition := totalDigestedInsertions[i]
+
+		// Check for depth mutation
+		if len(csAtPosition) < 5 {
+			mutations = append(mutations, Mutation{
+				Position:      i,
+				MutationType:  DepthError,
+				CleanMutation: true,
+			})
+			continue
+		}
+
+		// First, we check for insertions
+		if len(insertionsAtPosition) > 0 {
+			insertionPercentage := float64(len(insertionsAtPosition)) / float64(len(csAtPosition))
+			insertionCounts := make(map[string]int)
+			var maxCount int
+			var mostCommon string
+
+			for _, insertion := range insertionsAtPosition {
+				insertionCounts[insertion.Insertion]++
+				if insertionCounts[insertion.Insertion] > maxCount {
+					maxCount = insertionCounts[insertion.Insertion]
+					mostCommon = insertion.Insertion
+				}
+			}
+			if insertionPercentage > 0.8 {
+				mutations = append(mutations, Mutation{
+					Position:      i,
+					MutationType:  Insertion,
+					Change:        mostCommon,
+					CleanMutation: true,
+				})
+			} else if insertionPercentage >= 0.25 {
+				mutations = append(mutations, Mutation{
+					Position:      i,
+					MutationType:  Insertion,
+					Change:        mostCommon,
+					CleanMutation: false,
+				})
+			}
+
+		}
+
+		// Count occurrences of each mutation type
+		mutationCounts := make(map[uint8]int) // used later
+		totalMutations := 0
+		forwardStrand := make(map[uint8]int)
+		reverseStrand := make(map[uint8]int)
+		for _, digestedCS := range csAtPosition {
+			if !digestedCS.ReverseComplement {
+				forwardStrand[digestedCS.Type]++
+			} else {
+				reverseStrand[digestedCS.Type]++
+			}
+
+			mutationCounts[digestedCS.Type]++
+			if digestedCS.Type != '.' {
+				totalMutations++
+			}
+		}
+		// Now, get the mutational ratios of forward vs reverse.
+		// These will be used to call nanopore errors. Essentially,
+		// if we see a mutation to correct ratio that only occurs on
+		// a single strand (80% difference), we will call it as a nanopore
+		// error. Practically speaking, this means if there is a 5x
+		// representation of mutations on only one strand, we believe this
+		// is a nanopore sequencing error, and treat it as such.
+		mutationalRatios := make(map[uint8]float64)
+		for k, _ := range forwardStrand {
+			mutationalRatios[k] = 0
+		}
+		for k, _ := range reverseStrand {
+			mutationalRatios[k] = 0
+		}
+		var foundNanoporeMutation bool
+		for k, _ := range mutationalRatios {
+			if k != '.' {
+				forwardRatio := forwardStrand[k] / forwardStrand['.']
+				reverseRatio := reverseStrand[k] / reverseStrand['.']
+				// This just runs the test where if a strands correct/incorrect ratio is 5x on one strand and has 5
+				// correct reads on the other strand, then we
+				if ((forwardRatio > reverseRatio*5) && (reverseStrand['.'] > 5)) || ((reverseRatio > forwardRatio*5) && (forwardStrand['.'] > 5)) {
+					mutations = append(mutations, Mutation{Position: i, MutationType: NanoporeError, CleanMutation: true})
+					foundNanoporeMutation = true
+					break
+				}
+			}
+		}
+		// Skip checking other mutations if we find a nanopore mutation
+		if foundNanoporeMutation {
+			continue
+		}
+
+		// Check for clean or mixed mutations
+		if totalMutations > 0 {
+			var mostCommonMutation uint8
+			var maxCount int
+			for mut, count := range mutationCounts {
+				if count > maxCount {
+					mostCommonMutation = mut
+					maxCount = count
+				}
+			}
+
+			mutationType := Point
+			if mostCommonMutation == '*' {
+				mutationType = Indel
+			}
+
+			// Check for a clean mutation
+			mutationPercentage := float64(maxCount) / float64(len(csAtPosition))
+			if mutationPercentage > 0.8 {
+				if mostCommonMutation != '.' {
+					mutations = append(mutations, Mutation{
+						Position:      i,
+						MutationType:  mutationType,
+						Change:        string(mostCommonMutation),
+						CleanMutation: true,
+					})
+					continue
+				}
+			}
+
+			// Now, there might now a mixture of mutations
+			var foundMixedMutation bool
+			for mut, count := range mutationCounts {
+				if mut != '.' {
+					if (float64(count)/float64(mutationCounts['.']) + math.SmallestNonzeroFloat64) > .25 { // we add epsilon to prevent divide by zero
+						mutations = append(mutations, Mutation{
+							Position:      i,
+							MutationType:  mutationType,
+							Change:        string(mostCommonMutation),
+							CleanMutation: false,
+						})
+						foundMixedMutation = true
+						break
+					}
+				}
+			}
+			if foundMixedMutation {
+				continue
+			}
+		}
+	}
+	return mutations, nil
+}
diff --git a/lib/transform/transform.go b/lib/transform/transform.go
@@ -199,3 +199,24 @@ var complementTableRNA = [256]byte{
 	'y': 'r',
 	'x': 'x',
 }
+
+// ReverseString reverses a string. Mostly takes from a solution by Russ Cox
+//
+//	http://groups.google.com/group/golang-nuts/browse_thread/thread/a0fb81698275eede
+//
+// This is mainly used for reverse quality scores on sequencing data.
+func ReverseString(input string) string {
+	n := 0
+	rune := make([]rune, len(input))
+	for _, r := range input {
+		rune[n] = r
+		n++
+	}
+	rune = rune[0:n]
+	// Reverse
+	for i := 0; i < n/2; i++ {
+		rune[i], rune[n-1-i] = rune[n-1-i], rune[i]
+	}
+	// Convert back to UTF-8.
+	return string(rune)
+}