Add cs alignment and cs example to sam

Koeng101 · Sep 9, 2024 · cc83cb5 · cc83cb5
1 parent 4c1108b
commit cc83cb5
Show file tree

Hide file tree

Showing 4 changed files with 285 additions and 1 deletion.
diff --git a/lib/align/cs/cs.go b/lib/align/cs/cs.go
@@ -0,0 +1,159 @@
+/*
+Package cs contains functions for parsing minimap2 cs optional tags.
+
+The following is taken directly from the minimap2 README documentation:
+
+# The cs optional tag
+
+The `cs` SAM/PAF tag encodes bases at mismatches and INDELs. It matches regular
+expression `/(:[0-9]+|\*[a-z][a-z]|[=\+\-][A-Za-z]+)+/`. Like CIGAR, `cs`
+consists of series of operations.  Each leading character specifies the
+operation; the following sequence is the one involved in the operation.
+
+The `cs` tag is enabled by command line option `--cs`. The following alignment,
+for example:
+
+	CGATCGATAAATAGAGTAG---GAATAGCA
+	||||||   ||||||||||   |||| |||
+	CGATCG---AATAGAGTAGGTCGAATtGCA
+
+is represented as `:6-ata:10+gtc:4*at:3`, where `:[0-9]+` represents an
+identical block, `-ata` represents a deletion, `+gtc` an insertion and `*at`
+indicates reference base `a` is substituted with a query base `t`. It is
+similar to the `MD` SAM tag but is standalone and easier to parse.
+
+If `--cs=long` is used, the `cs` string also contains identical sequences in
+the alignment. The above example will become
+`=CGATCG-ata=AATAGAGTAG+gtc=GAAT*at=GCA`. The long form of `cs` encodes both
+reference and query sequences in one string. The `cs` tag also encodes intron
+positions and splicing signals (see the [minimap2 manpage][manpage-cs] for
+details).
+*/
+package cs
+
+import (
+	"strconv"
+	"unicode"
+)
+
+// CS is a struct format of each element of a cs string. For example, the cs
+//
+//	:6-ata:10+gtc:4*at:3
+//
+// would be divided into 7 CS structs. These can be directly processed, or
+// further processed into DigestedCS and DigestedInsertions.
+type CS struct {
+	Type   rune   // Acceptable types: [* + :]
+	Size   int    // Size of cs. For example, :6 would be 6 or -ata would be 3
+	Change string // if insertion or deletion, write out the change. -ata would be ata
+}
+
+/*
+Context for developers:
+
+DigestedCS is my way of encoding more simply changes for the needs of DNA
+synthesis validation. The core idea is that if we remove the need for
+specifically encoding insertions, we can flatten the data structure, and make
+it a lot easier to use. The full insertion information can be stored separately
+in a DigestedInsertion struct.
+*/
+
+// DigestedCS is a simplified way of viewing CS, specifically for finding
+// mutations in sequences. It only contains the position of a given change
+// and the type of change. For point mutations and indels, this contains full
+// information of the change - a longer deletion will simply be a few * in a
+// row. However, it doesn't contain the full information for insertions. For
+// DNA synthesis applications, this is mostly OK because insertions are very
+// rare.
+type DigestedCS struct {
+	Position uint64 // Position in the sequence with the change
+	Type     uint8  // The change type. Can only be [A T G C * +].
+}
+
+// DigestedInsertion separately stores inserts from the DigestedCS, in case
+// this data is needed for analysis later.
+type DigestedInsertion struct {
+	Position  uint64 // Position in the sequence with an insertion
+	Insertion string // the insertion string
+}
+
+// toCS is a function that properly adds size to CS. We use a function here
+// instead copying this work twice to get 100% testing.
+func toCS(current CS, numBuffer string) CS {
+	if current.Type == ':' {
+		current.Size, _ = strconv.Atoi(numBuffer)
+	} else if current.Type == '*' {
+		current.Size = 1 // point mutations are always 1 base pair
+	} else {
+		current.Size = len(current.Change)
+	}
+	return current
+}
+
+// ParseCS takes a short CS string and returns a slice of CS structs
+func ParseCS(csString string) []CS {
+	var result []CS
+	var current CS
+	var numBuffer string
+
+	for _, char := range csString {
+		switch {
+		case char == ':' || char == '*' || char == '+' || char == '-':
+			if current.Type != 0 {
+				result = append(result, toCS(current, numBuffer))
+			}
+			current = CS{Type: char}
+			numBuffer = ""
+		case unicode.IsDigit(char):
+			numBuffer += string(char)
+		default:
+			current.Change += string(char)
+		}
+	}
+
+	// Add the last CS struct
+	if current.Type != 0 {
+		result = append(result, toCS(current, numBuffer))
+	}
+
+	return result
+}
+
+// DigestCS takes a slice of CS structs and returns slices of DigestedCS and DigestedInsertion
+func DigestCS(csSlice []CS) ([]DigestedCS, []DigestedInsertion) {
+	var digestedCS []DigestedCS
+	var digestedInsertions []DigestedInsertion
+	position := uint64(0)
+
+	for _, cs := range csSlice {
+		switch cs.Type {
+		case ':':
+			position += uint64(cs.Size)
+		case '*':
+			digestedCS = append(digestedCS, DigestedCS{Position: position, Type: cs.Change[1]}) // *at we need t
+			position++
+		case '-':
+			for i := 0; i < cs.Size; i++ {
+				digestedCS = append(digestedCS, DigestedCS{
+					Position: position,
+					Type:     '*',
+				})
+				position++
+			}
+		case '+':
+			digestedCS = append(digestedCS, DigestedCS{
+				Position: position,
+				Type:     '+',
+			})
+			// Insertions are positioned at where they are inserted. For example,
+			// if we have it between 18 and 19, the insertion "Position" would be 19
+			digestedInsertions = append(digestedInsertions, DigestedInsertion{
+				Position:  position,
+				Insertion: cs.Change,
+			})
+			// Don't increment position for insertions
+		}
+	}
+
+	return digestedCS, digestedInsertions
+}
diff --git a/lib/align/cs/cs_test.go b/lib/align/cs/cs_test.go
@@ -0,0 +1,107 @@
+package cs
+
+import (
+	"reflect"
+	"testing"
+)
+
+func TestParseAndDigestCS(t *testing.T) {
+	// Test case from the reference documentation
+	csString := ":6-ata:10+gtc:4*at:3"
+
+	// Test ParseCS
+	expectedCS := []CS{
+		{Type: ':', Size: 6, Change: ""},
+		{Type: '-', Size: 3, Change: "ata"},
+		{Type: ':', Size: 10, Change: ""},
+		{Type: '+', Size: 3, Change: "gtc"},
+		{Type: ':', Size: 4, Change: ""},
+		{Type: '*', Size: 1, Change: "at"},
+		{Type: ':', Size: 3, Change: ""},
+	}
+
+	parsedCS := ParseCS(csString)
+
+	if !reflect.DeepEqual(parsedCS, expectedCS) {
+		t.Errorf("ParseCS(%s) = %v, want %v", csString, parsedCS, expectedCS)
+	}
+
+	// Test DigestCS
+	expectedDigestedCS := []DigestedCS{
+		{Position: 6, Type: '*'},
+		{Position: 7, Type: '*'},
+		{Position: 8, Type: '*'},
+		{Position: 19, Type: '+'},
+		{Position: 23, Type: 't'},
+	}
+
+	expectedDigestedInsertions := []DigestedInsertion{
+		{Position: 19, Insertion: "gtc"},
+	}
+
+	digestedCS, digestedInsertions := DigestCS(parsedCS)
+
+	if !reflect.DeepEqual(digestedCS, expectedDigestedCS) {
+		t.Errorf("DigestCS() digestedCS = %v, want %v", digestedCS, expectedDigestedCS)
+	}
+
+	if !reflect.DeepEqual(digestedInsertions, expectedDigestedInsertions) {
+		t.Errorf("DigestCS() digestedInsertions = %v, want %v", digestedInsertions, expectedDigestedInsertions)
+	}
+}
+
+func TestParseAndDigestLongerCS(t *testing.T) {
+	// Test case with a longer CS string
+	csString := ":38-t:25-c:147*ag:29+gta:9*gc:3-gag:6-t:94-cgca:77-c*ag:16-at:4*ct:1+g:1*gc:98-c:191-c:200*ga:21*ga:2*ca:1-c:83*ga"
+
+	// Test ParseCS
+	expectedCS := []CS{
+		{Type: ':', Size: 38, Change: ""},
+		{Type: '-', Size: 1, Change: "t"},
+		{Type: ':', Size: 25, Change: ""},
+		{Type: '-', Size: 1, Change: "c"},
+		{Type: ':', Size: 147, Change: ""},
+		{Type: '*', Size: 1, Change: "ag"},
+		{Type: ':', Size: 29, Change: ""},
+		{Type: '+', Size: 3, Change: "gta"},
+		{Type: ':', Size: 9, Change: ""},
+		{Type: '*', Size: 1, Change: "gc"},
+		{Type: ':', Size: 3, Change: ""},
+		{Type: '-', Size: 3, Change: "gag"},
+		{Type: ':', Size: 6, Change: ""},
+		{Type: '-', Size: 1, Change: "t"},
+		{Type: ':', Size: 94, Change: ""},
+		{Type: '-', Size: 4, Change: "cgca"},
+		{Type: ':', Size: 77, Change: ""},
+		{Type: '-', Size: 1, Change: "c"},
+		{Type: '*', Size: 1, Change: "ag"},
+		{Type: ':', Size: 16, Change: ""},
+		{Type: '-', Size: 2, Change: "at"},
+		{Type: ':', Size: 4, Change: ""},
+		{Type: '*', Size: 1, Change: "ct"},
+		{Type: ':', Size: 1, Change: ""},
+		{Type: '+', Size: 1, Change: "g"},
+		{Type: ':', Size: 1, Change: ""},
+		{Type: '*', Size: 1, Change: "gc"},
+		{Type: ':', Size: 98, Change: ""},
+		{Type: '-', Size: 1, Change: "c"},
+		{Type: ':', Size: 191, Change: ""},
+		{Type: '-', Size: 1, Change: "c"},
+		{Type: ':', Size: 200, Change: ""},
+		{Type: '*', Size: 1, Change: "ga"},
+		{Type: ':', Size: 21, Change: ""},
+		{Type: '*', Size: 1, Change: "ga"},
+		{Type: ':', Size: 2, Change: ""},
+		{Type: '*', Size: 1, Change: "ca"},
+		{Type: ':', Size: 1, Change: ""},
+		{Type: '-', Size: 1, Change: "c"},
+		{Type: ':', Size: 83, Change: ""},
+		{Type: '*', Size: 1, Change: "ga"},
+	}
+
+	parsedCS := ParseCS(csString)
+
+	if !reflect.DeepEqual(parsedCS, expectedCS) {
+		t.Errorf("ParseCS(%s) = %v, want %v", csString, parsedCS, expectedCS)
+	}
+}
diff --git a/lib/bio/sam/sam.go b/lib/bio/sam/sam.go
@@ -525,7 +525,8 @@ func (p *Parser) Next() (Alignment, error) {
 	var optionals []Optional
 	for _, value := range values[11:] {
 		valueSplit := strings.Split(value, ":")
-		optionals = append(optionals, Optional{Tag: valueSplit[0], Type: rune(valueSplit[1][0]), Data: valueSplit[2]})
+		// tag+:+Type+: == len()+1+1+1
+		optionals = append(optionals, Optional{Tag: valueSplit[0], Type: rune(valueSplit[1][0]), Data: value[len(valueSplit[0])+1+1+1:]})
 	}
 	alignment.Optionals = optionals
 	return alignment, nil

diff --git a/lib/bio/sam/sam_test.go b/lib/bio/sam/sam_test.go
@@ -296,3 +296,20 @@ func TestPrimary(t *testing.T) {
 		})
 	}
 }
+
+func ExampleCs() {
+	file := strings.NewReader(`@HD	VN:1.6	SO:unsorted	GO:query
+@SQ	SN:pOpen_V3_amplified	LN:2482
+@PG	ID:minimap2	PN:minimap2	VN:2.24-r1155-dirty	CL:minimap2 -acLx map-ont - APX814_pass_barcode17_e229f2c8_109f9b91_0.fastq.gz
+a2e75729-6c94-44e1-a9a2-c84fbea683a4	16	amplicon	1	60	70S15M3I377M2I3M1D71M1D5M1D2M1D224M1D282M1I86M87S	*	0	0	CTACGTATTGCTAAGGTTAACACAAAGACACCGACAACTTTCTTCAGCACTCCCAACCCGAACCCCATATGGGTTATTGTCTCATGAAGAGTGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACCTGCACCAGTCAGTAAAACGATGGCCAGTGACTTGGTCTCAAATGAACACGATCAACATCGCAAAGTCGGACTTCTCGGACATCGAGCTCGCAGCAATCCCGTTCAACGCACTGGCAGACCACTACGGTGAGCGCCTGGCACGCGAGCAGCTGGCACTGGAGCACGAGTCGTACGAGATGGGTGAGGCACGCTTCCGCAAGATGTTCGAGCGCCAGCTGAAGGCAGGTGAGGTGGCAGACAACGCAGCAGCAAAGCCGCTGATCACGACGCTGCTGCCGAAGATGATCGCACGCATCAACGACTGGTTCGAGGAGGTGAAGGGGCAAGCGTGGTAAGCGCCCGACGGCATTCCAGTTCCTGGACGAGATCAAGCCGGAGGCAGTGGCATACATCACCTCCAACACGCTGGCATGCCTGACGTCGGCAGACAACACGACGGTGCAGGCAGTGGCATCGGCAATCGGTCGCGCAATCGAGGACGAGGCAGAGTTCGGTCGCATCCGCGACCTGGAGGCAAAGCACTTCAAGAAGAACGTGGAGGAGCAGCTGAACAAGCGCGTGGGTCACGTGTACAAGAAGGCATTCATGCAGGTGGTGGAGGCAGACATGCTGTCGAAGGGTCTGCTGGTGGTGAGGCATGGTCGTCGTGGCACAAGGAGGACTCGATCCACGTGGGTGTGCGCTGCATCGAGATGCTGATCGAGTCGACGGGTATGGTGTCGCTGCACCGCCAGAACGCAGGTGTGGTGGGTCAGGACTCGGAGGACATCGAGCTGGCACCGGAGTACGCAGAGGCAATCGCAACGCGCGCAGGTGCACTGGCAGGTATCTCGCCGATGTTCCAGCCGTGCCAGGTGCCGCCGAAGCCGTGGACGGGTATCACGGGTGGTGGTTACTGGGGAAACGGTACGCCGCCCGCTGGCACTGGTGCGCACGCACTCGAAGACGAGACCAAGTCGTCATAGCTGTTTCCTGAGAGCTTGGCAGGTGATGACTCACCTTAACCTTGTCCTTCAGGTGCTGAAGAAAGTTGTCGGTGTCTTTGTGTTAACCTTAGCAATACGTAACTGAACGAGGTGAAC	&())+-887733;@@><==>?@33336=666778;=>>?811-,-,*++,''&()***+9EHHGNGGGFKGDEBBB@@@@AA?2///00100:8=>CELFDCFCCDCDGHFBAAEHSKHIFCECBCCBCCCDEBBC>==?@AAAAAB@@??>:953)(((+***-967722434?66666656559:99<A@@@@ABCBCBD???;9A;2++,,,EFLGPLKSFB;5/..)***+DAHGMKSLIL..---GDEIIFDDLKGGJSJIRJJHFEJISSKLNKJHGHJEGHLHHHIGGFIEGDHSSSLESI==<FSKSHMHNSIJCBEED;889<DMHISSHISJJJSJKHILSKJIJJSIILSHPJGKGGLSFNJNJHSLHIQIMIHKHMSKGISMJKIKKIJHFILDE94458EEFHIPGSLILHHSGHESSQJHRHSSIJKKNKSSOJLADFHCCEG9510//+')(('&&'&&)*,-+,,-+(((((()@@BFSGSMSB<;<=?*((()?@SSSSHEJGNIHIFIFGJKHBA:8*&%%$$$$%%&''),;BDKHKSKSGMBIOMGNFIGILFISIQEIEFGFHSKSIPGSIFFEGMMHMQJHOSHJSRMJLSKSHKHKJJIMLSJCBFGLPGSHHKGPJJKHLHSSKILJJSECHDJJMFOHHGGSIHGNSSMSHJSKISJHB)((()?EFIGLIMSSIJEDI<;?LJSIHHPHSRSHSKSKMJLKFMFMJSSMLHSGJ860//8;=D@BDBBSHJFIHGB>A@BBEKS**H=A@>>>EHGDQJHLJLLHJQSGEQCHGEGCBDFNJIPA@@B<;;<<CHIGFKLNHSHILRMSGSKJMSHMHGMSGLJIIOSKLJGSJLNHESE>><<<ACLHFJEJFJDDHDEDJFKPILIMMMJMGLHGSGNGJGJMHEE)))2567=B?>ABFKSSGSIJGGDSLSGHKFFDDEGENSSHSKKKSGHLMSSHSIJSSHNSSHDDDDESLKIF>BGGIEKIJJDBD===7.....HKPSJSMHNSFGLPGOFJGCHJISSGJNSE??;))))*??7211116;<>GDBDNGJGIGI??DDBF=;::6;<>=100:C=BCPDD???EHEFDFGJLLIIILSB><<41...//139>?;;;;;AOH65556DFGHGGIGDIHISGKSJFGSKKGDBCDFGMGFJB?>=>?CDEFEDJEJIHFFFC>766669::8991)(((-((''&%$##%&%%	NM:i:18	ms:i:2037	AS:i:2034	nn:i:0	tp:A:P	cm:i:168	s1:i:969	s2:i:0	de:f:0.014	cs:Z::15+gaa:3*ct:74*ct:67*gc:230+gg:3-a:5*ct:65-g*ac:2*ac:1-g:2-g:224-g:274*cg:7+a:86	rl:i:0`)
+	parser, _, _ := NewParser(file, DefaultMaxLineSize)
+	samLine, _ := parser.Next()
+
+	for _, optional := range samLine.Optionals {
+		if optional.Tag == "cs" && optional.Type == 'Z' {
+			fmt.Println(optional.Data)
+		}
+	}
+
+	// Output: :15+gaa:3*ct:74*ct:67*gc:230+gg:3-a:5*ct:65-g*ac:2*ac:1-g:2-g:224-g:274*cg:7+a:86
+}