Skip to content

Commit

Permalink
Add cs alignment and cs example to sam
Browse files Browse the repository at this point in the history
  • Loading branch information
Koeng101 committed Sep 9, 2024
1 parent 4c1108b commit cc83cb5
Show file tree
Hide file tree
Showing 4 changed files with 285 additions and 1 deletion.
159 changes: 159 additions & 0 deletions lib/align/cs/cs.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,159 @@
/*
Package cs contains functions for parsing minimap2 cs optional tags.
The following is taken directly from the minimap2 README documentation:
# The cs optional tag
The `cs` SAM/PAF tag encodes bases at mismatches and INDELs. It matches regular
expression `/(:[0-9]+|\*[a-z][a-z]|[=\+\-][A-Za-z]+)+/`. Like CIGAR, `cs`
consists of series of operations. Each leading character specifies the
operation; the following sequence is the one involved in the operation.
The `cs` tag is enabled by command line option `--cs`. The following alignment,
for example:
CGATCGATAAATAGAGTAG---GAATAGCA
|||||| |||||||||| |||| |||
CGATCG---AATAGAGTAGGTCGAATtGCA
is represented as `:6-ata:10+gtc:4*at:3`, where `:[0-9]+` represents an
identical block, `-ata` represents a deletion, `+gtc` an insertion and `*at`
indicates reference base `a` is substituted with a query base `t`. It is
similar to the `MD` SAM tag but is standalone and easier to parse.
If `--cs=long` is used, the `cs` string also contains identical sequences in
the alignment. The above example will become
`=CGATCG-ata=AATAGAGTAG+gtc=GAAT*at=GCA`. The long form of `cs` encodes both
reference and query sequences in one string. The `cs` tag also encodes intron
positions and splicing signals (see the [minimap2 manpage][manpage-cs] for
details).
*/
package cs

import (
"strconv"
"unicode"
)

// CS is a struct format of each element of a cs string. For example, the cs
//
// :6-ata:10+gtc:4*at:3
//
// would be divided into 7 CS structs. These can be directly processed, or
// further processed into DigestedCS and DigestedInsertions.
type CS struct {
Type rune // Acceptable types: [* + :]
Size int // Size of cs. For example, :6 would be 6 or -ata would be 3
Change string // if insertion or deletion, write out the change. -ata would be ata
}

/*
Context for developers:
DigestedCS is my way of encoding more simply changes for the needs of DNA
synthesis validation. The core idea is that if we remove the need for
specifically encoding insertions, we can flatten the data structure, and make
it a lot easier to use. The full insertion information can be stored separately
in a DigestedInsertion struct.
*/

// DigestedCS is a simplified way of viewing CS, specifically for finding
// mutations in sequences. It only contains the position of a given change
// and the type of change. For point mutations and indels, this contains full
// information of the change - a longer deletion will simply be a few * in a
// row. However, it doesn't contain the full information for insertions. For
// DNA synthesis applications, this is mostly OK because insertions are very
// rare.
type DigestedCS struct {
Position uint64 // Position in the sequence with the change
Type uint8 // The change type. Can only be [A T G C * +].
}

// DigestedInsertion separately stores inserts from the DigestedCS, in case
// this data is needed for analysis later.
type DigestedInsertion struct {
Position uint64 // Position in the sequence with an insertion
Insertion string // the insertion string
}

// toCS is a function that properly adds size to CS. We use a function here
// instead copying this work twice to get 100% testing.
func toCS(current CS, numBuffer string) CS {
if current.Type == ':' {
current.Size, _ = strconv.Atoi(numBuffer)
} else if current.Type == '*' {
current.Size = 1 // point mutations are always 1 base pair
} else {
current.Size = len(current.Change)
}
return current
}

// ParseCS takes a short CS string and returns a slice of CS structs
func ParseCS(csString string) []CS {
var result []CS
var current CS
var numBuffer string

for _, char := range csString {
switch {
case char == ':' || char == '*' || char == '+' || char == '-':
if current.Type != 0 {
result = append(result, toCS(current, numBuffer))
}
current = CS{Type: char}
numBuffer = ""
case unicode.IsDigit(char):
numBuffer += string(char)
default:
current.Change += string(char)
}
}

// Add the last CS struct
if current.Type != 0 {
result = append(result, toCS(current, numBuffer))
}

return result
}

// DigestCS takes a slice of CS structs and returns slices of DigestedCS and DigestedInsertion
func DigestCS(csSlice []CS) ([]DigestedCS, []DigestedInsertion) {
var digestedCS []DigestedCS
var digestedInsertions []DigestedInsertion
position := uint64(0)

for _, cs := range csSlice {
switch cs.Type {
case ':':
position += uint64(cs.Size)
case '*':
digestedCS = append(digestedCS, DigestedCS{Position: position, Type: cs.Change[1]}) // *at we need t
position++
case '-':
for i := 0; i < cs.Size; i++ {
digestedCS = append(digestedCS, DigestedCS{
Position: position,
Type: '*',
})
position++
}
case '+':
digestedCS = append(digestedCS, DigestedCS{
Position: position,
Type: '+',
})
// Insertions are positioned at where they are inserted. For example,
// if we have it between 18 and 19, the insertion "Position" would be 19
digestedInsertions = append(digestedInsertions, DigestedInsertion{
Position: position,
Insertion: cs.Change,
})
// Don't increment position for insertions
}
}

return digestedCS, digestedInsertions
}
107 changes: 107 additions & 0 deletions lib/align/cs/cs_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
package cs

import (
"reflect"
"testing"
)

func TestParseAndDigestCS(t *testing.T) {
// Test case from the reference documentation
csString := ":6-ata:10+gtc:4*at:3"

// Test ParseCS
expectedCS := []CS{
{Type: ':', Size: 6, Change: ""},
{Type: '-', Size: 3, Change: "ata"},
{Type: ':', Size: 10, Change: ""},
{Type: '+', Size: 3, Change: "gtc"},
{Type: ':', Size: 4, Change: ""},
{Type: '*', Size: 1, Change: "at"},
{Type: ':', Size: 3, Change: ""},
}

parsedCS := ParseCS(csString)

if !reflect.DeepEqual(parsedCS, expectedCS) {
t.Errorf("ParseCS(%s) = %v, want %v", csString, parsedCS, expectedCS)
}

// Test DigestCS
expectedDigestedCS := []DigestedCS{
{Position: 6, Type: '*'},
{Position: 7, Type: '*'},
{Position: 8, Type: '*'},
{Position: 19, Type: '+'},
{Position: 23, Type: 't'},
}

expectedDigestedInsertions := []DigestedInsertion{
{Position: 19, Insertion: "gtc"},
}

digestedCS, digestedInsertions := DigestCS(parsedCS)

if !reflect.DeepEqual(digestedCS, expectedDigestedCS) {
t.Errorf("DigestCS() digestedCS = %v, want %v", digestedCS, expectedDigestedCS)
}

if !reflect.DeepEqual(digestedInsertions, expectedDigestedInsertions) {
t.Errorf("DigestCS() digestedInsertions = %v, want %v", digestedInsertions, expectedDigestedInsertions)
}
}

func TestParseAndDigestLongerCS(t *testing.T) {
// Test case with a longer CS string
csString := ":38-t:25-c:147*ag:29+gta:9*gc:3-gag:6-t:94-cgca:77-c*ag:16-at:4*ct:1+g:1*gc:98-c:191-c:200*ga:21*ga:2*ca:1-c:83*ga"

// Test ParseCS
expectedCS := []CS{
{Type: ':', Size: 38, Change: ""},
{Type: '-', Size: 1, Change: "t"},
{Type: ':', Size: 25, Change: ""},
{Type: '-', Size: 1, Change: "c"},
{Type: ':', Size: 147, Change: ""},
{Type: '*', Size: 1, Change: "ag"},
{Type: ':', Size: 29, Change: ""},
{Type: '+', Size: 3, Change: "gta"},
{Type: ':', Size: 9, Change: ""},
{Type: '*', Size: 1, Change: "gc"},
{Type: ':', Size: 3, Change: ""},
{Type: '-', Size: 3, Change: "gag"},
{Type: ':', Size: 6, Change: ""},
{Type: '-', Size: 1, Change: "t"},
{Type: ':', Size: 94, Change: ""},
{Type: '-', Size: 4, Change: "cgca"},
{Type: ':', Size: 77, Change: ""},
{Type: '-', Size: 1, Change: "c"},
{Type: '*', Size: 1, Change: "ag"},
{Type: ':', Size: 16, Change: ""},
{Type: '-', Size: 2, Change: "at"},
{Type: ':', Size: 4, Change: ""},
{Type: '*', Size: 1, Change: "ct"},
{Type: ':', Size: 1, Change: ""},
{Type: '+', Size: 1, Change: "g"},
{Type: ':', Size: 1, Change: ""},
{Type: '*', Size: 1, Change: "gc"},
{Type: ':', Size: 98, Change: ""},
{Type: '-', Size: 1, Change: "c"},
{Type: ':', Size: 191, Change: ""},
{Type: '-', Size: 1, Change: "c"},
{Type: ':', Size: 200, Change: ""},
{Type: '*', Size: 1, Change: "ga"},
{Type: ':', Size: 21, Change: ""},
{Type: '*', Size: 1, Change: "ga"},
{Type: ':', Size: 2, Change: ""},
{Type: '*', Size: 1, Change: "ca"},
{Type: ':', Size: 1, Change: ""},
{Type: '-', Size: 1, Change: "c"},
{Type: ':', Size: 83, Change: ""},
{Type: '*', Size: 1, Change: "ga"},
}

parsedCS := ParseCS(csString)

if !reflect.DeepEqual(parsedCS, expectedCS) {
t.Errorf("ParseCS(%s) = %v, want %v", csString, parsedCS, expectedCS)
}
}
3 changes: 2 additions & 1 deletion lib/bio/sam/sam.go
Original file line number Diff line number Diff line change
Expand Up @@ -525,7 +525,8 @@ func (p *Parser) Next() (Alignment, error) {
var optionals []Optional
for _, value := range values[11:] {
valueSplit := strings.Split(value, ":")
optionals = append(optionals, Optional{Tag: valueSplit[0], Type: rune(valueSplit[1][0]), Data: valueSplit[2]})
// tag+:+Type+: == len()+1+1+1
optionals = append(optionals, Optional{Tag: valueSplit[0], Type: rune(valueSplit[1][0]), Data: value[len(valueSplit[0])+1+1+1:]})
}
alignment.Optionals = optionals
return alignment, nil
Expand Down
17 changes: 17 additions & 0 deletions lib/bio/sam/sam_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -296,3 +296,20 @@ func TestPrimary(t *testing.T) {
})
}
}

func ExampleCs() {

Check failure on line 300 in lib/bio/sam/sam_test.go

View workflow job for this annotation

GitHub Actions / lint

tests: ExampleCs refers to unknown identifier: Cs (govet)
file := strings.NewReader(`@HD VN:1.6 SO:unsorted GO:query
@SQ SN:pOpen_V3_amplified LN:2482
@PG ID:minimap2 PN:minimap2 VN:2.24-r1155-dirty CL:minimap2 -acLx map-ont - APX814_pass_barcode17_e229f2c8_109f9b91_0.fastq.gz
a2e75729-6c94-44e1-a9a2-c84fbea683a4 16 amplicon 1 60 70S15M3I377M2I3M1D71M1D5M1D2M1D224M1D282M1I86M87S * 0 0 CTACGTATTGCTAAGGTTAACACAAAGACACCGACAACTTTCTTCAGCACTCCCAACCCGAACCCCATATGGGTTATTGTCTCATGAAGAGTGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACCTGCACCAGTCAGTAAAACGATGGCCAGTGACTTGGTCTCAAATGAACACGATCAACATCGCAAAGTCGGACTTCTCGGACATCGAGCTCGCAGCAATCCCGTTCAACGCACTGGCAGACCACTACGGTGAGCGCCTGGCACGCGAGCAGCTGGCACTGGAGCACGAGTCGTACGAGATGGGTGAGGCACGCTTCCGCAAGATGTTCGAGCGCCAGCTGAAGGCAGGTGAGGTGGCAGACAACGCAGCAGCAAAGCCGCTGATCACGACGCTGCTGCCGAAGATGATCGCACGCATCAACGACTGGTTCGAGGAGGTGAAGGGGCAAGCGTGGTAAGCGCCCGACGGCATTCCAGTTCCTGGACGAGATCAAGCCGGAGGCAGTGGCATACATCACCTCCAACACGCTGGCATGCCTGACGTCGGCAGACAACACGACGGTGCAGGCAGTGGCATCGGCAATCGGTCGCGCAATCGAGGACGAGGCAGAGTTCGGTCGCATCCGCGACCTGGAGGCAAAGCACTTCAAGAAGAACGTGGAGGAGCAGCTGAACAAGCGCGTGGGTCACGTGTACAAGAAGGCATTCATGCAGGTGGTGGAGGCAGACATGCTGTCGAAGGGTCTGCTGGTGGTGAGGCATGGTCGTCGTGGCACAAGGAGGACTCGATCCACGTGGGTGTGCGCTGCATCGAGATGCTGATCGAGTCGACGGGTATGGTGTCGCTGCACCGCCAGAACGCAGGTGTGGTGGGTCAGGACTCGGAGGACATCGAGCTGGCACCGGAGTACGCAGAGGCAATCGCAACGCGCGCAGGTGCACTGGCAGGTATCTCGCCGATGTTCCAGCCGTGCCAGGTGCCGCCGAAGCCGTGGACGGGTATCACGGGTGGTGGTTACTGGGGAAACGGTACGCCGCCCGCTGGCACTGGTGCGCACGCACTCGAAGACGAGACCAAGTCGTCATAGCTGTTTCCTGAGAGCTTGGCAGGTGATGACTCACCTTAACCTTGTCCTTCAGGTGCTGAAGAAAGTTGTCGGTGTCTTTGTGTTAACCTTAGCAATACGTAACTGAACGAGGTGAAC &())+-887733;@@><==>?@33336=666778;=>>?811-,-,*++,''&()***+9EHHGNGGGFKGDEBBB@@@@AA?2///00100:8=>CELFDCFCCDCDGHFBAAEHSKHIFCECBCCBCCCDEBBC>==?@AAAAAB@@??>:953)(((+***-967722434?66666656559:99<A@@@@ABCBCBD???;9A;2++,,,EFLGPLKSFB;5/..)***+DAHGMKSLIL..---GDEIIFDDLKGGJSJIRJJHFEJISSKLNKJHGHJEGHLHHHIGGFIEGDHSSSLESI==<FSKSHMHNSIJCBEED;889<DMHISSHISJJJSJKHILSKJIJJSIILSHPJGKGGLSFNJNJHSLHIQIMIHKHMSKGISMJKIKKIJHFILDE94458EEFHIPGSLILHHSGHESSQJHRHSSIJKKNKSSOJLADFHCCEG9510//+')(('&&'&&)*,-+,,-+(((((()@@BFSGSMSB<;<=?*((()?@SSSSHEJGNIHIFIFGJKHBA:8*&%%$$$$%%&''),;BDKHKSKSGMBIOMGNFIGILFISIQEIEFGFHSKSIPGSIFFEGMMHMQJHOSHJSRMJLSKSHKHKJJIMLSJCBFGLPGSHHKGPJJKHLHSSKILJJSECHDJJMFOHHGGSIHGNSSMSHJSKISJHB)((()?EFIGLIMSSIJEDI<;?LJSIHHPHSRSHSKSKMJLKFMFMJSSMLHSGJ860//8;=D@BDBBSHJFIHGB>A@BBEKS**H=A@>>>EHGDQJHLJLLHJQSGEQCHGEGCBDFNJIPA@@B<;;<<CHIGFKLNHSHILRMSGSKJMSHMHGMSGLJIIOSKLJGSJLNHESE>><<<ACLHFJEJFJDDHDEDJFKPILIMMMJMGLHGSGNGJGJMHEE)))2567=B?>ABFKSSGSIJGGDSLSGHKFFDDEGENSSHSKKKSGHLMSSHSIJSSHNSSHDDDDESLKIF>BGGIEKIJJDBD===7.....HKPSJSMHNSFGLPGOFJGCHJISSGJNSE??;))))*??7211116;<>GDBDNGJGIGI??DDBF=;::6;<>=100:C=BCPDD???EHEFDFGJLLIIILSB><<41...//139>?;;;;;AOH65556DFGHGGIGDIHISGKSJFGSKKGDBCDFGMGFJB?>=>?CDEFEDJEJIHFFFC>766669::8991)(((-((''&%$##%&%% NM:i:18 ms:i:2037 AS:i:2034 nn:i:0 tp:A:P cm:i:168 s1:i:969 s2:i:0 de:f:0.014 cs:Z::15+gaa:3*ct:74*ct:67*gc:230+gg:3-a:5*ct:65-g*ac:2*ac:1-g:2-g:224-g:274*cg:7+a:86 rl:i:0`)
parser, _, _ := NewParser(file, DefaultMaxLineSize)
samLine, _ := parser.Next()

for _, optional := range samLine.Optionals {
if optional.Tag == "cs" && optional.Type == 'Z' {
fmt.Println(optional.Data)
}
}

// Output: :15+gaa:3*ct:74*ct:67*gc:230+gg:3-a:5*ct:65-g*ac:2*ac:1-g:2-g:224-g:274*cg:7+a:86
}

0 comments on commit cc83cb5

Please sign in to comment.