diff --git a/lib/align/cs/cs.go b/lib/align/cs/cs.go new file mode 100644 index 0000000..67da04c --- /dev/null +++ b/lib/align/cs/cs.go @@ -0,0 +1,159 @@ +/* +Package cs contains functions for parsing minimap2 cs optional tags. + +The following is taken directly from the minimap2 README documentation: + +# The cs optional tag + +The `cs` SAM/PAF tag encodes bases at mismatches and INDELs. It matches regular +expression `/(:[0-9]+|\*[a-z][a-z]|[=\+\-][A-Za-z]+)+/`. Like CIGAR, `cs` +consists of series of operations. Each leading character specifies the +operation; the following sequence is the one involved in the operation. + +The `cs` tag is enabled by command line option `--cs`. The following alignment, +for example: + + CGATCGATAAATAGAGTAG---GAATAGCA + |||||| |||||||||| |||| ||| + CGATCG---AATAGAGTAGGTCGAATtGCA + +is represented as `:6-ata:10+gtc:4*at:3`, where `:[0-9]+` represents an +identical block, `-ata` represents a deletion, `+gtc` an insertion and `*at` +indicates reference base `a` is substituted with a query base `t`. It is +similar to the `MD` SAM tag but is standalone and easier to parse. + +If `--cs=long` is used, the `cs` string also contains identical sequences in +the alignment. The above example will become +`=CGATCG-ata=AATAGAGTAG+gtc=GAAT*at=GCA`. The long form of `cs` encodes both +reference and query sequences in one string. The `cs` tag also encodes intron +positions and splicing signals (see the [minimap2 manpage][manpage-cs] for +details). +*/ +package cs + +import ( + "strconv" + "unicode" +) + +// CS is a struct format of each element of a cs string. For example, the cs +// +// :6-ata:10+gtc:4*at:3 +// +// would be divided into 7 CS structs. These can be directly processed, or +// further processed into DigestedCS and DigestedInsertions. +type CS struct { + Type rune // Acceptable types: [* + :] + Size int // Size of cs. For example, :6 would be 6 or -ata would be 3 + Change string // if insertion or deletion, write out the change. -ata would be ata +} + +/* +Context for developers: + +DigestedCS is my way of encoding more simply changes for the needs of DNA +synthesis validation. The core idea is that if we remove the need for +specifically encoding insertions, we can flatten the data structure, and make +it a lot easier to use. The full insertion information can be stored separately +in a DigestedInsertion struct. +*/ + +// DigestedCS is a simplified way of viewing CS, specifically for finding +// mutations in sequences. It only contains the position of a given change +// and the type of change. For point mutations and indels, this contains full +// information of the change - a longer deletion will simply be a few * in a +// row. However, it doesn't contain the full information for insertions. For +// DNA synthesis applications, this is mostly OK because insertions are very +// rare. +type DigestedCS struct { + Position uint64 // Position in the sequence with the change + Type uint8 // The change type. Can only be [A T G C * +]. +} + +// DigestedInsertion separately stores inserts from the DigestedCS, in case +// this data is needed for analysis later. +type DigestedInsertion struct { + Position uint64 // Position in the sequence with an insertion + Insertion string // the insertion string +} + +// toCS is a function that properly adds size to CS. We use a function here +// instead copying this work twice to get 100% testing. +func toCS(current CS, numBuffer string) CS { + if current.Type == ':' { + current.Size, _ = strconv.Atoi(numBuffer) + } else if current.Type == '*' { + current.Size = 1 // point mutations are always 1 base pair + } else { + current.Size = len(current.Change) + } + return current +} + +// ParseCS takes a short CS string and returns a slice of CS structs +func ParseCS(csString string) []CS { + var result []CS + var current CS + var numBuffer string + + for _, char := range csString { + switch { + case char == ':' || char == '*' || char == '+' || char == '-': + if current.Type != 0 { + result = append(result, toCS(current, numBuffer)) + } + current = CS{Type: char} + numBuffer = "" + case unicode.IsDigit(char): + numBuffer += string(char) + default: + current.Change += string(char) + } + } + + // Add the last CS struct + if current.Type != 0 { + result = append(result, toCS(current, numBuffer)) + } + + return result +} + +// DigestCS takes a slice of CS structs and returns slices of DigestedCS and DigestedInsertion +func DigestCS(csSlice []CS) ([]DigestedCS, []DigestedInsertion) { + var digestedCS []DigestedCS + var digestedInsertions []DigestedInsertion + position := uint64(0) + + for _, cs := range csSlice { + switch cs.Type { + case ':': + position += uint64(cs.Size) + case '*': + digestedCS = append(digestedCS, DigestedCS{Position: position, Type: cs.Change[1]}) // *at we need t + position++ + case '-': + for i := 0; i < cs.Size; i++ { + digestedCS = append(digestedCS, DigestedCS{ + Position: position, + Type: '*', + }) + position++ + } + case '+': + digestedCS = append(digestedCS, DigestedCS{ + Position: position, + Type: '+', + }) + // Insertions are positioned at where they are inserted. For example, + // if we have it between 18 and 19, the insertion "Position" would be 19 + digestedInsertions = append(digestedInsertions, DigestedInsertion{ + Position: position, + Insertion: cs.Change, + }) + // Don't increment position for insertions + } + } + + return digestedCS, digestedInsertions +} diff --git a/lib/align/cs/cs_test.go b/lib/align/cs/cs_test.go new file mode 100644 index 0000000..88714b4 --- /dev/null +++ b/lib/align/cs/cs_test.go @@ -0,0 +1,107 @@ +package cs + +import ( + "reflect" + "testing" +) + +func TestParseAndDigestCS(t *testing.T) { + // Test case from the reference documentation + csString := ":6-ata:10+gtc:4*at:3" + + // Test ParseCS + expectedCS := []CS{ + {Type: ':', Size: 6, Change: ""}, + {Type: '-', Size: 3, Change: "ata"}, + {Type: ':', Size: 10, Change: ""}, + {Type: '+', Size: 3, Change: "gtc"}, + {Type: ':', Size: 4, Change: ""}, + {Type: '*', Size: 1, Change: "at"}, + {Type: ':', Size: 3, Change: ""}, + } + + parsedCS := ParseCS(csString) + + if !reflect.DeepEqual(parsedCS, expectedCS) { + t.Errorf("ParseCS(%s) = %v, want %v", csString, parsedCS, expectedCS) + } + + // Test DigestCS + expectedDigestedCS := []DigestedCS{ + {Position: 6, Type: '*'}, + {Position: 7, Type: '*'}, + {Position: 8, Type: '*'}, + {Position: 19, Type: '+'}, + {Position: 23, Type: 't'}, + } + + expectedDigestedInsertions := []DigestedInsertion{ + {Position: 19, Insertion: "gtc"}, + } + + digestedCS, digestedInsertions := DigestCS(parsedCS) + + if !reflect.DeepEqual(digestedCS, expectedDigestedCS) { + t.Errorf("DigestCS() digestedCS = %v, want %v", digestedCS, expectedDigestedCS) + } + + if !reflect.DeepEqual(digestedInsertions, expectedDigestedInsertions) { + t.Errorf("DigestCS() digestedInsertions = %v, want %v", digestedInsertions, expectedDigestedInsertions) + } +} + +func TestParseAndDigestLongerCS(t *testing.T) { + // Test case with a longer CS string + csString := ":38-t:25-c:147*ag:29+gta:9*gc:3-gag:6-t:94-cgca:77-c*ag:16-at:4*ct:1+g:1*gc:98-c:191-c:200*ga:21*ga:2*ca:1-c:83*ga" + + // Test ParseCS + expectedCS := []CS{ + {Type: ':', Size: 38, Change: ""}, + {Type: '-', Size: 1, Change: "t"}, + {Type: ':', Size: 25, Change: ""}, + {Type: '-', Size: 1, Change: "c"}, + {Type: ':', Size: 147, Change: ""}, + {Type: '*', Size: 1, Change: "ag"}, + {Type: ':', Size: 29, Change: ""}, + {Type: '+', Size: 3, Change: "gta"}, + {Type: ':', Size: 9, Change: ""}, + {Type: '*', Size: 1, Change: "gc"}, + {Type: ':', Size: 3, Change: ""}, + {Type: '-', Size: 3, Change: "gag"}, + {Type: ':', Size: 6, Change: ""}, + {Type: '-', Size: 1, Change: "t"}, + {Type: ':', Size: 94, Change: ""}, + {Type: '-', Size: 4, Change: "cgca"}, + {Type: ':', Size: 77, Change: ""}, + {Type: '-', Size: 1, Change: "c"}, + {Type: '*', Size: 1, Change: "ag"}, + {Type: ':', Size: 16, Change: ""}, + {Type: '-', Size: 2, Change: "at"}, + {Type: ':', Size: 4, Change: ""}, + {Type: '*', Size: 1, Change: "ct"}, + {Type: ':', Size: 1, Change: ""}, + {Type: '+', Size: 1, Change: "g"}, + {Type: ':', Size: 1, Change: ""}, + {Type: '*', Size: 1, Change: "gc"}, + {Type: ':', Size: 98, Change: ""}, + {Type: '-', Size: 1, Change: "c"}, + {Type: ':', Size: 191, Change: ""}, + {Type: '-', Size: 1, Change: "c"}, + {Type: ':', Size: 200, Change: ""}, + {Type: '*', Size: 1, Change: "ga"}, + {Type: ':', Size: 21, Change: ""}, + {Type: '*', Size: 1, Change: "ga"}, + {Type: ':', Size: 2, Change: ""}, + {Type: '*', Size: 1, Change: "ca"}, + {Type: ':', Size: 1, Change: ""}, + {Type: '-', Size: 1, Change: "c"}, + {Type: ':', Size: 83, Change: ""}, + {Type: '*', Size: 1, Change: "ga"}, + } + + parsedCS := ParseCS(csString) + + if !reflect.DeepEqual(parsedCS, expectedCS) { + t.Errorf("ParseCS(%s) = %v, want %v", csString, parsedCS, expectedCS) + } +} diff --git a/lib/bio/sam/sam.go b/lib/bio/sam/sam.go index ea81013..e8d4697 100644 --- a/lib/bio/sam/sam.go +++ b/lib/bio/sam/sam.go @@ -525,7 +525,8 @@ func (p *Parser) Next() (Alignment, error) { var optionals []Optional for _, value := range values[11:] { valueSplit := strings.Split(value, ":") - optionals = append(optionals, Optional{Tag: valueSplit[0], Type: rune(valueSplit[1][0]), Data: valueSplit[2]}) + // tag+:+Type+: == len()+1+1+1 + optionals = append(optionals, Optional{Tag: valueSplit[0], Type: rune(valueSplit[1][0]), Data: value[len(valueSplit[0])+1+1+1:]}) } alignment.Optionals = optionals return alignment, nil diff --git a/lib/bio/sam/sam_test.go b/lib/bio/sam/sam_test.go index b17d335..c369e56 100644 --- a/lib/bio/sam/sam_test.go +++ b/lib/bio/sam/sam_test.go @@ -296,3 +296,20 @@ func TestPrimary(t *testing.T) { }) } } + +func ExampleCs() { + file := strings.NewReader(`@HD VN:1.6 SO:unsorted GO:query +@SQ SN:pOpen_V3_amplified LN:2482 +@PG ID:minimap2 PN:minimap2 VN:2.24-r1155-dirty CL:minimap2 -acLx map-ont - APX814_pass_barcode17_e229f2c8_109f9b91_0.fastq.gz +a2e75729-6c94-44e1-a9a2-c84fbea683a4 16 amplicon 1 60 70S15M3I377M2I3M1D71M1D5M1D2M1D224M1D282M1I86M87S * 0 0 CTACGTATTGCTAAGGTTAACACAAAGACACCGACAACTTTCTTCAGCACTCCCAACCCGAACCCCATATGGGTTATTGTCTCATGAAGAGTGGATACATATTTGAATGTATTTAGAAAAATAAACAAATAGGGGTTCCGCGCACCTGCACCAGTCAGTAAAACGATGGCCAGTGACTTGGTCTCAAATGAACACGATCAACATCGCAAAGTCGGACTTCTCGGACATCGAGCTCGCAGCAATCCCGTTCAACGCACTGGCAGACCACTACGGTGAGCGCCTGGCACGCGAGCAGCTGGCACTGGAGCACGAGTCGTACGAGATGGGTGAGGCACGCTTCCGCAAGATGTTCGAGCGCCAGCTGAAGGCAGGTGAGGTGGCAGACAACGCAGCAGCAAAGCCGCTGATCACGACGCTGCTGCCGAAGATGATCGCACGCATCAACGACTGGTTCGAGGAGGTGAAGGGGCAAGCGTGGTAAGCGCCCGACGGCATTCCAGTTCCTGGACGAGATCAAGCCGGAGGCAGTGGCATACATCACCTCCAACACGCTGGCATGCCTGACGTCGGCAGACAACACGACGGTGCAGGCAGTGGCATCGGCAATCGGTCGCGCAATCGAGGACGAGGCAGAGTTCGGTCGCATCCGCGACCTGGAGGCAAAGCACTTCAAGAAGAACGTGGAGGAGCAGCTGAACAAGCGCGTGGGTCACGTGTACAAGAAGGCATTCATGCAGGTGGTGGAGGCAGACATGCTGTCGAAGGGTCTGCTGGTGGTGAGGCATGGTCGTCGTGGCACAAGGAGGACTCGATCCACGTGGGTGTGCGCTGCATCGAGATGCTGATCGAGTCGACGGGTATGGTGTCGCTGCACCGCCAGAACGCAGGTGTGGTGGGTCAGGACTCGGAGGACATCGAGCTGGCACCGGAGTACGCAGAGGCAATCGCAACGCGCGCAGGTGCACTGGCAGGTATCTCGCCGATGTTCCAGCCGTGCCAGGTGCCGCCGAAGCCGTGGACGGGTATCACGGGTGGTGGTTACTGGGGAAACGGTACGCCGCCCGCTGGCACTGGTGCGCACGCACTCGAAGACGAGACCAAGTCGTCATAGCTGTTTCCTGAGAGCTTGGCAGGTGATGACTCACCTTAACCTTGTCCTTCAGGTGCTGAAGAAAGTTGTCGGTGTCTTTGTGTTAACCTTAGCAATACGTAACTGAACGAGGTGAAC &())+-887733;@@><==>?@33336=666778;=>>?811-,-,*++,''&()***+9EHHGNGGGFKGDEBBB@@@@AA?2///00100:8=>CELFDCFCCDCDGHFBAAEHSKHIFCECBCCBCCCDEBBC>==?@AAAAAB@@??>:953)(((+***-967722434?66666656559:99A@BBEKS**H=A@>>>EHGDQJHLJLLHJQSGEQCHGEGCBDFNJIPA@@B<;;<><<ABFKSSGSIJGGDSLSGHKFFDDEGENSSHSKKKSGHLMSSHSIJSSHNSSHDDDDESLKIF>BGGIEKIJJDBD===7.....HKPSJSMHNSFGLPGOFJGCHJISSGJNSE??;))))*??7211116;<>GDBDNGJGIGI??DDBF=;::6;<>=100:C=BCPDD???EHEFDFGJLLIIILSB><<41...//139>?;;;;;AOH65556DFGHGGIGDIHISGKSJFGSKKGDBCDFGMGFJB?>=>?CDEFEDJEJIHFFFC>766669::8991)(((-((''&%$##%&%% NM:i:18 ms:i:2037 AS:i:2034 nn:i:0 tp:A:P cm:i:168 s1:i:969 s2:i:0 de:f:0.014 cs:Z::15+gaa:3*ct:74*ct:67*gc:230+gg:3-a:5*ct:65-g*ac:2*ac:1-g:2-g:224-g:274*cg:7+a:86 rl:i:0`) + parser, _, _ := NewParser(file, DefaultMaxLineSize) + samLine, _ := parser.Next() + + for _, optional := range samLine.Optionals { + if optional.Tag == "cs" && optional.Type == 'Z' { + fmt.Println(optional.Data) + } + } + + // Output: :15+gaa:3*ct:74*ct:67*gc:230+gg:3-a:5*ct:65-g*ac:2*ac:1-g:2-g:224-g:274*cg:7+a:86 +}