From cd904a14642fe623314ea58879c29c24ef60c87d Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Fri, 5 Jan 2024 14:12:43 -0800 Subject: [PATCH 1/3] init --- lib/align/megamash/compress.go | 93 ++++++++++++++++++++++ lib/align/megamash/megamash.go | 119 ++++++++++++++++++++++++++++ lib/align/megamash/megamash_test.go | 71 +++++++++++++++++ 3 files changed, 283 insertions(+) create mode 100644 lib/align/megamash/compress.go create mode 100644 lib/align/megamash/megamash.go create mode 100644 lib/align/megamash/megamash_test.go diff --git a/lib/align/megamash/compress.go b/lib/align/megamash/compress.go new file mode 100644 index 0000000..34862d1 --- /dev/null +++ b/lib/align/megamash/compress.go @@ -0,0 +1,93 @@ +package megamash + +import "math" + +// CompressDNA takes a DNA sequence and converts it into a compressed byte slice. +func CompressDNA(dna string) []byte { + length := len(dna) + var lengthBytes []byte + var flag byte + + // Determine the size of the length field based on the sequence length + switch { + case length <= math.MaxUint8: + lengthBytes = []byte{byte(length)} + flag = 0x00 // flag for uint8 + case length <= math.MaxUint16: + lengthBytes = []byte{byte(length >> 8), byte(length)} + flag = 0x01 // flag for uint16 + case length <= math.MaxUint32: + lengthBytes = []byte{byte(length >> 24), byte(length >> 16), byte(length >> 8), byte(length)} + flag = 0x02 // flag for uint32 + default: + lengthBytes = []byte{byte(length >> 56), byte(length >> 48), byte(length >> 40), byte(length >> 32), byte(length >> 24), byte(length >> 16), byte(length >> 8), byte(length)} + flag = 0x03 // flag for uint64 + } + + // Encode DNA sequence + var dnaBytes []byte + var currentByte byte + for i, nucleotide := range dna { + switch nucleotide { + case 'A': // 00 + case 'T': // 01 + currentByte |= 1 << (6 - (i%4)*2) + case 'G': // 10 + currentByte |= 2 << (6 - (i%4)*2) + case 'C': // 11 + currentByte |= 3 << (6 - (i%4)*2) + } + if i%4 == 3 || i == length-1 { + dnaBytes = append(dnaBytes, currentByte) + currentByte = 0 + } + } + + // Combine all parts into the result + result := append([]byte{flag}, lengthBytes...) + result = append(result, dnaBytes...) + return result +} + +// DecompressDNA takes a compressed byte slice and converts it back to the original DNA string. +func DecompressDNA(compressed []byte) string { + flag := compressed[0] + var length int + lengthBytes := 1 + + switch flag { + case 0x00: + length = int(compressed[1]) + lengthBytes = 1 + case 0x01: + length = int(compressed[1])<<8 + int(compressed[2]) + lengthBytes = 2 + case 0x02: + length = int(compressed[1])<<24 + int(compressed[2])<<16 + int(compressed[3])<<8 + int(compressed[4]) + lengthBytes = 4 + case 0x03: + length = int(compressed[1])<<56 + int(compressed[2])<<48 + int(compressed[3])<<40 + int(compressed[4])<<32 + int(compressed[5])<<24 + int(compressed[6])<<16 + int(compressed[7])<<8 + int(compressed[8]) + lengthBytes = 8 + default: + return "Invalid flag" + } + + // Decode DNA sequence + var dna string + for i, b := range compressed[1+lengthBytes:] { + for j := 0; j < 4 && 4*i+j < length; j++ { + switch (b >> (6 - j*2)) & 0x03 { + case 0: + dna += "A" + case 1: + dna += "T" + case 2: + dna += "G" + case 3: + dna += "C" + } + } + } + + return dna +} diff --git a/lib/align/megamash/megamash.go b/lib/align/megamash/megamash.go new file mode 100644 index 0000000..4a1a037 --- /dev/null +++ b/lib/align/megamash/megamash.go @@ -0,0 +1,119 @@ +/* +Package megamash is an implementation of the megamash algorithm. + +Megamash is an algorithm developed by Keoni Gandall to find templates from +sequencing reactions. For example, you may have a pool of amplicons, and need +to get a count of how many times each amplicon shows up in a given sequencing +reaction. +*/ +package megamash + +import ( + "encoding/base64" + + "github.com/koeng101/dnadesign/lib/transform" +) + +func StandardizedCompressedDNA(sequence string) []byte { + var deterministicSequence string + reverseComplement := transform.ReverseComplement(sequence) + if sequence > reverseComplement { + deterministicSequence = reverseComplement + } else { + deterministicSequence = sequence + } + return CompressDNA(deterministicSequence) + +} + +type MegamashMap []map[string]bool + +func MakeMegamashMap(sequences []string, kmerSize uint) MegamashMap { + var megamashMap MegamashMap + for _, sequence := range sequences { + // First get all kmers with a given sequence + kmerMap := make(map[string]bool) + for i := 0; i <= len(sequence)-int(kmerSize); i++ { + kmerBytes := StandardizedCompressedDNA(sequence[i : i+int(kmerSize)]) + kmerBase64 := base64.StdEncoding.EncodeToString(kmerBytes) + kmerMap[kmerBase64] = true + } + + // Then, get unique kmers for this sequence and only this sequence + uniqueKmerMap := make(map[string]bool) + for kmerBase64, _ := range kmerMap { + unique := true + for _, otherMegaMashMap := range megamashMap { + _, ok := otherMegaMashMap[kmerBase64] + // If this kmer is found, set both to false + if ok { + otherMegaMashMap[kmerBase64] = false + unique = false + break + } + } + if unique { + uniqueKmerMap[kmerBase64] = true + } + } + + // Now we have a unique Kmer map for the given sequence. + // Add it to megamashMap + megamashMap = append(megamashMap, uniqueKmerMap) + } + // Finally, go back through and make a final megamashMap without + // all those falses. + var finalMegamashMap MegamashMap + for _, singleMegamashMap := range megamashMap { + finalMap := make(map[string]bool) + for kmerBase64, value := range singleMegamashMap { + if value { + finalMap[kmerBase64] = true + } + } + finalMegamashMap = append(finalMegamashMap, finalMap) + } + return finalMegamashMap +} + +func (m *MegamashMap) Score(sequence string) []float64 { + var scores []float64 + // The algorithm is as follows: + // - Go through each map. + // - Get the number of matching kmers + // - Divide that by the total kmers available for matching + + // First, get the kmer total + var kmerSize int +out: + for _, maps := range *m { + for kmer, _ := range maps { + decodedBytes, _ := base64.StdEncoding.DecodeString(kmer) + sequence := DecompressDNA(decodedBytes) + kmerSize = len(sequence) + break out + } + } + + // Now, iterate through each map + for _, sequenceMap := range *m { + var score float64 + var totalKmers int = len(sequenceMap) + var matchedKmers int + for i := 0; i <= len(sequence)-int(kmerSize); i++ { + kmerBytes := StandardizedCompressedDNA(sequence[i : i+int(kmerSize)]) + kmerBase64 := base64.StdEncoding.EncodeToString(kmerBytes) + _, ok := sequenceMap[kmerBase64] + if ok { + matchedKmers++ + } + } + if totalKmers == 0 { + score = 0 + } else { + score = float64(matchedKmers) / float64(totalKmers) + } + scores = append(scores, score) + } + return scores +} diff --git a/lib/align/megamash/megamash_test.go b/lib/align/megamash/megamash_test.go new file mode 100644 index 0000000..28ab815 --- /dev/null +++ b/lib/align/megamash/megamash_test.go @@ -0,0 +1,71 @@ +package megamash + +import ( + "testing" +) + +func TestCompressDNA(t *testing.T) { + // Define test cases + tests := []struct { + name string + dna string + expectedLen int // Expected length of the compressed data + expectedFlag byte // Expected flag byte + }{ + {"Empty", "", 2, 0x00}, + {"Short", "ATGC", 3, 0x00}, + {"Medium", "ATGCGTATGCCGTAGC", 6, 0x00}, + // Add more test cases for longer sequences and edge cases + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + compressed := CompressDNA(tc.dna) + if len(compressed) != tc.expectedLen { + t.Errorf("CompressDNA() with input %s, expected length %d, got %d", tc.dna, tc.expectedLen, len(compressed)) + } + if compressed[0] != tc.expectedFlag { + t.Errorf("CompressDNA() with input %s, expected flag %b, got %b", tc.dna, tc.expectedFlag, compressed[0]) + } + }) + } +} + +func TestDecompressDNA(t *testing.T) { + // Define test cases + tests := []struct { + name string + dna string + expected string + }{ + {"Empty", "", ""}, + {"Short", "ATGC", "ATGC"}, + {"Medium", "ATGCGTATGCCGTAGC", "ATGCGTATGCCGTAGC"}, + // Add more test cases as needed + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + compressed := CompressDNA(tc.dna) + decompressed := DecompressDNA(compressed) + if decompressed != tc.expected { + t.Errorf("DecompressDNA() with input %v, expected %s, got %s", compressed, tc.expected, decompressed) + } + }) + } +} + +func TestMegamash(t *testing.T) { + oligo1 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTCTCAATGACCAAACCAACGCAAGTCTTAGTTCGTTCAGTCTCTATTTTATTCTTCATCACACTGTTGCACTTGGTTGTTGCAATGAGATTTCCTAGTATTTTCACTGCTGTGCTGAGACCCGGATCGAACTTAGGTAGCCT" + oligo2 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCACTAGTCATAAT" + oligo3 := "CCGTGCGACAAGATTTCAAGGGTCTCTCTTCTATCGCAGCCAAGGAAGAAGGTGTATCTCTAGAGAAGCGTCGAGTGAGACCCGGATCGAACTTAGGTAGCCCCCTTCGAAGTGGCTCTGTCTGATCCTCCGCGGATGGCGACACCATCGGACTGAGGATATTGGCCACA" + + samples := []string{"TTTTGTCTACTTCGTTCCGTTGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGACGGCGCCTCCGTGCGACGAGATTTCAAGGGTCTCTGTGCTATATTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCAGATCGACTTTTAGATTCCTCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTGG", "TGTCCTTTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTACTTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTATCTGAGACCGAAGTGGTTTGCCTAAACGCAGGTGCTGTTGGCAAAGGCAGAAAGTAGTCTTAACCTTGACAATGAGTGGTA", "GTTATTGTCGTCTCCTTTGACTCAGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTGCTGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTCCGCTTCTATCTGAGACCGAAGTGGTTAT", "TGTTCTGTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTCTGCCTTAGAGACCACGCCTCCGTGCGACAAGATTCAAGGGTCTCTGTGCTCTGCCGCTAGTTCCGCTCTAGCTGCTCCGGTATGCATCTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAACTGTTGGTT"} + m := MakeMegamashMap([]string{oligo1, oligo2, oligo3}, 16) + for _, sample := range samples { + scores := m.Score(sample) + if scores[1] < 0.5 { + t.Errorf("Score for oligo2 should be above 0.5") + } + } +} From 9e2b323be253a0d958503c2e873ce2f0e401d6d8 Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Fri, 5 Jan 2024 14:22:24 -0800 Subject: [PATCH 2/3] make linter happy --- lib/align/megamash/compress.go | 2 +- lib/align/megamash/megamash.go | 11 +++++------ 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/lib/align/megamash/compress.go b/lib/align/megamash/compress.go index 34862d1..36a827e 100644 --- a/lib/align/megamash/compress.go +++ b/lib/align/megamash/compress.go @@ -53,7 +53,7 @@ func CompressDNA(dna string) []byte { func DecompressDNA(compressed []byte) string { flag := compressed[0] var length int - lengthBytes := 1 + var lengthBytes int switch flag { case 0x00: diff --git a/lib/align/megamash/megamash.go b/lib/align/megamash/megamash.go index 4a1a037..0d7f949 100644 --- a/lib/align/megamash/megamash.go +++ b/lib/align/megamash/megamash.go @@ -23,7 +23,6 @@ func StandardizedCompressedDNA(sequence string) []byte { deterministicSequence = sequence } return CompressDNA(deterministicSequence) - } type MegamashMap []map[string]bool @@ -41,7 +40,7 @@ func MakeMegamashMap(sequences []string, kmerSize uint) MegamashMap { // Then, get unique kmers for this sequence and only this sequence uniqueKmerMap := make(map[string]bool) - for kmerBase64, _ := range kmerMap { + for kmerBase64 := range kmerMap { unique := true for _, otherMegaMashMap := range megamashMap { _, ok := otherMegaMashMap[kmerBase64] @@ -87,7 +86,7 @@ func (m *MegamashMap) Score(sequence string) []float64 { var kmerSize int out: for _, maps := range *m { - for kmer, _ := range maps { + for kmer := range maps { decodedBytes, _ := base64.StdEncoding.DecodeString(kmer) sequence := DecompressDNA(decodedBytes) kmerSize = len(sequence) @@ -98,10 +97,10 @@ out: // Now, iterate through each map for _, sequenceMap := range *m { var score float64 - var totalKmers int = len(sequenceMap) + var totalKmers = len(sequenceMap) var matchedKmers int - for i := 0; i <= len(sequence)-int(kmerSize); i++ { - kmerBytes := StandardizedCompressedDNA(sequence[i : i+int(kmerSize)]) + for i := 0; i <= len(sequence)-kmerSize; i++ { + kmerBytes := StandardizedCompressedDNA(sequence[i : i+kmerSize]) kmerBase64 := base64.StdEncoding.EncodeToString(kmerBytes) _, ok := sequenceMap[kmerBase64] if ok { From 3cb55d2e6a1f8eee1fbb46dbdfc0ebe16719381d Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Fri, 5 Jan 2024 23:16:42 -0800 Subject: [PATCH 3/3] add megamash stuff --- lib/align/megamash/megamash.go | 19 ++++--------------- lib/align/megamash/megamash_test.go | 12 +++++++++++- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/lib/align/megamash/megamash.go b/lib/align/megamash/megamash.go index 0d7f949..8b28345 100644 --- a/lib/align/megamash/megamash.go +++ b/lib/align/megamash/megamash.go @@ -14,6 +14,7 @@ import ( "github.com/koeng101/dnadesign/lib/transform" ) +// StandardizedCompressedDNA returns the CompressedDNA byte string func StandardizedCompressedDNA(sequence string) []byte { var deterministicSequence string reverseComplement := transform.ReverseComplement(sequence) @@ -60,19 +61,7 @@ func MakeMegamashMap(sequences []string, kmerSize uint) MegamashMap { // Add it to megamashMap megamashMap = append(megamashMap, uniqueKmerMap) } - // Finally, go back through and make a final megamashMap without - // all those falses. - var finalMegamashMap MegamashMap - for _, singleMegamashMap := range megamashMap { - finalMap := make(map[string]bool) - for kmerBase64, value := range singleMegamashMap { - if value { - finalMap[kmerBase64] = true - } - } - finalMegamashMap = append(finalMegamashMap, finalMap) - } - return finalMegamashMap + return megamashMap } func (m *MegamashMap) Score(sequence string) []float64 { @@ -102,8 +91,8 @@ out: for i := 0; i <= len(sequence)-kmerSize; i++ { kmerBytes := StandardizedCompressedDNA(sequence[i : i+kmerSize]) kmerBase64 := base64.StdEncoding.EncodeToString(kmerBytes) - _, ok := sequenceMap[kmerBase64] - if ok { + unique, ok := sequenceMap[kmerBase64] + if ok && unique { matchedKmers++ } } diff --git a/lib/align/megamash/megamash_test.go b/lib/align/megamash/megamash_test.go index 28ab815..8cb2392 100644 --- a/lib/align/megamash/megamash_test.go +++ b/lib/align/megamash/megamash_test.go @@ -2,10 +2,14 @@ package megamash import ( "testing" + + "github.com/koeng101/dnadesign/lib/random" ) func TestCompressDNA(t *testing.T) { // Define test cases + longDna, _ := random.DNASequence(300, 0) + longerDna, _ := random.DNASequence(66000, 0) tests := []struct { name string dna string @@ -15,6 +19,8 @@ func TestCompressDNA(t *testing.T) { {"Empty", "", 2, 0x00}, {"Short", "ATGC", 3, 0x00}, {"Medium", "ATGCGTATGCCGTAGC", 6, 0x00}, + {"Long", longDna, 78, 0x01}, + {"Longest", longerDna, 16505, 0x02}, // Add more test cases for longer sequences and edge cases } @@ -22,7 +28,7 @@ func TestCompressDNA(t *testing.T) { t.Run(tc.name, func(t *testing.T) { compressed := CompressDNA(tc.dna) if len(compressed) != tc.expectedLen { - t.Errorf("CompressDNA() with input %s, expected length %d, got %d", tc.dna, tc.expectedLen, len(compressed)) + t.Errorf("CompressDNA() with input %s, expected length %d, got %d", "", tc.expectedLen, len(compressed)) } if compressed[0] != tc.expectedFlag { t.Errorf("CompressDNA() with input %s, expected flag %b, got %b", tc.dna, tc.expectedFlag, compressed[0]) @@ -32,6 +38,8 @@ func TestCompressDNA(t *testing.T) { } func TestDecompressDNA(t *testing.T) { + longDna, _ := random.DNASequence(300, 0) + longerDna, _ := random.DNASequence(66000, 0) // Define test cases tests := []struct { name string @@ -41,6 +49,8 @@ func TestDecompressDNA(t *testing.T) { {"Empty", "", ""}, {"Short", "ATGC", "ATGC"}, {"Medium", "ATGCGTATGCCGTAGC", "ATGCGTATGCCGTAGC"}, + {"Long", longDna, longDna}, + {"Longest", longerDna, longerDna}, // Add more test cases as needed }