Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add megamash #50

Merged
merged 3 commits into from
Jan 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
93 changes: 93 additions & 0 deletions lib/align/megamash/compress.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,93 @@
package megamash

import "math"

// CompressDNA takes a DNA sequence and converts it into a compressed byte slice.
func CompressDNA(dna string) []byte {
length := len(dna)
var lengthBytes []byte
var flag byte

// Determine the size of the length field based on the sequence length
switch {
case length <= math.MaxUint8:
lengthBytes = []byte{byte(length)}
flag = 0x00 // flag for uint8
case length <= math.MaxUint16:
lengthBytes = []byte{byte(length >> 8), byte(length)}
flag = 0x01 // flag for uint16
case length <= math.MaxUint32:
lengthBytes = []byte{byte(length >> 24), byte(length >> 16), byte(length >> 8), byte(length)}
flag = 0x02 // flag for uint32
default:
lengthBytes = []byte{byte(length >> 56), byte(length >> 48), byte(length >> 40), byte(length >> 32), byte(length >> 24), byte(length >> 16), byte(length >> 8), byte(length)}
flag = 0x03 // flag for uint64
}

// Encode DNA sequence
var dnaBytes []byte
var currentByte byte
for i, nucleotide := range dna {
switch nucleotide {
case 'A': // 00
case 'T': // 01
currentByte |= 1 << (6 - (i%4)*2)
case 'G': // 10
currentByte |= 2 << (6 - (i%4)*2)
case 'C': // 11
currentByte |= 3 << (6 - (i%4)*2)
}
if i%4 == 3 || i == length-1 {
dnaBytes = append(dnaBytes, currentByte)
currentByte = 0
}
}

// Combine all parts into the result
result := append([]byte{flag}, lengthBytes...)
result = append(result, dnaBytes...)
return result
}

// DecompressDNA takes a compressed byte slice and converts it back to the original DNA string.
func DecompressDNA(compressed []byte) string {
flag := compressed[0]
var length int
var lengthBytes int

switch flag {
case 0x00:
length = int(compressed[1])
lengthBytes = 1
case 0x01:
length = int(compressed[1])<<8 + int(compressed[2])
lengthBytes = 2
case 0x02:
length = int(compressed[1])<<24 + int(compressed[2])<<16 + int(compressed[3])<<8 + int(compressed[4])
lengthBytes = 4
case 0x03:
length = int(compressed[1])<<56 + int(compressed[2])<<48 + int(compressed[3])<<40 + int(compressed[4])<<32 + int(compressed[5])<<24 + int(compressed[6])<<16 + int(compressed[7])<<8 + int(compressed[8])
lengthBytes = 8
default:
return "Invalid flag"
}

// Decode DNA sequence
var dna string
for i, b := range compressed[1+lengthBytes:] {
for j := 0; j < 4 && 4*i+j < length; j++ {
switch (b >> (6 - j*2)) & 0x03 {
case 0:
dna += "A"
case 1:
dna += "T"
case 2:
dna += "G"
case 3:
dna += "C"
}
}
}

return dna
}
107 changes: 107 additions & 0 deletions lib/align/megamash/megamash.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,107 @@
/*
Package megamash is an implementation of the megamash algorithm.

Megamash is an algorithm developed by Keoni Gandall to find templates from
sequencing reactions. For example, you may have a pool of amplicons, and need
to get a count of how many times each amplicon shows up in a given sequencing
reaction.
*/
package megamash

import (
"encoding/base64"

"github.com/koeng101/dnadesign/lib/transform"
)

// StandardizedCompressedDNA returns the CompressedDNA byte string
func StandardizedCompressedDNA(sequence string) []byte {
var deterministicSequence string
reverseComplement := transform.ReverseComplement(sequence)
if sequence > reverseComplement {
deterministicSequence = reverseComplement
} else {
deterministicSequence = sequence
}
return CompressDNA(deterministicSequence)
}

type MegamashMap []map[string]bool

func MakeMegamashMap(sequences []string, kmerSize uint) MegamashMap {
var megamashMap MegamashMap
for _, sequence := range sequences {
// First get all kmers with a given sequence
kmerMap := make(map[string]bool)
for i := 0; i <= len(sequence)-int(kmerSize); i++ {
kmerBytes := StandardizedCompressedDNA(sequence[i : i+int(kmerSize)])
kmerBase64 := base64.StdEncoding.EncodeToString(kmerBytes)
kmerMap[kmerBase64] = true
}

// Then, get unique kmers for this sequence and only this sequence
uniqueKmerMap := make(map[string]bool)
for kmerBase64 := range kmerMap {
unique := true
for _, otherMegaMashMap := range megamashMap {
_, ok := otherMegaMashMap[kmerBase64]
// If this kmer is found, set both to false
if ok {
otherMegaMashMap[kmerBase64] = false
unique = false
break
}
}
if unique {
uniqueKmerMap[kmerBase64] = true
}
}

// Now we have a unique Kmer map for the given sequence.
// Add it to megamashMap
megamashMap = append(megamashMap, uniqueKmerMap)
}
return megamashMap
}

func (m *MegamashMap) Score(sequence string) []float64 {
var scores []float64
// The algorithm is as follows:
// - Go through each map.
// - Get the number of matching kmers
// - Divide that by the total kmers available for matching

// First, get the kmer total
var kmerSize int
out:
for _, maps := range *m {
for kmer := range maps {
decodedBytes, _ := base64.StdEncoding.DecodeString(kmer)
sequence := DecompressDNA(decodedBytes)
kmerSize = len(sequence)
break out
}
}

// Now, iterate through each map
for _, sequenceMap := range *m {
var score float64
var totalKmers = len(sequenceMap)
var matchedKmers int
for i := 0; i <= len(sequence)-kmerSize; i++ {
kmerBytes := StandardizedCompressedDNA(sequence[i : i+kmerSize])
kmerBase64 := base64.StdEncoding.EncodeToString(kmerBytes)
unique, ok := sequenceMap[kmerBase64]
if ok && unique {
matchedKmers++
}
}
if totalKmers == 0 {
score = 0
} else {
score = float64(matchedKmers) / float64(totalKmers)
}
scores = append(scores, score)
}
return scores
}
81 changes: 81 additions & 0 deletions lib/align/megamash/megamash_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,81 @@
package megamash

import (
"testing"

"github.com/koeng101/dnadesign/lib/random"
)

func TestCompressDNA(t *testing.T) {
// Define test cases
longDna, _ := random.DNASequence(300, 0)
longerDna, _ := random.DNASequence(66000, 0)
tests := []struct {
name string
dna string
expectedLen int // Expected length of the compressed data
expectedFlag byte // Expected flag byte
}{
{"Empty", "", 2, 0x00},
{"Short", "ATGC", 3, 0x00},
{"Medium", "ATGCGTATGCCGTAGC", 6, 0x00},
{"Long", longDna, 78, 0x01},
{"Longest", longerDna, 16505, 0x02},
// Add more test cases for longer sequences and edge cases
}

for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
compressed := CompressDNA(tc.dna)
if len(compressed) != tc.expectedLen {
t.Errorf("CompressDNA() with input %s, expected length %d, got %d", "", tc.expectedLen, len(compressed))
}
if compressed[0] != tc.expectedFlag {
t.Errorf("CompressDNA() with input %s, expected flag %b, got %b", tc.dna, tc.expectedFlag, compressed[0])
}
})
}
}

func TestDecompressDNA(t *testing.T) {
longDna, _ := random.DNASequence(300, 0)
longerDna, _ := random.DNASequence(66000, 0)
// Define test cases
tests := []struct {
name string
dna string
expected string
}{
{"Empty", "", ""},
{"Short", "ATGC", "ATGC"},
{"Medium", "ATGCGTATGCCGTAGC", "ATGCGTATGCCGTAGC"},
{"Long", longDna, longDna},
{"Longest", longerDna, longerDna},
// Add more test cases as needed
}

for _, tc := range tests {
t.Run(tc.name, func(t *testing.T) {
compressed := CompressDNA(tc.dna)
decompressed := DecompressDNA(compressed)
if decompressed != tc.expected {
t.Errorf("DecompressDNA() with input %v, expected %s, got %s", compressed, tc.expected, decompressed)
}
})
}
}

func TestMegamash(t *testing.T) {
oligo1 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTCTCAATGACCAAACCAACGCAAGTCTTAGTTCGTTCAGTCTCTATTTTATTCTTCATCACACTGTTGCACTTGGTTGTTGCAATGAGATTTCCTAGTATTTTCACTGCTGTGCTGAGACCCGGATCGAACTTAGGTAGCCT"
oligo2 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCACTAGTCATAAT"
oligo3 := "CCGTGCGACAAGATTTCAAGGGTCTCTCTTCTATCGCAGCCAAGGAAGAAGGTGTATCTCTAGAGAAGCGTCGAGTGAGACCCGGATCGAACTTAGGTAGCCCCCTTCGAAGTGGCTCTGTCTGATCCTCCGCGGATGGCGACACCATCGGACTGAGGATATTGGCCACA"

samples := []string{"TTTTGTCTACTTCGTTCCGTTGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGACGGCGCCTCCGTGCGACGAGATTTCAAGGGTCTCTGTGCTATATTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCAGATCGACTTTTAGATTCCTCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTGG", "TGTCCTTTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTACTTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTATCTGAGACCGAAGTGGTTTGCCTAAACGCAGGTGCTGTTGGCAAAGGCAGAAAGTAGTCTTAACCTTGACAATGAGTGGTA", "GTTATTGTCGTCTCCTTTGACTCAGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTGCTGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTCCGCTTCTATCTGAGACCGAAGTGGTTAT", "TGTTCTGTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTCTGCCTTAGAGACCACGCCTCCGTGCGACAAGATTCAAGGGTCTCTGTGCTCTGCCGCTAGTTCCGCTCTAGCTGCTCCGGTATGCATCTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAACTGTTGGTT"}
m := MakeMegamashMap([]string{oligo1, oligo2, oligo3}, 16)
for _, sample := range samples {
scores := m.Score(sample)
if scores[1] < 0.5 {
t.Errorf("Score for oligo2 should be above 0.5")
}
}
}
Loading