From 3cb55d2e6a1f8eee1fbb46dbdfc0ebe16719381d Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Fri, 5 Jan 2024 23:16:42 -0800 Subject: [PATCH] add megamash stuff --- lib/align/megamash/megamash.go | 19 ++++--------------- lib/align/megamash/megamash_test.go | 12 +++++++++++- 2 files changed, 15 insertions(+), 16 deletions(-) diff --git a/lib/align/megamash/megamash.go b/lib/align/megamash/megamash.go index 0d7f949..8b28345 100644 --- a/lib/align/megamash/megamash.go +++ b/lib/align/megamash/megamash.go @@ -14,6 +14,7 @@ import ( "github.com/koeng101/dnadesign/lib/transform" ) +// StandardizedCompressedDNA returns the CompressedDNA byte string func StandardizedCompressedDNA(sequence string) []byte { var deterministicSequence string reverseComplement := transform.ReverseComplement(sequence) @@ -60,19 +61,7 @@ func MakeMegamashMap(sequences []string, kmerSize uint) MegamashMap { // Add it to megamashMap megamashMap = append(megamashMap, uniqueKmerMap) } - // Finally, go back through and make a final megamashMap without - // all those falses. - var finalMegamashMap MegamashMap - for _, singleMegamashMap := range megamashMap { - finalMap := make(map[string]bool) - for kmerBase64, value := range singleMegamashMap { - if value { - finalMap[kmerBase64] = true - } - } - finalMegamashMap = append(finalMegamashMap, finalMap) - } - return finalMegamashMap + return megamashMap } func (m *MegamashMap) Score(sequence string) []float64 { @@ -102,8 +91,8 @@ out: for i := 0; i <= len(sequence)-kmerSize; i++ { kmerBytes := StandardizedCompressedDNA(sequence[i : i+kmerSize]) kmerBase64 := base64.StdEncoding.EncodeToString(kmerBytes) - _, ok := sequenceMap[kmerBase64] - if ok { + unique, ok := sequenceMap[kmerBase64] + if ok && unique { matchedKmers++ } } diff --git a/lib/align/megamash/megamash_test.go b/lib/align/megamash/megamash_test.go index 28ab815..8cb2392 100644 --- a/lib/align/megamash/megamash_test.go +++ b/lib/align/megamash/megamash_test.go @@ -2,10 +2,14 @@ package megamash import ( "testing" + + "github.com/koeng101/dnadesign/lib/random" ) func TestCompressDNA(t *testing.T) { // Define test cases + longDna, _ := random.DNASequence(300, 0) + longerDna, _ := random.DNASequence(66000, 0) tests := []struct { name string dna string @@ -15,6 +19,8 @@ func TestCompressDNA(t *testing.T) { {"Empty", "", 2, 0x00}, {"Short", "ATGC", 3, 0x00}, {"Medium", "ATGCGTATGCCGTAGC", 6, 0x00}, + {"Long", longDna, 78, 0x01}, + {"Longest", longerDna, 16505, 0x02}, // Add more test cases for longer sequences and edge cases } @@ -22,7 +28,7 @@ func TestCompressDNA(t *testing.T) { t.Run(tc.name, func(t *testing.T) { compressed := CompressDNA(tc.dna) if len(compressed) != tc.expectedLen { - t.Errorf("CompressDNA() with input %s, expected length %d, got %d", tc.dna, tc.expectedLen, len(compressed)) + t.Errorf("CompressDNA() with input %s, expected length %d, got %d", "", tc.expectedLen, len(compressed)) } if compressed[0] != tc.expectedFlag { t.Errorf("CompressDNA() with input %s, expected flag %b, got %b", tc.dna, tc.expectedFlag, compressed[0]) @@ -32,6 +38,8 @@ func TestCompressDNA(t *testing.T) { } func TestDecompressDNA(t *testing.T) { + longDna, _ := random.DNASequence(300, 0) + longerDna, _ := random.DNASequence(66000, 0) // Define test cases tests := []struct { name string @@ -41,6 +49,8 @@ func TestDecompressDNA(t *testing.T) { {"Empty", "", ""}, {"Short", "ATGC", "ATGC"}, {"Medium", "ATGCGTATGCCGTAGC", "ATGCGTATGCCGTAGC"}, + {"Long", longDna, longDna}, + {"Longest", longerDna, longerDna}, // Add more test cases as needed }