Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

add sequencing functions #49

Merged
merged 10 commits into from
Jan 19, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -71,6 +71,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).

## [Unreleased]
- Added the megamash algorithm [#50](https://github.com/Koeng101/dnadesign/pull/50)
- Changed parsers to return values instead of pointers. Added some sequencing utils [#49](https://github.com/Koeng101/dnadesign/pull/49)
- Added minimap2 and samtools(pileup) integrations in external [#46](https://github.com/Koeng101/dnadesign/pull/46)
- Added sam parser [#5](https://github.com/Koeng101/dnadesign/pull/5)
- Added the LinearFold folding algorithms [#38](https://github.com/Koeng101/dnadesign/pull/38)
Expand Down
57 changes: 56 additions & 1 deletion external/minimap2/minimap2.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,9 +20,15 @@ For more information on minimap2, please visit Heng Li's git: https://github.com
package minimap2

import (
"context"
"io"
"os"
"os/exec"

"github.com/koeng101/dnadesign/lib/bio"
"github.com/koeng101/dnadesign/lib/bio/fastq"
"github.com/koeng101/dnadesign/lib/bio/sam"
"golang.org/x/sync/errgroup"
)

// Minimap2 aligns sequences using minimap2 over the command line. Right
Expand Down Expand Up @@ -65,7 +71,7 @@ func Minimap2(templateFastaInput io.Reader, fastqInput io.Reader, w io.Writer) e
tmpFile.Close() // Close the file as it's no longer needed

// Start minimap2 pointing to the temporary file and stdin for sequencing data
cmd := exec.Command("minimap2", "-ax", "map-ont", tmpFile.Name(), "-")
cmd := exec.Command("minimap2", "-K", "100", "-ax", "map-ont", tmpFile.Name(), "-")
cmd.Stdout = w
cmd.Stdin = fastqInput
if err := cmd.Start(); err != nil {
Expand All @@ -74,3 +80,52 @@ func Minimap2(templateFastaInput io.Reader, fastqInput io.Reader, w io.Writer) e

return cmd.Wait()
}

// Minimap2Channeled uses channels rather than io.Reader and io.Writers.
func Minimap2Channeled(fastaTemplates io.Reader, fastqChan <-chan fastq.Read, samChan chan<- sam.Alignment) error {
ctx := context.Background()
g, ctx := errgroup.WithContext(ctx)

// Create a pipe for writing fastq reads and reading them as an io.Reader
fastqPr, fastqPw := io.Pipe()

// Goroutine to consume fastq reads and write them to the PipeWriter
g.Go(func() error {
defer fastqPw.Close()
for read := range fastqChan {
_, err := read.WriteTo(fastqPw)
if err != nil {
return err // return error to be handled by errgroup
}
}
return nil
})

// Create a pipe for SAM alignments.
samPr, samPw := io.Pipe()

// Use Minimap2 function to process the reads and write SAM alignments.
g.Go(func() error {
defer samPw.Close()
return Minimap2(fastaTemplates, fastqPr, samPw) // Minimap2 writes to samPw
})

// Create a SAM parser from samPr (the PipeReader connected to Minimap2 output).
samParser, err := bio.NewSamParser(samPr)
if err != nil {
return err
}

// Parsing SAM and sending to channel.
g.Go(func() error {
return samParser.ParseToChannel(ctx, samChan, false)
})

// Wait for all goroutines in the group to finish.
if err := g.Wait(); err != nil {
return err // This will return the first non-nil error from the group of goroutines
}

// At this point, all goroutines have finished successfully
return nil
}
170 changes: 170 additions & 0 deletions lib/align/megamash/megamash.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
/*
Package megamash is an implementation of the megamash algorithm.

Megamash is an algorithm developed by Keoni Gandall to find templates from
sequencing reactions. For example, you may have a pool of amplicons, and need
to get a count of how many times each amplicon shows up in a given sequencing
reaction.
*/
package megamash

import (
"context"
"fmt"

"github.com/koeng101/dnadesign/lib/bio/fasta"
"github.com/koeng101/dnadesign/lib/bio/fastq"
"github.com/koeng101/dnadesign/lib/transform"
)

// StandardizedDNA returns the alphabetically lesser strand of a double
// stranded DNA molecule.
func StandardizedDNA(sequence string) string {
var deterministicSequence string
reverseComplement := transform.ReverseComplement(sequence)
if sequence > reverseComplement {
deterministicSequence = reverseComplement
} else {
deterministicSequence = sequence
}
return deterministicSequence
}

var (
DefaultKmerSize uint = 16
DefaultMinimalKmerCount uint = 10
DefaultScoreThreshold float64 = 0.2
)

type MegamashMap struct {
Identifiers []string
Kmers []map[string]bool
KmerSize uint
KmerMinimalCount uint
Threshold float64
}

// NewMegamashMap creates a megamash map that can be searched against.
func NewMegamashMap(sequences []fasta.Record, kmerSize uint, kmerMinimalCount uint, threshold float64) (MegamashMap, error) {
var megamashMap MegamashMap
megamashMap.KmerSize = kmerSize
megamashMap.KmerMinimalCount = kmerMinimalCount
megamashMap.Threshold = threshold

for _, fastaRecord := range sequences {
megamashMap.Identifiers = append(megamashMap.Identifiers, fastaRecord.Identifier)
sequence := fastaRecord.Sequence

// First get all kmers with a given sequence
kmerMap := make(map[string]bool)
for i := 0; i <= len(sequence)-int(kmerSize); i++ {
kmerString := StandardizedDNA(sequence[i : i+int(kmerSize)])
kmerMap[kmerString] = true
}

// Then, get unique kmers for this sequence and only this sequence
uniqueKmerMap := make(map[string]bool)
for kmerBase64 := range kmerMap {
unique := true
for _, otherMegaMashMap := range megamashMap.Kmers {
_, ok := otherMegaMashMap[kmerBase64]
// If this kmer is found, set both to false
if ok {
otherMegaMashMap[kmerBase64] = false
unique = false
break
}
}
if unique {
uniqueKmerMap[kmerBase64] = true
}
}
// Check if we have the minimal kmerCount
var kmerCount uint = 0
for _, unique := range uniqueKmerMap {
if unique {
kmerCount++
}
}
if kmerCount < kmerMinimalCount {
return megamashMap, fmt.Errorf("Got only %d unique kmers of required %d for sequence %s", kmerCount, kmerMinimalCount, fastaRecord.Identifier)
}

// Now we have a unique Kmer map for the given sequence.
// Add it to megamashMap
megamashMap.Kmers = append(megamashMap.Kmers, uniqueKmerMap)
}
return megamashMap, nil
}

// Match contains the identifier and score of a potential match to the searched
// sequence.
type Match struct {
Identifier string
Score float64
}

// Match matches a sequence to all the sequences in a megamash map.
func (m *MegamashMap) Match(sequence string) []Match {
var scores []float64
// The algorithm is as follows:
// - Go through each map.
// - Get the number of matching kmers
// - Divide that by the total kmers available for matching

// First, get the kmer total
var kmerSize int
out:
for _, maps := range m.Kmers {
for kmer := range maps {
kmerSize = len(kmer)
break out
}
}

// Now, iterate through each map
for _, sequenceMap := range m.Kmers {
var score float64
var totalKmers = len(sequenceMap)
var matchedKmers int
for i := 0; i <= len(sequence)-kmerSize; i++ {
kmerString := StandardizedDNA(sequence[i : i+kmerSize])
unique, ok := sequenceMap[kmerString]
if ok && unique {
matchedKmers++
}
}
if totalKmers == 0 {
score = 0
} else {
score = float64(matchedKmers) / float64(totalKmers)
}
scores = append(scores, score)
}

var matches []Match
for i, score := range scores {
if score > m.Threshold {
matches = append(matches, Match{Identifier: m.Identifiers[i], Score: score})
}
}
return matches
}

// FastqMatchChannel processes a channel of fastq.Read and pushes to a channel of matches.
func (m *MegamashMap) FastqMatchChannel(ctx context.Context, sequences <-chan fastq.Read, matches chan<- []Match) error {
for {
select {
case <-ctx.Done():
// Clean up resources, handle cancellation.
return ctx.Err()
case sequence, ok := <-sequences:
if !ok {
close(matches)
return nil
}
sequenceMatches := m.Match(sequence.Sequence)
matches <- sequenceMatches
}
}
}
39 changes: 39 additions & 0 deletions lib/align/megamash/megamash_test.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,39 @@
package megamash

import (
"testing"

"github.com/koeng101/dnadesign/lib/bio/fasta"
)

func TestMegamash(t *testing.T) {
oligo1 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTCTCAATGACCAAACCAACGCAAGTCTTAGTTCGTTCAGTCTCTATTTTATTCTTCATCACACTGTTGCACTTGGTTGTTGCAATGAGATTTCCTAGTATTTTCACTGCTGTGCTGAGACCCGGATCGAACTTAGGTAGCCT"
oligo2 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCACTAGTCATAAT"
oligo3 := "CCGTGCGACAAGATTTCAAGGGTCTCTCTTCTATCGCAGCCAAGGAAGAAGGTGTATCTCTAGAGAAGCGTCGAGTGAGACCCGGATCGAACTTAGGTAGCCCCCTTCGAAGTGGCTCTGTCTGATCCTCCGCGGATGGCGACACCATCGGACTGAGGATATTGGCCACA"

samples := []string{"TTTTGTCTACTTCGTTCCGTTGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGACGGCGCCTCCGTGCGACGAGATTTCAAGGGTCTCTGTGCTATATTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCAGATCGACTTTTAGATTCCTCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTGG", "TGTCCTTTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTACTTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTATCTGAGACCGAAGTGGTTTGCCTAAACGCAGGTGCTGTTGGCAAAGGCAGAAAGTAGTCTTAACCTTGACAATGAGTGGTA", "GTTATTGTCGTCTCCTTTGACTCAGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTGCTGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTCCGCTTCTATCTGAGACCGAAGTGGTTAT", "TGTTCTGTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTCTGCCTTAGAGACCACGCCTCCGTGCGACAAGATTCAAGGGTCTCTGTGCTCTGCCGCTAGTTCCGCTCTAGCTGCTCCGGTATGCATCTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAACTGTTGGTT"}
m, err := NewMegamashMap([]fasta.Record{{Sequence: oligo1, Identifier: "oligo1"}, {Sequence: oligo2, Identifier: "oligo2"}, {Sequence: oligo3, Identifier: "oligo3"}}, DefaultKmerSize, DefaultMinimalKmerCount, DefaultScoreThreshold)
if err != nil {
t.Errorf("Failed to make NewMegamashMap: %s", err)
}
for _, sample := range samples {
scores := m.Match(sample)
if scores[0].Identifier != "oligo2" {
t.Errorf("Should have gotten oligo2. Got: %s", scores[0].Identifier)
}
}
}

func BenchmarkMegamash(b *testing.B) {
for i := 0; i < b.N; i++ {
oligo1 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTCTCAATGACCAAACCAACGCAAGTCTTAGTTCGTTCAGTCTCTATTTTATTCTTCATCACACTGTTGCACTTGGTTGTTGCAATGAGATTTCCTAGTATTTTCACTGCTGTGCTGAGACCCGGATCGAACTTAGGTAGCCT"
oligo2 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCACTAGTCATAAT"
oligo3 := "CCGTGCGACAAGATTTCAAGGGTCTCTCTTCTATCGCAGCCAAGGAAGAAGGTGTATCTCTAGAGAAGCGTCGAGTGAGACCCGGATCGAACTTAGGTAGCCCCCTTCGAAGTGGCTCTGTCTGATCCTCCGCGGATGGCGACACCATCGGACTGAGGATATTGGCCACA"

samples := []string{"TTTTGTCTACTTCGTTCCGTTGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGACGGCGCCTCCGTGCGACGAGATTTCAAGGGTCTCTGTGCTATATTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCAGATCGACTTTTAGATTCCTCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTGG", "TGTCCTTTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTACTTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTATCTGAGACCGAAGTGGTTTGCCTAAACGCAGGTGCTGTTGGCAAAGGCAGAAAGTAGTCTTAACCTTGACAATGAGTGGTA", "GTTATTGTCGTCTCCTTTGACTCAGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTGCTGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTCCGCTTCTATCTGAGACCGAAGTGGTTAT", "TGTTCTGTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTCTGCCTTAGAGACCACGCCTCCGTGCGACAAGATTCAAGGGTCTCTGTGCTCTGCCGCTAGTTCCGCTCTAGCTGCTCCGGTATGCATCTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAACTGTTGGTT"}
m, _ := NewMegamashMap([]fasta.Record{{Sequence: oligo1, Identifier: "oligo1"}, {Sequence: oligo2, Identifier: "oligo2"}, {Sequence: oligo3, Identifier: "oligo3"}}, DefaultKmerSize, DefaultMinimalKmerCount, DefaultScoreThreshold)
for _, sample := range samples {
_ = m.Match(sample)
}
}
}
Loading
Loading