diff --git a/CHANGELOG.md b/CHANGELOG.md index 7b564df7b..b5337e311 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,10 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0 - Alternative start codons can now be used in the `synthesis/codon` DNA -> protein translation package (#305) - Added a parser and writer for the `pileup` sequence alignment format (#329) - Added statistics to the `synthesis/codon` package (keeping track of the observed start codon occurrences in a translation table) (#350) +- Added `FragmentWithOverhangs` to the `synthesis/fragment` package, which fragments a sequence using only a caller-supplied set of overhangs (#387) + + + ### Fixed - `fastq` parser no longer becomes de-aligned when reading (#325) diff --git a/synthesis/fragment/fragment.go b/synthesis/fragment/fragment.go index 23a59b275..ab01d06ec 100644 --- a/synthesis/fragment/fragment.go +++ b/synthesis/fragment/fragment.go @@ -98,11 +98,11 @@ func NextOverhang(currentOverhangs []string) string { } // optimizeOverhangIteration takes in a sequence and optimally fragments it. -func optimizeOverhangIteration(sequence string, minFragmentSize int, maxFragmentSize int, existingFragments []string, existingOverhangs []string) ([]string, float64, error) { +func optimizeOverhangIteration(sequence string, minFragmentSize int, maxFragmentSize int, existingFragments []string, excludeOverhangs []string, includeOverhangs []string) ([]string, float64, error) { // If the sequence is smaller than maxFragment size, stop iteration. 
if len(sequence) < maxFragmentSize { existingFragments = append(existingFragments, sequence) - return existingFragments, SetEfficiency(existingOverhangs), nil + return existingFragments, SetEfficiency(excludeOverhangs), nil } // Make sure minFragmentSize > maxFragmentSize @@ -136,6 +136,7 @@ func optimizeOverhangIteration(sequence string, minFragmentSize int, maxFragment var bestOverhangEfficiency float64 var bestOverhangPosition int var alreadyExists bool + var buildAvailable bool for overhangOffset := 0; overhangOffset <= maxFragmentSize-minFragmentSize; overhangOffset++ { // We go from max -> min, so we can maximize the size of our fragments overhangPosition := maxFragmentSize - overhangOffset @@ -143,16 +144,27 @@ func optimizeOverhangIteration(sequence string, minFragmentSize int, maxFragment // Make sure overhang isn't already in set alreadyExists = false - for _, existingOverhang := range existingOverhangs { - if existingOverhang == overhangToTest || transform.ReverseComplement(existingOverhang) == overhangToTest { + for _, excludeOverhang := range excludeOverhangs { + if excludeOverhang == overhangToTest || transform.ReverseComplement(excludeOverhang) == overhangToTest { alreadyExists = true } } - if !alreadyExists { + // Make sure overhang is in set of includeOverhangs. If includeOverhangs is + // blank, skip this check. 
+ buildAvailable = false + if len(includeOverhangs) == 0 { + buildAvailable = true + } + for _, includeOverhang := range includeOverhangs { + if includeOverhang == overhangToTest || transform.ReverseComplement(includeOverhang) == overhangToTest { + buildAvailable = true + } + } + if !alreadyExists && buildAvailable { // See if this overhang is a palindrome if !checks.IsPalindromic(overhangToTest) { // Get this overhang set's efficiency - setEfficiency := SetEfficiency(append(existingOverhangs, overhangToTest)) + setEfficiency := SetEfficiency(append(excludeOverhangs, overhangToTest)) // If this overhang is more efficient than any other found so far, set it as the best! if setEfficiency > bestOverhangEfficiency { @@ -167,16 +179,24 @@ func optimizeOverhangIteration(sequence string, minFragmentSize int, maxFragment return []string{}, float64(0), fmt.Errorf("bestOverhangPosition failed by equaling zero") } existingFragments = append(existingFragments, sequence[:bestOverhangPosition]) - existingOverhangs = append(existingOverhangs, sequence[bestOverhangPosition-4:bestOverhangPosition]) + excludeOverhangs = append(excludeOverhangs, sequence[bestOverhangPosition-4:bestOverhangPosition]) sequence = sequence[bestOverhangPosition-4:] - return optimizeOverhangIteration(sequence, minFragmentSize, maxFragmentSize, existingFragments, existingOverhangs) + return optimizeOverhangIteration(sequence, minFragmentSize, maxFragmentSize, existingFragments, excludeOverhangs, includeOverhangs) } // Fragment fragments a sequence into fragments between the min and max size, // choosing fragment ends for optimal assembly efficiency. Since fragments will // be inserted into either a vector or primer binding sites, the first 4 and // last 4 base pairs are the initial overhang set. 
-func Fragment(sequence string, minFragmentSize int, maxFragmentSize int, existingOverhangs []string) ([]string, float64, error) { +func Fragment(sequence string, minFragmentSize int, maxFragmentSize int, excludeOverhangs []string) ([]string, float64, error) { + sequence = strings.ToUpper(sequence) + return optimizeOverhangIteration(sequence, minFragmentSize, maxFragmentSize, []string{}, append([]string{sequence[:4], sequence[len(sequence)-4:]}, excludeOverhangs...), []string{}) +} + +// FragmentWithOverhangs fragments a sequence like Fragment, but only cuts at overhangs that are +// in includeOverhangs, directly or as a reverse complement; an empty includeOverhangs allows any overhang. +// Useful if you are constraining the set of possible overhangs when doing more advanced forms of cloning. +func FragmentWithOverhangs(sequence string, minFragmentSize int, maxFragmentSize int, excludeOverhangs []string, includeOverhangs []string) ([]string, float64, error) { sequence = strings.ToUpper(sequence) - return optimizeOverhangIteration(sequence, minFragmentSize, maxFragmentSize, []string{}, append([]string{sequence[:4], sequence[len(sequence)-4:]}, existingOverhangs...)) + return optimizeOverhangIteration(sequence, minFragmentSize, maxFragmentSize, []string{}, append([]string{sequence[:4], sequence[len(sequence)-4:]}, excludeOverhangs...), includeOverhangs) } diff --git a/synthesis/fragment/fragment_test.go b/synthesis/fragment/fragment_test.go index 0a71bfde9..ab3f0c153 100644 --- a/synthesis/fragment/fragment_test.go +++ b/synthesis/fragment/fragment_test.go @@ -85,3 +85,13 @@ func TestRegressionTestMatching12(t *testing.T) { t.Errorf("Expected efficiency of .99 - approximately matches NEB ligase fidelity viewer of .97. 
Got: %g", efficiency) } } + +func TestFragmentWithOverhangs(t *testing.T) { + defaultOverhangs := []string{"CGAG", "GTCT", "GGGG", "AAAA", "AACT", "AATG", "ATCC", "CGCT", "TTCT", "AAGC", "ATAG", "ATTA", "ATGT", "ACTC", "ACGA", "TATC", "TAGG", "TACA", "TTAC", "TTGA", "TGGA", "GAAG", "GACC", "GCCG", "TCTG", "GTTG", "GTGC", "TGCC", "CTGG", "TAAA", "TGAG", "AAGA", "AGGT", "TTCG", "ACTA", "TTAG", "TCTC", "TCGG", "ATAA", "ATCA", "TTGC", "CACG", "AATA", "ACAA", "ATGG", "TATG", "AAAT", "TCAC"} + gene := "atgaaaaaatttaactggaagaaaatagtcgcgccaattgcaatgctaattattggcttactaggtggtttacttggtgcctttatcctactaacagcagccggggtatcttttaccaatacaacagatactggagtaaaaacggctaagaccgtctacaccaatataacagatacaactaaggctgttaagaaagtacaaaatgccgttgtttctgtcatcaattatcaagaaggttcatcttcagattctctaaatgacctttatggccgtatctttggcggaggggacagttctgattctagccaagaaaattcaaaagattcagatggtctacaggtcgctggtgaaggttctggagtcatctataaaaaagatggcaaagaagcctacatcgtaaccaataaccatgttgtcgatggggctaaaaaacttgaaatcatgctttcggatggttcgaaaattactggtgaacttgttggtaaagacacttactctgacctagcagttgtcaaagtatcttcagataaaataacaactgttgcagaatttgcagactcaaactcccttactgttggtgaaaaagcaattgctatcggtagcccacttggtaccgaatacgccaactcagtaacagaaggaatcgtttctagccttagccgtactataacgatgcaaaacgataatggtgaaactgtatcaacaaacgctatccaaacagatgcagccattaaccctggtaactctggtggtgccctagtcaatattgaaggacaagttatcggtattaattcaagtaaaatttcatcaacgtctgcagtcgctggtagtgctgttgaaggtatggggtttgccattccatcaaacgatgttgttgaaatcatcaatcaattagaaaaagatggtaaagttacacgaccagcactaggaatctcaatagcagatcttaatagcctttctagcagcgcaacttctaaattagatttaccagatgaggtcaaatccggtgttgttgtcggtagtgttcagaaaggtatgccagctgacggtaaacttcaagaatatgatgttatcactgagattgatggtaagaaaatcagctcaaaaactgatattcaaaccaatctttacagccatagtatcggagatactatcaaggtaaccttctatcgtggtaaagataagaaaactgtagatcttaaattaacaaaatctacagaagacatatctgattaa" + + _, _, err := FragmentWithOverhangs(gene, 90, 110, []string{}, defaultOverhangs) + if err != nil { + t.Errorf(err.Error()) + } +}