From 3bf4bbb32f2e13436c05d388390243efa9b73213 Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Tue, 2 Jan 2024 12:37:02 -0800 Subject: [PATCH 1/9] Add sequencing --- lib/bio/bio.go | 19 ++ lib/bio/example_test.go | 23 +++ lib/bio/sam/sam.go | 16 ++ lib/bio/sam/sam_test.go | 30 +++ lib/sequencing/data/reads.fastq | 76 ++++++++ lib/sequencing/data/templates.fasta | 6 + lib/sequencing/example_test.go | 121 ++++++++++++ lib/sequencing/nanopore/example_test.go | 30 +++ lib/sequencing/nanopore/nanopore.go | 245 ++++++++++++++++++++++++ lib/sequencing/sequencing.go | 4 + 10 files changed, 570 insertions(+) create mode 100644 lib/sequencing/data/reads.fastq create mode 100644 lib/sequencing/data/templates.fasta create mode 100644 lib/sequencing/example_test.go create mode 100644 lib/sequencing/nanopore/example_test.go create mode 100644 lib/sequencing/nanopore/nanopore.go create mode 100644 lib/sequencing/sequencing.go diff --git a/lib/bio/bio.go b/lib/bio/bio.go index 8a6cb7a..94b6d04 100644 --- a/lib/bio/bio.go +++ b/lib/bio/bio.go @@ -295,3 +295,22 @@ func ManyToChannel[Data io.WriterTo, Header io.WriterTo](ctx context.Context, ch close(channel) return err } + +// FilterData is a generic function that implements a channel filter. Users +// give an input and output channel, with a filtering function, and FilterData +// filters data from the input into the output. 
+func FilterData[Data *genbank.Genbank | *fasta.Record | *fastq.Read | *slow5.Read | *sam.Alignment | *pileup.Line | *uniprot.Entry](input <-chan Data, output chan<- Data, filter func(a Data) bool) { + for { + select { + case data, ok := <-input: + if !ok { + // If the input channel is closed, we close the output channel and return + close(output) + return + } + if filter(data) { + output <- data + } + } + } +} diff --git a/lib/bio/example_test.go b/lib/bio/example_test.go index a93e79c..eb6cecf 100644 --- a/lib/bio/example_test.go +++ b/lib/bio/example_test.go @@ -10,6 +10,7 @@ import ( "github.com/koeng101/dnadesign/lib/bio" "github.com/koeng101/dnadesign/lib/bio/fasta" + "github.com/koeng101/dnadesign/lib/bio/sam" ) // Example_read shows an example of reading a file from disk. @@ -399,3 +400,25 @@ ae9a66f5-bf71-4572-8106-f6f8dbd3b799 16 pOpen_V3_amplified 1 60 8S54M1D3M1D108M1 fmt.Println(records[0].CIGAR) // Output: 8S54M1D3M1D108M1D1M1D62M226S } + +func ExampleFilterData() { + // Create channels for input and output + inputChan := make(chan *sam.Alignment, 2) // Buffered channel to prevent blocking + outputChan := make(chan *sam.Alignment) + + var results []sam.Alignment + go bio.FilterData(inputChan, outputChan, func(data *sam.Alignment) bool { return (data.FLAG & 0x900) == 0 }) + + // Send some example Alignments to the input channel + inputChan <- &sam.Alignment{FLAG: 0x900} // Not primary, should not be outputted + inputChan <- &sam.Alignment{SEQ: "FAKE", FLAG: 0x000} // Primary, should be outputted + close(inputChan) // Close the input channel to signal no more data + + // Collect results from the output channel + for alignment := range outputChan { + results = append(results, *alignment) + } + + fmt.Println(results) + // Output: [{ 0 0 0 0 0 FAKE []}] +} diff --git a/lib/bio/sam/sam.go b/lib/bio/sam/sam.go index 6cbf6b0..966d5bd 100644 --- a/lib/bio/sam/sam.go +++ b/lib/bio/sam/sam.go @@ -530,3 +530,19 @@ func (p *Parser) Next() (*Alignment, error) { 
alignment.Optionals = optionals return &alignment, nil } + +/****************************************************************************** +Jan 01, 2024 + +Below are some helper functions that I've found are useful for when processing +sam files. + +******************************************************************************/ + +// Primary determines whether the Alignment is the primary line of the read. +// This is useful for finding out if a particular read is the best aligned to +// a certain fragment. +func Primary(a *Alignment) bool { + // Perform bitwise AND of FLAG with 0x900 and compare to 0 + return (a.FLAG & 0x900) == 0 +} diff --git a/lib/bio/sam/sam_test.go b/lib/bio/sam/sam_test.go index 66c22b9..4da764d 100644 --- a/lib/bio/sam/sam_test.go +++ b/lib/bio/sam/sam_test.go @@ -266,3 +266,33 @@ func TestValidateAllInOne(t *testing.T) { }) } } + +func TestPrimary(t *testing.T) { + // Define test cases + tests := []struct { + name string + flag uint16 + want bool + }{ + {"No Flags", 0x0, true}, + {"Secondary Alignment", 0x100, false}, + {"Supplementary Alignment", 0x800, false}, + {"Secondary and Supplementary", 0x900, false}, + {"PCR or Optical Duplicate", 0x400, true}, + {"Reverse Complemented", 0x10, true}, + // ... 
other test cases + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + // Create an Alignment with the given FLAG + a := Alignment{FLAG: tc.flag} + // Call the Primary function + got := Primary(&a) + // Assert that the result is as expected + if got != tc.want { + t.Errorf("Primary() with FLAG 0x%x = %v, want %v", tc.flag, got, tc.want) + } + }) + } +} diff --git a/lib/sequencing/data/reads.fastq b/lib/sequencing/data/reads.fastq new file mode 100644 index 0000000..c81ccc0 --- /dev/null +++ b/lib/sequencing/data/reads.fastq @@ -0,0 +1,76 @@ +@289a197e-4c05-4143-80e6-488e23044378 runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=34575 ch=111 start_time=2023-12-29T16:06:13.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode06 barcode_alias=barcode06 parent_read_id=289a197e-4c05-4143-80e6-488e23044378 basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 +TTTTGTCTACTTCGTTCCGTTGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGACGGCGCCTCCGTGCGACGAGATTTCAAGGGTCTCTGTGCTATATTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCAGATCGACTTTTAGATTCCTCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTGG ++ +$%%&%%$$%&'+)**,-+)))+866788711112=>A?@@@BDB@>?746@?>A@D2@970,-+..*++++;662/.-.+,++,//+167>A@A@@B=<887-,'&&%%&''((5555644578::<==B?ABCIJA>>>>@DCAA99::>=<<<=67777+***)//+,,+)&&&+--.02:>442000/1225:=D?=<<=7;866/..../AAA226545+&%%$$ +@af86ed57-1cfe-486f-8205-b2c8d1186454 runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=2233 ch=123 start_time=2023-12-29T10:04:32.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode06 barcode_alias=barcode06 parent_read_id=af86ed57-1cfe-486f-8205-b2c8d1186454 basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 
+TGTCCTTTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTACTTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTATCTGAGACCGAAGTGGTTTGCCTAAACGCAGGTGCTGTTGGCAAAGGCAGAAAGTAGTCTTAACCTTGACAATGAGTGGTA ++ +$%&$$$$$#')+)+,<>@B?>==<>>;;<<?@DA@?=>==>??<>??7;<706=>=>CBCCB????@CCBDAGFFFGJ<<<<<=54455>@?>:::9..++?@BDCCDCGECFHD@>=<<==>@@B@?@@>>>==>>===>>>A?@ADFGDCA@?????CCCEFDDDDDGJODAA@A;;ABBD<=<:92222223:>>@?@@B?@=<62212=<<<=>AAB=<'&&&'-,-.,**)'&'(,,,-.114888&&&&&'+++++,,* +@fc3455c8-461b-4dd6-a4cf-124712fd8295 runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=121781 ch=3 start_time=2023-12-29T16:43:08.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode06 barcode_alias=barcode06 parent_read_id=fc3455c8-461b-4dd6-a4cf-124712fd8295 basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 +TAACTCGTTCAGTTACGTATTGTTTGCAACAATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGATAGAACTTAGGTAGCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATCGTC ++ +$%&'**++66=7777<<<8888878(((::;GHSMJGSJFLIKHS<;<;DEBBBBD@@BF53333>@=7)++(((((/4//055DISMMKKKFSHD<<9778=DJQSH>==<;????===<>=210( +@326338df-52b8-4212-8f38-a430b040946b runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=3321 ch=20 start_time=2023-12-29T10:06:29.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode06 barcode_alias=barcode06 parent_read_id=326338df-52b8-4212-8f38-a430b040946b basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 +TTATGTGTACTGTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTAAGACTTAGGCGCCTCCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTCTTCGCAAAGGCAGAAAGTAGTCTTAACCTTGGCAAT ++ 
++55544/-%%%%%&''-*++205111265569999:CCEDEGGEGHSG@?::0+,?=:98&&**//.+++-())))334447777:::=>=78999=>>>>CCBCFGDFEC?=8611////)'&&'(()458444:;>>@@ABBEDBCCGDCC?@@IGFDEDDISJFESJFHFEFEGCCCAEEAA?A@CEF444DBB><<<55779>CAAA@@@9:;;4100014,,,,06CCABA@>;<::00&&&'(47@CS<;:9301+++))*+-/899-,-((( +@2a240eeb-fd8c-41b5-b0c5-2a1c7d566c6b runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=2886 ch=26 start_time=2023-12-29T10:12:27.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode06 barcode_alias=barcode06 parent_read_id=2a240eeb-fd8c-41b5-b0c5-2a1c7d566c6b basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 +CATGTTTATGTAGCCTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCGCCTCCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGCTAGTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCGGGAGACGTGAGCCTTTTAATGGC ++ +$%%%%&&$$$%###%%'+,,+,-.(((((458;?@KJFGDDCCCFFEGCAABBDDDDCGDDGHMHHKJSFMECCB@B@?<<<<@GEEBABDBAB@AAEEAB?<<;9+'&&&&&'3369833444DE>><=@33??@IA422000.2>><77766844589:=<;<===EJDDA<<5444455973.+)),.**'%&&%&&((%%$$%% +@9043af62-24be-4ad1-9bd2-a9ed035cc282 runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=5951 ch=88 start_time=2023-12-29T10:37:14.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode06 barcode_alias=barcode06 parent_read_id=9043af62-24be-4ad1-9bd2-a9ed035cc282 basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 +GTGTTACCTGTACTTGGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAAGTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTAGATGTCTGCTGTTCTGCCTTTTTCCGCTTCTATCTGAGACCGAAGTGGTTTGCCTAAATCCAGGTGCTGTCTCGCAAAGGCAGAACTGAGTCTTAACCTTAGCAATGAT ++ 
+###%&%$$%%&&''*''()))*(('()**,./0544456788778898781+.9:;<;<:7554345689986444457-+*))))))01599978889988::;::::;<=99887776455568888887765566667778:76640//*(()+044444344,(()15457:::8335559767778:::866668977775544,+)$&''(*-*******(()--18985./.%$&&%%%'(++*++,&&&'(()&& +@7b9fcac2-aaf5-4832-b1f8-0ecad9964580 runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=18681 ch=10 start_time=2023-12-29T10:38:42.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode06 barcode_alias=barcode06 parent_read_id=7b9fcac2-aaf5-4832-b1f8-0ecad9964580 basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 +ATGTATTCTACTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATCTGAGACCCAT ++ +%&&(&'&&')+,++,338545599::@@AACFBCDEEJEGFGD@??G:67AA><>512EBBAGEDED====<@@DFECCCBCEGFNMHJFFDEEEGJFHFHCIDCCBCCFEDDDEDGEEFFGJSGEEEHJIHLGEIGDBDABCBDCBABBCDCBBBBCCDD7655662%% +@dbcfe55c-1c4f-445f-aeaf-dab7ebee6822 runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=8870 ch=110 start_time=2023-12-29T10:48:34.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode06 barcode_alias=barcode06 parent_read_id=dbcfe55c-1c4f-445f-aeaf-dab7ebee6822 basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 +ATGTTGTGGTGCTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCCGTGCGACGAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTGATTTTACAATAAAACTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTAGA ++ +%%&))%$$$##$$%%'(,+/,,--///,---08988;;::::===A??@<;;;;>>?@<999)))))6612;<=>=???@@?>=<;;;;=?@@@?=>>=>;94310..//4589:9988;;;<;<;:9999<;<;99::99:::;<;<<=>@?@>====?>@?@B@?===?CB@@97:1(((''%%%&%&********1))).57:::>?>>???>>?@=::::;?>><:9764455555533.///../:?A@?65%%% +@8e1f07e7-2334-4ea5-91b0-594485b5c0eb runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 
read=6981 ch=15 start_time=2023-12-29T10:49:00.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode06 barcode_alias=barcode06 parent_read_id=8e1f07e7-2334-4ea5-91b0-594485b5c0eb basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 +GTGTCCTCTACTTGGTTCAGTTGGTGTTGCTGGTCAGACTTTTCTTTCTGCCTTTGCGAGAACAGCACCTCCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATAAAGACCTTGGGGCCGAAGCCGCTGCATGTCCTGTTAAAGGAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTGG ++ +$$$%')(&'''&&&*+,-5631-++***(')&&&&&&&%&&')(.00-.?A??>=57AAAA;879@>>>>=>===:9999;96553..569BECCECEIDFDCD=,*****:,,,,,@?@BGLA@AAABI98777;===>845BC;=>=37----.3246.---.2100+*++,.73-+'')166666=>>?<<<<=<;:;<<<===>>@>?>==>>>??@?@AAACBC@AA?>?@??@D@@B<;;;;AA98-*)*44467;;;88322364223364.*)'& +@27c2036d-c4eb-42d5-aaca-2a84b6c85521 runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=11034 ch=110 start_time=2023-12-29T11:11:50.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode06 barcode_alias=barcode06 parent_read_id=27c2036d-c4eb-42d5-aaca-2a84b6c85521 basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 +TGTTCTGTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTCTGCCTTAGAGACCACGCCTCCGTGCGACAAGATTCAAGGGTCTCTGTGCTCTGCCGCTAGTTCCGCTCTAGCTGCTCCGGTATGCATCTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAACTGTTGGTT ++ +%&&%$##$&**0114A>>>>=;;::9<<;<@BAAACBABCFHG:565++2100/&&'((((''&&&((0/-,,76667;986(')))*7775445012**)))*+228755656AEFDIDEHFEGC)))'%%$$%$$&'...49;ADEHDEDFDDSOEIII:9333234<<@?=<<8('(**))&''''()))++.;<:<<<<::::+ABDD;;7<=?8889:=GGEFJFHQLLGSJGCA@@@BSPEA877545.--++-./0@>=/.-))&%%&&((() +@3691074f-0a78-4ca8-9e9d-a71641e9e342 runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=12417 ch=70 start_time=2023-12-29T11:12:41.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 
barcode=barcode06 barcode_alias=barcode06 parent_read_id=3691074f-0a78-4ca8-9e9d-a71641e9e342 basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 +ATGTTTTTGCGTGTACTTCGTTCAGTTACGTATTGCTAAGGTTCCGACTTCTGCCTTTGCAGACAGCACCTCCGTGCGACAAGATTTCAAGGGTCTCTGTGCTCTCCAGCTAGTTCCGCTCTAGCTGCTCCCAGTTAATACTACTACTGAAGATGAATTGGAGGATGACTTCGATGTTGCTGTTCTGCTTCCCGGCTGGACCCAATC ++ +$$%&')**)(&&%$$%%/+++..43221267799854444765,('''&&'*016993/.+++))).,,--,2244677778>>?B><<>>GEDCFDJGFFB@>><<88999;<<<<@@ABG::::::F>>0**22:<=?@@<<<<;===BCCCABBCCEFEGCCB<;:;7332339;:9874/---.//.//*)''%%%%%'&&&$ +@58fb5245-104d-48b7-b60c-029160e85314 runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=11196 ch=9 start_time=2023-12-29T11:25:11.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode06 barcode_alias=barcode06 parent_read_id=58fb5245-104d-48b7-b60c-029160e85314 basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 +ATGTTATGTTAACCTACTTCGTTCAGTTACGTATTGTTGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATATTGGTT ++ +$$$%%$$%%%$###$%%&&&(*9<<<<<>>=>>>?==>66666=;8555599623/----:--986455566221/+,,47664458665::99:;AFDEJEHDEDCDAAAABEFAAA8544AA@CEIEKHIHHFF@@?>;;;;>=>=43336548989>77::;B@33222208>@AAA@?@><8889AFE@???@CCGEDDC=8899:BFG@?>=<99.--+++--7BBB<=A<=<52/,))' +@cb28d88d-07cd-4f80-b62d-dff55121511b runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=26452 ch=3 start_time=2023-12-29T12:15:18.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode06 barcode_alias=barcode06 parent_read_id=cb28d88d-07cd-4f80-b62d-dff55121511b basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 
+TACTGTTGCCTGTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACTCCAGTCGAGACAGATTTCGAGGTTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATGCTCTACTGAAGATGAATTGGAGGGTGACTTTGATGTTGCTGTTCTGCCTTTTCCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGT ++ +%%&%%%%$%&&%$$%%&'+.0+,+,,1656611101767;;<93...0,,,80,-9:;>?816(''')***,+*'''+)*)*,,,-.///86+)))++-,,())))99665655546=9878<;99::;>>>=6,((((--./16((('&&&)..<>??>?>==?554222*+''''(),->??><0:897666''''''+2;<<;=>=>>?<<20006977779G>88////.....0//07;:/-,-685210 +@4f5a8687-28b0-44d5-a7c4-23370a76536a runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=409336 ch=54 start_time=2023-12-29T12:37:12.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode06 barcode_alias=barcode06 parent_read_id=4f5a8687-28b0-44d5-a7c4-23370a76536a basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 +ATGTTTTTGTTTTCTACTTCAACTTTCAGTTACGTATTGCTAAGGTTAAGACTACTTCTGCCTTTGCGAGAACAGCACCTCCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCGGTTTAATATACTACTGAAGATGAATTGGAGGGTGACTCGAATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATGGAGACTCCGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTCACAATGAGTT ++ +%&&)*+.-*)'%$$%&(((&%&''/2'376544222224564344468877876555((9:<:86-.7310,((())0,**++132..../44222+,,,7666678:::984444454446544221/01112222454))(''+***-))'''--011132334,***012234----*''''(()**-877556876653110-.00387,+*)))'***'*)))))&&&&&).11366556783211-0/0015777642211220012222544---,+,)%%%$ +@02b7df9c-7083-4416-ac49-e8817c66a5b7 runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=412380 ch=54 start_time=2023-12-29T13:10:11.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode06 barcode_alias=barcode06 parent_read_id=02b7df9c-7083-4416-ac49-e8817c66a5b7 basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 
+TTTTTTTGTTACTCGTTGAGTTTACGTATTGCTAAGGTTAAGACTACTTCTGCCTTTGCGAGCACACCTCCGTGCGACAAGATTTAAAGGTATCTGTGCTCTCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCGCTTCTGAGCCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACATG ++ +#$&),/0*(&$#&&&'/----00-16656788>>>?AAB@@BBCDC@?A11;;;400.1/+))(''&'.+,*,,,///..+++0/(*+)))))())++13)))))11))*667//;<>=>>G???:::@=:99:?===<<<<<432000..1118;B@@@B>>>==211118=314433333455420/.(((001***(((&'''+-+++--,1558?@@==>>?BSCC@@>:99:;=FJIEBB@@AC<9999===>?>=**/.013222) +@f5f018f4-288d-4db8-a545-92924a388ab6 runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=20397 ch=113 start_time=2023-12-29T13:19:04.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode06 barcode_alias=barcode06 parent_read_id=f5f018f4-288d-4db8-a545-92924a388ab6 basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 +TGTTTTGTTGGGTCACCTCGTTCAGTTACGTATTGCTAAGGTTAAGCTACTTCTGCCTTTGCGAGAACACACTCTGCTAGGGGCTACTTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAGGAATTTGTCCTTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTATCTTGAGACCGACCATGCAAGGAGAGGTACAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAAGCAATGCATT ++ +$$%&&('&%$$$#$$$$&%%&''++++,,-../0234322120)''&)%%',((**/111-.,*)'&&&%'++,-1223221233110.+**))--033200//0/----/11114677775,+*(&&&&((&'')'+((21//00.-,**++,.4445444332111234666965555788876664111113,,++(&'(,-.////0111125555543211+++,,.20/-*****,,-6543111210..---.../122*('''))&&''( +@fcb268e5-83cb-4dde-81fe-6e1a77516dee runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=25724 ch=111 start_time=2023-12-29T14:39:32.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode06 barcode_alias=barcode06 parent_read_id=fcb268e5-83cb-4dde-81fe-6e1a77516dee basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 
+ATGCCCTGTACTTCGTTCAGTTACGTATTGCTAAGGTTTAAGCTACTTTCTGCCTTTGCGAGAACAGCACCTTGCTGAATGAGAAACCTCGGGAAGACGTATCAGTTGGTTACTTATTATCGACACGCCGATTCGTATCAACATGGCTCAGTTCAGCGTGAAATATAATAAGATGAATGTTCTGCAGCATCTACGTCCGTCAGTATTAACTCGAGCGCTCTGTCTTCTGTGGACCCTATCAAACGAGCTGCTAGGGGCTACTTATGGGGTCTGTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGAGTAACTCT ++ +##$%%)%&+)))*(((**)((('(*)()++,6?AAAAB5+**)***,2/+-001?@A7D1111121255@EEFGLNCSIHEFBEKFCCC42223CE?::93332222)'''')+)%%((()5568BAA@<<==98877887::;33334220/)%%%%))34;:88'&&&&'''%%%''))&'00099:::AACBBBCBB97777:84446577;;>BFDGLLHLKJOCE<<=::;;???D@@000003;0///./,,,-,77100//42455BEFAAA@<44444ADDCABCCCBBC===7780//.//***()))(&&&&&&%% +@b6ba70d8-e83b-4762-b82b-9e56d7913572 runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=52955 ch=16 start_time=2023-12-29T14:48:45.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode06 barcode_alias=barcode06 parent_read_id=b6ba70d8-e83b-4762-b82b-9e56d7913572 basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 +GCCTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTCTCCTTGCAGACAGCACCTCTGCTAGGGCTGCCTGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGTACTTGCCTTTTTCCGCTTCTATCTCAAGTTAGAAATAATTTGCTAAACGCAGGTGCTATTTATCGCAAAGGCAAGAGTCGTCTTAACCTTAGCAATTAGAT ++ +#$%%%'&'&''()/0//*))**-../3001;9:98898941-,'()('(--+'((()-,,--/5--,'&'++05/,***268977111116&&((((()779;444<7777>??@?>?77767:==?76.,,,>>=<==;;<<874456697-,,,+()(''))()+..././1467-('''''()*-,///..0..-.1(00///65::4442-/('&''.78<0.../4''''',(((000***)))(%%$% diff --git a/lib/sequencing/data/templates.fasta b/lib/sequencing/data/templates.fasta new file mode 100644 index 0000000..51ba0d2 --- /dev/null +++ b/lib/sequencing/data/templates.fasta @@ -0,0 +1,6 @@ +>oligo1 +CCGTGCGACAAGATTTCAAGGGTCTCTGTCTCAATGACCAAACCAACGCAAGTCTTAGTTCGTTCAGTCTCTATTTTATTCTTCATCACACTGTTGCACTTGGTTGTTGCAATGAGATTTCCTAGTATTTTCACTGCTGTGCTGAGACCCGGATCGAACTTAGGTAGCC +>oligo2 
+CCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCC +>oligo3 +CCGTGCGACAAGATTTCAAGGGTCTCTCTTCTATCGCAGCCAAGGAAGAAGGTGTATCTCTAGAGAAGCGTCGAGTGAGACCCGGATCGAACTTAGGTAGCC diff --git a/lib/sequencing/example_test.go b/lib/sequencing/example_test.go new file mode 100644 index 0000000..19ee149 --- /dev/null +++ b/lib/sequencing/example_test.go @@ -0,0 +1,121 @@ +package sequencing_test + +import ( + "bytes" + "context" + "fmt" + "log" + "os" + + "github.com/koeng101/dnadesign/external/minimap2" + "github.com/koeng101/dnadesign/lib/bio" + "github.com/koeng101/dnadesign/lib/bio/fasta" + "github.com/koeng101/dnadesign/lib/bio/fastq" + "github.com/koeng101/dnadesign/lib/bio/sam" + "github.com/koeng101/dnadesign/lib/primers/pcr" + "github.com/koeng101/dnadesign/lib/sequencing/nanopore" + "github.com/koeng101/dnadesign/lib/transform" + "golang.org/x/sync/errgroup" +) + +func ExampleAmpliconAlignment() { + // First, let's define the type we are looking for: amplicons in a pool. + type Amplicon struct { + Identifier string + TemplateSequence string + ForwardPrimer string + ReversePrimer string + } + + // Next, let's define data we'll be working on. In particular, the + // templates and fastq files. + + /* + Data processing steps: + + 1. Simulate PCRs of amplicons + 2. Sort for the right barcodes + 3. Trim fastq reads + 4. Minimap2 fastq reads to amplicons + 5. 
Filter for primary alignments + */ + var amplicons []Amplicon + var templates []fasta.Record + var pcrTm float64 = 50.0 + + forward := "CCGTGCGACAAGATTTCAAG" + reverse := transform.ReverseComplement("CGGATCGAACTTAGGTAGCC") + oligo1 := Amplicon{Identifier: "oligo1", ForwardPrimer: forward, ReversePrimer: reverse, TemplateSequence: "CCGTGCGACAAGATTTCAAGGGTCTCTGTCTCAATGACCAAACCAACGCAAGTCTTAGTTCGTTCAGTCTCTATTTTATTCTTCATCACACTGTTGCACTTGGTTGTTGCAATGAGATTTCCTAGTATTTTCACTGCTGTGCTGAGACCCGGATCGAACTTAGGTAGCCT"} + oligo2 := Amplicon{Identifier: "oligo2", ForwardPrimer: forward, ReversePrimer: reverse, TemplateSequence: "CCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCACTAGTCATAAT"} + oligo3 := Amplicon{Identifier: "oligo3", ForwardPrimer: forward, ReversePrimer: reverse, TemplateSequence: "CCGTGCGACAAGATTTCAAGGGTCTCTCTTCTATCGCAGCCAAGGAAGAAGGTGTATCTCTAGAGAAGCGTCGAGTGAGACCCGGATCGAACTTAGGTAGCCCCCTTCGAAGTGGCTCTGTCTGATCCTCCGCGGATGGCGACACCATCGGACTGAGGATATTGGCCACA"} + amplicons = []Amplicon{oligo1, oligo2, oligo3} + + // Simulate PCRs + for _, amplicon := range amplicons { + fragments, _ := pcr.Simulate([]string{amplicon.TemplateSequence}, pcrTm, false, []string{amplicon.ForwardPrimer, amplicon.ReversePrimer}) + if len(fragments) != 1 { + log.Fatalf("Should only get 1 fragment from PCR!") + } + // In case your template will have multiple fragments + for _, fragment := range fragments { + // Make sure to reset identifier if you have more than 1 fragment. + templates = append(templates, fasta.Record{Identifier: amplicon.Identifier, Sequence: fragment}) + } + } + var buf bytes.Buffer + for _, template := range templates { + _, _ = template.WriteTo(&buf) + } + + // Trim fastq reads. All the following processes (trimming, minimap2, + // filtering) are all done concurrently. 
+ + // Setup barcodes and fastq files + barcode := "barcode06" + r, _ := os.Open("data/reads.fastq") + parser := bio.NewFastqParser(r) + + // Setup errorGroups and channels + ctx := context.Background() + errorGroup, ctx := errgroup.WithContext(ctx) + + fastqReads := make(chan *fastq.Read) + fastqBarcoded := make(chan *fastq.Read) + fastqTrimmed := make(chan *fastq.Read) + samReads := make(chan *sam.Alignment) + samPrimary := make(chan *sam.Alignment) + + // Read fastqs into channel + errorGroup.Go(func() error { + return parser.ParseToChannel(ctx, fastqReads, false) + }) + + // Filter the right barcode fastqs from channel + go func() { + bio.FilterData(fastqReads, fastqBarcoded, func(data *fastq.Read) bool { return data.Optionals["barcode"] == barcode }) + }() + + // Trim barcodes + errorGroup.Go(func() error { + return nanopore.TrimBarcodeWithChannels(barcode, fastqBarcoded, fastqTrimmed) + }) + + // Run minimap + errorGroup.Go(func() error { + return minimap2.Minimap2Channeled(&buf, fastqTrimmed, samReads) + }) + + // Sort out primary alignments + go func() { + bio.FilterData(samReads, samPrimary, sam.Primary) + }() + + // Read all them alignments out into memory + var outputAlignments []sam.Alignment + for alignment := range samPrimary { + outputAlignments = append(outputAlignments, *alignment) + } + + fmt.Println(outputAlignments[0].RNAME) + // Output: oligo2 +} diff --git a/lib/sequencing/nanopore/example_test.go b/lib/sequencing/nanopore/example_test.go new file mode 100644 index 0000000..c491af6 --- /dev/null +++ b/lib/sequencing/nanopore/example_test.go @@ -0,0 +1,30 @@ +package nanopore_test + +import ( + "fmt" + + "github.com/koeng101/dnadesign/lib/bio/fastq" + "github.com/koeng101/dnadesign/lib/sequencing/nanopore" +) + +func ExampleTrimBarcode() { + // Here we are trimming the barcode06 barcode. 
+ + // This exactBarcodeMatch has one barcode match in the forward direction and one loose match in the reverse direction + exactBarcodeMatch := fastq.Read{Sequence: "TGTACTTGGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCGCCTTTCCCACTGTTAGATTGAGCTGAAGACAGTTACAAGCCAGGTGGACAATATGGGTGTCGATGTCCAGCACATCACGCTCGTACAGCGTCTGGGTGCTGCGGTAATTGTTGGCAATAGTCGCGAACGTCGTTTCACCCTCCAGACGGTAGCTGCCCTTACGCATGATATCCAGTTGTTCTTGGCTGATCTCGATCGGATCCTTGAAGATCGTCCACATCACGATCTGGTTACACGGCGGGGTACGTCTTCGTTACACCGGGATCATAGCTAGGTGCTGTTCTCGCAAAGGCGGAAACGCATCTTAAGCCTCACGATGGTA", Identifier: "3e94e6db-07cc-424c-8e9f-81390482bd9c", Quality: `#%'%%%'%&&(*5556>BACBCCEDFIHJFISGHOHLHDFCF>36>;4.,,*9977531//.**+++-'+1(*?EMIBC@DFGHFHKSJHFGE@????FIQEPJKIKSMHDC77779BCBLSGHIISKOHSGGJBAAABFGROSGHJGISGSLEKFKSSSIOFJKEFENFKSFF><<<+++)$%%%&&''(&&))))))88752>@@????AFIIKIIA432327;>887779:>AAHLQIJJJFOSEEGHSSFHDBCFNSLHHSGHFDDCHNKQSHHPSIISSHJJKIIOHNKLHOFCCCB;;=?GHHIH@??@@ESEIFIAFDDC??AA@DEB@731(''&%$$$%''(++,-)))()))),)`} + sequence, _ := nanopore.TrimBarcode("barcode06", exactBarcodeMatch) + + fmt.Println(sequence.Sequence) + // Output: TTCTGCCTTTGCGAGAACAGCGCCTTTCCCACTGTTAGATTGAGCTGAAGACAGTTACAAGCCAGGTGGACAATATGGGTGTCGATGTCCAGCACATCACGCTCGTACAGCGTCTGGGTGCTGCGGTAATTGTTGGCAATAGTCGCGAACGTCGTTTCACCCTCCAGACGGTAGCTGCCCTTACGCATGATATCCAGTTGTTCTTGGCTGATCTCGATCGGATCCTTGAAGATCGTCCACATCACGATCTGGTTACACGGCGGGGTACGTCTTCGTTACACCGGGATCATAGCTA +} + +func ExampleTrimImperfectBarcode() { + // Here we are trimming the barcode06 barcode. + + // This imperfectBarcodeMatch has one barcode match in the forward direction and one in the reverse direction. 
+ imperfectBarcodeMatch := fastq.Read{Sequence: "TGTATTCGTACGTTACATATGCTAAGGTTAAGACTACTTTCTGCCTTTTGCAGACGCACTTAGCATCCGTCTAAATCTCGGGAAGACGTCTGGCAGTGTTGGCTGCCTTTGTCGAGGTTAAGGACTACGCAGAAAACACCTACTACAGCAACTTTATCAGCCACCTGGAAAATATCAAATATGCGGGTCAATCCACCGTCCTGCGTGGCCTGGACATCCGGGACATATCTCAAAAACTTGCATCATTATTACAGCTACCTGGGTAGCTTGACCACGCCGCCGTGTACCGAGAATGTGCACTGCTGTCTTCAGCTATGATCCCGGTGTAACAGGTGCTGTTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGT", Identifier: "7258a445-8793-466c-8c18-cf9d1cce93f2", Quality: `$$%$###$'%%%%%&+(''''''89:ABFGSHSGGLDCD?9:DC@<.+)()&&&&'''+))))+?>=8540/.-+++,.>>?@BDFEEDCBHKKJHHMMKLEMKKMHEKSSHGFJGFKIGIESMSGDHSRSJEA:888IJIHRHIHGFEI=CB88888KHDABAAE5CSIJFSRJGSSGEGI@>+)((+33;C=<;5212////3:::=FHD@9979998/+`} + sequence, _ := nanopore.TrimBarcode("barcode06", imperfectBarcodeMatch) + + fmt.Println(sequence.Sequence) + // Output: TTCTGCCTTTTGCAGACGCACTTAGCATCCGTCTAAATCTCGGGAAGACGTCTGGCAGTGTTGGCTGCCTTTGTCGAGGTTAAGGACTACGCAGAAAACACCTACTACAGCAACTTTATCAGCCACCTGGAAAATATCAAATATGCGGGTCAATCCACCGTCCTGCGTGGCCTGGACATCCGGGACATATCTCAAAAACTTGCATCATTATTACAGCTACCTGGGTAGCTTGACCACGCCGCCGTGTACCGAGAATGTGCACTGCTGTCTTCAGCTATGATCCCGGTGTAACAG +} diff --git a/lib/sequencing/nanopore/nanopore.go b/lib/sequencing/nanopore/nanopore.go new file mode 100644 index 0000000..85c03af --- /dev/null +++ b/lib/sequencing/nanopore/nanopore.go @@ -0,0 +1,245 @@ +/* +Package nanopore contains data associated with nanopore sequencing. +*/ +package nanopore + +import ( + "fmt" + "strings" + + "github.com/koeng101/dnadesign/lib/align" + "github.com/koeng101/dnadesign/lib/align/matrix" + "github.com/koeng101/dnadesign/lib/alphabet" + "github.com/koeng101/dnadesign/lib/bio/fastq" +) + +// ScoreMagicNumber is the score of a Smith Waterman alignment between a +// barcode and sequence used a threshold for whether the barcode exists. +// It was found from me playing around with sequences. +var ScoreMagicNumber = 12 + +// NativeBarcode contains the data structure defining a nanopore barcode. 
+// In between Forward and a target DNA sequence is 8bp: CAGCACC followed by a +// T, which is used for the TA ligation to the target end-prepped DNA. +type NativeBarcode struct { + Forward string `json:"forward"` + Reverse string `json:"reverse"` +} + +// NativeBarcodeMap contains a map of native barcodes to their respective +// forward and reverse sequences. +var NativeBarcodeMap = map[string]NativeBarcode{ + "barcode01": {"CACAAAGACACCGACAACTTTCTT", "AAGAAAGTTGTCGGTGTCTTTGTG"}, + "barcode02": {"ACAGACGACTACAAACGGAATCGA", "TCGATTCCGTTTGTAGTCGTCTGT"}, + "barcode03": {"CCTGGTAACTGGGACACAAGACTC", "GAGTCTTGTGTCCCAGTTACCAGG"}, + "barcode04": {"TAGGGAAACACGATAGAATCCGAA", "TTCGGATTCTATCGTGTTTCCCTA"}, + "barcode05": {"AAGGTTACACAAACCCTGGACAAG", "CTTGTCCAGGGTTTGTGTAACCTT"}, + "barcode06": {"GACTACTTTCTGCCTTTGCGAGAA", "TTCTCGCAAAGGCAGAAAGTAGTC"}, + "barcode07": {"AAGGATTCATTCCCACGGTAACAC", "GTGTTACCGTGGGAATGAATCCTT"}, + "barcode08": {"ACGTAACTTGGTTTGTTCCCTGAA", "TTCAGGGAACAAACCAAGTTACGT"}, + "barcode09": {"AACCAAGACTCGCTGTGCCTAGTT", "AACTAGGCACAGCGAGTCTTGGTT"}, + "barcode10": {"GAGAGGACAAAGGTTTCAACGCTT", "AAGCGTTGAAACCTTTGTCCTCTC"}, + "barcode11": {"TCCATTCCCTCCGATAGATGAAAC", "GTTTCATCTATCGGAGGGAATGGA"}, + "barcode12": {"TCCGATTCTGCTTCTTTCTACCTG", "CAGGTAGAAAGAAGCAGAATCGGA"}, + "barcode13": {"AGAACGACTTCCATACTCGTGTGA", "TCACACGAGTATGGAAGTCGTTCT"}, + "barcode14": {"AACGAGTCTCTTGGGACCCATAGA", "TCTATGGGTCCCAAGAGACTCGTT"}, + "barcode15": {"AGGTCTACCTCGCTAACACCACTG", "CAGTGGTGTTAGCGAGGTAGACCT"}, + "barcode16": {"CGTCAACTGACAGTGGTTCGTACT", "AGTACGAACCACTGTCAGTTGACG"}, + "barcode17": {"ACCCTCCAGGAAAGTACCTCTGAT", "ATCAGAGGTACTTTCCTGGAGGGT"}, + "barcode18": {"CCAAACCCAACAACCTAGATAGGC", "GCCTATCTAGGTTGTTGGGTTTGG"}, + "barcode19": {"GTTCCTCGTGCAGTGTCAAGAGAT", "ATCTCTTGACACTGCACGAGGAAC"}, + "barcode20": {"TTGCGTCCTGTTACGAGAACTCAT", "ATGAGTTCTCGTAACAGGACGCAA"}, + "barcode21": {"GAGCCTCTCATTGTCCGTTCTCTA", "TAGAGAACGGACAATGAGAGGCTC"}, + "barcode22": {"ACCACTGCCATGTATCAAAGTACG", 
"CGTACTTTGATACATGGCAGTGGT"}, + "barcode23": {"CTTACTACCCAGTGAACCTCCTCG", "CGAGGAGGTTCACTGGGTAGTAAG"}, + "barcode24": {"GCATAGTTCTGCATGATGGGTTAG", "CTAACCCATCATGCAGAACTATGC"}, + "barcode25": {"GTAAGTTGGGTATGCAACGCAATG", "CATTGCGTTGCATACCCAACTTAC"}, + "barcode26": {"CATACAGCGACTACGCATTCTCAT", "ATGAGAATGCGTAGTCGCTGTATG"}, + "barcode27": {"CGACGGTTAGATTCACCTCTTACA", "TGTAAGAGGTGAATCTAACCGTCG"}, + "barcode28": {"TGAAACCTAAGAAGGCACCGTATC", "GATACGGTGCCTTCTTAGGTTTCA"}, + "barcode29": {"CTAGACACCTTGGGTTGACAGACC", "GGTCTGTCAACCCAAGGTGTCTAG"}, + "barcode30": {"TCAGTGAGGATCTACTTCGACCCA", "TGGGTCGAAGTAGATCCTCACTGA"}, + "barcode31": {"TGCGTACAGCAATCAGTTACATTG", "CAATGTAACTGATTGCTGTACGCA"}, + "barcode32": {"CCAGTAGAAGTCCGACAACGTCAT", "ATGACGTTGTCGGACTTCTACTGG"}, + "barcode33": {"CAGACTTGGTACGGTTGGGTAACT", "AGTTACCCAACCGTACCAAGTCTG"}, + "barcode34": {"GGACGAAGAACTCAAGTCAAAGGC", "GCCTTTGACTTGAGTTCTTCGTCC"}, + "barcode35": {"CTACTTACGAAGCTGAGGGACTGC", "GCAGTCCCTCAGCTTCGTAAGTAG"}, + "barcode36": {"ATGTCCCAGTTAGAGGAGGAAACA", "TGTTTCCTCCTCTAACTGGGACAT"}, + "barcode37": {"GCTTGCGATTGATGCTTAGTATCA", "TGATACTAAGCATCAATCGCAAGC"}, + "barcode38": {"ACCACAGGAGGACGATACAGAGAA", "TTCTCTGTATCGTCCTCCTGTGGT"}, + "barcode39": {"CCACAGTGTCAACTAGAGCCTCTC", "GAGAGGCTCTAGTTGACACTGTGG"}, + "barcode40": {"TAGTTTGGATGACCAAGGATAGCC", "GGCTATCCTTGGTCATCCAAACTA"}, + "barcode41": {"GGAGTTCGTCCAGAGAAGTACACG", "CGTGTACTTCTCTGGACGAACTCC"}, + "barcode42": {"CTACGTGTAAGGCATACCTGCCAG", "CTGGCAGGTATGCCTTACACGTAG"}, + "barcode43": {"CTTTCGTTGTTGACTCGACGGTAG", "CTACCGTCGAGTCAACAACGAAAG"}, + "barcode44": {"AGTAGAAAGGGTTCCTTCCCACTC", "GAGTGGGAAGGAACCCTTTCTACT"}, + "barcode45": {"GATCCAACAGAGATGCCTTCAGTG", "CACTGAAGGCATCTCTGTTGGATC"}, + "barcode46": {"GCTGTGTTCCACTTCATTCTCCTG", "CAGGAGAATGAAGTGGAACACAGC"}, + "barcode47": {"GTGCAACTTTCCCACAGGTAGTTC", "GAACTACCTGTGGGAAAGTTGCAC"}, + "barcode48": {"CATCTGGAACGTGGTACACCTGTA", "TACAGGTGTACCACGTTCCAGATG"}, + "barcode49": {"ACTGGTGCAGCTTTGAACATCTAG", "CTAGATGTTCAAAGCTGCACCAGT"}, 
+ "barcode50": {"ATGGACTTTGGTAACTTCCTGCGT", "ACGCAGGAAGTTACCAAAGTCCAT"}, + "barcode51": {"GTTGAATGAGCCTACTGGGTCCTC", "GAGGACCCAGTAGGCTCATTCAAC"}, + "barcode52": {"TGAGAGACAAGATTGTTCGTGGAC", "GTCCACGAACAATCTTGTCTCTCA"}, + "barcode53": {"AGATTCAGACCGTCTCATGCAAAG", "CTTTGCATGAGACGGTCTGAATCT"}, + "barcode54": {"CAAGAGCTTTGACTAAGGAGCATG", "CATGCTCCTTAGTCAAAGCTCTTG"}, + "barcode55": {"TGGAAGATGAGACCCTGATCTACG", "CGTAGATCAGGGTCTCATCTTCCA"}, + "barcode56": {"TCACTACTCAACAGGTGGCATGAA", "TTCATGCCACCTGTTGAGTAGTGA"}, + "barcode57": {"GCTAGGTCAATCTCCTTCGGAAGT", "ACTTCCGAAGGAGATTGACCTAGC"}, + "barcode58": {"CAGGTTACTCCTCCGTGAGTCTGA", "TCAGACTCACGGAGGAGTAACCTG"}, + "barcode59": {"TCAATCAAGAAGGGAAAGCAAGGT", "ACCTTGCTTTCCCTTCTTGATTGA"}, + "barcode60": {"CATGTTCAACCAAGGCTTCTATGG", "CCATAGAAGCCTTGGTTGAACATG"}, + "barcode61": {"AGAGGGTACTATGTGCCTCAGCAC", "GTGCTGAGGCACATAGTACCCTCT"}, + "barcode62": {"CACCCACACTTACTTCAGGACGTA", "TACGTCCTGAAGTAAGTGTGGGTG"}, + "barcode63": {"TTCTGAAGTTCCTGGGTCTTGAAC", "GTTCAAGACCCAGGAACTTCAGAA"}, + "barcode64": {"GACAGACACCGTTCATCGACTTTC", "GAAAGTCGATGAACGGTGTCTGTC"}, + "barcode65": {"TTCTCAGTCTTCCTCCAGACAAGG", "CCTTGTCTGGAGGAAGACTGAGAA"}, + "barcode66": {"CCGATCCTTGTGGCTTCTAACTTC", "GAAGTTAGAAGCCACAAGGATCGG"}, + "barcode67": {"GTTTGTCATACTCGTGTGCTCACC", "GGTGAGCACACGAGTATGACAAAC"}, + "barcode68": {"GAATCTAAGCAAACACGAAGGTGG", "CCACCTTCGTGTTTGCTTAGATTC"}, + "barcode69": {"TACAGTCCGAGCCTCATGTGATCT", "AGATCACATGAGGCTCGGACTGTA"}, + "barcode70": {"ACCGAGATCCTACGAATGGAGTGT", "ACACTCCATTCGTAGGATCTCGGT"}, + "barcode71": {"CCTGGGAGCATCAGGTAGTAACAG", "CTGTTACTACCTGATGCTCCCAGG"}, + "barcode72": {"TAGCTGACTGTCTTCCATACCGAC", "GTCGGTATGGAAGACAGTCAGCTA"}, + "barcode73": {"AAGAAACAGGATGACAGAACCCTC", "GAGGGTTCTGTCATCCTGTTTCTT"}, + "barcode74": {"TACAAGCATCCCAACACTTCCACT", "AGTGGAAGTGTTGGGATGCTTGTA"}, + "barcode75": {"GACCATTGTGATGAACCCTGTTGT", "ACAACAGGGTTCATCACAATGGTC"}, + "barcode76": {"ATGCTTGTTACATCAACCCTGGAC", "GTCCAGGGTTGATGTAACAAGCAT"}, + "barcode77": 
{"CGACCTGTTTCTCAGGGATACAAC", "GTTGTATCCCTGAGAAACAGGTCG"}, + "barcode78": {"AACAACCGAACCTTTGAATCAGAA", "TTCTGATTCAAAGGTTCGGTTGTT"}, + "barcode79": {"TCTCGGAGATAGTTCTCACTGCTG", "CAGCAGTGAGAACTATCTCCGAGA"}, + "barcode80": {"CGGATGAACATAGGATAGCGATTC", "GAATCGCTATCCTATGTTCATCCG"}, + "barcode81": {"CCTCATCTTGTGAAGTTGTTTCGG", "CCGAAACAACTTCACAAGATGAGG"}, + "barcode82": {"ACGGTATGTCGAGTTCCAGGACTA", "TAGTCCTGGAACTCGACATACCGT"}, + "barcode83": {"TGGCTTGATCTAGGTAAGGTCGAA", "TTCGACCTTACCTAGATCAAGCCA"}, + "barcode84": {"GTAGTGGACCTAGAACCTGTGCCA", "TGGCACAGGTTCTAGGTCCACTAC"}, + "barcode85": {"AACGGAGGAGTTAGTTGGATGATC", "GATCATCCAACTAACTCCTCCGTT"}, + "barcode86": {"AGGTGATCCCAACAAGCGTAAGTA", "TACTTACGCTTGTTGGGATCACCT"}, + "barcode87": {"TACATGCTCCTGTTGTTAGGGAGG", "CCTCCCTAACAACAGGAGCATGTA"}, + "barcode88": {"TCTTCTACTACCGATCCGAAGCAG", "CTGCTTCGGATCGGTAGTAGAAGA"}, + "barcode89": {"ACAGCATCAATGTTTGGCTAGTTG", "CAACTAGCCAAACATTGATGCTGT"}, + "barcode90": {"GATGTAGAGGGTACGGTTTGAGGC", "GCCTCAAACCGTACCCTCTACATC"}, + "barcode91": {"GGCTCCATAGGAACTCACGCTACT", "AGTAGCGTGAGTTCCTATGGAGCC"}, + "barcode92": {"TTGTGAGTGGAAAGATACAGGACC", "GGTCCTGTATCTTTCCACTCACAA"}, + "barcode93": {"AGTTTCCATCACTTCAGACTTGGG", "CCCAAGTCTGAAGTGATGGAAACT"}, + "barcode94": {"GATTGTCCTCAAACTGCCACCTAC", "GTAGGTGGCAGTTTGAGGACAATC"}, + "barcode95": {"CCTGTCTGGAAGAAGAATGGACTT", "AAGTCCATTCTTCTTCCAGACAGG"}, + "barcode96": {"CTGAACGGTCATAGAGTCCACCAT", "ATGGTGGACTCTATGACCGTTCAG"}, +} + +func TrimBarcodeWithChannels(barcodeName string, fastqInput <-chan *fastq.Read, fastqOutput chan<- *fastq.Read) error { + for { + select { + case data, ok := <-fastqInput: + if !ok { + // If the input channel is closed, we close the output channel and return + close(fastqOutput) + return nil + } + trimmedRead, err := TrimBarcode(barcodeName, *data) + if err != nil { + close(fastqOutput) + return err + } + fastqOutput <- &trimmedRead + } + } + +} + +// TrimBarcode takes a barcodeName and a fastqSequence and returns a trimmed +// barcode. 
+func TrimBarcode(barcodeName string, fastqRead fastq.Read) (fastq.Read, error) { + // Copy into new fastq read + var newFastqRead fastq.Read + newFastqRead.Identifier = fastqRead.Identifier + newFastqRead.Optionals = make(map[string]string) + for key, value := range fastqRead.Optionals { + newFastqRead.Optionals[key] = value + } + + // Get the barcode + barcode, ok := NativeBarcodeMap[barcodeName] + if !ok { + return newFastqRead, fmt.Errorf("barcode %s not found in NativeBarcodeMap.", barcodeName) + } + + // Trim in the forward direction + var sequence string + var quality string + sequence = fastqRead.Sequence + quality = fastqRead.Quality + score, alignment, err := Align(sequence[:80], barcode.Forward) + if err != nil { + return newFastqRead, err + } + // Empirically from looking around, it seems to me that approximately a + // score of 21 looks like a barcode match. This is almost by definition a + // magic number, so it is defined above as so. + if score >= ScoreMagicNumber { + modifiedAlignment := strings.ReplaceAll(alignment, "-", "") + index := strings.Index(sequence, modifiedAlignment) + // The index needs to be within the first 80 base pairs. Usually the + // native adapter + native barcode is within the first ~70 bp, but we + // put a small buffer. + if index != -1 { + // 7 here is a magic number between the native barcode and your + // target sequence. It's just a number that exists irl, so it is + // not defined above. + newStart := index + 7 + if newStart < len(sequence) { + sequence = sequence[newStart:] + quality = quality[newStart:] + } + } + } + // Now do the same thing with the reverse strand. + score, alignment, err = Align(sequence[len(sequence)-80:], barcode.Reverse) + if err != nil { + return newFastqRead, err + } + if score >= ScoreMagicNumber { + modifiedAlignment := strings.ReplaceAll(alignment, "-", "") + index := strings.Index(sequence, modifiedAlignment) + // This time we need to check within the last 80 base pairs. 
+ if index != -1 { + newEnd := index - 7 + if newEnd < len(sequence) { + sequence = sequence[:newEnd] + quality = quality[:newEnd] + } + } + } + newFastqRead.Sequence = sequence + newFastqRead.Quality = quality + return newFastqRead, err +} + +// Align uses the SmithWaterman algorithm to align a barcode to a sequence. +// It is used because it is a rather simple algorithm, and since barcodes are +// sufficiently small, fast enough. +func Align(sequence string, barcodeSequence string) (int, string, error) { + m := [][]int{ + /* A C G T U */ + /* A */ {1, -1, -1, -1, -1}, + /* C */ {-1, 1, -1, -1, -1}, + /* G */ {-1, -1, 1, -1, -1}, + /* T */ {-1, -1, -1, 1, -1}, + /* U */ {-1, -1, -1, -1, 1}, + } + + alphabet := alphabet.NewAlphabet([]string{"A", "C", "G", "T", "U"}) + subMatrix, err := matrix.NewSubstitutionMatrix(alphabet, alphabet, m) + if err != nil { + return 0, "", err + } + scoring, err := align.NewScoring(subMatrix, -1) + if err != nil { + return 0, "", err + } + score, alignSequence, _, err := align.SmithWaterman(sequence, barcodeSequence, scoring) + if err != nil { + return 0, "", err + } + return score, alignSequence, nil +} diff --git a/lib/sequencing/sequencing.go b/lib/sequencing/sequencing.go new file mode 100644 index 0000000..fb049e5 --- /dev/null +++ b/lib/sequencing/sequencing.go @@ -0,0 +1,4 @@ +/* +Package sequencing contains functions associated with handling sequencing data. 
+*/ +package sequencing From 87fea88f475981185092585eb100ddc9627d1dc1 Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Tue, 2 Jan 2024 12:40:21 -0800 Subject: [PATCH 2/9] add checks for minimap2 --- lib/sequencing/example_test.go | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/lib/sequencing/example_test.go b/lib/sequencing/example_test.go index 19ee149..f80d12d 100644 --- a/lib/sequencing/example_test.go +++ b/lib/sequencing/example_test.go @@ -6,6 +6,7 @@ import ( "fmt" "log" "os" + "os/exec" "github.com/koeng101/dnadesign/external/minimap2" "github.com/koeng101/dnadesign/lib/bio" @@ -19,6 +20,12 @@ import ( ) func ExampleAmpliconAlignment() { + // Only run function if minimap2 is available + _, err := exec.LookPath("minimap2") + if err != nil { + fmt.Println("oligo2") + return + } // First, let's define the type we are looking for: amplicons in a pool. type Amplicon struct { Identifier string From 055800e92510cd0aa5c6fd2db3d48cede3321b57 Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Tue, 2 Jan 2024 12:40:38 -0800 Subject: [PATCH 3/9] Add external functions needed for sequencing example --- external/minimap2/minimap2.go | 55 +++++++++++++++++++++++++++++++++++ 1 file changed, 55 insertions(+) diff --git a/external/minimap2/minimap2.go b/external/minimap2/minimap2.go index 274c261..f6ae596 100644 --- a/external/minimap2/minimap2.go +++ b/external/minimap2/minimap2.go @@ -20,9 +20,15 @@ For more information on minimap2, please visit Heng Li's git: https://github.com package minimap2 import ( + "context" "io" "os" "os/exec" + + "github.com/koeng101/dnadesign/lib/bio" + "github.com/koeng101/dnadesign/lib/bio/fastq" + "github.com/koeng101/dnadesign/lib/bio/sam" + "golang.org/x/sync/errgroup" ) // Minimap2 aligns sequences using minimap2 over the command line. 
Right @@ -74,3 +80,52 @@ func Minimap2(templateFastaInput io.Reader, fastqInput io.Reader, w io.Writer) e return cmd.Wait() } + +// Minimap2Channeled uses channels rather than io.Reader and io.Writers. +func Minimap2Channeled(fastaTemplates io.Reader, fastqChan <-chan *fastq.Read, samChan chan<- *sam.Alignment) error { + ctx := context.Background() + g, ctx := errgroup.WithContext(ctx) + + // Create a pipe for writing fastq reads and reading them as an io.Reader + fastqPr, fastqPw := io.Pipe() + + // Goroutine to consume fastq reads and write them to the PipeWriter + g.Go(func() error { + defer fastqPw.Close() + for read := range fastqChan { + _, err := read.WriteTo(fastqPw) + if err != nil { + return err // return error to be handled by errgroup + } + } + return nil + }) + + // Create a pipe for SAM alignments. + samPr, samPw := io.Pipe() + + // Use Minimap2 function to process the reads and write SAM alignments. + g.Go(func() error { + defer samPw.Close() + return Minimap2(fastaTemplates, fastqPr, samPw) // Minimap2 writes to samPw + }) + + // Create a SAM parser from samPr (the PipeReader connected to Minimap2 output). + samParser, err := bio.NewSamParser(samPr) + if err != nil { + return err + } + + // Parsing SAM and sending to channel. + g.Go(func() error { + return samParser.ParseToChannel(ctx, samChan, false) + }) + + // Wait for all goroutines in the group to finish. 
+ if err := g.Wait(); err != nil { + return err // This will return the first non-nil error from the group of goroutines + } + + // At this point, all goroutines have finished successfully + return nil +} From 40c8db717fc0f5e193150d560e18ad56574405bc Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Fri, 5 Jan 2024 10:36:45 -0800 Subject: [PATCH 4/9] work in progress --- external/minimap2/minimap2.go | 2 +- lib/sequencing/example_test.go | 15 ++++++ lib/sequencing/nanopore/nanopore.go | 75 +++++++++++++++-------------- 3 files changed, 56 insertions(+), 36 deletions(-) diff --git a/external/minimap2/minimap2.go b/external/minimap2/minimap2.go index f6ae596..b0d8ffd 100644 --- a/external/minimap2/minimap2.go +++ b/external/minimap2/minimap2.go @@ -71,7 +71,7 @@ func Minimap2(templateFastaInput io.Reader, fastqInput io.Reader, w io.Writer) e tmpFile.Close() // Close the file as it's no longer needed // Start minimap2 pointing to the temporary file and stdin for sequencing data - cmd := exec.Command("minimap2", "-ax", "map-ont", tmpFile.Name(), "-") + cmd := exec.Command("minimap2", "-K", "100", "-ax", "map-ont", tmpFile.Name(), "-") cmd.Stdout = w cmd.Stdin = fastqInput if err := cmd.Start(); err != nil { diff --git a/lib/sequencing/example_test.go b/lib/sequencing/example_test.go index f80d12d..995d29e 100644 --- a/lib/sequencing/example_test.go +++ b/lib/sequencing/example_test.go @@ -126,3 +126,18 @@ func ExampleAmpliconAlignment() { fmt.Println(outputAlignments[0].RNAME) // Output: oligo2 } + +func RunWorkflow(errorGroup *errgroup.Group, functions []func() error) error { + for _, function := range functions { + errorGroup.Go(func() error { + return function() + }) + } + return nil +} + +func ExampleErrgroup() { + + fmt.Println("hi") + // Output: hi +} diff --git a/lib/sequencing/nanopore/nanopore.go b/lib/sequencing/nanopore/nanopore.go index 85c03af..86b4529 100644 --- a/lib/sequencing/nanopore/nanopore.go +++ b/lib/sequencing/nanopore/nanopore.go @@ -169,50 
+169,55 @@ func TrimBarcode(barcodeName string, fastqRead fastq.Read) (fastq.Read, error) { var quality string sequence = fastqRead.Sequence quality = fastqRead.Quality - score, alignment, err := Align(sequence[:80], barcode.Forward) - if err != nil { - return newFastqRead, err - } - // Empirically from looking around, it seems to me that approximately a - // score of 21 looks like a barcode match. This is almost by definition a - // magic number, so it is defined above as so. - if score >= ScoreMagicNumber { - modifiedAlignment := strings.ReplaceAll(alignment, "-", "") - index := strings.Index(sequence, modifiedAlignment) - // The index needs to be within the first 80 base pairs. Usually the - // native adapter + native barcode is within the first ~70 bp, but we - // put a small buffer. - if index != -1 { - // 7 here is a magic number between the native barcode and your - // target sequence. It's just a number that exists irl, so it is - // not defined above. - newStart := index + 7 - if newStart < len(sequence) { - sequence = sequence[newStart:] - quality = quality[newStart:] + if len(sequence) > 80 { + score, alignment, err := Align(sequence[:80], barcode.Forward) + if err != nil { + return newFastqRead, err + } + // Empirically from looking around, it seems to me that approximately a + // score of 21 looks like a barcode match. This is almost by definition a + // magic number, so it is defined above as so. + if score >= ScoreMagicNumber { + modifiedAlignment := strings.ReplaceAll(alignment, "-", "") + index := strings.Index(sequence, modifiedAlignment) + // The index needs to be within the first 80 base pairs. Usually the + // native adapter + native barcode is within the first ~70 bp, but we + // put a small buffer. + if index != -1 { + // 7 here is a magic number between the native barcode and your + // target sequence. It's just a number that exists irl, so it is + // not defined above. 
+ newStart := index + 7 + if newStart < len(sequence) { + sequence = sequence[newStart:] + quality = quality[newStart:] + } } } } + // Now do the same thing with the reverse strand. - score, alignment, err = Align(sequence[len(sequence)-80:], barcode.Reverse) - if err != nil { - return newFastqRead, err - } - if score >= ScoreMagicNumber { - modifiedAlignment := strings.ReplaceAll(alignment, "-", "") - index := strings.Index(sequence, modifiedAlignment) - // This time we need to check within the last 80 base pairs. - if index != -1 { - newEnd := index - 7 - if newEnd < len(sequence) { - sequence = sequence[:newEnd] - quality = quality[:newEnd] + if len(sequence) > 80 { + score, alignment, err := Align(sequence[len(sequence)-80:], barcode.Reverse) + if err != nil { + return newFastqRead, err + } + if score >= ScoreMagicNumber { + modifiedAlignment := strings.ReplaceAll(alignment, "-", "") + index := strings.Index(sequence, modifiedAlignment) + // This time we need to check within the last 80 base pairs. + if index != -1 { + newEnd := index - 7 + if newEnd < len(sequence) { + sequence = sequence[:newEnd] + quality = quality[:newEnd] + } } } } newFastqRead.Sequence = sequence newFastqRead.Quality = quality - return newFastqRead, err + return newFastqRead, nil } // Align uses the SmithWaterman algorithm to align a barcode to a sequence. 
From a2f6ed0c3dc6f3e92c0f6cbbfa575590ff3196a7 Mon Sep 17 00:00:00 2001 From: Koeng101 Date: Sun, 7 Jan 2024 10:24:20 -0800 Subject: [PATCH 5/9] add megamash (#50) * add megamash stuff --- lib/align/megamash/compress.go | 93 ++++++++++++++++++++++++ lib/align/megamash/megamash.go | 107 ++++++++++++++++++++++++++++ lib/align/megamash/megamash_test.go | 81 +++++++++++++++++++++ 3 files changed, 281 insertions(+) create mode 100644 lib/align/megamash/compress.go create mode 100644 lib/align/megamash/megamash.go create mode 100644 lib/align/megamash/megamash_test.go diff --git a/lib/align/megamash/compress.go b/lib/align/megamash/compress.go new file mode 100644 index 0000000..36a827e --- /dev/null +++ b/lib/align/megamash/compress.go @@ -0,0 +1,93 @@ +package megamash + +import "math" + +// CompressDNA takes a DNA sequence and converts it into a compressed byte slice. +func CompressDNA(dna string) []byte { + length := len(dna) + var lengthBytes []byte + var flag byte + + // Determine the size of the length field based on the sequence length + switch { + case length <= math.MaxUint8: + lengthBytes = []byte{byte(length)} + flag = 0x00 // flag for uint8 + case length <= math.MaxUint16: + lengthBytes = []byte{byte(length >> 8), byte(length)} + flag = 0x01 // flag for uint16 + case length <= math.MaxUint32: + lengthBytes = []byte{byte(length >> 24), byte(length >> 16), byte(length >> 8), byte(length)} + flag = 0x02 // flag for uint32 + default: + lengthBytes = []byte{byte(length >> 56), byte(length >> 48), byte(length >> 40), byte(length >> 32), byte(length >> 24), byte(length >> 16), byte(length >> 8), byte(length)} + flag = 0x03 // flag for uint64 + } + + // Encode DNA sequence + var dnaBytes []byte + var currentByte byte + for i, nucleotide := range dna { + switch nucleotide { + case 'A': // 00 + case 'T': // 01 + currentByte |= 1 << (6 - (i%4)*2) + case 'G': // 10 + currentByte |= 2 << (6 - (i%4)*2) + case 'C': // 11 + currentByte |= 3 << (6 - (i%4)*2) + } + if i%4 == 3 
|| i == length-1 { + dnaBytes = append(dnaBytes, currentByte) + currentByte = 0 + } + } + + // Combine all parts into the result + result := append([]byte{flag}, lengthBytes...) + result = append(result, dnaBytes...) + return result +} + +// DecompressDNA takes a compressed byte slice and converts it back to the original DNA string. +func DecompressDNA(compressed []byte) string { + flag := compressed[0] + var length int + var lengthBytes int + + switch flag { + case 0x00: + length = int(compressed[1]) + lengthBytes = 1 + case 0x01: + length = int(compressed[1])<<8 + int(compressed[2]) + lengthBytes = 2 + case 0x02: + length = int(compressed[1])<<24 + int(compressed[2])<<16 + int(compressed[3])<<8 + int(compressed[4]) + lengthBytes = 4 + case 0x03: + length = int(compressed[1])<<56 + int(compressed[2])<<48 + int(compressed[3])<<40 + int(compressed[4])<<32 + int(compressed[5])<<24 + int(compressed[6])<<16 + int(compressed[7])<<8 + int(compressed[8]) + lengthBytes = 8 + default: + return "Invalid flag" + } + + // Decode DNA sequence + var dna string + for i, b := range compressed[1+lengthBytes:] { + for j := 0; j < 4 && 4*i+j < length; j++ { + switch (b >> (6 - j*2)) & 0x03 { + case 0: + dna += "A" + case 1: + dna += "T" + case 2: + dna += "G" + case 3: + dna += "C" + } + } + } + + return dna +} diff --git a/lib/align/megamash/megamash.go b/lib/align/megamash/megamash.go new file mode 100644 index 0000000..8b28345 --- /dev/null +++ b/lib/align/megamash/megamash.go @@ -0,0 +1,107 @@ +/* +Package megamash is an implementation of the megamash algorithm. + +Megamash is an algorithm developed by Keoni Gandall to find templates from +sequencing reactions. For example, you may have a pool of amplicons, and need +to get a count of how many times each amplicon shows up in a given sequencing +reaction. 
+*/ +package megamash + +import ( + "encoding/base64" + + "github.com/koeng101/dnadesign/lib/transform" +) + +// StandardizedCompressedDNA returns the CompressedDNA byte string +func StandardizedCompressedDNA(sequence string) []byte { + var deterministicSequence string + reverseComplement := transform.ReverseComplement(sequence) + if sequence > reverseComplement { + deterministicSequence = reverseComplement + } else { + deterministicSequence = sequence + } + return CompressDNA(deterministicSequence) +} + +type MegamashMap []map[string]bool + +func MakeMegamashMap(sequences []string, kmerSize uint) MegamashMap { + var megamashMap MegamashMap + for _, sequence := range sequences { + // First get all kmers with a given sequence + kmerMap := make(map[string]bool) + for i := 0; i <= len(sequence)-int(kmerSize); i++ { + kmerBytes := StandardizedCompressedDNA(sequence[i : i+int(kmerSize)]) + kmerBase64 := base64.StdEncoding.EncodeToString(kmerBytes) + kmerMap[kmerBase64] = true + } + + // Then, get unique kmers for this sequence and only this sequence + uniqueKmerMap := make(map[string]bool) + for kmerBase64 := range kmerMap { + unique := true + for _, otherMegaMashMap := range megamashMap { + _, ok := otherMegaMashMap[kmerBase64] + // If this kmer is found, set both to false + if ok { + otherMegaMashMap[kmerBase64] = false + unique = false + break + } + } + if unique { + uniqueKmerMap[kmerBase64] = true + } + } + + // Now we have a unique Kmer map for the given sequence. + // Add it to megamashMap + megamashMap = append(megamashMap, uniqueKmerMap) + } + return megamashMap +} + +func (m *MegamashMap) Score(sequence string) []float64 { + var scores []float64 + // The algorithm is as follows: + // - Go through each map. 
+ // - Get the number of matching kmers + // - Divide that by the total kmers available for matching + + // First, get the kmer total + var kmerSize int +out: + for _, maps := range *m { + for kmer := range maps { + decodedBytes, _ := base64.StdEncoding.DecodeString(kmer) + sequence := DecompressDNA(decodedBytes) + kmerSize = len(sequence) + break out + } + } + + // Now, iterate through each map + for _, sequenceMap := range *m { + var score float64 + var totalKmers = len(sequenceMap) + var matchedKmers int + for i := 0; i <= len(sequence)-kmerSize; i++ { + kmerBytes := StandardizedCompressedDNA(sequence[i : i+kmerSize]) + kmerBase64 := base64.StdEncoding.EncodeToString(kmerBytes) + unique, ok := sequenceMap[kmerBase64] + if ok && unique { + matchedKmers++ + } + } + if totalKmers == 0 { + score = 0 + } else { + score = float64(matchedKmers) / float64(totalKmers) + } + scores = append(scores, score) + } + return scores +} diff --git a/lib/align/megamash/megamash_test.go b/lib/align/megamash/megamash_test.go new file mode 100644 index 0000000..8cb2392 --- /dev/null +++ b/lib/align/megamash/megamash_test.go @@ -0,0 +1,81 @@ +package megamash + +import ( + "testing" + + "github.com/koeng101/dnadesign/lib/random" +) + +func TestCompressDNA(t *testing.T) { + // Define test cases + longDna, _ := random.DNASequence(300, 0) + longerDna, _ := random.DNASequence(66000, 0) + tests := []struct { + name string + dna string + expectedLen int // Expected length of the compressed data + expectedFlag byte // Expected flag byte + }{ + {"Empty", "", 2, 0x00}, + {"Short", "ATGC", 3, 0x00}, + {"Medium", "ATGCGTATGCCGTAGC", 6, 0x00}, + {"Long", longDna, 78, 0x01}, + {"Longest", longerDna, 16505, 0x02}, + // Add more test cases for longer sequences and edge cases + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + compressed := CompressDNA(tc.dna) + if len(compressed) != tc.expectedLen { + t.Errorf("CompressDNA() with input %s, expected length %d, got %d", "", 
tc.expectedLen, len(compressed)) + } + if compressed[0] != tc.expectedFlag { + t.Errorf("CompressDNA() with input %s, expected flag %b, got %b", tc.dna, tc.expectedFlag, compressed[0]) + } + }) + } +} + +func TestDecompressDNA(t *testing.T) { + longDna, _ := random.DNASequence(300, 0) + longerDna, _ := random.DNASequence(66000, 0) + // Define test cases + tests := []struct { + name string + dna string + expected string + }{ + {"Empty", "", ""}, + {"Short", "ATGC", "ATGC"}, + {"Medium", "ATGCGTATGCCGTAGC", "ATGCGTATGCCGTAGC"}, + {"Long", longDna, longDna}, + {"Longest", longerDna, longerDna}, + // Add more test cases as needed + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + compressed := CompressDNA(tc.dna) + decompressed := DecompressDNA(compressed) + if decompressed != tc.expected { + t.Errorf("DecompressDNA() with input %v, expected %s, got %s", compressed, tc.expected, decompressed) + } + }) + } +} + +func TestMegamash(t *testing.T) { + oligo1 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTCTCAATGACCAAACCAACGCAAGTCTTAGTTCGTTCAGTCTCTATTTTATTCTTCATCACACTGTTGCACTTGGTTGTTGCAATGAGATTTCCTAGTATTTTCACTGCTGTGCTGAGACCCGGATCGAACTTAGGTAGCCT" + oligo2 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCACTAGTCATAAT" + oligo3 := "CCGTGCGACAAGATTTCAAGGGTCTCTCTTCTATCGCAGCCAAGGAAGAAGGTGTATCTCTAGAGAAGCGTCGAGTGAGACCCGGATCGAACTTAGGTAGCCCCCTTCGAAGTGGCTCTGTCTGATCCTCCGCGGATGGCGACACCATCGGACTGAGGATATTGGCCACA" + + samples := []string{"TTTTGTCTACTTCGTTCCGTTGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGACGGCGCCTCCGTGCGACGAGATTTCAAGGGTCTCTGTGCTATATTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCAGATCGACTTTTAGATTCCTCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTGG", 
"TGTCCTTTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTACTTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTATCTGAGACCGAAGTGGTTTGCCTAAACGCAGGTGCTGTTGGCAAAGGCAGAAAGTAGTCTTAACCTTGACAATGAGTGGTA", "GTTATTGTCGTCTCCTTTGACTCAGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTGCTGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTCCGCTTCTATCTGAGACCGAAGTGGTTAT", "TGTTCTGTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTCTGCCTTAGAGACCACGCCTCCGTGCGACAAGATTCAAGGGTCTCTGTGCTCTGCCGCTAGTTCCGCTCTAGCTGCTCCGGTATGCATCTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAACTGTTGGTT"} + m := MakeMegamashMap([]string{oligo1, oligo2, oligo3}, 16) + for _, sample := range samples { + scores := m.Score(sample) + if scores[1] < 0.5 { + t.Errorf("Score for oligo2 should be above 0.5") + } + } +} From 13018ce9bc376cf09033ed3ace119a7c2bd5dc09 Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Tue, 9 Jan 2024 12:44:12 -0800 Subject: [PATCH 6/9] changed how generics work --- lib/align/megamash/compress.go | 93 ------------------------- lib/align/megamash/megamash.go | 103 +++++++++++++++++++++------- lib/align/megamash/megamash_test.go | 86 ++++++----------------- lib/bio/bio.go | 83 ++++++++++++---------- lib/bio/example_test.go | 27 +++++--- lib/bio/fasta/example_test.go | 2 +- lib/bio/fasta/fasta.go | 16 ++--- lib/bio/fasta/fasta_test.go | 4 +- lib/bio/fastq/fastq.go | 26 +++---- lib/bio/genbank/genbank.go | 34 ++++----- lib/bio/iowriter_test.go | 24 +++++++ lib/bio/pileup/pileup.go | 24 +++---- lib/bio/pileup/pileup_test.go | 2 +- lib/bio/sam/sam.go | 22 +++--- lib/bio/slow5/example_test.go | 2 +- lib/bio/slow5/slow5.go | 42 ++++++------ lib/bio/slow5/slow5_test.go | 8 +-- lib/bio/uniprot/uniprot.go | 20 +++--- 18 files changed, 290 insertions(+), 328 deletions(-) delete mode 100644 
lib/align/megamash/compress.go create mode 100644 lib/bio/iowriter_test.go diff --git a/lib/align/megamash/compress.go b/lib/align/megamash/compress.go deleted file mode 100644 index 36a827e..0000000 --- a/lib/align/megamash/compress.go +++ /dev/null @@ -1,93 +0,0 @@ -package megamash - -import "math" - -// CompressDNA takes a DNA sequence and converts it into a compressed byte slice. -func CompressDNA(dna string) []byte { - length := len(dna) - var lengthBytes []byte - var flag byte - - // Determine the size of the length field based on the sequence length - switch { - case length <= math.MaxUint8: - lengthBytes = []byte{byte(length)} - flag = 0x00 // flag for uint8 - case length <= math.MaxUint16: - lengthBytes = []byte{byte(length >> 8), byte(length)} - flag = 0x01 // flag for uint16 - case length <= math.MaxUint32: - lengthBytes = []byte{byte(length >> 24), byte(length >> 16), byte(length >> 8), byte(length)} - flag = 0x02 // flag for uint32 - default: - lengthBytes = []byte{byte(length >> 56), byte(length >> 48), byte(length >> 40), byte(length >> 32), byte(length >> 24), byte(length >> 16), byte(length >> 8), byte(length)} - flag = 0x03 // flag for uint64 - } - - // Encode DNA sequence - var dnaBytes []byte - var currentByte byte - for i, nucleotide := range dna { - switch nucleotide { - case 'A': // 00 - case 'T': // 01 - currentByte |= 1 << (6 - (i%4)*2) - case 'G': // 10 - currentByte |= 2 << (6 - (i%4)*2) - case 'C': // 11 - currentByte |= 3 << (6 - (i%4)*2) - } - if i%4 == 3 || i == length-1 { - dnaBytes = append(dnaBytes, currentByte) - currentByte = 0 - } - } - - // Combine all parts into the result - result := append([]byte{flag}, lengthBytes...) - result = append(result, dnaBytes...) - return result -} - -// DecompressDNA takes a compressed byte slice and converts it back to the original DNA string. 
-func DecompressDNA(compressed []byte) string { - flag := compressed[0] - var length int - var lengthBytes int - - switch flag { - case 0x00: - length = int(compressed[1]) - lengthBytes = 1 - case 0x01: - length = int(compressed[1])<<8 + int(compressed[2]) - lengthBytes = 2 - case 0x02: - length = int(compressed[1])<<24 + int(compressed[2])<<16 + int(compressed[3])<<8 + int(compressed[4]) - lengthBytes = 4 - case 0x03: - length = int(compressed[1])<<56 + int(compressed[2])<<48 + int(compressed[3])<<40 + int(compressed[4])<<32 + int(compressed[5])<<24 + int(compressed[6])<<16 + int(compressed[7])<<8 + int(compressed[8]) - lengthBytes = 8 - default: - return "Invalid flag" - } - - // Decode DNA sequence - var dna string - for i, b := range compressed[1+lengthBytes:] { - for j := 0; j < 4 && 4*i+j < length; j++ { - switch (b >> (6 - j*2)) & 0x03 { - case 0: - dna += "A" - case 1: - dna += "T" - case 2: - dna += "G" - case 3: - dna += "C" - } - } - } - - return dna -} diff --git a/lib/align/megamash/megamash.go b/lib/align/megamash/megamash.go index 8b28345..86e12d5 100644 --- a/lib/align/megamash/megamash.go +++ b/lib/align/megamash/megamash.go @@ -9,13 +9,16 @@ reaction. 
package megamash import ( - "encoding/base64" + "context" + "fmt" + "github.com/koeng101/dnadesign/lib/bio/fasta" + "github.com/koeng101/dnadesign/lib/bio/fastq" "github.com/koeng101/dnadesign/lib/transform" ) -// StandardizedCompressedDNA returns the CompressedDNA byte string -func StandardizedCompressedDNA(sequence string) []byte { +// StandardizedDNA returns a standardized DNA string +func StandardizedDNA(sequence string) string { var deterministicSequence string reverseComplement := transform.ReverseComplement(sequence) if sequence > reverseComplement { @@ -23,27 +26,45 @@ func StandardizedCompressedDNA(sequence string) []byte { } else { deterministicSequence = sequence } - return CompressDNA(deterministicSequence) + return deterministicSequence } -type MegamashMap []map[string]bool +var ( + DefaultKmerSize uint = 16 + DefaultMinimalKmerCount uint = 10 + DefaultScoreThreshold float64 = 0.2 +) + +type MegamashMap struct { + Identifiers []string + Kmers []map[string]bool + KmerSize uint + KmerMinimalCount uint + Threshold float64 +} -func MakeMegamashMap(sequences []string, kmerSize uint) MegamashMap { +func NewMegamashMap(sequences []fasta.Record, kmerSize uint, kmerMinimalCount uint, threshold float64) (MegamashMap, error) { var megamashMap MegamashMap - for _, sequence := range sequences { + megamashMap.KmerSize = kmerSize + megamashMap.KmerMinimalCount = kmerMinimalCount + megamashMap.Threshold = threshold + + for _, fastaRecord := range sequences { + megamashMap.Identifiers = append(megamashMap.Identifiers, fastaRecord.Identifier) + sequence := fastaRecord.Sequence + // First get all kmers with a given sequence kmerMap := make(map[string]bool) for i := 0; i <= len(sequence)-int(kmerSize); i++ { - kmerBytes := StandardizedCompressedDNA(sequence[i : i+int(kmerSize)]) - kmerBase64 := base64.StdEncoding.EncodeToString(kmerBytes) - kmerMap[kmerBase64] = true + kmerString := StandardizedDNA(sequence[i : i+int(kmerSize)]) + kmerMap[kmerString] = true } // Then, get 
unique kmers for this sequence and only this sequence uniqueKmerMap := make(map[string]bool) for kmerBase64 := range kmerMap { unique := true - for _, otherMegaMashMap := range megamashMap { + for _, otherMegaMashMap := range megamashMap.Kmers { _, ok := otherMegaMashMap[kmerBase64] // If this kmer is found, set both to false if ok { @@ -56,15 +77,30 @@ func MakeMegamashMap(sequences []string, kmerSize uint) MegamashMap { uniqueKmerMap[kmerBase64] = true } } + // Check if we have the minimal kmerCount + var kmerCount uint = 0 + for _, unique := range uniqueKmerMap { + if unique { + kmerCount++ + } + } + if kmerCount < kmerMinimalCount { + return megamashMap, fmt.Errorf("Got only %d unique kmers of required %d for sequence %s", kmerCount, kmerMinimalCount, fastaRecord.Identifier) + } // Now we have a unique Kmer map for the given sequence. // Add it to megamashMap - megamashMap = append(megamashMap, uniqueKmerMap) + megamashMap.Kmers = append(megamashMap.Kmers, uniqueKmerMap) } - return megamashMap + return megamashMap, nil } -func (m *MegamashMap) Score(sequence string) []float64 { +type Match struct { + Identifier string + Score float64 +} + +func (m *MegamashMap) Match(sequence string) []Match { var scores []float64 // The algorithm is as follows: // - Go through each map. 
@@ -74,24 +110,21 @@ func (m *MegamashMap) Score(sequence string) []float64 { // First, get the kmer total var kmerSize int out: - for _, maps := range *m { + for _, maps := range m.Kmers { for kmer := range maps { - decodedBytes, _ := base64.StdEncoding.DecodeString(kmer) - sequence := DecompressDNA(decodedBytes) - kmerSize = len(sequence) + kmerSize = len(kmer) break out } } // Now, iterate through each map - for _, sequenceMap := range *m { + for _, sequenceMap := range m.Kmers { var score float64 var totalKmers = len(sequenceMap) var matchedKmers int for i := 0; i <= len(sequence)-kmerSize; i++ { - kmerBytes := StandardizedCompressedDNA(sequence[i : i+kmerSize]) - kmerBase64 := base64.StdEncoding.EncodeToString(kmerBytes) - unique, ok := sequenceMap[kmerBase64] + kmerString := StandardizedDNA(sequence[i : i+kmerSize]) + unique, ok := sequenceMap[kmerString] if ok && unique { matchedKmers++ } @@ -103,5 +136,29 @@ out: } scores = append(scores, score) } - return scores + + var matches []Match + for i, score := range scores { + if score > m.Threshold { + matches = append(matches, Match{Identifier: m.Identifiers[i], Score: score}) + } + } + return matches +} + +func (m *MegamashMap) FastqMatchChannel(ctx context.Context, sequences <-chan fastq.Read, matches chan<- []Match) error { + for { + select { + case <-ctx.Done(): + // Clean up resources, handle cancellation. 
+ return ctx.Err() + case sequence, ok := <-sequences: + if !ok { + close(matches) + return nil + } + sequenceMatches := m.Match(sequence.Sequence) + matches <- sequenceMatches + } + } } diff --git a/lib/align/megamash/megamash_test.go b/lib/align/megamash/megamash_test.go index 8cb2392..b154742 100644 --- a/lib/align/megamash/megamash_test.go +++ b/lib/align/megamash/megamash_test.go @@ -3,79 +3,37 @@ package megamash import ( "testing" - "github.com/koeng101/dnadesign/lib/random" + "github.com/koeng101/dnadesign/lib/bio/fasta" ) -func TestCompressDNA(t *testing.T) { - // Define test cases - longDna, _ := random.DNASequence(300, 0) - longerDna, _ := random.DNASequence(66000, 0) - tests := []struct { - name string - dna string - expectedLen int // Expected length of the compressed data - expectedFlag byte // Expected flag byte - }{ - {"Empty", "", 2, 0x00}, - {"Short", "ATGC", 3, 0x00}, - {"Medium", "ATGCGTATGCCGTAGC", 6, 0x00}, - {"Long", longDna, 78, 0x01}, - {"Longest", longerDna, 16505, 0x02}, - // Add more test cases for longer sequences and edge cases - } - - for _, tc := range tests { - t.Run(tc.name, func(t *testing.T) { - compressed := CompressDNA(tc.dna) - if len(compressed) != tc.expectedLen { - t.Errorf("CompressDNA() with input %s, expected length %d, got %d", "", tc.expectedLen, len(compressed)) - } - if compressed[0] != tc.expectedFlag { - t.Errorf("CompressDNA() with input %s, expected flag %b, got %b", tc.dna, tc.expectedFlag, compressed[0]) - } - }) - } -} - -func TestDecompressDNA(t *testing.T) { - longDna, _ := random.DNASequence(300, 0) - longerDna, _ := random.DNASequence(66000, 0) - // Define test cases - tests := []struct { - name string - dna string - expected string - }{ - {"Empty", "", ""}, - {"Short", "ATGC", "ATGC"}, - {"Medium", "ATGCGTATGCCGTAGC", "ATGCGTATGCCGTAGC"}, - {"Long", longDna, longDna}, - {"Longest", longerDna, longerDna}, - // Add more test cases as needed - } - - for _, tc := range tests { - t.Run(tc.name, func(t 
*testing.T) { - compressed := CompressDNA(tc.dna) - decompressed := DecompressDNA(compressed) - if decompressed != tc.expected { - t.Errorf("DecompressDNA() with input %v, expected %s, got %s", compressed, tc.expected, decompressed) - } - }) - } -} - func TestMegamash(t *testing.T) { oligo1 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTCTCAATGACCAAACCAACGCAAGTCTTAGTTCGTTCAGTCTCTATTTTATTCTTCATCACACTGTTGCACTTGGTTGTTGCAATGAGATTTCCTAGTATTTTCACTGCTGTGCTGAGACCCGGATCGAACTTAGGTAGCCT" oligo2 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCACTAGTCATAAT" oligo3 := "CCGTGCGACAAGATTTCAAGGGTCTCTCTTCTATCGCAGCCAAGGAAGAAGGTGTATCTCTAGAGAAGCGTCGAGTGAGACCCGGATCGAACTTAGGTAGCCCCCTTCGAAGTGGCTCTGTCTGATCCTCCGCGGATGGCGACACCATCGGACTGAGGATATTGGCCACA" samples := []string{"TTTTGTCTACTTCGTTCCGTTGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGACGGCGCCTCCGTGCGACGAGATTTCAAGGGTCTCTGTGCTATATTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCAGATCGACTTTTAGATTCCTCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTGG", "TGTCCTTTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTACTTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTATCTGAGACCGAAGTGGTTTGCCTAAACGCAGGTGCTGTTGGCAAAGGCAGAAAGTAGTCTTAACCTTGACAATGAGTGGTA", "GTTATTGTCGTCTCCTTTGACTCAGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTGCTGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTCCGCTTCTATCTGAGACCGAAGTGGTTAT", "TGTTCTGTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTCTGCCTTAGAGACCACGCCTCCGTGCGACAAGATTCAAGGGTCTCTGTGCTCTGCCGCTAGTTCCGCTCTAGCTGCTCCGGTATGCATCTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAACTGTTGGTT"} - m := MakeMegamashMap([]string{oligo1, oligo2, oligo3}, 16) + m, err := 
NewMegamashMap([]fasta.Record{fasta.Record{Sequence: oligo1, Identifier: "oligo1"}, fasta.Record{Sequence: oligo2, Identifier: "oligo2"}, fasta.Record{Sequence: oligo3, Identifier: "oligo3"}}, DefaultKmerSize, DefaultMinimalKmerCount, DefaultScoreThreshold) + if err != nil { + t.Errorf("Failed to make NewMegamashMap: %s", err) + } for _, sample := range samples { - scores := m.Score(sample) - if scores[1] < 0.5 { - t.Errorf("Score for oligo2 should be above 0.5") + scores := m.Match(sample) + if scores[0].Identifier != "oligo2" { + t.Errorf("Should have gotten oligo2. Got: %s", scores[0].Identifier) + } + } +} + +func BenchmarkMegamash(b *testing.B) { + for i := 0; i < b.N; i++ { + oligo1 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTCTCAATGACCAAACCAACGCAAGTCTTAGTTCGTTCAGTCTCTATTTTATTCTTCATCACACTGTTGCACTTGGTTGTTGCAATGAGATTTCCTAGTATTTTCACTGCTGTGCTGAGACCCGGATCGAACTTAGGTAGCCT" + oligo2 := "CCGTGCGACAAGATTTCAAGGGTCTCTGTGCTATTTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCACTAGTCATAAT" + oligo3 := "CCGTGCGACAAGATTTCAAGGGTCTCTCTTCTATCGCAGCCAAGGAAGAAGGTGTATCTCTAGAGAAGCGTCGAGTGAGACCCGGATCGAACTTAGGTAGCCCCCTTCGAAGTGGCTCTGTCTGATCCTCCGCGGATGGCGACACCATCGGACTGAGGATATTGGCCACA" + + samples := []string{"TTTTGTCTACTTCGTTCCGTTGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGACGGCGCCTCCGTGCGACGAGATTTCAAGGGTCTCTGTGCTATATTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCAGATCGACTTTTAGATTCCTCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTGG", "TGTCCTTTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTACTTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTATCTGAGACCGAAGTGGTTTGCCTAAACGCAGGTGCTGTTGGCAAAGGCAGAAAGTAGTCTTAACCTTGACAATGAGTGGTA", 
"GTTATTGTCGTCTCCTTTGACTCAGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTGCTGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTCCGCTTCTATCTGAGACCGAAGTGGTTAT", "TGTTCTGTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTCTGCCTTAGAGACCACGCCTCCGTGCGACAAGATTCAAGGGTCTCTGTGCTCTGCCGCTAGTTCCGCTCTAGCTGCTCCGGTATGCATCTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAACTGTTGGTT"} + m, _ := NewMegamashMap([]fasta.Record{fasta.Record{Sequence: oligo1, Identifier: "oligo1"}, fasta.Record{Sequence: oligo2, Identifier: "oligo2"}, fasta.Record{Sequence: oligo3, Identifier: "oligo3"}}, DefaultKmerSize, DefaultMinimalKmerCount, DefaultScoreThreshold) + for _, sample := range samples { + _ = m.Match(sample) } } } diff --git a/lib/bio/bio.go b/lib/bio/bio.go index 94b6d04..bbe6369 100644 --- a/lib/bio/bio.go +++ b/lib/bio/bio.go @@ -61,21 +61,27 @@ Lower level interfaces ******************************************************************************/ -// parserInterface is a generic interface that all parsers must support. It is +// DataTypes defines the possible data types returned by every parser. +type DataTypes interface { + genbank.Genbank | fasta.Record | fastq.Read | slow5.Read | sam.Alignment | pileup.Line | uniprot.Entry +} + +// HeaderTypes defines the possible header types returned by every parser. +type HeaderTypes interface { + genbank.Header | fasta.Header | fastq.Header | slow5.Header | sam.Header | pileup.Header | uniprot.Header +} + +// ParserInterface is a generic interface that all parsers must support. It is // very simple, only requiring two functions, Header() and Next(). Header() // returns the header of the file if there is one: most files, like fasta, // fastq, and pileup do not contain headers, while others like sam and slow5 do // have headers. Next() returns a record/read/line from the file format, and // terminates on an io.EOF error. 
// -// Next() may terminate with an io.EOF error with a nil Data or with a -// full Data, depending on where the EOF is in the actual file. A check -// for this is needed at the last Next(), when it returns an io.EOF error. A -// pointer is used to represent the difference between a null Data and an -// empty Data. -type parserInterface[Data io.WriterTo, Header io.WriterTo] interface { +// Next() terminates at io.EOF with an empty Data struct. +type ParserInterface[Data DataTypes, Header HeaderTypes] interface { Header() (Header, error) - Next() (Data, error) + Next() (Data, error) // Returns io.EOF on end of file } /****************************************************************************** @@ -87,82 +93,82 @@ Higher level parse // Parser is generic bioinformatics file parser. It contains a LowerLevelParser // and implements useful functions on top of it: such as Parse(), ParseToChannel(), and // ParseWithHeader(). -type Parser[Data io.WriterTo, Header io.WriterTo] struct { - parserInterface parserInterface[Data, Header] +type Parser[Data DataTypes, Header HeaderTypes] struct { + ParserInterface ParserInterface[Data, Header] } // NewFastaParser initiates a new FASTA parser from an io.Reader. -func NewFastaParser(r io.Reader) *Parser[*fasta.Record, *fasta.Header] { +func NewFastaParser(r io.Reader) *Parser[fasta.Record, fasta.Header] { return NewFastaParserWithMaxLineLength(r, DefaultMaxLengths[Fasta]) } // NewFastaParserWithMaxLineLength initiates a new FASTA parser from an // io.Reader and a user-given maxLineLength. 
-func NewFastaParserWithMaxLineLength(r io.Reader, maxLineLength int) *Parser[*fasta.Record, *fasta.Header] { - return &Parser[*fasta.Record, *fasta.Header]{parserInterface: fasta.NewParser(r, maxLineLength)} +func NewFastaParserWithMaxLineLength(r io.Reader, maxLineLength int) *Parser[fasta.Record, fasta.Header] { + return &Parser[fasta.Record, fasta.Header]{ParserInterface: fasta.NewParser(r, maxLineLength)} } // NewFastqParser initiates a new FASTQ parser from an io.Reader. -func NewFastqParser(r io.Reader) *Parser[*fastq.Read, *fastq.Header] { +func NewFastqParser(r io.Reader) *Parser[fastq.Read, fastq.Header] { return NewFastqParserWithMaxLineLength(r, DefaultMaxLengths[Fastq]) } // NewFastqParserWithMaxLineLength initiates a new FASTQ parser from an // io.Reader and a user-given maxLineLength. -func NewFastqParserWithMaxLineLength(r io.Reader, maxLineLength int) *Parser[*fastq.Read, *fastq.Header] { - return &Parser[*fastq.Read, *fastq.Header]{parserInterface: fastq.NewParser(r, maxLineLength)} +func NewFastqParserWithMaxLineLength(r io.Reader, maxLineLength int) *Parser[fastq.Read, fastq.Header] { + return &Parser[fastq.Read, fastq.Header]{ParserInterface: fastq.NewParser(r, maxLineLength)} } // NewGenbankParser initiates a new Genbank parser form an io.Reader. -func NewGenbankParser(r io.Reader) *Parser[*genbank.Genbank, *genbank.Header] { +func NewGenbankParser(r io.Reader) *Parser[genbank.Genbank, genbank.Header] { return NewGenbankParserWithMaxLineLength(r, DefaultMaxLengths[Genbank]) } // NewGenbankParserWithMaxLineLength initiates a new Genbank parser from an // io.Reader and a user-given maxLineLength. 
-func NewGenbankParserWithMaxLineLength(r io.Reader, maxLineLength int) *Parser[*genbank.Genbank, *genbank.Header] { - return &Parser[*genbank.Genbank, *genbank.Header]{parserInterface: genbank.NewParser(r, maxLineLength)} +func NewGenbankParserWithMaxLineLength(r io.Reader, maxLineLength int) *Parser[genbank.Genbank, genbank.Header] { + return &Parser[genbank.Genbank, genbank.Header]{ParserInterface: genbank.NewParser(r, maxLineLength)} } // NewSlow5Parser initiates a new SLOW5 parser from an io.Reader. -func NewSlow5Parser(r io.Reader) (*Parser[*slow5.Read, *slow5.Header], error) { +func NewSlow5Parser(r io.Reader) (*Parser[slow5.Read, slow5.Header], error) { return NewSlow5ParserWithMaxLineLength(r, DefaultMaxLengths[Slow5]) } // NewSlow5ParserWithMaxLineLength initiates a new SLOW5 parser from an // io.Reader and a user-given maxLineLength. -func NewSlow5ParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[*slow5.Read, *slow5.Header], error) { +func NewSlow5ParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[slow5.Read, slow5.Header], error) { parser, err := slow5.NewParser(r, maxLineLength) - return &Parser[*slow5.Read, *slow5.Header]{parserInterface: parser}, err + return &Parser[slow5.Read, slow5.Header]{ParserInterface: parser}, err } // NewSamParser initiates a new SAM parser from an io.Reader. -func NewSamParser(r io.Reader) (*Parser[*sam.Alignment, *sam.Header], error) { +func NewSamParser(r io.Reader) (*Parser[sam.Alignment, sam.Header], error) { return NewSamParserWithMaxLineLength(r, DefaultMaxLengths[Sam]) } // NewSamParserWithMaxLineLength initiates a new SAM parser from an io.Reader // and a user-given maxLineLength. 
-func NewSamParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[*sam.Alignment, *sam.Header], error) { +func NewSamParserWithMaxLineLength(r io.Reader, maxLineLength int) (*Parser[sam.Alignment, sam.Header], error) { parser, _, err := sam.NewParser(r, maxLineLength) - return &Parser[*sam.Alignment, *sam.Header]{parserInterface: parser}, err + return &Parser[sam.Alignment, sam.Header]{ParserInterface: parser}, err } // NewPileupParser initiates a new Pileup parser from an io.Reader. -func NewPileupParser(r io.Reader) *Parser[*pileup.Line, *pileup.Header] { +func NewPileupParser(r io.Reader) *Parser[pileup.Line, pileup.Header] { return NewPileupParserWithMaxLineLength(r, DefaultMaxLengths[Pileup]) } // NewPileupParserWithMaxLineLength initiates a new Pileup parser from an // io.Reader and a user-given maxLineLength. -func NewPileupParserWithMaxLineLength(r io.Reader, maxLineLength int) *Parser[*pileup.Line, *pileup.Header] { - return &Parser[*pileup.Line, *pileup.Header]{parserInterface: pileup.NewParser(r, maxLineLength)} +func NewPileupParserWithMaxLineLength(r io.Reader, maxLineLength int) *Parser[pileup.Line, pileup.Header] { + return &Parser[pileup.Line, pileup.Header]{ParserInterface: pileup.NewParser(r, maxLineLength)} } // NewUniprotParser initiates a new Uniprot parser from an io.Reader. No // maxLineLength is necessary. -func NewUniprotParser(r io.Reader) *Parser[*uniprot.Entry, *uniprot.Header] { - return &Parser[*uniprot.Entry, *uniprot.Header]{parserInterface: uniprot.NewParser(r)} +func NewUniprotParser(r io.Reader) *Parser[uniprot.Entry, uniprot.Header] { + return &Parser[uniprot.Entry, uniprot.Header]{ParserInterface: uniprot.NewParser(r)} } /****************************************************************************** @@ -179,7 +185,7 @@ Parser higher-level functions // records in a file, as the parser reads the underlying io.Reader in a // straight line. 
func (p *Parser[Data, Header]) Next() (Data, error) { - return p.parserInterface.Next() + return p.ParserInterface.Next() } // Header is a parsing primitive that should be used when low-level control is @@ -198,7 +204,7 @@ func (p *Parser[Data, Header]) Next() (Data, error) { // // SLOW5 func (p *Parser[Data, Header]) Header() (Header, error) { - return p.parserInterface.Header() + return p.ParserInterface.Header() } // ParseN returns a countN number of records/reads/lines from the parser. @@ -255,7 +261,7 @@ Concurrent higher-level functions // Context can be used to close the parser in the middle of parsing - for // example, if an error is found in another parser elsewhere and all files // need to close. -func (p *Parser[Data, Header]) ParseToChannel(ctx context.Context, channel chan<- Data, keepChannelOpen bool) error { +func (p *Parser[DataTypes, HeaderTypes]) ParseToChannel(ctx context.Context, channel chan<- DataTypes, keepChannelOpen bool) error { for { select { case <-ctx.Done(): @@ -280,7 +286,7 @@ func (p *Parser[Data, Header]) ParseToChannel(ctx context.Context, channel chan< // functions. It properly does concurrent parsing of many parsers to a // single channel, then closes that channel. If any of the files fail to // parse, the entire pipeline exits and returns. -func ManyToChannel[Data io.WriterTo, Header io.WriterTo](ctx context.Context, channel chan<- Data, parsers ...*Parser[Data, Header]) error { +func ManyToChannel[Data DataTypes, Header HeaderTypes](ctx context.Context, channel chan<- Data, parsers ...*Parser[Data, Header]) error { errorGroup, ctx := errgroup.WithContext(ctx) // For each parser, start a new goroutine to parse data to the channel for _, p := range parsers { @@ -299,16 +305,21 @@ func ManyToChannel[Data io.WriterTo, Header io.WriterTo](ctx context.Context, ch // FilterData is a generic function that implements a channel filter. 
Users // give an input and output channel, with a filtering function, and FilterData // filters data from the input into the output. -func FilterData[Data *genbank.Genbank | *fasta.Record | *fastq.Read | *slow5.Read | *sam.Alignment | *pileup.Line | *uniprot.Entry](input <-chan Data, output chan<- Data, filter func(a Data) bool) { +func FilterData[Data DataTypes](ctx context.Context, input <-chan Data, output chan<- Data, filter func(a Data) bool) error { for { select { + case <-ctx.Done(): + close(output) + return ctx.Err() + case data, ok := <-input: if !ok { // If the input channel is closed, we close the output channel and return close(output) - return + return nil // returning nil as the input channel being closed is a normal completion signal } if filter(data) { + // Only send data through if it passes the filter. output <- data } } diff --git a/lib/bio/example_test.go b/lib/bio/example_test.go index eb6cecf..d1620d1 100644 --- a/lib/bio/example_test.go +++ b/lib/bio/example_test.go @@ -11,6 +11,7 @@ import ( "github.com/koeng101/dnadesign/lib/bio" "github.com/koeng101/dnadesign/lib/bio/fasta" "github.com/koeng101/dnadesign/lib/bio/sam" + "golang.org/x/sync/errgroup" ) // Example_read shows an example of reading a file from disk. 
@@ -95,11 +96,11 @@ FPEFLTMMARKMKDTDSEEEIREAFRVFDKDGNGYISAAELRHVMTNLGEKLTDEEVDEMIREA DIDGDGQVNYEEFVQMMTAK*`) parser := bio.NewFastaParser(file) - channel := make(chan *fasta.Record) + channel := make(chan fasta.Record) ctx := context.Background() go func() { _ = parser.ParseToChannel(ctx, channel, false) }() - var records []*fasta.Record + var records []fasta.Record for record := range channel { records = append(records, record) } @@ -123,11 +124,11 @@ DIDGDGQVNYEEFVQMMTAK*`) parser1 := bio.NewFastaParser(file1) parser2 := bio.NewFastaParser(file2) - channel := make(chan *fasta.Record) + channel := make(chan fasta.Record) ctx := context.Background() go func() { _ = bio.ManyToChannel(ctx, channel, parser1, parser2) }() - var records []*fasta.Record + var records []fasta.Record for record := range channel { records = append(records, record) } @@ -403,20 +404,24 @@ ae9a66f5-bf71-4572-8106-f6f8dbd3b799 16 pOpen_V3_amplified 1 60 8S54M1D3M1D108M1 func ExampleFilterData() { // Create channels for input and output - inputChan := make(chan *sam.Alignment, 2) // Buffered channel to prevent blocking - outputChan := make(chan *sam.Alignment) + inputChan := make(chan sam.Alignment, 2) // Buffered channel to prevent blocking + outputChan := make(chan sam.Alignment) var results []sam.Alignment - go bio.FilterData(inputChan, outputChan, func(data *sam.Alignment) bool { return (data.FLAG & 0x900) == 0 }) + ctx := context.Background() + errorGroup, ctx := errgroup.WithContext(ctx) + errorGroup.Go(func() error { + return bio.FilterData(ctx, inputChan, outputChan, func(data sam.Alignment) bool { return (data.FLAG & 0x900) == 0 }) + }) // Send some example Alignments to the input channel - inputChan <- &sam.Alignment{FLAG: 0x900} // Not primary, should not be outputted - inputChan <- &sam.Alignment{SEQ: "FAKE", FLAG: 0x000} // Primary, should be outputted - close(inputChan) // Close the input channel to signal no more data + inputChan <- sam.Alignment{FLAG: 0x900} // Not primary, 
should not be outputted + inputChan <- sam.Alignment{SEQ: "FAKE", FLAG: 0x000} // Primary, should be outputted + close(inputChan) // Close the input channel to signal no more data // Collect results from the output channel for alignment := range outputChan { - results = append(results, *alignment) + results = append(results, alignment) } fmt.Println(results) diff --git a/lib/bio/fasta/example_test.go b/lib/bio/fasta/example_test.go index dd6abe8..f5e9145 100644 --- a/lib/bio/fasta/example_test.go +++ b/lib/bio/fasta/example_test.go @@ -29,7 +29,7 @@ func Example_basic() { } break } - records = append(records, *record) + records = append(records, record) } fmt.Println(records[0].Sequence) // Output: ATGC diff --git a/lib/bio/fasta/fasta.go b/lib/bio/fasta/fasta.go index 416cbf3..813f1fc 100644 --- a/lib/bio/fasta/fasta.go +++ b/lib/bio/fasta/fasta.go @@ -78,8 +78,8 @@ type Parser struct { } // Header returns nil,nil. -func (p *Parser) Header() (*Header, error) { - return &Header{}, nil +func (p *Parser) Header() (Header, error) { + return Header{}, nil } // NewParser returns a Parser that uses r as the source @@ -106,9 +106,9 @@ func NewParser(r io.Reader, maxLineSize int) *Parser { // It is worth noting the amount of bytes read are always right up to before // the next fasta starts which means this function can effectively be used // to index where fastas start in a file or string. 
-func (p *Parser) Next() (*Record, error) { +func (p *Parser) Next() (Record, error) { if !p.more { - return &Record{}, io.EOF + return Record{}, io.EOF } for p.scanner.Scan() { line := p.scanner.Bytes() @@ -127,7 +127,7 @@ func (p *Parser) Next() (*Record, error) { case line[0] != '>' && p.start: err := fmt.Errorf("invalid input: missing sequence identifier for sequence starting at line %d", p.line) record, _ := p.newRecord() - return &record, err + return record, err // start of a fasta line case line[0] != '>': p.buff.Write(line) @@ -136,7 +136,7 @@ func (p *Parser) Next() (*Record, error) { record, err := p.newRecord() // New name p.identifier = string(line[1:]) - return &record, err + return record, err // Process first line of file case line[0] == '>' && p.start: p.identifier = string(line[1:]) @@ -147,9 +147,9 @@ func (p *Parser) Next() (*Record, error) { // Add final sequence in file record, err := p.newRecord() if err != nil { - return &record, err + return record, err } - return &record, nil + return record, nil } func (p *Parser) newRecord() (Record, error) { diff --git a/lib/bio/fasta/fasta_test.go b/lib/bio/fasta/fasta_test.go index ba5dfa6..997b7e4 100644 --- a/lib/bio/fasta/fasta_test.go +++ b/lib/bio/fasta/fasta_test.go @@ -47,7 +47,7 @@ func TestParser(t *testing.T) { } break } - fastas = append(fastas, *fa) + fastas = append(fastas, fa) } if len(fastas) != len(test.expected) { t.Errorf("case index %d: got %d fastas, expected %d", testIndex, len(fastas), len(test.expected)) @@ -77,7 +77,7 @@ func TestReadEmptyFasta(t *testing.T) { targetError = err break } - fastas = append(fastas, *fa) + fastas = append(fastas, fa) } if targetError == nil { t.Errorf("expected error reading empty fasta stream") diff --git a/lib/bio/fastq/fastq.go b/lib/bio/fastq/fastq.go index c5ef531..15abd7d 100644 --- a/lib/bio/fastq/fastq.go +++ b/lib/bio/fastq/fastq.go @@ -59,8 +59,8 @@ type Parser struct { } // Header returns nil,nil. 
-func (parser *Parser) Header() (*Header, error) { - return &Header{}, nil +func (parser *Parser) Header() (Header, error) { + return Header{}, nil } // NewParser returns a Parser that uses r as the source @@ -87,9 +87,9 @@ func NewParser(r io.Reader, maxLineSize int) *Parser { // files, fastq always have 4 lines following each other - not variable with // a line limit of 80 like fasta files have. So instead of a for loop, you // can just parse 4 lines at once. -func (parser *Parser) Next() (*Read, error) { +func (parser *Parser) Next() (Read, error) { if parser.atEOF { - return &Read{}, io.EOF + return Read{}, io.EOF } // Initialization of parser state variables. var ( @@ -106,7 +106,7 @@ func (parser *Parser) Next() (*Read, error) { line, err = parser.reader.ReadSlice('\n') parser.line++ if err != nil { - return &Read{}, err + return Read{}, err } line = line[:len(line)-1] // Exclude newline delimiter. @@ -127,17 +127,17 @@ func (parser *Parser) Next() (*Read, error) { line, err = parser.reader.ReadSlice('\n') parser.line++ if err != nil { - return &Read{}, err + return Read{}, err } if len(line) <= 1 { // newline delimiter - actually checking for empty line - return &Read{}, fmt.Errorf("empty fastq sequence for %q, got to line %d: %w", seqIdentifier, parser.line, err) + return Read{}, fmt.Errorf("empty fastq sequence for %q, got to line %d: %w", seqIdentifier, parser.line, err) } sequence = line[:len(line)-1] // Exclude newline delimiter. var newSequence []byte for _, char := range sequence { newSequence = append(newSequence, char) if !strings.ContainsRune("ATGCN", rune(char)) { - return &Read{}, errors.New("Only letters ATGCN are allowed for DNA/RNA in fastq file. Got letter: " + string(char)) + return Read{}, errors.New("Only letters ATGCN are allowed for DNA/RNA in fastq file. 
Got letter: " + string(char)) } } @@ -145,7 +145,7 @@ func (parser *Parser) Next() (*Read, error) { _, err = parser.reader.ReadSlice('\n') parser.line++ if err != nil { - return &Read{}, err + return Read{}, err } // parse quality @@ -154,18 +154,18 @@ func (parser *Parser) Next() (*Read, error) { if err != nil { // If the line is EOF, just continue and finish off returning the new read. if err != io.EOF { - return &Read{}, nil + return Read{}, nil } parser.atEOF = true } if len(line) <= 1 { // newline delimiter - actually checking for empty line - return &Read{}, fmt.Errorf("empty quality sequence for %q, got to line %d: %w", seqIdentifier, parser.line, err) + return Read{}, fmt.Errorf("empty quality sequence for %q, got to line %d: %w", seqIdentifier, parser.line, err) } quality = string(line[:len(line)-1]) // Parsing ended. Check for inconsistencies. if lookingForIdentifier { - return &Read{}, fmt.Errorf("did not find fastq start '@', got to line %d: %w", parser.line, err) + return Read{}, fmt.Errorf("did not find fastq start '@', got to line %d: %w", parser.line, err) } fastq := Read{ Identifier: seqIdentifier, @@ -176,7 +176,7 @@ func (parser *Parser) Next() (*Read, error) { // Gotten to this point err is non-nil only in EOF case. // We report this error to note the fastq may be incomplete/corrupt // like in the case of using an io.LimitReader wrapping the underlying reader. - return &fastq, nil + return fastq, nil } // Reset discards all data in buffer and resets state. diff --git a/lib/bio/genbank/genbank.go b/lib/bio/genbank/genbank.go index c04045a..eabcca9 100644 --- a/lib/bio/genbank/genbank.go +++ b/lib/bio/genbank/genbank.go @@ -568,8 +568,8 @@ type Parser struct { } // Header returns nil,nil. 
-func (parser *Parser) Header() (*Header, error) { - return &Header{}, nil +func (parser *Parser) Header() (Header, error) { + return Header{}, nil } // NewParser returns a Parser that uses r as the source @@ -584,7 +584,7 @@ func NewParser(r io.Reader, maxLineSize int) *Parser { } // Next takes in a reader representing a multi gbk/gb/genbank file and outputs the next record -func (parser *Parser) Next() (*Genbank, error) { +func (parser *Parser) Next() (Genbank, error) { parser.parameters.init() // Loop through each line of the file for lineNum := 0; parser.scanner.Scan(); lineNum++ { @@ -615,7 +615,7 @@ func (parser *Parser) Next() (*Genbank, error) { case "metadata": // Handle empty lines if len(line) == 0 { - return &Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "unexpected empty metadata"} + return Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "unexpected empty metadata"} } // If we are currently reading a line, we need to figure out if it is a new meta line. @@ -637,7 +637,7 @@ func (parser *Parser) Next() (*Genbank, error) { // parseReferencesFn = parseReferences in genbank_test. We use Fn for testing purposes. 
reference, err := parseReferencesFn(parser.parameters.metadataData) if err != nil { - return &Genbank{}, &ParseError{line: line, lineNo: lineNum, before: true, wraps: err, info: "failed in parsing reference"} + return Genbank{}, &ParseError{line: line, lineNo: lineNum, before: true, wraps: err, info: "failed in parsing reference"} } parser.parameters.genbank.Meta.References = append(parser.parameters.genbank.Meta.References, reference) @@ -669,7 +669,7 @@ func (parser *Parser) Next() (*Genbank, error) { for countIndex := 2; countIndex < len(fields)-1; countIndex += 2 { // starts at two because we don't want to include "BASE COUNT" in our fields count, err := strconv.Atoi(fields[countIndex]) if err != nil { - return &Genbank{}, &ParseError{line: line, lineNo: lineNum, wraps: err, info: "invalid base count"} + return Genbank{}, &ParseError{line: line, lineNo: lineNum, wraps: err, info: "invalid base count"} } baseCount := BaseCount{ @@ -693,12 +693,12 @@ func (parser *Parser) Next() (*Genbank, error) { // TODO: parse location when line is read, or track line number so error is localized location, err := parseLocation(feature.Location.GbkLocationString) if err != nil { - return &Genbank{}, &ParseError{before: true, line: line, lineNo: lineNum, wraps: err, info: "invalid feature location"} + return Genbank{}, &ParseError{before: true, line: line, lineNo: lineNum, wraps: err, info: "invalid feature location"} } feature.Location = location err = parser.parameters.genbank.AddFeature(&feature) if err != nil { - return &Genbank{}, &ParseError{before: true, line: line, lineNo: lineNum, wraps: err, info: "problem adding feature"} + return Genbank{}, &ParseError{before: true, line: line, lineNo: lineNum, wraps: err, info: "problem adding feature"} } } @@ -717,7 +717,7 @@ func (parser *Parser) Next() (*Genbank, error) { indent := countLeadingSpaces(parser.parameters.currentLine) // determine if current line is a new top level feature if indent == 0 { - return &Genbank{}, 
&ParseError{line: line, lineNo: lineNum, info: "unexpected metadata when parsing feature"} + return Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "unexpected metadata when parsing feature"} } else if indent < countLeadingSpaces(parser.parameters.prevline) || parser.parameters.prevline == "FEATURES" { parser.parameters.saveLastAttribute() @@ -726,7 +726,7 @@ func (parser *Parser) Next() (*Genbank, error) { // An initial feature line looks like this: `source 1..2686` with a type separated by its location if len(splitLine) < 2 { - return &Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "malformed feature"} + return Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "malformed feature"} } parser.parameters.feature.Type = strings.TrimSpace(splitLine[0]) parser.parameters.feature.Location.GbkLocationString = strings.TrimSpace(splitLine[len(splitLine)-1]) @@ -769,26 +769,26 @@ func (parser *Parser) Next() (*Genbank, error) { parser.parameters.attributeValue = removeAttributeValueQuotes parser.parameters.multiLineFeature = false // without this we can't tell if something is a multiline feature or multiline qualifier } else { - return &Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "invalid feature"} + return Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "invalid feature"} } case "sequence": if len(line) < 2 { // throw error if line is malformed - return &Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "too short line found while parsing genbank sequence"} + return Genbank{}, &ParseError{line: line, lineNo: lineNum, info: "too short line found while parsing genbank sequence"} } else if line[0:2] == "//" { // end of sequence parser.parameters.genbank.Sequence = parser.parameters.sequenceBuilder.String() parser.parameters.genbankStarted = false parser.parameters.sequenceBuilder.Reset() - return &parser.parameters.genbank, nil + return parser.parameters.genbank, nil } else { // add line to total sequence 
parser.parameters.sequenceBuilder.WriteString(sequenceRegex.ReplaceAllString(line, "")) } default: - return &Genbank{}, fmt.Errorf("Unknown parse step: %s", parser.parameters.parseStep) + return Genbank{}, fmt.Errorf("Unknown parse step: %s", parser.parameters.parseStep) } } - return &Genbank{}, io.EOF + return Genbank{}, io.EOF } func countLeadingSpaces(line string) int { @@ -1228,7 +1228,7 @@ func parseMultiNth(r io.Reader, count int) ([]Genbank, *ParseError) { } return genbanks, perr } - genbanks = append(genbanks, *gb) + genbanks = append(genbanks, gb) } return genbanks, nil } @@ -1236,7 +1236,7 @@ func parseMultiNth(r io.Reader, count int) ([]Genbank, *ParseError) { func parse(r io.Reader) (Genbank, error) { parser := NewParser(r, bufio.MaxScanTokenSize) gb, err := parser.Next() - return *gb, err + return gb, err } func write(gb Genbank, path string) error { diff --git a/lib/bio/iowriter_test.go b/lib/bio/iowriter_test.go new file mode 100644 index 0000000..ba54d3d --- /dev/null +++ b/lib/bio/iowriter_test.go @@ -0,0 +1,24 @@ +package bio + +import ( + "io" + "testing" + + "github.com/koeng101/dnadesign/lib/bio/fasta" + "github.com/koeng101/dnadesign/lib/bio/fastq" + "github.com/koeng101/dnadesign/lib/bio/genbank" + "github.com/koeng101/dnadesign/lib/bio/pileup" + "github.com/koeng101/dnadesign/lib/bio/sam" + "github.com/koeng101/dnadesign/lib/bio/slow5" + "github.com/koeng101/dnadesign/lib/bio/uniprot" +) + +func TestAllTypesImplementWriterTo(t *testing.T) { + var _ io.WriterTo = &genbank.Genbank{} + var _ io.WriterTo = &fasta.Record{} + var _ io.WriterTo = &fastq.Read{} + var _ io.WriterTo = &slow5.Read{} + var _ io.WriterTo = &sam.Alignment{} + var _ io.WriterTo = &pileup.Line{} + var _ io.WriterTo = &uniprot.Entry{} +} diff --git a/lib/bio/pileup/pileup.go b/lib/bio/pileup/pileup.go index 5e1aaff..e87cfd9 100644 --- a/lib/bio/pileup/pileup.go +++ b/lib/bio/pileup/pileup.go @@ -72,8 +72,8 @@ type Parser struct { } // Header returns nil,nil. 
-func (parser *Parser) Header() (*Header, error) { - return &Header{}, nil +func (parser *Parser) Header() (Header, error) { + return Header{}, nil } // NewParser creates a parser from an io.Reader for pileup data. @@ -85,15 +85,15 @@ func NewParser(r io.Reader, maxLineSize int) *Parser { // Next parses the next pileup row in a pileup file. // Next returns an EOF if encountered. -func (parser *Parser) Next() (*Line, error) { +func (parser *Parser) Next() (Line, error) { if parser.atEOF { - return &Line{}, io.EOF + return Line{}, io.EOF } // Parse out a single line lineBytes, err := parser.reader.ReadSlice('\n') if err != nil { if err != io.EOF { - return &Line{}, err + return Line{}, err } parser.atEOF = true } @@ -103,7 +103,7 @@ func (parser *Parser) Next() (*Line, error) { // In this case, the file has ended on a newline. Just return // with an io.EOF if len(strings.TrimSpace(line)) == 0 && parser.atEOF { - return &Line{}, io.EOF + return Line{}, io.EOF } line = line[:len(line)-1] // Exclude newline delimiter. @@ -111,17 +111,17 @@ func (parser *Parser) Next() (*Line, error) { // Check that there are 6 values, as defined by the pileup format values := strings.Split(line, "\t") if len(values) != 6 { - return &Line{}, fmt.Errorf("Error on line %d: Got %d values, expected 6.", parser.line, len(values)) + return Line{}, fmt.Errorf("Error on line %d: Got %d values, expected 6.", parser.line, len(values)) } // Convert Position and ReadCount to integers positionInteger, err := strconv.Atoi(strings.TrimSpace(values[1])) if err != nil { - return &Line{}, fmt.Errorf("Error on line %d. Got error: %w", parser.line, err) + return Line{}, fmt.Errorf("Error on line %d. Got error: %w", parser.line, err) } readCountInteger, err := strconv.Atoi(strings.TrimSpace(values[3])) if err != nil { - return &Line{}, fmt.Errorf("Error on line %d. Got error: %w", parser.line, err) + return Line{}, fmt.Errorf("Error on line %d. 
Got error: %w", parser.line, err) } // Parse ReadResults @@ -169,18 +169,18 @@ func (parser *Parser) Next() (*Line, error) { case '0', '1', '2', '3', '4', '5', '6', '7', '8', '9', 'A', 'T', 'G', 'C', 'N', 'a', 't', 'g', 'c', 'n', '-', '+': continue default: - return &Line{}, fmt.Errorf("Rune within +,- not found on line %d. Got %c: only runes allowed are: [0 1 2 3 4 5 6 7 8 9 A T G C N a t g c n - +]", parser.line, letter) + return Line{}, fmt.Errorf("Rune within +,- not found on line %d. Got %c: only runes allowed are: [0 1 2 3 4 5 6 7 8 9 A T G C N a t g c n - +]", parser.line, letter) } } readResults = append(readResults, readResult) skip = skip + regularExpressionInt + len(numberOfJumps) // The 1 makes sure to include the regularExpressionInt in readResult string default: - return &Line{}, fmt.Errorf("Rune not found on line %d. Got %c: only runes allowed are: [^ $ . , * A T G C N a t g c n - +]", parser.line, resultRune) + return Line{}, fmt.Errorf("Rune not found on line %d. Got %c: only runes allowed are: [^ $ . , * A T G C N a t g c n - +]", parser.line, resultRune) } readCount = readCount + 1 } - return &Line{Sequence: values[0], Position: uint(positionInteger), ReferenceBase: values[2], ReadCount: uint(readCountInteger), ReadResults: readResults, Quality: values[5]}, nil + return Line{Sequence: values[0], Position: uint(positionInteger), ReferenceBase: values[2], ReadCount: uint(readCountInteger), ReadResults: readResults, Quality: values[5]}, nil } /****************************************************************************** diff --git a/lib/bio/pileup/pileup_test.go b/lib/bio/pileup/pileup_test.go index dccb62c..0321591 100644 --- a/lib/bio/pileup/pileup_test.go +++ b/lib/bio/pileup/pileup_test.go @@ -26,7 +26,7 @@ func TestParse(t *testing.T) { } break } - pileupReads = append(pileupReads, *pileupRead) + pileupReads = append(pileupReads, pileupRead) } if pileupReads[2].ReadCount != 1401 { t.Errorf("Expected 1401 read counts. 
Got: %d", pileupReads[2].ReadCount) diff --git a/lib/bio/sam/sam.go b/lib/bio/sam/sam.go index 966d5bd..dd5e527 100644 --- a/lib/bio/sam/sam.go +++ b/lib/bio/sam/sam.go @@ -357,8 +357,8 @@ type Parser struct { } // Header returns the parsed sam header. -func (p *Parser) Header() (*Header, error) { - return &p.FileHeader, nil +func (p *Parser) Header() (Header, error) { + return p.FileHeader, nil } func checkIfValidSamLine(lineBytes []byte) bool { @@ -458,7 +458,7 @@ func NewParser(r io.Reader, maxLineSize int) (*Parser, Header, error) { } // Next parsers the next read from a parser. Returns an `io.EOF` upon EOF. -func (p *Parser) Next() (*Alignment, error) { +func (p *Parser) Next() (Alignment, error) { var alignment Alignment var finalLine bool var line string @@ -480,7 +480,7 @@ func (p *Parser) Next() (*Alignment, error) { } if !finalLine { if err != nil { - return nil, err + return alignment, err } } line = strings.TrimSpace(string(lineBytes)) @@ -488,35 +488,35 @@ func (p *Parser) Next() (*Alignment, error) { p.line++ values := strings.Split(line, "\t") if len(values) < 11 { - return nil, fmt.Errorf("Line %d had error: must have at least 11 tab-delimited values. Had %d", p.line, len(values)) + return alignment, fmt.Errorf("Line %d had error: must have at least 11 tab-delimited values. 
Had %d", p.line, len(values)) } alignment.QNAME = values[0] flag64, err := strconv.ParseUint(values[1], 10, 16) // convert string to uint16 if err != nil { - return nil, fmt.Errorf("Line %d had error: %s", p.line, err) + return alignment, fmt.Errorf("Line %d had error: %s", p.line, err) } alignment.FLAG = uint16(flag64) alignment.RNAME = values[2] pos64, err := strconv.ParseInt(values[3], 10, 32) // convert string to int32 if err != nil { - return nil, fmt.Errorf("Line %d had error: %s", p.line, err) + return alignment, fmt.Errorf("Line %d had error: %s", p.line, err) } alignment.POS = int32(pos64) mapq64, err := strconv.ParseUint(values[4], 10, 8) // convert string to uint8 (otherwise known as byte) if err != nil { - return nil, fmt.Errorf("Line %d had error: %s", p.line, err) + return alignment, fmt.Errorf("Line %d had error: %s", p.line, err) } alignment.MAPQ = uint8(mapq64) alignment.CIGAR = values[5] alignment.RNEXT = values[6] pnext64, err := strconv.ParseInt(values[7], 10, 32) if err != nil { - return nil, fmt.Errorf("Line %d had error: %s", p.line, err) + return alignment, fmt.Errorf("Line %d had error: %s", p.line, err) } alignment.PNEXT = int32(pnext64) tlen64, err := strconv.ParseInt(values[8], 10, 32) if err != nil { - return nil, fmt.Errorf("Line %d had error: %s", p.line, err) + return alignment, fmt.Errorf("Line %d had error: %s", p.line, err) } alignment.TLEN = int32(tlen64) alignment.SEQ = values[9] @@ -528,7 +528,7 @@ func (p *Parser) Next() (*Alignment, error) { optionals = append(optionals, Optional{Tag: valueSplit[0], Type: rune(valueSplit[1][0]), Data: valueSplit[2]}) } alignment.Optionals = optionals - return &alignment, nil + return alignment, nil } /****************************************************************************** diff --git a/lib/bio/slow5/example_test.go b/lib/bio/slow5/example_test.go index b59b27f..989f68c 100644 --- a/lib/bio/slow5/example_test.go +++ b/lib/bio/slow5/example_test.go @@ -25,7 +25,7 @@ func 
ExampleNewParser() { // Break at EOF break } - outputReads = append(outputReads, *read) + outputReads = append(outputReads, read) } fmt.Println(outputReads[0].RawSignal[0:10]) diff --git a/lib/bio/slow5/slow5.go b/lib/bio/slow5/slow5.go index 9c81865..ef1f9c7 100644 --- a/lib/bio/slow5/slow5.go +++ b/lib/bio/slow5/slow5.go @@ -115,8 +115,8 @@ type Parser struct { } // Header returns the header -func (parser *Parser) Header() (*Header, error) { - return &parser.header, nil +func (parser *Parser) Header() (Header, error) { + return parser.header, nil } // NewParser parsers a slow5 file. @@ -213,9 +213,9 @@ func NewParser(r io.Reader, maxLineSize int) (*Parser, error) { } // Next parses the next read from a parser. -func (parser *Parser) Next() (*Read, error) { +func (parser *Parser) Next() (Read, error) { if parser.hitEOF { - return &Read{}, io.EOF + return Read{}, io.EOF } lineBytes, err := parser.reader.ReadSlice('\n') if err != nil { @@ -223,10 +223,10 @@ func (parser *Parser) Next() (*Read, error) { parser.hitEOF = true if strings.TrimSpace(string(lineBytes)) == "" { // If EOF line is empty, return. If not, continue! - return nil, io.EOF + return Read{}, io.EOF } } else { - return nil, err + return Read{}, err } } parser.line++ @@ -248,37 +248,37 @@ func (parser *Parser) Next() (*Read, error) { case "read_group": readGroupID, err := strconv.ParseUint(values[valueIndex], 10, 32) if err != nil { - return nil, fmt.Errorf("Failed convert read_group '%s' to uint on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return Read{}, fmt.Errorf("Failed convert read_group '%s' to uint on line %d. Got Error: %w", values[valueIndex], parser.line, err) } newRead.ReadGroupID = uint32(readGroupID) case "digitisation": digitisation, err := strconv.ParseFloat(values[valueIndex], 64) if err != nil { - return nil, fmt.Errorf("Failed to convert digitisation '%s' to float on line %d. 
Got Error: %w", values[valueIndex], parser.line, err) + return Read{}, fmt.Errorf("Failed to convert digitisation '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) } newRead.Digitisation = digitisation case "offset": offset, err := strconv.ParseFloat(values[valueIndex], 64) if err != nil { - return nil, fmt.Errorf("Failed to convert offset '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return Read{}, fmt.Errorf("Failed to convert offset '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) } newRead.Offset = offset case "range": nanoporeRange, err := strconv.ParseFloat(values[valueIndex], 64) if err != nil { - return nil, fmt.Errorf("Failed to convert range '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return Read{}, fmt.Errorf("Failed to convert range '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) } newRead.Range = nanoporeRange case "sampling_rate": samplingRate, err := strconv.ParseFloat(values[valueIndex], 64) if err != nil { - return nil, fmt.Errorf("Failed to convert sampling_rate '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return Read{}, fmt.Errorf("Failed to convert sampling_rate '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) } newRead.SamplingRate = samplingRate case "len_raw_signal": lenRawSignal, err := strconv.ParseUint(values[valueIndex], 10, 64) if err != nil { - return nil, fmt.Errorf("Failed to convert len_raw_signal '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return Read{}, fmt.Errorf("Failed to convert len_raw_signal '%s' to float on line %d. 
Got Error: %w", values[valueIndex], parser.line, err) } newRead.LenRawSignal = lenRawSignal case "raw_signal": @@ -286,7 +286,7 @@ func (parser *Parser) Next() (*Read, error) { for rawSignalIndex, rawSignalString := range strings.Split(values[valueIndex], ",") { rawSignal, err := strconv.ParseInt(rawSignalString, 10, 16) if err != nil { - return nil, fmt.Errorf("Failed to convert raw signal '%s' to int on line %d, signal index %d. Got Error: %w", rawSignalString, parser.line, rawSignalIndex, err) + return Read{}, fmt.Errorf("Failed to convert raw signal '%s' to int on line %d, signal index %d. Got Error: %w", rawSignalString, parser.line, rawSignalIndex, err) } rawSignals = append(rawSignals, int16(rawSignal)) } @@ -294,44 +294,44 @@ func (parser *Parser) Next() (*Read, error) { case "start_time": startTime, err := strconv.ParseUint(values[valueIndex], 10, 64) if err != nil { - return nil, fmt.Errorf("Failed to convert start_time '%s' to uint on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return Read{}, fmt.Errorf("Failed to convert start_time '%s' to uint on line %d. Got Error: %w", values[valueIndex], parser.line, err) } newRead.StartTime = startTime case "read_number": readNumber, err := strconv.ParseInt(values[valueIndex], 10, 32) if err != nil { - return nil, fmt.Errorf("Failed to convert read_number '%s' to int on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return Read{}, fmt.Errorf("Failed to convert read_number '%s' to int on line %d. Got Error: %w", values[valueIndex], parser.line, err) } newRead.ReadNumber = int32(readNumber) case "start_mux": startMux, err := strconv.ParseUint(values[valueIndex], 10, 8) if err != nil { - return nil, fmt.Errorf("Failed to convert start_mux '%s' to uint on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return Read{}, fmt.Errorf("Failed to convert start_mux '%s' to uint on line %d. 
Got Error: %w", values[valueIndex], parser.line, err) } newRead.StartMux = uint8(startMux) case "median_before": medianBefore, err := strconv.ParseFloat(values[valueIndex], 64) if err != nil { - return nil, fmt.Errorf("Failed to convert median_before '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return Read{}, fmt.Errorf("Failed to convert median_before '%s' to float on line %d. Got Error: %w", values[valueIndex], parser.line, err) } newRead.MedianBefore = medianBefore case "end_reason": endReasonIndex, err := strconv.ParseInt(values[valueIndex], 10, 64) if err != nil { - return nil, fmt.Errorf("Failed to convert end_reason '%s' to int on line %d. Got Error: %w", values[valueIndex], parser.line, err) + return Read{}, fmt.Errorf("Failed to convert end_reason '%s' to int on line %d. Got Error: %w", values[valueIndex], parser.line, err) } if _, ok := parser.endReasonMap[int(endReasonIndex)]; !ok { - return nil, fmt.Errorf("End reason out of range. Got '%d' on line %d. Cannot find valid enum reason", int(endReasonIndex), parser.line) + return Read{}, fmt.Errorf("End reason out of range. Got '%d' on line %d. Cannot find valid enum reason", int(endReasonIndex), parser.line) } newRead.EndReason = parser.endReasonMap[int(endReasonIndex)] case "channel_number": // For whatever reason, this is a string. newRead.ChannelNumber = values[valueIndex] default: - return nil, fmt.Errorf("Unknown field to parser '%s' found on line %d. Please report to github.com/koeng101/dnadesign/lib", fieldValue, parser.line) + return Read{}, fmt.Errorf("Unknown field to parser '%s' found on line %d. 
Please report to github.com/koeng101/dnadesign/lib", fieldValue, parser.line) } } - return &newRead, nil + return newRead, nil } /****************************************************************************** diff --git a/lib/bio/slow5/slow5_test.go b/lib/bio/slow5/slow5_test.go index a71d4e3..65f3cdd 100644 --- a/lib/bio/slow5/slow5_test.go +++ b/lib/bio/slow5/slow5_test.go @@ -40,7 +40,7 @@ func TestParse(t *testing.T) { } break } - outputReads = append(outputReads, *read) + outputReads = append(outputReads, read) } if outputReads[0].RawSignal[0] != 430 { t.Errorf("Expected first outputRead to have a raw_signal of 430. Got: %d", outputReads[0].RawSignal[0]) @@ -123,7 +123,7 @@ func TestParseReads(t *testing.T) { } break } - outputReads = append(outputReads, *read) + outputReads = append(outputReads, read) } if outputReads[0].ReadID != "0026631e-33a3-49ab-aa22-3ab157d71f8b" { t.Errorf("First read id should be 0026631e-33a3-49ab-aa22-3ab157d71f8b. Got: %s", outputReads[0].ReadID) @@ -173,7 +173,7 @@ func TestWrite(t *testing.T) { // Break at EOF break } - reads <- *read + reads <- read } close(reads) }() @@ -238,7 +238,7 @@ func TestSvb(t *testing.T) { } break } - outputReads = append(outputReads, *read) + outputReads = append(outputReads, read) } for readNum, read := range outputReads { diff --git a/lib/bio/uniprot/uniprot.go b/lib/bio/uniprot/uniprot.go index b1cd90c..73c36e9 100644 --- a/lib/bio/uniprot/uniprot.go +++ b/lib/bio/uniprot/uniprot.go @@ -45,8 +45,8 @@ func (header *Header) WriteTo(w io.Writer) (int64, error) { } // Header returns nil,nil. -func (p *Parser) Header() (*Header, error) { - return &Header{}, nil +func (p *Parser) Header() (Header, error) { + return Header{}, nil } // Entry_WriteTo writes an entry to an io.Writer. 
It specifically writes a JSON @@ -72,14 +72,14 @@ func NewParser(r io.Reader) *Parser { return &Parser{decoder: decoder} } -func (p *Parser) Next() (*Entry, error) { +func (p *Parser) Next() (Entry, error) { decoderToken, err := p.decoder.Token() // Check decoding if err != nil { // If we are the end of the file, return io.EOF if err.Error() == "EOF" { - return &Entry{}, io.EOF + return Entry{}, io.EOF } } @@ -89,9 +89,9 @@ func (p *Parser) Next() (*Entry, error) { var e Entry err = p.decoder.DecodeElement(&e, &startElement) if err != nil { - return &Entry{}, err + return Entry{}, err } - return &e, nil + return e, nil } return p.Next() } @@ -100,13 +100,13 @@ func (p *Parser) Next() (*Entry, error) { var BaseURL string = "https://rest.uniprot.org/uniprotkb/" // Get gets a uniprot from its accessionID -func Get(ctx context.Context, accessionID string) (*Entry, error) { +func Get(ctx context.Context, accessionID string) (Entry, error) { var entry Entry // Parse the base URL baseURL, err := url.Parse(BaseURL) if err != nil { - return &entry, err + return entry, err } // Resolve the full URL @@ -120,13 +120,13 @@ func Get(ctx context.Context, accessionID string) (*Entry, error) { client := &http.Client{} resp, err := client.Do(req) if err != nil { - return &entry, err + return entry, err } defer resp.Body.Close() // Check for HTTP errors if resp.StatusCode != http.StatusOK { - return &entry, fmt.Errorf("Got http status code: %d", resp.StatusCode) + return entry, fmt.Errorf("Got http status code: %d", resp.StatusCode) } // Return the first parsed XML From 858cbc2fd0bdd0ffc2ceab320d092947edde901c Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Sun, 14 Jan 2024 22:01:22 -0800 Subject: [PATCH 7/9] Make linter happy --- external/minimap2/minimap2.go | 2 +- lib/align/megamash/megamash_test.go | 4 +- lib/bio/example_test.go | 51 +++++++++ lib/bio/sam/sam.go | 2 +- lib/bio/sam/sam_test.go | 2 +- lib/sequencing/example_test.go | 50 +++------ 
lib/sequencing/nanopore/example_test.go | 30 ------ lib/sequencing/nanopore/nanopore.go | 137 ------------------------ lib/synthesis/codon/codon_test.go | 18 ++-- lib/synthesis/codon/example_test.go | 10 +- 10 files changed, 86 insertions(+), 220 deletions(-) delete mode 100644 lib/sequencing/nanopore/example_test.go diff --git a/external/minimap2/minimap2.go b/external/minimap2/minimap2.go index b0d8ffd..578d24c 100644 --- a/external/minimap2/minimap2.go +++ b/external/minimap2/minimap2.go @@ -82,7 +82,7 @@ func Minimap2(templateFastaInput io.Reader, fastqInput io.Reader, w io.Writer) e } // Minimap2Channeled uses channels rather than io.Reader and io.Writers. -func Minimap2Channeled(fastaTemplates io.Reader, fastqChan <-chan *fastq.Read, samChan chan<- *sam.Alignment) error { +func Minimap2Channeled(fastaTemplates io.Reader, fastqChan <-chan fastq.Read, samChan chan<- sam.Alignment) error { ctx := context.Background() g, ctx := errgroup.WithContext(ctx) diff --git a/lib/align/megamash/megamash_test.go b/lib/align/megamash/megamash_test.go index b154742..48e4e02 100644 --- a/lib/align/megamash/megamash_test.go +++ b/lib/align/megamash/megamash_test.go @@ -12,7 +12,7 @@ func TestMegamash(t *testing.T) { oligo3 := "CCGTGCGACAAGATTTCAAGGGTCTCTCTTCTATCGCAGCCAAGGAAGAAGGTGTATCTCTAGAGAAGCGTCGAGTGAGACCCGGATCGAACTTAGGTAGCCCCCTTCGAAGTGGCTCTGTCTGATCCTCCGCGGATGGCGACACCATCGGACTGAGGATATTGGCCACA" samples := []string{"TTTTGTCTACTTCGTTCCGTTGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGACGGCGCCTCCGTGCGACGAGATTTCAAGGGTCTCTGTGCTATATTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCAGATCGACTTTTAGATTCCTCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTGG", "TGTCCTTTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTACTTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTATCTGAGACCGAAGTGGTTTGCCTAAACGCAGGTGCTGTTGGCAAAGGCAGAAAGTAGTCTTAACCTTGACAATGAGTGGTA", 
"GTTATTGTCGTCTCCTTTGACTCAGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTGCTGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTCCGCTTCTATCTGAGACCGAAGTGGTTAT", "TGTTCTGTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTCTGCCTTAGAGACCACGCCTCCGTGCGACAAGATTCAAGGGTCTCTGTGCTCTGCCGCTAGTTCCGCTCTAGCTGCTCCGGTATGCATCTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAACTGTTGGTT"} - m, err := NewMegamashMap([]fasta.Record{fasta.Record{Sequence: oligo1, Identifier: "oligo1"}, fasta.Record{Sequence: oligo2, Identifier: "oligo2"}, fasta.Record{Sequence: oligo3, Identifier: "oligo3"}}, DefaultKmerSize, DefaultMinimalKmerCount, DefaultScoreThreshold) + m, err := NewMegamashMap([]fasta.Record{{Sequence: oligo1, Identifier: "oligo1"}, {Sequence: oligo2, Identifier: "oligo2"}, {Sequence: oligo3, Identifier: "oligo3"}}, DefaultKmerSize, DefaultMinimalKmerCount, DefaultScoreThreshold) if err != nil { t.Errorf("Failed to make NewMegamashMap: %s", err) } @@ -31,7 +31,7 @@ func BenchmarkMegamash(b *testing.B) { oligo3 := "CCGTGCGACAAGATTTCAAGGGTCTCTCTTCTATCGCAGCCAAGGAAGAAGGTGTATCTCTAGAGAAGCGTCGAGTGAGACCCGGATCGAACTTAGGTAGCCCCCTTCGAAGTGGCTCTGTCTGATCCTCCGCGGATGGCGACACCATCGGACTGAGGATATTGGCCACA" samples := []string{"TTTTGTCTACTTCGTTCCGTTGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGACGGCGCCTCCGTGCGACGAGATTTCAAGGGTCTCTGTGCTATATTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCAGATCGACTTTTAGATTCCTCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTGG", "TGTCCTTTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTACTTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTATCTGAGACCGAAGTGGTTTGCCTAAACGCAGGTGCTGTTGGCAAAGGCAGAAAGTAGTCTTAACCTTGACAATGAGTGGTA", 
"GTTATTGTCGTCTCCTTTGACTCAGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTGCTGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTCCGCTTCTATCTGAGACCGAAGTGGTTAT", "TGTTCTGTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTCTGCCTTAGAGACCACGCCTCCGTGCGACAAGATTCAAGGGTCTCTGTGCTCTGCCGCTAGTTCCGCTCTAGCTGCTCCGGTATGCATCTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCGGATCGAACTTAGGTAGCCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAACTGTTGGTT"} - m, _ := NewMegamashMap([]fasta.Record{fasta.Record{Sequence: oligo1, Identifier: "oligo1"}, fasta.Record{Sequence: oligo2, Identifier: "oligo2"}, fasta.Record{Sequence: oligo3, Identifier: "oligo3"}}, DefaultKmerSize, DefaultMinimalKmerCount, DefaultScoreThreshold) + m, _ := NewMegamashMap([]fasta.Record{{Sequence: oligo1, Identifier: "oligo1"}, {Sequence: oligo2, Identifier: "oligo2"}, {Sequence: oligo3, Identifier: "oligo3"}}, DefaultKmerSize, DefaultMinimalKmerCount, DefaultScoreThreshold) for _, sample := range samples { _ = m.Match(sample) } diff --git a/lib/bio/example_test.go b/lib/bio/example_test.go index d1620d1..ce5955f 100644 --- a/lib/bio/example_test.go +++ b/lib/bio/example_test.go @@ -10,6 +10,7 @@ import ( "github.com/koeng101/dnadesign/lib/bio" "github.com/koeng101/dnadesign/lib/bio/fasta" + "github.com/koeng101/dnadesign/lib/bio/fastq" "github.com/koeng101/dnadesign/lib/bio/sam" "golang.org/x/sync/errgroup" ) @@ -427,3 +428,53 @@ func ExampleFilterData() { fmt.Println(results) // Output: [{ 0 0 0 0 0 FAKE []}] } + +func Example_runWorkflow() { + // Workflows are a way of running bioinformatics programs replacing stdin/stdout + // with go channels. This allows for concurrent processing of data. + // + // Currently, we just use standard errorGroup handling for workflows, but + // aim to support multiple workers for maximizing throughput. 
+ + // First, setup parser + file := strings.NewReader(`@289a197e-4c05-4143-80e6-488e23044378 runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=34575 ch=111 start_time=2023-12-29T16:06:13.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode06 barcode_alias=barcode06 parent_read_id=289a197e-4c05-4143-80e6-488e23044378 basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 +TTTTGTCTACTTCGTTCCGTTGCGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGACGGCGCCTCCGTGCGACGAGATTTCAAGGGTCTCTGTGCTATATTGCCGCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTGAGACCCAGATCGACTTTTAGATTCCTCAGGTGCTGTTCTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGTGG ++ +$%%&%%$$%&'+)**,-+)))+866788711112=>A?@@@BDB@>?746@?>A@D2@970,-+..*++++;662/.-.+,++,//+167>A@A@@B=<887-,'&&%%&''((5555644578::<==B?ABCIJA>>>>@DCAA99::>=<<<=67777+***)//+,,+)&&&+--.02:>442000/1225:=D?=<<=7;866/..../AAA226545+&%%$$ +@af86ed57-1cfe-486f-8205-b2c8d1186454 runid=bb4427242f6da39e67293199a11c6c4b6ab2b141 read=2233 ch=123 start_time=2023-12-29T10:04:32.719061-08:00 flow_cell_id=AQY258 protocol_group_id=nseq28 sample_id=build3-build3gg-u11 barcode=barcode07 barcode_alias=barcode07 parent_read_id=af86ed57-1cfe-486f-8205-b2c8d1186454 basecall_model_version_id=dna_r10.4.1_e8.2_400bps_sup@v4.2.0 +TGTCCTTTACTTCGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCACCTCTGCTAGGGGCTACTTATCGGGTCTCTAGTTCCGCTCTAGCTGCTCCAGTTAATACTACTACTGAAGATGAATTGGAGGGTGACTTCGATGTTGCTGTTCTGCCTTTTTCCGCTTCTATCTGAGACCGAAGTGGTTTGCCTAAACGCAGGTGCTGTTGGCAAAGGCAGAAAGTAGTCTTAACCTTGACAATGAGTGGTA ++ +$%&$$$$$#')+)+,<>@B?>==<>>;;<<?@DA@?=>==>??<>??7;<706=>=>CBCCB????@CCBDAGFFFGJ<<<<<=54455>@?>:::9..++?@BDCCDCGECFHD@>=<<==>@@B@?@@>>>==>>===>>>A?@ADFGDCA@?????CCCEFDDDDDGJODAA@A;;ABBD<=<:92222223:>>@?@@B?@=<62212=<<<=>AAB=<'&&&'-,-.,**)'&'(,,,-.114888&&&&&'+++++,,*`) + parser := bio.NewFastqParser(file) + + // We setup the error group here. 
It's context is used if we need to cancel + // code that is running. + ctx := context.Background() + errorGroup, ctx := errgroup.WithContext(ctx) + + // Now we set up a workflow. We need two things: channels for the internal + // workflow steps to pass between, and the workflow steps themselves. We + // Set them up here. + fastqReads := make(chan fastq.Read) + fastqBarcoded := make(chan fastq.Read) + + // Read fastqs into channel + errorGroup.Go(func() error { + return parser.ParseToChannel(ctx, fastqReads, false) + }) + + // Filter the right barcode fastqs from channel + barcode := "barcode07" + errorGroup.Go(func() error { + return bio.FilterData(ctx, fastqReads, fastqBarcoded, func(data fastq.Read) bool { return data.Optionals["barcode"] == barcode }) + }) + + // Now, check the outputs. We should have sorted only for barcode07 + var reads []fastq.Read + for read := range fastqBarcoded { + reads = append(reads, read) + } + + fmt.Println(reads[0].Identifier) + // Output: af86ed57-1cfe-486f-8205-b2c8d1186454 +} diff --git a/lib/bio/sam/sam.go b/lib/bio/sam/sam.go index dd5e527..bf6ce1a 100644 --- a/lib/bio/sam/sam.go +++ b/lib/bio/sam/sam.go @@ -542,7 +542,7 @@ sam files. // Primary determines whether the Alignment is the primary line of the read. // This is useful for finding out if a particular read is the best aligned to // a certain fragment. 
-func Primary(a *Alignment) bool { +func Primary(a Alignment) bool { // Perform bitwise AND of FLAG with 0x900 and compare to 0 return (a.FLAG & 0x900) == 0 } diff --git a/lib/bio/sam/sam_test.go b/lib/bio/sam/sam_test.go index 4da764d..b17d335 100644 --- a/lib/bio/sam/sam_test.go +++ b/lib/bio/sam/sam_test.go @@ -288,7 +288,7 @@ func TestPrimary(t *testing.T) { // Create an Alignment with the given FLAG a := Alignment{FLAG: tc.flag} // Call the Primary function - got := Primary(&a) + got := Primary(a) // Assert that the result is as expected if got != tc.want { t.Errorf("Primary() with FLAG 0x%x = %v, want %v", tc.flag, got, tc.want) diff --git a/lib/sequencing/example_test.go b/lib/sequencing/example_test.go index 995d29e..b53a794 100644 --- a/lib/sequencing/example_test.go +++ b/lib/sequencing/example_test.go @@ -14,12 +14,15 @@ import ( "github.com/koeng101/dnadesign/lib/bio/fastq" "github.com/koeng101/dnadesign/lib/bio/sam" "github.com/koeng101/dnadesign/lib/primers/pcr" - "github.com/koeng101/dnadesign/lib/sequencing/nanopore" "github.com/koeng101/dnadesign/lib/transform" "golang.org/x/sync/errgroup" ) -func ExampleAmpliconAlignment() { +func Example_ampliconAlignment() { + // This is currently a work-in-progress. Sequencing utilities are under + // development right now. 
+ // + // // Only run function if minimap2 is available _, err := exec.LookPath("minimap2") if err != nil { @@ -48,7 +51,7 @@ func ExampleAmpliconAlignment() { */ var amplicons []Amplicon var templates []fasta.Record - var pcrTm float64 = 50.0 + pcrTm := 50.0 forward := "CCGTGCGACAAGATTTCAAG" reverse := transform.ReverseComplement("CGGATCGAACTTAGGTAGCC") @@ -86,11 +89,10 @@ func ExampleAmpliconAlignment() { ctx := context.Background() errorGroup, ctx := errgroup.WithContext(ctx) - fastqReads := make(chan *fastq.Read) - fastqBarcoded := make(chan *fastq.Read) - fastqTrimmed := make(chan *fastq.Read) - samReads := make(chan *sam.Alignment) - samPrimary := make(chan *sam.Alignment) + fastqReads := make(chan fastq.Read) + fastqBarcoded := make(chan fastq.Read) + samReads := make(chan sam.Alignment) + samPrimary := make(chan sam.Alignment) // Read fastqs into channel errorGroup.Go(func() error { @@ -98,46 +100,26 @@ func ExampleAmpliconAlignment() { }) // Filter the right barcode fastqs from channel - go func() { - bio.FilterData(fastqReads, fastqBarcoded, func(data *fastq.Read) bool { return data.Optionals["barcode"] == barcode }) - }() - - // Trim barcodes errorGroup.Go(func() error { - return nanopore.TrimBarcodeWithChannels(barcode, fastqBarcoded, fastqTrimmed) + return bio.FilterData(ctx, fastqReads, fastqBarcoded, func(data fastq.Read) bool { return data.Optionals["barcode"] == barcode }) }) // Run minimap errorGroup.Go(func() error { - return minimap2.Minimap2Channeled(&buf, fastqTrimmed, samReads) + return minimap2.Minimap2Channeled(&buf, fastqBarcoded, samReads) }) // Sort out primary alignments - go func() { - bio.FilterData(samReads, samPrimary, sam.Primary) - }() + errorGroup.Go(func() error { + return bio.FilterData(ctx, samReads, samPrimary, sam.Primary) + }) // Read all them alignments out into memory var outputAlignments []sam.Alignment for alignment := range samPrimary { - outputAlignments = append(outputAlignments, *alignment) + outputAlignments = 
append(outputAlignments, alignment) } fmt.Println(outputAlignments[0].RNAME) // Output: oligo2 } - -func RunWorkflow(errorGroup *errgroup.Group, functions []func() error) error { - for _, function := range functions { - errorGroup.Go(func() error { - return function() - }) - } - return nil -} - -func ExampleErrgroup() { - - fmt.Println("hi") - // Output: hi -} diff --git a/lib/sequencing/nanopore/example_test.go b/lib/sequencing/nanopore/example_test.go deleted file mode 100644 index c491af6..0000000 --- a/lib/sequencing/nanopore/example_test.go +++ /dev/null @@ -1,30 +0,0 @@ -package nanopore_test - -import ( - "fmt" - - "github.com/koeng101/dnadesign/lib/bio/fastq" - "github.com/koeng101/dnadesign/lib/sequencing/nanopore" -) - -func ExampleTrimBarcode() { - // Here we are trimming the barcode06 barcode. - - // This exactBarcodeMatch has one barcode match in the forward direction and one loose match in the reverse direction - exactBarcodeMatch := fastq.Read{Sequence: "TGTACTTGGTTCAGTTACGTATTGCTAAGGTTAAGACTACTTTCTGCCTTTGCGAGAACAGCGCCTTTCCCACTGTTAGATTGAGCTGAAGACAGTTACAAGCCAGGTGGACAATATGGGTGTCGATGTCCAGCACATCACGCTCGTACAGCGTCTGGGTGCTGCGGTAATTGTTGGCAATAGTCGCGAACGTCGTTTCACCCTCCAGACGGTAGCTGCCCTTACGCATGATATCCAGTTGTTCTTGGCTGATCTCGATCGGATCCTTGAAGATCGTCCACATCACGATCTGGTTACACGGCGGGGTACGTCTTCGTTACACCGGGATCATAGCTAGGTGCTGTTCTCGCAAAGGCGGAAACGCATCTTAAGCCTCACGATGGTA", Identifier: "3e94e6db-07cc-424c-8e9f-81390482bd9c", Quality: `#%'%%%'%&&(*5556>BACBCCEDFIHJFISGHOHLHDFCF>36>;4.,,*9977531//.**+++-'+1(*?EMIBC@DFGHFHKSJHFGE@????FIQEPJKIKSMHDC77779BCBLSGHIISKOHSGGJBAAABFGROSGHJGISGSLEKFKSSSIOFJKEFENFKSFF><<<+++)$%%%&&''(&&))))))88752>@@????AFIIKIIA432327;>887779:>AAHLQIJJJFOSEEGHSSFHDBCFNSLHHSGHFDDCHNKQSHHPSIISSHJJKIIOHNKLHOFCCCB;;=?GHHIH@??@@ESEIFIAFDDC??AA@DEB@731(''&%$$$%''(++,-)))()))),)`} - sequence, _ := nanopore.TrimBarcode("barcode06", exactBarcodeMatch) - - fmt.Println(sequence.Sequence) - // Output: 
TTCTGCCTTTGCGAGAACAGCGCCTTTCCCACTGTTAGATTGAGCTGAAGACAGTTACAAGCCAGGTGGACAATATGGGTGTCGATGTCCAGCACATCACGCTCGTACAGCGTCTGGGTGCTGCGGTAATTGTTGGCAATAGTCGCGAACGTCGTTTCACCCTCCAGACGGTAGCTGCCCTTACGCATGATATCCAGTTGTTCTTGGCTGATCTCGATCGGATCCTTGAAGATCGTCCACATCACGATCTGGTTACACGGCGGGGTACGTCTTCGTTACACCGGGATCATAGCTA -} - -func ExampleTrimImperfectBarcode() { - // Here we are trimming the barcode06 barcode. - - // This imperfectBarcodeMatch has one barcode match in the forward direction and one in the reverse direction. - imperfectBarcodeMatch := fastq.Read{Sequence: "TGTATTCGTACGTTACATATGCTAAGGTTAAGACTACTTTCTGCCTTTTGCAGACGCACTTAGCATCCGTCTAAATCTCGGGAAGACGTCTGGCAGTGTTGGCTGCCTTTGTCGAGGTTAAGGACTACGCAGAAAACACCTACTACAGCAACTTTATCAGCCACCTGGAAAATATCAAATATGCGGGTCAATCCACCGTCCTGCGTGGCCTGGACATCCGGGACATATCTCAAAAACTTGCATCATTATTACAGCTACCTGGGTAGCTTGACCACGCCGCCGTGTACCGAGAATGTGCACTGCTGTCTTCAGCTATGATCCCGGTGTAACAGGTGCTGTTCGCAAAGGCAGAAAGTAGTCTTAACCTTAGCAATACGT", Identifier: "7258a445-8793-466c-8c18-cf9d1cce93f2", Quality: `$$%$###$'%%%%%&+(''''''89:ABFGSHSGGLDCD?9:DC@<.+)()&&&&'''+))))+?>=8540/.-+++,.>>?@BDFEEDCBHKKJHHMMKLEMKKMHEKSSHGFJGFKIGIESMSGDHSRSJEA:888IJIHRHIHGFEI=CB88888KHDABAAE5CSIJFSRJGSSGEGI@>+)((+33;C=<;5212////3:::=FHD@9979998/+`} - sequence, _ := nanopore.TrimBarcode("barcode06", imperfectBarcodeMatch) - - fmt.Println(sequence.Sequence) - // Output: TTCTGCCTTTTGCAGACGCACTTAGCATCCGTCTAAATCTCGGGAAGACGTCTGGCAGTGTTGGCTGCCTTTGTCGAGGTTAAGGACTACGCAGAAAACACCTACTACAGCAACTTTATCAGCCACCTGGAAAATATCAAATATGCGGGTCAATCCACCGTCCTGCGTGGCCTGGACATCCGGGACATATCTCAAAAACTTGCATCATTATTACAGCTACCTGGGTAGCTTGACCACGCCGCCGTGTACCGAGAATGTGCACTGCTGTCTTCAGCTATGATCCCGGTGTAACAG -} diff --git a/lib/sequencing/nanopore/nanopore.go b/lib/sequencing/nanopore/nanopore.go index 86b4529..0fc3c0c 100644 --- a/lib/sequencing/nanopore/nanopore.go +++ b/lib/sequencing/nanopore/nanopore.go @@ -3,21 +3,6 @@ Package nanopore contains data associated with nanopore sequencing. 
*/ package nanopore -import ( - "fmt" - "strings" - - "github.com/koeng101/dnadesign/lib/align" - "github.com/koeng101/dnadesign/lib/align/matrix" - "github.com/koeng101/dnadesign/lib/alphabet" - "github.com/koeng101/dnadesign/lib/bio/fastq" -) - -// ScoreMagicNumber is the score of a Smith Waterman alignment between a -// barcode and sequence used a threshold for whether the barcode exists. -// It was found from me playing around with sequences. -var ScoreMagicNumber = 12 - // NativeBarcode contains the data structure defining a nanopore barcode. // In between Forward and a target DNA sequence is 8bp: CAGCACC followed by a // T, which is used for the TA ligation to the target end-prepped DNA. @@ -126,125 +111,3 @@ var NativeBarcodeMap = map[string]NativeBarcode{ "barcode95": {"CCTGTCTGGAAGAAGAATGGACTT", "AAGTCCATTCTTCTTCCAGACAGG"}, "barcode96": {"CTGAACGGTCATAGAGTCCACCAT", "ATGGTGGACTCTATGACCGTTCAG"}, } - -func TrimBarcodeWithChannels(barcodeName string, fastqInput <-chan *fastq.Read, fastqOutput chan<- *fastq.Read) error { - for { - select { - case data, ok := <-fastqInput: - if !ok { - // If the input channel is closed, we close the output channel and return - close(fastqOutput) - return nil - } - trimmedRead, err := TrimBarcode(barcodeName, *data) - if err != nil { - close(fastqOutput) - return err - } - fastqOutput <- &trimmedRead - } - } - -} - -// TrimBarcode takes a barcodeName and a fastqSequence and returns a trimmed -// barcode. 
-func TrimBarcode(barcodeName string, fastqRead fastq.Read) (fastq.Read, error) { - // Copy into new fastq read - var newFastqRead fastq.Read - newFastqRead.Identifier = fastqRead.Identifier - newFastqRead.Optionals = make(map[string]string) - for key, value := range fastqRead.Optionals { - newFastqRead.Optionals[key] = value - } - - // Get the barcode - barcode, ok := NativeBarcodeMap[barcodeName] - if !ok { - return newFastqRead, fmt.Errorf("barcode %s not found in NativeBarcodeMap.", barcodeName) - } - - // Trim in the forward direction - var sequence string - var quality string - sequence = fastqRead.Sequence - quality = fastqRead.Quality - if len(sequence) > 80 { - score, alignment, err := Align(sequence[:80], barcode.Forward) - if err != nil { - return newFastqRead, err - } - // Empirically from looking around, it seems to me that approximately a - // score of 21 looks like a barcode match. This is almost by definition a - // magic number, so it is defined above as so. - if score >= ScoreMagicNumber { - modifiedAlignment := strings.ReplaceAll(alignment, "-", "") - index := strings.Index(sequence, modifiedAlignment) - // The index needs to be within the first 80 base pairs. Usually the - // native adapter + native barcode is within the first ~70 bp, but we - // put a small buffer. - if index != -1 { - // 7 here is a magic number between the native barcode and your - // target sequence. It's just a number that exists irl, so it is - // not defined above. - newStart := index + 7 - if newStart < len(sequence) { - sequence = sequence[newStart:] - quality = quality[newStart:] - } - } - } - } - - // Now do the same thing with the reverse strand. 
- if len(sequence) > 80 { - score, alignment, err := Align(sequence[len(sequence)-80:], barcode.Reverse) - if err != nil { - return newFastqRead, err - } - if score >= ScoreMagicNumber { - modifiedAlignment := strings.ReplaceAll(alignment, "-", "") - index := strings.Index(sequence, modifiedAlignment) - // This time we need to check within the last 80 base pairs. - if index != -1 { - newEnd := index - 7 - if newEnd < len(sequence) { - sequence = sequence[:newEnd] - quality = quality[:newEnd] - } - } - } - } - newFastqRead.Sequence = sequence - newFastqRead.Quality = quality - return newFastqRead, nil -} - -// Align uses the SmithWaterman algorithm to align a barcode to a sequence. -// It is used because it is a rather simple algorithm, and since barcodes are -// sufficiently small, fast enough. -func Align(sequence string, barcodeSequence string) (int, string, error) { - m := [][]int{ - /* A C G T U */ - /* A */ {1, -1, -1, -1, -1}, - /* C */ {-1, 1, -1, -1, -1}, - /* G */ {-1, -1, 1, -1, -1}, - /* T */ {-1, -1, -1, 1, -1}, - /* U */ {-1, -1, -1, -1, 1}, - } - - alphabet := alphabet.NewAlphabet([]string{"A", "C", "G", "T", "U"}) - subMatrix, err := matrix.NewSubstitutionMatrix(alphabet, alphabet, m) - if err != nil { - return 0, "", err - } - scoring, err := align.NewScoring(subMatrix, -1) - if err != nil { - return 0, "", err - } - score, alignSequence, _, err := align.SmithWaterman(sequence, barcodeSequence, scoring) - if err != nil { - return 0, "", err - } - return score, alignSequence, nil -} diff --git a/lib/synthesis/codon/codon_test.go b/lib/synthesis/codon/codon_test.go index 7f0781e..30fc089 100644 --- a/lib/synthesis/codon/codon_test.go +++ b/lib/synthesis/codon/codon_test.go @@ -58,7 +58,7 @@ func TestOptimize(t *testing.T) { sequence, _ := parser.Next() table := NewTranslationTable(11) - err := table.UpdateWeightsWithSequence(*sequence) + err := table.UpdateWeightsWithSequence(sequence) if err != nil { t.Error(err) } @@ -82,7 +82,7 @@ func 
TestOptimizeSameSeed(t *testing.T) { sequence, _ := parser.Next() optimizationTable := NewTranslationTable(11) - err := optimizationTable.UpdateWeightsWithSequence(*sequence) + err := optimizationTable.UpdateWeightsWithSequence(sequence) if err != nil { t.Error(err) } @@ -105,7 +105,7 @@ func TestOptimizeDifferentSeed(t *testing.T) { sequence, _ := parser.Next() optimizationTable := NewTranslationTable(11) - err := optimizationTable.UpdateWeightsWithSequence(*sequence) + err := optimizationTable.UpdateWeightsWithSequence(sequence) if err != nil { t.Error(err) } @@ -215,7 +215,7 @@ func TestCompromiseCodonTable(t *testing.T) { // weight our codon optimization table using the regions we collected from the genbank file above optimizationTable := NewTranslationTable(11) - err := optimizationTable.UpdateWeightsWithSequence(*sequence) + err := optimizationTable.UpdateWeightsWithSequence(sequence) if err != nil { t.Error(err) } @@ -225,7 +225,7 @@ func TestCompromiseCodonTable(t *testing.T) { parser2 := bio.NewGenbankParser(file2) sequence2, _ := parser2.Next() optimizationTable2 := NewTranslationTable(11) - err = optimizationTable2.UpdateWeightsWithSequence(*sequence2) + err = optimizationTable2.UpdateWeightsWithSequence(sequence2) if err != nil { t.Error(err) } @@ -249,7 +249,7 @@ func TestAddCodonTable(t *testing.T) { // weight our codon optimization table using the regions we collected from the genbank file above optimizationTable := NewTranslationTable(11) - err := optimizationTable.UpdateWeightsWithSequence(*sequence) + err := optimizationTable.UpdateWeightsWithSequence(sequence) if err != nil { t.Error(err) } @@ -259,7 +259,7 @@ func TestAddCodonTable(t *testing.T) { parser2 := bio.NewGenbankParser(file2) sequence2, _ := parser2.Next() optimizationTable2 := NewTranslationTable(11) - err = optimizationTable2.UpdateWeightsWithSequence(*sequence2) + err = optimizationTable2.UpdateWeightsWithSequence(sequence2) if err != nil { t.Error(err) } @@ -289,7 +289,7 @@ func 
TestCapitalizationRegression(t *testing.T) { sequence, _ := parser.Next() optimizationTable := NewTranslationTable(11) - err := optimizationTable.UpdateWeightsWithSequence(*sequence) + err := optimizationTable.UpdateWeightsWithSequence(sequence) if err != nil { t.Error(err) } @@ -313,7 +313,7 @@ func TestOptimizeSequence(t *testing.T) { defer file.Close() parser := bio.NewGenbankParser(file) sequence, _ := parser.Next() - return *sequence + return sequence }() ) diff --git a/lib/synthesis/codon/example_test.go b/lib/synthesis/codon/example_test.go index d9a84e9..7744d86 100644 --- a/lib/synthesis/codon/example_test.go +++ b/lib/synthesis/codon/example_test.go @@ -29,7 +29,7 @@ func ExampleTranslationTable_Optimize() { parser := bio.NewGenbankParser(file) sequence, _ := parser.Next() codonTable := codon.NewTranslationTable(11) - _ = codonTable.UpdateWeightsWithSequence(*sequence) + _ = codonTable.UpdateWeightsWithSequence(sequence) // Here, we double check if the number of genes is equal to the number of stop codons stopCodonCount := 0 @@ -88,7 +88,7 @@ func ExampleCompromiseCodonTable() { // weight our codon optimization table using the regions we collected from the genbank file above optimizationTable := codon.NewTranslationTable(11) - err := optimizationTable.UpdateWeightsWithSequence(*sequence) + err := optimizationTable.UpdateWeightsWithSequence(sequence) if err != nil { panic(fmt.Errorf("got unexpected error in an example: %w", err)) } @@ -99,7 +99,7 @@ func ExampleCompromiseCodonTable() { sequence2, _ := parser2.Next() optimizationTable2 := codon.NewTranslationTable(11) - err = optimizationTable2.UpdateWeightsWithSequence(*sequence2) + err = optimizationTable2.UpdateWeightsWithSequence(sequence2) if err != nil { panic(fmt.Errorf("got unexpected error in an example: %w", err)) } @@ -123,7 +123,7 @@ func ExampleAddCodonTable() { // weight our codon optimization table using the regions we collected from the genbank file above optimizationTable := 
codon.NewTranslationTable(11) - err := optimizationTable.UpdateWeightsWithSequence(*sequence) + err := optimizationTable.UpdateWeightsWithSequence(sequence) if err != nil { panic(fmt.Errorf("got unexpected error in an example: %w", err)) } @@ -134,7 +134,7 @@ func ExampleAddCodonTable() { sequence2, _ := parser2.Next() optimizationTable2 := codon.NewTranslationTable(11) - err = optimizationTable2.UpdateWeightsWithSequence(*sequence2) + err = optimizationTable2.UpdateWeightsWithSequence(sequence2) if err != nil { panic(fmt.Errorf("got unexpected error in an example: %w", err)) } From 059b96f4a56b3fb7e68a788c601c1d74f1e7da38 Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Fri, 19 Jan 2024 15:40:29 -0800 Subject: [PATCH 8/9] add some docs to megamash --- lib/align/megamash/megamash.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/lib/align/megamash/megamash.go b/lib/align/megamash/megamash.go index 86e12d5..408486b 100644 --- a/lib/align/megamash/megamash.go +++ b/lib/align/megamash/megamash.go @@ -17,7 +17,8 @@ import ( "github.com/koeng101/dnadesign/lib/transform" ) -// StandardizedDNA returns the a standard DNA string +// StandardizedDNA returns the alphabetically lesser strand of a double +// stranded DNA molecule. func StandardizedDNA(sequence string) string { var deterministicSequence string reverseComplement := transform.ReverseComplement(sequence) @@ -43,6 +44,7 @@ type MegamashMap struct { Threshold float64 } +// NewMegamashMap creates a megamash map that can be searched against. func NewMegamashMap(sequences []fasta.Record, kmerSize uint, kmerMinimalCount uint, threshold float64) (MegamashMap, error) { var megamashMap MegamashMap megamashMap.KmerSize = kmerSize @@ -95,11 +97,14 @@ func NewMegamashMap(sequences []fasta.Record, kmerSize uint, kmerMinimalCount ui return megamashMap, nil } +// Match contains the identifier and score of a potential match to the searched +// sequence. 
type Match struct { Identifier string Score float64 } +// Match matches a sequence to all the sequences in a megamash map. func (m *MegamashMap) Match(sequence string) []Match { var scores []float64 // The algorithm is as follows: @@ -146,6 +151,7 @@ out: return matches } +// FastqMatchChannel processes a channel of fastq.Read and pushes to a channel of matches. func (m *MegamashMap) FastqMatchChannel(ctx context.Context, sequences <-chan fastq.Read, matches chan<- []Match) error { for { select { From ab6d69582c05695224656abbd3d1c170820bdac7 Mon Sep 17 00:00:00 2001 From: Keoni Gandall Date: Fri, 19 Jan 2024 15:45:00 -0800 Subject: [PATCH 9/9] Added changelog --- README.md | 2 ++ 1 file changed, 2 insertions(+) diff --git a/README.md b/README.md index 3e0743c..e2bb202 100644 --- a/README.md +++ b/README.md @@ -71,6 +71,8 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/), and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html). ## [Unreleased] +- Added the megamash algorithm [#50](https://github.com/Koeng101/dnadesign/pull/50) +- Changed parsers to return values instead of pointers. Added some sequencing utils [#49](https://github.com/Koeng101/dnadesign/pull/49) - Added minimap2 and samtools(pileup) integrations in external [#46](https://github.com/Koeng101/dnadesign/pull/46) - Added sam parser [#5](https://github.com/Koeng101/dnadesign/pull/5) - Added the LinearFold folding algorithms [#38](https://github.com/Koeng101/dnadesign/pull/38)