From 38519880dc9feb866f9ff1051b30c12c5346e8e3 Mon Sep 17 00:00:00 2001 From: Koeng101 Date: Sat, 1 Apr 2023 02:08:34 +0200 Subject: [PATCH] Add fastq parser (#300) * Add fastq parser * thank you matiasinsaurralde for speedups! * 100% test coverage --------- Co-authored-by: Timothy Stiles --- io/fastq/data/nanosavseq.fastq | 16 ++ io/fastq/data/nanosavseq.fastq.gz | Bin 0 -> 2312 bytes io/fastq/data/nanosavseq_emptyseq.fastq | 16 ++ io/fastq/data/nanosavseq_noidentifier.fastq | 16 ++ io/fastq/data/nanosavseq_noplus.fastq | 14 + io/fastq/data/nanosavseq_noquality.fastq | 13 + io/fastq/data/nanosavseq_noquality2.fastq | 15 ++ io/fastq/data/nanosavseq_noseq.fastq | 13 + io/fastq/example_test.go | 64 +++++ io/fastq/fastq.go | 284 ++++++++++++++++++++ io/fastq/fastq_test.go | 66 +++++ 11 files changed, 517 insertions(+) create mode 100644 io/fastq/data/nanosavseq.fastq create mode 100644 io/fastq/data/nanosavseq.fastq.gz create mode 100644 io/fastq/data/nanosavseq_emptyseq.fastq create mode 100644 io/fastq/data/nanosavseq_noidentifier.fastq create mode 100644 io/fastq/data/nanosavseq_noplus.fastq create mode 100644 io/fastq/data/nanosavseq_noquality.fastq create mode 100644 io/fastq/data/nanosavseq_noquality2.fastq create mode 100644 io/fastq/data/nanosavseq_noseq.fastq create mode 100644 io/fastq/example_test.go create mode 100644 io/fastq/fastq.go create mode 100644 io/fastq/fastq_test.go diff --git a/io/fastq/data/nanosavseq.fastq b/io/fastq/data/nanosavseq.fastq new file mode 100644 index 00000000..c1d451c9 --- /dev/null +++ b/io/fastq/data/nanosavseq.fastq @@ -0,0 +1,16 @@ +@e3cc70d5-90ef-49b6-bbe1-cfef99537d73 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13956 ch=53 start_time=2020-11-11T01:49:01Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT ++ +$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;9'04;:EB91*999211%;9/7<:=(,%%%)7<@9(+--5/679;:9AA.0024139)&'%?AI@;=91374(--6=773;3445))1*/459:EDB>=A4446+))&&$&'"##%##)+&-/3';;.685&8>24+'#&+++78:;9643*)&&'19+$%$,%%#')8<?LQKGBFGHCHD>AA=664741&)*%%.0'-'%&&%$(55:7=9@B@==A>:<7*$&,,14>@<,-9>54/-D==9:>?&'&'('),/%&%&&(&#$%''.)&,)*&(.)))%)*5;>7..+*%%$$'479<54*<<:''%($',*8:;?BA/-(,&0769963,,*/644&%)&&&++.++.--,)%%& +@60907b6b-5e38-498e-9c07-f036ebd8c658 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13962 ch=53 start_time=2020-11-11T01:49:07Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +AGTGTGCTTCGTTCAGTTGCGTATTGCTACTTACTAACGAGGTCTTCCTTGTATGTTGAGTGAGAGCGGTGAACCAAGACGCAGTATTATTGGGTAAACCTTAGGCCGACGTTGTTTTGATCGCGCCCCACTGCCGTTCTCCATTCTGGTTGCACCAGTTGAATCTGAGGTCCACCAAACGTAATGCGGGGTGCATTTCGCTGATTTGGGGTCCATTATCAGACATTTTAGTTTGTTCGTTTAGATGAAATCTAAAAACAACACGAACGTCATGATACTCTAAAAAGTCTTCATAGAACAGACAACGCTACAGACTACCCAATTTGGGTTCCTGGCAATTAATTGTAAAAAAGGTAAGCGAGCAACCTCATTCTCGATATCGATGTACTGAATACGTTGATTTAGAACCAGCCTCAACTCCGTCTTGCGACAATATGCGCTT ++ +%')'++124>AB,93;36-,,-//3695.00046+,)*)%%324>B;../%,&..85(''/-?;:8:7;AEFIJE3=9:7942:><11&*+969796+-0&'&))+-2952;?:HLC86:;:-%%%%%%+'%$')67:1160()'$&$'&0$#$#%'*7>BCA?>01&51&&5$$D?AG?7<3))';7-**@?BBD)8:-(%111/.027=?ADD13=JLAE==?B6$((+77;+''-%&&&.79;-8/.87<;794---,6558622%#$%(67,37*%+**+ +@990e110e-5e50-41a2-8ad5-92044d4465b8 runid=99790f25859e24307203c25273f3a8be8283e7eb read=15011 ch=52 start_time=2020-11-11T01:49:18Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GGTATACTTCGTTCAGTTACGTATTGCTCAAGACGGAGTTGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGATAGTGCCACGGTTTCCTGTTTTCGCACACATTGTTACCAGAACCTAAGTGGGTAGTCTTGTAGTGCGTTGTTCGTTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTGCTGAATTTCATCTAAGCGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAACGAAATGCGCCCCGCGTGCGTTTGTTGGTTCCCTCAGATTCGCTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAGCATCGGCCCCAAGGTTTGCCAATAATACTGCGTCTTGGTTCCACCGCTCTCACTCAACATGGCAAGGAAGACCTCGTTAGTAGTAGCAATACGTAACC ++ +%&&%$&(()')23-+.:)''.10355=DEAE@E;--&+:<115924-0CJ:<>DE?=6,.+,.//0*123,*7//&'&'%#&$$)*631-/0&%($),&%)(+/0-/29;88=;8EGFHFJFEFFFB===C@?;((426?=<5&':;<8&8()%76:?5.'2-,'()/&20.-3>8+$#'&.1186EBA@B>C;:-/)+...0-+%1-/3*&.)$'(&'$'1&,466663)5+<6)++,.7999;;92;9:977$+61)-124.5970<,8=:-.1--,+'*++(-***,,12@??9:2/61-)&## diff --git a/io/fastq/data/nanosavseq.fastq.gz b/io/fastq/data/nanosavseq.fastq.gz new file mode 100644 index 0000000000000000000000000000000000000000..986183fd9bfbd3578f4c43757beaf2ba0f02a582 GIT binary patch literal 2312 zcmV+j3HSCNiwFp;^(ABg18!k%Z*yUGb7gTZW?^%5aR9BC+mhV25r*&e6simf5F|n3 zKmY{K7|twLYgMX}@G3W|J#KM{&N56%eyb%|8N-v6O3{WubuI(u^r=ofBpRa zTYW_wWAk>Elm7`hh>+fB5*f`=7<@&-Xw4`2ONA zeSk}!?|=S$`ImULxqaKWbz9eM-`CZ_JA8BS&8@rJor8-owyn;saMjP<;huekUl`i2 zcRA-^3QTwY%kHOdFvHIe)8RTybTAbLwk?dn5G?NpH-A4&_jkjUxW1nd{<$>_xiB;Q z@k_>8u&RIe%kCkFJ?!P9_dEJU;>~^wSZ|N|=-+hfvczCn>R7a4D&e&Dv=gM$VgNo^{29(b86o z^O}+@IV4%#2$45^H*z7SdC^ukU8Qi_DM4~=)YvskFNH+taKIH+CT}2HQZUq+!BC~x zU(ef7@R}95u{El-l5IvvK?)(YRz-$!0w+a9hEPr`dVv2HNU5X*4)Bm*RFb|Q)?sN> zLk{@WVucYc@PIyO)zU`jN7_C<8T!4OV36c3NqX@$)dnAaDVfUExDqpuf&fgt{$g#zB{1)M&$Knj7ya2M)ApF267pWP>6=te{lY+E_q=b0BBYz)e7!baro+ezaVgCZkl3#>ieajUt5SHR!85 zpN`Y$nx@eOAqXW1|LJf@4+mfmttIaJO1BENlW07lilQ)W*MoE=Z~|d&3kVrtgU5?8*x~UsXo}jYi&9n?9wUcUUIyfg;DT>8Sr*X91G$V{~hDix{gGDlQ zNK%X|0=mypRuKZc%2hv^sw#3%2HY4ss3sqV4&yXSaG67L^U>8bD-ko=S}n^muXWR) z6!L}qD|k^_k~BrH7R@X(5BhD@o??*%Z-NJ(&9`Qhe@XSYT?fVN80wR3HUZbOl65m24jqN;%HxY~al|{iDy?C8qmV&E_ zQg;C^FJ{NZq=V6MaXFvi9zVaJAZ8j!88ja87Z(V8_Gc92zpi^s@ddX2Bm!P$721%$rj*wm@Bmf6x87;DmERYTt zjAU6>&hvm0zK)ZD*#!3n?A0Yj(2Z|`Q+Su-oCq?mtjc3 zRM|31imWJU0_Fm7ux`POWrvjZxE{c>x{-+7`~)1rVYbZTP*Q?Y-}21Z$cnz_NPyod{9jy?%oS;q4#Q zzHGrRe1ni*SP(D!ElT;tg8YvIaldpRUT&TVq1E~7K!SCTwLR>xx)=MgeRUr}cH6I7 z4;O0ji}Sepjx9>?tLfN&tLcbc#Fb=z+iks~CK6$N@g3X$vmIC85iQ%bCHmHOZ0ptI z1YZ8vd%T#BZ|z6uWL_+Y4=DCEzi=R{$A>StARmH>khG{e0#%bU=t0I4RP^QCQ;LcX zfS{~sC|mCax46@d6Yh9)Cr6PSj8T16l2U;APJjr|oM2RCg`T0>mQ~4M2k0&kD2Ei7 z6-iR$bqzg&Q6Sx_2LEn4W6B&Hh@v!u2NVoZMwR5s^i2;I09uiG)MZta(44TG3t8r- zu6@x@53o2i5t37Av7nL@@K6N$97bzl`?ekX_PpKP-rU`sZ*FdYApnuNhaODmc^nkN zT|cx4x{1{2ZdMf*w1h}&Bv?h|)E3DB(t$?qbaQh3)avumEyrmE0z#KoK?g;P6gHPR zsz`z}gcHnB*=YEYL=}T31;lN@!1cWqy@i&|B!%XL0=6sYbJ$R}V@E3vuqbd|6dCp1 iW68xZL!U6W)wImB?L@6PG&G09tN#EIPgmyp5C8xf!*7xR literal 0 HcmV?d00001 diff --git a/io/fastq/data/nanosavseq_emptyseq.fastq b/io/fastq/data/nanosavseq_emptyseq.fastq new file mode 100644 index 00000000..cf6c2494 --- /dev/null +++ b/io/fastq/data/nanosavseq_emptyseq.fastq @@ -0,0 +1,16 @@ +@e3cc70d5-90ef-49b6-bbe1-cfef99537d73 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13956 ch=53 start_time=2020-11-11T01:49:01Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT ++ +$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;9'04;:EB91*999211%;9/7<:=(,%%%)7<@9(+--5/679;:9AA.0024139)&'%?AI@;=91374(--6=773;3445))1*/459:EDB>=A4446+))&&$&'"##%##)+&-/3';;.685&8>24+'#&+++78:;9643*)&&'19+$%$,%%#')8<?LQKGBFGHCHD>AA=664741&)*%%.0'-'%&&%$(55:7=9@B@==A>:<7*$&,,14>@<,-9>54/-D==9:>?&'&'('),/%&%&&(&#$%''.)&,)*&(.)))%)*5;>7..+*%%$$'479<54*<<:''%($',*8:;?BA/-(,&0769963,,*/644&%)&&&++.++.--,)%%& +@60907b6b-5e38-498e-9c07-f036ebd8c658 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13962 ch=53 start_time=2020-11-11T01:49:07Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +AGTGTGCTTCGTTCAGTTGCGTATTGCTACTTACTAACGAGGTCTTCCTTGTATGTTGAGTGAGAGCGGTGAACCAAGACGCAGTATTATTGGGTAAACCTTAGGCCGACGTTGTTTTGATCGCGCCCCACTGCCGTTCTCCATTCTGGTTGCACCAGTTGAATCTGAGGTCCACCAAACGTAATGCGGGGTGCATTTCGCTGATTTGGGGTCCATTATCAGACATTTTAGTTTGTTCGTTTAGATGAAATCTAAAAACAACACGAACGTCATGATACTCTAAAAAGTCTTCATAGAACAGACAACGCTACAGACTACCCAATTTGGGTTCCTGGCAATTAATTGTAAAAAAGGTAAGCGAGCAACCTCATTCTCGATATCGATGTACTGAATACGTTGATTTAGAACCAGCCTCAACTCCGTCTTGCGACAATATGCGCTT ++ +%')'++124>AB,93;36-,,-//3695.00046+,)*)%%324>B;../%,&..85(''/-?;:8:7;AEFIJE3=9:7942:><11&*+969796+-0&'&))+-2952;?:HLC86:;:-%%%%%%+'%$')67:1160()'$&$'&0$#$#%'*7>BCA?>01&51&&5$$D?AG?7<3))';7-**@?BBD)8:-(%111/.027=?ADD13=JLAE==?B6$((+77;+''-%&&&.79;-8/.87<;794---,6558622%#$%(67,37*%+**+ +@990e110e-5e50-41a2-8ad5-92044d4465b8 runid=99790f25859e24307203c25273f3a8be8283e7eb read=15011 ch=52 start_time=2020-11-11T01:49:18Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 + ++ +%&&%$&(()')23-+.:)''.10355=DEAE@E;--&+:<115924-0CJ:<>DE?=6,.+,.//0*123,*7//&'&'%#&$$)*631-/0&%($),&%)(+/0-/29;88=;8EGFHFJFEFFFB===C@?;((426?=<5&':;<8&8()%76:?5.'2-,'()/&20.-3>8+$#'&.1186EBA@B>C;:-/)+...0-+%1-/3*&.)$'(&'$'1&,466663)5+<6)++,.7999;;92;9:977$+61)-124.5970<,8=:-.1--,+'*++(-***,,12@??9:2/61-)&## diff --git a/io/fastq/data/nanosavseq_noidentifier.fastq b/io/fastq/data/nanosavseq_noidentifier.fastq new file mode 100644 index 00000000..64a72914 --- /dev/null +++ b/io/fastq/data/nanosavseq_noidentifier.fastq @@ -0,0 +1,16 @@ +@e3cc70d5-90ef-49b6-bbe1-cfef99537d73 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13956 ch=53 start_time=2020-11-11T01:49:01Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT ++ +$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;9'04;:EB91*999211%;9/7<:=(,%%%)7<@9(+--5/679;:9AA.0024139)&'%?AI@;=91374(--6=773;3445))1*/459:EDB>=A4446+))&&$&'"##%##)+&-/3';;.685&8>24+'#&+++78:;9643*)&&'19+$%$,%%#')8<?LQKGBFGHCHD>AA=664741&)*%%.0'-'%&&%$(55:7=9@B@==A>:<7*$&,,14>@<,-9>54/-D==9:>?&'&'('),/%&%&&(&#$%''.)&,)*&(.)))%)*5;>7..+*%%$$'479<54*<<:''%($',*8:;?BA/-(,&0769963,,*/644&%)&&&++.++.--,)%%& +@60907b6b-5e38-498e-9c07-f036ebd8c658 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13962 ch=53 start_time=2020-11-11T01:49:07Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +AGTGTGCTTCGTTCAGTTGCGTATTGCTACTTACTAACGAGGTCTTCCTTGTATGTTGAGTGAGAGCGGTGAACCAAGACGCAGTATTATTGGGTAAACCTTAGGCCGACGTTGTTTTGATCGCGCCCCACTGCCGTTCTCCATTCTGGTTGCACCAGTTGAATCTGAGGTCCACCAAACGTAATGCGGGGTGCATTTCGCTGATTTGGGGTCCATTATCAGACATTTTAGTTTGTTCGTTTAGATGAAATCTAAAAACAACACGAACGTCATGATACTCTAAAAAGTCTTCATAGAACAGACAACGCTACAGACTACCCAATTTGGGTTCCTGGCAATTAATTGTAAAAAAGGTAAGCGAGCAACCTCATTCTCGATATCGATGTACTGAATACGTTGATTTAGAACCAGCCTCAACTCCGTCTTGCGACAATATGCGCTT ++ +%')'++124>AB,93;36-,,-//3695.00046+,)*)%%324>B;../%,&..85(''/-?;:8:7;AEFIJE3=9:7942:><11&*+969796+-0&'&))+-2952;?:HLC86:;:-%%%%%%+'%$')67:1160()'$&$'&0$#$#%'*7>BCA?>01&51&&5$$D?AG?7<3))';7-**@?BBD)8:-(%111/.027=?ADD13=JLAE==?B6$((+77;+''-%&&&.79;-8/.87<;794---,6558622%#$%(67,37*%+**+ +990e110e-5e50-41a2-8ad5-92044d4465b8 runid=99790f25859e24307203c25273f3a8be8283e7eb read=15011 ch=52 start_time=2020-11-11T01:49:18Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GGTATACTTCGTTCAGTTACGTATTGCTCAAGACGGAGTTGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGATAGTGCCACGGTTTCCTGTTTTCGCACACATTGTTACCAGAACCTAAGTGGGTAGTCTTGTAGTGCGTTGTTCGTTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTGCTGAATTTCATCTAAGCGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAACGAAATGCGCCCCGCGTGCGTTTGTTGGTTCCCTCAGATTCGCTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAGCATCGGCCCCAAGGTTTGCCAATAATACTGCGTCTTGGTTCCACCGCTCTCACTCAACATGGCAAGGAAGACCTCGTTAGTAGTAGCAATACGTAACC ++ +%&&%$&(()')23-+.:)''.10355=DEAE@E;--&+:<115924-0CJ:<>DE?=6,.+,.//0*123,*7//&'&'%#&$$)*631-/0&%($),&%)(+/0-/29;88=;8EGFHFJFEFFFB===C@?;((426?=<5&':;<8&8()%76:?5.'2-,'()/&20.-3>8+$#'&.1186EBA@B>C;:-/)+...0-+%1-/3*&.)$'(&'$'1&,466663)5+<6)++,.7999;;92;9:977$+61)-124.5970<,8=:-.1--,+'*++(-***,,12@??9:2/61-)&## diff --git a/io/fastq/data/nanosavseq_noplus.fastq b/io/fastq/data/nanosavseq_noplus.fastq new file mode 100644 index 00000000..8e9e6dc3 --- /dev/null +++ b/io/fastq/data/nanosavseq_noplus.fastq @@ -0,0 +1,14 @@ +@e3cc70d5-90ef-49b6-bbe1-cfef99537d73 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13956 ch=53 start_time=2020-11-11T01:49:01Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT ++ +$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;9'04;:EB91*999211%;9/7<:=(,%%%)7<@9(+--5/679;:9AA.0024139)&'%?AI@;=91374(--6=773;3445))1*/459:EDB>=A4446+))&&$&'"##%##)+&-/3';;.685&8>24+'#&+++78:;9643*)&&'19+$%$,%%#')8<?LQKGBFGHCHD>AA=664741&)*%%.0'-'%&&%$(55:7=9@B@==A>:<7*$&,,14>@<,-9>54/-D==9:>?&'&'('),/%&%&&(&#$%''.)&,)*&(.)))%)*5;>7..+*%%$$'479<54*<<:''%($',*8:;?BA/-(,&0769963,,*/644&%)&&&++.++.--,)%%& +@60907b6b-5e38-498e-9c07-f036ebd8c658 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13962 ch=53 start_time=2020-11-11T01:49:07Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +AGTGTGCTTCGTTCAGTTGCGTATTGCTACTTACTAACGAGGTCTTCCTTGTATGTTGAGTGAGAGCGGTGAACCAAGACGCAGTATTATTGGGTAAACCTTAGGCCGACGTTGTTTTGATCGCGCCCCACTGCCGTTCTCCATTCTGGTTGCACCAGTTGAATCTGAGGTCCACCAAACGTAATGCGGGGTGCATTTCGCTGATTTGGGGTCCATTATCAGACATTTTAGTTTGTTCGTTTAGATGAAATCTAAAAACAACACGAACGTCATGATACTCTAAAAAGTCTTCATAGAACAGACAACGCTACAGACTACCCAATTTGGGTTCCTGGCAATTAATTGTAAAAAAGGTAAGCGAGCAACCTCATTCTCGATATCGATGTACTGAATACGTTGATTTAGAACCAGCCTCAACTCCGTCTTGCGACAATATGCGCTT ++ +%')'++124>AB,93;36-,,-//3695.00046+,)*)%%324>B;../%,&..85(''/-?;:8:7;AEFIJE3=9:7942:><11&*+969796+-0&'&))+-2952;?:HLC86:;:-%%%%%%+'%$')67:1160()'$&$'&0$#$#%'*7>BCA?>01&51&&5$$D?AG?7<3))';7-**@?BBD)8:-(%111/.027=?ADD13=JLAE==?B6$((+77;+''-%&&&.79;-8/.87<;794---,6558622%#$%(67,37*%+**+ +@990e110e-5e50-41a2-8ad5-92044d4465b8 runid=99790f25859e24307203c25273f3a8be8283e7eb read=15011 ch=52 start_time=2020-11-11T01:49:18Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GGTATACTTCGTTCAGTTACGTATTGCTCAAGACGGAGTTGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGATAGTGCCACGGTTTCCTGTTTTCGCACACATTGTTACCAGAACCTAAGTGGGTAGTCTTGTAGTGCGTTGTTCGTTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTGCTGAATTTCATCTAAGCGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAACGAAATGCGCCCCGCGTGCGTTTGTTGGTTCCCTCAGATTCGCTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAGCATCGGCCCCAAGGTTTGCCAATAATACTGCGTCTTGGTTCCACCGCTCTCACTCAACATGGCAAGGAAGACCTCGTTAGTAGTAGCAATACGTAACC diff --git a/io/fastq/data/nanosavseq_noquality.fastq b/io/fastq/data/nanosavseq_noquality.fastq new file mode 100644 index 00000000..cefeab0e --- /dev/null +++ b/io/fastq/data/nanosavseq_noquality.fastq @@ -0,0 +1,13 @@ +@e3cc70d5-90ef-49b6-bbe1-cfef99537d73 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13956 ch=53 start_time=2020-11-11T01:49:01Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT ++ +$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;AB,93;36-,,-//3695.00046+,)*)%%324>B;../%,&..85(''/-?;:8:7;AEFIJE3=9:7942:><11&*+969796+-0&'&))+-2952;?:HLC86:;:-%%%%%%+'%$')67:1160()'$&$'&0$#$#%'*7>BCA?>01&51&&5$$D?AG?7<3))';7-**@?BBD)8:-(%111/.027=?ADD13=JLAE==?B6$((+77;+''-%&&&.79;-8/.87<;794---,6558622%#$%(67,37*%+**+ +@990e110e-5e50-41a2-8ad5-92044d4465b8 runid=99790f25859e24307203c25273f3a8be8283e7eb read=15011 ch=52 start_time=2020-11-11T01:49:18Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 diff --git a/io/fastq/data/nanosavseq_noquality2.fastq b/io/fastq/data/nanosavseq_noquality2.fastq new file mode 100644 index 00000000..9cb5fe9e --- /dev/null +++ b/io/fastq/data/nanosavseq_noquality2.fastq @@ -0,0 +1,15 @@ +@e3cc70d5-90ef-49b6-bbe1-cfef99537d73 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13956 ch=53 start_time=2020-11-11T01:49:01Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT ++ +$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;9'04;:EB91*999211%;9/7<:=(,%%%)7<@9(+--5/679;:9AA.0024139)&'%?AI@;=91374(--6=773;3445))1*/459:EDB>=A4446+))&&$&'"##%##)+&-/3';;.685&8>24+'#&+++78:;9643*)&&'19+$%$,%%#')8<?LQKGBFGHCHD>AA=664741&)*%%.0'-'%&&%$(55:7=9@B@==A>:<7*$&,,14>@<,-9>54/-D==9:>?&'&'('),/%&%&&(&#$%''.)&,)*&(.)))%)*5;>7..+*%%$$'479<54*<<:''%($',*8:;?BA/-(,&0769963,,*/644&%)&&&++.++.--,)%%& +@60907b6b-5e38-498e-9c07-f036ebd8c658 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13962 ch=53 start_time=2020-11-11T01:49:07Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +AGTGTGCTTCGTTCAGTTGCGTATTGCTACTTACTAACGAGGTCTTCCTTGTATGTTGAGTGAGAGCGGTGAACCAAGACGCAGTATTATTGGGTAAACCTTAGGCCGACGTTGTTTTGATCGCGCCCCACTGCCGTTCTCCATTCTGGTTGCACCAGTTGAATCTGAGGTCCACCAAACGTAATGCGGGGTGCATTTCGCTGATTTGGGGTCCATTATCAGACATTTTAGTTTGTTCGTTTAGATGAAATCTAAAAACAACACGAACGTCATGATACTCTAAAAAGTCTTCATAGAACAGACAACGCTACAGACTACCCAATTTGGGTTCCTGGCAATTAATTGTAAAAAAGGTAAGCGAGCAACCTCATTCTCGATATCGATGTACTGAATACGTTGATTTAGAACCAGCCTCAACTCCGTCTTGCGACAATATGCGCTT ++ +%')'++124>AB,93;36-,,-//3695.00046+,)*)%%324>B;../%,&..85(''/-?;:8:7;AEFIJE3=9:7942:><11&*+969796+-0&'&))+-2952;?:HLC86:;:-%%%%%%+'%$')67:1160()'$&$'&0$#$#%'*7>BCA?>01&51&&5$$D?AG?7<3))';7-**@?BBD)8:-(%111/.027=?ADD13=JLAE==?B6$((+77;+''-%&&&.79;-8/.87<;794---,6558622%#$%(67,37*%+**+ +@990e110e-5e50-41a2-8ad5-92044d4465b8 runid=99790f25859e24307203c25273f3a8be8283e7eb read=15011 ch=52 start_time=2020-11-11T01:49:18Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GGTATACTTCGTTCAGTTACGTATTGCTCAAGACGGAGTTGAGGCTGGTTCTAAATCACCCATTCAGTACATCGATATCGATAGTGCCACGGTTTCCTGTTTTCGCACACATTGTTACCAGAACCTAAGTGGGTAGTCTTGTAGTGCGTTGTTCGTTTCTATGAAGACTTTTTAGAGTATCATGACGTTCGTGTGCTGAATTTCATCTAAGCGAACAAACTAAAATGTCTGATAATGGACCCCAAAATCAACGAAATGCGCCCCGCGTGCGTTTGTTGGTTCCCTCAGATTCGCTGGCAGTAACCAGAATGGAGAACGCAGTGGGGCGCGATCAAAACAGCATCGGCCCCAAGGTTTGCCAATAATACTGCGTCTTGGTTCCACCGCTCTCACTCAACATGGCAAGGAAGACCTCGTTAGTAGTAGCAATACGTAACC ++ diff --git a/io/fastq/data/nanosavseq_noseq.fastq b/io/fastq/data/nanosavseq_noseq.fastq new file mode 100644 index 00000000..491749c4 --- /dev/null +++ b/io/fastq/data/nanosavseq_noseq.fastq @@ -0,0 +1,13 @@ +@e3cc70d5-90ef-49b6-bbe1-cfef99537d73 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13956 ch=53 start_time=2020-11-11T01:49:01Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT ++ +$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;9'04;:EB91*999211%;9/7<:=(,%%%)7<@9(+--5/679;:9AA.0024139)&'%?AI@;=91374(--6=773;3445))1*/459:EDB>=A4446+))&&$&'"##%##)+&-/3';;.685&8>24+'#&+++78:;9643*)&&'19+$%$,%%#')8<?LQKGBFGHCHD>AA=664741&)*%%.0'-'%&&%$(55:7=9@B@==A>:<7*$&,,14>@<,-9>54/-D==9:>?&'&'('),/%&%&&(&#$%''.)&,)*&(.)))%)*5;>7..+*%%$$'479<54*<<:''%($',*8:;?BA/-(,&0769963,,*/644&%)&&&++.++.--,)%%& +@60907b6b-5e38-498e-9c07-f036ebd8c658 runid=99790f25859e24307203c25273f3a8be8283e7eb read=13962 ch=53 start_time=2020-11-11T01:49:07Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 +AGTGTGCTTCGTTCAGTTGCGTATTGCTACTTACTAACGAGGTCTTCCTTGTATGTTGAGTGAGAGCGGTGAACCAAGACGCAGTATTATTGGGTAAACCTTAGGCCGACGTTGTTTTGATCGCGCCCCACTGCCGTTCTCCATTCTGGTTGCACCAGTTGAATCTGAGGTCCACCAAACGTAATGCGGGGTGCATTTCGCTGATTTGGGGTCCATTATCAGACATTTTAGTTTGTTCGTTTAGATGAAATCTAAAAACAACACGAACGTCATGATACTCTAAAAAGTCTTCATAGAACAGACAACGCTACAGACTACCCAATTTGGGTTCCTGGCAATTAATTGTAAAAAAGGTAAGCGAGCAACCTCATTCTCGATATCGATGTACTGAATACGTTGATTTAGAACCAGCCTCAACTCCGTCTTGCGACAATATGCGCTT ++ +%')'++124>AB,93;36-,,-//3695.00046+,)*)%%324>B;../%,&..85(''/-?;:8:7;AEFIJE3=9:7942:><11&*+969796+-0&'&))+-2952;?:HLC86:;:-%%%%%%+'%$')67:1160()'$&$'&0$#$#%'*7>BCA?>01&51&&5$$D?AG?7<3))';7-**@?BBD)8:-(%111/.027=?ADD13=JLAE==?B6$((+77;+''-%&&&.79;-8/.87<;794---,6558622%#$%(67,37*%+**+ +@990e110e-5e50-41a2-8ad5-92044d4465b8 runid=99790f25859e24307203c25273f3a8be8283e7eb read=15011 ch=52 start_time=2020-11-11T01:49:18Z flow_cell_id=AEI083 protocol_group_id=NanoSav2 sample_id=nanosavseq2 diff --git a/io/fastq/example_test.go b/io/fastq/example_test.go new file mode 100644 index 00000000..498b1e38 --- /dev/null +++ b/io/fastq/example_test.go @@ -0,0 +1,64 @@ +package fastq_test + +import ( + _ "embed" + "fmt" + "os" + "strings" + + "github.com/TimothyStiles/poly/io/fastq" +) + +//go:embed data/nanosavseq.fastq +var baseFastq string + +// ExampleRead shows basic usage for Read. +func ExampleRead() { + fastqs, _ := fastq.Read("data/nanosavseq.fastq") + fmt.Println(fastqs[0].Identifier) + //Output: + //e3cc70d5-90ef-49b6-bbe1-cfef99537d73 +} + +// ExampleReadGz shows basic usage for ReadGz. +func ExampleReadGz() { + fastqs, _ := fastq.ReadGz("data/nanosavseq.fastq.gz") + fmt.Println(fastqs[0].Identifier) + //Output: + //e3cc70d5-90ef-49b6-bbe1-cfef99537d73 +} + +// ExampleWrite shows basic usage of the writer. +func ExampleWrite() { + fastqs, _ := fastq.Read("data/nanosavseq.fastq") // get example data + _ = fastq.Write(fastqs, "data/test.fastq") // write it out again + testSequence, _ := fastq.Read("data/test.fastq") // read it in again + + os.Remove("data/test.fastq") // getting rid of test file + + fmt.Println(testSequence[0].Identifier) + fmt.Println(testSequence[0].Sequence) + fmt.Println(testSequence[0].Quality) + //Output: + //e3cc70d5-90ef-49b6-bbe1-cfef99537d73 + //GATGTGCGCCGTTCCAGTTGCGACGTACTATAATCCCCGGCAACACGGTGCTGATTCTCTTCCTGTTCCAGAAAGCATAAACAGATGCAAGTCTGGTGTGATTAACTTCACCAAAGGGCTGGTTGTAATATTAGGAAATCTAACAATAGATTCTGTTGGTTGGACTCTAAAATTAGAAATTTGATAGATTCCTTTTCCCAAATGAAAGTTTAACGTACACTTTGTTTCTAAAGGAAGGTCAAATTACAGTCTACAGCATCGTAATGGTTCATTTTCATTTATATTTTAATACTAGAAAAGTCCTAGGTTGAAGATAACCACATAATAAGCTGCAACTTCAGCTGTCCCAACCTGAAGAAGAATCGCAGGAGTCGAAATAACTTCTGTAAAGCAAGTAGTTTGAACCTATTGATGTTTCAACATGAGCAATACGTAACT + //$$&%&%#$)*59;/767C378411,***,('11<;:,0039/0&()&'2(/*((4.1.09751).601+'#&&&,-**/0-+3558,/)+&)'&&%&$$'%'%'&*/5978<9;**'3*'&&A?99:;:97:278?=9B?CLJHGG=9<@AC@@=>?=>D>=3<>=>3362$%/((+/%&+//.-,%-4:+..000,&$#%$$%+*)&*0%.//*?<<;>DE>.8942&&//074&$033)*&&&%**)%)962133-%'&*99><<=1144??6.027639.011/-)($#$(/422*4;:=122>?@6964:.5'8:52)*675=:4@;323&&##'.-57*4597)+0&:7<7-550REGB21/0+*79/&/6538())+)+23665+(''$$$'-2(&&*-.-#$&%%$$,-)&$$#$'&,);;, and includes quality +values for a sequence. + +This package provides a parser and writer for working with Fastq formatted +sequencing data. +*/ +package fastq + +import ( + "bufio" + "bytes" + "compress/gzip" + "errors" + "fmt" + "io" + "math" + "os" + "strings" + "unsafe" +) + +/****************************************************************************** +March 22, 2023 + +Fastq Parser begins here + +I basically stole everything from the fasta parser, and added a few bits +for parsing out additional data from fastq nanopore files. Mwhahaha, stealing +code! + +Keoni + +******************************************************************************/ + +var ( + gzipReaderFn = gzip.NewReader + openFn = os.Open + buildFn = Build +) + +// Fastq is a struct representing a single Fastq file element with an Identifier, its corresponding sequence, its quality score, and any optional pieces of data. +type Fastq struct { + Identifier string `json:"identifier"` + Optionals map[string]string `json:"optionals"` // Nanopore, for example, carries along data like: read=13956 ch=53 start_time=2020-11-11T01:49:01Z + Sequence string `json:"sequence"` + Quality string `json:"quality"` +} + +// Parse parses a given Fastq file into an array of Fastq structs. Internally, it uses ParseFastqConcurrent. +func Parse(r io.Reader) ([]Fastq, error) { + // 32kB is a magic number often used by the Go stdlib for parsing. We multiply it by two. + const maxLineSize = 2 * 32 * 1024 + parser := NewParser(r, maxLineSize) + return parser.ParseAll() +} + +// Parser is a flexible parser that provides ample +// control over reading fastq-formatted sequences. +// It is initialized with NewParser. +type Parser struct { + // reader keeps state of current reader. + reader bufio.Reader + line uint +} + +// NewParser returns a Parser that uses r as the source +// from which to parse fastq formatted sequences. +func NewParser(r io.Reader, maxLineSize int) *Parser { + return &Parser{ + reader: *bufio.NewReaderSize(r, maxLineSize), + } +} + +// ParseAll parses all sequences in underlying reader only returning non-EOF errors. +// It returns all valid fastq sequences up to error if encountered. +func (parser *Parser) ParseAll() ([]Fastq, error) { + return parser.ParseN(math.MaxInt) +} + +// ParseN parses up to maxSequences fastq sequences from the Parser's underlying reader. +// ParseN does not return EOF if encountered. +// If an non-EOF error is encountered it returns it and all correctly parsed sequences up to then. +func (parser *Parser) ParseN(maxSequences int) (fastqs []Fastq, err error) { + for counter := 0; counter < maxSequences; counter++ { + fastq, _, err := parser.ParseNext() + if err != nil { + if errors.Is(err, io.EOF) { + err = nil // EOF not treated as parsing error. + } + return fastqs, err + } + fastqs = append(fastqs, fastq) + } + return fastqs, nil +} + +// ParseNext reads next fastq genome in underlying reader and returns the result +// and the amount of bytes read during the call. +// ParseNext only returns an error if it: +// - Attempts to read and fails to find a valid fastq sequence. +// - Returns reader's EOF if called after reader has been exhausted. +// - If a EOF is encountered immediately after a sequence with no newline ending. +// In this case the Fastq up to that point is returned with an EOF error. +// +// It is worth noting the amount of bytes read are always right up to before +// the next fastq starts which means this function can effectively be used +// to index where fastqs start in a file or string. +// +// ParseNext is simplified for fastq files from fasta files. Unlike fasta +// files, fastq always have 4 lines following each other - not variable with +// a line limit of 80 like fasta files have. So instead of a for loop, you +// can just parse 4 lines at once. +func (parser *Parser) ParseNext() (Fastq, int64, error) { + if _, err := parser.reader.Peek(1); err != nil { + // Early return on error. Probably will be EOF. + return Fastq{}, 0, err + } + + // More general case of error handling. + handleErr := func(err error) error { + isEOF := errors.Is(err, io.EOF) + if errors.Is(err, bufio.ErrBufferFull) { + // Buffer size too small to read fastq line. + return fmt.Errorf("line %d too large for buffer, use larger maxLineSize: %w", parser.line+1, err) + } else if isEOF { + return fmt.Errorf("line %d failed: unexepcted EOF encountered", parser.line+1) + } + return err + } + + // Initialization of parser state variables. + var ( + // Parser looks for a line starting with '@' + // that contains the next fastq sequence identifier. + lookingForIdentifier = true + seqIdentifier, quality string + optionals map[string]string + sequence, line []byte + err error + totalRead int64 + ) + + // parse identifier + line, err = parser.reader.ReadSlice('\n') + totalRead += int64(len(line)) + parser.line++ + if handleErr(err) != nil { + return Fastq{}, totalRead, handleErr(err) + } + + line = line[:len(line)-1] // Exclude newline delimiter. + if string(line)[0] == '@' { + lookingForIdentifier = false + } + lineSplits := strings.Split(string(line), " ") + seqIdentifier = lineSplits[0][1:] + optionals = make(map[string]string) + for _, optionalDatum := range lineSplits[1:] { + optionalSplits := strings.Split(optionalDatum, "=") + optionalKey := optionalSplits[0] + optionalValue := optionalSplits[1] + optionals[optionalKey] = optionalValue + } + + // parse sequence + line, err = parser.reader.ReadSlice('\n') + totalRead += int64(len(line)) + parser.line++ + if handleErr(err) != nil { + return Fastq{}, totalRead, handleErr(err) + } + if len(line) <= 1 { // newline delimiter - actually checking for empty line + return Fastq{}, totalRead, fmt.Errorf("empty fastq sequence for %q, got to line %d: %w", seqIdentifier, parser.line, err) + } + sequence = line[:len(line)-1] // Exclude newline delimiter. + + // skip + + _, err = parser.reader.ReadSlice('\n') + totalRead += int64(len(line)) + parser.line++ + if handleErr(err) != nil { + return Fastq{}, totalRead, handleErr(err) + } + + // parse quality + line, err = parser.reader.ReadSlice('\n') + totalRead += int64(len(line)) + parser.line++ + if handleErr(err) != nil { + return Fastq{}, totalRead, handleErr(err) + } + if len(line) <= 1 { // newline delimiter - actually checking for empty line + return Fastq{}, totalRead, fmt.Errorf("empty quality sequence for %q, got to line %d: %w", seqIdentifier, parser.line, err) + } + quality = string(line[:len(line)-1]) + + // Parsing ended. Check for inconsistencies. + if lookingForIdentifier { + return Fastq{}, totalRead, fmt.Errorf("did not find fastq start '@', got to line %d: %w", parser.line, err) + } + fastq := Fastq{ + Identifier: seqIdentifier, + Optionals: optionals, + Quality: quality, + Sequence: *(*string)(unsafe.Pointer(&sequence)), // Stdlib strings.Builder.String() does this so it *should* be safe. + } + // Gotten to this point err is non-nil only in EOF case. + // We report this error to note the fastq may be incomplete/corrupt + // like in the case of using an io.LimitReader wrapping the underlying reader. + return fastq, totalRead, err +} + +// Reset discards all data in buffer and resets state. +func (parser *Parser) Reset(r io.Reader) { + parser.reader.Reset(r) + parser.line = 0 +} + +/****************************************************************************** + +Start of Read functions + +******************************************************************************/ + +// ReadGz reads a gzipped file into an array of Fastq structs. +func ReadGz(path string) ([]Fastq, error) { + file, err := openFn(path) + if err != nil { + return nil, err + } + defer file.Close() + reader, err := gzipReaderFn(file) + if err != nil { + return nil, err + } + defer reader.Close() + return Parse(reader) +} + +// Read reads a file into an array of Fastq structs +func Read(path string) ([]Fastq, error) { + file, err := openFn(path) + if err != nil { + return nil, err + } + defer file.Close() + return Parse(file) +} + +/****************************************************************************** + +Start of Write functions + +******************************************************************************/ + +// Build converts a Fastqs array into a byte array to be written to a file. +func Build(fastqs []Fastq) ([]byte, error) { + var fastqString bytes.Buffer + for _, fastq := range fastqs { + fastqString.WriteString("@") + fastqString.WriteString(fastq.Identifier) + fastqString.WriteString("\n") + + // fastq doesn't limit at 80 characters, since it is + // mainly reading big ole' sequencing files without + // human input. + fastqString.WriteString(fastq.Sequence) + fastqString.WriteString("\n+\n") + fastqString.WriteString(fastq.Quality) + fastqString.WriteString("\n") + } + return fastqString.Bytes(), nil +} + +// Write writes a fastq array to a file. +func Write(fastqs []Fastq, path string) error { + fastqBytes, _ := buildFn(fastqs) // fastq.Build returns only nil errors. + return os.WriteFile(path, fastqBytes, 0644) +} diff --git a/io/fastq/fastq_test.go b/io/fastq/fastq_test.go new file mode 100644 index 00000000..cc0f4350 --- /dev/null +++ b/io/fastq/fastq_test.go @@ -0,0 +1,66 @@ +package fastq + +import ( + "os" + "testing" +) + +func TestParseNLow(t *testing.T) { + file, err := os.Open("data/nanosavseq.fastq") + if err != nil { + t.Errorf("Failed to read nanosavseq.fastq. Got error: %s", err) + } + const maxLineSize = 2 * 32 * 1024 + parser := NewParser(file, maxLineSize) + _, err = parser.ParseN(0) + if err != nil { + t.Errorf("Failed to parse 0 fastqs. Got error: %s", err) + } +} + +func TestParseSmallLine(t *testing.T) { + file, _ := os.Open("data/nanosavseq.fastq") + parser := NewParser(file, 10) + _, err := parser.ParseAll() + if err == nil { + t.Errorf("Should have encountered a maxLine error") + } + parser.Reset(file) +} + +func TestRead(t *testing.T) { + _, err := Read("data/doesntexist.fastq") + if err == nil { + t.Errorf("Should have failed to read non-existent file") + } + _, err = ReadGz("data/doesntexist.fastq.gz") + if err == nil { + t.Errorf("Should have failed to read non-existent gz file") + } + _, err = ReadGz("data/nanosavseq.fastq") + if err == nil { + t.Errorf("Should have failed to read a file that is not gz'ed") + } +} + +func testException(t *testing.T, filePath string, errorString string) { + file, err := os.Open(filePath) + if err != nil { + t.Errorf("Failed to read %s. Got error: %s", filePath, err) + } + const maxLineSize = 2 * 32 * 1024 + parser := NewParser(file, maxLineSize) + _, err = parser.ParseAll() + if err == nil { + t.Errorf("%s parser should have gotten error: %s", filePath, errorString) + } +} + +func TestParseExceptions(t *testing.T) { + testException(t, "data/nanosavseq_noseq.fastq", "no seq") + testException(t, "data/nanosavseq_noquality.fastq", "no quality") + testException(t, "data/nanosavseq_noidentifier.fastq", "no identifier") + testException(t, "data/nanosavseq_emptyseq.fastq", "empty seq") + testException(t, "data/nanosavseq_noplus.fastq", "no plus EOF") + testException(t, "data/nanosavseq_noquality2.fastq", "no quality EOF") +}