diff --git a/ALAT_GGAT.txt b/ALAT_GGAT.txt
new file mode 100644
index 0000000..e5e019b
--- /dev/null
+++ b/ALAT_GGAT.txt
@@ -0,0 +1,42 @@
+>XP_002289904.1|alanine_aminotransferase|Thalassiosira_pseudonana_CCMP1335
+MQYAVRGEVVIRADAMAAEGRKIIYTNIGNPHAVGQKPITYYRQVLSLCDLPAECGVDNTQVAAAFPSDV
+IERAIEMRDAIGPAGTGAYTNSQGIGKFRDDVAHFITARDEHVALPSNIFLSNGASAAIENVLTGLIGSN
+RDAIMIPIPQYPIYSAIISRLGARQVGYFLERRTAAVERDGLDIRALTLINPGNPTGQVLGREDLEIICT
+FCAKHNIVLLADEVYQRNIYDDKKEFVSAKKVAVETPGCENLQLISFHSTSKGLIGECGRRGGYMELHNI
+DPYVQTQLYKLASSGLCSGVDGQMMTSLMVRPPLPGEESHELFSRQEFEIFSSLKRRAVSLVRGLNDIDG
+MTCTPAEGAMYAFPRVELPPKALDAAAINDQTPDNLYALSLLEETGICVVPASGFGQKEGRIGFRTTFLP
+PEDELNQAVVEFKRHHEWFCEKYA
+>OEU21541.1|alanine_aminotransferase|Fragilariopsis_cylindrus_CCMP1102
+MEYAVRGTVVIAADRINDELKAEQSMGAESKYKFQKIIYTNIGNPQSVGQQPLTWPRQVLALIDLPDEEG
+INHPNIQNIFPSDAIARARTIKIGLGGNGSGAYSHSKGIKMFREDVCTFLQNRDGIDVPTDVENIFLSNG
+ASAAIFNLLTSLIADNKCGIMIPIPQYPIYSASVEQLGGQKVGYYLDEKNKWNLSIDELERSLKEALENG
+TNVVAFVLINPGNPTGAVLTKQTVQDVVKFCSKHNLVLLADEVYQENVYNEQDKFYSCKRAAYDCGLLET
+NSIELASFHSTSKGVFGECGRRGGYMELTGFDENIKNQLYKLASASLCSTVNGQCMTSLMCRGPSPDDVS
+YESHEKEKLDIFNSLKKRSKIVNDGLNSIDGFSCQPAQGAMYCFPSIDNMPMKAINEAAEQNITPDTLYA
+LSLLERTGICVVPASGFGQRPGRYGFRTTFLPSEDDMAYSVNAMKDHHKEFCQKYA
+>NP_005300.1|alanine_aminotransferase1|Homo_sapiens
+MASSTGDRSQAVRHGLRAKVLTLDGMNPRVRRVEYAVRGPIVQRALELEQELRQGVKKPFTEVIRANIGD
+AQAMGQRPITFLRQVLALCVNPDLLSSPNFPDDAKKRAERILQACGGHSLGAYSVSSGIQLIREDVARYI
+ERRDGGIPADPNNVFLSTGASDAIVTVLKLLVAGEGHTRTGVLIPIPQYPLYSATLAELGAVQVDYYLDE
+ERAWALDVAELHRALGQARDHCRPRALCVINPGNPTGQVQTRECIEAVIRFAFEERLFLLADEVYQDNVY
+AAGSQFHSFKKVLMEMGPPYAGQQELASFHSTSKGYMGECGFRGGYVEVVNMDAAVQQQMLKLMSVRLCP
+PVPGQALLDLVVSPPAPTDPSFAQFQAEKQAVLAELAAKAKLTEQVFNEAPGISCNPVQGAMYSFPRVQL
+PPRAVERAQELGLAPDMFFCLRLLEETGICVVPGSGFGQREGTYHFRMTILPPLEKLRLLLEKLSRFHAK
+FTLEYS
+>AAC62456.1|alanine_aminotransferase|Zea_mays
+MAASVTVENLNPKVLKCEYAVRGEIVIHAQRRQQQLQTQPGSLPFDEILYCNIGNPQSLGQQPVTFFREV
+LALCDHPCLLEKEETKSLFSADAISRAKQILATIPGRATGAYSHSQGIKGLRDAIAAGIMSRDGFPANAD
+DIFITDGASPGVHMMMQLLIRNEKDGILCPIPQYPLYSASIALHGGTLVPYYLNEKNGWGLEISDFKTRL
+EDVRSKGIDVRALVVINPGNPTGQVLAEDNQYDIVKFCKNEGLVLLADEVYQENIYVDNKKFNSFKKIVR
+SMGYGEDDLPLVSLQSVSKGYYGECGKRGGYMEITGFSAPVREQIYKIASVNLCSNITGQILASLVMNPP
+KAGDESYASYKAEKDGILESLARRAKALEDAFNKLEGFSCNKAEGAMYLFPQIHLPQKAIEAAKAAKKAP
+DAFYALRLLESTGIVVVPGSGFGQVPGTWHIRCTILPQEDKIPAVISRFRAFHEAFLAEYRD
+>XP_009315638.1|alanine_aminotransferase|Trypanosoma_grayi
+MSTSRKAIHINPRVVEAQYAVRGLIPMRADEIKNALATPEGKGKYPFSSLVYCNIGNPQALEQKPLTFNR
+QVMSLVDAPFLLDNAAIKAQYPADAVARAQEYLSHIGNRTGAYTDSAGYAFVREIVARHINERDHGAKPL
+MDASSIMLTDGASTGVRLLLQILIGDASDGVMIPIPQYPLYTAQIALLGGTPAMYYLDENKGWALNVADL
+ASAYDECVAQRKATPRVLVVINPGNPTGGVLERGVMEAVAKFCCDRGMVLMADEVYQENIYAEGKRFVSF
+REVVLGLPAPYNTDTVLASLHSTSKGIIGECGRRGGYFSLTNAPAALTEQVVKMSSINLCSNVNGQLMTA
+LMCAPPRAGDASYDAYWAEYNAIFGSLKKRALMLAKELNSIRGFACQPVEGAMYAFPTIQLPEKYAQHNA
+ELNAREGRKLAPDARWALELLESSGIVVVPGSGFGQQPNTLHFRTTILPPEAQMERMVKALRGFQEDVWA
+KYA
\ No newline at end of file
diff --git a/Bestrophin.txt b/Bestrophin.txt
new file mode 100644
index 0000000..1726f65
--- /dev/null
+++ b/Bestrophin.txt
@@ -0,0 +1,54 @@
+>gi|224000585|ref|XP_002289965.1|T.pse|bestrophin1_TPS
+MGPPIDPSVPVTDQVGEGSRKYRRTVYTHDDWVRHRSPDRFGNNLSTLFNSGIYKQVANEVFATTAVATF
+VFLWNMIAGGYTDLAGVQHGPIIDSPLAQMVGLPMTAFTILTPSLGLLLVFRTNTSYGRWDEARKMWGLN
+INHTRDLNRMATAWYGNEGNMDSVAFMGGDIPYSQPIDPVQRAYDLGQVSLFTWAFVRSMKRHLSPPEED
+EEDFKAELRARLTPEQAENIINAAHRPNRALFDLSVAIENLPMHFLRKNAINTNLSIFEDTLGGCERLLS
+SPVPLFYSRHTARFLSTWLLLLPFGLYEQFKDSWNHIAMIPATAFISVCLFGIEELATQLEEPFTILPMQ
+GFCDKIGGWCDEIVSWAGQGQQEYTEENAMSNEQEMTYWR
+>gi|223999673|ref|XP_002289509.1|T.pse|bestrophin2_TPS
+MPSFTSLSTLLLLALSSPQISAFAPLSSTSTPINVAPSTTTSTTNLQMGPPKTDIVLSETYGEGSRKYRR
+TVYTHNEWVKHRSSDRFAKNLFSMVNSGVYKSLAKEVFATTAVASAIVAWNGIAGGYTDFNGVEHGAIMS
+FLPQLVLPLTPFTLLSPSLGLLLVFRTNSSYGRWDEARKMWGLNINHTRDLNRMATAWYGHDNQIIDPAK
+RAEDLRQVSLYTWAFVRSMKRHLSPPSEDEEAFVEELYARMAPEQAEAIISAAHRPNRALYDLSVVIDKL
+PMHFMRKNEINKNLSIFEDTLGGCERLLSSPVPLFYTRHTARFLSTWLLLLPLAMYQPFSGSWNHVAMIP
+ATALTSVFLFGIDELSTQLEEPFTILPMQGFCDKIGGWCDEIVSWRGQGLDKEEQQYY
+>gi|WP_077172616.1|bestrophin|Pseudomonas_psychrotolerans
+MITRPQNPSLRELLFTVRGSIVQAIWPKLLYVVLLSLAVTLSHDVFLRFDFGLTTTPLTLWGLTLAIFLG
+FRNTTAYQRFWEARGLWGELLIAGRNLARQVETLVPGLTAPERRQLLTPLLAFGYALRDHLRREAPSADL
+QRVLVGEDALLAAPHRPSALIRRLGTRLVARAREEGLGDPLIANLDHQLDRLTAVLSGCERIRQTPIPYP
+YILMLHRVVHVYCFLLPFCLVDSLGWFTPLAVLVLAYTFFGLDALGDQIADPFGTQPNHLPLDALSRGLE
+IAVLDLLGEPTPEPIRAEAGLLR
+>gi|NP_004174.1|bestrophin-1_isoform1|Homo_sapiens
+MTITYTSQVANARLGSFSRLLLCWRGSIYKLLYGEFLIFLLCYYIIRFIYRLALTEEQQLMFEKLTLYCD
+SYIQLIPISFVLGFYVTLVVTRWWNQYENLPWPDRLMSLVSGFVEGKDEQGRLLRRTLIRYANLGNVLIL
+RSVSTAVYKRFPSAQHLVQAGFMTPAEHKQLEKLSLPHNMFWVPWVWFANLSMKAWLGGRIRDPILLQSL
+LNEMNTLRTQCGHLYAYDWISIPLVYTQVVTVAVYSFFLTCLVGRQFLNPAKAYPGHELDLVVPVFTFLQ
+FFFYVGWLKVAEQLINPFGEDDDDFETNWIVDRNLQVSLLAVDEMHQDLPRMEPDMYWNKPEPQPPYTAA
+SAQFRRASFMGSTFNISLNKEEMEFQPNQEDEEDAHAGIIGRFLGLQSHDHHPPRANSRTKLLWPKRESL
+LHEGLPKNHKAAKQNVRGQEDNKAWKLKAVDAFKSAPLYQRPGYYSAPQTPLSPTPMFFPLEPSAPSKLH
+SVTGIDTKDKSLKTVSSGAKKSFELLSESDGALMEHPEVSQVRRKTVEFNLTDMPEIPENHLKEPLEQSP
+TNIHTTLKDHMDPYWALENRDEAHS
+>gi|AAR99655.1|bestrophin2|Homo_sapiens
+MTVTYTARVANARFGGFSQLLLLWRGSIYKLLWRELLCFLGFYMALSAAYRFVLTEGQKRYFEKLVIYCD
+QYASLIPVSFVLGFYVTLVVNRWWSQYLCMPLPDALMCVVAGTVHGRDDRGRLYRRTLMRYAGLSAVLIL
+RSVSTAVFKRFPTIDHVVEAGFMTREERKKFENLNSSYNKYWVPCVWFSNLAAQARREGRIRDNSALKLL
+LEELNVFRGKCGMLFHYDWISVPLVYTQVVTIALYSYFLACLIGRQFLDPAQGYKDHDLDLCVPIFTLLQ
+FFFYAGWLKVAEQLINPFGEDDDDFETNFLIDRNFQVSMLAVDEMYDDLAVLEKDLYWDAAEARAPYTAA
+TVFQLRQPSFQGSTFDITLAKEDMQFQRLDGLDGPMGGAPGDFLQRLLPAGAGMVAGGPLGRRLSFLLRK
+NSCVSEASTGASCSCAVVPEGAAPECSCGDPLLDPGLPEPEAPPPAGPEPLTLIPGPVEPFSIVTMPGPR
+GPAPPWLPSPIGEEEENLA
+>gi|NP_988974.1|bestrophin-2|Xenopus_tropicalis
+MTVTYTARVANARFGGFYKLLLLWRGSIYKLLYKEFLAFFLMYLALSIIYRFFLNEEQKLYFDKVAIYCN
+NYANLIPVSFVLGFYVNLVVNRWWNQYLSLPFPDRVMCAISGTVHGSDETGRLYRRTLMRYCSLSGLLIL
+RSVSTAAFKRFPTIDHVVEAGFMTRLERKKFENLQSSYNKYWVPCVWFCNLASQARSEGRIRDDHSFKML
+MEELNTFRGNCGMLFHYDWISVPLVYTQVVTIAVYSFFLTCLIGRQFLDPARGYPGHELDLYVPVFTLLQ
+FFFYAGWLKVAEQLINPFGEDDDDFEINFLIDRNFQVSMLAVDEMYSDVPPMEKDRYWNHSDPRPPYTAA
+TLFQKHMPSFQGSTFNMAIPKEDMQFQPLSDIEEMNEDTLTHPPPLLSRFLPGVGPSPLSSSAALASHFA
+APGSRLTLLRRSTSSFSSSSEFQCQEPVQDPPYSLVDSLGPGLNVQEGHTEELCNMGSQASLFLPPKTMD
+GGENVQPVEEGEDAASLVAT
+>gi|WP_068888990.1|bestrophin|Acinetobacter_celticus
+MIVRDQPNIFKVLFSWRGTILPKILPPLGVVMLISAIIGVLSYIGYFKFPELPFVGFTVIGVVLSIFLGF
+KNSACYERWWDARKLWGILIANSRHFDRDCRMLSQGRRERVIQHVIVFANVLRDRLRHQTANPTELVKTS
+GMSQQALTQLYQQANAPQYTLSLIQWELMQALKDGEISDIIYTQMNDHVMDLSMVQTGCDRIATTPLPFA
+YSVLLNRTVYFFCLILPFSLGSTLGIFTPLLVGVLAYTFLGLDALSSELEEPFGTQSNDLPLDSMVRTIE
+IELLGTLGKPTPPPIQAQDNNLL
\ No newline at end of file
diff --git a/CA_alpha.txt b/CA_alpha.txt
new file mode 100644
index 0000000..c7c5b49
--- /dev/null
+++ b/CA_alpha.txt
@@ -0,0 +1,76 @@
+>jgi|Emihu1|456048|estExtDG_Genemark1.C_1660056|CA2_alpha_EHUX
+MSPNKHSWRYARPGPNHVADEWVQRETWGASFPTCINGIEQSPINIVTGEAIPMKSLPEISTDIDAAPHY
+VSNTGSGFQLFETTPTESMIANGTFIDTIEGSSKGESWVGGQKFLFYQMHWHTPSENTIDGRSFPLEAHF
+VHQLDDPMLVGTLHRLAVISLLYEPGPCNAFLDQFWEEFPMVPGFRQHFADGVNDFERLADEVINIDEGE
+GYFYWHGSLTTPPCTEGVGWYMLKHRETVSDRQIDALRYALAVS
+>XP_005764209.1 carbonic anhydrase [Emiliania huxleyi CCMP1516]
+MGCTQSKHDASEDASNGTMLQAVLGHLGQLDGDAKLDHTTMSIVYDIFKDMDKDSDGTVDKSEFEKFLST
+HPAAKTLWEGEGKASMSRSLKDAIADERLSFYELVAAFAPEAPHHAGDASGIGGLESLISDAAWGYRGYN
+GPENWALLSPKNKLAATGKEQCPVDILPSTCVPCPAVDGDASLAYGVGPGTILNNGHTIQVNWKGGSMSV
+GGTTFEAAQFHFHTASETTIRGMQYPLEMHVVHVTPGANERVSEPMRIAVLAVLFETRTDVEEVFLSQFF
+DQLPSHVAHDQDDAETLTRPVDLSSISLDGGYYRLRGSLTTPPCTEGLEWSVLASPLPILPAQLETFRKA
+LGKTVRNFRPTQPLNGRSITWVCACQA
+>jgi|Thaps3|22391|estExt_fgenesh1_pg.C_chr_40655|CA1_alpha_TPS
+MILLQPMTKRMSTSSHLVILVVLLRLQSSNSRSWLDCINTSKLENDGMPRVGKRHNATTSSLSSDAAITI
+AQMSGNIGTSTTSEDTEIVIHGATTTLFEEVDPFRVTDSPSTVPSYSSSPPTLSPSASPTITPLPTTEKP
+TRLPTLPPTFQTGKNEPLNPKPGYFNYDMNSDYGPHRWKRVDVEDDFFHTFDLKAEDTNNCGSGDHQSPI
+DVCTKPRGNCKETHEMRPKSGDYKMDGELITKQILPSKLRLVMAPRTGDEPDPPQVDFSSNGRGIIDMTN
+IDFKFPSEHTVCGSKFDGEMQYYMYHPGRERFVAVSFFLEASPTNPTNEHLQEVIDAFRTVFIKDKSLCA
+EKQRLENYAQGFVSPANRKLHGEENKTLDSIEDDGELWNTTTIESNEDREYQRRLALKWHPFHPDIQKTI
+HFWGYHGSFTEPPCTDDIVDWKIMDVPTPISTKQLAQLKQLLFNHVDKNCERTSVHNSDGSVARPTQETS
+KYYKCTRDDYVSDEERGVCGDLGCINPFGEGLNPYYPPIVDVTGPPTRAPST
+>jgi|Thaps3|262006|thaps1_ua_kg.chr_4000016|CA2_alpha_TPS
+SEHRLCGKQYDAEMQLFHLHNEGNLEALAILIDADDGTSENPHFQKLLDFFQKKFNADKSMSRDWVWDPL
+EPGYILRSIHFWAYSGSTTEPPCFEGVNWRIIDVPMKISPGQYQQLQRLMFDHSNARPVQP
+>jgi|Thaps3|22257|estExt_fgenesh1_pg.C_chr_40398|CA11_alpha_TPS
+MTRVSNIDSMMDGFGKLSRRAKILYLSSLAVSLAMVVFGACVLTLDYTTRTTSKVENSIGGVVNADDSDE
+AKIQIETQTPTLSPSSSPIYTEKLSLVSSPAPSTSNLRATSAPTNSPVDIGTLQPVTRKPVQPKPTPRPA
+SPKPSSPPSTRYPSISPSQHPTNSPSLSPVTPSPPPTLTQSILPSITNMPSLESLFQSHEVPKDPKPTYF
+NYNGNSDYGPRSWENVTLLNSTENYWHEFGFNDNQCGVGAQSPIDVCTTPMRHCQEHHEFRSKLRVLMHR
+REGDEPDPPHVDFAGVGAKSLDLLNIDIKIPSEHTVCGRRYDGEMQYYFYHPVKGSLIVIAWLFDAQNEF
+ASNEHLQLVIDEFQALYDDTEGACLVNMTLNETGVTAPPHQRLSSRSDRELEKENHGCSGSNLNGPAPSN
+AEYPIQQP
+>jgi|Phatr2|35370|fgenesh1_pg.C_chr_7000291|CA1_PTRI
+MRLIAISLCCLMPCTVRCRSWRNIEPLHGWNENDTSGTIWRMEFNPLFTSAPTSMPTTATPSDIPSSRPS
+SFPSAPPSASPSVAPSPSPSTAPSESDPYRPNDPPKNPEQWYFNYDTSANALYGPGHAGIIQQQNNQFNV
+GYKNNRWGSVGNPPNNYWTEFMDNGFGPWRGILANRNPTRNMCDRVGMQSPIDLRPSGAVCDEHHEVRSR
+RGDFQIFEDEVTKEIQPNKLRLRYKRRPCRNLNELACQEPDPPNADFPNNWGGYADVTHIDFKVPGEHLI
+RGEKFDGEMQIFHIHRGRRRMVVQSVTIRATSTGFNSYFQEAIDVFRAVYDINIARCSALRRKERRLVSN
+AHIILGKNMTSKFHDYSSWGDFSTGLEDVELESKRSLRKSNWDPYHELLIPSIHFYRYDGSLTEPPCGEF
+VSWFVSDTPMRISLSQLEEVKTILFKNVDENCQPTSVQFGHSVARPIQETAGRPVWQCTPREFGPDP*
+>jgi|Phatr2|44526|estExt_fgenesh1_pg.C_chr_40337|CA2_PTRI
+MVGLPSVLLCTLIAFTTAQTGRDLDRFNYRGTDGTDYGPEDWDQVSCTDTETCLGWPDGFETARGWDLGE
+NHCRWCPLGTRQCGIHHQSPIDLQRNRAVPGDPEEKECIDVHWMAYYDSTCDWENLKALNAFSIERHALK
+VNQPIEQLASGDYRLACRNASGRRFGRIDFSKGFSEWWLMSHMDIHVPSEHTQEGKRYDGEIHLYHFYSI
+PGSQSSTNNEMASVTIFLEAYDDVPDYPMLNRLICQWRQVEDKTREECGLPSVETEYPGCFYYQRGHTID
+GFNTIALTQDGTQRNLRQKSRNLRPKSMSVHDLILYNYAQSQTNSSYTPKRLLHSEEDHAEADPNFDWEK
+FVTRQDGNANITQGNRQLLNYDHVGPWFNYFPMLGVRTEYYYRYSGTQTVPPCYGRFFEGNNRRQTNHWR
+VLKDPIRVTQRQVDEMHRLLKERIASVDDPLASCEPDTAAKVDENDPTKISVARPVMETRSTHYKVFCEC
+EDWRSKFPEDVEWCKKGLQDRLFNHPYNFETDGF*
+>jgi|Phatr2|55029|estExt_Phatr1_ua_kg.C_chr_210026|CA3_PTRI
+MSLSGIVCSRAKWFLLSIALPTLGLGLNKTAFSYNKKDEYSPDNWYRLDIAGNVCRGPRNSPIALESTPC
+DAYEGYGLYSGTCTLNDLDFQLTELGVKIKYPKDGSCDINTLTVPGVSGNFRLLEVTIHGGSEHSIDGNF
+SGAEIQLVHEKINSQEGHLAVLAILVEPEGPKDNLFFGTLLDEWRAVRADSTASCAKAGYDVPTLYWLAS
+GTPVNTRHSYVRSYFTSPRFNAYSLLPTNTSFYRYYGGLTTPPCSEIVWWSVADTVMRISTGQYAELMTM
+ITTGYVNVTDEAGCEPWSVASPSGSTSRPLQARNGRPVDRICPV*
+>jgi|Phatr2|54251|estExt_Phatr1_ua_kg.C_chr_40037|CA7_PTRI
+MRSFLLWSLVASFATAQEGSNLDRFNYRGTEGTDYGPEDWDQVSCSDTENCLGWPDAFEASRGWSLKDNF
+CRWCPAGSSSCGTHHQSPIDLQRNRAVPGDPDENECIDVHWMAYYDSTCTWDTLKELNAFSVERHALKVV
+QPITETTSGEWEIACRDDSGKRFGRIDFSKGFSQWWFLSHMDFHVPSEHTQEGKRYDGELHMYHFYSVTG
+AEAGIDNEMASVAFFLEAYDDIPDYPMLNRLICQWREAEEKTREECGLPSILTEYPGCFFYNRGHTDSAV
+TTQSISNGQRKLRTTSRNLRPKVKSVHDIILQNHEQMQSNATFKPHKLILSEDDHAEADPDFDWGAFVAE
+QVAKSTSSQEHRELMNYDHVGPWFNYFPLVDVRTEYYYRYSGSQTVPPCYGRHIGGSRKQTNHWRFMKDP
+LRVTQRQIDEMHRLLKERIAPLDDPLASCQPDTAAKVNEDDPTKISVARPLMETRDTHYKVFCECIDWPS
+KWPEDRAWCEQGFMDRLYTHPYNFQTDGF*
+>gbi|AQL05019.1|Alpha_carbonic_anhydrase_7|Zea_mays
+MHALVRPWDTLPVLLLSRLCMVLLDALRAGWLGSVDEDEEDFSYRRNAGNGPARWGLIRREWATCNVGLL
+QSPIGLSDTLAGLADRSGRLGRSYRPAAASLVNRGHSIMVRFNSNPGGVVIDGVAYRLRQMHWHAPSEHA
+INGRRYALELQMVHQSDTNRYAVVSQLYRISRRRPDRTIHRLERYIRRIARRKNHEELIDEEVDPRRPGT
+RSNRRPLQEANGRAITFYYTSPAHGRGANGD
+>gi|OMO73707.1|Alpha_carbonic_anhydrase|Corchorus_olitorius
+MKHQSKPIFVSAFLIIFAVLFLSHSASVSAQEVEDEREFDYLEKSGKGPKHWGDLKQEWAACKNGDLQSP
+IDMSSLRVKVIKKSGEMKKRYKPCHAVVKNRGHDISLQWLDNDAGSIKINGTEYFLQQAHWHSPSEHTIN
+GRRYALELHMVHQSKDPNLKNNLAVVGLLYKFGAPDSFISKLISNITSMNDHVQERYMGVIDPSAIKMGG
+KKYYRYMGSLTVPPCTEGVIWTMNKKVRTVSRDQVRALRIAVHDYAEANARPVQPLNRREVELYGPNPGD
+VSN
+
diff --git a/CA_beta.txt b/CA_beta.txt
new file mode 100644
index 0000000..bad26ed
--- /dev/null
+++ b/CA_beta.txt
@@ -0,0 +1,24 @@
+>jgi|Phatr2|45443|estExt_fgenesh1_pg.C_chr_70069|CA5_beta
+MKFATAATVTLLALSTVDALNVKKLFRFGKTSLPKDSSPKPAAKGGYDLDVSELFDGNNKFIADKLAGDP
+AYFDTLGTVHSPKYLYIGCVDARAPPNMIMGTEAGTMLTVRNIANMVVNNDLAVMSAIQFGINVLKIPNV
+ILCGHYECGGVRASVANVDHAPPLSIWLRNIRDVYRLHAKELDAIKDPEERHRRLVDLNVIEQCVNLFKT
+GVIQAKRIESYKDGGVAIPQVHPVVFDPKTGEVKKLKVDFDKYMAEINGIYDLYDLENAKVPM
+>jgi|Phatr2|51305|estExt_fgenesh1_kg.C_chr_10001|CA4_beta
+MKFLSASIALLACATSVEAFNANKAFRFGAKAMPEVSSESATSALSAGGAEKKSYDLDITEIFDGNKKFI
+ETKKAQDAAYFDTLGTVHSPKYLYIGCVDARAPPNMIMGTEAGTMLTVRNIANMVVNNDLAVMSAIQFGI
+NVLKIPHVIVCGHYECGGVRASVANVDHAPPLSIWLRNIRDVYRLHARELDAIKDPEDRHRRLVDLNVIE
+QCVNLYKTGVIQAKRIESYQEGAPAAIPRVHPIVFDPKTGAIRKLQVDFDKYMSELDAIYDLYELENAKI
+PA*
+>gi|ONM39907.1|BetaCA_4|Zea_mays
+MAVERLKTGFEQFKADVYDKKPELFEPLKAHQSPKYMVFACSDSRVCPSVTLGLHPGEAFAVRNIASMVP
+PYDKTKYAGVGSAIEYAVCALKVEVIVVIGHSRCGGIKALLSLEDGAPDKFHFVEEWVRVGAPAKSKVLA
+DHASAPFEDQCSILEKEAVNVSLENLKSYPFVKEGLEKGTLKLVGGHYDFVNGKFETWEP
+>gi|SIT99918.1|beta-carbonic_anhydrase|Mycobacterium_bovis_AF2122/97
+MTVTDDYLANNVDYASGFKGPLPMPPSKHIAIVACMDARLDVYRMLGIKEGEAHVIRNAGCVVTDDVIRS
+LAISQRLLGTREIILLHHTDCGMLTFTDDDFKRAIQDETGIRPTWSPESYPDAVEDVRQSLRRIEVNPFV
+TKHTSLRGFVFDVATGKLNEVTP
+>gi|XP_014177286.1|betaCA|Trichosporon_asahii_var.asahii_CBS_2479
+MSNYLQETHDRVFAQNKEWAAKQRAKDPEFFTRLAAGQSPEYLWIGCSDSRMPAEMITGLEPGEAFIHRN
+IANMVNNLDLSAMAVINYAVRHLKVKHIIVCGHYGCGGVQAAMTPKDLGILNPWLRNIRDVYRLHEKELD
+AIADDEKRYERLVELNVVEQCRNVIKTAAVQQSYAENEYPIVHGWVFDFRTGLLKDLEIDYAKVLKDIQK
+IYNLTE
diff --git a/CA_delta.txt b/CA_delta.txt
new file mode 100644
index 0000000..79048d7
--- /dev/null
+++ b/CA_delta.txt
@@ -0,0 +1,61 @@
+>jgi|Thaps3|262009|thaps1_ua_kg.chr_4000019|CA7_delta_TPS
+LTKKTARDWVWDPLEPGYILHFWAYSGSTTEPPCFEGVNWRIFDVPMKISPGQYQQLQRLMFDHVDPDTC
+KLTSTHYNESNARPVQPYRGGANYRCRRSGYVSDKERKASGLRRGFKDPADWRGVDLLPWIEGEFPNV*
+>jgi|Thaps3|233|fgenesh1_pm.C_chr_2000003|CA4_delta_TPS
+MGDITPNTKPYFQSSMCPVNVHWHLGSEHYSYGEFDENGNGPHGNVARPSWANRDLATDGAAVADGFRCH
+HYDENDPKFTTKYDWKHCHGMEVGETYEVHWPHSAAGACGTVNQYQTPFYDGVFCNLPMESFTTLGGQDI
+ANAVGVHGQVFTIVNDESYFYPDMIRGMIVEPEMNMGQDIAMYTGSTTGDSRSNEMCSQYAPITWQVDRK
+CHMISASSFDKLCYDMKMQRDDMSDDLHAHGSRELVKDEYVANNQANRNLRA
+>jgi|Thaps3|814|fgenesh1_pm.C_chr_19a_19000002|CA5_delta_TPS
+MVNNVDCVHTPGPQAGANVTKGYKGGMEVDYVPNTKPYFQSSMCPVNVHWHLGTEHYSAGEYDEFGTGPN
+SVNNNLPQNQQVRPGYRCRHFDKSQPMFTNEYRWEFCVGMQVGETYEVHWPHSAAGACGTPDQYQTPFYD
+GVFCNLDEEKFSTLSAQDVADAVGVQAQVFTVVNDERYFYPDLMRGFIKDGEYGKDIAMYTGSTTGTTRS
+NEVCSSYAPITWQVDRKCHLISASSFDRLCETMRLQRDNMTLDMHAHGSRELVKDSLVANNQANRRLGGH
+DHHHHHHGHDHADHLWADGHGHLHEEWF
+>jgi|Thaps3|34125|e_gw1.5.359.1|CA6_delta_TPS
+VPGPQAGGNVTKGYVGELDVGDLTPNTKQYFQSSMCPVNVHWHLGSEHYSYGEFDENGDGPHGNIPRPDW
+ANRDLAGAGESVPDGFRCHHFDETDAKFTTKYEWKHCEGMEVGETYEVHWPHSAAGACGTVNQYQTPFYD
+GVFCNLPMETFVTLGAQDIASAVGVHGQVFTVVNDESYFYPDMIRGMIVDPDMNMGQDVAMYTGSTTGDS
+RSNEMCSQYAPITWQVDRKCHMISASSFDKLCYDMKMQRDDMSDDLHAHGSRELVMDSLVANNQAN*
+>gi|OEU09193.1|delta_carbonic_anhydrase|Fragilariopsis_cylindrus_CCMP1102
+MTFYQAAVVALLASTVNNAVNAEEDCTSIVDLACGTEGFSTLCSVLTDVAPALDPDVVSSLKTVFAPTDD
+AFAAVKFDLVTEEALLDILGYHLSTFELTGECGSLIEMADGKDTRTLCNKDKEPVFQKGWANSRAVMPQF
+DPTAGIAVCGDATVYVIDSVLIPKDYFVDEEGEVVEDNVQEVIDAPDPNDGKDYFKELLIAKGTVTEGSN
+TCANTNPQFPNINCLGEDGTVDVGPQAAANVTKGYVGGMEVDIVPITKSYYQAGLCPVNVHWHLGSEHFS
+AGEFDCEDPKKCGPYHAADDAAHDDDGHTDDAGEGDSRRQLAGDARKGYQCNYYDEDDSKFTAPYDWQFC
+DKTMEVGQTYEIHWPHSSAGACGTPNQYQTPFYDGVFCNLPLDVFQTLSAQDIASNVGVQAQVFTIVNDE
+AYYYPNLFGGMIVDGDFGADMAIYTGSTTGTSRDNEVCSQYAPITWQVDRKCHMISASSFDKMCADMMAQ
+RDDMTDDLYAHGAREVTADIITADNQQTRGRGLRLRKNNKN
+>gi|XP_005772538.1|delta_CA|Emiliania_huxleyi_CCMP1516
+MSQADWLEQNVERISKDDLTETPTTAEALEGQPNEKAVIIGAAKATGSDIAYKLSHLLALVVAGVIALLA
+SAALADGRSVIKLKDTSNLPRLTALTATLDGETINLKDHGLDYRADELLGPQYGVGLHHDSSGYGWGKAG
+ARETLQEYIDELGLLQVIAAVPSVIATDGLKHPAHFLECTELKKAGLSAMSLAIIAEVASAVMIIFHGLA
+LVGLLPLSAKLAKGFAGLVWFTLTAGFLIVVCLPIGVYETEWTCNKDFVPAIRLWDHFVYNWAFPVGYLG
+YACSLLVFSVVLCFPSLEEGAQEFDKKKTKLGLVKVVAGLFVGLVIAASVSVGIAASQDAFKDPEVDPSV
+NPCKAQKPYHAAPGDNYFRNIECMKDNLVQHLSEGQYDYHGTGPAYNSTNSTKDLYANHVHGYHDRDAED
+YVSKEEYYESKKNKGDPYADDGKKKKEKKEWTERLGLRCHHYDDEHEMFKTVATGAKKPYEWKHCVEMMV
+GETYEVPWPHSAAGACGTEWQYPDALLRRRLLQEGVVNILTPLNTYEKIGVQGQVFTIVNSDEEQYQYEN
+LIDGAWMDGKDKWVDVAKYTGSTTGTTRNNEMCSRYAPITWQVDRTCHMISAKSFDKLCYDMKQKKDDMG
+GDLYPHGAREIVADYLVANNQQSRK
+>gb|ABS87870.1|delta_carbonic_anhydrase2|Lingulodinium_polyedrum
+MVARLMLAASVLLVRAWGTGCPDDPEVDLCSETTTDESGTGTGTEEVNVNGAMRTRTSLMPMLXLAGVFR
+SKNALFALPLLGXPLAAEAAAAAGTSGPSTCGAVKDMYKEQGCCGRPDKELDVVIVPKPTKRLFGANICE
+GKQPVHATPGDNYFKNVDCLNGTTLQVLEQAGANVTLGYRGRLDASSRTPILTPYWQNGLCPVNVHWHLG
+TEHYSKGQFDEHGTGPDIAAEEDAEGEADSRRLAVARRGYRCSKYDAKDAKFTTEYNWQHCEGMHVGETY
+EVHWPHSAAGACGTPYQYQTPFYDGVFCVDGIVSLSPLNTYMKIGVQSQVYTIVNDETYYYPEMIKGMIV
+DGHYGQDIAKYTGSTTGTSRDNEVCSRYTPITWQVDRKCHLISASSFDKMCADMKNQHDDMSSDLHAHGS
+RVLVDRNFTGNNFHRRM
+>ABG37687.1 delta-carbonic anhydrase [Emiliania huxleyi]
+MSQADWLEHNVERISKDDLTETPTTAEALEGQPNAKAVTIGAAKATGSDIAYKLSHLLALVVAGVIALLA
+SAALADGRSVIKLKDTSNLPRLTALTATLDGKTINLKDHGLDYRADELLGPQYGVGLHHDSSGYGWGKAG
+ARETLQEYIDELGLLQVIAAVPSVIATDGLKHPAHFLECTELKKAGLSAMSLAIIAEVASAVMIIFHGLA
+LVGLLPLSAKLAKGFAGLVWFTLTAGFLIVVCLAIGVYETEWTCNNDFVPAIRLSDHFVYNWAFPVGYLG
+YACSLLVFSVVLCFTSLEEGAQEFDKKKTKLGLVTVVAGLFVGLVIAASVSVGIAASQDAFKEVEVDPSV
+NPCKAQKPYHAAPGDNYFRNIECMKDNLVQVLEQAGANVTRGYVGGLDAGNWRTPILDHYDDTDLCTVNV
+HWHLGAEHLSEGQYDYHGTGPAYNSTNSTKDLYANHVHGYHDRDAEDYVSKEEYYESKKNKGDPYADDGK
+KKKEKKEWTERLGLRCHHYDDEHEMFKTAATGAKKPYEWKHCVEMMVGETYEVHWPHSAAGACGTEWQYQ
+TPFYDGVFCKEGVVNILTPLNTYEKIGVQGQVFTIVNSDEEQYQYENLIDGAWMDGKDKWVDVAKYTGST
+TGTTRNNEMCSRYAPITWQVDRTCHMISAKSFDKLCYDMKQKKDDMGGDLYPHGAREIVADYLVANNQQS
+RK
+
+
diff --git a/CA_zeta.txt b/CA_zeta.txt
new file mode 100644
index 0000000..1b47915
--- /dev/null
+++ b/CA_zeta.txt
@@ -0,0 +1,10 @@
+>gi|XP_002295227.1|TPSE|CA3_zeta
+MCMHVDLQVAMSSILSKLTGKDDTSAPPLTPKDIVAALQSRGWEAEIISASSISQDMVEVDPAGILKCVD
+GRGSDNTRMAGPKMPGGIYAIAHNRGTTSVDGLKEITKEVASKGHVPSVHGDHSADMLGCGFFRLWVTGE
+FDSMGYPRPEFDADQGAAAVKESGGVIEMHHGSHTEKVVYINLVENKTLEPDENDQRFIVDGWAAIKFNL
+DVVKFLVAAAATVEMLGGPRIAKIVVA
+>pdb|3BOH|Tweisflo|CA_zeta
+SHMSLTPDQIVAALQERGWQAEIVTEFSLLNEMVDVDPQGILKCVDGRGSDNTQFCGPKMPGGIYAIAHN
+RGVTTLEGLKQITKEVASKGHVPSVHGDHSSDMLGCGFFKLWVTGRFDDMGYPRPQFDADQGAKAVENAG
+GVIEMHHGSHAEKVVYINLVENKTLEPDEDDQRFIVDGWAAGKFGLDVPKFLIAAAATVEMLGGPKKAKI
+VIP
diff --git a/GCL.txt b/GCL.txt
new file mode 100644
index 0000000..1556797
--- /dev/null
+++ b/GCL.txt
@@ -0,0 +1,75 @@
+>jgi|Thaps3|35164|e_gw1.6.74.1|GCL_TPS
+TFKGETAAHIIYSKLLEHGTEVVNGYSGGAILPLLDQFHQNHPRHGDKKKIRWITNSNESSAGHVAEGIA
+KSSTEPDGKLAAGIIVATSGPGATNLVTPITDAMCDGVPLIVLCGQAATTAPQDAFQSCPAVEIMKPCTK
+WSYQIKNAAEVPFAMDYAFYLARNGRPGPVFIDLPKDLQIQQLNDEVIGNFLDGLGLYTEDESYNVEHDN
+EFMVDLIKNAKRPFIIAGQGANDSHEELMELAETLQIPVATTLHALGTFDERHPLATNMLGMHGHATPNY
+LIQDCDLLLCIGSRFDDRITGRPSDFIPAARQAAKEGRGGVIHVDVRFSENAKQVKPTYFVHSTGKKFLQ
+AVNSAIRANPPKDTSRTKQWIEKKKELEKEYPIRITKEVTQTNMNCQSVIAEMNRQLVESGKIDDTIFST
+GVGIHQMAAAQLITWTQPRQMLSSGSLGTMGVSLGYCMGAKLANPKKWCISVDGDGSFNMTFTELKTIGE
+EKIPVKLMILDNESQMMVEYWQRLFHDERYIAVRNKSPKYTTLASAFDIKSIYCECAEELEEKMRSFLFD
+YDDEPVLFHVRIERTPCLPMVAPGQPLDNMILVDEDFEVDKSAAPS*
+>jgi|Phatr2|56476|AGR_Contig7213|GCL_PTRI
+MKFSTAALIFAVSATASTAFVPHAFVSPKSPRPALFSTELRKTDVTADLKNGVTVNPFDQSALAAGVSPL
+TETGTATTSSSQHWDPQADAELAKLAAIEARAGAAAYMGQYEAQSGASLIYSKLVEHGVTVVNGFSGGAV
+LPLLDQFHEGHPRHETSGVTPIRWITNSNEASSGHIAEGYAKSMPINGPHKPVGVAVATSGPGVTNLITP
+LQDAICDGVPLVVLCGQAATVAPEDAFQSSPAVDLTRPCTKWSYQIKSAAEIPLVMDYAFYIARNGRPGP
+VFVDLPKDLLNQILTGDLINSFIDAENPGDETSFARLQKMYRPDGEVFQALHLGTGGKGLPFEIYKDEAA
+PQNTPTYKLKPVTHANTVDSYHADHHPSDRVIRTGKVVAGEHLPNEQGPLQVGGEMTKKITDLIMKAKKP
+VIIAGQGCNDASAELKIFADRLQIPVATTLHGLGCFDERSELALNMVGMHGHPTPNFMVQEADLIICVGS
+RFDDRITGRMSDFVPEARVAEEEGRGGVIHVDIRLTENAKQISPTFFVHSTGKKFLETMIEFLAGMDSKP
+NTSAWIKRMKELQKEYPVKIPSFPSETVSVTNEDGSTTETTRTRASAQSVVAELDRQLLAADAMDDAIFT
+TGVGIHQMVAAQLITWTQPRQMLSSGSLGTMGVALGYSIGAKLANADKMVIAVDGDGSFNMTFTELKTLA
+EQGIPVKIMILDNDGQMMVEYWQRLFHDNRLIAVRNSANPDYSTLAKAFGIKSVYCDCEEDLEARMKEFL
+FDDPDEPVLFHVRIERTPCLPLVAPGQPLQDMILEDVEVDVDKSAAPS*
+>gi|OOC01793.1|glyoxylate_carboligase|Amycolatopsis_azurea_DSM43854
+MPRIPAMQAVVDVLVSEGVDTAFGCPGAAILPLYHAMQDSGIEHLIVRHEEGATHMADGWARTTGNVGVA
+IGTSGPAGTNMITGLYTAQADSIPILCITGQADSRKLHTEAFQAVDIVEIAKPVTKWAVQVKEAAQLPWV
+FREAFRIARSGRPGPVLIDLPIDVQRQEIEWDSSIDSPLPVIRTTPSPARVERALDLLLAAERPLILAGG
+GVVLGGASDRLRTAAELLGVPVGVTLMGKGTFPEDHELFAGMAGIQTSQRWANAAFLEADLVLALGARFG
+DRHTGDLDVYRGSRKFIHVDIEPTQLGKVFGPDLGIVSDTGAFLDALIEAASKRSPARDRAWPRRIGELK
+ESLPRREDFEDTPIKAPRVFKEINEFYGEDAYFVTAIGLYQIWSGQFQRAHKPRHYQVCGQAGPLGWEIP
+AAIGVKKAKPEAEVVGVVGDYSFQFLVEELAVAAQYDVGFVLIMLNNEYLGLIRQAETGYEMNFEVDIHY
+DKNGTDNVKVMEAYGCSGTRVTEPGEIRTSLEWARKEAERTSRPVLVEIMIEREGNAAMGKALDSVVEFE
+PIAG
+>gi|SJM69470.1|Glyoxylate_carboligase|Gulosibacter_sp.10
+MAKMRAVDAAVLILEKEGATQAFGLPGAAINPFYSAMRAHGGIKHVLARHVEGASHMAEGYTRTRPGNIG
+VCIGTSGPAGTDMITGLYSASADSIPILCITGQAPVAKLDKEDFQAVDIASIAKPVTKLAKTVLEAGQVP
+GVFQEAFRLMRSGRPGPVLIDLPIDVQQTEIEFDIDSYEPSPVAKPAATRAQLERALELIEGAERPLLVA
+GGGILNAAAEADFRALAEELGIPVVPTLMGWGIIPDDHPLHAGMVGLQTSHRYGNENLLASDLVFGIGNR
+WANRHTGDVDTYRKGRTIIHADIEPTQIGRVFAPDYGIVSDAGELIRGLLELVRERSGSLRDRSGWAAEC
+QDRKARLQRKTNFDNVPIKPQRVYQEMNRAFGEDARYVTTIGLSQIAGAQMLHVFKPRHWINAGQAGPLG
+WTLPAALGAAVAEPETPVVALSGDYDFQFLIEELAVGAQHRIPYVHVVVNNSYLGLIRQAQRGFEMDFEV
+SLAFENINSSLEVQGETVKGYGVDHVKVAEGLGCKAIRVEDPSKLQEAFAQAQELAAEHRVPVVVEVILE
+RVTNISMAGASIDAVNEFEEIAETAEDAPTAILPIGSREQAAAPVASGA
+>gb|SJN08827.1|Glyoxylate_carboligase|Leucobacter_sp.7(1)
+MALMRAVDAAVLILEKEGATQAFGLPGAAINPFYSAMRAHGGIKHVLARHVEGASHMAEGFTRAEPGNIG
+ICIGTSGPAGTDMITGLYSAAADSIPILCITGQAPVAKLDKEDFQAVDIASIAKPVTKMAKTVLEAGQVP
+GVFQQAFYLMRSGRPGPVLIDLPIDVQQTQIEFDIDLYEPLPIAKPTASQAQIDGIFALLDAAERPVIVA
+GGGIINADASAEFVTLAETLGVPVIPTLMGWGTIPDDHELMAGMVGLQTQHRYGNENLLASDLVIGLGNR
+WANRHTGTLDVYTEGRKFVHIDIEPTQIGRVFSPDLGIVSDAGAAIAGLLATATERKAQGTLPDRSAWVA
+ETQERKGSLQRKTNFDNVPIKPQRVYQEMNRAFGRDTRYVTTIGLSQIAGAQMLHVYKPRHWINCGQAGP
+LGWTLPAALGVVAADPKTPVVALSGDYDFQFMIEELAVGAQFKLPYIHVVVNNSYLGLIRQAQRGFEMDY
+HVSLAFDNINSPETEGYGVDHIKVAEGLGCKAIRVREADDLAASFQRAKDLMQEFQVPVVVEVILERITN
+IAMSGAGIDAINEFEDLAEGPDDAPTATIPLKQPAEAAR
+>gb|SJN20588.1|Glyoxylate_carboligase|Vibrio_sp.JB196
+MAKMKAIEAAVEVLRKEGVDIAFGVPGAAINPFYAAMKKVGGIDHVLARHVEGASHMAEGYTRTNDNNIG
+VCVGTSGPAGTDMITGLYSASADSIPILCITGQAPRARLHKEDFQAVDIESIAKPVTKWATTVLEPALVP
+RAFQQAFHIMRSGRPGPVLIDLPIDVQLAEIEFDIDTYEPLQPYQPTATRAQVEKALTMLTESEKPLIVS
+GGGVINAGASAELQELAELLNVPVIPTLMGWGTIPDDHELMAGMVGLQTSHRYGNATMLASDFVLGIGNR
+WANRHTGSVDVYTQGRKFVHVDIEPTQIGRVFCPDLGIVSDAKSALTLFLDVAKEMKASGKLKNTSDWVS
+ECIERKASMLRKTHYEEVPMKPMRVYEEMNKAFGEDTCYVSTIGLSQIAAAQFLHVYKPRHWINCGQAGP
+LGWTIPAALGVRAADPKRPIVAISGDYDFQFMIEELAVGAQFKLPYIHVVVNNSYLGLIRQAQRQFDIDY
+CVQLAFDNQNAPEMEGYGVDHVAVVEGLGCKAIRVRNPEDAPAAFAQAKELMAKHQVPVVVEFILERVTN
+ISMGVEIDGVNEFESLALDPNDAPTAITFNQ
+>gb|WP_077156852.1|glyoxylate_carboligase|Burkholderia_sp.KK1
+MPKMRAVDAAVLVLEKEGIDTAFGVPGAAINPFYSAMKKSGGISHVLARHVEGASHMAEGYTRAAPGNIG
+VCIGTSGPAGTDMITGLYSAQADSIPILAITGQAPRARLYKEDFQAVDIESIAKPVTKWAVTVREPALVP
+RVFQQAFHLMRSGRPGPVLVDLPIDVQLAEIEFDIDTYEPLPVYKPKATRAQIEKALTMLNDAEKPLIVS
+GGGVLNAAAEDLLVQFAETLGVPVIPTLMSWGAIPDDHPLMAGMVGLQTSHRYGNATMLASDFVLGIGNR
+WANRHTGSVEVYTKGRKFVHVDIEPTQIGRVFGPDLGIVSDAKAALELFVEVAKEWKAAGKLKDRSAWVS
+DCQQRKRTLQRKTHFDNVPMKPQRVYEEMNLAFDRDTCFVTTIGLSQIAGAQFLHVFKARNWINCGQAGP
+LGWTIPAALGVRAADPQRKIVALSGDYDFQFMIEELAVGAQFKLPYVHVVVNNSYLGLIRQAQRGFDMDY
+CVQLAFDNINAPELEGYGVDHVAVAEGLGCKALRVHKPEDIAPALKQAQALAAEHQVPVVVEMILERVTN
+IAMGTEIDAINEFEELAETKADAPTAVTPLD
+
+
diff --git a/GDCT.txt b/GDCT.txt
new file mode 100644
index 0000000..d391257
--- /dev/null
+++ b/GDCT.txt
@@ -0,0 +1,57 @@
+>jgi|Thaps3|36208|e_gw1.9.80.1|GDCT_TPSE
+MLKSTATALLRRAKRTSILPSTSSRSLASSTNEEPLVKTSLYNLHKELGGDMVPFAGYELPVLYKGDNGG
+VMKEHLWCRSDGKASLFDVSHMGQIRWRGRDRAAFLEKIVVGDIAGLSEGSGCLSLVTNVNGGIIDDTVI
+TNAGDYIYMVVNGATKFGDMKHFKEQMESFDGDVNMEYLEDSMQLLAIQGPGAAEAVSKLLPGAFDLTKM
+AFMTGVDTTLDGVDGCRITRCGYTGEDGFEIAMPAEHAVSIASKLLSDPSVNPTGLGARDSLRLEAGLCL
+YGHDLDENTNPIEATLGWTMGGPKSRRRTEGGFLGAEHILKPDGKFQKVARKRVGIKGMKAPAREHAEIF
+DANGETKIGEVTSGTFSPCLKAPIAMGYVETELAKAGTEVNVQIRGKMQKAEIVRMPFVESRYYRIPE*
+>jgi|Phatr2|56477|AGR_jgi|Phatr1|28288|GDCT_PTRI
+MKRLSCVRGLRLRKGRVHLRCASSQTVPERANVVVVGGGIIGTSVAYHLAKAGVEDVLLLERDRLTSGTT
+WHAAGLMNSFGSMSSTSTWSRQYTQELYRDILPTETGLETGYMGIGFIELACDADRLEAFRRIAAFNRFL
+GVDVAEISPEQVKDLFPLCETSDVLSGFWVENDGRANPTDATMALAKGARLHGANIIEQCHVAGVTTSKP
+NGNYRAKVTGVRLENETVIAANIVVNCAGMWARQFGEACGVYNIPNQAAEHYYLITEPMKEIDPSWPVIE
+DSSKCVYIRPEGKGLMLGFFEWEGAAWKPEGVPLDFSFGELDPDWDRMMPYVEQAMKRVPAAENVGVKAL
+FCGPESFTPDNRPIVGESPELRNYYIAAGLNSIGILTGGGIGKILAQWIQQGCSPHDVDVTAIDASRFQR
+YQSNITYRNDRTGEALGNTYKVHYPDHQPTTCRNAKQSVLHERLVNANAFFQETSGWESPSWYAPHGTNP
+KVETESFGRENWFLHWEAEHISCRNNVALFDMSFMSKFHVQGNDAGKFLNRLSTANVDGDWGMITYTQWL
+DEQGYMAADLTITKMAENHFMVVATDTMLNKVYSHMLDRLVHGEHVFVTDVTGRYAQLNLQGPRSRELLQ
+GLTSVDLNNFAFRRAEEIDIGLARVLCIRITYVGELGYELFVPVEQARHVYDCIVELGREFSLSHAGLKA
+LGSLRMEKGYRDYGHDMDNTDRLLDCGLGFTCDFEKEGGFIGQKHVLAQKDAAKERGGLLKRIVNVLVLD
+PAPLLHHGEILWKDGRRISDIRAASYGHTVGGAVGLSMLTRDIPVKKNWLDGSDWEVEVGSRKHPCRLSI
+RPMYDPASVRVKDA*
+>gi|OEU06768.1|GDCT|Fragilariopsis_cylindrus_CCMP1102
+MLLSSFRRSAATAAFRSPYVVASASAATRRTMMAAAAAEPLMKTSLNEWHKELGGEMVPFAGYELPVLYK
+GDNVKNGGVMKEHLWCRSEGKASLFDVSHMGQIRWHGKDRAKFVEKMVVGDIQGLDTNHGCLSLITNDQG
+GIKDDTVIVNAGDYIHMVVNGSMKFSDMAHFQKHLDDYDGDVTMEYLEDDMQLLALQGSGSADVLSKLLP
+EGFNLKTMAFMTGLDTTIDGIENCRITRCGYTGEDGFEISTPSGISTIQIASKLCSDPNVNPTGLGARDS
+LRLEAGLCLYGNDLNETINPIMGTLAWTLGKGGPNARRRQEQDFTGASTFLKEDGKLKKQARKRIGIIGM
+KAPARQYTEIYDVDGIQLIGEITSGTFSPCLKKPIAMGYIDTIMSKNDTPIKLKIRNKMVDAHVTKMPFV
+ESNYYRVPE
+>gi|EWM26666.1|GDCT|Nannochloropsis_gaditana
+MAPRESDWFCGGAYMSAGGINLASPLDLMTPHQKLSSDDRALEKTALFDMHLEMKGKMVPFAGYELPVLY
+EMPEWGGIVKEHLHCRAKASVFDVSHMGQIKWHGKDRVKFLETLVVGDVAGLGVGEARLSLLTNKDGGII
+DDTIITNAGDYTYMIVNGATKGGDMAHFKEQMESFKGDVCFEYFHEQQLLALQGPSAAETLQALLPADVD
+LSKVNFMTGFDTTVGGLQARVTRCGYTGEDGFEVSVAWKDARALAELFLEGPGIRLAGLGARDSLRLESG
+LCLYGNDIDATITPVEAALGWTMGGPKGRRRKEQGFLGAEKFLSPEGKFLPISRKRVGLAGFKAPARAHT
+EIFDPSGVNKIGEVTSGTFSPSLNKPIAMGYVAKEFSAEGSKVAVKVRGKLQGADVTKMPFVTQHYYKAP
+>jgi|Emihu1|422537|estExtDG_fgenesh_newKGs_pm.C_760029
+MPAESELRKTPLHAEHLALGAKMGPFGGWDMPIQYPDGIMKSHLFTRAKAGLFDVSHMLGVVVRGADRAL
+PDGSGTLSVLTNEAGGIIDDMIITNAARGPSEKRKRGDHLYMVINAGHEDKDLPHMEAQLSKFDASVETL
+PNNGILALQGPAAAEVLQALTPVDLSQMPFMSARPMEVAGEQCFVARSGYTGEDGFEIAVPPGGGSQHAV
+RGLWSTLLEREDVTPVGLGARDSLRLEAGLCLYGERHASSARNDLDDTTSPVEGALAWVIAKRRRAEDGS
+FCGSDRILAELRDKSRTRARCGFVVDQGAPVREGTALRDESGAEVGIVTSGGFSPCLKKGIGMCYVTPGR
+NKSGTKLLAEVRGKTQSLTVTKMPFVEQRYYRGP*
+>gi|AAL33597.1|glycine_cleavage_complex_T-protein_partial|Zea_mays
+MRGGLWQLGQSVTRRLAQAEKKVIARRCFASEADLKKTALYDFHVANGGKMVPFAGWSMPIQYKDSIMDS
+TINCRENGSLFDVAHMCGLSLKGKDCIPFLEKLVVGDIAGLAPGTGTLSVLTNEKGGAIDDTVITKVTDD
+HIYLVVNAGCREKDLAHIEEHMKAFKAKGGDVSWHIHDERSLLALQGPLAAPVLQHLTKEDLSQVYFGQF
+TFLDINGFPCYLTRTGYTGEDGFEISVPNEYAVDLAKAMLEKSEGKVRLTGLGARDSLRLEAGLCLYGND
+LEQHITPIEAGLTWAVGKRRRAEGGFLGAEVILKQIADGPPQRRVGFISSGPPARGHSEIQNEKGESIGE
+ITSGGFSPCLKKNIAMGYVKSGNHKAGTKVNILVRGKPYEGVVTKMPFVPT
+>gi|NP_172650.1|Glycine_cleavage_T-protein_family|Arabidopsis_thaliana
+MRGGSLWQLGQSITRRLAQSDKKVVSRRYFASEADLKKTALYDFHVAHGGKMVPFAGWSMPIQYKDSIMD
+STVNCRENGSLFDVAHMCGLSLKGKDCVPFLETLVVADVAGLAPGTGSLTVFTNEKGGAIDDSVITKVTD
+EHIYLVVNAGCRDKDLAHIEEHMKAFKSKGGDVSWHIHDERSLLALQGPLAAPVLQHLTKEDLSKLYFGN
+FQILDINGSTCFLTRTGYTGEDGFEISVPDEHAVDLAKAILEKSEGKVRLTGLGARDSLRLEAGLCLYGN
+DMEQHISPVEAGLTWAIGKRRRAEGGFLGADVILQQLKDGPTIRRVGFFSSGPPARSHSEVHDESGNKIG
+EITSGGFSPNLKKNIAMGYVKSGQHKTGTKVKILVRGKPYEGSITKMPFVATKYYKPT
\ No newline at end of file
diff --git a/GK.txt b/GK.txt
new file mode 100644
index 0000000..a8f69f7
--- /dev/null
+++ b/GK.txt
@@ -0,0 +1,31 @@
+>XP_002288791.1|GK|Thalassiosira_pseudonana_CCMP1335
+ASEILAEAIHKYTTTTKLEGVVIVKDDHATPNEIETLKKHNIVVRSASHPVPDVRSVSGANEILQSASNS
+DEHTLVIACISGGGSALFCSPRDPLTLEELMATNAALLSSGMSVEKMNVIRKRLENGKGGKLAAAAYPAT
+VLTLVLSDIIGDPLDLIASGPTVPDVSSWMDACQLVDEYGLELDDTPKSSHPAFSNMPSHDQLQSETILV
+GNNHAAVMAAADMAEKLGYVPVVLGTRVDGEASVVAGVYTSMAEMLTQQRKNDGGKYPIAPLPAALIAGG
+ETTVTLPPKCSGKGGRNQELALAAALKLQEMSLRDVVLVSVGTDGTDGPTDAAGAIVDGASITRIEQNNK
+NKLSAKETLRNHDAYNFFDSDGDISLIRTGATGTNVADVCITLV
+>OEU14163.1|glycerate_kinase|Fragilariopsis_cylindrus_CCMP1102
+MLSCKYLARTTVVASTYFGTFLPFVSSYSHVVKVLQVGRMSSSFNNYGHDNNLKYPAITTMSSSSSSSSS
+SKIRQFSSSSNQEEHMTKDAMQIIHDAIRAVNPYTAIGSNFVRVNDTLKITNKEQQLEYNLPEDYDEIVI
+VAFGKASTSMATAVVQQIFPKIKNNGDCDDSTSSGHNIPCRGVVICKDEHITANEREVLTDHGIEAYEAS
+HPVPDARSSNAADKLLQMVSSRASPRTLVICCISGGGSSLFCRPTPPLTLQDLQQVNSVLLANGMDIQEM
+NVLRKRLEQGKGGRLAAACFPSHVVALILSDVIGDPLDLIASGPTVPDTSTWEDGWRILQQYNLKDKLPK
+VVVDMLQNGKNGRLEDSPSADHPVFENTKNILVGNNALAVEAASNTARSLGYNPVVLGTEIEGEAKEIAN
+VYTAMASYLQNAFSQKTTKNSITQEQQYMITQSLPTAIIAGGETTVTLTPNSGKGGRNQELALSAALKLE
+SLELRNVVLASVGTDGGDGPTDAAGAVVDATTIAGTRSQALEALANHNAYPYLDGLKGTTEWPPLIKTGP
+TGTNVADICVTLIKAKPE
+>KPK23911.1|glycerate_kinase|Nitrospira_bacterium_SG8_3
+MSVQQSILEEMRNQALEIFQAALRAVEPVEAILKHVKMEGESLLIGKRRMELSKFDRILVVGAGKADAPM
+AQAVESLLGERVSDGIIVVKDGHGLPLQRVKVHEASHPVPDERGLGGTEEILSLVSGAGERDLVICLISG
+GGSALLVAPAQGVTLKDKQQVTQLLLACGASIHEINTVRKHLSRVKGGGLAHAAHPATLVSLILSDVIGD
+DLDTIASGPTVPDSTTFHQAGQILERYGIWDQVPGSVRMYVKKGVKGEIAETPKPGDPSFQRDAWELVGT
+NLQALKAARKEAERLGYRTMILSGMMEGETREVAKAHAAIAKEVLNSENPIAPPACVLSGGETTVTLQGD
+GKGGRNTEFALASAIALEGVEHVIVLSGGTDGTDGLTDAAGAFADGKTVVRARQGELDPTDYIRRNDSYT
+FFETLGGLVITGPTRTNVMDVCVMLVRR
+>BAH57057.1|GK|Arabidopsis thaliana]
+MVHDYATTTNGTSKRCSALPTTNTVDVSSVSDLFEFICSGPLVNKIGITPQRVGQSIDKWLLYGSQLCRL
+FQLNELKLTIPQKARLYHYYIPVFIWCEDQIALHNSKFKDGDDVPPLVIGFSAPQGCGKTTLVFALDYLF
+KTTKKKSATISVDDFYLTAEGQAELRKKNPGNALLEYRGNAGSHDLKLSVETLEALSKLTKEGLKMKVPR
+YNKSAYSGRGDRADSSTWPEVEGPLSVILFEGWMLGFKPLPADVVKAVDPQLEVVNKNLEAYYDAWDKYI
+DAWVVIKIQDPSYVYRWRLQVCLSHNKTKQFLMRFFYTNTVLFLV
\ No newline at end of file
diff --git a/GOX.txt b/GOX.txt
new file mode 100644
index 0000000..aecae2f
--- /dev/null
+++ b/GOX.txt
@@ -0,0 +1,49 @@
+>jgi|Thaps3|406|fgenesh1_pm.C_chr_4000047|GOX_TPS
+MHVKICNAGDYQRVARSILPTPLYEYLASGTDDEQTLSENESAFKAWYLRPRVMRPVGSISTVTTLFGQR
+LSMPVFVSPAGVHALCDEVHGECAAARACGKVGTIFGLSQHATRSIEQVAEATQGNTNLWYQSYILKDRE
+MTLRLARRAAKAGYRGIFLTVDSVRFGFREADARNNFSSLPEPHRLVNYDDEVSQAQHPKKAWVAPEASV
+DKSKIYSGQEEAWDQNTEQLFEQNPSWEDVRWLKREVCRDLPLIVKGIMTAEDAIEAKKAGADGVMVSNH
+GGRGLDSALPTIDVLPEIVAAVGDQFPVLLDSGIRRGTDVLKALALGATAVGIGKPLFFALSVGGEDAVL
+NLLQMFQRETEAAMAICGCKSVSDVTRQLVTRHPSGSGRVGKYERSKL
+>jgi|Thaps3|3353|fgenesh1_pg.C_chr_3000287|GOX2_TPS
+MALNPHKLGVHKILKHIPLNALFDAPSKARLRKAVNIADLRLCAKQRAHKMVFDYLDAGADDEISLRRGK
+DAYSELEMHFHILSGLKPPLDLSTKIFGQDVKLPFFGCPTAGNRMFHWEGETAAAKAAQHHGTLYGLSSL
+ATTGITEIGKLTDGPKVFQLYVWKDRELVKEVLAKAKEGGFNAMALTVDFTWYGNRERDIRNDFSIPPKY
+SMAQIVEAIRKPAWTYDFLSHEPYTYACINTDVPADSLAAFVNSQLCPEFDWRDAEWLLGEWNMPSAVKG
+VCRPDDAIKAVETGFTTMWVSNHGARQLETSPATIDVLPSIREAVGPDVEIILDGGVQRGTDICKALALG
+ADSVGVGKPYLYGLAAGGTEGVIKAYDILKVELDRAMGLLGAGTVDELKKRGPGLIKRRHASARDYPDRY
+AYERGYGGGVI*
+>jgi|Phatr2|22568|estExt_gwp_gw1.C_chr_180099|GOX_PTRI
+MLEESEKRNLLNVDDYQVLAKTKLPHSLYEYLASGTADATTLRENRDAFARWYLRPRAMRPVGRISTRMV
+LFGQGLSMPVFCSPAGVHALCHPDGECATARVCQDLGLLFGLSQHATKSIEQVAAAAPQSHRYYQAYILK
+DRSITARLVQRAIQAGYSGIFLTVDSVRFGYREADARNGFDALPSPHRLANYDEVRQQNLDQTYNAKTHL
+AWDQNSELLFEQNVSWKDVTWLKEEVCGGLPLIVKGIMTAEDAVLAIEAGADAIMVSNHGGRQLDTCLGS
+IDVLPEVVMAVGGRVPVLLDGGVRRGTDVVKALALGAAAVGLGKPLFFALACGGESSLKDMLEILQTEIE
+VAMALCGCETISDIQSSHITRHPGGHFQSRL*
+>jgi|Phatr2|50804|estExt_fgenesh1_pm.C_chr_40021|GOX2_PTRI
+MIFNPHKLGLHKILKHIPLNAIFDAPYKRKLARAVNIADLRLIAKSRAHKMVFDYLDAGADDEISLRRGK
+DAYSEFEMHYKVLAGIKPPLDLSTKIFGQDVTLPFFGCPTAGNRMFHWEGETAAAKAAEHHGTMYGLSSL
+ATTGITEIGELFNGPKVFQLYVWKDRELVKDVLAKAKEGGFNALALTVDFTWYGNRERDIRNDFSIPPKY
+NITQTIEAIRKPAWTYDFLSHEPYTYACINTDVPADSLAAFVNSQLSPEFSWSDAEWLLGEWNGPAAPKG
+VVRPEDAKKAIEIGFSSIWVSNHGARQLETSPATIDVLPSIRAAVGPDVEIIMDGGVQRGTDICKALALG
+ADAVGVGKPYLWGLAAGGTAGVIKAYDILKVELDRAMGLLGTPTVAALKKEGPSLIKRRPGSARDYPDMY
+AYERGYGGGVV*
+>gb|Ectocarpus_siliculosus|CBN75171.1|Glycolate_Oxidase_(2-Hydroxyacid_Oxidase)
+MGSPEKKVPVDLSRCISLDDFQRQAKPILGKALYEYVASGTDDEQTLSENRQAFKRMFLLPRMMRVVSDI
+DLRLDVFGQRLSMPVFVSPAGVHKLMHPEGECATARACAEAGTLMGVSQHATVSLEDVAAAAPRCARWFQ
+LYILKDRELTAGILRRSEKAGYTAICLTVDSVRFGSREADWRNNFNGLPPGVTLANYPTQDGYNDRVKDA
+WDQNTEKLFDERATWSDIAWLKSLTSLPILVKGILTAQDAVSAVEAGASGVIVSNHGGRALDGSLSSIES
+LAPVVKAVRSVPTGANVPIFLDSGVRRGTDVLKALALGATAVLLGRPMFFSLAVGGQEGVQRMLSIIRDE
+LEAAMALCGCQRLQDITKDLVTDFREGGSTFHRPRL
+>jgi|Emihu1|99212|fgeneshEH_pg.18__10
+MRLARRAFSSVPPFAAAVDDALLTRLERAAHRVTTNASICDRHGDDESHHRSVPPSAVVYAHSTEEVQAV
+VRVCAETRTPLISFGAGTSLEGHIQAVQGGVCLDLSEMNAVLEVNPEDLDCRVQAGITRKSLNDHLRDTG
+LTFPVDPGADASLGGMAACGASGTTAVKYGTMRENCLGLTAVLASGEVVRTGGRARKSSAGYDLTRLLVG
+SEGTLAVLTEVQLKLYPLPAAVSAATCSFPTLSDAARAVAGLLQCGVPVSRSELLDASAIAAFNKYSTEV
+ADLQEAPTLFLEVEGVSEAAVEAAAAVARECCADSGGGEFQWATSESERRRLWAARHATYYASLALRPGS
+RGVVTDAVVPLSRLAEVMGETAADVAEAGVVGPIFGHAGDGNFHCILLLRDEDPPDYVERLSQLNDRLIR
+RTLAAGGSCTGEHGVGVGKKQYLAREFGEGAVEMMRTVKRSLDPLGILNPGKVVDVSKHEAVL*
+>gi|T002129.1|Arabidopsis_thaliana|glycolate_oxidase_translation|Arabidopsis_thaliana
+MEITNVTEYDAIAKAKLPKMVYDYYASGAEDQWTLQENRNAFARILFRPRILIDVNKIDMATTVLGFKISMPIMVAPTAFQKMAHPDGEYATARAASAAGTIMTLSSWATSSVEEVASTGPGIRFFQLYVYKNRKVVEQLVRRAEKAGFKAIALTVDTPRLGRRESDIKNRFTLPPNLTLKNFEGLDLGKMDEANDSGLASYVAGQIDRTLSWKDIQWLQTITNMPILVKGVLTGEDARIAIQAGAAGIIVSNHGARQLDYVPATISALEEVVKATQGRVPVFLDGGVRRGTDVFKALALGASGIFIGRPVVFALAAEGEAGVKKVLQMLRDEFELTMALSGCRSLSEITRNHIVTEWDTPRHLPRL
+>gi|BAA82872.1|GOX|Homo_Sapien
+mlprlicindyeqhaksvlpksiydyyrsgandeetladniaafsrwklyprmlrnvaetdlstsvlgqrvsmpicvgatamqrmahvdgelatvracqslgtgmmlsswatssieevaeagpealrwlqlyiykdrevtkklvrqaekmgykaifvtvdtpylgnrlddvrnrfklppqlrmknfetstlsfspeenfgddsglaayvakaidpsiswedikwlrrltslpivakgilrgddareavkhglngilvsnhgarqldgvpatidvlpeiveavegkvevfldggvrkgtdvlkalalgakavfvgrpivwglafqgekgvqdvlxilkeefrlamalsgcqnvkvidktlvrknplavski
diff --git a/GlcDH.txt b/GlcDH.txt
new file mode 100644
index 0000000..73f3e45
--- /dev/null
+++ b/GlcDH.txt
@@ -0,0 +1,72 @@
+>sp|P0AEP9.1|GLCD_ECOLI|Glycolate_oxidase_subunit_GlcD
+MSILYEERLDGALPDVDRTSVLMALREHVPGLEILHTDEEIIPYECDGLSAYRTRPLLVVLPKQMEQVTA
+ILAVCHRLRVPVVTRGAGTGLSGGALPLEKGVLLVMARFKEILDINPVGRRARVQPGVRNLAISQAVAPH
+NLYYAPDPSSQIACSIGGNVAENAGGVHCLKYGLTVHNLLKIEVQTLDGEALTLGSDALDSPGFDLLALF
+TGSEGMLGVTTEVTVKLLPKPPVARVLLASFDSVEKAGLAVGDIIANGIIPGGLEMMDNLSIRAAEDFIH
+AGYPVDAEAILLCELDGVESDVQEDCERVNDILLKAGATDVRLAQDEAERVRFWAGRKNAFPAVGRISPD
+YYCMDGTIPRRALPGVLEGIARLSQQYDLRVANVFHAGDGNMHPLILFDANEPGEFARAEELGGKILELC
+VEVGGSISGEHGIGREKINQMCAQFNSDEITTFHAVKAAFDPDGLLNPGKNIPTLHRCAEFGAMHVHHGH
+LPFPELERF
+>sp|P52073.1|GLCE_ECOLI|Glycolate_oxidase_subunit_GlcE
+MLRECDYSQALLEQVNQAISDKTPLVIQGSNSKAFLGRPVTGQTLDVRCHRGIVNYDPTELVITARVGTP
+LVTIEAALESAGQMLPCEPPHYGEEATWGGMVACGLAGPRRPWSGSVRDFVLGTRIITGAGKHLRFGGEV
+MKNVAGYDLSRLMVGSYGCLGVLTEISMKVLPRPRASLSLRREISLQEAMSEIAEWQLQPLPISGLCYFD
+NALWIRLEGGEGSVKAARELLGGEEVAGQFWQQLREQQLPFFSLPGTLWRISLPSDAPMMDLPGEQLIDW
+GGALRWLKSTAEDNQIHRIARNAGGHATRFSAGDGGFAPLSAPLFRYHQQLKQQLDPCGVFNPGRMYAEL
+>sp|Q55124|Q55124_SYNY3|Glycolate_oxidase_subunit_GlcD|Synechocystis
+MAIFSPVNAVTDIIPQLEKIVGQDGVIKRKDELFTYECDGLTGYRQRPALVVLPRTTEQVATIVKLCHDR
+QIPWIARGAGTGLSGGALPGADSLLIVTTRMRQILAVDYDNQTIVVQPGVVNNWVTQTVSGAGFYYAPDP
+SSQIVCSIGGNIAENSGGVHCLKYGTTTNHVLGLKLVIPDGSIVEVGGQVPETPGYDLTGLFVGSEGTLG
+IATEITLKILKTPESICVVLADFLSLEATAQSVADIIAAGIVPAGMEIMDNFSINAVEDVVATNCYPRDA
+AAILLVELDGLPIEVELNQAKVEEICRNNGARNTAIAYDQETRLKMWKGRKAAFAAAGKLSPSYFVQDGV
+VPRTQLVQILSDINDLSKKYGFAIANVFHAGDGNLHPLILYDQKVPGAWEKVEELGGEILKRCVELGGSL
+SGEHGIGIDKNCFMPNMFNEVDLETMQWVRQCFNPDNLANPGKLFPTPRSCGEVANAQRLNLGQDKKMEE
+IY
+>gi|BAA18106.1|slr0806|Synechocystis
+MDWSAIAASLTTQGLEVIQDPQQRKKLSTDYAHFSPILMAQLEGKQADLVVLARSEPEAIAVIRCCVANQ
+IPLTVRGAGTGNYGQCVPLEGGIVLDLSPMQRIISLEPGRAVVEPGVKLGKLEQQAKQMGWELRLLPSTY
+QTATVGGFVSGGSTGMGAVNYGTLFDPGNVQSLTVLTMEAEPQRLILSGEAAQPVIHGYGTNGIITEITL
+PLTPALPWREAIVSFTNLSSAIAFAQNLAHQDGIVSKEISIQADPIPQYFSSLKSYYQPGAHWVMVIVSE
+LDWLAFTQLAKASKGEIIFEQDPQSPGKKINLIEFNWNHTTLLARAVDPSLTYLQVFFYRDVEQILALAK
+LFKDEIMFHIEIMRIQGQMCLAGFPLVKFINGDRLEEIMAAHQNLGARIANPHTYSLAGGSVQPLPESQL
+IFKRQVDPLNLLNPGKLTD
+>gi|BAA16857.1|glycolate_oxidase_subunit_GlcE|Synechocystis
+MPMAVVSLPFSPQNFPHSSSCSVQDLPPHQQMAIAQALAEPEHAPSHWVAPESQQELQCLLSECDRNNWP
+VIPCGNQSKLAWGGLAKPVQLLVSSAGLNRIVDHAVADLTVTVEAGVKLKDLQAILQPHQQFLPLNPLYH
+DQATVGGIMATGCAGPWQQRYGGVRDLVLGFSFVRWDGQLAKAGGRVVKNVAGYDLMKLFVGSYGTLGFI
+SQITFRLYPLPSHSQTVFLTGDTNQLAKLSQALRRSGLAPTAALICSPALVQALNLGEELGLLVRFQNLE
+PVVQAQIDEVKKLAQTLTLASQSFDNQAESELWQRWENAMAGQGTTETILCKFGLLPAKAADFLQQLPGL
+GHVQLGNGIGWVRFGQLDREKLNQQRQICQNYGGYVTVLEASPECKKHWDVWGQSGHGLAMMGRLKNQFD
+PHNTFSPGRFVGGF
+>sp|Q0ZAZ1|Q0ZAZ1_CHLRE|Glycolate_dehydrogenase|Chlamydomonas
+MPRGQGKRLAQLLGAQLKQYAAEVRGISTAGGASRGGARGPASPSSLEQQTRQVAQVAVQQSTQQAVKVV
+VPAIKVDLVGAVSSVSESDKVEPGVFKNVDGHRFEDGRYAAFVEEITKFIPKERQYSDPVRTFAYGTDAS
+FYRLNPKLVVKVHNEDEVRRIMPIAERLQVPITFRAAGTSLSGQAITDSVLIKLSHTGKNFRNFTVHGDG
+SVITVEPGLIGGEVNRILAAHQKKNKLPIQYKIGPDPSSIDSCMIGGIVSNNSSGMCCGVSQNTYHTLKD
+MRVVFVDGTVLDTADPNSCTAFMKSHRSLVDGVVSLARRVQADKELTALIRRKFAIKCTTGYSLNALVDF
+PVDNPIEIIKHLIIGSEGTLGFVSRATYNTVPEWPNKASAFIVFPDVRAACTGASVLRNETSVDAVELFD
+RASLRECENNEDMMRLVPDIKGCDPMAAALLIECRGQDEAALQSRIEEVVRVLTAAGLPFGAKAAQPMAI
+DAYPFHHDQKNAKVFWDVRRGLIPIVGAAREPGTSMLIEDVACPVDKLADMMIDLIDMFQRHGYHDASCI
+GHALEGNLHLVFSQGFRNKEEVQRFSDMMEEMCHLVATKHSGSLKGEHGTGRNVAPFVEMEWGNKAYELM
+WELKALFDPSHTLNPGVILNRDQDAHIKFLKPSPAASPIVNRCIECGFCESNCPSRDITLTPRQRISVYR
+EMYRLKQLGPGASEEEKKQLAAMSSSYAYDGEQTCAADGMCQEKCPVKINTGDLIKSMRAEHMKEEKTAS
+GMADWLAANFGVINSNVPRFLNIVNAMYSVVGSAPLSAISRALNAATNHFVPVWNPYMPKGAAPLKVPAP
+PAPAAAEASGIPRKVVYMSSCVTRMMGPAASDTETAAVHEKVMSLFGKAGYEVIIPEGVASQCCGMMFNS
+RGFKDAAASKGAELEAALLKASDNGKIPIVIDTSPCLAQVKSQISEPSLRFALYEPVEFIRHFLVDKLEW
+KKVRDQVAIHVPCSSKKMGIEESFAKLAGLCANEVVPSGIPCCGMAGDRGMRFPELTGASLQHLNLPKTC
+KDGYSTSRTCEMSLSNHAGINFRGLVYLVDEATAPKKQAAAAKTA
+>gb|XP_002178591.1|glycolate_oxidase|Phaeodactylum_tricornutum
+MIFNPHKLGLHKILKHIPLNAIFDAPYKRKLARAVNIADLRLIAKSRAHKMVFDYLDAGADDEISLRRGK
+DAYSEFEMHYKVLAGIKPPLDLSTKIFGQDVTLPFFGCPTAGNRMFHWEGETAAAKAAEHHGTMYGLSSL
+ATTGITEIGELFNGPKVFQLYVWKDRELVKDVLAKAKEGGFNALALTVDFTWYGNRERDIRNDFSIPPKY
+NITQTIEAIRKPAWTYDFLSHEPYTYACINTDVPADSLAAFVNSQLSPEFSWSDAEWLLGEWNGPAAPKG
+VVRPEDAKKAIEIGFSSIWVSNHGARQLETSPATIDVLPSIRAAVGPDVEIIMDGGVQRGTDICKALALG
+ADAVGVGKPYLWGLAAGGTAGVIKAYDILKVELDRAMGLLGTPTVAALKKEGPSLIKRRPGSARDYPDMY
+AYERGYGGGVV
+>gb|XP_002183215.1|glycolate_oxidase|Phaeodactylum_tricornutum
+MLEESEKRNLLNVDDYQVLAKTKLPHSLYEYLASGTADATTLRENRDAFARWYLRPRAMRPVGRISTRMV
+LFGQGLSMPVFCSPAGVHALCHPDGECATARVCQDLGLLFGLSQHATKSIEQVAAAAPQSHRYYQAYILK
+DRSITARLVQRAIQAGYSGIFLTVDSVRFGYREADARNGFDALPSPHRLANYDEVRQQNLDQTYNAKTHL
+AWDQNSELLFEQNVSWKDVTWLKEEVCGGLPLIVKGIMTAEDAVLAIEAGADAIMVSNHGGRQLDTCLGS
+IDVLPEVVMAVGGRVPVLLDGGVRRGTDVVKALALGAAAVGLGKPLFFALACGGESSLKDMLEILQTEIE
+VAMALCGCETISDIQSSHITRHPGGHFQSRL
diff --git a/HR.txt b/HR.txt
new file mode 100644
index 0000000..9804d90
--- /dev/null
+++ b/HR.txt
@@ -0,0 +1,28 @@
+>jgi|Thaps3|2846|fgenesh1_pg.C_chr_2000801|HR
+MAAAATRFLMRRPTAIFLNSSRLDYDKALDFSLLSRLTDLTLNNVDSISSVDEIVQKVVDSKAEIVITKE
+MEVPLEALERLPTSVKLWCEAGTGYNNIPIAQARKQSIDVVNIPTYSTASVAHMVITYIMSFSSAIFKQA
+KMLHDGDQTNFRVFQHPIYEITAKKLGLIGGSGTIGTAVIDVALPLGMDVLVSSRSGKLPSGHKYESNPR
+VKVVSLDELLSTSDYVSINCPLNSDTRHSIGEREIRLMKPTAFLINTARGAIINEAELIQCMKENVIAGA
+GLDTQEMEPPKPDSDLWKLDNVFLTPHIGWRRLETRQRLVDMTTDNIDHYIKGELQNVVN*
+>jgi|Phatr2|56499|AGR_Contig1088|HR_PTRI
+MRTRIVSATAILPLFHNLGRRRFAERKRVGLILALFSTPLSNCNGRRRAVSAFLSPTNTLSSTKTPSKSF
+LYYSTKSRTSRSVTELLRLRASSLCLRSPKSTRTLSVMSLGAVSTESSERLVTTVTDHARTMADAAIHSV
+DPVTAVRDHVRKLVDLSSTAAANHTSKPGTKATLLHIGIDPHNMVNLSLSDYDHILVVAFGKASSAMATA
+LLERLTEGQPATNQLPSISGLVIVKDGHATPQQLEILQQSRYNISVREASHPVPDQRGVDASRKLLDLVH
+TYASPRTLVFALLSGGGSALFCAPHESLTLLDLQQTNQALLQSGWSITDMNVVRKRLETGKGGRLAAAAH
+PGTVVSLILSDVLGDPLDLIASGPTVPDTSTWSDAWALAETLPEKALPDAVRRLMRAGVDGHLPDSPSPS
+HGVFARAVTCLVGNNAKAVTAAATTAQRLGYHPVILGTRTEGEARQVARWLVQLAQHLALPETPSKQFSL
+ASLPAALICGGETTVTLPEQSQKHGKGGRNQELALAAALELQRVGLNSKNDVVVVVASVGTDGTDGPTDA
+AGAIVDGHTVDRLPGDALLALETHNAYPYLAQTDANGRSPLLKTGPTGTNVADVYLVLIQKSRLK*
+>gb|SJN46695.1|Hydroxypyruvate_reductase|Pseudoalteromonas_sp.JB197
+MKITILDNATLAKTSLDCIAQLGELTVHELTSAEQVVAHSKNADVLITNKAVVNRETMSQLKSLKLICVS
+ATGTNNVDLVAAKELGIAVTNVAGYSTPSVVQHTFSLITNLLGNTHRYQADCQQGAWQKSEMFCRLDYSF
+NDLQDKTFAIIGGGTLGSAVATVASAFGANVITAERKGAQCREGRIPFEQAIKTADIISVHCPLTDETRD
+LITLNELKIMKPSSIIINTARGGIINEADLATALEQNLIAGAGVDVLTKEPAELTNPLANYKGNNLLLTP
+HIAWASTESIVRLVNEVSLNIMAFTQQQSRNRLV
+>Q9CA90.1|HPR2|Arabidopsis_thaliana
+MESIGVLMMCPMSSYLENELEKRFNLLRFWTSPEKSVLLETHRNSIRAVVGNASAGADAQLISDLPNLEI
+VSSFSVGLDKIDLGKCKEKGIRVTNTPDVLTEDVADLAIGLILALLRRLCECDRYVRSGKWKQGEFQLTT
+KFSGKSVGIIGLGRIGTAIAKRAEAFSCPINYYSRTIKPDVAYKYYPTVVDLAQNSDILVVACPLTEQTR
+HIVDRQVMDALGAKGVLINIGRGPHVDEQELIKALTEGRLGGAALDVFEQEPHVPEELFGLENVVLLPHV
+GSGTVETRNAMADLVVGNLEAHFSGKSLLTPVV
\ No newline at end of file
diff --git a/ICL.txt b/ICL.txt
new file mode 100644
index 0000000..c58f67a
--- /dev/null
+++ b/ICL.txt
@@ -0,0 +1,38 @@
+>jgi|Thaps3|35523|e_gw1.7.84.1|ICL_TPS
+MRDIAIIEQWWRDPRWKGTKRTYSASDVASLRNSSEARGSSFVNPKCSYSNRSSRKLYQLLTSLHAAGGY
+SHTFGALDPVQVVQMAPHLSSIYISGWQCSSTASSTNEPGPDFADYPMNTVPLKCDQLVRAQLHHDRRQS
+EERASAILSNKTPAPKVDYLTPIVADGDTGHGGLSAVMKLVKLFVEAGAAGVHFEDQKPGTKKCGHMGGK
+VLVSTQEHVDRLVAARLAADVLGVELVIVARTDAEAATLLDSNVDGRDHPFILGATVPGTIPMNEAMKSA
+SASGGGNAANMENEWNAKARPMTFGEAVLDKILSSGVSQRKKDEMSRMWYASDPDTLSNANARRIADSIF
+GAKNSIYFDWEACRVREGYYRVKPGIEYCIQRARAYAPYADLIWMETATPGIPDARKFSEGVKKVYPNQM
+LAYNLSPSFNWDASGMTDDELARFNDDLGRLGYTWQFITLAGFHSNGLVVTKLARSFGDEGMLAYVREIQ
+RQEKEEEVELLKHQKWSGAELVDRMVNVASGGQSSTAAMGAGVTEDQFGKH*
+>jgi|Phatr2|51088|estExt_fgenesh1_pm.C_chr_150006|ICL_PTRI
+MKFTVASISGSSSASPTSNGATKEDVAKPKIPRGLRALPAAAASSRVSPGSELGMLRSEASSIDQWWKDP
+RWKNTTRVYSSTDVACLRPSAQARNNLRQAPGVSFSSQQSDKLWSLLVQLQARKGYSHTFGALDPVQVTQ
+MAPHLSSIYVSGWQCSSTASSTNEPGPDFADYPMNTVPNKVDQLVRAQLHHDRRQQQERSEALLAGKDPG
+QPVDYLRPIVADADTGHGGLSAVMKLTKLMVEAGAAGMHLEDQKPGTKKCGHMGGKVLVSTQEHIDRLVA
+SRLASDILGVNLILVARTDAEAATLLDSNIDGRDHPFILGVTTPGMPTLQDAVAKAPAGQANQVTTEWTK
+QANLMTFGEAVLATIQRSSKPAYQKRQMEQRWMASNPNTLSNAQARRMADEILGQANAVNFDWESCRVRE
+GYYQLRPGIEYCIQRAIAYAPYADLIWMETKIPAIDDAAQFSRGVHAVHPHQMLAYNLSPSFNWDASGMT
+DSQIASFNDDLGRLGYVWQFITLAGFHGNGLVMTKLARAYGDRGMIAYVEQIQRQERIHKVELLTHQKWS
+GAELVDQMVNVASGGVSSTAAMGAGVTEAQFGH*
+>gi|Q9SE26.1|Isocitrate_lyase|Dendrobium_crumenatum
+MASSSVPPMITEEEARFEAEVSAVESWWRTDRFRLTRRPYSARDVVSLRGTLHHSYASDQMAKKLWRTLK
+SHQSAGTASRTFGALDPVQVTMMAKHLDTIYVSGWQCSSTHTATNEPGPDLADYPYNTVPNKVEHLFFAQ
+LYHDRKQHEARVSMTREQRAKTPYVDYLRPIIADGDTGFGGATATVKLCKLFVERGAAGVHIEDQSSVTK
+KCGHMAGKVLVAVSEHINRLVAARLQFDVMGVETVLVARTDAVAATLIQSNVDLRDHQFILGATNPDFKR
+RSLAAVLSAAMAAGKTGAVLQAIEDDWLSRAGLMTFSDAVINGINRQNLPEYEKQRRLNEWAAATEYSKC
+VSNEQGREIAERLGAGEIFWDWDIARTREGFYRFRGSVEAAVVRGRAFAPHADLIWMETSSPDLVECGKF
+AQGMKASHPEIMLAYNLSPSFNWDAAGMTDEEMRDFIPRIAKMGFCWQFITLGGFHADALVTDTFAREFA
+KQGMLAYVERIQREERNNGVDTLAHQKWSGANYYDRYLKTVQGGISSTAAMGKGVTEEQFKEESRTGTRG
+LDRGGITVNAKSRL
+>gb|AAA33976.1|glyoxysomal_isocitrate_lyase, partial|Glycine_max
+EAEVAEVQAWWNSERFRLTKRPYTARDVVSLRGNLRQTYASNEMAKKLWRLLKNHQANGTASRTFGALDP
+VQVTQMAKHLDTIYVSGWQCSATHTTSNEPGPDLADYPYDTVPNKVEHLFFAQQYHDRKQKEERMRMSRE
+ERARTPYVDYLRPIIADGDTGFGGTTATVKLCKLFVERGAAGIHIEDQSSVTKKCGHMAGKVLVAISEHI
+NRLVAARLQFDVMGVETVLVARTDAEAANLIQSNIDTRDHQFILGVTNPNLKGKSLATLMQQGMAAGKNG
+AELQALEDEWLSKAQLKTLSEAVVEAIERQNNIGEEEKRRKLNEWMHHSSYERCLSNEEGREIAEKLGVR
+NLFWDWDLPRTREGFYRFKGSVTASVVRGCAFSPHADVIWMETASPNVVECTEFSEGVRSKHPQMMLGYN
+LSPSFNWDASGMSDEQMKDFIPKIAKLGYVWQFITVGGLHSNALITSTFARDFANRGMLAYVERIQREER
+NNGVDTLAHQKWAGANYYDRYLKTVQGGVASTAAMGKGVTEEQFKESWTRSGAVNIDRGSIVVAKARM
diff --git a/MAGIC_HMM.sh b/MAGIC_HMM.sh
new file mode 100644
index 0000000..74cb720
--- /dev/null
+++ b/MAGIC_HMM.sh
@@ -0,0 +1,10 @@
+#! /usr/bin/env bash
+#
+
+for i in SLC4 Bestrophin CA_beta CA_delta CA_zeta CA_alpha GOX GDCT PGP GCL HR SPT TSR ICL PK PEPC PEPCK MDH OMT ME PPDK PYC SHMT MS GlcDH ALAT_GGAT GK
+ do
+ #mafft "$i".txt> "$i"_aln.txt
+ #./hmmbuild "$i".hmm "$i"_aln.txt
+ ./hmmsearch -o "$1"_"$i"_hmmout.csv --tblout "$1"_"$i"_HMM.csv "$i".hmm "$1"
+ done
+
\ No newline at end of file
diff --git a/MASTER_pepTOhmm.ipynb b/MASTER_pepTOhmm.ipynb
new file mode 100644
index 0000000..84f0bc7
--- /dev/null
+++ b/MASTER_pepTOhmm.ipynb
@@ -0,0 +1,2097 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 1,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "from ftplib import FTP #import the ftp library\n",
+ "import re \n",
+ "import os\n",
+ "import pandas as pd\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 2,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "#cwd = os.getcwd()\n",
+ "#print cwd"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 3,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "## different classes\n",
+ "#t='Dinophyceae'\n",
+ "#t='Bacillariophyta'\n",
+ "t='Haptophyta'\n",
+ "#t='Raphidophyceae'\n",
+ "need='mmetsp_taxonomy.txt'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### MAFFT and HMMbuild in shell"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "###run mafft and hmmbuild first to make script faster\n",
+ "#os.system('./mafft_hmmbuil.sh')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Species list for HMMFUNCTION"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "will not retrieve files, but will create a list with names that we can pass to bash and other functions"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "set(['Exanthemachrysis', 'Gephyrocapsa', 'Chrysochromulina', 'Isochrysis', 'Pleurochrysis', 'Pavlova', 'Phaeocystis', 'Coccolithus', 'Imantonia', 'Unidentified eukaryote', 'Prymnesium', 'Chrysoculter', 'Scyphosphaera', 'Emiliania', 'Calcidiscus'])\n"
+ ]
+ }
+ ],
+ "source": [
+ "mt=open('mmetsp_taxonomy.txt','r')\n",
+ "g=[] #make an empty list to store genus names\n",
+ "for line in mt:\n",
+ " if re.search(t,line): #if taxa name in line\n",
+ " g= g+line.split('\\t')[7:8]#pull out the 8th field should be genus, keeping as list\n",
+ "\n",
+ "g=set(g) #keep only unique genus names\n",
+ "print g\n",
+ "#close the taxonomy file\n",
+ "mt.close()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 5,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['Gephyrocapsa-oceanica-RCC1303.pep.fa.gz', 'Chrysochromulina-polylepis-CCMP1757.pep.fa.gz', 'Isochrysis-galbana-CCMP1323.pep.fa.gz', 'Isochrysis-sp-CCMP1244.pep.fa.gz', 'Isochrysis-sp-CCMP1324.pep.fa.gz', 'Pleurochrysis-carterae-CCMP645.pep.fa.gz', 'Pavlova-sp-CCMP459.pep.fa.gz', 'Prymnesium-parvum-Texoma1.pep.fa.gz', 'Emiliania-huxleyi-374.pep.fa.gz', 'Emiliania-huxleyi-379.pep.fa.gz', 'Emiliania-huxleyi-CCMP370.pep.fa.gz', 'Emiliania-huxleyi-PLYM219.pep.fa.gz']\n"
+ ]
+ }
+ ],
+ "source": [
+ "ftp= FTP('ftp.imicrobe.us') #set home ftp server\n",
+ "ftp.login() #log in\n",
+ "ftp.cwd('camera/combined_assemblies') #change directory\n",
+ "\n",
+ "files=ftp.nlst() #make a list of all files and directories in wd\n",
+ "delimiter=' '\n",
+ "all=delimiter.join(files)\n",
+ "\n",
+ "names=[]\n",
+ "\n",
+ "for genus in g:\n",
+ " string= genus+\"\\S*.pep.fa.gz\"\n",
+ " taxafiles=re.findall(string, all)\n",
+ " #print \"{} files matching genus=\".format(len(taxafiles))+genus\n",
+ " #print taxafiles\n",
+ " if len(taxafiles) > 0:\n",
+ " for filex in taxafiles:\n",
+ " command = \"RETR \"+filex\n",
+ " outfile = filex\n",
+ " #ftp.retrbinary(command, open(outfile, 'wb').write)\n",
+ " names.append(outfile)\n",
+ " \n",
+ "ftp.quit()\n",
+ "\n",
+ "print names\n"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['Gephyrocapsa-oceanica-RCC1303.pep.fa.gz', 'Chrysochromulina-polylepis-CCMP1757.pep.fa.gz']\n"
+ ]
+ }
+ ],
+ "source": [
+ "names=names[:2]\n",
+ "print names"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### Retrieve PEP.fa in shell"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 147,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/plain": [
+ "0"
+ ]
+ },
+ "execution_count": 147,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "###run retriever in shell\n",
+ "os.system('python ./MMETSP_sample_import.py {} {}'.format(t,need))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "#### Fix error on Dinos"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 6,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "30\n",
+ "27\n"
+ ]
+ }
+ ],
+ "source": [
+ "##these species are not available for some reason so I removed them\n",
+ "print len(names)\n",
+ "\n",
+ "if t=='Dinophyceae':\n",
+ " names.remove('Durinskia-baltica-CSIRO_CS-38.pep.fa.gz')\n",
+ " names.remove('Oxyrrhis-marina-CCMP1795.pep.fa.gz')\n",
+ " names.remove('Alexandrium-fundyense-CCMP1719.pep.fa.gz')\n",
+ " \n",
+ "print len(names)"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### GET counts"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 115,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "names=[i.split('.')[0] for i in names] #removes .pep.fa.gz from the names\n",
+ "\n",
+ "ftp= FTP('ftp.imicrobe.us') #set ftp server\n",
+ "ftp.login() #log in\n",
+ "ftp.cwd('camera/combined_assemblies') #change directory\n",
+ "\n",
+ "t='/Users/maria_hernandez/Documents/Big_Data3050/CMM_MoreSP/' #location for files\n",
+ "for ID in names:\n",
+ " #change to taxa directory/readcounts\n",
+ " ripdir= ID+\"/readcounts\"\n",
+ " ftp.cwd(ripdir) #change directory\n",
+ " savefile= t+ID+\"_cds_counts.txt\" #saves files with unique names\n",
+ " ftp.retrbinary('RETR cds.dat', open(savefile, 'wb').write)\n",
+ " ftp.cwd(\"~/camera/combined_assemblies\") #change directory to restart loop in right place\n",
+ " \n",
+ "ftp.quit() "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 16,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "names='Chrysochromulina-polylepis-CCMP1757.pep.fa.gz'"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### RUN HMM in shell"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "for i in names:\n",
+ " os.system('./MAGIC_HMM.sh {}'.format(i))"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 22,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "for i in names:\n",
+ " os.system('./MAGIC_one.sh {}'.format(i))"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### HMM READ"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 8,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "def readHMM(Organism,gene_name):\n",
+ " \"\"\" Takes in organism and gene_name from HMM results and makes a table.\n",
+ " HMM results from --tblout that have the following name organism_genename_HMM.csv\n",
+ " Note: pep.fa files differ in structure and it can affect how the HMM output is written. If you can't read the file in \n",
+ " modify the function\"\"\"\n",
+ "\n",
+ " hold=pd.read_csv('{}_{}_HMM.csv'.format(Organism,gene_name),delim_whitespace=True,engine='python',skiprows=2,skipfooter=10)\n",
+ " #empty files have 12 rows so the following if statement will only work on files that are not empty\n",
+ " \n",
+ " if hold.shape[0]!=0:\n",
+ " readX=pd.read_csv('{}_{}_HMM.csv'.format(Organism,gene_name),delim_whitespace=True,engine='python',skiprows=3,\n",
+ " header=None ,skipfooter=10)\n",
+ " \n",
+ " samplenames=[]\n",
+ " for i in np.arange(1,readX.shape[1]+1):\n",
+ " samplenames.append(str(i))\n",
+ "\n",
+ " readX.columns=samplenames\n",
+ " \n",
+ " new=pd.DataFrame()\n",
+ " \n",
+ " new['CAMPEPid']=readX['1']\n",
+ " new['contig']=readX['19']\n",
+ " new['Evalue']=readX['5']\n",
+ " new['Annotation']='{}'.format(gene_name)\n",
+ "\n",
+ " new.contig=new.contig.str.split(\"|\").str[1]\n",
+ " new.contig=new.contig.str.split(\"_\").str[0]\n",
+ " return new\n",
+ " #return readX"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 9,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "
\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CAMPEPid | \n",
+ " contig | \n",
+ " Evalue | \n",
+ " Annotation | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " CAMPEP_0193794506 | \n",
+ " 169964 | \n",
+ " 1.600000e-125 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " CAMPEP_0193783668 | \n",
+ " 140936 | \n",
+ " 1.900000e-44 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " CAMPEP_0193795430 | \n",
+ " 170428 | \n",
+ " 1.400000e-43 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " CAMPEP_0193788850 | \n",
+ " 155911 | \n",
+ " 1.300000e-25 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " CAMPEP_0193725542 | \n",
+ " 15531 | \n",
+ " 4.000000e-24 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CAMPEPid contig Evalue Annotation\n",
+ "0 CAMPEP_0193794506 169964 1.600000e-125 Bestrophin\n",
+ "1 CAMPEP_0193783668 140936 1.900000e-44 Bestrophin\n",
+ "2 CAMPEP_0193795430 170428 1.400000e-43 Bestrophin\n",
+ "3 CAMPEP_0193788850 155911 1.300000e-25 Bestrophin\n",
+ "4 CAMPEP_0193725542 15531 4.000000e-24 Bestrophin"
+ ]
+ },
+ "execution_count": 9,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "readHMM('Chrysochromulina-polylepis-CCMP1757.pep.fa.gz','Bestrophin').head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "def makeHMM(Organism,gene_type):\n",
+ " \"\"\"Combines the readHMM tables for every gene in gene_type into a single table\"\"\"\n",
+ " if gene_type=='CF':\n",
+ " a=readHMM(Organism,'PK')\n",
+ " b=readHMM(Organism,'PEPC')\n",
+ " c=readHMM(Organism,'PEPCK')\n",
+ " d=readHMM(Organism,'PK')\n",
+ " e=readHMM(Organism,'MDH')\n",
+ " f=readHMM(Organism,'OMT')\n",
+ " g=readHMM(Organism,'PYC')\n",
+ " h=readHMM(Organism,'PPDK')\n",
+ " i=readHMM(Organism,'ME')\n",
+ " \n",
+ " frames = [a,b,c,d,e,f,g,i,h]\n",
+ " result = pd.concat(frames)\n",
+ " return result\n",
+ " if gene_type=='PR':\n",
+ " a=readHMM(Organism,'SHMT')\n",
+ " b=readHMM(Organism,'GOX')\n",
+ " c=readHMM(Organism,'GDCT')\n",
+ " d=readHMM(Organism,'PGP')\n",
+ " e=readHMM(Organism,'ICL')\n",
+ " f=readHMM(Organism,'GCL')\n",
+ " g=readHMM(Organism,'HR')\n",
+ " h=readHMM(Organism,'SPT')\n",
+ " i=readHMM(Organism,'TSR')\n",
+ " j=readHMM(Organism,'MS')\n",
+ " k=readHMM(Organism,'GlcDH')\n",
+ " l=readHMM(Organism,'ALAT_GGAT')\n",
+ " m=readHMM(Organism,'GK')\n",
+ " \n",
+ " frames = [a,b,c,d,e,f,g,h,i,j,k,l,m]\n",
+ " result = pd.concat(frames)\n",
+ " return result\n",
+ " if gene_type=='BP':\n",
+ " a=readHMM(Organism,'CA_alpha')\n",
+ " b=readHMM(Organism,'CA_delta')\n",
+ " c=readHMM(Organism,'CA_beta')\n",
+ " #d=readHMM(Organism,'Ca_zeta')\n",
+ " e=readHMM(Organism,'Bestrophin')\n",
+ " f=readHMM(Organism,'SLC4')\n",
+ "\n",
+ " frames = [a,b,c,e,f]\n",
+ " result = pd.concat(frames)\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 13,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " CAMPEPid | \n",
+ " contig | \n",
+ " Evalue | \n",
+ " Annotation | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " CAMPEP_0193716998 | \n",
+ " 10130_1 | \n",
+ " 8.300000e-164 | \n",
+ " CA_delta | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " CAMPEP_0193720114 | \n",
+ " 12139_1 | \n",
+ " 2.500000e-144 | \n",
+ " CA_delta | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " CAMPEP_0193740950 | \n",
+ " 25073_1 | \n",
+ " 2.000000e-112 | \n",
+ " CA_delta | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " CAMPEP_0193744210 | \n",
+ " 27355_1 | \n",
+ " 1.700000e-97 | \n",
+ " CA_delta | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " CAMPEP_0193731632 | \n",
+ " 19523_1 | \n",
+ " 7.400000e-17 | \n",
+ " CA_beta | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " CAMPEP_0193739048 | \n",
+ " 23878_1 | \n",
+ " 4.500000e-07 | \n",
+ " CA_beta | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " CAMPEP_0193794506 | \n",
+ " 169964_1 | \n",
+ " 1.600000e-125 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " CAMPEP_0193783668 | \n",
+ " 140936_1 | \n",
+ " 1.900000e-44 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " CAMPEP_0193795430 | \n",
+ " 170428_1 | \n",
+ " 1.400000e-43 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " CAMPEP_0193788850 | \n",
+ " 155911_1 | \n",
+ " 1.300000e-25 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " CAMPEP_0193725542 | \n",
+ " 15531_1 | \n",
+ " 4.000000e-24 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " CAMPEP_0193705254 | \n",
+ " 1979_1 | \n",
+ " 2.600000e-23 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " CAMPEP_0193801006 | \n",
+ " 173214_1 | \n",
+ " 5.100000e-23 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " CAMPEP_0193712428 | \n",
+ " 6963_1 | \n",
+ " 7.900000e-23 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " CAMPEP_0193742798 | \n",
+ " 26324_1 | \n",
+ " 1.400000e-21 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " CAMPEP_0193727680 | \n",
+ " 16768_1 | \n",
+ " 7.400000e-21 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " CAMPEP_0193741146 | \n",
+ " 25209_1 | \n",
+ " 4.600000e-20 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " CAMPEP_0193737912 | \n",
+ " 23215_1 | \n",
+ " 3.300000e-18 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " CAMPEP_0193731534 | \n",
+ " 19445_1 | \n",
+ " 1.400000e-12 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " CAMPEP_0193704942 | \n",
+ " 1794_1 | \n",
+ " 3.800000e-12 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " CAMPEP_0193706960 | \n",
+ " 3047_1 | \n",
+ " 5.300000e-12 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " CAMPEP_0193714820 | \n",
+ " 8398_1 | \n",
+ " 5.500000e-12 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " CAMPEP_0193764498 | \n",
+ " 56150_1 | \n",
+ " 1.500000e-11 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " CAMPEP_0193735246 | \n",
+ " 21696_1 | \n",
+ " 2.000000e-09 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " CAMPEP_0193761482 | \n",
+ " 51549_1 | \n",
+ " 2.400000e-09 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " CAMPEP_0193790988 | \n",
+ " 168202_1 | \n",
+ " 3.800000e-09 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " CAMPEP_0193785494 | \n",
+ " 145252_1 | \n",
+ " 1.300000e-08 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " CAMPEP_0193779272 | \n",
+ " 125944_1 | \n",
+ " 1.300000e-08 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " CAMPEP_0193779030 | \n",
+ " 125485_1 | \n",
+ " 1.500000e-08 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " CAMPEP_0193709764 | \n",
+ " 5146_1 | \n",
+ " 1.600000e-08 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 44 | \n",
+ " CAMPEP_0193708378 | \n",
+ " 4061_1 | \n",
+ " 4.700000e-06 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 45 | \n",
+ " CAMPEP_0193783502 | \n",
+ " 140102_1 | \n",
+ " 5.000000e-06 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 46 | \n",
+ " CAMPEP_0193742954 | \n",
+ " 26440_1 | \n",
+ " 5.900000e-06 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 47 | \n",
+ " CAMPEP_0193721942 | \n",
+ " 13485_1 | \n",
+ " 8.700000e-06 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 48 | \n",
+ " CAMPEP_0193746848 | \n",
+ " 31034_1 | \n",
+ " 2.400000e-05 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 49 | \n",
+ " CAMPEP_0193796456 | \n",
+ " 170945_1 | \n",
+ " 3.800000e-05 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 50 | \n",
+ " CAMPEP_0193724924 | \n",
+ " 15193_1 | \n",
+ " 6.700000e-05 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 51 | \n",
+ " CAMPEP_0193792018 | \n",
+ " 168721_1 | \n",
+ " 7.800000e-05 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 52 | \n",
+ " CAMPEP_0193708728 | \n",
+ " 4339_1 | \n",
+ " 8.100000e-05 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 53 | \n",
+ " CAMPEP_0193739010 | \n",
+ " 23853_1 | \n",
+ " 3.300000e-04 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 54 | \n",
+ " CAMPEP_0193782824 | \n",
+ " 136333_1 | \n",
+ " 5.700000e-04 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 55 | \n",
+ " CAMPEP_0193800510 | \n",
+ " 172967_1 | \n",
+ " 7.400000e-04 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 56 | \n",
+ " CAMPEP_0193766222 | \n",
+ " 61043_1 | \n",
+ " 8.100000e-04 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 57 | \n",
+ " CAMPEP_0193732752 | \n",
+ " 20295_1 | \n",
+ " 1.300000e-03 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 58 | \n",
+ " CAMPEP_0193760434 | \n",
+ " 50060_1 | \n",
+ " 6.400000e-03 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 59 | \n",
+ " CAMPEP_0193703674 | \n",
+ " 1074_1 | \n",
+ " 6.800000e-03 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 60 | \n",
+ " CAMPEP_0193760374 | \n",
+ " 49940_1 | \n",
+ " 1.000000e-02 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 61 | \n",
+ " CAMPEP_0193759686 | \n",
+ " 48915_1 | \n",
+ " 2.100000e-02 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 62 | \n",
+ " CAMPEP_0193752988 | \n",
+ " 39679_1 | \n",
+ " 4.200000e-02 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 63 | \n",
+ " CAMPEP_0193764752 | \n",
+ " 56822_1 | \n",
+ " 6.800000e-02 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 64 | \n",
+ " CAMPEP_0193741608 | \n",
+ " 25516_1 | \n",
+ " 9.000000e-02 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 65 | \n",
+ " CAMPEP_0193800210 | \n",
+ " 172820_1 | \n",
+ " 2.000000e-01 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 66 | \n",
+ " CAMPEP_0193738984 | \n",
+ " 23837_1 | \n",
+ " 1.300000e+00 | \n",
+ " Bestrophin | \n",
+ "
\n",
+ " \n",
+ " 0 | \n",
+ " CAMPEP_0193734326 | \n",
+ " 21185_1 | \n",
+ " 1.100000e-131 | \n",
+ " SLC4 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " CAMPEP_0193793958 | \n",
+ " 169690_1 | \n",
+ " 1.200000e-86 | \n",
+ " SLC4 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " CAMPEP_0193739320 | \n",
+ " 24061_1 | \n",
+ " 6.400000e-79 | \n",
+ " SLC4 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " CAMPEP_0193749936 | \n",
+ " 35400_1 | \n",
+ " 1.900000e-17 | \n",
+ " SLC4 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " CAMPEP_0193785558 | \n",
+ " 145391_1 | \n",
+ " 5.900000e-14 | \n",
+ " SLC4 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " CAMPEP_0193709316 | \n",
+ " 4808_1 | \n",
+ " 3.300000e+00 | \n",
+ " SLC4 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " CAMPEP_0193780838 | \n",
+ " 129005_1 | \n",
+ " 3.500000e+00 | \n",
+ " SLC4 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
80 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " CAMPEPid contig Evalue Annotation\n",
+ "0 CAMPEP_0193716998 10130_1 8.300000e-164 CA_delta\n",
+ "1 CAMPEP_0193720114 12139_1 2.500000e-144 CA_delta\n",
+ "2 CAMPEP_0193740950 25073_1 2.000000e-112 CA_delta\n",
+ "3 CAMPEP_0193744210 27355_1 1.700000e-97 CA_delta\n",
+ "0 CAMPEP_0193731632 19523_1 7.400000e-17 CA_beta\n",
+ "1 CAMPEP_0193739048 23878_1 4.500000e-07 CA_beta\n",
+ "0 CAMPEP_0193794506 169964_1 1.600000e-125 Bestrophin\n",
+ "1 CAMPEP_0193783668 140936_1 1.900000e-44 Bestrophin\n",
+ "2 CAMPEP_0193795430 170428_1 1.400000e-43 Bestrophin\n",
+ "3 CAMPEP_0193788850 155911_1 1.300000e-25 Bestrophin\n",
+ "4 CAMPEP_0193725542 15531_1 4.000000e-24 Bestrophin\n",
+ "5 CAMPEP_0193705254 1979_1 2.600000e-23 Bestrophin\n",
+ "6 CAMPEP_0193801006 173214_1 5.100000e-23 Bestrophin\n",
+ "7 CAMPEP_0193712428 6963_1 7.900000e-23 Bestrophin\n",
+ "8 CAMPEP_0193742798 26324_1 1.400000e-21 Bestrophin\n",
+ "9 CAMPEP_0193727680 16768_1 7.400000e-21 Bestrophin\n",
+ "10 CAMPEP_0193741146 25209_1 4.600000e-20 Bestrophin\n",
+ "11 CAMPEP_0193737912 23215_1 3.300000e-18 Bestrophin\n",
+ "12 CAMPEP_0193731534 19445_1 1.400000e-12 Bestrophin\n",
+ "13 CAMPEP_0193704942 1794_1 3.800000e-12 Bestrophin\n",
+ "14 CAMPEP_0193706960 3047_1 5.300000e-12 Bestrophin\n",
+ "15 CAMPEP_0193714820 8398_1 5.500000e-12 Bestrophin\n",
+ "16 CAMPEP_0193764498 56150_1 1.500000e-11 Bestrophin\n",
+ "17 CAMPEP_0193735246 21696_1 2.000000e-09 Bestrophin\n",
+ "18 CAMPEP_0193761482 51549_1 2.400000e-09 Bestrophin\n",
+ "19 CAMPEP_0193790988 168202_1 3.800000e-09 Bestrophin\n",
+ "20 CAMPEP_0193785494 145252_1 1.300000e-08 Bestrophin\n",
+ "21 CAMPEP_0193779272 125944_1 1.300000e-08 Bestrophin\n",
+ "22 CAMPEP_0193779030 125485_1 1.500000e-08 Bestrophin\n",
+ "23 CAMPEP_0193709764 5146_1 1.600000e-08 Bestrophin\n",
+ ".. ... ... ... ...\n",
+ "44 CAMPEP_0193708378 4061_1 4.700000e-06 Bestrophin\n",
+ "45 CAMPEP_0193783502 140102_1 5.000000e-06 Bestrophin\n",
+ "46 CAMPEP_0193742954 26440_1 5.900000e-06 Bestrophin\n",
+ "47 CAMPEP_0193721942 13485_1 8.700000e-06 Bestrophin\n",
+ "48 CAMPEP_0193746848 31034_1 2.400000e-05 Bestrophin\n",
+ "49 CAMPEP_0193796456 170945_1 3.800000e-05 Bestrophin\n",
+ "50 CAMPEP_0193724924 15193_1 6.700000e-05 Bestrophin\n",
+ "51 CAMPEP_0193792018 168721_1 7.800000e-05 Bestrophin\n",
+ "52 CAMPEP_0193708728 4339_1 8.100000e-05 Bestrophin\n",
+ "53 CAMPEP_0193739010 23853_1 3.300000e-04 Bestrophin\n",
+ "54 CAMPEP_0193782824 136333_1 5.700000e-04 Bestrophin\n",
+ "55 CAMPEP_0193800510 172967_1 7.400000e-04 Bestrophin\n",
+ "56 CAMPEP_0193766222 61043_1 8.100000e-04 Bestrophin\n",
+ "57 CAMPEP_0193732752 20295_1 1.300000e-03 Bestrophin\n",
+ "58 CAMPEP_0193760434 50060_1 6.400000e-03 Bestrophin\n",
+ "59 CAMPEP_0193703674 1074_1 6.800000e-03 Bestrophin\n",
+ "60 CAMPEP_0193760374 49940_1 1.000000e-02 Bestrophin\n",
+ "61 CAMPEP_0193759686 48915_1 2.100000e-02 Bestrophin\n",
+ "62 CAMPEP_0193752988 39679_1 4.200000e-02 Bestrophin\n",
+ "63 CAMPEP_0193764752 56822_1 6.800000e-02 Bestrophin\n",
+ "64 CAMPEP_0193741608 25516_1 9.000000e-02 Bestrophin\n",
+ "65 CAMPEP_0193800210 172820_1 2.000000e-01 Bestrophin\n",
+ "66 CAMPEP_0193738984 23837_1 1.300000e+00 Bestrophin\n",
+ "0 CAMPEP_0193734326 21185_1 1.100000e-131 SLC4\n",
+ "1 CAMPEP_0193793958 169690_1 1.200000e-86 SLC4\n",
+ "2 CAMPEP_0193739320 24061_1 6.400000e-79 SLC4\n",
+ "3 CAMPEP_0193749936 35400_1 1.900000e-17 SLC4\n",
+ "4 CAMPEP_0193785558 145391_1 5.900000e-14 SLC4\n",
+ "5 CAMPEP_0193709316 4808_1 3.300000e+00 SLC4\n",
+ "6 CAMPEP_0193780838 129005_1 3.500000e+00 SLC4\n",
+ "\n",
+ "[80 rows x 4 columns]"
+ ]
+ },
+ "execution_count": 13,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "makeHMM('Chrysochromulina-polylepis-CCMP1757.pep.fa.gz','BP')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 31,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "def HMMcmp(Organism,gene_type):\n",
+ "    '''Pair the HMM hits of one organism with its expression values.\n",
+ "\n",
+ "    Organism  -- peptide file name such as 'Genus-species-ID.pep.fa.gz'.\n",
+ "                 It is passed to makeHMM as given; with the '.pep.fa.gz'\n",
+ "                 suffix removed it names '<Organism>_cds_counts.txt'.\n",
+ "    gene_type -- gene-set code, forwarded unchanged to makeHMM.\n",
+ "\n",
+ "    Returns a DataFrame with the columns Contig, Evalue, Annotation and\n",
+ "    log2CPM. There is one row for every row of the counts table whose\n",
+ "    log2CPM is greater than -16; contigs that have no entry in the\n",
+ "    makeHMM result carry None/NaN in Evalue and Annotation.\n",
+ "    '''\n",
+ "    result=makeHMM(Organism,gene_type)\n",
+ "    # NOTE(review): dict() keeps only the last row per contig, so a contig\n",
+ "    # listed several times in result ends up with a single Evalue/Annotation.\n",
+ "    contig_EV=dict(zip(result.contig,result.Evalue))\n",
+ "    contig_Annot=dict(zip(result.contig,result.Annotation))\n",
+ "\n",
+ "    # Plain suffix test instead of a regex: in a pattern every '.' is a\n",
+ "    # wildcard and the match would not be anchored to the end of the name.\n",
+ "    suffix='.pep.fa.gz'\n",
+ "    if Organism.endswith(suffix):\n",
+ "        Organism=Organism[:-len(suffix)]\n",
+ "\n",
+ "    expression=pd.read_csv('{}_cds_counts.txt'.format(Organism),delimiter='\\t',index_col=0)\n",
+ "\n",
+ "    # log2 of (row total over all columns / 1e6); a row total of zero gives -inf.\n",
+ "    expression['log2CPM']=np.log2(expression.sum(axis=1)/1000000)\n",
+ "    # Keep the second '|'-separated field of each row label so the labels can\n",
+ "    # be looked up in the hit dictionaries built above.\n",
+ "    expression.index= expression.index.str.split('|').str[1]\n",
+ "    contig_CMP=dict(zip(expression.index,expression.log2CPM))\n",
+ "\n",
+ "    # Expression filter. Comparing the float against the strings '-inf' or\n",
+ "    # 'inf' can never be true, so the effective rule is this plain threshold;\n",
+ "    # -inf and NaN values fail it and are dropped.\n",
+ "    # NOTE(review): confirm that zero-count contigs are meant to be excluded.\n",
+ "    Contig=[i for i in expression.index if contig_CMP.get(i)>-16]\n",
+ "\n",
+ "    out=pd.DataFrame()\n",
+ "    out['Contig']=Contig\n",
+ "    out['Evalue']=[contig_EV.get(i) for i in Contig]\n",
+ "    out['Annotation']=[contig_Annot.get(i) for i in Contig]\n",
+ "    out['log2CPM']=[contig_CMP.get(i) for i in Contig]\n",
+ "\n",
+ "    return out"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 15,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Contig | \n",
+ " Evalue | \n",
+ " Annotation | \n",
+ " log2CPM | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 3_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -6.713308 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 1_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -6.339929 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 4_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -9.924541 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 6_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -14.844106 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 5_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -9.441721 | \n",
+ "
\n",
+ " \n",
+ " 5 | \n",
+ " 8_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -10.127438 | \n",
+ "
\n",
+ " \n",
+ " 6 | \n",
+ " 7_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -11.431723 | \n",
+ "
\n",
+ " \n",
+ " 7 | \n",
+ " 2_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -7.236558 | \n",
+ "
\n",
+ " \n",
+ " 8 | \n",
+ " 9_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -15.609640 | \n",
+ "
\n",
+ " \n",
+ " 9 | \n",
+ " 10_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -15.231129 | \n",
+ "
\n",
+ " \n",
+ " 10 | \n",
+ " 11_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -10.400187 | \n",
+ "
\n",
+ " \n",
+ " 11 | \n",
+ " 16_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -8.201098 | \n",
+ "
\n",
+ " \n",
+ " 12 | \n",
+ " 15_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -6.864807 | \n",
+ "
\n",
+ " \n",
+ " 13 | \n",
+ " 18_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -12.488625 | \n",
+ "
\n",
+ " \n",
+ " 14 | \n",
+ " 17_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -10.552190 | \n",
+ "
\n",
+ " \n",
+ " 15 | \n",
+ " 20_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -14.761644 | \n",
+ "
\n",
+ " \n",
+ " 16 | \n",
+ " 23_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -12.823044 | \n",
+ "
\n",
+ " \n",
+ " 17 | \n",
+ " 14_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -7.118589 | \n",
+ "
\n",
+ " \n",
+ " 18 | \n",
+ " 22_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -7.907814 | \n",
+ "
\n",
+ " \n",
+ " 19 | \n",
+ " 13_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -10.756643 | \n",
+ "
\n",
+ " \n",
+ " 20 | \n",
+ " 24_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -9.447753 | \n",
+ "
\n",
+ " \n",
+ " 21 | \n",
+ " 26_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -11.150209 | \n",
+ "
\n",
+ " \n",
+ " 22 | \n",
+ " 25_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -13.439715 | \n",
+ "
\n",
+ " \n",
+ " 23 | \n",
+ " 28_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -15.609640 | \n",
+ "
\n",
+ " \n",
+ " 24 | \n",
+ " 27_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -10.931569 | \n",
+ "
\n",
+ " \n",
+ " 25 | \n",
+ " 30_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -11.055052 | \n",
+ "
\n",
+ " \n",
+ " 26 | \n",
+ " 31_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -12.061204 | \n",
+ "
\n",
+ " \n",
+ " 27 | \n",
+ " 33_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -10.766662 | \n",
+ "
\n",
+ " \n",
+ " 28 | \n",
+ " 35_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -14.761644 | \n",
+ "
\n",
+ " \n",
+ " 29 | \n",
+ " 36_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -10.797142 | \n",
+ "
\n",
+ " \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ " ... | \n",
+ "
\n",
+ " \n",
+ " 49525 | \n",
+ " 173613_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -9.884445 | \n",
+ "
\n",
+ " \n",
+ " 49526 | \n",
+ " 173610_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -8.018679 | \n",
+ "
\n",
+ " \n",
+ " 49527 | \n",
+ " 173614_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -9.277828 | \n",
+ "
\n",
+ " \n",
+ " 49528 | \n",
+ " 173615_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -9.524301 | \n",
+ "
\n",
+ " \n",
+ " 49529 | \n",
+ " 173616_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -9.744216 | \n",
+ "
\n",
+ " \n",
+ " 49530 | \n",
+ " 173617_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -8.363612 | \n",
+ "
\n",
+ " \n",
+ " 49531 | \n",
+ " 173594_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -10.517941 | \n",
+ "
\n",
+ " \n",
+ " 49532 | \n",
+ " 173619_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -11.000831 | \n",
+ "
\n",
+ " \n",
+ " 49533 | \n",
+ " 173618_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -9.690777 | \n",
+ "
\n",
+ " \n",
+ " 49534 | \n",
+ " 173620_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -10.180025 | \n",
+ "
\n",
+ " \n",
+ " 49535 | \n",
+ " 173621_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -7.328406 | \n",
+ "
\n",
+ " \n",
+ " 49536 | \n",
+ " 173623_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -10.746693 | \n",
+ "
\n",
+ " \n",
+ " 49537 | \n",
+ " 173622_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -9.873577 | \n",
+ "
\n",
+ " \n",
+ " 49538 | \n",
+ " 173624_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -10.124214 | \n",
+ "
\n",
+ " \n",
+ " 49539 | \n",
+ " 173625_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -5.489727 | \n",
+ "
\n",
+ " \n",
+ " 49540 | \n",
+ " 173628_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -9.125825 | \n",
+ "
\n",
+ " \n",
+ " 49541 | \n",
+ " 173626_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -8.247258 | \n",
+ "
\n",
+ " \n",
+ " 49542 | \n",
+ " 173629_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -10.190102 | \n",
+ "
\n",
+ " \n",
+ " 49543 | \n",
+ " 173631_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -7.165661 | \n",
+ "
\n",
+ " \n",
+ " 49544 | \n",
+ " 173632_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -11.231129 | \n",
+ "
\n",
+ " \n",
+ " 49545 | \n",
+ " 173633_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -9.676540 | \n",
+ "
\n",
+ " \n",
+ " 49546 | \n",
+ " 173634_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -6.513189 | \n",
+ "
\n",
+ " \n",
+ " 49547 | \n",
+ " 173627_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -10.186735 | \n",
+ "
\n",
+ " \n",
+ " 49548 | \n",
+ " 173635_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -10.196859 | \n",
+ "
\n",
+ " \n",
+ " 49549 | \n",
+ " 173636_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -9.434715 | \n",
+ "
\n",
+ " \n",
+ " 49550 | \n",
+ " 173637_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -8.188417 | \n",
+ "
\n",
+ " \n",
+ " 49551 | \n",
+ " 173638_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -5.760235 | \n",
+ "
\n",
+ " \n",
+ " 49552 | \n",
+ " 173639_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -2.661282 | \n",
+ "
\n",
+ " \n",
+ " 49553 | \n",
+ " 173554_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -7.592275 | \n",
+ "
\n",
+ " \n",
+ " 49554 | \n",
+ " 173630_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -8.716036 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
49555 rows × 4 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Contig Evalue Annotation log2CPM\n",
+ "0 3_1 NaN None -6.713308\n",
+ "1 1_1 NaN None -6.339929\n",
+ "2 4_1 NaN None -9.924541\n",
+ "3 6_1 NaN None -14.844106\n",
+ "4 5_1 NaN None -9.441721\n",
+ "5 8_1 NaN None -10.127438\n",
+ "6 7_1 NaN None -11.431723\n",
+ "7 2_1 NaN None -7.236558\n",
+ "8 9_1 NaN None -15.609640\n",
+ "9 10_1 NaN None -15.231129\n",
+ "10 11_1 NaN None -10.400187\n",
+ "11 16_1 NaN None -8.201098\n",
+ "12 15_1 NaN None -6.864807\n",
+ "13 18_1 NaN None -12.488625\n",
+ "14 17_1 NaN None -10.552190\n",
+ "15 20_1 NaN None -14.761644\n",
+ "16 23_1 NaN None -12.823044\n",
+ "17 14_1 NaN None -7.118589\n",
+ "18 22_1 NaN None -7.907814\n",
+ "19 13_1 NaN None -10.756643\n",
+ "20 24_1 NaN None -9.447753\n",
+ "21 26_1 NaN None -11.150209\n",
+ "22 25_1 NaN None -13.439715\n",
+ "23 28_1 NaN None -15.609640\n",
+ "24 27_1 NaN None -10.931569\n",
+ "25 30_1 NaN None -11.055052\n",
+ "26 31_1 NaN None -12.061204\n",
+ "27 33_1 NaN None -10.766662\n",
+ "28 35_1 NaN None -14.761644\n",
+ "29 36_1 NaN None -10.797142\n",
+ "... ... ... ... ...\n",
+ "49525 173613_1 NaN None -9.884445\n",
+ "49526 173610_1 NaN None -8.018679\n",
+ "49527 173614_1 NaN None -9.277828\n",
+ "49528 173615_1 NaN None -9.524301\n",
+ "49529 173616_1 NaN None -9.744216\n",
+ "49530 173617_1 NaN None -8.363612\n",
+ "49531 173594_1 NaN None -10.517941\n",
+ "49532 173619_1 NaN None -11.000831\n",
+ "49533 173618_1 NaN None -9.690777\n",
+ "49534 173620_1 NaN None -10.180025\n",
+ "49535 173621_1 NaN None -7.328406\n",
+ "49536 173623_1 NaN None -10.746693\n",
+ "49537 173622_1 NaN None -9.873577\n",
+ "49538 173624_1 NaN None -10.124214\n",
+ "49539 173625_1 NaN None -5.489727\n",
+ "49540 173628_1 NaN None -9.125825\n",
+ "49541 173626_1 NaN None -8.247258\n",
+ "49542 173629_1 NaN None -10.190102\n",
+ "49543 173631_1 NaN None -7.165661\n",
+ "49544 173632_1 NaN None -11.231129\n",
+ "49545 173633_1 NaN None -9.676540\n",
+ "49546 173634_1 NaN None -6.513189\n",
+ "49547 173627_1 NaN None -10.186735\n",
+ "49548 173635_1 NaN None -10.196859\n",
+ "49549 173636_1 NaN None -9.434715\n",
+ "49550 173637_1 NaN None -8.188417\n",
+ "49551 173638_1 NaN None -5.760235\n",
+ "49552 173639_1 NaN None -2.661282\n",
+ "49553 173554_1 NaN None -7.592275\n",
+ "49554 173630_1 NaN None -8.716036\n",
+ "\n",
+ "[49555 rows x 4 columns]"
+ ]
+ },
+ "execution_count": 15,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "HMMcmp('Chrysochromulina-polylepis-CCMP1757.pep.fa.gz','BP')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 32,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "def HMMclean(Organism,evalue_cutoff=.00001):\n",
+ "    '''Count the significant HMM hits per annotation for one organism.\n",
+ "\n",
+ "    Runs HMMcmp for the 'BP', 'CF' and 'PR' gene sets, stacks the three\n",
+ "    tables and keeps the rows whose Evalue is below evalue_cutoff. Rows\n",
+ "    with a missing Evalue never satisfy the comparison and are dropped.\n",
+ "\n",
+ "    Organism      -- file name handed to HMMcmp; also used as the row label.\n",
+ "    evalue_cutoff -- exclusive upper bound on Evalue (default 1e-5).\n",
+ "\n",
+ "    Returns a one-row DataFrame indexed by Organism with one column per\n",
+ "    Annotation that still has at least one row after filtering; each cell\n",
+ "    is the number of remaining rows carrying that annotation.\n",
+ "    '''\n",
+ "    frames=[HMMcmp(Organism,gene_type) for gene_type in ('BP','CF','PR')]\n",
+ "    HMM=pd.concat(frames)\n",
+ "\n",
+ "    HMM=HMM[HMM['Evalue'] < evalue_cutoff]\n",
+ "\n",
+ "    # Tally the annotations once instead of once per gene inside the loop.\n",
+ "    counts=HMM.Annotation.value_counts()\n",
+ "\n",
+ "    Genes= np.unique(HMM['Annotation'])\n",
+ "    A=pd.DataFrame(0,index=[Organism],columns=Genes)\n",
+ "\n",
+ "    for j in Genes:\n",
+ "        A[j]=counts[j]\n",
+ "\n",
+ "    return A"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 17,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ALAT_GGAT | \n",
+ " Bestrophin | \n",
+ " CA_beta | \n",
+ " CA_delta | \n",
+ " GCL | \n",
+ " GDCT | \n",
+ " GlcDH | \n",
+ " HR | \n",
+ " ICL | \n",
+ " MDH | \n",
+ " ... | \n",
+ " OMT | \n",
+ " PEPC | \n",
+ " PEPCK | \n",
+ " PGP | \n",
+ " PK | \n",
+ " PYC | \n",
+ " SHMT | \n",
+ " SLC4 | \n",
+ " SPT | \n",
+ " TSR | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Chrysochromulina-polylepis-CCMP1757.pep.fa.gz | \n",
+ " 8 | \n",
+ " 48 | \n",
+ " 2 | \n",
+ " 4 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 9 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " ... | \n",
+ " 47 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 11 | \n",
+ " 9 | \n",
+ " 10 | \n",
+ " 4 | \n",
+ " 5 | \n",
+ " 3 | \n",
+ " 7 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1 rows × 21 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ALAT_GGAT Bestrophin CA_beta \\\n",
+ "Chrysochromulina-polylepis-CCMP1757.pep.fa.gz 8 48 2 \n",
+ "\n",
+ " CA_delta GCL GDCT GlcDH HR \\\n",
+ "Chrysochromulina-polylepis-CCMP1757.pep.fa.gz 4 2 1 4 9 \n",
+ "\n",
+ " ICL MDH ... OMT PEPC \\\n",
+ "Chrysochromulina-polylepis-CCMP1757.pep.fa.gz 3 2 ... 47 1 \n",
+ "\n",
+ " PEPCK PGP PK PYC SHMT \\\n",
+ "Chrysochromulina-polylepis-CCMP1757.pep.fa.gz 1 11 9 10 4 \n",
+ "\n",
+ " SLC4 SPT TSR \n",
+ "Chrysochromulina-polylepis-CCMP1757.pep.fa.gz 5 3 7 \n",
+ "\n",
+ "[1 rows x 21 columns]"
+ ]
+ },
+ "execution_count": 17,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "HMMclean('Chrysochromulina-polylepis-CCMP1757.pep.fa.gz')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 18,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['Gephyrocapsa-oceanica-RCC1303.pep.fa.gz', 'Chrysochromulina-polylepis-CCMP1757.pep.fa.gz']\n"
+ ]
+ }
+ ],
+ "source": [
+ "print names"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 33,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "def gen_type(gene_type):\n",
+ "    '''Build a table of HMM hit counts: gene families by organisms.\n",
+ "\n",
+ "    gene_type -- 'BP', 'PR' or 'CF'; selects the fixed list of gene\n",
+ "                 families that make up the rows of the result.\n",
+ "\n",
+ "    HMMclean is run once for every entry of the module-level list names\n",
+ "    and its counts are copied into the table; families that HMMclean does\n",
+ "    not report stay 0. Organism labels are cut at their first '.' and the\n",
+ "    table is transposed, so the returned DataFrame has gene families as\n",
+ "    rows and organisms as columns.\n",
+ "\n",
+ "    Raises ValueError when gene_type is not one of the three known codes.\n",
+ "    '''\n",
+ "    gene_sets={\n",
+ "        'BP':['SLC4','Bestrophin','CA_alpha','CA_beta','CA_delta','CA_zeta'],\n",
+ "        'PR':['PGP','GOX','SPT','ALAT_GGAT','GDCT','SHMT','HR','GK','GlcDH','MS','ICL','GCL','TSR'],\n",
+ "        'CF':['PK','PEPC','PEPCK','MDH','OMT','ME','PPDK','PYC'],\n",
+ "    }\n",
+ "    # Fail with a clear message instead of an unbound-variable error further down.\n",
+ "    if gene_type not in gene_sets:\n",
+ "        raise ValueError('unknown gene_type {!r}; expected BP, PR or CF'.format(gene_type))\n",
+ "    fixed=gene_sets[gene_type]\n",
+ "\n",
+ "    A=pd.DataFrame(0, index=names, columns=fixed)\n",
+ "\n",
+ "    for i,j in enumerate(names):\n",
+ "        # The HMMclean result does not depend on the gene family, so compute\n",
+ "        # it once per organism rather than once per (organism, family) pair.\n",
+ "        B=HMMclean(j)\n",
+ "        for k in fixed:\n",
+ "            if k in B.columns:\n",
+ "                # Positional scalar write; chained indexing (A[k][i]=...) may\n",
+ "                # assign into a temporary copy. B has exactly one row.\n",
+ "                A.iloc[i,A.columns.get_loc(k)]=B[k].iloc[0]\n",
+ "\n",
+ "    # Keep only the part of each organism label before the first '.'.\n",
+ "    A.index = A.index.str.split('.').str[0]\n",
+ "\n",
+ "    return A.transpose()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 30,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "#gen_type('BP')"
+ ]
+ },
+ {
+ "cell_type": "markdown",
+ "metadata": {},
+ "source": [
+ "### All sections in one table"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 34,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ " Gephyrocapsa-oceanica-RCC1303 \\\n",
+ "SLC4 6 \n",
+ "Bestrophin 37 \n",
+ "CA_alpha 2 \n",
+ "CA_beta 0 \n",
+ "CA_delta 2 \n",
+ "CA_zeta 0 \n",
+ "PK 4 \n",
+ "PEPC 1 \n",
+ "PEPCK 1 \n",
+ "MDH 4 \n",
+ "OMT 69 \n",
+ "ME 2 \n",
+ "PPDK 0 \n",
+ "PYC 16 \n",
+ "PGP 9 \n",
+ "GOX 1 \n",
+ "SPT 2 \n",
+ "ALAT_GGAT 13 \n",
+ "GDCT 4 \n",
+ "SHMT 4 \n",
+ "HR 10 \n",
+ "GK 0 \n",
+ "GlcDH 5 \n",
+ "MS 0 \n",
+ "ICL 4 \n",
+ "GCL 4 \n",
+ "TSR 7 \n",
+ "\n",
+ " Chrysochromulina-polylepis-CCMP1757 Isochrysis-galbana-CCMP1323 \\\n",
+ "SLC4 5 14 \n",
+ "Bestrophin 48 34 \n",
+ "CA_alpha 0 2 \n",
+ "CA_beta 2 7 \n",
+ "CA_delta 4 2 \n",
+ "CA_zeta 0 0 \n",
+ "PK 9 6 \n",
+ "PEPC 1 1 \n",
+ "PEPCK 1 2 \n",
+ "MDH 2 4 \n",
+ "OMT 47 75 \n",
+ "ME 3 8 \n",
+ "PPDK 0 0 \n",
+ "PYC 10 17 \n",
+ "PGP 11 11 \n",
+ "GOX 0 2 \n",
+ "SPT 3 6 \n",
+ "ALAT_GGAT 8 17 \n",
+ "GDCT 1 3 \n",
+ "SHMT 4 9 \n",
+ "HR 9 8 \n",
+ "GK 0 0 \n",
+ "GlcDH 4 7 \n",
+ "MS 0 2 \n",
+ "ICL 3 4 \n",
+ "GCL 2 2 \n",
+ "TSR 7 11 \n",
+ "\n",
+ " Isochrysis-sp-CCMP1244 Isochrysis-sp-CCMP1324 \\\n",
+ "SLC4 8 5 \n",
+ "Bestrophin 35 18 \n",
+ "CA_alpha 0 0 \n",
+ "CA_beta 5 4 \n",
+ "CA_delta 1 2 \n",
+ "CA_zeta 0 0 \n",
+ "PK 3 4 \n",
+ "PEPC 1 0 \n",
+ "PEPCK 1 1 \n",
+ "MDH 4 3 \n",
+ "OMT 61 28 \n",
+ "ME 3 3 \n",
+ "PPDK 0 0 \n",
+ "PYC 11 9 \n",
+ "PGP 7 4 \n",
+ "GOX 1 1 \n",
+ "SPT 2 4 \n",
+ "ALAT_GGAT 13 8 \n",
+ "GDCT 4 1 \n",
+ "SHMT 4 6 \n",
+ "HR 6 4 \n",
+ "GK 0 0 \n",
+ "GlcDH 4 2 \n",
+ "MS 1 1 \n",
+ "ICL 4 2 \n",
+ "GCL 4 1 \n",
+ "TSR 9 3 \n",
+ "\n",
+ " Pleurochrysis-carterae-CCMP645 Pavlova-sp-CCMP459 \\\n",
+ "SLC4 4 4 \n",
+ "Bestrophin 14 19 \n",
+ "CA_alpha 13 0 \n",
+ "CA_beta 15 9 \n",
+ "CA_delta 0 0 \n",
+ "CA_zeta 0 0 \n",
+ "PK 9 2 \n",
+ "PEPC 1 0 \n",
+ "PEPCK 2 2 \n",
+ "MDH 2 6 \n",
+ "OMT 51 27 \n",
+ "ME 2 2 \n",
+ "PPDK 3 1 \n",
+ "PYC 11 9 \n",
+ "PGP 3 3 \n",
+ "GOX 1 1 \n",
+ "SPT 2 3 \n",
+ "ALAT_GGAT 10 6 \n",
+ "GDCT 1 1 \n",
+ "SHMT 5 4 \n",
+ "HR 8 3 \n",
+ "GK 1 0 \n",
+ "GlcDH 2 5 \n",
+ "MS 0 1 \n",
+ "ICL 2 2 \n",
+ "GCL 1 4 \n",
+ "TSR 7 5 \n",
+ "\n",
+ " Prymnesium-parvum-Texoma1 Emiliania-huxleyi-374 \\\n",
+ "SLC4 17 8 \n",
+ "Bestrophin 32 14 \n",
+ "CA_alpha 0 0 \n",
+ "CA_beta 5 0 \n",
+ "CA_delta 6 1 \n",
+ "CA_zeta 0 0 \n",
+ "PK 6 3 \n",
+ "PEPC 2 1 \n",
+ "PEPCK 5 1 \n",
+ "MDH 4 3 \n",
+ "OMT 51 22 \n",
+ "ME 1 4 \n",
+ "PPDK 0 0 \n",
+ "PYC 15 9 \n",
+ "PGP 4 4 \n",
+ "GOX 2 1 \n",
+ "SPT 4 1 \n",
+ "ALAT_GGAT 14 7 \n",
+ "GDCT 4 2 \n",
+ "SHMT 5 4 \n",
+ "HR 7 4 \n",
+ "GK 0 0 \n",
+ "GlcDH 5 4 \n",
+ "MS 1 1 \n",
+ "ICL 6 3 \n",
+ "GCL 1 1 \n",
+ "TSR 9 4 \n",
+ "\n",
+ " Emiliania-huxleyi-379 Emiliania-huxleyi-CCMP370 \\\n",
+ "SLC4 9 6 \n",
+ "Bestrophin 22 38 \n",
+ "CA_alpha 0 3 \n",
+ "CA_beta 5 0 \n",
+ "CA_delta 1 1 \n",
+ "CA_zeta 0 0 \n",
+ "PK 3 5 \n",
+ "PEPC 1 1 \n",
+ "PEPCK 1 1 \n",
+ "MDH 3 5 \n",
+ "OMT 35 73 \n",
+ "ME 2 2 \n",
+ "PPDK 0 0 \n",
+ "PYC 9 13 \n",
+ "PGP 3 8 \n",
+ "GOX 1 1 \n",
+ "SPT 0 3 \n",
+ "ALAT_GGAT 11 15 \n",
+ "GDCT 3 4 \n",
+ "SHMT 5 6 \n",
+ "HR 5 7 \n",
+ "GK 0 0 \n",
+ "GlcDH 2 4 \n",
+ "MS 1 1 \n",
+ "ICL 2 8 \n",
+ "GCL 3 3 \n",
+ "TSR 6 6 \n",
+ "\n",
+ " Emiliania-huxleyi-PLYM219 \n",
+ "SLC4 8 \n",
+ "Bestrophin 32 \n",
+ "CA_alpha 4 \n",
+ "CA_beta 0 \n",
+ "CA_delta 2 \n",
+ "CA_zeta 0 \n",
+ "PK 5 \n",
+ "PEPC 1 \n",
+ "PEPCK 1 \n",
+ "MDH 2 \n",
+ "OMT 77 \n",
+ "ME 2 \n",
+ "PPDK 0 \n",
+ "PYC 16 \n",
+ "PGP 7 \n",
+ "GOX 1 \n",
+ "SPT 2 \n",
+ "ALAT_GGAT 12 \n",
+ "GDCT 3 \n",
+ "SHMT 5 \n",
+ "HR 6 \n",
+ "GK 0 \n",
+ "GlcDH 4 \n",
+ "MS 1 \n",
+ "ICL 4 \n",
+ "GCL 4 \n",
+ "TSR 9 \n"
+ ]
+ }
+ ],
+ "source": [
+ "a=gen_type('BP')\n",
+ "b=gen_type('CF')\n",
+ "c=gen_type('PR')\n",
+ "\n",
+ "framesX=(a,b,c)\n",
+ "outFrame=pd.concat(framesX)\n",
+ "\n",
+ "outFrame.to_csv('COP_ADD.csv')\n",
+ "\n",
+ "print outFrame"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 24,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "SLC4 5\n",
+ "Bestrophin 48\n",
+ "CA_alpha 0\n",
+ "CA_beta 2\n",
+ "CA_delta 4\n",
+ "CA_zeta 0\n",
+ "PK 9\n",
+ "PEPC 1\n",
+ "PEPCK 1\n",
+ "MDH 2\n",
+ "OMT 47\n",
+ "ME 3\n",
+ "PPDK 0\n",
+ "PYC 10\n",
+ "PGP 11\n",
+ "GOX 0\n",
+ "SPT 3\n",
+ "ALAT_GGAT 8\n",
+ "GDCT 1\n",
+ "SHMT 4\n",
+ "HR 9\n",
+ "GK 0\n",
+ "GlcDH 4\n",
+ "MS 0\n",
+ "ICL 3\n",
+ "GCL 2\n",
+ "TSR 7\n",
+ "Name: Chrysochromulina-polylepis-CCMP1757, dtype: int64\n"
+ ]
+ }
+ ],
+ "source": [
+ "print outFrame['Chrysochromulina-polylepis-CCMP1757']"
+ ]
+ }
+ ],
+ "metadata": {
+ "anaconda-cloud": {},
+ "kernelspec": {
+ "display_name": "Python [conda root]",
+ "language": "python",
+ "name": "conda-root-py"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 1
+}
diff --git a/MDH.txt b/MDH.txt
new file mode 100644
index 0000000..434d77e
--- /dev/null
+++ b/MDH.txt
@@ -0,0 +1,48 @@
+>jgi|Thaps3|20726|estExt_fgenesh1_pg.C_chr_10366|MDH1
+MIVNRLATKALTALRSTSSATLKSCFSTSTPTSAKVAVLGAAGGIGQPLSLLCKLSPEVSTLSCYDIVGT
+PGVAADLSHIPTKSGTMGRLPSPVQWPMAGNGGLEETLTGADVVVIPAGVPRKPGMTRDDLFNTNASIVK
+TLVEGCAQFCPDAVIAIISNPVNSTVPIAAEVLKKHGVYNPKKLAGVTTLDVCRANTFVANSQGLDPKDV
+NVTVIGGHAGITILPLFSRVEGAKFTDEELEAITVRTQFGGDEVVAAKAGAGSATLSMAYAGYVFTENVL
+KALRGEEIVQCAFVESGLTDAKYFASPVKFGKGGVEEILPLGALSAYEQGWFDKMMPELKKQIQKGEDFV
+NN*
+>jgi|Thaps3|25953|estExt_fgenesh1_kg.C_chr_40009|MDH2
+MKAFMKEVFLSYGVTPERAEVCSDVLIESDKRGIDSHGLGRLKPIYCDRMDDGILFPDKPIDIISESDTT
+ALVDGNLGLGLYIGPHCMQMAIDKAKKHGVGFVAVRNSTHYGIAGYYATMATQQGCIGLTGTNARPSIAP
+TFGVEPMMGTNPLTFGIPSSDDFPFVIDCATSVNQRGKIEKYAREGVDTPRGAVIDDQGIERTDTDGILR
+DMVLGKCALTPVGGAGDKMGGYKGYGWATTVELLCTALQSGPWGEDICGVDRATGKPKPMPLGHFFLAID
+IEKICPVDTFKKNSGEFLQALRDSKKAPNGPGRIWTAGEIENDARVERTAQGGMKVPIPLQKNMKALRDT
+RPGLKEKYVKLLFE*
+>jgi|Thaps3|41425|estExt_gwp_gw1.C_chr_70437|MDH3
+MAIQPSITRVAISGAAGNIGYALLPLLASGYVFGDDRSVELRLLEIPHAVKALAGVRMELIDCAFPCLTD
+VIITTEPEEAFEGADVIVLVGGFPRKQGMERKDLIHANTKIFTTMGRAIEEVASPNVKVLVVANPANTNC
+LVALNEASRIPSKNFCALTYLDHQRAKAQVAIRLGVRPNQVKNVSIWGNHSNTQYPDVLTDGYISFDSGE
+DIPISTLMANDLEWTNDDFVQIVQNRGKHVIEVRGNSSALSAAQATADCLATWLVTGTKRGETISMAVYN
+DKGYYGVKKGLVFSFPCECRDGDWFVKTGLELSDLAMEKLQVTENELKEEREDAEELIKQTRFRSMSTVS
+TASLASSTSEMELSVPPRVLTSRI*
+>jgi|Phatr2|51297|estExt_fgenesh1_pm.C_chr_310007|MDH_PTRI
+MFTARSLASVASYSSASVARQMSSASKKVAVLGAAGGIGQPLSMLLKLSPAIGELACYDIVGTPGVAADL
+SHIPTRARVSGCLPAAGAWPPRGNEGLGEALTGADVVVIPAGVPRKPGMTRDDLFNTNAGIVKTLIQGVA
+EFCPEAVIAIISNPVNSTVPIAAEILKQKGVYNPRKLCGVTTFDVIRANTFAAAHMGVDPASVDVTVIGG
+HAGITILPLYSQLEGFAPSDAEREAITVRTQFGGDEVVQAKAGSGSATLSMAYAGYLFTEKVLQGLNGEK
+VTQCAYVQSDLTDCKYFASPCEFGPNGVEKVLGYGTLSAYEQAWFDKMIPDLQKQIKKGEDFVNA*
+>gb|AAW79319.1_malate_dehydrogenase_partial|Isochrysis_galbana
+AVLGAAGGIGQPLSLLCKLSDHIDEVACYDVVGTPGVAADLSHIPSGAKITGDLPSAGTWPPSHNAGLER
+ALTGASVVVIPAGVPRKPGMTRDDLFNTNASIVKTLVEGCAKFCPDAVLAIISNPVNSTVPIAAEVLKKA
+GVYNKNKVVGVTTLDVCRANTFLAEKLGKSPKDINVPVIGGHAGITILPLLSQVPGASALPADVTAALTH
+RIQFGGDEVVQAKAGSGSATLSMAYAGFLFTEGLIKAMKGEEVIQCAYVESTLTPAAYFASPCKFGPEGV
+KEVLGFGTLSAYEKQWFDKMVPDLQKQIAKGIDFVNSPA
+>gb|XP_001418129.1|predicted_protein|Ostreococcus_lucimarinus2_CCE9901
+MPASTVDDVVTVLITGAAGQIGYALAPMVCAGAATGPGKKIALKLLDVEFASEALRGVKMEIMDCAFDAC
+VSVDVFTDCEKACEGVDVAIMVGGFPRKQGMERKDVLGKNVAIYKQQASALASKAKKDVKIVVVANPANT
+NAKILAKFAPSIPRGNVTCMTRLDHNRALAQLGERSGKATIEVKNAIIWGNHSSTQYPDVNHATIEGKPA
+REVIGNDAYLDGDFVDVVRARGAAIIEARKLSSALSAASSVCDHVYDWIHGTKEGEWTSMGVISDGSYGV
+PEGLVYSFPVTCTGGKWQIVQGLSIDERSRKLMDESAKELTEEFELAEQCLAESA
+>gb|EKX43420.1|hypothetical_protein_GUITHDRAFT_163842|Guillardia_theta_CCMP2712
+MSKPVMTVCITGAAGQIGYALLPHLCSGKTFGPDQPVKLHLLDLNIEGVQTALNGVKMELEDATYPLLKG
+VVCTGDAKVAFTGADAVIMLGAFPRKDGMERKDLLEKNCGIFKEQGELLNTVASKTVKVLVVGNPANTNC
+LIAAECAPNIPRENFSALTRLDHNRAIAQLAIKASVPVEQVQNSIIWGNHSSTQYPDINAATINGKKAKE
+VVNNDEWYKNEFIPCIQKRGAAIIAARKLSSALSAAQAISDHMHDWFLGTPAGQFVSMAVDSTGNKYGVA
+DGLIYSFPVSCSNGKWTIKEGLEIDDFSKEKMKATEQELTDEKKTAMEILGK
+
+
+
diff --git a/ME.txt b/ME.txt
new file mode 100644
index 0000000..cba49fc
--- /dev/null
+++ b/ME.txt
@@ -0,0 +1,63 @@
+>XP_645111.1 NADP-dependent malate dehydrogenase [Dictyostelium discoideum AX4]
+MQNKPSFILRNPSANKGTGFNNEEREKLGLKGLLPPKVESLQEQSDRALSQFTSFNTNLERYIFLNCLRD
+RNETLFYYLLSNNLELMMPIIYTPTVGEACQKFGNEFRFAQGMYFASQDKGNIRAMMDNWPAEGVDIIVV
+SDGSRILGLGDLGTNGMGIPVGKLQLYVAGAGFCPTRTLPVIIDSGTNTKKYLEDKYYLGERHPRIPDSE
+YYPLVDEFLAAAFNKWPKVIVQFEDISNDHCFNLLDEYRNKYLCFNDDIQGTGSVILSGFINAVRSVQKP
+IKEHRMVFLGAGSAGIGVADCIMSLFDEAGVSKEEARKSFWFVDSKGLITTTRGDELTSQKKQYAREDYT
+YQLKSLLEVVRDVKPTAIIGLSGIGGSFSQEVIEEMAKHVEKPIVFALSNPTTNAECTAEQAYQWTDGRC
+IFASGSPFKPVEYKGKTFVPGQGNNMYIFPGLGLAASVCEAKHVTDAMIITAAKTLASFVEDSEVLTGKI
+YPGLQHIREISTRIAVKVIEKAYEEGMAQLPRPDNIEALVKSRQYVPSYDKSKN
+>CAK01689.1 NAD-dependent malic enzyme [Bartonella tribocorum CIP 105476]
+MSRERNNDFSSPIAELDSAALFYHQHPKPGKLEIQATKPLDNQRDLALAYSPGVAAPCLAIHEDPNLAAQ
+YTSRSNLVAVISNGTAVLGLGNIGPLASKPVMEGKAVLFKKFANIDVFDIEIDASDIEQMVQTVSSLEPT
+FGGINLEDIKAPECFEIEEKLRAKMNIPVFHDDQHGTAIIVSAAVLNALNLSGKKIENAKIVTSGAGAAA
+LACLNLLVRLGAKVENIWLSDLEGVVYEGRKTLMDRWKVNYAQKTDARTLSDIIDNADIFLGLSAGGVLK
+PEYLKKMAQNPLILALANPVPEIMPEKAHSIRPDAMICTGRSDYPNQVNNVLCFPYIFRGALDVGATAIN
+EEMKMAAVHAIAALAREETSDVVARAYSKEPPNFGPDYLIPSPFDPRLILRIAPAVAKAAMATGVALRPI
+EDMEAYYDILNRFVFLSGLTMKPVFAAAKTSKRKRVIYANGEDERVLRAAQVVIEEQTATPLLIGRPHVI
+EARLKRFGLRIRPNIDFELTNPEDDPRFRDYVNLFFHYTGRRGVTPEMAKTIVRTSTTAIAALAVMREEA
+DAMICGLEGRFERQLELIEQIIGLDPHVHRFSAMSLLISQQRTLFLTDTYVNENPSAEEIAEMTVLAAQE
+VEAFGITPKAALLSHSNFGSKNTESARKMRRATEILAKLHPHLEADGEMHGDAALSKVFRDRVFPDSRLK
+SEANLLVFPTLDSANITLNTVKSLTNALHVGPILIGAARPAHILTPSVTSRGVVNITALAVLAANRKNSL
+VK
+>AEE81903.1 NAD-dependent malic enzyme 2 [Arabidopsis thaliana]
+MMWKNIAGLSKAAAAARTHGSRRCFSTAIPGPCIVHKRGADILHDPWFNKDTGFPLTERDRLGIRGLLPP
+RVMTCVQQCDRFIESFRSLENNTKGEPENVVALAKWRMLNRLHDRNETLYYRVLIDNIKDFAPIIYTPTV
+GLVCQNYSGLYRRPRGMYFSAKDKGEMMSMIYNWPAPQVDMIVITDGSRILGLGDLGVQGIGIPIGKLDM
+YVAAAGINPQRVLPIMLDVGTNNEKLLQNDLYLGVRQPRLEGEEYLEIIDEFMEAAFTRWPKAVVQFEDF
+QAKWAFGTLERYRKKFCMFNDDVQGTAGVALAGLLGTVRAQGRPISDFVNQKIVVVGAGSAGLGVTKMAV
+QAVARMAGISESEATKNFYLIDKDGLVTTERTKLDPGAVLFAKNPAEIREGASIVEVVKKVRPHVLLGLS
+GVGGIFNEEVLKAMRESDSCKPAIFAMSNPTLNAECTAADAFKHAGGNIVFASGSPFENVELENGKVGHV
+NQANNMYLFPGIGLGTLLSGARIVTDGMLQAASECLASYMTDEEVQKGILYPSINNIRHITAEVGAAVLR
+AAVTDDIAEGHGDVGPKDLSHMSKEDTVNYITRNMWFPVYSPLVHEK
+>AEC06242.1 NAD-dependent malic enzyme 1 [Arabidopsis thaliana]
+MGIANKLRLSSSSLSRILHRRILYSSAVRSFTTSEGHRPTIVHKQGLDILHDPWFNKGTAFTMTERNRLD
+LRGLLPPNVMDSEQQIFRFMTDLKRLEEQARDGPSDPNALAKWRILNRLHDRNETMYYKVLINNIEEYAP
+IVYTPTVGLVCQNYSGLFRRPRGMYFSAEDRGEMMSMVYNWPAEQVDMIVVTDGSRILGLGDLGVHGIGI
+AVGKLDLYVAAAGINPQRVLPVMIDVGTNNEKLRNDPMYLGLQQRRLEDDDYIDVIDEFMEAVYTRWPHV
+IVQFEDFQSKWAFKLLQRYRCTYRMFNDDVQGTAGVAIAGLLGAVRAQGRPMIDFPKMKIVVAGAGSAGI
+GVLNAARKTMARMLGNTETAFDSAQSQFWVVDAQGLITEGRENIDPEAQPFARKTKEMERQGLKEGATLV
+EVVREVKPDVLLGLSAVGGLFSKEVLEAMKGSTSTRPAIFAMSNPTKNAECTPQDAFSILGENMIFASGS
+PFKNVEFGNGHVGHCNQGNNMYLFPGIGLGTLLSGAPIVSDGMLQAASECLAAYMSEEEVLEGIIYPPIS
+RIRDITKRIAAAVIKEAIEEDLVEGYREMDAREIQKLDEEGLMEYVENNMWNPEYPTLVYKDD
+>XP_002177890.1 predicted protein [Phaeodactylum tricornutum CCAP 1055/1]
+MISSACRGSLKSLCSVQLRSNTRQHASLISCNNSPNVKRFSTAFSSQTDGEVLTVGGALSGEEPIHETGE
+WAGCKRSFMIPIRISVRGTDILLDPLYNKGTAFKTGERDRLRFRGMLPHRIMNIHLQKERFLQALRAEDS
+NIRKNVMLEDLHDRNETLYHRVLVDHIEEMAPYIYTPTVGQACMEFATRYRRPRGMYFTEEDRGHMAAMV
+YNWPHRDVHVICVTDGSRILGLGDLGANGMGIPIGKLALYCAAGGIAPHRVLPVVFDAGTNNEALLQDKY
+YLGVQRKRLKGAAYFRMMDEFMDAVRFRWPNVLVQFEDFSSEVAQTLLDRYRDDHLCFNDDIQGTGATTL
+AGVLGALRAKGEEVTSLGDQRIVIAGAGSAGIGIAQVLMQAMEEQGRTPEEAKNAFYILDQNGLLGTDRA
+NDLNAEQRVFVRSADNNLSLMDVVKKYKPTILLGVTTVGGLFTGDLIREMHSSCERPIIFPLSNPTNKAE
+CTAEQAYEWTNGQCIFASGSPFDTIEFEDGRVFYPSQCNNMYVFPGLGLGASVCGAQKVTDRMLYVAAET
+LANFVSKKDMEEGRLFPQLTRIREVSHRIAVAVVEEALREGLATKVKPADANDLDSFVGRKMYFPEYVPL
+VEKREISI
+>XP_002290550.1 NAD dependent malic enzyme [Thalassiosira pseudonana CCMP1335]
+MYLNFSIPSTEILNNPLFNKSTAFKGGERDRLRFRGLLPPRRLNMKVQKQRILEEIRAEDSMIRKNMILE
+EVHDRNETLYHRILVDHIEEMAPIIYTPTVGQACKEFGMRFRRPRGMYFCEEDRGHMAAMVYNWPQKDVH
+VIVVTDGSRILGLGDLGANGMGIPIGKLSLYCAAGGIAPHRVMPVVLDVGTNNEELIKDPFYLGMQRPRL
+QGTKYYHLVDEFIQAVRHRWPNVLIQFEDFSSDKAQKLLNKYRDEILCFNDDIQGTGATTLAGVLGGLRA
+KGEQPTSLGEQRILIAGAGSAGIGVAQVLMQAMMEHGRTEEEAKKCFYIADEKGLLGTDRIHELSPEQAM
+FARDEDGGLSLNEIVNKYKPTMILGMTAVGGLFTEQLIRNMAKHCERPIIFPLSNPTTKAECSAEQAFEW
+TDGKCIFASGSPFEPVEMNDGRKFYPTQCNNMFVFPGIGLGVTLCGARTVSDRMLYVAAEALANYVTEDE
+LAEGKVFPSINTIRDVSKKVAIAVIEEAISTGQASKLTEKDISDLDDFVSRKMYDPIYVPLIEKRTIEI
+
diff --git a/MS.txt b/MS.txt
new file mode 100644
index 0000000..ae63d4b
--- /dev/null
+++ b/MS.txt
@@ -0,0 +1,46 @@
+>jgi|Phatr2|54478|estExt_Phatr1_ua_kg.C_chr_80015
+MIEFRSEQVHVRVHAPANKAAEEMLTPDALRLLGLLCERFDVRRQALLAARKTHATSFDAGDVPHFLSAE
+DHPAQRDPHWRCAPVPDDVQDRRVEITGPVDRKMVINGLNSGACVYMADFEDSTSPTWFNVIDGQLNLRD
+AVRGTIAFTNAAGKVYTVQHASRPATLFVRPRGWHLDEAHVTVNGKVASGSLFDFAMYFFHNVHHLKEKG
+TGPYFYLPKLESHKEAALWNDVFVAAQQFMGVPIGTIRATVLLETITAAFEMEEILYELRDHSLGLNCGR
+WDYLFSFIKKFKHHTDKLTPDRNHLTMTTPLMEAYVKRLIYICHKRGTFAMGGMSASIPIKNDPAANDAA
+MQKVADDKLREVTAGHDGSWVAHPALVKVAKDVFDEHMLTPNQITSKPGYVGSSINEQDLLRLPPIPHGK
+AITSEGLARGVGIVLAYTEAWLRGIGCIPLHNAMEDAATAEISRAQIWQWRSQKASTQDDNRPITASRVA
+ALVQQEVDRQCNGVAGKSKGKWRLAGNLVENMLNKDELDDFLTSVCYPHIVTTAYDDGRIAKL*
+>AAP75564.1 malate synthase [Chlamydomonas reinhardtii]
+MSVQTIPGVAILGPVTAEQASILSPEAQLFVATLHRTFNPRRKELLKRRDERQKDLDAGRLPDFLPETAA
+VRADPGWKCAPPAPGLVDRRVEITGPVDRKMVINALNSGATQYMADFEDSHAPTWDGNLEGQVNMRDAVR
+RAISYTGPNGKVYSLRTDGKLATLLVRPRGWHLVEAHFMVDGEPCSASLFDFGLFFFHNAAATLAAGQGP
+YFYLPKMESHLEARLWNDVFNASQDMLRLPRGTVRATVLIETLLAAFEMEEILYELRDHSSGLNCGRWDY
+IFSFIKKLRNHPQFVLPDRSAVTMTSPFMDAYVRLLIKTCHKRGVHAMGGMAAQIPIKDDPAANAAALAK
+VRGDKEREVVAGHDGTWVAHPALVPIAMEIFNKHMPTPNQLHVRRDDVTVTAHNLLDVRGGALLAEGGIT
+EKGLRDNLSVGLAYMENWLRGVGCVPIHNLMEDAATAEISRSAVWQWVRHHARTRDGRVVTAAWVNDLLA
+QELDQLKSKMGAERFARSKYPLAAQLFQSTITGDAYSDFLTTLCYDHIVTKTPSRM
+>jgi|Thaps3|26293|estExt_fgenesh1_pm.C_chr_60036
+MEARSPTVQIEVHAPICAAASEILTPDSLRFVGYLCNKFEDRRQALLNARKSKAMEFDSGGLPHFEKSDG
+SGSGGDPHWRCASIPGDVMDRRVEITGPVDRKMVINGLNSGANVYMADFEDSTSPTWSNLTEGQRNLRDA
+TRGKITYTNKQTGRVYALKEKTAVLFVRPRGWHLDEAHVTVNGRVASGSLFDFALYFYHNVHCLLQKGSR
+PYFYLPKLEHYLEARLWNDVFKAAQSYFGVPYGTIRATVLLETITAAFQMEEILYELRDHSLGLNCGRWD
+YLFSYIKKFKCHDDKIAPDRSHLTMTDTPLLKSYVDRLIYICHKRGTFAMGGMAAQIPIKGDPAANEAAM
+ARIEKDKIREALAGHDGTWVAHPALVSLAKAVFDRYMPTPNQIDKNPGLTGKDVTEADLLRLQLVPKGTA
+ITSTGLQKGVSIVLAYTEAWLRGIGCIPLNHHMEDAATAEISRAQIWQWKYHGVKTEDDGIVISASRISK
+LVHDEVKRCSGGEDRGKWFLAGKLVEDMLTKDRLDDFLTTVCYPHILTTRYEGDVIPEDEPSSKL*
+>NP_001190219.1 malate synthase [Arabidopsis thaliana]
+MELETSVYRPNVAVYDSPDGVEVRGRYDQIFAKILTREALSFVAELQREFRGHVKYAMECRREARRRYNS
+GAVPGFDPSTKFIRDGDWSCASVPPAVADRRVEITGPVERKMIINALNSGAKVFMADFEDALSPSWENLM
+RGHVNLKDAVDGSITFHDKSRNRVYKLNDQTAKLFVRPRGWHLPEAHILIDGEPATGCLVDFGLYFFHNY
+AKFRQTQGSGFGPFFYLPKMEHSREAKIWNSVFERAEKMAGIERGSIRATVLIETLPAVFQMNEILYELR
+DHSVGLNCGRWDYIFSYVKTFQAHPDRLLPDRVLVGMGQHFMRSYSDLLIRTCHKRGVHAMGGMAAQIPI
+RDDPKANEMALDLVRKDKLREVRAGHDGTWAAHPGLIPICMEAFTGHMGKSPNQIKSVKREDAAAITEED
+LLQIPRGVRTLEGLRLNTRVGIQYLAAWLTGSGSVPLYNLMEDAATAEISRVQNWQWIRYGVELDGDGLG
+VRVSKELFGRVVEEEMERIEKEVGKDKFKNGMYKEACKMFTKQCTAPELDDFLTLAVYNHIVAHYPINVS
+RL
+>gi|ONM16505.1| malate synthase1 [Zea mays]
+MAASTAAPCYDAPEGVDVRGRYDREFAGILTRDALDFVAGLQREFRAAVRYAMEQRREAQRRYDAGELPR
+FDPATTLVREGDWTCAPVPPAVADRTVEITGPAEPRKMVINALNSGAKVFMADFEDALSPTWENLMHGQV
+NLRDAVAGTISFRDAARGRTYELNDRTAKLFVRPRGWHLPEAHILIDGEPAIGCLVDFGLYFFHNHAAFR
+AGQGAGFGPFFYLPKMEHSREARIWNGVFQRAEKAAGIELGSIRATVLVETLPAVFQMNEILHELREHSA
+GLNCGRWDYIFSYVKTFRAHPDRLLPDRALVGMAQHFMRSYSHLLIHTCHRRGVHAMGGMAAQIPIKDDA
+AANEAALELVRKDKLREVRAGHDGTWAAHPGLIPAIREVFEGHLGGRPNQIGDAAGHEGASVNEEDLIQP
+PRGARTVDGLRLNVRVGVQYLAAWLAGSGSVPLYNLMEDAATAEISRVQNWQWLRHGAALDAGGVEVRAT
+PELLARVVEEEMARVEAEVGPDRFRKGRYAEAGRIFSRQCTAPELDDFLTLDAYNLIVAHHPGASPCKL
diff --git a/OMT.txt b/OMT.txt
new file mode 100644
index 0000000..b15d999
--- /dev/null
+++ b/OMT.txt
@@ -0,0 +1,34 @@
+>jgi|Thaps3|20731|estExt_fgenesh1_pg.C_chr_10375|OMT1_TPS
MTSTTTSTSSVLTPPPLPKSVVFATSGLGGMLGWCVVHPANTIAVRMNLASMQGKKFSLNGMIKESGLMS
VYDGLGAGVWRQVFYASSRFGLFETCRDKLHDIRGKTDFAGRVAVGAVTGATAAAISCPMEVATVRMSND
ATLPLNERRNYKGVFDVVKRISTEEGVSALWRGVVPFAQRAALVGVFQVATLDQFKELYAHQFNQKKGSI
PNVFCSAMTSGLIYSIATMPLEASKNRMASQKADVVTGKLPYTSTLQTMKSVSANEGFLALYNGFVPYYI
RCGGHTVAMFIAVQLLRDQYNSMQH
+>jgi|Thaps3|26366|estExt_fgenesh1_pm.C_chr_100013|OMT2
+MPESKSFAQIAEPFVCGGSAATFASIVIHPMDLAKVRMQLYGQLNPGKPVPGFTTLLTNMVKNDGIASVY
+KGVDAAIGRQLVYGTARIGLHRAISDKMKEMNEGKPISFLMKTLSGMMSGSIAVCIGTPFDIALVRLQSD
+SMAPVGERKNYKNVFDALTRTVSEEGAGALYKGLVPNILRGMSMNVGMLACYDQAKETVGKLLNDPMVNG
+PALTTQVGASCVAGFTAALFSMPFDLIKSRLMAQKVDPVTNKLPYSGVMDCAMQVLKKEGPKGFYSGFSA
+YYGRCAPHAMIILLSIESITQGYRNVLGLQK*
+>jgi|Phatr2|8990|e_gw1.1.526.1|OMT1_PTRI
+MVIPFPSYGFQLATSPTAGNSVSTPTKAADLSKPIIFATSGLGGCLGWAFVHPANTLAVRMNLASMSGKP
+FSFPKMIQESGWMGLYDGISAGVLRQVFYATSRFGLFETFRDKLHEYRGKTDFGARIVVGATTGGIAAYL
+SCPMEVAVVRMSNDSTLPMEERRNYKNVFDTASRVIKEEGPLAFWRGSNPFVIRAMMVGVFQVATLDQFK
+DLYEHYLNQRRNSITNVFSAAMTSGLIYALATMPLEACKNRMASQKADKITGKLPYKTILQTLRKVSADE
+GFLALYNGFLPYYIRCGGHTVSMFIIVQILRDSYMQYAL*
+>jgi|Phatr2|16785|e_gw1.28.73.1|OMT2_PTRI
+MSLTNNNSHSLAKTLEPFVCGGSAATFASVIIHPIDLAKVRMQLYGQLNPGKPIPSFPSIIKSIVTRDGP
+LSVYKGVDAAIGRQMVYGTARIGLHRTFSDKLVELNDGKPISFLQKTLSGMLSGSIAVCIGTPFDIALVR
+LQSDGMAEPQDRRNYKNVFDALLRTSKEEGVGALYKGLLPNILRGMSMNVGMLACYDQAKEVVAALLNDP
+MTNGPSLPTRLGASATAGFTAALFSLPFDVMKSRLMAMKPNPLTGEMPYKGVVDCAVQMAKNEGPRSFFS
+GFSAYYGRCAPHAMIILLSIESITNLYRQTFS*
+>gb|CAA53720.1|Oxoglutarate/malate carrier protein|Caenorhabditis elegans
+MAEDKTKRLGRWYFGGVAGAMAACCTHPLDLLKVQLQTQQQGKLTIGQLSLKIYKNDGILAFYNGVSASV
+LRQLTYSTTRFGIYETVKKQLPQDQPLPFYQKALLAGFAGACGGMVGTPGDLVNVRMQNDSKLPLEQRRN
+YKHALDGLVRITREEGFMKMFNGATMATSRAILMTIGQLSFYDQIKQTLISSGVAEDNLQTHFASSISAA
+SVATVMTQPLDVMKTRMMNAAPGEFKGILDCFMFTAKLGPMGFFKGFIPAWARLAPHTVLTFIFFEQLRL
+KFGYAPPVKA
>gi|AFW56593.1|plastidic2-oxoglutarate/malate_transporter|Zea_mays
+MASSTAASPLTCHHLGSVGARPRLPSLSISLRRRSSSSSKPTSLSHSLPSKHSLAPPPAASASSRRGLTP
+VPASASAAAAPAPDPVPVPAPAPAPAPAPAAPPKKPALQGAAIKPLLASIATGVLIWLIPPPAGVPRNAW
+QLLAIFLSTIVGIITQPLPLGAVALLGLGAAVLSRTLTFAAAFSAFGDPIPWLIALAFFFARGFIKTGLG
+SRVAYAFVAAFGSSSLGLGYSLVFAEALLAPAIPSVSARAGGIFLPLVKSLCEACGSRAGDGTERRLGAW
+LMLTCFQTSVVSSAMFLTAMAANPLSANLTAATIGEGIGWTLWAKAAIVPGLLSLVLVPLILYVIYPPEV
+KASPDAPRLAKERLAKMGPMSKEETIMAGTLLLTVGLWIFGGMLNVDAVSAAILGLAVLLISGVVTWKEC
+LAESVAWDTLTWFAALIAMAGYLNKFGLISWFSETVVKFVGGLGMSWQLSFGVLVLLYFYSHYFFASGAA
+HIGAMFTAFLSVASALGTPSLFAAMVLSFLSNLMGGTTHYGIGSAPVFYGAGYVPLAQWWGYGFVISVVN
+IIIWLGVGGFWWKIIGLW
\ No newline at end of file
diff --git a/PEPC.txt b/PEPC.txt
new file mode 100644
index 0000000..b058965
--- /dev/null
+++ b/PEPC.txt
@@ -0,0 +1,97 @@
+>jgi|Phatr2|51136|estExt_fgenesh1_pm.C_chr_170007|PEPC1_PTRI
MIDAASKLTATEALGVTRVFSIMLNLVNAAEVQHRNRQIRAHESTKDPSGGPLPKTEDSIRGTMETLLES
KQATPEEIFAQLQKQKVEIVLTAHPTQVQRKSLLRKYRRVSEMLAYLERPDLDGFEKSSAQTSLQTILSS
IWGADEIRRQKPTPQQEAAGGNAILESVLWDAVPAYLRKLDQQCRLTLGQSLPVDVCPIKFASWIGGDRD
GNPNVTPEVTREVVLQQRLRAARLLLKDMYDLISELAISSRFSPAMDALADSVKDSQHKREKYRRVIGHL
IKRLVKTARECELELSKLNTSASMVSQTLVEEAVDGWQDVDALDDATDLIKPLRIMYDSLVETGFGLVAD
GLLVDIIRRLYVFGMSLVPLDIREESTKHTEALDAITRWLGIGSYSEWTEEARLSWLTSELSNKRPLYRI
RELPKLGFNDSVLKTLNVFGTIATLRPSCLGAYVISQAQTASDVLAVMLLQKQYGMTDKNRNMMRVVPLF
ETLNDLTNAPDKLEQLFSIPLYVGAVKGKQEVMVGYSDSAKDAGRLAACWAQYNSQERMVKVAAKHNIEL
TFFHGKGGTVGRGGNPSVYRAIMSHPPNTINGRFRVTEQGEMITQNFGAPSIAERTLDIYTAGVCREAFS
ERVEPSQAWRDQMQRISDVSCAEYRHLVREEPRFVPYFRQATPELELGSLNIGSRPAKRNPKGGIESLRA
IPWTFAWTQTRTHLSAWLGVGAGLTTTDQSELKTLRAMYIEWPWFRETIDLIAMIVSKTDFSISKNYDDQ
LVEKKEGLLKLGDEVREKMVQTRQAVLDVTESTDVAGAHVALMRGSSTIRHPYVDPVNVIQAELLKRLRV
MDKKKSLLADEMEEQEILKDALIISINGIAQGMRNSG
>jgi|Phatr2|27976|estExt_Genewise1.C_chr_100146|PEPC2_PTRI
MLSSSCRRSFLAAKTRLRSCVTTSLSTGCPWSAISSGSTSRHIDRFFSTHSSFDEPNPSLFGASPLQAST
VSSDATSIPSNEADRDIQLRADIKVMGSLLGRIIQTHEGAEVLEKVETMRGLAKTWRDQGAGRDPSTKQA
ADQTFQNLAAYAKSFTDAELFTVSRAFTHFLAIANAAESHHRGRRLKQSRLLSDESSGALYPKPDSVGGV
LPSLLAQGHDADAIYDALTSQTTELVLTAHPTEVNRRTILNKKRRIQRILTMADQQRQLGASSVFEQAEL
NDALYREISSIWLSDEVSRIKPSPETEAEKGTLVLETVLWEAVPTFLRKLDATTREFLGKPLPLDSSPIR
FASWMGGDRDGNPNVKPDTTRQVCLRNRQKAATLFARNLRTLEAELSLTTCSREVREVVGAAREPYRIFL
QPMIRKMEATTDWAAQELAILQKRRSGDKSASGIASVASTNVEGIYLDQEEFRAELLTIYRSLQETGNEV
AASGILTDIIRNLSSFGLTLIPLDVRQESDRHEEALDAITRYLGLGSYIQWDEQTRVSWLTTQISSKRPL
LRAGVWYEHPDYFSPTAIDTLEISRMIAEQHEGSLGAYVISQATSASDVLAVLLLQLDAGVKKPLRVAPL
FETLDDLNGAADTMRQLFSLPAYMGTIGGKQEVMIGYSDSAKDAGRMAATWAQYETQETLAKLAKEFGVD
MTFFHGKGGTVGRGGNPQTFTAIMAHAPKTINGHFRVTEQGEMISQNFGYADRAERTMDIYTAAVLAEKL
SERPKVKDEWRSMMKILSDISCEAYRQVVRKDERFVPYFRSATPELELSNLNIGSRPAKRKATGGVESLR
AIPWNFAWTQTRFNLPTWLGVGDAIGQLLKSDRAPLLRELYREARAFQTMVDLVEMVLAKSEPAIAAHYD
SVLVKDPKAKELGKEVRQLHMATEEAILDLTEHKKLGENNAVLQRALVVRNPYVDCLNILQVETLDRLRQ
VEEGKEDKVLKDALLTTITGVANGMGNTG
+>jgi|Thaps3|268546|estExt_thaps1_ua_kg.C_chr_30296|PEPC1_TPS
+MGTLLGDAISTHHGRDVLEKVEALRTMAKESRRSGDSSSERLQSMVDFVSGLSATELVVVSRAFAHFLGV
+ANAAEAHQRCRRLKLDLEREVSGEDVKGLLVEGAPTPEEVFKSLTSQTVEIVLTAHPTQVNRRTLLEKHG
+RVQKILNDADGLRESGTPYQRKLLDDALRREIASIWQTDEVSRVKPTPQSEAERGTLVVETVLWEALPSF
+LRKLDATMKWGLGEKYGLPLTASPFKFASWMGGDRDGNPNVTPDVTREVCLTNRIKAAQLLEGDVRELMG
+VGTESEAMQRVRERSGDSRAPYRAYLNPVATKLANTATWAQQELRKSTTSFAPDEVYLHKDELMDELLTV
+HQSLCDSGNTVVANGRLADIIRKLSSFGLTLVPLDIRQESDRHEEALNCITKYLGLGSYSQWDEGTRVSW
+ITKQLQSKQPLIRDGAWNQPGNEQFFTPTSIDTLETFKMISDMHEESLGAYVISQCTSVSDILAVLLLQL
+DAGVNKTLRVVPLFETLDDLNGAAATMEHLFSIPAYVGSLEGRKQEIMVGYSDSAKDAGRLAASWAQYET
+QVTLSEVAKKHSVDVVYFHGKGGTVGRGGNPNTFEAILSHAPGTINGQFRVTEQGEMINQNFGFSDRAER
+TLDIYTAALLAEQNTDRPLPTKEWKDMMDKLSQISCDAYRKIVRGDERFVPYFRAATPELELSNLNIGSR
+PAKRKASGGVESLRAIPWIFAWTQTRLNLPTWLGVGEAINEVLSSPDEQTLRTMYKEWGSFRTTIDLVEM
+TLSKSDSSIARHYENVLVRDPAAVALGGEIRNIHDATERAVMDLTGHKTLSEHDILLQRLMAVRNPYVDC
+LNVLQAETLKRLRESEGSSEEEVLKDALLTTITGVANGMGNTG*
+>jgi|Thaps3|34543|e_gw1.5.29.1|PEPC2_TPS
+MDRAANPDDTAPFEEMKKLAYDINPRDTLGVMKTFSIALNLVNAAEVHHRIRLVRVSELKDDVNHIGPLP
+MVEDSIRGTMEILLEGDCDDKDKLFERLTTQKCEIVLTAHPTEVNRKTIISKYRKISELLAYMERPDLHP
+FERAEAVNNLRGIISAIWGADEIRRVKPTVQKEAAGGCAVIESVLWDAVPSYLRKLDAQCRVTLGKKFPV
+DATPIKFASWIGGDRDGNPNCTPEVTLEVVTRQRLRAAKMFLNDLNMLYSELAISSRFSKELEALAASVK
+KSDDNREKYRRVIGHLRRRLVRTVKECEAKLHTLTDTSEDVEPIIKSEELMTPLRIMYDSLVETGFELVA
+DGHVSDIIRRVAVFGMTLVPLDIREESTRHTIAIDAITRHLGIGSYKEWDEEARLNWLQSELNNKRPLFR
+IRDIEDNLLGLDPDNRKTLMVFKVASELDSESLGAYVISQANTASDVLLVMLLQKQFGMTEKNGKLMRVV
+PLFETLTDLTNSPAQLERLFSITNYLGAINGKQEVMVGYSDSAKDAGRLAACWAQYTAQEAMANVADRYG
+VELTFFHGKGGTVGRGGNPALYRAILSHPPNTINGRFRVTEQGEMIRQNFGSLEIAERTLDIYTAALLRE
+SFTKRVEPKQEWRDQMERVSEVSCAAYRHTVRDDPRFVPYFRQATPELELGRLNIGSRPAKRNPKGGVDS
+LRAIPWTFAWAQTRMHLSAWLGVGDGLRSDVSDQCRYMKTLQEMYEQWPWFREIISLISMLVSKTDFSIT
+KNYDDLLVDSNLRSLGDEVRNKLVETRQAVIDVSGATDISGPHVQLMRASSTIRNPYVDSINVVQAEILK
+VLR
+>gb|Q6R2V6.1|Chlamydomonas_reinhardtii|Phosphoenolpyruvate_carboxylase
+MTDSTYDFGAVRDDLTPLEDDCKLLGSLLDDCLRVEIGETMFKKIERIRALAQCASNLSIKGDAGASDML
+SHRLAEELMNLDMDEAVPLTRACGHYLNLSGIAELHHGVRRDRATREPNPNSCDAVFARLITEGVDPEEL
+YRAVSEQNVEVVLTAHPTQVNRRTLQYKHTRIAALLQQHDRSDLTAEERRNMVSELQREVAALWQTDELR
+RQKPTPLDEARGGLHIVEQSLWAAVPQYMRRLSAALKKHTGHDLPLQATPFRFGSWMGGDRDGNPNVTAK
+VTAHVTALARWMAADLYLREIDTLRFELSMNQCSAAVWKMARRIIAEGHTKRAGVVRAKAAAALHQTATD
+AASHGGSAASAAAAAAAGGDVVADGTSGGGAAAAAGPAAAAAADDAFTFSRLGRPRPERPSTDVRSVGVL
+AGGEGAAFPGGMILGTQPVSAHTAAEVSVPHELPGQDVEGGSEMDFNESRRASDAGDLGASQHPMLGGPS
+AGASAEPTAHGYTTTATAAAAAADGTQPEPEVPGTPSYADPGTPDRLGALPGPFTPGPTPFREAANAAMS
+TAASGGAGGGGGGGANRAASGLGGDPTFTRRSLMAQRLGTSSVQFARAHEHPGFHPYRIVLGHVRDRLAA
+TRRRMEDLLSGREPAGEAHGGVGAGGGGGGGAAPWYESEDELAEPLMACYWSLWECGGGVIADGRLLDLI
+RRVYTFGMCLMKLDLRQESTRHAEALDAVTSYLGLGSYLEWSEDQKIEWLTKELQGRRPLIPADMPMSAE
+VREVLDTFKVAAHLGRDNLGAYVISMTKGASDVMAVELLQREARMQVGAEAGGRGGGGPEDGGSLRVVPL
+FETLEDLDAAEDVMTRLLTNPWYREHLRAVHGDAQEVMLGYSDSGKDAGRLAANWALYKCQERLVAITKA
+NNVKLTLFHGRGGTVGRGGGPTHIAIQSQPPGSVEGTFRITEQGEMVQAKFGISGVALSQLETYTTAVLL
+ATMRPPSPPRREEWRAVMEMLSRVSCESYRNIVHHSPLFLRYFKHATPEAELGNLYIGSRPARRRNKDAS
+ISTLRAIPWIFAWTQNRLILPSWLGIGAALTAAMTQGHLPTLQAMYREWPFFGSTVDLIEMILAKTDPRI
+AALYEEVLVNDPEEKKLGAELRERLQRCQGAILKVTGHENLLSNNPTLSKLISMRSPFVDPINILQVEVL
+RRLRQDPNNMRLRDALLISINGIAAGMRNTG
+>gb|XP_003055786.1|predicted_protein|Micromonas_pusilla_CCMP1545
+MPTAMNLLKDAKFKELVADSHYLSGSGETHDFDHHEVLQESEDLMRALFFSIVRETTGTEFDDSLEAVYA
+LSEQFHKSNDPADFAALTTKLGSLSDEETVMLASAFSNVLNLHNVSEHVAAAMEERHARLDDIPRGPAKT
+TNGAIKGLIANGVSKETIYEALAEQEVDLVFTAHPTQALRRSMLKNFARIRQCLLDLQARRLSGYERAEI
+LASMSSAIQAAWRTDEIRRNPPKPQDEMRAGLSYFNDTLFEGLPKFVRRIDTALINQGLPRIPLDKSIMK
+FSSWMGGDRDGNPNVDSHCTKDAVYLARSKAADLYFDAIQNLIFSLSMWRCSESFKARAAARHAFVLSQQ
+DDGALYAERKRRNYVDFWHALPLSQPYRVVLSEVRDRLYNTKEAIKDVIAGRVDALNPDDASIFTSKEQL
+LEPLLACYNSLIDVGDKSVADGYLLDLIRQVNCFGLSLVRLDIRQESDRHADAMDAITKHIGLGSYNEWD
+EEKKCAFLVAELEGKRPLVPRDLECTAEVQEVIDTFHMAGHLQRVCPGSLGTYVISMATVASDVLAVVLL
+QRECGGHEDRLLRVAPLFERLDDLRDGPAQLRRLFSVPWYHKHIDGFQEVMIGYSDSGKDAGRMAAAWAL
+YEGQENATHVGNEFGVKLTLFHGRGGTVGRGGGPSHLAIMSQPPATINGRLRVTVQGEVIEQNFGEHENC
+FHTLDLYTAATLEHSLKPPTSPQTEWRDVMNVMSEESCEKYRKVVFETPEFIRYFAQATPAQELGSLNIG
+SRPAKRKANPTVTALRAIPWIFAWTQSRFHLPVWLGMAEGFQKLKDDGKLPMLRAMYKDWPFFRVTMDLI
+EMVLAKADFNVAEYYEKVLVEPGLHAFGATLRQQLVNTVKIVLEITEHPDLLTPQSDSKGQSSSTFLAEK
+LSMRSTYITPLNIIQVENLKRLRAIESGEVSEEFMAKYAPSMPWSKEMLSLHGKNNWYHATVSDTLIITM
+KGIAAGMQNTG
+>gb|XP_001420862.1|predicted_protein|Ostreococcus_lucimarinus_CCE9901
+MLKAVFSGKDKAKITSHKRTGSLFASEEGEALDALARSSSYLSGRGETKEFNAHDVIEECDELLRTIFFA
+VVRETAGDKFLGQLKSVYEASEKFGSSHDPKDFDAMQAMLETMEVDESLQFASAYSNLLNLHNISEQVAN
+AMEERHRRLDDIPRGPAKTTNGAIKGLLRAGKSTEEIYSALAVQHVDLVLTAHPTQALRRSMLKSFGIIR
+EKLLQLQRFRLSRYERAEVLDEIRSKVASAWRTDEIRRTPPKPQDEMRAGLTYFQQTIWDGIPTFMRRVD
+TSLLANGCPRLPLDRSIVTFGSWMGGDRDGNPYVTASCTRDVVLLARVQGVNLLFRAIQRLIFDLSMWRC
+NDAVKALAKDILENSETDNFTIFEERKKRNYDDFWKAIPEHEPYRVILAELRDKLYNTREALQRCIADND
+VNIDMNDETIIRSKDELFAPLVVCYESLIEVGDAQIANAYLLDVIRQVQCFGLGLVKLDIRQESDRHAEA
+LDAVTRYIGLGSYLEWSEEQKIEFLTRELESKRPLLPSDLECSDDVREVLDTCKMIAHLQQTCPGALGTY
+VISMATSASDVLAVVLLQRECGCRKQDLLRVAPLFERLDDLNDAPRVLRQLFSVKWYHDHIAGFQEVMIG
+YSDSGKDAGRMAAAWALYDGQERVVAAGKEFDVALTLFHGRGGTVGRGGGPAHIAMLSQPPGTVNGSIRV
+TVQGEVIETDFGEKENCFHTLDLYTASVLEHTLKPPAHPRDEWRRVMDRMSEYSCAHYRKTVFETPDFVG
+YFAQATPGAELGSLNIGSRPAKRKPSAGVTALRAIPWIFAWTQSRFHLPVWLGISTSFRRLIDEGELETL
+RDMYKSWPFFEVTIDLVEMVLAKADPVVVAYYERALVDPKLHDFGASLRGELQESIDCILAVSEHIGLLA
+KPEKVEANEAVQVHKKLAHKLHKRSLYITPLNVCQVRYLIAARALENEEDGDKLSMQKVKITLLEGYPFQ
+DYNYKGAVNDVLKITMKGIAAGMQNTG
+>gb|EKX31868.1|hypothetical_protein|GUITHDRAFT_121941|Guillardia_theta1_CCMP2712
+MQPLLQDTIVLRGLFFDALRHGMKGSSEKKRFMNQPSCHIEEHGYNTRNIDFNAASTNLECIEAFSENVK
+KILSLSEKYAEQHTSKTLQDIVDVVERSSIEEVKIVARIFAVLLSQINIAERHHRYRRWSMYKRREIAIL
+HFSDGQHHQADDCFKMLKENGFTPQKIHDSLCKQNLELVFTAHPTQSERRSILKKFAALDAALEALDLHG
+ESQTPLQKELIFMRLQQTLLAIWRTNNMRTIKPSPEDEARYGLSVVEETLWDAVPQHYRIVDDSLRRLGQ
+PPLPLNCNLITLGSWMGGDRDGNPYVTHDITKKIIYLSMMRACRLYYNEVEKLLWALSMTGEPSSEVLEW
+LAEHQNDYHNETVEVEGRGGHQTKNWDFYRSEQTVEEPYRQVLVIVREMLERTIIRAEALSHGREPPPVD
+GKCFRTTEELMKPFALIYDSLEQSGDHLVCHGKLKDLIRRIRTFGLYLVKLDIRQESSKHEEVMDAITSH
+LGLGQYSTWSEETKIQWLTNELLSKRPLLGSQDFNCSPMVREVLDTFKVIMSCNAEPFGAYVISMTQSAS
+DVLEVHLLQKEAGCRQHLRVAPLFETKEDLINAPKALLALYKNEWYRNHFDTVNTKYQEVMLGYSDSAKD
+AGRLTSVWELYKAQESLVQISAEHKIPLNLFHGRGGSVGRGGGPQYLAILSQPAGSINGSLRVTIQGEVI
+ENYFGSHRSCELTFERYTTAILKATLTPPAPPSDLFRDVMQRMSETSCAAYKKIVYDTPGFVDYFRAITP
+EQELKTLNFGSRPSKRAKGGIETLRAIPWMFAWTQMRLHMPVWFGVGSAFKSEIDAGNLDTLREMYAKWP
+FFQSTVELVEAVLSKVDVEITRLYEKMLVPADVLYIGEMIYKELDMSIECVKMITGRENLLSNNPIIKRL
+YDIRRPMTDPLNILQAKVLRDMRMHENPPQELQESFAATVQGIAAGMGWTG
+
+
diff --git a/PEPCK.txt b/PEPCK.txt
new file mode 100644
index 0000000..e567884
--- /dev/null
+++ b/PEPCK.txt
@@ -0,0 +1,48 @@
+>jgi|Phatr2|23074|estExt_gwp_gw1.C_chr_210075|PEPCK_PTRI
+MLLTTGAARVFLRSAAVSKSAVKTFAARAVLGSGRLSSPSSLYHGCSVQTFTSLPNDASTSCKEGREAYN
+VSQTHKGTDACLKVGIDKLGITGPSTIYRNLNYDEIFEHEVKNGEGVVAKAEYGDTFCVDTGKFTGRSPK
+DKWIVLNKGSETEANIDWNSINQATKPEVFDELYDKAVDYFNQRESCYVADVYCGANPSTRKKIRFLFDK
+AWQQHFVTNMFIRPSDEAELDGFDPDFTVINCCAQVDDDWERHGLHSDTAVVFNIEKKTAVVFGTWYGGE
+NKKGIFSLMNYWLPMQGHLPMHCSANVGKEGDVALFFGLSGTGKTTLSADPHRALIGDDEHGWDHDGIFN
+FEGGCYAKTINLSEATEPDIYRAIHKDALLENVAIRDDGTPDYSNVSKTENGRVSYPIFNIPGYHKEQMA
+GHPSNIIFLSCDAFGVMPPVARLSSGQAMYHFLSGYTAKVAGTERGITEPSATFSTCFGAAFMTMHPTVY
+ADLLQEKLDKHGSHAYLVNSGWSGGAYGTGKRMSIKTTRTCIDAILDGSIHDAEFQVDPIFGYEVPKSLP
+GLDDLLLDPKSTWDNQDAYDETAAKLAKMYSDNFKQYEGKGSIDYTKFGPKI*
>jgi|Thaps3||gi|18203370|sp|Q9PP01|PEPCK_TPS
+MKKFDKLGLDNIKEIFHNLSYDELNAHEKANNEGLSTDNDTFCVDTGIFTGRSPKDKYFVKQDPSSKYIA
+WGKVNQPITKELFDKLLTKAKQELSGKKIYVQDVFCGASLQSRKAVRFVTEIAWQAHFVKNMFIRPSQEE
+LENFKADFIVYNACKCINEDYKQDGLNSEVFVIFNVEENIAVIGGTWYGGEMKKGIFSMMNYWLPLENKL
+SMHCSANVGEKDDVALFFGLSGTGKTTLSTDPKRRLIGDDEHGWDDEGVFNFEGGCYAKTINLDPEHEPE
+IYGAIKRNALLENVVLRADKSVDYADASKTENTRVSYPIEHIENHEPSLKAGHPKNIIFLSADAFGILPP
+VSKLSKEQAMYYFLSGYTAKVAGTERGITEPQATFSACFGEPFMPLHPTVYARLLGEKIEKHEVNVYLVN
+TGWSGGSYGVGKRMSIKATRACINAILDGSITKCEFENFEVFDLAIPKALEGVESVLLNPINTWLDKNAY
+IATRDKLAHMFIQNFKRYEDVKEGIEFSKFGPKI
+>gb|EKX34557.1|phosphoenolpyruvate_carboxykinase|Guillardia_theta_CCMP2712
+MASSTSNDFTINTKNMFNVRQESCDNVVEQTLKLATYTTSKLVEPAVPFNFLDHPGCHRNMSFEDVRAKI
+LARGEGSLVQESEALLVDTGKFTGRCPKDRYIVNAGEAASKVGWGDINRGITEEVFDKVMEGSAKRLCGL
+EEVFVFDGFVGASRSSRKAVRVVTELAWHHHFCTNMFIRPTEEELASFKPDITILNSRYLFEGWKEAGLR
+SETCVALDLNRGLSVITGTEYSGEMKKGAFSMMNYYLPLQNIMSMHCSATVGKGGDTAIFFGLSGTGKTT
+LSADTTRFLIGDDEHGWDEEGIFNLEGGCYAKMIDLDPSKEPLIHAAIKENAILENIVLDQHGRPDYKDI
+SKTENTRGSYPMWHIPNYQPSGTAPPPRNVIFLTCDAFGVLPPVSRLSTEQALYHLVCGYTSKVAGTEMG
+ITEPTPTFSICFGGAFMPLPARVYAQLFRSKIEQHGCQVFLVNTGWSGGSYGTGRRMDINTTRAIVAAIL
+DGSIEEATFSSPDPCFQLSVPLALPGVDAHVLNPRNTWASQEEYEATSRKLMGMYQANWQQFASDPFMAQ
+LSRFGPGGDRST
+>gb|XP_001694964.1|phosphoenolpyruvate_carboxykinase,splice_variant|Chlamydomonas_reinhardtii
+MALLSSRSSANCTGRSVRRATVAPAPVVKPHSSVAMRFTNNTKQDAAVPAPCDMQFVLDSKFTRESGLQP
+RVVFRNLTTPQLYEMALAHEPGTHITSSGALATLSGEKTGRSPKDKRVVRDPETEKDLWWGPYSPNYVMD
+DRTFLTNRERAIDYLNTLDRVYVVDAFVNWDPESRLKVRVVTSRAYHALFMSNMLIKPTEEELKTFGEPD
+FVIYNAGAFPANKYTQFMTSQTSIDLSLKHKEMVILGTMYAGEMKKGVFTLMHYLMPMQGKLSLHSGCNV
+GADDDVTMFFGLSGTGKTTLSADPKRPLIGDDEHVWSDKGVFNIEGGCYAKCIGLKATTEPEIWNAIKFG
+TVLENVDYNPVTREVDYESERLTENTRASYPIEFMNNARIPCVGPHPKNVVLLACDAFGALPPVSRLTLE
+QAMYHFISGYTAKVAGTEMGVTEPTATFSACFGSAFLMLHPYKYATMLAEKMKAHGTTAWLINTGWTGGK
+YGVGKRISLKHTRAIIDAIHSGELDKAEYVTTPIFGLQVPKAISGVPAEILSPENVWPNKDEFAMCLNSL
+GHMFIRNFEHFNDGEQFVGKDTAARILTGGPQPIAKEDVEKKGFGAFKTQ
+>gb|AAC98698.1|phosphoenolpyruvate_carboxykinase|Rattus_norvegicus
+MPPQLHNGLDFSAKVIQGSLDSLPQEVRKFVEGNAQLCQPEYIHICDGSEEEYGRLLAHMQEEGVIRKLK
+KYDNCWLALTDPRDVARIESKTVIITQEQRDTVPIPKSGQSQLGRWMSEEDFEKAFNARFPGCMKGRTMY
+VIPFSMGPLGSPLAKIGIELTDSPYVVASMRIMTRMGTSVLEALGDGEFIKCLHSVGCPLPLKKPLVNNW
+ACNPELTLIAHLPDRREIISFGSGYGGNSLLGKKCFALRIASRLAKEEGWLAEHMLILGITNPEGKKKYL
+AAAFPSACGKTNLAMMNPTLPGWKVECVGDDIAWMKFDAQGNLRAINPENGFFGVAPGTSVKTNPNAIKT
+IQKNTIFTNVAETSDGGVYWEGIDEPLAPGVTITSWKNKEWRPQDEEPCAHPNSRFCTPASQCPIIDPAW
+ESPEGVPIEGIIFGGRRPAGVPLVYEALSWQHGVFVGAAMRSEATAAAEHKGKVIMHDPFAMRPFFGYNF
+GKYLAHWLSMAHRPAAKLPKIFHVNWFRKDKNGKFLWPGFGENSRVLEWMFGRIEGEDSAKLTPIGYVPK
+EDALNLKGLGDVNVEELFGISKEFWEKEVEEIDKYLEDQVNADLPYEIERELRALKQRISQM
diff --git a/PEP_HMM_1.py b/PEP_HMM_1.py
new file mode 100644
index 0000000..591d8f0
--- /dev/null
+++ b/PEP_HMM_1.py
@@ -0,0 +1,95 @@
+#!/usr/bin/python
+#MMETSP_sample_import.py
+#inputs: (1) taxonomic classification of interest, (2) mmetsp taxa file with path
+#outputs: rips and writes peptide fasta files and count data for each taxa
+
+#import libraries
+import sys
+from ftplib import FTP #import the ftp library
+import re
+import os
+import numpy as np
+
+######Get arguments from command line########
+t=sys.argv[1] #full or partial taxa name
+need='mmetsp_taxonomy.txt' #mmetsp taxa file with path
+
+####Build files for HMM##########
+#you only need to run this once
+os.system('./mafft_hmmbuild.sh')
+
+###########RETRIEVE NAMES!!##########
+mt=open('mmetsp_taxonomy.txt','r')
+g=[] #make an empty list to store genus names
+for line in mt:
+ if re.search(t,line): #if taxa name in line
+ g= g+line.split('\t')[7:8]#pull out the 8th field should be genus, keeping as list
+
+g=set(g) #keep only unique genus names
+print g
+#close the taxonomy file
+mt.close()
+
+ftp= FTP('ftp.imicrobe.us') #set home ftp server
+ftp.login() #log in
+ftp.cwd('camera/combined_assemblies') #ch
+
+files=ftp.nlst() #make a list of all files and directories in wd
+delimiter=' '
+all=delimiter.join(files)
+
+names=[]
+
+for genus in g:
+ string= genus+"\S*.pep.fa.gz"
+ taxafiles=re.findall(string, all)
+ print "{} files matching genus=".format(len(taxafiles))+genus
+ print taxafiles
+ if len(taxafiles) > 0:
+ for filex in taxafiles:
+ command = "RETR "+filex
+ outfile = filex
+ #ftp.retrbinary(command, open(outfile, 'wb').write)
+ names.append(outfile)
+
+ftp.quit()
+
+print 'Part 1'
+
+###########RETRIEVE PEP.FA##########
+os.system('python ./MMETSP_sample_import.py {} {}'.format(t,need))
+
+###########RETREIVE COUNTS##########
+names2=[i.split('.')[0] for i in names] #removes .pep.fa.gz from the names
+
+ftp= FTP('ftp.imicrobe.us') #set ftp server
+ftp.login() #log in
+ftp.cwd('camera/combined_assemblies') #change directory
+
+#location for files
+t=os.getcwd()
+for ID in names2:
+ #change to taxa directory/readcounts
+ ripdir= ID+"/readcounts"
+ ftp.cwd(ripdir) #change directory
+ savefile= ID+"_cds_counts.txt" #saves files with unique names
+ ftp.retrbinary('RETR cds.dat', open(savefile, 'wb').write)
+ ftp.cwd("~/camera/combined_assemblies") #change directory to restart loop in right place
+
+ftp.quit()
+
+print 'Part 2'
+
+#if t=='Dinophyceae':
+ #names.remove('Durinskia-baltica-CSIRO_CS-38.pep.fa.gz')
+ #names.remove('Oxyrrhis-marina-CCMP1795.pep.fa.gz')
+ #names.remove('Alexandrium-fundyense-CCMP1719.pep.fa.gz')
+
+print names
+print 'Part 2B'
+
+#########RUN HMM #########
+for i in names:
+ os.system('./MAGIC_HMM.sh {}'.format(i))
+
+print 'COMPLETE'
\ No newline at end of file
diff --git a/PEP_HMM_PART2.ipynb b/PEP_HMM_PART2.ipynb
new file mode 100644
index 0000000..827603f
--- /dev/null
+++ b/PEP_HMM_PART2.ipynb
@@ -0,0 +1,869 @@
+{
+ "cells": [
+ {
+ "cell_type": "code",
+ "execution_count": 40,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "import sys\n",
+ "from ftplib import FTP #import the ftp library\n",
+ "import re \n",
+ "import os\n",
+ "import pandas as pd\n",
+ "import numpy as np"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 57,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "#t='Dinophyceae'\n",
+ "#t='Bacillariophyta'\n",
+ "#t='Haptophyta'\n",
+ "t='Raphidophyceae'"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 58,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['Chattonella-subsalsa-CCMP2191.pep.fa.gz', 'Heterosigma-akashiwo-CCMP2393.pep.fa.gz', 'Heterosigma-akashiwo-CCMP3107.pep.fa.gz', 'Heterosigma-akashiwo-CCMP452.pep.fa.gz', 'Heterosigma-akashiwo-NB.pep.fa.gz']\n"
+ ]
+ }
+ ],
+ "source": [
+ "##retrieve files\n",
+ "mt=open('mmetsp_taxonomy.txt','r')\n",
+ "g=[] #make an empty list to store genus names\n",
+ "for line in mt:\n",
+ " if re.search(t,line): #if taxa name in line\n",
+ " g= g+line.split('\\t')[7:8]#pull out the 8th field should be genus, keeping as list\n",
+ "\n",
+ "g=set(g) #keep only unique genus names\n",
+ "#print g\n",
+ "#close the taxonomy file\n",
+ "mt.close()\n",
+ "\n",
+ "ftp= FTP('ftp.imicrobe.us') #set home ftp server\n",
+ "ftp.login() #log in\n",
+ "ftp.cwd('camera/combined_assemblies') #change directory\n",
+ "\n",
+ "files=ftp.nlst() #make a list of all files and directories in wd\n",
+ "delimiter=' '\n",
+ "all=delimiter.join(files)\n",
+ "\n",
+ "names=[]\n",
+ "\n",
+ "for genus in g:\n",
+ " string= genus+\"\\S*.pep.fa.gz\"\n",
+ " taxafiles=re.findall(string, all)\n",
+ " #print \"{} files matching genus=\".format(len(taxafiles))+genus\n",
+ " #print taxafiles\n",
+ " if len(taxafiles) > 0:\n",
+ " for filex in taxafiles:\n",
+ " command = \"RETR \"+filex\n",
+ " outfile = filex\n",
+ " #ftp.retrbinary(command, open(outfile, 'wb').write)\n",
+ " names.append(outfile)\n",
+ " \n",
+ "ftp.quit()\n",
+ "\n",
+ "if t=='Dinophyceae':\n",
+ " names.remove('Durinskia-baltica-CSIRO_CS-38.pep.fa.gz')\n",
+ " names.remove('Oxyrrhis-marina-CCMP1795.pep.fa.gz')\n",
+ " names.remove('Alexandrium-fundyense-CCMP1719.pep.fa.gz')\n",
+ "\n",
+ "print names"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 4,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "['Karenia-brevis-CCMP2229.pep.fa.gz', 'Karenia-brevis-SP1.pep.fa.gz', 'Karenia-brevis-SP3.pep.fa.gz']\n"
+ ]
+ }
+ ],
+ "source": [
+ "#names=names[:3]\n",
+ "#print names"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 59,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "def readHMM(Organism,gene_name):\n",
+ " \"\"\" Takes in organism and gene_name from HMM results and makes a table.\n",
+ " HMM results from --tblout that have the following name organism_genename_HMM.csv\n",
+ " Note: pep.fa files differ in structure and it can affect how the HMM output is written. If you can't read the file in \n",
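+ " Note: pep.fa files differ in structure, which can affect how the HMM output is written. If a file cannot be read in,\n",
+ " modify the function\"\"\"\n",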
+ " hold=pd.read_csv('{}_{}_HMM.csv'.format(Organism,gene_name),delim_whitespace=True,engine='python',skiprows=2,skipfooter=10)\n",
+ " #empty files have 12 rows so the following if statement will only work on files that are not empty\n",
+ " \n",
+ " if hold.shape[0]!=0:\n",
+ " readX=pd.read_csv('{}_{}_HMM.csv'.format(Organism,gene_name),delim_whitespace=True,engine='python',skiprows=3,\n",
+ " header=None ,skipfooter=10)\n",
+ " \n",
+ " samplenames=[]\n",
+ " for i in np.arange(1,readX.shape[1]+1):\n",
+ " samplenames.append(str(i))\n",
+ "\n",
+ " readX.columns=samplenames\n",
+ " \n",
+ " new=pd.DataFrame()\n",
+ " new['CAMPEPid']=readX['1']\n",
+ " new['contig']=readX['19']\n",
+ " new['Evalue']=readX['5']\n",
+ " new['Annotation']='{}'.format(gene_name)\n",
+ "\n",
+ " new.contig=new.contig.str.split(\"|\").str[1]\n",
+ " return new\n",
+ " #return readX"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 60,
+ "metadata": {
+ "collapsed": true
+ },
+ "outputs": [],
+ "source": [
+ "##MAKEHMM\n",
+ "def makeHMM(Organism,gene_type):\n",
+ " \"\"\"Combines the HMM hit tables of every gene in a gene_type into one table\"\"\"\n",
+ " if gene_type=='CF':\n",
+ " a=readHMM(Organism,'PK')\n",
+ " b=readHMM(Organism,'PEPC')\n",
+ " c=readHMM(Organism,'PEPCK')\n",
+ " d=readHMM(Organism,'PK')\n",
+ " e=readHMM(Organism,'MDH')\n",
+ " f=readHMM(Organism,'OMT')\n",
+ " g=readHMM(Organism,'PYC')\n",
+ " h=readHMM(Organism,'PPDK')\n",
+ " i=readHMM(Organism,'ME')\n",
+ " \n",
+ " frames = [a,b,c,d,e,f,g,i,h]\n",
+ " result = pd.concat(frames)\n",
+ " return result\n",
+ " if gene_type=='PR':\n",
+ " a=readHMM(Organism,'SHMT')\n",
+ " b=readHMM(Organism,'GOX')\n",
+ " c=readHMM(Organism,'GDCT')\n",
+ " d=readHMM(Organism,'PGP')\n",
+ " e=readHMM(Organism,'ICL')\n",
+ " f=readHMM(Organism,'GCL')\n",
+ " g=readHMM(Organism,'HR')\n",
+ " h=readHMM(Organism,'SPT')\n",
+ " i=readHMM(Organism,'TSR')\n",
+ " j=readHMM(Organism,'MS')\n",
+ " k=readHMM(Organism,'GlcDH')\n",
+ " l=readHMM(Organism,'ALAT_GGAT')\n",
+ " m=readHMM(Organism,'GK')\n",
+ " \n",
+ " frames = [a,b,c,d,e,f,g,h,i,j,k,l,m]\n",
+ " result = pd.concat(frames)\n",
+ " return result\n",
+ " if gene_type=='BP':\n",
+ " a=readHMM(Organism,'CA_alpha')\n",
+ " b=readHMM(Organism,'CA_delta')\n",
+ " c=readHMM(Organism,'CA_beta')\n",
+ " d=readHMM(Organism,'Ca_zeta')\n",
+ " e=readHMM(Organism,'Bestrophin')\n",
+ " f=readHMM(Organism,'SLC4')\n",
+ "\n",
+ " frames = [a,b,c,d,e,f]\n",
+ " result = pd.concat(frames)\n",
+ " return result"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 61,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "###COUNTS\n",
+ "def HMMcmp(Organism,gene_type):\n",
+ " result1=makeHMM(Organism,gene_type)\n",
+ " contig_EV=dict(zip(result1.contig,result1.Evalue))\n",
+ " contig_Annot=dict(zip(result1.contig,result1.Annotation))\n",
+ " \n",
+ " import re\n",
+ " Organism = re.sub('.pep.fa.gz', '', Organism)\n",
+ " \n",
+ " expression=pd.read_csv('{}_cds_counts.txt'.format(Organism),delimiter='\\t',index_col=0)\n",
+ " \n",
+ " #treatments=len(list(expression))\n",
+ " #print treatments\n",
+ " \n",
+ " expression['log2CPM']=np.log2(expression.sum(axis=1)/1000000)\n",
+ " expression.index= expression.index.str.split(\"|\").str[1]\n",
+ " contig_CMP=dict(zip(expression.index,expression.log2CPM))\n",
+ " \n",
+ " Contig=[]\n",
+ " Evalue=[]\n",
+ " Annotation=[]\n",
+ " log2CPM=[]\n",
+ " \n",
+ " for i in expression.index:\n",
+ " a=contig_CMP.get(i)\n",
+ " if a>-16 or a==\"-inf\" or a==\"inf\":\n",
+ " Contig.append(i)\n",
+ " Evalue.append(contig_EV.get(i))\n",
+ " Annotation.append(contig_Annot.get(i))\n",
+ " log2CPM.append(a)\n",
+ " \n",
+ " out=pd.DataFrame()\n",
+ " out['Contig']=Contig\n",
+ " out['Evalue']=Evalue\n",
+ " out['Annotation']=Annotation\n",
+ " out['log2CPM']=log2CPM\n",
+ " \n",
+ " return out"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 62,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Contig | \n",
+ " Evalue | \n",
+ " Annotation | \n",
+ " log2CPM | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " 0 | \n",
+ " 267033_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -12.137153 | \n",
+ "
\n",
+ " \n",
+ " 1 | \n",
+ " 3814_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -9.180025 | \n",
+ "
\n",
+ " \n",
+ " 2 | \n",
+ " 77900_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -11.266233 | \n",
+ "
\n",
+ " \n",
+ " 3 | \n",
+ " 163778_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -13.609640 | \n",
+ "
\n",
+ " \n",
+ " 4 | \n",
+ " 156264_1 | \n",
+ " NaN | \n",
+ " None | \n",
+ " -15.231129 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Contig Evalue Annotation log2CPM\n",
+ "0 267033_1 NaN None -12.137153\n",
+ "1 3814_1 NaN None -9.180025\n",
+ "2 77900_1 NaN None -11.266233\n",
+ "3 163778_1 NaN None -13.609640\n",
+ "4 156264_1 NaN None -15.231129"
+ ]
+ },
+ "execution_count": 62,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "HMMcmp('Heterosigma-akashiwo-CCMP2393.pep.fa.gz','BP').head()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 63,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "###CLEAN\n",
+ "def HMMclean(Organism):\n",
+ " '''Builds the combined HMM table, keeps hits with an e-value below 0.0001 and returns the number of hits per gene'''\n",
+ " aX=HMMcmp(Organism,'BP')\n",
+ " bX=HMMcmp(Organism,'CF')\n",
+ " cX=HMMcmp(Organism,'PR')\n",
+ " \n",
+ " frames=[aX,bX,cX]\n",
+ " HMM= pd.concat(frames)\n",
+ " \n",
+ " HMM=HMM[HMM['Evalue'] < .0001]\n",
+ " \n",
+ " #return HMM.drop_duplicates()\n",
+ " \n",
+ " Genes= np.unique(HMM['Annotation'])\n",
+ " AX=pd.DataFrame(0,index=[Organism],columns=Genes)\n",
+ " \n",
+ " for j in Genes:\n",
+ " counts=HMM.Annotation.value_counts()[j]\n",
+ " AX[j]= counts\n",
+ " \n",
+ " return AX\n",
+ " "
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 23,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " ALAT_GGAT | \n",
+ " Bestrophin | \n",
+ " CA_alpha | \n",
+ " CA_beta | \n",
+ " GCL | \n",
+ " GDCT | \n",
+ " GK | \n",
+ " GOX | \n",
+ " GlcDH | \n",
+ " HR | \n",
+ " ... | \n",
+ " ME | \n",
+ " OMT | \n",
+ " PEPCK | \n",
+ " PGP | \n",
+ " PK | \n",
+ " PYC | \n",
+ " SHMT | \n",
+ " SLC4 | \n",
+ " SPT | \n",
+ " TSR | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " Heterosigma-akashiwo-CCMP2393.pep.fa.gz | \n",
+ " 13 | \n",
+ " 6 | \n",
+ " 5 | \n",
+ " 9 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 1 | \n",
+ " 4 | \n",
+ " 2 | \n",
+ " 5 | \n",
+ " ... | \n",
+ " 4 | \n",
+ " 62 | \n",
+ " 13 | \n",
+ " 9 | \n",
+ " 3 | \n",
+ " 15 | \n",
+ " 4 | \n",
+ " 3 | \n",
+ " 1 | \n",
+ " 6 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
1 rows × 22 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " ALAT_GGAT Bestrophin CA_alpha \\\n",
+ "Heterosigma-akashiwo-CCMP2393.pep.fa.gz 13 6 5 \n",
+ "\n",
+ " CA_beta GCL GDCT GK GOX GlcDH \\\n",
+ "Heterosigma-akashiwo-CCMP2393.pep.fa.gz 9 1 1 1 4 2 \n",
+ "\n",
+ " HR ... ME OMT PEPCK PGP PK \\\n",
+ "Heterosigma-akashiwo-CCMP2393.pep.fa.gz 5 ... 4 62 13 9 3 \n",
+ "\n",
+ " PYC SHMT SLC4 SPT TSR \n",
+ "Heterosigma-akashiwo-CCMP2393.pep.fa.gz 15 4 3 1 6 \n",
+ "\n",
+ "[1 rows x 22 columns]"
+ ]
+ },
+ "execution_count": 23,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "HMMclean('Heterosigma-akashiwo-CCMP2393.pep.fa.gz')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 64,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [],
+ "source": [
+ "###GENE_TYPE\n",
+ "def gen_type(gene_type):\n",
+ " \"\"\" Allows you to extract specific gene_types\"\"\"\n",
+ " \n",
+ " if gene_type=='BP':\n",
+ " fixed=['SLC4','Bestrophin','CA_alpha','CA_beta','CA_delta','CA_zeta']\n",
+ " if gene_type=='PR':\n",
+ " fixed=['PGP','GOX','SPT','ALAT_GGAT','GDCT','SHMT','HR','GK','GlcDH','MS','ICL','GCL','TSR']\n",
+ " if gene_type=='CF':\n",
+ " fixed=['PK','PEPC','PEPCK','MDH','OMT','ME','PPDK','PYC']\n",
+ " \n",
+ " A=pd.DataFrame(0, index=names, columns=fixed)\n",
+ " \n",
+ " for i,j in enumerate(names):\n",
+ " for k in fixed:\n",
+ " B=HMMclean(j)\n",
+ " #print B\n",
+ " if k in list(B):\n",
+ " A[k][i]=B[k]\n",
+ " \n",
+ " A.index = A.index.str.split('.').str[0] \n",
+ " ####the count is the number of transcriptomes taken into account\n",
+ " ###gene_type is appended to 'count' so the name stays unique when the different gene_type frames are concatenated\n",
+ " A['count{}'.format(gene_type)]=0\n",
+ " \n",
+ " for i,j in enumerate(A.index):\n",
+ " expression=pd.read_csv('{}_cds_counts.txt'.format(j),delimiter='\\t',index_col=0)\n",
+ " A['count{}'.format(gene_type)][i]=len(list(expression))\n",
+ " \n",
+ " C=A.transpose()\n",
+ " C.to_csv('{}_{}_GeneCountHMM.csv'.format(t,gene_type))\n",
+ " \n",
+ " return A.transpose()"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 38,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "data": {
+ "text/html": [
+ "\n",
+ "
\n",
+ " \n",
+ " \n",
+ " | \n",
+ " Karenia-brevis-CCMP2229 | \n",
+ " Karenia-brevis-SP1 | \n",
+ " Karenia-brevis-SP3 | \n",
+ " Karenia-brevis-Wilson | \n",
+ " Peridinium-aciculiferum-PAER_2 | \n",
+ " Karlodinium-micrum-CCMP2283 | \n",
+ " Prorocentrum-minimum-CCMP1329 | \n",
+ " Prorocentrum-minimum-CCMP2233 | \n",
+ " Symbiodinium-kawagutii-CCMP2468 | \n",
+ " Symbiodinium-sp-C1 | \n",
+ " ... | \n",
+ " Alexandrium-monilatum-CCMP3105 | \n",
+ " Alexandrium-temarense-CCMP1771 | \n",
+ " Azadinium-spinosum-3D9 | \n",
+ " Ceratium-fusus-PA161109 | \n",
+ " Oxyrrhis-marina-LB1974 | \n",
+ " Oxyrrhis-marina | \n",
+ " Glenodinium-foliaceum-CCAP1116_3 | \n",
+ " Scrippsiella-hangoei-SHTV5 | \n",
+ " Scrippsiella-hangoei_like-SHHI_4 | \n",
+ " Scrippsiella-trochoidea-CCMP3099 | \n",
+ "
\n",
+ " \n",
+ " \n",
+ " \n",
+ " SLC4 | \n",
+ " 12 | \n",
+ " 19 | \n",
+ " 14 | \n",
+ " 32 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " ... | \n",
+ " 4 | \n",
+ " 10 | \n",
+ " 6 | \n",
+ " 5 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " Bestrophin | \n",
+ " 132 | \n",
+ " 139 | \n",
+ " 90 | \n",
+ " 128 | \n",
+ " 27 | \n",
+ " 78 | \n",
+ " 49 | \n",
+ " 50 | \n",
+ " 0 | \n",
+ " 32 | \n",
+ " ... | \n",
+ " 86 | \n",
+ " 85 | \n",
+ " 62 | \n",
+ " 87 | \n",
+ " 54 | \n",
+ " 59 | \n",
+ " 71 | \n",
+ " 61 | \n",
+ " 57 | \n",
+ " 2 | \n",
+ "
\n",
+ " \n",
+ " CA_alpha | \n",
+ " 20 | \n",
+ " 19 | \n",
+ " 19 | \n",
+ " 19 | \n",
+ " 25 | \n",
+ " 20 | \n",
+ " 7 | \n",
+ " 6 | \n",
+ " 0 | \n",
+ " 7 | \n",
+ " ... | \n",
+ " 19 | \n",
+ " 23 | \n",
+ " 19 | \n",
+ " 24 | \n",
+ " 16 | \n",
+ " 17 | \n",
+ " 35 | \n",
+ " 29 | \n",
+ " 30 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " CA_beta | \n",
+ " 7 | \n",
+ " 6 | \n",
+ " 5 | \n",
+ " 4 | \n",
+ " 18 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 22 | \n",
+ " ... | \n",
+ " 8 | \n",
+ " 8 | \n",
+ " 7 | \n",
+ " 20 | \n",
+ " 9 | \n",
+ " 15 | \n",
+ " 18 | \n",
+ " 23 | \n",
+ " 28 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " CA_delta | \n",
+ " 9 | \n",
+ " 7 | \n",
+ " 8 | \n",
+ " 7 | \n",
+ " 0 | \n",
+ " 6 | \n",
+ " 5 | \n",
+ " 7 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " ... | \n",
+ " 6 | \n",
+ " 7 | \n",
+ " 2 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 1 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " CA_zeta | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " ... | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ " 0 | \n",
+ "
\n",
+ " \n",
+ " countBP | \n",
+ " 4 | \n",
+ " 2 | \n",
+ " 2 | \n",
+ " 4 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 2 | \n",
+ " ... | \n",
+ " 4 | \n",
+ " 4 | \n",
+ " 3 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 4 | \n",
+ " 2 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ " 3 | \n",
+ "
\n",
+ " \n",
+ "
\n",
+ "
7 rows × 27 columns
\n",
+ "
"
+ ],
+ "text/plain": [
+ " Karenia-brevis-CCMP2229 Karenia-brevis-SP1 Karenia-brevis-SP3 \\\n",
+ "SLC4 12 19 14 \n",
+ "Bestrophin 132 139 90 \n",
+ "CA_alpha 20 19 19 \n",
+ "CA_beta 7 6 5 \n",
+ "CA_delta 9 7 8 \n",
+ "CA_zeta 0 0 0 \n",
+ "countBP 4 2 2 \n",
+ "\n",
+ " Karenia-brevis-Wilson Peridinium-aciculiferum-PAER_2 \\\n",
+ "SLC4 32 2 \n",
+ "Bestrophin 128 27 \n",
+ "CA_alpha 19 25 \n",
+ "CA_beta 4 18 \n",
+ "CA_delta 7 0 \n",
+ "CA_zeta 0 0 \n",
+ "countBP 4 2 \n",
+ "\n",
+ " Karlodinium-micrum-CCMP2283 Prorocentrum-minimum-CCMP1329 \\\n",
+ "SLC4 3 2 \n",
+ "Bestrophin 78 49 \n",
+ "CA_alpha 20 7 \n",
+ "CA_beta 0 0 \n",
+ "CA_delta 6 5 \n",
+ "CA_zeta 0 0 \n",
+ "countBP 3 4 \n",
+ "\n",
+ " Prorocentrum-minimum-CCMP2233 Symbiodinium-kawagutii-CCMP2468 \\\n",
+ "SLC4 2 1 \n",
+ "Bestrophin 50 0 \n",
+ "CA_alpha 6 0 \n",
+ "CA_beta 0 0 \n",
+ "CA_delta 7 0 \n",
+ "CA_zeta 0 0 \n",
+ "countBP 3 4 \n",
+ "\n",
+ " Symbiodinium-sp-C1 ... \\\n",
+ "SLC4 2 ... \n",
+ "Bestrophin 32 ... \n",
+ "CA_alpha 7 ... \n",
+ "CA_beta 22 ... \n",
+ "CA_delta 1 ... \n",
+ "CA_zeta 0 ... \n",
+ "countBP 2 ... \n",
+ "\n",
+ " Alexandrium-monilatum-CCMP3105 Alexandrium-temarense-CCMP1771 \\\n",
+ "SLC4 4 10 \n",
+ "Bestrophin 86 85 \n",
+ "CA_alpha 19 23 \n",
+ "CA_beta 8 8 \n",
+ "CA_delta 6 7 \n",
+ "CA_zeta 0 0 \n",
+ "countBP 4 4 \n",
+ "\n",
+ " Azadinium-spinosum-3D9 Ceratium-fusus-PA161109 \\\n",
+ "SLC4 6 5 \n",
+ "Bestrophin 62 87 \n",
+ "CA_alpha 19 24 \n",
+ "CA_beta 7 20 \n",
+ "CA_delta 2 0 \n",
+ "CA_zeta 0 0 \n",
+ "countBP 3 2 \n",
+ "\n",
+ " Oxyrrhis-marina-LB1974 Oxyrrhis-marina \\\n",
+ "SLC4 0 1 \n",
+ "Bestrophin 54 59 \n",
+ "CA_alpha 16 17 \n",
+ "CA_beta 9 15 \n",
+ "CA_delta 0 0 \n",
+ "CA_zeta 0 0 \n",
+ "countBP 3 4 \n",
+ "\n",
+ " Glenodinium-foliaceum-CCAP1116_3 Scrippsiella-hangoei-SHTV5 \\\n",
+ "SLC4 2 3 \n",
+ "Bestrophin 71 61 \n",
+ "CA_alpha 35 29 \n",
+ "CA_beta 18 23 \n",
+ "CA_delta 1 0 \n",
+ "CA_zeta 0 0 \n",
+ "countBP 2 3 \n",
+ "\n",
+ " Scrippsiella-hangoei_like-SHHI_4 Scrippsiella-trochoidea-CCMP3099 \n",
+ "SLC4 2 0 \n",
+ "Bestrophin 57 2 \n",
+ "CA_alpha 30 0 \n",
+ "CA_beta 28 0 \n",
+ "CA_delta 0 0 \n",
+ "CA_zeta 0 0 \n",
+ "countBP 3 3 \n",
+ "\n",
+ "[7 rows x 27 columns]"
+ ]
+ },
+ "execution_count": 38,
+ "metadata": {},
+ "output_type": "execute_result"
+ }
+ ],
+ "source": [
+ "gen_type('BP')"
+ ]
+ },
+ {
+ "cell_type": "code",
+ "execution_count": 65,
+ "metadata": {
+ "collapsed": false
+ },
+ "outputs": [
+ {
+ "name": "stdout",
+ "output_type": "stream",
+ "text": [
+ "COMPLETE\n"
+ ]
+ }
+ ],
+ "source": [
+ "a=gen_type('BP')\n",
+ "b=gen_type('CF')\n",
+ "c=gen_type('PR')\n",
+ "\n",
+ "framesX=(a,b,c)\n",
+ "outFrame=pd.concat(framesX)\n",
+ "\n",
+ "outFrame.to_csv('{}_GeneCountHMM.csv'.format(t))\n",
+ "\n",
+ "print 'COMPLETE'"
+ ]
+ }
+ ],
+ "metadata": {
+ "anaconda-cloud": {},
+ "kernelspec": {
+ "display_name": "Python [default]",
+ "language": "python",
+ "name": "python2"
+ },
+ "language_info": {
+ "codemirror_mode": {
+ "name": "ipython",
+ "version": 2
+ },
+ "file_extension": ".py",
+ "mimetype": "text/x-python",
+ "name": "python",
+ "nbconvert_exporter": "python",
+ "pygments_lexer": "ipython2",
+ "version": "2.7.12"
+ }
+ },
+ "nbformat": 4,
+ "nbformat_minor": 0
+}
diff --git a/PGP.txt b/PGP.txt
new file mode 100644
index 0000000..9fd4a2c
--- /dev/null
+++ b/PGP.txt
@@ -0,0 +1,27 @@
+>jgi|Thaps3|25544|estExt_fgenesh1_pg.C_chr_19c_290042|PGP_TPS
+MSMRPKDLLPGVDVFIFDCDGVIWRGDSVIPGIPQTLEKLRALGKKMYFVTNNSTKSRAGYKKKFDSLGL
+NVPAEEIFSSSFAAAAYLEQSKFKETGKKVYVVGEVGIQEELDLIGVPHFGGPEDANKQPDMGPGCMVEH
+DEDVGAVVVGFDRNINYYKIQYAQLCINENPGCEFIATNTDAVTHLTDAQEWAGNGSMVGAIKGCTGREP
+TVVGKPSPLMIDYLCDKLGLDRGRICMVGDRLDTDILFGSDNGLKSLLVLSGVTTEEKLLSQENVITPDY
+YADSIVDFFVDENAKVGA*
+>jgi|Phatr2|48026|estExt_fgenesh1_pg.C_chr_150301|PGP_PTRI
+MSLRMAGTLALFVTGATTRALGRSGGTSKRPFSLRLPLQTGTQPTLALSLSSSASANKPTLSLTEQMRKE
+SEAELAKLAHHYEDRARNDPAFADLAPIIWKTLDEATAFVNDHIETIMFDCDGVVYRTPDECPGAKECIQ
+RLLDKGKRVFFVTNNAASNRSQLRAKLSEILAIENLTDDMMVPSSYSCARFLQREILDRKGRGRLFVIGS
+RGLCDELEQTGFEVLTGNGPLDSDASMTREDLATYPFSEHPVDAVVVANVLLQMNPDAPLVATNKDAFDL
+VGVDGRHIPGNGCAVVALEHSSKRTAINVGKPSATLADLIAADHGINPSRTMFVGDRLDTDIQFGVENGM
+HSVLVMTGVTTADSMVQLGNGTNDEPLPNIVIPHIGLLY*
+>gi|EOD14003.1|2-phosphoglycolate_phosphatase|EHUX_CCMP1516
+MGLSLSSSLRIGLLSAAAGTAAGFSFGAAAASGVRRTASTGLRPASLAAQSSAAAVSFKDAGLCKKLDVP
+ADLLEKVDVFIFDCDGVIWKGDSLIDKVPAVLAMLRAAGKKVFFVTNNSTKSRKGYLGKFKSLGLEETQP
+EEIFSSSFAAAAYLEQTKFKESGKKVYIIGEVGIEEELDMIGVPWIGGGSDAGKKIALKSGYALPHDSDV
+GAVIVGFDREINYHKIQYAQLCINENPGCEFIATNLDAVTHLTDAQEWAGNGAMAGAIKGCTGREPTLVG
+KPSPLMIDYMVEKFGIERGRICMVGDRLDTDILFGQNNGLLSCLTLSGVTTEEKLLSPENEIKPDFYVDS
+IADFL
+>gi|NP_001119316.1|2-phosphoglycolate_phosphatase1|Arabidopsis_thaliana
+MLSRSVASAVTPVSSSSLLPNSKPIFCLKTLSGYRSSSFCGGCIRKINHKPLRMTSSNITPRAMATQQLE
+NADQLIDSVETFIFDCDGVIWKGDKLIEGVPETLDMLRAKGKRLVFVTNNSTKSRKQYGKKFETLGLNVN
+EEEIFASSFAAAAYLQSINFPKDKKVYVIGEEGILKELELAGFQYLGGPDDGKRQIELKPGFLMEHDHDV
+GAVVVGFDRYFNYYKIQYGTLCIRENPGCLFIATNRDAVTHLTDAQEWAGGGSMVGALVGSTQREPLVVG
+KPSTFMMDYLADKFGIQKSQICMVGDRLDTDILFGQNGGCKTLLVLSGVTSISMLESPENKIQPDFYTSK
+ISDFLSPKAATV
diff --git a/PK.txt b/PK.txt
new file mode 100644
index 0000000..fd194ac
--- /dev/null
+++ b/PK.txt
@@ -0,0 +1,111 @@
+>jgi|Thaps3|22345|estExt_fgenesh1_pg.C_chr_40571|PK_TPS
+MISNTSDVPLLAGGYINLDTVKATNNIGSRRTKIICTLGPACWDVSQLEELIESGMNVARFNFSHGDHDG
+HKACLDRLRQAAKNMNQNVAVLLDTKGPEIRTGFFADGAKSINLVKGEELILTSDYAYKGDSKKLACSYE
+KLASSVNPGQSILVADGSLVLTVVSCDETTGEVVTRVENNAKIGERKNMNLPGVVVDLPTLTEKDVDDIV
+NWGIKHDVDYIAASFVRKASDVLFIRKILAENGGSGIKIISKIENQEGLQNYLEILQATDGIMVARGDLG
+MEIPPEKVFLAQKYMIREANIAGKPVITATQMLESMITNPRPTRAECSDVANACYDGTDAVMLSGETANG
+CYYRQAVEIMARTCAEAETSVNWNELYQSVRNSVRKRYQLSSSESLASSAVKTAVDVGAKVIVVYSESGA
+TARHIAKFRPGMPVAVLTPSEQVARQSFGLLKGSYAFVVDTLEDTHKLDKEVMRECRVAGIAQAGDPVVI
+VCGSTFGTGATNQIKVEFVQSDDGDADDGKAHLDNNNAEYNGCTIC*
+>jgi|Phatr2|22404|estExt_gwp_gw1.C_chr_170082|PK1_PTRI
+MKLSLLALTFALGHAFVPPSFLASPSSRKVLSSSRSASVAANAADVLAKTTSSSSTPSSLMPKETTVAAV
+PKVAQRWRKSTKQVVTLGPASSNKEMIEKLFLAGADVFRLNFSHGSQEQKKELLIMIREVEEKYSHPIGI
+LGDLQGPKLRVGEFSKPEGEFLELGQSFRLDLDNAKGDNKRVQLPHPEIIKASELGHALLVDDGKVKLVV
+TAKGDDYLECRVDVAGMIKDRKGVNTPDSVLEISPLTPKDRSDLEYMLGIGVDWVALSFVQTPADMVEIH
+ALIDEKLPSGQFKPAVMAKIEKPSCFYDDNLQRIVGLCNGIMVARGDLGVECPPEDVPLLQKEIIDECRN
+QGRPVIVATQMLESMIEVPTPTRAEASDVATAIYDGADAIMLSAESAAGKFPEESVAMQQRIINRVEGDK
+HYRSYLKQNEPDPENTPTDAIITAARQVAKTIGAKSIVCFSLRGSTVLRASKSRPGVPILALCPFKETSR
+QLALSWGVYSDLPKAGSYGYTVSEEDMFNYDRPMVEKSTDDFDLVLKNACRAALKKGLVSDPDDLLVVTA
+GLPFGTPGAANIIRVVPAAGPSCWDGVCRVD*
+>jgi|Phatr2|49098|estExt_fgenesh1_pg.C_chr_200225|PK2_PTRI
+MTASQTKITASGPELRGANITLDTIMKKTDVSTRQTKIVCTLGPACWEVEQLESLIDAGLSIARFNFSHG
+DHEGHKACLDRLRQAADHKKKHVAVMLDTKGPEIRSGFFADGAKKISLVKGETIVLTSDYSFKGDKHKLA
+CSYPVLAKSVTPGQQILVADGSLVLTVLSCDEAAGEVSCRVENNAGIGERKNMNLPGVIVDLPTLTDKDI
+DDIQNWGIVNDIDFIAASFVRKASDVHKIREVLGEKGKGIKIICKIENQEGMDNYDEILEATDAIMVARG
+DLGMEIPPEKVFLAQKMMIRQANIAGKPVVTATQMLESMITNPRPTRAECSDVANAVLDGTDCVMLSGET
+ANGEYPTAAVTIMSETCCEAEGAQNTNMLYQAVRNSTLSQYGILSTSESIASSAAKTAIDVGAKAIIVCS
+ESGMTATQVAKFRPGRPIHVLTHDVRVARQCSGYLRGASVEVISSMDQMDPAIDAYIECCKANGKAVAGD
+AFVVVTGTVAQRGATNLMRVMYA*
+>jgi|Phatr2|56445|AGR_estExt_Phatr1_ua_kg.C_chr_200033|PK3_PTRI
+MSLSQSSDVPILAGGFITLDTVKHPTNTINRRTKIVCTIGPACWNVDQLEILIESGMNVARFNFSHGDHA
+GHGAVLERVRQAAQNKGRNIAILLDTKGPEIRTGFFANGASKIELVKGETIVLTSDYKFKGDQHKLACSY
+PALAQSVTQGQQILVADGSLVLTVLQTDEAAGEVSCRIDNNASMGERKNMNLPGVKVDLPTFTEKDVDDI
+VNFGIKHKVDFIAASFVRKQSDVANLRQLLAENGGQQIKICCKIENQEGLENYDEILQATDSIMVARGDL
+GMEIPPAKVFLAQKMMIREANIAGKPVITATQMLESMINNPRPTRAECSDVANAVLDGTDCVMLSGETAN
+GPYFEEAVKVMARTCCEAENSRNYNSLYSAVRSSVMAKYGSVPPEESLASSAVKTAIDVNARLILVLSES
+GMTAGYVSKFRPGRAIVCLTPSDAVARQTGGILKGVHSYVVDNLDNTEELIAETGVEAVKAGIASVGDLM
+VVVSGTLYGIGKNNQVRVSVIEAPEGTVKETAAAMKRLVSFVYAADEIPGNAD*
+>jgi|Phatr2|45997|estExt_fgenesh1_pg.C_chr_80378|PK4a_PTRI
+MLSSTSTIPKLDGEVVTLSVIKKPTETKKRRTKIICTLGPACWSEEGLGQLMDAGMNVARFNFSHGDHEG
+HGKVLERLRKVAKEKKRNIAVLLDTKGPEIRTGFFADGIDKINLSKGDTIVLTTDYDFKGDSKRLACSYP
+TLAKSVTQGQAILIADGSLVLTVLSIDTANNEVQCRVENNASIGERKNMNLPGVVVDLPTFTERDVNDIV
+NFGIKSKVDFIAASFVRKGSDVTNLRKLLADNGGPQIKIICKIENQEGLENYGDILEHTDAIMVARGDLG
+MEIPSSKVFLAQKYMIREANVAGKPVVTATQMLESMVTNPRPTRAECSDVANAVYDGTDAVMLSGETANG
+PHFEKAVLVMARTCCEAESSRNYNLLFQSVRNSIVIARGGLSTGESMASSAVKSALDIEAKLIVVMSETG
+KMGNYVAKFRPGLSVLCMTPNETAARQASGLLLGMHTVVVDSLEKSEELVEELNYELVQSNFLKPGDKMV
+VIAGRMAGMKEQLRIVTLDEGKSYGHIVSGTSFFFERTRLLDFND*
+>jgi|Phatr2|27502|estExt_Genewise1.C_chr_80291|PK4b_PTRI
+MLSSTSTIPKLDGEVVTLSIIKKPTETKKRRTKIICTLGPACWSEEGLGQLMDAGMNVARFNFSHGDHEG
+HGKVLERLRKVAKEKKRNIAVLLDTKGPEIRTGFFADGIDKINLSKGDTIVLTTDYDFKGDSKRLACSYP
+TLAKSVTQGQAILIADGSLVLTVLSIDTANNEVQCRVENNASIGERKNMNLPGVVVDLPTFTERDVNDIV
+NFGIKNKVDFIAASFVRKGSDVTNLRKLLADNGGPQIKIICKIENQEGLENYGDILEHTDAIMVARGDLG
+MEIPSSKVFLAQKYMIREANVAGKPVVTATQMLESMVTNPRPTRAECSDVANAVYDGTDAVMLSGETANG
+PHFEKAVLVMARTCCEAESSRNYNLLFQSVRNSIVIARGGLSTGESMASSAVKSALDIEAKLIVVMSETG
+KMGNYVAKFRPGLSVLCMTPNETAARQASGLLLGMHTVVVDSLEKSEELVEELNYELVQSNFLKPGDKMV
+VIAGRMAGMKEQLRIVTLDEGKSYGHIVSGTSFFFERTRLLDFND*
+>jgi|Phatr2|49002|estExt_fgenesh1_pg.C_chr_200111|PK5_PTRI
+MMRSFLRHAHRRACAQQLRTIGTLRLNQMPVTGANTKIVCTIGPASDQAESLGQLVTYGMSVARLNFSHA
+GDDYTYSEANMALLRNAVGKHHHLATGSSTDLPKNLRAILVDTKGPEIRTGILPGDVEIMDIPVGATVML
+CIEDVSQEVLAEGEFKIHVDYESIAKTVKIGDKVLLDDGLIELEVMEVHPGSGTVLTSALNGGPIKKNKG
+VNLPGVQLDLPALTDKDKRDLDWACRVGADFVAASFIRTPANVRSVIAYLDRCISKLPDVNGMKPLRPLV
+ISKIESKEGVDNFDEILEESDGIMVARGDLGVEIPYSKVFAAQRMMVHKCNEIGKPVIVATQMLDSMMRN
+PRPTRAEVTDVGTAVMDGADAVMLSGETAAGKYPIESIRAMASVAWEADQIVNSKSSIVWNEDLHEKMDL
+MEQELDAVAASAVRSAQDMGAKMIVLITMSGRVARAVARHRPTVPVLAYCTDVQVARRLQLHRSIIPIML
+QSEADPGDSSTRMGYLRAEAVRTAKELGFAHSGDRIIMVDRTVGKSHDMHEFSHNMKVVTLRDS*
+>jgi|Phatr2|56172|estExt_Phatr1_ua_pm.C_chr_230011|PK6_PTRI
+MFRRAVLSLSTRAIRTPVPCSVARGGASQVRSLAQTTFYLPDPADRSQDVHNRGNLQLSKIVATIGPTSE
+QEEPLRLVTDAGMRIMRLNFSHATKEEVELRITNLALAQKALQPPGTLEMQDVRALLLDTKGPEIRSGKL
+AHDESGHATVTLQQGQRIELFNDASRQQQSGSTEQALYIDYPGLHRCLHPSMKVLLDDGAITLTVQSVNV
+EAATVSCVVDNAGELRSRAGVNLPLADTSDLPAMSDKDKQDIKYGMTMDIDYVAASFVQTAEGVNEIRGY
+IQQCAQELGWDDSHPLPLIISKIETAGALQHFDAILAASDGIMVARGDLGVEIPLTQVTNAQKEMVAACN
+AVGKPVIVATQMLESMAKSPRPTRAEVSDVTNAIYDGADCVMLSGETAKGKFPTEAVRTMNEIILAAERY
+TTSGALGHSYHRPAFVGPKTADSAVAKAAVTASVERDCAAILVLTQHGSLPPLVSAYRPRVPIFAFCPTP
+KLARQLQVYRGIHPIVDSTLTDGNDCKRPEQAVQEAKDMGLLQSGDEVVVVSMDGTTATMKIAIVS*
+>gb|CAM77898.1_pyruvate_kinase|Magnetospirillum_gryphiswaldenseMSR1
+MRRTRKAKIIATLGPASSTPQAIESLFRAGADVFRLNFSHGSHADHQARYDTIRALEQKIGRPIGVLADL
+QGPKLRVGKFADGKIKLETGATFRLDLSPELGTGVRAPLLHPEVFAAMNVGTELLLDDGKLRLRVEQHGG
+DFAETRVIVGGELSNHKGVNVPNVVLPISPLTDKDKADLDFAVDMGADWIALSFVQRPGDVLEARKLIAR
+KVGSRVRLLSKLEKPSAIDYLEEIVELSDAVMVARGDLGVECPPESVPILQKRIIKCCRSAGKPVVVATQ
+MLDSMVHSPSPTRAEASDVATAIYDGADAVMLSAETASGDYPVDAVTMMDRIINRVEEDDQYTVITDASR
+SQPENTTRDAISAAARQVAHTLKAAAVVTFTSSGSTTLRAARERPQQPIISLTSGIEVARQLALVWGAHC
+VPTQEVRSFAEMVQTAAKAAQDESFAHPGDRIVITAGVPFGCSGTTNILRVAEIEESGEVL
+>gb|AAU81896.1_pyruvate_kinase|Achlya_bisexualis
+MAGAINLHKQGVELEAIMGDNEGQTRRSKIFCTIGPACWSVEKLTELIDAGMNVARFNFSHGDHKTHSEV
+LNRLRTAIASRPHRHVAIMLDTKGPEIRTGFLATEDKKVHIEKDSIIEFTTDYEFLGDETKLACSYEDLP
+TSVKVGGPILVADGSLVLEVTEILETGVKARALNSATLGERKNMNLPGAKVTLPTLTERDEDDLINWGLV
+QGVDFIAASFVRCGQDIDNIRAVLGPRGRAIKIIAKIENQEGLENFDDILEKTDGIMVARGDLGMEIAPE
+KVFLAQKMMIRKANIAGKPVVTATQMLESMIHNPRPTRAECTDVANAVLDGSDAVMLSGETANGDYPVEA
+VRMMHKTCLQAEGAIHYDELYQALRNSVLETNGKMSTQEAIASSAVKTAIDMGAKMIVVLTETGTTARLI
+AKYRPACPILVLTALGETARQCEGFLKGSYCRVMGSMIGTDSILYRATDLGKQFGWIKKGDAVVAIHGMM
+EARSGSTNMLKVLVCD
+>gb|AAU81895_1_pyruvate_kinase|Achlya_bisexualis
+MLARSLRSRAVRSFARGLSNKPSKNDAFSMTKIVGTVGPVSENAKTTQELTNAGLKIMRINFSHATYDEA
+HLRMSHLRASKGVHAKHTGKEFNVRAVLLDTQGPEIRGGAFPEKKINLTKGDMITLTTDVQYKEASTKDM
+LYVTYEQLPATVKVGDTVLLDDGLISLTVKSIDVASGQVRCLIENSEVLGSRKGVNLPGLVVDLPALTAK
+DKQDVEFGVEHDMDFIAVSFVRKPEDVNDVKDFVNSVMPKYWPAGHPAPLIISKIENYEGVSNFDRILEV
+SDGIMVARGDLGVEIPMQEVLTCQKDMVSKCNAAGKPVIVATQMLESMIRNPRPTRAEILDVGNAVLDGA
+DAVMLSGEVAQGKWPVESVKTMMSVIKEADAYVKREQYKKEALSQKEAVACAVATTAKSLHAAMIVVMTA
+SGEVARLVSKHKPSVPVMCYTTSQKVGRQLQIHRGLYPIVAPTPCKMNLQEAISTAKKLGWLHNGDQVVM
+LSSETPTGVVGQQYIMRVATVGEDIAH
+>gi|NP_001193725.1|pyruvate_kinase_PKM_isoform_c|Homo_sapiens
+MQWSSERGERLLTPGACSSEVPSAVPSRSGGSPGHTVFSSERSLLVRPRSHPEPKGEHYVTGSPTPENQR
+TSAAMSKPHSEAGTAFIQTQQLHAAMADTFLEHMCRLDIDSPPITARNTGIICTIGPASRSVETLKEMIK
+SGMNVARLNFSHGTHEYHAETIKNVRTATESFASDPILYRPVAVALDTKGPEIRTGLIKGSGTAEVELKK
+GATLKITLDNAYMEKCDENILWLDYKNICKVVEVGSKIYVDDGLISLQVKQKGADFLVTEVENGGSLGSK
+KGVNLPGAAVDLPAVSEKDIQDLKFGVEQDVDMVFASFIRKASDVHEVRKVLGEKGKNIKIISKIENHEG
+VRRFDEILEASDGIMVARGDLGIEIPAEKVFLAQKMMIGRCNRAGKPVICATQMLESMIKKPRPTRAEGS
+DVANAVLDGADCIMLSGETAKGDYPLEAVRMQHLIAREAEAAMFHRKLFEELVRASSHSTDLMEAMAMGS
+VEASYKCLAAALIVLTESGRSAHQVARYRPRAPIIAVTRNPQTARQAHLYRGIFPVLCKDPVQEAWAEDV
+DLRVNFAMNVGKARGFFKKGDVVIVLTGWRPGSGFTNTMRVVPVP
+>potatoPKCYT1|S53332
+MANIDIAGIMKDLPNDGRIPKTKIVCTLGPSSRTVPMLEKLLRAGMNVARFNFSHGTHEYHQETLDNLKIAMQNT QILCAVMLDTKGPEIRTGFLTDGKPIQLKEGQEITVSTDYTIKGNEEMISMSYKKLVMDLKPGNTILCADGTITL TVLSCDPPSGTVRCRCENTATLGERKNVNLPGVVVDLPTLTEKDKEDILEWGVPNNIDMIALSFVRKGSDLVNVR KVLGPHAKRIQLMSKVENQEGVINFDEILRETDSFMVARGDLGMEIPVEKIFLAQKMMIYKCNLAGKAVVTATQM LESMIKSPAPTRAEATDVANAVLDGTDCVMLSGESAAGAYPELAVKIMSRICIEAESSLDNEAIFKEMIRCTPLP MSPLESLASSAVRTANKARAKLIVVLTRGGSTAKLVAKYRPAVPILSVVVPVLTTDSFDWSISDETPARHSLVYR GLIPLLGEGSAKATDSESTEVILEAALKSAVTRGLCKPGDAVVALHRIGSASVIKICVVK
\ No newline at end of file
diff --git a/PPDK.txt b/PPDK.txt
new file mode 100644
index 0000000..555d84e
--- /dev/null
+++ b/PPDK.txt
@@ -0,0 +1,74 @@
+>jgi|Thaps3|5500|fgenesh1_pg.C_chr_5000542|PPDK_TPS
+MVINSQYIYPFGGSAPKPTVDPDKQIVGGKGLGLQVMFKIGVDVPPGFTLTTPLCQVYAKQNDLPADVWK
+GVRENVRRIEIDMEKEFGSNENPLLFSCRSGAAMSMPGMMDTVLNIGLNDITVDGLAKATGNDRFAWDSY
+RRLLDMFGEVVLGISHEDFEKRFDKVKEAANAKSDVDLGVEDLKKLCDEYKQVYLEEGKVFPMDPYEQLY
+ACVKAVFGSWMTPRAVKYREINNIKNLIGTATNIQTMVFGNMGDDSGTGVAFSRNPSTGENLMYGEYLIN
+AQGEDVVAGIRTPQPISQMQEVLPDAYAKFLENVDKLEHYFKDMQDVEFTVEKGQLWMLQCRNGKRTGVA
+AIRIATELVEEGICTKSEALLKVEPRHVEQLLHPTFSPEALKSVAYTEGIVAKGLPGSPGAAVGKLVFNP
+KQAEDERAKGESVILVRETTSPEDVGGMWAAAGILTARGGMTSHAAVVARGWGKPCVCGCSDIVVSEMDE
+TVTVKSSNLVFKSGDVISINGGTGEVINAVIEVKTPDLKGDLSVLLGWADEEEGVMQVLANADSGPDASQ
+AANNGAMGIGLCRTEHMFFAPERLPIVRRWIFHTECLDDLDHIKHFQRSDFKDLFVAMNGKHVTIRLLDP
+PLHEFLPRPEQVHEKTAEELGFGKDVKRMLARIDSMHEENPMLGLRGCRLGIVKPEFTQMQVEAIMSAAA
+DFMEEAPDTAKVHPRIMIPLIGSISEYKNQALIIKREAERIKVERGLDIPYEIGTMIEVPRAAVVSDKIA
+ALVDDEDSKPLCTFFSYGTNDLTQMTMGISRDDSNGFIPKYLELGILEDDPFQTIDVEGVGYMVKHSATL
+GKVVNPNLSLSVCGEHGGDPKSIEFFDSLGLNYVSCSPFRVPVARLAAGQIRVKRRMEEAKLAETAARAV
+LIKAAEKTTALNVGGQTNVLVHQ*
+>jgi|Phatr2|21988|estExt_gwp_gw1.C_chr_150095|PPDK_PTRI
MKFSSAAATTVGLLLSGHAPMIFSFVTPPSRFASGHQASGSERSIISHSTESSSSLTSHNTNDSRSISSK
MQFPLFMTATYSEAGKKKTSTISTTGSNSMVSEQEDSKHGIVPFGGSAAHVKTPDKQILGGKGLGLQEMS
YVGIDVPPGFTLTTPLCQVFQENGDLPEEMWRQVEAAIQRVEQDMDRKFGDPSYPLLFSCRSGAAISMPG
MMDTVLNVGLNKETVEGLAKATGNKRFAYDSFRRLLDMFGDVVLGIPHESFEDKLKGLKAKVGVQDDIDL
TAEHLKELCDLYEKVYDEHGKEFPHNPMDQLKACIKAVFGSWNSGRAIKYREVQGITSLLGTACNIQTMV
FGNLGPTSGTGVAFSRDPGTGKAVLNGEYLVNAQGEDVVAGIRTPEPISTMEKGFPKAYEQFIRNVHTLE
QHFKDMQDVEFTVENERLWMLQCRSGKRTGQAALKIACDLVDEGICTPEEALLKVEPDHVKQVLHPTFSA
EALESAVYKENVVAVGLAGGPGAAVGKVVFSTETAEEMTKEGVILVRETTSPEDVGGMWASRGIVTCRGG
VTSHAAVVARGWGKPCVVGCDDIDVDMKTKTMTIKETGEEFKEGDVISINGSTGEIVRVAIETTVPALEG
EFGKLLGWADEVPDVCRVMANADSGADAQKARDLGAQGIGLCRTEHMFFSPSRLPVVRRWILRDEGLEQV
QEFQREDFREIMHVMDGKPVTIRLLDPPLHEFLPHSSEINEKLSKQLGYDDSQALASDIEAMHEENPMLG
LRGCRLGIVREGLTAMQTEAIIHAAADLIEKNNDAKPYPRIMVPLIGSVAEFKNQALLIKRTAERVKKER
GIDVPYEIGTMIEVPRAALVSDQIAGVTDPEDGKRLCEFFSYGTNDLTQMTLGISRDDAGAFLQVYKDLG
IMEEDPFKSIDTEGVGFLLHLSAAKGRMVNPELSLSICGEHGGDAASIKFFDKVGLDYVSCSPFRVPVAR
LAAGQASVKRRKDEIEPMSRKDRVVKTSPMA
+>gb|BAA21653.1_pyruvate_orthophosphate_dikinase|Eleocharis_vivipara
+MERVCLHAIHGACKPDMDGNIRFGRKKRHLYKRLSSCRVRAMKLDQSGFEASRKQSSYALKAIATPMAVT
+TKKRVFTFGKNKTEGNKGMKELLGGKGANLAEMSSIGLSVPPGFTVSTEACQQYQESGHKMPPGLWDEII
+DGLKWVQQDMGARLGDPEKPLLVSVRSGAAVSMPGMMDTVLNLGLNDEVVSGLAKKSGERFAYDSYRRFI
+DMFGDVVMGISHEHFGDKLEEMKATKGVKNDTDLSANDLKELVVQYKEVYAKAKGEPFPTDPMKQLSLAV
+LAVFDSWDSPRAKKYRSINKITGLKGTAVNVQCMVFGNMGNTSGTGVLFTRNPSTGEKKLYGEFLVNAQG
+EDVVAGIRTPQELETMKDYFPQAYQELVDNCKILESHYKDMMDIEFTVQENRLWMLQCRTGKRTGKAAVK
+IAVDLVSEGLVDTRTAIKMVDPGHLDQLLHPQFENPKAYKDKVIASGLPASPGAAVGQVVFTAEDAEMWH
+AQGKAVILVRTETSPEDVGGMHAAAGILTARGGMTSHAAVVARGWGKCCVSGCSDIRVNDAEKVLLVGDK
+KLQEGEWISLNGSTGEVIMGKQPLSPPALSGDLGTFMAWVDEVRQIGVMANADTPEDALAARNNGAQGIG
+LCRTEHMFFASDERIKAVRQMIMSGTVEQRQKALDRLLPYQRSDFEGIFRAMDGLPVTIRLLDPPLHEFL
+PEGNIEDIVREMASETGSAEEEVFSRVEKLSEVNPMLGFRGCRLGISYPELTEMQARAIFEAAITMSNQG
+VKVLPEIMVPLVGTPQELGHQVSLIRQVADKVFSATGTSVSYKVGTMIEIPRAALVADEIAEHAEFFSFG
+TNDLTQMTFGYSRDDVGKFLPIYIAHGILQNDPFEVLDQKGVGELVKLATERGRKTRPDLKVGICGEHGG
+EPSSVAFFAKSGLNYVSCSPFRVPIARLAGAQVVVQK
+>gb|AOL23586.1|pyruvate,orthophosphate_dikinase|Erythrobacter litoralis
+MTTATATQTGTDTELRTVYRFGGDAPHDDPRQRDKVVTGGKGANLAEMASIGLPVPPGFTITTEECVRYL
+QAGEAFRDELRAEVAEALAHVERAVGKKFGDAADPLLVSVRSGARVSMPGMMDTVLNLGLNDETVEGLAK
+VSEDERFAWDSYRRFIQMYSDVVLGLDHGLFEEALEIAKEDQGYYNDTEMSADDWRSLVREYKRIVREEQ
+DEPFPQDVNDQLWGAIAAVFGSWDSERAKVYRRLNDIPADWGTAVNVQAMVFGNMGDTSATGVAFTRDPS
+TGDRSYYGEYLINAQGEDVVAGIRTPQYLTRAAREAAGAKPLSMEEALPQAYEELARVFDLLESHYRDMQ
+DIEFTVERGKLWMLQTRTGKRTAKAALKMATDMVDEGLIDRAEAVRRIDPMALDQLLHPTLDPDAERDVM
+TTGLPASPGAAAGKIVLDADTAEQWANRGDKVILVRVETSPEDIHGMHAAQGILTARGGMTSHAAVVARG
+MGRPCVSGASGISIDRTERTLRIGSQELKEGDTITLDGANGQVMLGEVPTVEPELAGDFATLMEWADELR
+RMRVRTNAETPEDCRMARQFGAEGIGLCRTEHMFFDAGRIKAVRQMILAEDEAGRRKALDQLLPEQRADF
+TAIFEVMAGLPCTIRLLDPPLHEFLPTRDEDFADLSDATGLGVDHLRRRANELHEFNPMLGHRGCRLGIT
+YPEIYEMQARAIFEAACAVEAESGDAPLPEIMIPLVATKKELSLLRALVDRVAEEVFGEKGTRIAYLVGT
+MIELPRAALLAGEIAEEGEFFSFGTNDLTQTTLGLSRDDAGRFLGTYVDKGIFPRDPFVSLDVDGVGQLV
+ELAATRGRATRPEIKLGICGEHGGDPASIGFCENVGLDYVSASPYRVPIARLAAAQAALAVSK
+>gb|CAK06583.1|pyruvate,phosphate orthophosphate dikinase (ec 2.7.9.1)|[Rhizobium leguminosarum bv. viciae 3841]
+MTKWVYRFGDGQAEGRARDHEVLGGKGANLAEMCALGLPVPPGLTIVSDACNTYYKNGRHIEDQVKAEVR
+AGIAAIEAITGRRFGSVSQPLLLSVRSGARVSMPGMMDTVLNLGLNDETVQALGHDAGDARFAWDSYRRF
+IQMYADVVMGLGNDAFEEILEDEKAKLGHEFDTELSASEWQHIVSLYKKLIEEELEQEFPQDPEVQLWGA
+VGAVFASWMSARAVTYRHLHNIPEGWGTAINIQAMVFGNLGNASATGVAFTRNPSTGERALYGEFLVNAQ
+GEDVVAGIRTPQSITEEGRISSGSEKPSMEKLMPEAFRELCRICTELEIHYRDMQDIEFTIERGKLWMLQ
+TRSAKRSTRAAMKIAVDMVDEGVITEDEAVLRIEPSSLDQLLHPTIDPRVTRQVIGSGLPASPGAATGAI
+VFTAEEAVEAESEGRKVILLRVETSPEDIHGMHAAEGILTTRGGMTSHAAVVARGMGIPCVVGAGTMRID
+VRNERLLGVGVTLKKGDIITIDGSAGQVLKGEVPMIQPALSGDFGRIMGWADRARRMTVRTNADTPADAL
+AARSFGAEGIGLCRTEHMFFEGERIHVMREMILAVDEKGRRVALDKLLPMQRLDFTGLFTVMHGLPVTIR
+LLDPPLHEFLPKTDDEVAEVAFAMGMEASVLRQRVDALHEFNPMLGHRGCRLAISYPEIVEMQARAIFEA
+AVAAAKETGAAVVPEIMVPLVGLRTELDYVKARIDEVAGAVMNEAGMKIDYLVGTMIELPRAALRAHVIA
+EAAEFFSFGTNDLTQTTFGISRDDASAFIPTYQRKGIIEHDPFISLDFDGVGELISIAAERGRRTRNDMK
+LGICGEHGGDPASIRFCETIGLDYVSCSPFRVPIARLAAAQAVIAGSLEDVRRGPKDLRASV
+>gi|NP_001105738.2|pyruvate, phosphate dikinase 1, chloroplastic precursor [Zea mays]
+MAASVSRAICVQKPGSKCTRDREATSFARRSVAAPRPPHAKAAGVIRSDSGAGRGQHCSPLRAVVDAAPI
+QTTKKRVFHFGKGKSEGNKTMKELLGGKGANLAEMASIGLSVPPGFTVSTEACQQYQDAGCALPAGLWAE
+IVDGLQWVEEYMGATLGDPQRPLLLSVRSGAAVSMPGMMDTVLNLGLNDEVAAGLAAKSGERFAYDSFRR
+FLDMFGNVVMDIPRSLFEEKLEHMKESKGLKNDTDLTASDLKELVGQYKEVYLSAKGEPFPSDPKKQLEL
+AVLAVFNSWESPRAKKYRSINQITGLRGTAVNVQCMVFGNMGNTSGTGVLFTRNPNTGEKKLYGEFLVNA
+QGEDVVAGIRTPEDLDAMKNLMPQAYDELVENCNILESHYKEMQDIEFTVQENRLWMLQCRTGKRTGKSA
+VKIAVDMVNEGLVEPRSAIKMVEPGHLDQLLHPQFENPSAYKDQVIATGLPASPGAAVGQVVFTAEDAEA
+WHSQGKAAILVRAETSPEDVGGMHAAVGILTERGGMTSHAAVVARGWGKCCVSGCSGIRVNDAEKLVTIG
+GHVLREGEWLSLNGSTGEVILGKQPLSPPALSGDLGTFMAWVDDVRKLKVLANADTPDDALTARNNGAQG
+IGLCRTEHMFFASDERIKAVRQMIMAPTLELRQQALDRLLPYQRSDFEGIFRAMDGLPVTIRLLDPPLHE
+FLPEGNIEDIVSELCAETGANQEDALARIEKLSEVNPMLGFRGCRLGISYPELTEMQARAIFEAAIAMTN
+QGVQVFPEIMVPLVGTPQELGHQVTLIRQVAEKVFANVGKTIGYKVGTMIEIPRAALVADEIAEQAEFFS
+FGTNDLTQMTFGYSRDDVGKFIPVYLAQGILQHDPFEVLDQRGVGELVKFATERGRKARPNLKVGICGEH
+GGEPSSVAFFAKAGLDYVSCSPFRVPIARLAAAQVLV
\ No newline at end of file
diff --git a/PYC.txt b/PYC.txt
new file mode 100644
index 0000000..754578c
--- /dev/null
+++ b/PYC.txt
@@ -0,0 +1,152 @@
+>jgi|Phatr2|30519|estExt_Genewise1.C_chr_230092|PYC1_PTRI
+MYRTRVLRRATATTRTRGAGSGSLSPSRFRLSRWSTTFAAPPLACGLLSCAAGLSQQCHQPIGRGLHTFH
+TPDEQLPKSGGPVGVDTPPFTKLLAANRGEIATRINRAAAELGISTAGIYSYEDRFTQHRYKCDQAFELD
+TSKSPVAQYLDIDKIVDICVKNKVQAVHPGYGFLSENETFAKKLDNAGIIFVGPTVQNLQAFGDKTAARN
+MAIACNVPVVAGSHDAFATAKEASAWINDPANQCDYPVIVKALMGGGGRGIRIVPTEKDLNAMFQQASNE
+AASAFGDGRCFVEKYVEKPRHVEVQCLGDGTGNVIHLWDRDCSVQRRHQKVVELAPAEGLSEDGRNQILN
+DAVRLLQNANYRNAGTVEFLVDKNGKHYFMEVNPRVQVEHTVTEEITGVDIVQSQILIASGKTLPELNLT
+QESIPSPMGVAMQCRVTTEDPAQDFRPDTGTINVFRMPAGMGIRLDDGPGFPGARITPHYDSLLVKITAK
+ARNRKEAAAKLIRALKEFRVRGVKTNKSFLLNVLKHPDFLEGVVDTGFIAANPHLLAPLREQDRAQKLLY
+YIANVVVNGTPKELGATGAPPSTVDPIIPVVEPNSGQQKKPSLKKIFDADGPDAFAKAVRNNKGLLITDT
+TWRDAHQSLLATRLRTKDMLNIAPATTVALANAYSLECWGGATFDVSLRFLRECPWERLSALREAVPDIP
+FQMLLRGANAVGYTSYPDNVVYEFCQMAKDTGMDVFRIFDSVNYIENMKLGIDAVGAAGGIVEAAVCYTG
+DVSNPNRGMYNLEYYLGFVRQLHGLGIHVLAIKDMAGLLKPEAGTMLVNAIRQEFPDLPIHVHTHDTAGT
+GVASMLACAKAGADAVDAAADAMSGTTSQPSLGALVASTQGTQWDTGLDLNQVQAVNDYWEEARGLYAPF
+ESGQKTGSSDVYEHEMPGGQYTNLLFQSSQLGLTGQWSKVKKAYAAANRLLGDIIKVTPSSKVTGDLAQF
+LVANDLTEKEVIEKAETLSFPKSVVEYFQGYLGIPPFGFPEPLRTKVLKGQTIEGYEGLTCFEGRPGADL
+KPMDMEAVRSKLEEKWGGQADHGVRNVDILSHAMYPAVFDEYKEFKNVFGKLDFLDTRTFLTGMRVNQEL
+RVEIEPGKQLVIKLDSVSEPDKDGLVTLQFELNGTLRTVQIQDKSVDSEKAVRPKAMAAVAGSVGAPMPG
+VVVETKVKKGDVVEQGDPLLSLSAMKMETTVSAPVSGTVVFFEVTAGDQVEAGDLLVEIEDE*
+>jgi|Phatr2|49339|estExt_fgenesh1_pg.C_chr_210250|PYC2_PTRI
+MRRVAVFVLVLSMPSMAAAFAPRRSWTTATTPAGIETAATMARRSLLSSLLRVSTGTDSEKDTNASTDSI
+QTDSVVTEASARNSKLVRNVPPFQRILAANRAEIAVRIMRAATELNAGTVAMYTHEDRYSQHRWGADQSF
+LLDKKNPTSSPISAYLDIPQIIRLALDAGVDAIHPGYGFLSESPEFAQACADASITFVGPTVENLQRFSD
+KTSARQAAIEADVPVVPGSDGALETEADVTAFVEANGLPVILKAAMGGGGKGMRVVRRMEDLIPFFQAAS
+SEALASFGDGAVFVERFVERPRHIEVQIIGDGTGNVVHLWERDCSIQRRHQKVIEMAPAWTLPDELRAQL
+HEYAVRLTSQAKYKNAGTVEFLIDAELRPYFIEVNPRIQVEHTVTEEVTGIDLVQAQIKIAAGATLEEVG
+LVQANIQPRGVAIQCRVTTENPERDFAPDTGTVTLYRHSAGKGVRMDGIGYSGMTITPYFDSMIVKYTAL
+GANFPETVARMKRVLQECRIRGVKTNVGFLLNVLSHPEFETGIVTTSFIDENPQLKQTSMSMYDFASEEQ
+ADPRKTFATERLVRYLANLAVNGQPPELGADSQKLTRTTAIADIPAPEIRSEGNAAVPSDESPNQPGWRH
+LLLEQGPKAYAKAVREHQGLLITDTTWRDAHQSLLATRMRTQELIKSADYTNMALANAFSLEMWGGATFD
+VAMRFLRECPWERLEALREKVPNVPFQMLLRGANAVGYTNYADNVVHKFCKQAHDSGVDVFRVFDSLNYI
+ENLQLGVDAAGEAGGFVEGAMSYTGDVADPTKGKYSLEYYMNLASELVDMGVHSLAIKDMAGLLTPKAST
+LLVSALREAHPDIPIHVHTHDTAGSGVASMLAAAQAGADIVDSSMDAFSGMTSQPSLGALVANLAGTERD
+TGIQLSNLPPLNSYWEDVRSLYAPFESGQLSGSSDVYFHEIPGGQYTNLLFQSKQLGLSDRWTEIKTKYA
+EANIILGDIPKVTPSSKVVGDLAQFLVSQNLEANEVLEKADTLAFPDSVINYLKGDIGVPPGGFPEPLRN
+KVLQSRNLEPIEGRPGKFLPDYNFDKERELLEKRFGKANIDEKDCLSYALYPDVFTEWKDFQALYGDVGK
+LPTRLFLNPMQVGDEVEIEIAKGQTLIVELVSIQDVKEDGTRTVIFEVNGEPWYMPVTDQNLLGDSAVRE
+KAVAPGQVGASMPGVVVGLKVKAGDTVQEGETVATLSAMKMETSIPATASGVIKRVLVNVGDKVNGDDLI
+LEIE*
+>jgi|Thaps3|11075|fgenesh1_pg.C_chr_19a_19000018|PYC_TPS
+MTIMQNPQACINPPIRMDASIINNKAPSVEVKIPPAAVIAPSTNKTLAELCSFKKVMAANRGEIAVRICR
+GATEFNLKTATIYAYEDRNSAHRWDSDESFLLPASGTPVGAYLNITNIINIAKENGVDAIHPGYGFLSES
+AEFAQACEDNGITFVGPSVENLVTFGDKTKARELAIKADVSVVPGTSEPLTTTEAAVAFVEEYGLPVIIK
+AAKGGGGKGMRVVNKKEDLIPLFEAASSEALASFGDGGCFVERYVTNAKHVEVQVIGDGKGNVVHLWERD
+CSVQRRHQKIVEIAPAVHHSMEVRKAVLEDALKITKACNYKNAGTVEFLVDDQGRHYFMEVNPRVQVEHT
+VTEQVTGLDIVQSTFLIAGGASLEDIGLVQENIIPRGVAMQCRITAEDPERDFAPDTGMLDVCRHSVGPG
+IRVDGYAYPGMVVQPYFDSLLVKYTASHKDWDGAIRRMRRALHDNHIRGVKTNIPFLLNVMDHPDFIAGS
+FDLNFIQDNPELLLNLPGTLSAQKGTLGQRYDHIEGYLKYIANLAVNGHPKSLGANDALVRIIDNCDIPA
+PDKNEIEAILSKKKKSSPHWRKILREQGPKALAKAVRDHQNVLVTDTTWRDAHQSLLATRMRTADLLKAA
+EATNTAFNGTSDVFSLEMWGGATFDVSMNFLRECPWKRLEELREAAPDMLFQMLLRGANAVGYTVYPDNV
+VYEFCKQAYKSGNDIFRVFDSLNYVDNMELGIKAAAASGGFVEAAICYTGDVTSSDPSNKYNLKYYLDFA
+TQLVDLGAHALAIKDMAGLLTPRAATLLVSELRSAFPDVPIHLHTHDTAGMGVAAMFAGAEAGADIVDGA
+IDAMSGLSSQPCLGALVSALGDKSNVDLDALQVLNEYWESVRHQYNPFEVQALNAAIGSNVYKHEIPGGQ
+YTNLLFQSKQLGLSGRFAEVKKAYALANKLLGDIPKVTPSSKTVGDLAQFIVGLKISGDELVENAATLPL
+PNSVVEYMQGALGPPPGGYPEPFRTNVLKGRPLKDGRSMFTARPGAELPDYDFVEAEKNLKEAYGNSRIG
+FKEVLSHAMYPQVFKDYLAFEKVYGDVEKLPTHMFLRPMTVGEESHLHLGPGKDYYIRLAAIDQFDEDLG
+TRTVTLEVNGEKWFIRTPDTVTTLESATAGGPAPKRREKKDPTEKGSIGTPMPGQIVAVNVEEGDEVKEG
+QTLFKLSAMKMETEIKAPISGTITRVLVSQSDSVEGDDLLAVVMAE*
+>gb|CAB02872.1_Pyruvate_carboxylase_1|Caenorhabditi_selegans
+MRFSRIPPIFANVVRQTHYRNYANGVIKPREFNKVMVANRGEIAIRVFRALTELNKTSVAIYAEQDKNSM
+HRLKADEAYLVGKGLPPVAAYLTIDQIIETALKHNIDAIHPGYGFLSERSDFAAACQNAGIVFIGPSPDV
+MARMGDKVAARQAAIEAGVQVVPGTPGPITTADEAVEFAKQYGTPIILKAAYGGGGRGIRRVDKLEEVEE
+AFRRSYSEAQAAFGDGSLFVEKFVERPRHIEVQLLGDHHGNIVHLYERDCSVQRRHQKVVEIAPAPALPE
+GVREKILADALRLARHVGYQNAGTVEFLVDQKGNYYFIEVNARLQVEHTVTEEITGVDLVQAQIRIAEGK
+SLDDLKLSQETIQTTGSAIQCRVTTEDPAKGFQPDSGRIEVFRSGEGMGIRLDSASAFAGSVISPHYDSL
+MVKVIASARNHPNAAAKMIRALKKFRIRGVKTNIPFLLNVLRQPSFLDASVDTYFIDEHPELFQFKPSQN
+RAQKLLNYLGEVKVNGPTTPLATDLKPAVVSPPIPYIPAGAKPPTGLRDVLVQRGPTEFAKEVRSRPGCM
+ITDTTFRDAHQSLLATRVRTYDMAAISPFVAQSFNGLFSLENWGGATFDVSMRFLHECPWERLQTLRKLI
+PNIPFQCLLRGANAMGYSNYPDNVIYKFCELAVKNGMDVFRVFDSLNYLPNLLVGMEAVGKAGGVVEAAI
+AYTGDVTDKSRDKYDLKYYLNLADQLVKAQAHILSIKDMAGVLKPEAAKLLIGALRDKFPDIPIHVHTHD
+TSGAGVAAMLECAKAGADVVDAAVDSMSGMTSQPSMGAIVASLQGTKHDTGLSLDDISKYSAYWESTRQL
+YAPFECATTMKSGNADVYKHEIPGGQYTNLQFQAFSLGLGPQFDEVKRMYREANLVLGDIIKVTPSSKIV
+GDLAQFMVQNNLTRETLVDRADDLSFPKSVVDFMQGNVGQPPYGFPEPLRTKVLRGKPKVDGRPGENAKP
+VDLDAVKVELEEKHGRTLSEEDVMSYSMFPTVFDEFETFRQQYGPVDKLPTRLFLTGLEIAEEVDVEIES
+GKTLAIQLLAEGKLNKRGEREVFFDLNGQMRSIFVVDKEASKEIVTRPRALPGVRGHIGAPMPGDVLELK
+IKEGDKVTKKQPLFVLSAMKMEMVIDSPIAGTVKAIHAPQGTKCSAGDLVVEVEP
+>gb|EAL26409.1_uncharacterized_protein_Dpse_GA13539_isoform_A|Droso_pse
+MFIPVAQSAFKALRSAQPRVRLYFVSKNAYSSQVEYKPIRSVLVANRGEIAIRVFRACTELGIKSVAVYS
+EQDKMHMHRQKADESYLVGKGLPPVEAYLNIPEIIRVCKENDVDAVHPGYGFLSERSDFAQAVIDAGLRF
+IGPSPKVVQNMGDKVAARVAAIEAGVPIVPGTDGPVTTKEEALEFCKMHGLPVIFKAAYGGGGRGMRVVR
+KMEEVEESFQRASSEAKAAFGNGAMFIEKFIERPRHIEVQLLGDKAGNVVHLYERDCSVQRRHQKVVEIA
+PAPRLPIELRDKMTEAAVRLARHVGYENAGTVEFLCDESGNFYFIEVNARLQVEHTVTEEITGIDLVQSQ
+IRIAEGMTLPELGYTQENIQPRGYAIQCRVTTEDPANDFQPNTGRLEVFRSGEGMGIRLDSASAYAGAII
+SPYYDSLLVKVIAHAGDLQSSAAKMNRALREFRIRGVKTNIPFLLNVLENQKFLNGVLDTYFIDEHPQLF
+KFRPTQNRAQKLLNYLGEVLVNGPQTPLATTLKPAEVSPHVPAIPLDLSPEALEREERGEAKVTEPPCGL
+RDILVRQGPEAFAKEVRSRKNLMLMDTTFRDAHQSLLATRVRSHDLLKISPYVAHKFNNLYALENWGGAT
+FDVALRFLHECPWERLEEMRKRIPNIPFQMLLRGANAVGYTSYPDNVVYKFCELAVQTGMDIFRVFDSLN
+YLPNLILGMEAAGKAGGVVEAAISYTGDVSDPKRTKYDLKYYTNLADELVKAGTHVLCIKDMAGLLKPEA
+ATLLITAIRDKHPDIPIHIHTHDTSGAGVASMLACAQAGADVVDVAVDSMSGMTSQPSMGAVVASLQGTP
+LDTGVDLRVVSEYSAYWEQTRTLYAPFECTTTMRSGNADVYLNEIPGGQYTNLQFQAFSLGLGDFFEDVK
+KAYREANLLLGDIIKVTPSSKVVGDLAQFMVQNNLTADQVLEKAEELSFPKSVVEFLQGHIGIPHGGFPE
+PLRSRVLKDMPRIEGRPGAALEPLDFDKLKQDLKESHPNITDRDVMSSALYPQVTNEYLFFREKFGPVDK
+LDTRIFLTGPKVGEEFEVTLERGKTLSLKAMAMAADLKPNGDREVFFEMNGQLRTVHILDKEAVKEIHVH
+PKANKAVKSEVGAPMPGTVIDIRVAVGDKVEKGQPLVVLSAMKMEMVVQAPQAGVVKKLEIANGMKLEGD
+DLLMIIE
+>gi|BAH22705.1|pyruvate_carboxylase|Ehux
+MTKILLMLALALGAAGLRWPAAVPQRRATSGRAAGARLERAVGPVAVAPVEAPSVSRSSESAVDAMRGAA
+EAPSPFKKLMAANRAEIAVRIMRAATELNVATVAIYGYEDRFSQHRWGADQSFQLEKKDPADAAVRAYLD
+IEQIVALAKREGVDAIHPGYGFLSESPEFAQACSDAGITFVGPTVANLKTFSDKTTARVAAIAADVPVVP
+GTDEPVTTEAGARAFVEEYGLPVIIKAAMGGGGKGMRLVRDMEELGASFASASTEAEAAFGDGSVFLERY
+IESPRHIEVQIIGDGKGGAVHLCERDCSVQRRYQKVVEIAPAWSLDPALRNKLHEDSLRLMRSAKYLNAG
+TVEFLVDGEGRHYFIEVNPRIQVEHTVTEEVTGIDLVQAQMRIASGASFEEVGLVQDQIQARGIAVQCRV
+TTENPERNFAPDTGTLSVYRHSAGYGMRQDGIGYSGMTVTPYYDSLLVKYTARGSNWGEVIRRMTRALQE
+ARIRGVKTNIPFLLNVLTHPEFKAGVVTTGFIDEHPELLQVTGKNWDFANVHQADQEKVMQVEKLLRYLA
+NLAVNGHPKELGANPARLRTAPQPQVKPPRVLIPGKDDAPTAGRRPGGWRSLLLAEGPAAYAKAVREHKG
+LLVMDTTWRDAHQSLLATRMRTADLVKAGAATNAALSNAFSLEMWGGATFDVAMRFLHECPWQRLERLRE
+EVPDVPFQMLLRGANAVGYTNYPDNLVYRFCKQAAASGIDVFRVFDSLNYLENLKLGIEAAGEAGGFVEA
+AICYTGDITDPSKGKYTLDYYLEYARQLAQLGVHSIAIKDMAGLLKPRAAALLVGAIRKELPDMLIHVHS
+HDTAGNSLASMLSAAEAGADVVDVAIDSMSGITSQPSLGALAAATAGSELDIGVRPQDLEPLNSYWEQVR
+SLYAPFESGQLSGSSDVYRNEIPGGQYTNLLFQASQLGLGDQWVEVKRKYAQANLLLGDIPKVTPSSKVV
+GDLAQLMVAQKLEPDQLIEQAESLAFPDSVVSYFQGGIGLPPGGFPEPLRSKVLKGRSLEDGRAAYDGRP
+GATMKPYDFDKELGLLQASYPSNKGERDALSYALYPQVFRDWQEHRAVYGEVEALPTEAFLHPMAVGDEV
+EFATEPGRSWIVKLVSVPKPDENGQTQVIMELNGERWFVPVTDNSVQSATAREKAGGSPGSVGSPMPGVV
+VDVKVKPGDTIREGEPLVVLSAMKMETAIPAPASGVVERLLVSAGDKVEGDDLLAQIGEGAPKEEGGSSA
+KGGLFSSLFKGSGE
+>gi|CAA96765.1|PYC1|Saccharomyces_cerevisiae
+MSQRKFAGLRDNFNLLGEKNKILVANRGEIPIRIFRTAHELSMQTVAIYSHEDRLSTHKQKADEAYVIGE
+VGQYTPVGAYLAIDEIISIAQKHQVDFIHPGYGFLSENSEFADKVVKAGITWIGPPAEVIDSVGDKVSAR
+NLAAKANVPTVPGTPGPIETVEEALDFVNEYGYPVIIKAAFGGGGRGMRVVREGDDVADAFQRATSEART
+AFGNGTCFVERFLDKPKHIEVQLLADNHGNVVHLFERDCSVQRRHQKVVEVAPAKTLPREVRDAILTDAV
+KLAKECGYRNAGTAEFLVDNQNRHYFIEINPRIQVEHTITEEITGIDIVAAQIQIAAGASLPQLGLFQDK
+ITTRGFAIQCRITTEDPAKNFQPDTGRIEVYRSAGGNGVRLDGGNAYAGTIISPHYDSMLVKCSCSGSTY
+EIVRRKMIRALIEFRIRGVKTNIPFLLTLLTNPVFIEGTYWTTFIDDTPQLFQMVSSQNRAQKLLHYLAD
+VAVNGSSIKGQIGLPKLKSNPSVPHLHDAQGNVINVTKSAPPSGWRQVLLEKGPAEFARQVRQFNGTLLM
+DTTWRDAHQSLLATRVRTHDLATIAPTTAHALAGRFALECWGGATFDVAMRFLHEDPWERLRKLRSLVPN
+IPFQMLLRGANGVAYSSLPDNAIDHFVKQAKDNGVDIFRVFDALNDLEQLKVGVDAVKKAGGVVEATVCF
+SGDMLQPGKKYNLDYYLEIAEKIVQMGTHILGIKDMAGTMKPAAAKLLIGSLRAKYPDLPIHVHTHDSAG
+TAVASMTACALAGADVVDVAINSMSGLTSQPSINALLASLEGNIDTGINVEHVRELDAYWAEMRLLYSCF
+EADLKGPDPEVYQHEIPGGQLTNLLFQAQQLGLGEQWAETKRAYREANYLLGDIVKVTPTSKVVGDLAQF
+MVSNKLTSDDVRRLANSLDFPDSVMDFFEGLIGQPYGGFPEPFRSDVLRNKRRKLTCRPGLELEPFDLEK
+IREDLQNRFGDVDECDVASYNMYPRVYEDFQKMRETYGDLSVLPTRSFLSPLETDEEIEVVIEQGKTLII
+KLQAVGDLNKKTGEREVYFDLNGEMRKIRVADRSQKVETVTKSKADMHDPLHIGAPMAGVIVEVKVHKGS
+LIKKGQPVAVLSAMKMEMIISSPSDGQVKEVFVSDGENVDSSDLLVLLEDQVPVETKA
+>gi|AAA82937.1|pyruvate_carboxylase_precursor|Homo_sapiens
+MLKFRTVHGGLRLLGIRRTSTAPAASPNVRRLEYKPIKKVMVANRGEIAIRVFRACTELGIRTVAIYSEQ
+DTGQMHRQKADEAYLIGRGLAPVQAYLHIPDIIKVAKENNVDAVHPGYGFLSERADFAQACQDAGVRFIG
+PSPEVVRKMGDKVEARAIAIAAGVPVVPGTDAPITSLHEAHEFSNTYGFPIIFKAAYGGGGRGMRVVHSY
+EELEENYTRAYSEALAAFGNGALFVEKFIEKPRHIEVQILGDQYGNILHLYERDCSIQRRHQKVVEIAPA
+AHLDPQLRTRLTSDSVKLAKQVGYENAGTVEFLVDRHGKHYFIEVNSRLQVEHTVTEEITDVDLVHAQIH
+VSEGRSLPDLGLRQENIRINGCAIQCRVTTEDPARSFQPDTGRIEVFRSGEGMGIRLDNASAFQGAVISP
+HYDSLLVKVIAHGKDHPTAATKMSRALAEFRVRGVKTNIAFLQNVLNNQQFLAGTVDTQFIDENPELFQL
+RPAQNRAQKLLHYLGHVMVNGPTTPIPVKASPSPTDPVVPAVPIGPPPAGFRDILLREGPEGFARAVRNH
+PGLLLMDTTFRDAHQSLLATRVRTHDLKKIAPYVAHNFSKLFSMENWGGATFDVAMRFLYECPWRRLQEL
+RELIPNIPFQMLLRGANAVGYTNYPDNVVFKFCEVAKENGMDVFRVFDSLNYLPNMLLGMEAAGSAGGVV
+EAAISYTGDVADPSRTKYSLQYYMGLAEELVRAGTHILCIKDMAGLLKPTACTMLVSSLRDRFPDLPLHI
+HTHDTSGAGVAAMLACAQAGADVVDVAADSMSGMTSQPSMGALVACTRGTPLDTEVPMERVFDYSEYWEG
+ARGLYAAFDCTATMKSGNSDVYENEIPGGQYTNLHFQAHSMGLGSKFKEVKKAYVEANQMLGDLIKVTPS
+SKIVGDLAQFMVQNGLSRAEAEAQAEELSFPRSVVEFLQGYIGVPHGGFPEPFRSKVLKDLPRVEGRPGA
+SLPPLDLQALEKELVDRHGEEVTPEDVLSAAMYPDVFAHFKDFTATFGPLDSLNTRLFLQGPKIAEEFEV
+ELERGKTLHIKALAVSDLNRAGQRQVFFELNGQLRSILVKDTQAMKEMHFHPKALKDVKGQIGAPMPGKV
+IDIKVVAGAKVAKGQPLCVLSAMKMETVVTSPMEGTVRKVHVTKDMTLEGDDLILEIE
+
diff --git a/Proteorhodopsins.txt b/Proteorhodopsins.txt
new file mode 100644
index 0000000..e158926
--- /dev/null
+++ b/Proteorhodopsins.txt
@@ -0,0 +1,43 @@
+>jgi|Fracy1|267528|estExt_fgenesh2_kg.C_10531|Fragilariopsis_cylindrus
+MISGTQFTIVYDVLSFSFATMMATTIFLWMRVPSVHEKYKSALIISGLVTFIASYHYLRMFNSWTEAYEW
+TGEGELAKTGSPFNDAYRYMDWLLTVPLLLIEIILVMKLPADESKSKATTLGIASAAMIAIGYPGELFMS
+EDNLGGRWVYWIGAMLPFLYIVQTLLVGLNDATQSEADPAVRKLIKGVQWWTVIAWCTYPVVYIFPMMGI
+SGSNAIVGIQLGYSVSDIISKCGVGLLIYQITIAKSLALKNGNEETP*
+>gi|OLV16852.1|Proteorhodopsin|Deinococcus_marmoris
+MRQRFTPLTWIIATLAVLLGTALAQSQNAPVEAAKLSLSSGQFGLVYQMFSITIAAMGAGFIFFVLAQQN
+LSPKYRPAMVVSALVVAIACYHYFRIFNSWNESYALTAGAYVATAVPFNDAYRYADWILTVPLLLVEAVA
+VLALATNVASGMIWRLALAAFVMIATGYPGEISGDTTTRLIWGTISTIPFIYIVYTLFVELGKSIDRQPP
+RVQVLTRNLRLLLFASWGFYPIAYLLPIFLGGGGLSASGVVGLQVGYSIADILAKVGFGTLIYFIALEKT
+AHDRSMGVTEDSTTPPATELPTRPV
+>gi|APE26978.1|Proteorhodopsin|Erythrobacter_gangjinensis
+MPTIENFVEYAVWQYDMVRHAFAFTVAVFAAGLVYFAMTAYQTHPAFRATSIISAVVMVSAALEIGQLWM
+LWNESFAFNPATQTFQVVDGERFSNGYRYMNWMIDVPLLMTQLVVVAGFTGAALFKKWGLLTFTGIAMII
+TGYVGQYFEPAAAGIAGYENGEQLWIWGAISTVFMIWMILVLANAVRDPQGEASNEVRKGLINCFWFLVI
+TWAIYPIAYMWPVIDGSATGVVVRQTLYTVADVTSKLVFGVMLSQVALRRSAELGYRAAGVAMMVHTPSR
+NQLTADEREENVLDEDRSRTGSV
+>gi|KOO22837.1|rhodopsin|Chrysochromulina_sp.CCMP291
+MMFPVTAGQFDLVYNALSFTLASMMASTIFFWIRMGSVSEKYKSAMTITGLVTFIAAYHYIRIFNSWNES
+YHYPEAADGVVQDPVITGQPFNDAYRYMDWMLTVPLLMIEIIFVMGLSPEETAAKATSLGVAAGLMIVLG
+YPGELIIEGDLNVRWMWWTLAMIPFLYVVHTLLIGLQGAIKEEKNEEVAKKLNMVCWATVVSWCTYPIVY
+VFPMLGLDGPSAVVAIQLGYCVSDIISKCGVGFLIYNITIAKSNPEGYAQVH
+>gi|AKG94905.1|rhodopsin|Prorocentrum_donghaiense
+MVMYPMSDMQYQAVYNTLSFALASMMATTMYLWSRSTAVRDQFKSAVLISGLVTFIAAYHYIRIFNSWVE
+AYEYSAGKPDPELTGVPFNDAYRYMDWLLTVPLLLIEILLVMKLDEATYNVKSKTLGVGSALMIVSGYYG
+EPTVTGDLTPRWICWFVSMCFFLYIVFELLVGLKAAIESETDPTIKGKIQLAQVMTVISWCTYPVVYLFP
+MLGITASNAVVAIQIGYCVSDIISKCGVGLVIYQVTYAKSNKDGALLA
+>gi|AIN36550.1|rhodopsin|Alexandrium_fundyense
+MAPIPDGFSYGQWSVVYNALSFGIAAMGSATIFFWLQLPNVSKSYRTALTITGIVTFIATYHYFRIFNSW
+VEAFNVTNSGGGDYTVKLTGAPFNDAYRYVDWLLTVPLLLVELILVMKLPAEQTTSMSWKLGFASALMVA
+LGYPGEIQDDLTVRWVWWGLAMIPFCYVVYELVVGLNDATKRQASATVSSLISSARYLTVISWCTYPFVY
+IVKNIGLSGPTATMYEQVGYSVADVVAKAVFGVMIWAIASEKSKLEEQGSLMSS
+>gi|EGF32634.1|Proteorhodopsin|Oxalobacteraceae_bacterium_IMCC9480
+MIIGESFMEAVTLGQYELVYNAFSFAIAVMGAATIFFFLGRSQVASAYRTALTITGLVTLIAAYHYLRIF
+NSWEAAFVITGDQIKASGIKFNDAYRYVDWLLTVPLLLIELILVMRLPRAETIAKSTKLGLLAALMVVLG
+YPGEISADGGTRWMWWGLAMIPFLIIVYDLFVGLKKSIDSQPAAARGLISTARWVTVISWCFYPVVFVFP
+MIGFTGSSAATAVQVGYTVSDIVAKAMFGVLIYMIAVRKSEAEGQHA
+>gi|ADY17807.1|rhodopsin_type_II|Oxyrrhis_marina
+MAPLTGDFSYGEWNAVYNALSFGIAAMGSATVFFWLQLGNVSKNYRTALTITGIVTWIATYHYFRIFNSW
+VEAFEVNEVGGAYAVKVSGTPFNDAYRYVDWLLTVPLLLIELILVMKLPAGETAALSTKLGVASAVMVAL
+GYPGEIQENLAVRWFWWALAMIPFAYVVFSLLVGLGAATAKQPESVAGLVSAARYLTAVSWLTYPFVYII
+KNVGLAGPTATMYEQIGYSVADVMAKAVFGVLIWAIANEKSRLEGEGKLLR
+
diff --git a/SHMT.txt b/SHMT.txt
new file mode 100644
index 0000000..7b76b62
--- /dev/null
+++ b/SHMT.txt
@@ -0,0 +1,83 @@
+>jgi|Thaps3|269942|estExt_thaps1_ua_kg.C_chr_180031|SHMT3_TPS
+MMSLRSSLPALRRAAATQSARIALPSAINTCTDLHQHHNHANVRTLSSSSSSGASLNQRLTQVDPTLSTL
+IEQEKARQRSSLVLIASENFTSRAVLDALGSVLSNKYSEGYPGARYYGGNENIDRVELLCQERALETFGL
+SGEEWGVNVQSLSGSPANFQVYTALLETHDRILSLDLPHGGHLSHGFQTPTKKISAVSRYFESMPYRLNS
+TTGQIDYDEMERSAELFRPKLIVAGASAYSRLIDYERIREIADKVGAYVMADMAHISGLIAAEVIPSCFP
+YADVVTTTTHKSLRGPRGAMIFFRKGKKGETKKGEPIMYDLEEKINFAVFPGLQGGPHNHTIGALAVALK
+QANTPEFVEYQKQVLKNCARLNSELQSLGYEIVSGGTDNHLVLVNVKSSKGIDGARVERVLELACIASNK
+NTVPGDTSALNPGGIRMGTPALTSRGFMEEDFAKVAHYFDRAVSIANKLKNTEEGKKMKGFREMCAVGPS
+VDPELVQLRKEVSEFASSFPTVGFEESEMEFKGEYNVDFVA*
+>jgi|Thaps3|26190|estExt_fgenesh1_pm.C_chr_40017|SHMT1_TPS
+MDASLSSAYAEAVQASTSSPSLTTSDPDISRLIVLEEDRQRYGLELIASENFVSRAVKEALGSCLTNKYS
+EGQVGKRYYGGNEYIDEIETICMERALSLFGLDPSEWGVNVQPYSGSPANFAAYTALLQPHDRIMGLDLP
+SGGHLTHGFQTPKKKVSATSVYFESMPYVVNPTTGLVDYDDMERRAKMFMPKLLIAGGSAYTREWNYARM
+RTIADSVGAYLMVDMAHISGLVAGKVVANPFEYADLVTSTTHKTLRGPRSGMIFAKLDMMESINQAVFPM
+LQGGPHNHQIGALAVALREASSPEFVQYARDVVANANALGKGLVKRGHKLVTGGTDNHIVLWDVKSTTGL
+TGSKVERLLELASITANKNSIPGDTSAVNPGGVRLGSPALTSRGLKEEDFDKVAEFLHRGCELAVKVQAV
+AKVKSDDGKVLMRFFEATLKEDDALREELDVLKKDVESFAGKFEMPGF*
+>jgi|Thaps3|26031|estExt_fgenesh1_kg.C_chr_140007|SHMT2_TPS
+MSDSNKRAKMTSFKDSEFTGLKPLSEHDPLLFDLIEKEKLRQYTSLELIASENFTSRAVMDCLGSALTNK
+YSEGLPHARYYGGNEIVDQVEELCQKRALEAYGLDEKEWGVNVQPYSGSPANFAVYTGLLRPHDRIMGLD
+LPSGGHLTHGFYTYSKKEGTRKAVSATSVYFESLPYQVDQTTGIINYDQLERDASLFKPAMIIAGGSAYP
+RDWDYARFRKIADENGALLIMDMAHISGLVATKEQKSPFEYCDVVTTTTHKSLRGPRAGMIFFRRDERGF
+EHKINQAVFPALQGGPHEHQIAGVATQLLEVMTPEFHQYSAQVRKNAQALGNKLISLGYSLATGGTENHL
+VLWDLKPQKLTGSKFEKVCDAVSITLNKNCVPGDRSAVTPGGVRIGAPALTTRKMVEADFEQIAMFLHEA
+LTIALKIQEESGPKLVDFVKCLEQNGEVEGLRKRVNEFASGFPMPGFDPKEMKYKL*
+>jgi|Phatr2|18665|estExt_gwp_gw1.C_chr_30286|SHMT1_PTRI
+MTSFKDQEFRGLLSLEEHDPELFDLIEQEKSRQWRSLELIASENFTSRAVMDCLGSALTNKYAEGLPGAR
+YYGGNEVVDQVEALCQKRALEAYGLDPEKWGVNVQPYSGSPANFAVYTALLKPHDRIMGLDLPSGGHLTH
+GFYTYSKKEGTRKAVSATSVYFESLPYRVHPETGYIDYDQLERDAGLFKPAMIIAGGSAYPRDYDYKRFR
+EIADANGALLMMDMAHTSGLVATGELDSPFEYADVVTTTTHKSLRGPRAGMIFFRKDERGFESRINQAVF
+PALQGGPHEHQIAGVATQLKEVCSPDFKVYSQQVKKNAKALADKLTSMGYSMASGGTENHLVLWDLKPQG
+ITGSKFEKVCDAVSITLNKNCVPGDVSAVTPGGVRIGTPALTTRTMVESDFEQIGQFLHEALEITLAIQE
+KSGPKLKDFLPLLEKNADIEALKVRVHDFATTFPMPGFDPATMKYKNPAGPSH*
+>jgi|Phatr2|54015|estExt_Phatr1_ua_kg.C_chr_10105|SHMT2_PTRI
+MLSVRSTLAPAIRRIATRTFAAGADLNKTLLETDPELSQLIEQEKARQRNSLVLIASENFTSKAVLDALG
+SVLSNKYSEGYPGARYYGGNENIDQVELLCQKRALEAFHLDPAEWGVNVQSLSGSPANFQVYTALLETHA
+RILALDLPHGGHLSHGYQTATKKISMVSRYFESMPYRLDESTGTIDYDQMEKSADLFRPKMIVAGASAYS
+RLIDYERIRKIADGVGAYVMSDMAHISGLVAAQVIPSCFEYSDVVTTTTHKSLRGPRGAMIFYRKGQKGT
+DKKGNPIMYDLEEKINFTVFPGLQGGPHNHTIGALATCLKQAATADFVVYQKQVLKNSSRLAEELNKLGY
+TLVSGGTDNHLVLIDVKSSAKIDGARVERILELACIATNKNTVPGDTSALMPGGIRMGTPALTSRGFKED
+DFTKVAHFFDRAVKIAVKLKNTDQGAKLKGFREMCAVGPSVDADLVQLRHDVSEFACLFPTVGFNEDEMT
+FEGEYNVDFVA*
+>jgi|Phatr2|17456|estExt_gwp_gw1.C_chr_10370|SHMT3_PTRI
+MGSYTVRLIATWPIILLLLLSIFESVRAFSLTTHPRGGSQLHISMQDAKTKRIERSMEDFDPEIARMIGS
+EERRQRVGLELIASENFASKAVRQVLGSCLTNKYSEGNVGRRYYGGNAFIDQIETLCMKRALDLYELDTE
+EWGVNVQPYSGSPANFAVYTALLNPHDRIMGLDLPSGGHLTHGFQTPKKKVSATSVYFESMPYVVSADTG
+LVNYDDMEKRAKMFLPKLLIAGGSAYPREWDYSRMRQIADSVGAKLMVDMAHISGLVAGKVAESPFPYAD
+VVTSTTHKTLRGPRSGMIFARREYIDAVNSAVFPSLQGGPHNQQIGALAVALKEATEPDFLKYTKDVIAN
+AKALAAGLEKRGHVLATGGTDNHLMLWNVRQLGLTGSKVEKVLDLASITTNKNSIPGDTSALNPGGVRLG
+TPALTSRGMSENDFEKVAEFLHRGSEIALKAEHVAELELDRDNGQSKVLLKHFVAVLELDRDVRNQIDDL
+RKDVENFASQFEMPGSDL*
+>gb|CAJ03206.1|serine_hydroxymethyltransferase|Leishmania_major_strain_Friedlin
+MASLIPTLTEQDPELANMIELEMGRQFRGLEMIASENLTSKAVLECLGSALTNKYAEGEPGNRYYGGTVF
+VDMVENLAKKRALAAFGLDPGEWGVNVQPYSGSPANFAVYTALLEPHSRIMGLDLPSGGHLTHGFYTPKK
+KVSATSIYFESFPYHVKEDGLIDYDALESVALVFRPKMIITGASAYARDFDYERFRHVCDEVGSLLFMDM
+AHTAGLIAGGVLKSPFPYADVVTTTTHKSLRGPRAGMIFYRKKDRQGKPTDHESRINQAVFPGCQGGPHE
+HQIAAIATQMREVCSQEWKAYARQVQSNARALAAALSSKGHVFVSGGTDNHLLLWNVRVHGLTGSKVEKL
+LDAVSISVNKNTIPGDKSAMTPGGIRVGTLALTSRGMVEADMSTVAEFLDRAIVLAKQIQAAMNAVKLSD
+FVEALQTHAGAAALRKDVEAFATTFAMPSFDVERIKYKDGLPEEQ
+>gb|P50433.1|Serine_hydroxymethyltransferase|Solanum_tuberosum
+MAMAIALRRLSATVDKPVKSLYNGGSLYYMSSLPNEAVYDKEKSGVAWPKQLNAPLEVVDPEIADIIEHE
+KARQWKGLELIPSENFTSVSVMQAVGSVMTNKYSEGYPGARYYGGNEYIDMAETLCQKRALEAFRLDPAK
+WGVNVQPLSGSPANFQVYTALLKPHERIMALDLPHGGHLSHGYQTDTKKISAVSIFFETMPYRLDESTGY
+IDYDQLEKSATLFRPKLIVAGASAYARLYDYDRIRKVCNKQKAILLADMAHISGLVAAGVIPSPFDYADV
+VTTTTHKSLRGPRGAMIFYRKGVKEVNKQGKEVFYDYEDKINQAVFPGLQGGPHNHTITGLAVALKQATT
+PEYRAYQEQVLSNSSKFAQALGEKGYELVSGGTDNHLVLVNMKNKGIDGSRVEKVLEAVHIAANKNTVPG
+DVSAMVPGGIRMGTPALTSRGFLEEDFVKVADFFDAAVKIAVKVKAETQGTKLKDFVATLESSAPIKSEI
+AKLRHDVEEYAKQFPTIGFEKETMKYKN
+>gb|NP_193129.1|serine_hydroxymethyltransferase_4|Arabidopsis_thaliana
+MEPVSSWGNTSLVSVDPEIHDLIEKEKRRQCRGIELIASENFTSFAVIEALGSALTNKYSEGIPGNRYYG
+GNEFIDEIENLCRSRALEAFHCDPAAWGVNVQPYSGSPANFAAYTALLQPHDRIMGLDLPSGGHLTHGYY
+TSGGKKISATSIYFESLPYKVNFTTGYIDYDKLEEKALDFRPKLLICGGSAYPRDWDYARFRAIADKVGA
+LLLCDMAHISGLVAAQEAANPFEYCDVVTTTTHKSLRGPRAGMIFYRKGPKPPKKGQPEGAVYDFEDKIN
+FAVFPALQGGPHNHQIGALAVALKQANTPGFKVYAKQVKANAVALGNYLMSKGYQIVTNGTENHLVLWDL
+RPLGLTGNKVEKLCDLCSITLNKNAVFGDSSALAPGGVRIGAPAMTSRGLVEKDFEQIGE
+>jgi|Thaps3|262555|thaps1_ua_kg.chr_5000194|SLA_LP_TPS
+ASNSQTKALFHHKRLPDHGWTDVQIQRLLLELSVLDTNCEESVKWTGAGEREGRIYAPLVSQRHFGFGHG
+IGRSGDVMEAQPKAVGSSALLRLTLRLTLDAVRRGAGLNGTLGKGDSRNGPASFGTLLPVCTGMSMALVL
+SGLRDRARTLDSASIGTEHVNTERNIVLWSRIDQKSCYKSILSAGLKCVVLPTKKHPDTDEVSTDLEALK
+EALDSFGNSILAVLTTTSCFCPRVPDEVDQVAKMIMSAGVSHVVNHAYGLQCQTTNKLLNRACIIGRVDA
+IICSTDKNFLVPVGGALILSPDSNVIETISKNYPGRASSSPMVDLFITLLSMGLNGYKGILEERKRLTEL
+FGQSLQRVATVFGETVLNCPRNTISFGMTLDNLATINGSDDELNSLITKFGSMLFTRCISGTRVVPRGST
+KTISGHTFEGFGSSNDDYPYAYMTSACAVGMGEEEMNEFFVRLEKSWIDYRKKLEQ
diff --git a/SLC4.txt b/SLC4.txt
new file mode 100644
index 0000000..27d4890
--- /dev/null
+++ b/SLC4.txt
@@ -0,0 +1,62 @@
+>jgi|Phatr2|54405|estExt_Phatr1_ua_kg.C_chr_70011|SLC4_3_PTRI
+MPPKHESQEDLKMSTSKQDEDEVRTIDFLDHDDGNQGNGWGRGIVKDFRKTVGTHWVNEMTNFNQKSIAV
+SFFIFFAAVAPAITFGAVYSKTTNDAIGAVEMLIATAWCGIVYALIGGQPIMINGGTGPVLAFSAVLFDI
+ADNMDVNFLTLNAWTGLWVAGFLIIAAFVDLNRLMKHATRFTDEIFALLIASIFVIDALGSPFSDVGIYW
+YFTRSHDSHDEFEDQEDYSYMATAFLSAVLCLGTTWLAFFLRDIKFSPYFPNDSWRTLISDFAVVASILI
+WTLIANGLFDNVEVERLNVPDSITPTQICCTADCMTSFPDDCPDITPYGRRSWIVDLGAVNGKSWIPFFA
+AIPALLAFILVFLDDGITWHLINHPSNKLTHGDAYNWDTVVIAAMIAVNSMLGLPWLVAATVRSLTHVNA
+LAERSENGKIISVQETRLTHLGIHLLVLAALFALDVLKLIPVPVLYGVFLYMGVASLASNQFFQRFLMFF
+MQPSKYPHEPHTKYMAPKRMHLFTGIQLGLFVILTVFRSISVIAIAFPIVIKACIPVRMYILPRYF
+>jgi|Thaps3|13887|gw1.8.48.1|SLC4_1_TPS
+NGKPEKFFQLFTGIRTDLTTRLLPYYKSDWSRPKSIFTVINAIVFAFVVQLIPALIFAELMDRETKGNLA
+AAETLLSAGIIGIIYAIISGQPLTLLGITGPVAILLGTSYGLAEQFDSEYWPFFWWLCIWTAILHFLTAI
+TGLVNFVWHISPFTTQIFEFFIGCSFVFESIRDLVEPLHLGKNTYASLVIGMLAFAICWRLHFAETWTLF
+SRQVRTFLTSYNMAITVIIVTADQKDSNSHGIERVHVRAPWDWQPSVDRPWLIDPTEGISTKGIFGALFP
+AFMLYLLFFIDHNISSILTQAPKYNLKKPASYHWDFFCLGLTIVPCGLLGLPPGSGLIPQAPLHTRALAT
+RKILERHGVKQEVTVHVEEQRWSALGQASLMFVALSLFTVISWIPKGALFGVFLYLGVGALHGNEIWHHI
+TLSFMYAKKRPPVPIVANVKWSTVQLYTLVQVCCAAAIFGVAQFASVGYIFPALVAALVPIRSYFVAWCF
+SENDLQYLD
+>jgi|Phatr2|45656|estExt_fgenesh1_pg.C_chr_70326|SLC4_1_PTRI
+MTNLVSRAYIVALLCMSSCWHSAAFHTTSFGKTSLGLKISSSRSPTFSSLKKAKVIASVTTKPLTKLSDS
+MSVVSPPVDERENNKDDETLFEGPFKGIIRDYKARLPLFASDIKDGLNVQCLAATMFLFFACLAPAVGFG
+GLFDVATGGAIGTVEMVSSTALCGLIYAITSAQPLTIIGSTGPVLAFVACLAQLAKMLNLPFLQLYSWTG
+LWTSAILFVSSITSASNLVKYLTRFTDEIFSLLISCIFVFEAVSDVGRTFSSPASTFTKALLTLTCAAST
+FTIATLLKGLRKTSLFPSRVRNTISNFAPTIGVVTASLIARWARVVHGTKLAGLPSLSIPAVFGTTSGRP
+WLVPILDFPVWARWAAFLPALMATVLLFLDQNITVRLVNNPRWKMEKGRRKNNVLDGMHADMFIVSILTA
+AQSLVGIPWLVAATVRSLSHVGALSKYDKEGKVVGTIEQRMTGISIHSLIGCAVLFSKPRKLLTQVPLPV
+LMGLFMYLGTSSLPGNEMWERVTGLFKDKTVAPKQRWSDKVPDKVTSTFTLIQVACLGAMFWVKESPFGV
+LFPVVIAMLAPLRFALEKQGIIKKEYMDVLDEE*
+>jgi|Emihu1|99943|fgeneshEH_pg.21__120|Bicarbonate_transporter_SLC4family
+MSKREEYPGDDNYSSVNHVADALETPATAGHDAEADPFAEETGSEPKSVPETPDQESGMVGAATKSSGAK
+RRKGKKPISELAPLEFSGRFAGGLRADLLRRVPLYVSDWTEAFTGGNCMKTTASICFLFFACLSPAVTFG
+AAFADATDNQLGVIETIISSGMSGLIYSFLSGQPLCILGATGPELAFPVVFYEICQWGGMEVDFLAARVW
+QALWCSLFTIIVALFDLSACMKVCTRFTEEIFSFLISIIFIVGAFTTLIKLYLADPDVEGDDPAAPANRA
+KAFLGTLLGLFTYFTAMWCRAFPKRNETTPLVRKLVANYGVTLSILLYSGINYGFRDVDVPCLDMPDEIV
+PTATLNGTGESRGWFVNPFGDETASGYDTPGVGFIFFAAVPALGLAVLGYLDQNLTTLLINRKDHNLKKG
+GAYHLDLLVCGIFIYPICGFFGLPFTHAATVRSMSHLMSLMTREDSTNEHGQTVSKVTNVVEQRVTHLGI
+HCLLLAALGLSAVLTKIPKVVLAGVFLYMGVTALPGVQLYERLWLWLIWDPKKYPQYDYVTQVARKPLHL
+YTLFQFSCLAVLYALTKVPNPYISVIFPFFIAFLPLIRKLVPKCFPSVWSKEDLKALDK
+>jgi|Thaps3|267979|estExt_thaps1_ua_kg.C_chr_10120|SLC4_2_TPS
+MRNDFARRRKWYISDWTDAFKKKRQVIPAVLFLYFACLAPAVSFGTIASEITNGSIGVVEFLLSSGMAGM
+LYSITCGQPMAFLAPTGLTLAFISGLFRFCTLRNLPFFPVYAWVGLWSSAFMMILGLSGSSKLIRYCTRF
+TDEVFNGLLSVNFIYEAFSSLRRNFVNADPMNLTMPFVALSMALGTFFSTMKVVKFESSKFFNTKVRGVI
+KNFGPVSVILFFTLVNLLPWFQKFHVPTLSVPDTFQLAGGRSFLVSLKEIPVKVRWLCALPAWLLTCLFF
+MDQNISVRLVNNPDNKLKKGEAYNMDMVALGGITGVLSVLGLPWMCGATVQSMNHVRAMSEMKVNEETGE
+TEVEVTETRLTGFTIHALLASTVLLLPWIKKIPIPVVSGVFLFLGRKLMTGNTFFKRVTDAFAESKRLRE
+DHPINLLGRKKMNAFTGIQVLCLLGLFAFKQIPSITIFFPAMILFLMFIRSFVLPKYFSEEEFVALEDPT
+PS*
+>jgi|Phatr2|32359|fgenesh1_pg.C_chr_1000960|SLC4_2_PTRI
+MKQSSKRHRKDGALQHTVLWIGILSAFCTTGSAFTSSALGRTKPSSLHLVPGSAAVLNLGRRPGKRSNYL
+RLSLPADRRTSVGSSKNKDNTDSTNNDATQSIEGTKEDVKEKIQFSPSYLEQIDRMRGYRRKRQWKRVLE
+EYSNGNSTETTAQKHAKNLFDTIVSQEMRDDIRRRKKVYWSDWEDGFKNKRKVIPAILFLYFACLSPAVS
+FGTIASEITQGSIGIVEFLLSSGLSGMAYAMMCGQPMAFIAPTGLTLAFISGLYRFCMVKALPFFPIYAW
+VGLWTSFFFVLLGLGGSSQLIRFCTRFTDEVFNALLSVNFIYEAVASLKRNFDLADPMNLTMPFVSLAMA
+LSTFWCTAKVAAFESSKYLNQKIRSIVKDFGPVTIFILMSIFNQRAWMKKFKVPTLTVPSSFQLSGGRNF
+LINLNAIPLNIKLACVLPAILLTSLFFMDQNISVRVVNNPDNKLKKGAAYNLDMVALGLITSCLSLVGLP
+WMCGATVQSLNHVRALTETRFNERTGEPEIIGVTETRVTGFAVHALICSTLAILPLLRFVPIPVVAGVFL
+FLGRKLMSGNSFLQRIRDCFVEKSRLPADHPIRYIGRKKTNIFTVTQIGCLGGLWFFKQNSTTAIFFPSV
+IGLLMLIRAFVLPKVFTEDELIDLGDPSPN*
+>gi|NM_003040.3_translation|SLC4_2|Homo_sapiens
+MSSAPRRPAKGADSFCTPEPESLGPGTPGFPEQEEDELHRTLGVERFEEILQEAGSRGGEEPGRSYGEEDFEYHRQSSHHIHHPLSTHLPPDARRRKTPQGPGRKPRRRPGASPTGETPTIEEGEEDEDEASEAEGARALTQPSPVSTPSSVQFFLQEDDSADRKAERTSPSSPAPLPHQEATPRASKGAQAGTQVEEAEAEAVAVASGTAGGDDGGASGRPLPKAQPGHRSYNLQERRRIGSMTGAEQALLPRVPTDEIEAQTLATADLDLMKSHRFEDVPGVRRHLVRKNAKGSTQSGREGREPGPTPRARPRAPHKPHEVFVELNELLLDKNQEPQWRETARWIKFEEDVEEETERWGKPHVASLSFRSLLELRRTLAHGAVLLDLDQQTLPGVAHQVVEQMVISDQIKAEDRANVLRALLLKHSHPSDEKDFSFPRNISAGSLGSLLGHHHGQGAESDPHVTEPLMGGVPETRLEVERERELPPPAPPAGITRSKSKHELKLLEKIPENAEATVVLVGCVEFLSRPTMAFVRLREAVELDAVLEVPVPVRFLFLLLGPSSANMDYHEIGRSISTLMSDKQFHEAAYLADEREDLLTAINAFLDCSVVLPPSEVQGEELLRSVAHFQRQMLKKREEQGRLLPTGAGLEPKSAQDKALLQMVEAAGAAEDDPLRRTGRPFGGLIRDVRRRYPHYLSDFRDALDPQCLAAVIFIYFAALSPAITFGGLLGEKTQDLIGVSELIMSTALQGVVFCLLGAQPLLVIGFSGPLLVFEEAFFSFCSSNHLEYLVGRVWIGFWLVFLALLMVALEGSFLVRFVSRFTQEIFAFLISLIFIYETFYKLVKIFQEHPLHGCSASNSSEVDGGENMTWAGARPTLGPGNRSLAGQSGQGKPRGQPNTALLSLVLMAGTFFIAFFLRKFKNSRFFPGRIRRVIGDFGVPIAILIMVLVDYSIEDTYTQKLSVPSGFSVTAPEKRGWVINPLGEKSPFPVWMMVASLLPAILVFILIFMETQITTLIISKKERMLQKGSGFHLDLLLIVAMGGICALFGLPWLAAATVRSVTHANALTVMSKAVAPGDKPKIQEVKEQRVTGLLVALLVGLSIVIGDLLRQIPLAVLFGIFLYMGVTSLNGIQFYERLHLLLMPPKHHPDVTYVKKVRTLRMHLFTALQLLCLALLWAVMSTAASLAFPFILILTVPLRMVVLTRIFTDREMKCLDANEAEPVFDEREGVDEYNEMPMPV
+>gi|NM_000342_translation|SLC4_1|Homo_sapiens
+MEELQDDYEDMMEENLEQEEYEDPDIPESQMEEPAAHDTEATATDYHTTSHPGTHKVYVELQELVMDEKNQELRWMEAARWVQLEENLGENGAWGRPHLSHLTFWSLLELRRVFTKGTVLLDLQETSLAGVANQLLDRFIFEDQIRPQDREELLRALLLKHSHAGELEALGGVKPAVLTRSGDPSQPLLPQHSSLETQLFCEQGDGGTEGHSPSGILEKIPPDSEATLVLVGRADFLEQPVLGFVRLQEAAELEAVELPVPIRFLFVLLGPEAPHIDYTQLGRAAATLMSERVFRIDAYMAQSRGELLHSLEGFLDCSLVLPPTDAPSEQALLSLVPVQRELLRRRYQSSPAKPDSSFYKGLDLNGGPDDPLQQTGQLFGGLVRDIRRRYPYLSDITDAFSPQVLAAVIFIYFAALSPAITFGGLLGEKTRNQMGVSELLISTAVQGILFALLGAQPLLVVGFSGPLLVFEEAFFSFCETNGLEYIVGRVWIGFWLILLVVLVVAFEGSFLVRFISRYTQEIFSFLISLIFIYETFSKLIKIFQDHPLQKTYNYNVLMVPKPQGPLPNTALLSLVLMAGTFFFAMMLRKFKNSSYFPGKLRRVIGDFGVPISILIMVLVDFFIQDTYTQKLSVPDGFKVSNSSARGWVIHPLGLRSEFPIWMMFASALPALLVFILIFLESQITTLIVSKPERKMVKGSGFHLDLLLVVGMGGVAALFGMPWLSATTVRSVTHANALTVMGKASTPGAAAQIQEVKEQRISGLLVAVLVGLSILMEPILSRIPLAVLFGIFLYMGVTSLSGIQLFDRILLLFKPPKYHPDVPYVKRVKTWRMHLFTGIQIICLAVLWVVKSTPASLALPFVLILTVPLRRVLLPLIFRNVELQCLDADDAKATFDEEEGRDEYDEVAMPV
diff --git a/SPT.txt b/SPT.txt
new file mode 100644
index 0000000..dd80eb8
--- /dev/null
+++ b/SPT.txt
@@ -0,0 +1,59 @@
+>gi|223999211|ref|XP_002289278.1|T.pse|SPT_AGT_TPS
+MSSSMRAASSLLRSIPRATSFATVSSKPSHATQLISRNTNHLISPIAATTSLSSSATSSHRFFSSTGPPE
+EDELHYTSVAKGDMGEFQEYSVIFTNRALNLMSKPFQQVMRDLNMLLKKTYNADKVAIMPGSGTFGMEAV
+ARQFATDKHVMVIRNGWFSFRWTEIFDMGGHNHTIPSSHTVLKAQPVEPEDPNCPHMQYAPYPIDEVVAK
+VMEERPAVLFAPHVETSTGMILPDDYIRKAAKAVHDVGGLFVLDCIASGAIWADMKDLGVDSIISAPQKG
+WTGPACCALVMLSERAAEVMAETQETSFSMSLKRWCAIMDTYEKGGFGYHTTMPTDGLRDFHEISVETLN
+FGLPELKQAQYKLGAVARELLDSRGLTSVAAPGFQAPGVLVYYSPLGQDNPAMMNKFKVHGLQIAMGVPW
+RIDEPDGLKTFRLGLFGLDKMGDIPKCVGTLQKSLDAVLAESGHSIPEKKAA
+>jgi|Phatr2|40344|fgenesh1_pg.C_chr_23000065|SGAT_PTRI
+MFRSVASLALRGSIGTGRGVAQSPRVVPFGSAVTVRHSSNSHTNSSSHTPERLRYNVIPKSDFGAFKEYS
+VIHTDRSLNLMSDPFQRVMRDLNELLKVTYNADKVVILPGSGTFGMEAVARQFAQNEHVMVIRNGWFSYR
+WTEIFEMGSSEPGVEAGGVGAGIPTSHTVLKAQPVPVPGNDTGSSNTKTTHFAPHPIQDVVSRIHQERPA
+VLFAPHVETSTGMMLPDEYIQKAAQAMHDIGGLFVLDCIASGTVWVDMKALGVDVLISAPQKGWTGPPCA
+ALVMMSDRAVARMSQTSETSFSMSLKRWAALMDTYEKGGFAYHTTMPTDALRDFHEISVETLRFGLPELK
+TAQLNLGWWARGTLDRKGLVSVAAPGFQAPGVLVYYSPSQTDNPVMMSSFKAQGLQIAMGVPWKIDEPEG
+LKTFRIGLFGLDKLGKPDETIRVMEEALDQVLDSVGHTAKSKKVA*
+>gi|XP_003064521.1|SGAT|Micromonas_pusilla_CCMP1545
+MKCPHDPSKPFHKSPLPLDHAGGLLEYSVVYTDRAMNHMSAPFCKIMNDIDATMKEAYNCSATIVMPGSG
+SYGMEAVARQWATNKKVLVLRNGYFSYRWTDIFEQTGIPSETIVLKGQPADNSSNPQFMPHDIEEVCAAI
+AREKPAVVFAPHVETSTGIILPDEYISRVSRAVHDVGGLFVLDCIASGTVWVDMKATGVDAILSAPQKGW
+TGPACSSLMMLSERGEHATRNTTSTSMVINMRKWLEIMDSYTNGGFAYYTTMPTDALGLFRDAALETKEI
+GFAKTKRMAWDLGDECRDMMKSKGLKTVSADGYEAPGVSVWYTPEPDMFNKFKKEGFQIAAGVPFMINEP
+PGNFTFRIGLFGLDKICNKDNTIKTLEGTLEKILASSAGGAKAAAA
+>gi|NP_495885.1|Serine--pyruvate_aminotransferase|Caenorhabditis_elegans
+MISTRFLRPSVSIFGFGIKSSMSSRAPPKALLQDMVVPPRQLFGPGPSNMADSIAETQSRNLLGHLHPEF
+VQIMADVRLGLQYVFKTDNKYTFAVSGTGHSGMECAMVNLLEPGDKFLVVEIGLWGQRAADLANRMGIEV
+KKITAPQGQAVPVEDIRKAIADYKPNLVFVCQGDSSTGVAQPLETIGDACREHGALFLVDTVASLGGTPF
+AADDLKVDCVYSATQKVLNAPPGLAPISFSDRAMEKIRNRKQRVASFYFDAIELGNYWGCDGELKRYHHT
+APISTVYALRAALSAIAKEGIDESIQRHKDNAQVLYATLKKHGLEPFVVDEKLRLPCLTTVKVPEGVDWK
+DVAGKMMTNGTEIAGGLGATVGKIWRIGTFGINSNSTKIENVVELLSKSIGEKSK
+>gi|WP_033827098.1|serine--pyruvate_aminotransferase|Bacillus_andreraoultii
+MRNKELLLIPGPTPVADSIYDAMVQETWGHTDLRFAKMYKESIEATKQMLKTDGEVFVISGSGTLAMEMA
+LVNTVASGEKLLVISHGYFGDRFIKLGQAYGIEIDVVQSEWGKHIDVTEVDKKLSENKYKAVTITHADTS
+TGVASNLDLLVPLIKKHGALVILDGVCATGAIEEDMSKTYGSPDAKIDVVLTGSQKAIGVPPGLAIVAFN
+QTALAAREELDRVPAYYCDIKNWLPIMHDPTKYFATPAVNMIYGYREGMRLVLEEGMEKRYIRHKQYGQA
+VRSSLREYGMKPLADEGVAAATLSCILYPDGVDDAEFRSSLAKKGVIVAGALAHLSGKAFRIGHMGNTTK
+EMLAEAIERIGETLIELGLAANIDRALEQFEESFNVTIN
+>jgi|Emihu1|123208|fgeneshEH_pg.1949__1
+MRAANASEAIVERIIATHLRPGIANLAPGTAHWSPPERLVEAALRGDGGYGDIRGEPALLAALREEHGRE
+HVMVTPGANQAFVHALLSTCDVGDEVLLWRPYYFSHLVALQLLGLVPVFADCDERGEPTHDLHQCHQQCS
+LCSPLRVGLPTPGAAYEHFTYGAAEHASAAELCEAGGGELLLCLRTFSKSYGLAAWRVGHLSYPHQLHDA
+MLKARDHARSPEIARDCTTRCSRRWDAPPTRTRGGWCSNGWWMCGQALEGLGEAWVREQASGEIWGDLGR
+SGEVATLEPARAMLWEALAPLRGGGDALQPAGAFYYFVRLPGCSSPGARVLGEGGEGGEAAEEEPSGLLH
+ASRLADCEAEEEAVRRLAAEHELLTLPGSAFGRPGHLRLSYGRLASAEEAEPIAERLFRAAEALRARWGE
+T*
+>gi|NP_178969.1|alanine:glyoxylate aminotransferase [Arabidopsis thaliana]
+MDYMYGPGRHHLFVPGPVNIPEPVIRAMNRNNEDYRSPAIPALTKTLLEDVKKIFKTTSGTPFLFPTTGT
+GAWESALTNTLSPGDRIVSFLIGQFSLLWIDQQKRLNFNVDVVESDWGQGANLQVLASKLSQDENHTIKA
+ICIVHNETATGVTNDISAVRTLLDHYKHPALLLVDGVSSICALDFRMDEWGVDVALTGSQKALSLPTGLG
+IVCASPKALEATKTSKSLKVFFDWNDYLKFYKLGTYWPYTPSIQLLYGLRAALDLIFEEGLENIIARHAR
+LGKATRLAVEAWGLKNCTQKEEWISNTVTAVMVPPHIDGSEIVRRAWQRYNLSLGLGLNKVAGKVFRIGH
+LGNVNELQLLGCLAGVEMILKDVGYPVVMGSGVAAASTYLQHHIPLIPSRI
+>gi|ONM02788.1| Serine--glyoxylate aminotransferase [Zea mays]
+MVDYVYGPGRTHLFVPGPVNIPDPVIRAMNRQNEDYRSPAVPALTKVLLEDVKKIFKTTTGTPLMIPTTG
+TGAWESALINTLSSGDRVVSFLIGQFSLLWIDQQRRLGFDVDAVESEWGQGADLAALERRLRDDAPRHAI
+KAVAIVHNETATGVTNDLAAVRALLDKHAHPALLLVDGVSSICALDFRMDEWGVDVALTGSQKALSMPTG
+MGIVCASPRALEASKTARFYDMGTYWPYTPSIQLLYGLRTALDLIFEEGLDNVVRRHNRLGTATRLAVEA
+WGLSNCCQKEEWFSDIVTAVVVPPNIDSAEVVRHAWKRYNLSLGLGLNKVAGKVFRIGHLGNLNELQLLG
+CLSGVEMVLKDVGYPVKLGSGVAAAAAYLSNSTPLIPSRI
diff --git a/TSR.txt b/TSR.txt
new file mode 100644
index 0000000..617ba7e
--- /dev/null
+++ b/TSR.txt
@@ -0,0 +1,26 @@
+>jgi|Thaps3|413|fgenesh1_pm.C_chr_4000054|TSR_TPS
+MSSVAFVGLGNMGRNMAMNLARNKPSVVTLLTVHDSHEPTLSSFMEQAKINGLSVSSTPNLASFADSNPD
+VIITSLPSCEASAAVVGEIVESLSPSREAIFIDTSTISVTTSRKLHELVTSTSTKFDYVDAPVSGGVKGA
+TDASLTFMVGCSSLATLSSVQPILQRMGKDIIPCGGPGSGSAVKLCNNAALAAQMLGVCEAMNLGDKLGV
+DPAVLAGVMNVSTAKSWSSTVNNPHPVAARGIGSGASANEYEGGFGTSLMLKDLNLAIDTAEEEHVSMPV
+TSLARELYRIADSHGYGKKDFGVMLQFLRGRDGSGDTR*
+>jgi|Phatr2|45141|estExt_fgenesh1_pg.C_chr_60132|TSR_PTRI
+MGLTAISVRRLSAFARFHGRRLALQYTCIRYYEDEAYTNAVVGFIGLGNMGLPMARNLAKKNKILAFDTN
+PDARHAASISIMEVSDTISHLKDCSMIFTMLPGCQVVDQVMSDLHNVVDHQNTIIVDCSTVSPTTSRRWH
+DAWKVNGCAMLDAPVSGGTKGAMEGTLTFMVGYDDKMRFEQAKPFLYCMGDRIIPCGGPGTGAATKLCNN
+VALAAQMVGICEAMNLGESLGVDPVLLAEVMNTSTASCWSSKVNNPHPSVARASGSPASQDYVGGFSARL
+MLKDLGLAAQAAEDNGVALPLVATSRELYKLAGLRGMADRDFGIMLQLLRGK*
+>gb|ELX81395.1|TSR|Salmonella_enterica_subsp._enterica_serovar_Dublin
+MKLGFIGLGIMGSPMAINLARAGHQLHVTTIGPVADELLSLGAVNVETARQVTEFADIIFIMVPDTPQVE
+EVLFGEHGCAKTSLQGKTIVDMSSISPIETKRFAQRVNEMGADYLDAPVSGGEIGAREGTLSIMVGGEQK
+VFDRVKPLFDILGKNITLVGGNGDGQTCKVANQIIVALNIEAVSEALVFASKAGADPVRVRQALMGGFAS
+SRILEVHGERMINRTFEPGFKIALHQKDLNLALQSAKALALNLPNTATCQELFNTCAANGGSQLDHSAMV
+QALELMANHKLS
+>gb|AKK40661.1|TSR|Escherichia_coli_APEC_O2-211
+MIDMTMKVGFIGLGIMGKPMSKNLLKAGYSLVVADRNPEAIADVIAAGAETASTAKAIAEQCDVIITMLP
+NSPHVKEVALGENGIIEGAKPGTVLIDMSSIAPLASREISEALKAKGIDMLDAPVSGGEPKAIDGTLSVM
+VGGDKAIFDKYYDLMKAMAGSVVHTGEIGAGNVTKLANQVIVALNIAAMSEALTLATKAGVNPDLVYQAI
+RGGLAGSTVLDAKAPMVMDRNFKPGFRIDLHIKDLANALDTSHGVGAQLPLTAAVMEMMQALRADGLGTA
+DHSALACYYEKLAKVEVTR
+
+
diff --git a/mafft_hmmbuild.sh b/mafft_hmmbuild.sh
new file mode 100644
index 0000000..7918829
--- /dev/null
+++ b/mafft_hmmbuild.sh
@@ -0,0 +1,8 @@
+#!/usr/bin/env bash
+# Align each gene family's FASTA file (<name>.txt) with MAFFT, then build a
+# profile HMM (<name>.hmm) from the alignment using ./hmmbuild in this directory.
+for family in SLC4 Bestrophin CA_beta CA_delta CA_alpha CA_zeta GOX GDCT PGP GCL HR SPT TSR ICL PK PEPC PEPCK MDH OMT ME PPDK PYC SHMT MS GlcDH ALAT_GGAT GK; do
+    # A failed alignment still leaves an (empty) _aln.txt behind; skip hmmbuild for it.
+    mafft "${family}.txt" > "${family}_aln.txt" || { echo "mafft failed for ${family}" >&2; continue; }
+    ./hmmbuild "${family}.hmm" "${family}_aln.txt"
+done
\ No newline at end of file
diff --git a/rip_counts_MMETSP.py b/rip_counts_MMETSP.py
new file mode 100644
index 0000000..c07d4dd
--- /dev/null
+++ b/rip_counts_MMETSP.py
@@ -0,0 +1,39 @@
+#!/usr/bin/python
+"""Fetch the read-count file of every MMETSP taxon found in a local directory.
+
+Input: directory t holding <ID>.pep.fa.gz files.  Output: one
+<ID>_cds_counts.txt per ID, written to the "counts" folder inside t.
+"""
+
+#import libraries
+import sys
+import os
+from ftplib import FTP #import ftp library
+import re #import regular expression tools
+
+# taxa directory
+t='/Users/maria_hernandez/Documents/Big_Data3050/CMM_MoreSP'
+files= os.listdir(t)
+
+#Pull the taxon/strain ID out of each .pep.fa.gz file name; the dots are
+#escaped so that "." only matches a literal dot.
+hits=[re.search(r'(\S*)\.pep\.fa\.gz', name) for name in files]
+taxa=[hit.group(1) for hit in hits if hit]
+print(taxa)
+
+countdir= t+"/counts"
+if not os.path.isdir(countdir):
+    os.makedirs(countdir) #open() below fails if the folder is missing
+
+ftp= FTP('ftp.imicrobe.us') #set ftp server
+ftp.login() #log in (no credentials supplied)
+try:
+    for ID in taxa:
+        #NOTE(review): remote file name is assumed to equal the local one - confirm on the server
+        remote= ID+"_cds_counts.txt"
+        savefile= countdir+"/"+remote
+        #RETR needs the server-side path, not the local savefile path
+        with open(savefile, 'wb') as out:
+            ftp.retrbinary("RETR camera/combined_assemblies/"+ID+"/readcounts/"+remote, out.write)
+finally:
+    ftp.quit() #close ftp connection even when a download fails
\ No newline at end of file