diff --git a/ALAT_GGAT.txt b/ALAT_GGAT.txt new file mode 100644 index 0000000..e5e019b --- /dev/null +++ b/ALAT_GGAT.txt @@ -0,0 +1,42 @@ +>XP_002289904.1|alanine_aminotransferase|Thalassiosira_pseudonana_CCMP1335 +MQYAVRGEVVIRADAMAAEGRKIIYTNIGNPHAVGQKPITYYRQVLSLCDLPAECGVDNTQVAAAFPSDV +IERAIEMRDAIGPAGTGAYTNSQGIGKFRDDVAHFITARDEHVALPSNIFLSNGASAAIENVLTGLIGSN +RDAIMIPIPQYPIYSAIISRLGARQVGYFLERRTAAVERDGLDIRALTLINPGNPTGQVLGREDLEIICT +FCAKHNIVLLADEVYQRNIYDDKKEFVSAKKVAVETPGCENLQLISFHSTSKGLIGECGRRGGYMELHNI +DPYVQTQLYKLASSGLCSGVDGQMMTSLMVRPPLPGEESHELFSRQEFEIFSSLKRRAVSLVRGLNDIDG +MTCTPAEGAMYAFPRVELPPKALDAAAINDQTPDNLYALSLLEETGICVVPASGFGQKEGRIGFRTTFLP +PEDELNQAVVEFKRHHEWFCEKYA +>OEU21541.1|alanine_aminotransferase|Fragilariopsis_cylindrus_CCMP1102 +MEYAVRGTVVIAADRINDELKAEQSMGAESKYKFQKIIYTNIGNPQSVGQQPLTWPRQVLALIDLPDEEG +INHPNIQNIFPSDAIARARTIKIGLGGNGSGAYSHSKGIKMFREDVCTFLQNRDGIDVPTDVENIFLSNG +ASAAIFNLLTSLIADNKCGIMIPIPQYPIYSASVEQLGGQKVGYYLDEKNKWNLSIDELERSLKEALENG +TNVVAFVLINPGNPTGAVLTKQTVQDVVKFCSKHNLVLLADEVYQENVYNEQDKFYSCKRAAYDCGLLET +NSIELASFHSTSKGVFGECGRRGGYMELTGFDENIKNQLYKLASASLCSTVNGQCMTSLMCRGPSPDDVS +YESHEKEKLDIFNSLKKRSKIVNDGLNSIDGFSCQPAQGAMYCFPSIDNMPMKAINEAAEQNITPDTLYA +LSLLERTGICVVPASGFGQRPGRYGFRTTFLPSEDDMAYSVNAMKDHHKEFCQKYA +>NP_005300.1|alanine_aminotransferase1|Homo_sapiens +MASSTGDRSQAVRHGLRAKVLTLDGMNPRVRRVEYAVRGPIVQRALELEQELRQGVKKPFTEVIRANIGD +AQAMGQRPITFLRQVLALCVNPDLLSSPNFPDDAKKRAERILQACGGHSLGAYSVSSGIQLIREDVARYI +ERRDGGIPADPNNVFLSTGASDAIVTVLKLLVAGEGHTRTGVLIPIPQYPLYSATLAELGAVQVDYYLDE +ERAWALDVAELHRALGQARDHCRPRALCVINPGNPTGQVQTRECIEAVIRFAFEERLFLLADEVYQDNVY +AAGSQFHSFKKVLMEMGPPYAGQQELASFHSTSKGYMGECGFRGGYVEVVNMDAAVQQQMLKLMSVRLCP +PVPGQALLDLVVSPPAPTDPSFAQFQAEKQAVLAELAAKAKLTEQVFNEAPGISCNPVQGAMYSFPRVQL +PPRAVERAQELGLAPDMFFCLRLLEETGICVVPGSGFGQREGTYHFRMTILPPLEKLRLLLEKLSRFHAK +FTLEYS +>AAC62456.1|alanine_aminotransferase|Zea_mays +MAASVTVENLNPKVLKCEYAVRGEIVIHAQRRQQQLQTQPGSLPFDEILYCNIGNPQSLGQQPVTFFREV +LALCDHPCLLEKEETKSLFSADAISRAKQILATIPGRATGAYSHSQGIKGLRDAIAAGIMSRDGFPANAD 
+DIFITDGASPGVHMMMQLLIRNEKDGILCPIPQYPLYSASIALHGGTLVPYYLNEKNGWGLEISDFKTRL +EDVRSKGIDVRALVVINPGNPTGQVLAEDNQYDIVKFCKNEGLVLLADEVYQENIYVDNKKFNSFKKIVR +SMGYGEDDLPLVSLQSVSKGYYGECGKRGGYMEITGFSAPVREQIYKIASVNLCSNITGQILASLVMNPP +KAGDESYASYKAEKDGILESLARRAKALEDAFNKLEGFSCNKAEGAMYLFPQIHLPQKAIEAAKAAKKAP +DAFYALRLLESTGIVVVPGSGFGQVPGTWHIRCTILPQEDKIPAVISRFRAFHEAFLAEYRD +>XP_009315638.1|alanine_aminotransferase|Trypanosoma_grayi +MSTSRKAIHINPRVVEAQYAVRGLIPMRADEIKNALATPEGKGKYPFSSLVYCNIGNPQALEQKPLTFNR +QVMSLVDAPFLLDNAAIKAQYPADAVARAQEYLSHIGNRTGAYTDSAGYAFVREIVARHINERDHGAKPL +MDASSIMLTDGASTGVRLLLQILIGDASDGVMIPIPQYPLYTAQIALLGGTPAMYYLDENKGWALNVADL +ASAYDECVAQRKATPRVLVVINPGNPTGGVLERGVMEAVAKFCCDRGMVLMADEVYQENIYAEGKRFVSF +REVVLGLPAPYNTDTVLASLHSTSKGIIGECGRRGGYFSLTNAPAALTEQVVKMSSINLCSNVNGQLMTA +LMCAPPRAGDASYDAYWAEYNAIFGSLKKRALMLAKELNSIRGFACQPVEGAMYAFPTIQLPEKYAQHNA +ELNAREGRKLAPDARWALELLESSGIVVVPGSGFGQQPNTLHFRTTILPPEAQMERMVKALRGFQEDVWA +KYA \ No newline at end of file diff --git a/Bestrophin.txt b/Bestrophin.txt new file mode 100644 index 0000000..1726f65 --- /dev/null +++ b/Bestrophin.txt @@ -0,0 +1,54 @@ +>gi|224000585|ref|XP_002289965.1|T.pse|bestrophin1_TPS +MGPPIDPSVPVTDQVGEGSRKYRRTVYTHDDWVRHRSPDRFGNNLSTLFNSGIYKQVANEVFATTAVATF +VFLWNMIAGGYTDLAGVQHGPIIDSPLAQMVGLPMTAFTILTPSLGLLLVFRTNTSYGRWDEARKMWGLN +INHTRDLNRMATAWYGNEGNMDSVAFMGGDIPYSQPIDPVQRAYDLGQVSLFTWAFVRSMKRHLSPPEED +EEDFKAELRARLTPEQAENIINAAHRPNRALFDLSVAIENLPMHFLRKNAINTNLSIFEDTLGGCERLLS +SPVPLFYSRHTARFLSTWLLLLPFGLYEQFKDSWNHIAMIPATAFISVCLFGIEELATQLEEPFTILPMQ +GFCDKIGGWCDEIVSWAGQGQQEYTEENAMSNEQEMTYWR +>gi|223999673|ref|XP_002289509.1|T.pse|bestrophin2_TPS +MPSFTSLSTLLLLALSSPQISAFAPLSSTSTPINVAPSTTTSTTNLQMGPPKTDIVLSETYGEGSRKYRR +TVYTHNEWVKHRSSDRFAKNLFSMVNSGVYKSLAKEVFATTAVASAIVAWNGIAGGYTDFNGVEHGAIMS +FLPQLVLPLTPFTLLSPSLGLLLVFRTNSSYGRWDEARKMWGLNINHTRDLNRMATAWYGHDNQIIDPAK +RAEDLRQVSLYTWAFVRSMKRHLSPPSEDEEAFVEELYARMAPEQAEAIISAAHRPNRALYDLSVVIDKL +PMHFMRKNEINKNLSIFEDTLGGCERLLSSPVPLFYTRHTARFLSTWLLLLPLAMYQPFSGSWNHVAMIP 
+ATALTSVFLFGIDELSTQLEEPFTILPMQGFCDKIGGWCDEIVSWRGQGLDKEEQQYY +>gi|WP_077172616.1|bestrophin|Pseudomonas_psychrotolerans +MITRPQNPSLRELLFTVRGSIVQAIWPKLLYVVLLSLAVTLSHDVFLRFDFGLTTTPLTLWGLTLAIFLG +FRNTTAYQRFWEARGLWGELLIAGRNLARQVETLVPGLTAPERRQLLTPLLAFGYALRDHLRREAPSADL +QRVLVGEDALLAAPHRPSALIRRLGTRLVARAREEGLGDPLIANLDHQLDRLTAVLSGCERIRQTPIPYP +YILMLHRVVHVYCFLLPFCLVDSLGWFTPLAVLVLAYTFFGLDALGDQIADPFGTQPNHLPLDALSRGLE +IAVLDLLGEPTPEPIRAEAGLLR +>gi|NP_004174.1|bestrophin-1_isoform1|Homo_sapiens +MTITYTSQVANARLGSFSRLLLCWRGSIYKLLYGEFLIFLLCYYIIRFIYRLALTEEQQLMFEKLTLYCD +SYIQLIPISFVLGFYVTLVVTRWWNQYENLPWPDRLMSLVSGFVEGKDEQGRLLRRTLIRYANLGNVLIL +RSVSTAVYKRFPSAQHLVQAGFMTPAEHKQLEKLSLPHNMFWVPWVWFANLSMKAWLGGRIRDPILLQSL +LNEMNTLRTQCGHLYAYDWISIPLVYTQVVTVAVYSFFLTCLVGRQFLNPAKAYPGHELDLVVPVFTFLQ +FFFYVGWLKVAEQLINPFGEDDDDFETNWIVDRNLQVSLLAVDEMHQDLPRMEPDMYWNKPEPQPPYTAA +SAQFRRASFMGSTFNISLNKEEMEFQPNQEDEEDAHAGIIGRFLGLQSHDHHPPRANSRTKLLWPKRESL +LHEGLPKNHKAAKQNVRGQEDNKAWKLKAVDAFKSAPLYQRPGYYSAPQTPLSPTPMFFPLEPSAPSKLH +SVTGIDTKDKSLKTVSSGAKKSFELLSESDGALMEHPEVSQVRRKTVEFNLTDMPEIPENHLKEPLEQSP +TNIHTTLKDHMDPYWALENRDEAHS +>gi|AAR99655.1|bestrophin2|Homo_sapiens +MTVTYTARVANARFGGFSQLLLLWRGSIYKLLWRELLCFLGFYMALSAAYRFVLTEGQKRYFEKLVIYCD +QYASLIPVSFVLGFYVTLVVNRWWSQYLCMPLPDALMCVVAGTVHGRDDRGRLYRRTLMRYAGLSAVLIL +RSVSTAVFKRFPTIDHVVEAGFMTREERKKFENLNSSYNKYWVPCVWFSNLAAQARREGRIRDNSALKLL +LEELNVFRGKCGMLFHYDWISVPLVYTQVVTIALYSYFLACLIGRQFLDPAQGYKDHDLDLCVPIFTLLQ +FFFYAGWLKVAEQLINPFGEDDDDFETNFLIDRNFQVSMLAVDEMYDDLAVLEKDLYWDAAEARAPYTAA +TVFQLRQPSFQGSTFDITLAKEDMQFQRLDGLDGPMGGAPGDFLQRLLPAGAGMVAGGPLGRRLSFLLRK +NSCVSEASTGASCSCAVVPEGAAPECSCGDPLLDPGLPEPEAPPPAGPEPLTLIPGPVEPFSIVTMPGPR +GPAPPWLPSPIGEEEENLA +>gi|NP_988974.1|bestrophin-2|Xenopus_tropicalis +MTVTYTARVANARFGGFYKLLLLWRGSIYKLLYKEFLAFFLMYLALSIIYRFFLNEEQKLYFDKVAIYCN +NYANLIPVSFVLGFYVNLVVNRWWNQYLSLPFPDRVMCAISGTVHGSDETGRLYRRTLMRYCSLSGLLIL +RSVSTAAFKRFPTIDHVVEAGFMTRLERKKFENLQSSYNKYWVPCVWFCNLASQARSEGRIRDDHSFKML +MEELNTFRGNCGMLFHYDWISVPLVYTQVVTIAVYSFFLTCLIGRQFLDPARGYPGHELDLYVPVFTLLQ 
+FFFYAGWLKVAEQLINPFGEDDDDFEINFLIDRNFQVSMLAVDEMYSDVPPMEKDRYWNHSDPRPPYTAA +TLFQKHMPSFQGSTFNMAIPKEDMQFQPLSDIEEMNEDTLTHPPPLLSRFLPGVGPSPLSSSAALASHFA +APGSRLTLLRRSTSSFSSSSEFQCQEPVQDPPYSLVDSLGPGLNVQEGHTEELCNMGSQASLFLPPKTMD +GGENVQPVEEGEDAASLVAT +>gi|WP_068888990.1|bestrophin|Acinetobacter_celticus +MIVRDQPNIFKVLFSWRGTILPKILPPLGVVMLISAIIGVLSYIGYFKFPELPFVGFTVIGVVLSIFLGF +KNSACYERWWDARKLWGILIANSRHFDRDCRMLSQGRRERVIQHVIVFANVLRDRLRHQTANPTELVKTS +GMSQQALTQLYQQANAPQYTLSLIQWELMQALKDGEISDIIYTQMNDHVMDLSMVQTGCDRIATTPLPFA +YSVLLNRTVYFFCLILPFSLGSTLGIFTPLLVGVLAYTFLGLDALSSELEEPFGTQSNDLPLDSMVRTIE +IELLGTLGKPTPPPIQAQDNNLL \ No newline at end of file diff --git a/CA_alpha.txt b/CA_alpha.txt new file mode 100644 index 0000000..c7c5b49 --- /dev/null +++ b/CA_alpha.txt @@ -0,0 +1,76 @@ +>jgi|Emihu1|456048|estExtDG_Genemark1.C_1660056|CA2_alpha_EHUX +MSPNKHSWRYARPGPNHVADEWVQRETWGASFPTCINGIEQSPINIVTGEAIPMKSLPEISTDIDAAPHY +VSNTGSGFQLFETTPTESMIANGTFIDTIEGSSKGESWVGGQKFLFYQMHWHTPSENTIDGRSFPLEAHF +VHQLDDPMLVGTLHRLAVISLLYEPGPCNAFLDQFWEEFPMVPGFRQHFADGVNDFERLADEVINIDEGE +GYFYWHGSLTTPPCTEGVGWYMLKHRETVSDRQIDALRYALAVS +>XP_005764209.1 carbonic anhydrase [Emiliania huxleyi CCMP1516] +MGCTQSKHDASEDASNGTMLQAVLGHLGQLDGDAKLDHTTMSIVYDIFKDMDKDSDGTVDKSEFEKFLST +HPAAKTLWEGEGKASMSRSLKDAIADERLSFYELVAAFAPEAPHHAGDASGIGGLESLISDAAWGYRGYN +GPENWALLSPKNKLAATGKEQCPVDILPSTCVPCPAVDGDASLAYGVGPGTILNNGHTIQVNWKGGSMSV +GGTTFEAAQFHFHTASETTIRGMQYPLEMHVVHVTPGANERVSEPMRIAVLAVLFETRTDVEEVFLSQFF +DQLPSHVAHDQDDAETLTRPVDLSSISLDGGYYRLRGSLTTPPCTEGLEWSVLASPLPILPAQLETFRKA +LGKTVRNFRPTQPLNGRSITWVCACQA +>jgi|Thaps3|22391|estExt_fgenesh1_pg.C_chr_40655|CA1_alpha_TPS +MILLQPMTKRMSTSSHLVILVVLLRLQSSNSRSWLDCINTSKLENDGMPRVGKRHNATTSSLSSDAAITI +AQMSGNIGTSTTSEDTEIVIHGATTTLFEEVDPFRVTDSPSTVPSYSSSPPTLSPSASPTITPLPTTEKP +TRLPTLPPTFQTGKNEPLNPKPGYFNYDMNSDYGPHRWKRVDVEDDFFHTFDLKAEDTNNCGSGDHQSPI +DVCTKPRGNCKETHEMRPKSGDYKMDGELITKQILPSKLRLVMAPRTGDEPDPPQVDFSSNGRGIIDMTN +IDFKFPSEHTVCGSKFDGEMQYYMYHPGRERFVAVSFFLEASPTNPTNEHLQEVIDAFRTVFIKDKSLCA 
+EKQRLENYAQGFVSPANRKLHGEENKTLDSIEDDGELWNTTTIESNEDREYQRRLALKWHPFHPDIQKTI +HFWGYHGSFTEPPCTDDIVDWKIMDVPTPISTKQLAQLKQLLFNHVDKNCERTSVHNSDGSVARPTQETS +KYYKCTRDDYVSDEERGVCGDLGCINPFGEGLNPYYPPIVDVTGPPTRAPST +>jgi|Thaps3|262006|thaps1_ua_kg.chr_4000016|CA2_alpha_TPS +SEHRLCGKQYDAEMQLFHLHNEGNLEALAILIDADDGTSENPHFQKLLDFFQKKFNADKSMSRDWVWDPL +EPGYILRSIHFWAYSGSTTEPPCFEGVNWRIIDVPMKISPGQYQQLQRLMFDHSNARPVQP +>jgi|Thaps3|22257|estExt_fgenesh1_pg.C_chr_40398|CA11_alpha_TPS +MTRVSNIDSMMDGFGKLSRRAKILYLSSLAVSLAMVVFGACVLTLDYTTRTTSKVENSIGGVVNADDSDE +AKIQIETQTPTLSPSSSPIYTEKLSLVSSPAPSTSNLRATSAPTNSPVDIGTLQPVTRKPVQPKPTPRPA +SPKPSSPPSTRYPSISPSQHPTNSPSLSPVTPSPPPTLTQSILPSITNMPSLESLFQSHEVPKDPKPTYF +NYNGNSDYGPRSWENVTLLNSTENYWHEFGFNDNQCGVGAQSPIDVCTTPMRHCQEHHEFRSKLRVLMHR +REGDEPDPPHVDFAGVGAKSLDLLNIDIKIPSEHTVCGRRYDGEMQYYFYHPVKGSLIVIAWLFDAQNEF +ASNEHLQLVIDEFQALYDDTEGACLVNMTLNETGVTAPPHQRLSSRSDRELEKENHGCSGSNLNGPAPSN +AEYPIQQP +>jgi|Phatr2|35370|fgenesh1_pg.C_chr_7000291|CA1_PTRI +MRLIAISLCCLMPCTVRCRSWRNIEPLHGWNENDTSGTIWRMEFNPLFTSAPTSMPTTATPSDIPSSRPS +SFPSAPPSASPSVAPSPSPSTAPSESDPYRPNDPPKNPEQWYFNYDTSANALYGPGHAGIIQQQNNQFNV +GYKNNRWGSVGNPPNNYWTEFMDNGFGPWRGILANRNPTRNMCDRVGMQSPIDLRPSGAVCDEHHEVRSR +RGDFQIFEDEVTKEIQPNKLRLRYKRRPCRNLNELACQEPDPPNADFPNNWGGYADVTHIDFKVPGEHLI +RGEKFDGEMQIFHIHRGRRRMVVQSVTIRATSTGFNSYFQEAIDVFRAVYDINIARCSALRRKERRLVSN +AHIILGKNMTSKFHDYSSWGDFSTGLEDVELESKRSLRKSNWDPYHELLIPSIHFYRYDGSLTEPPCGEF +VSWFVSDTPMRISLSQLEEVKTILFKNVDENCQPTSVQFGHSVARPIQETAGRPVWQCTPREFGPDP* +>jgi|Phatr2|44526|estExt_fgenesh1_pg.C_chr_40337|CA2_PTRI +MVGLPSVLLCTLIAFTTAQTGRDLDRFNYRGTDGTDYGPEDWDQVSCTDTETCLGWPDGFETARGWDLGE +NHCRWCPLGTRQCGIHHQSPIDLQRNRAVPGDPEEKECIDVHWMAYYDSTCDWENLKALNAFSIERHALK +VNQPIEQLASGDYRLACRNASGRRFGRIDFSKGFSEWWLMSHMDIHVPSEHTQEGKRYDGEIHLYHFYSI +PGSQSSTNNEMASVTIFLEAYDDVPDYPMLNRLICQWRQVEDKTREECGLPSVETEYPGCFYYQRGHTID +GFNTIALTQDGTQRNLRQKSRNLRPKSMSVHDLILYNYAQSQTNSSYTPKRLLHSEEDHAEADPNFDWEK +FVTRQDGNANITQGNRQLLNYDHVGPWFNYFPMLGVRTEYYYRYSGTQTVPPCYGRFFEGNNRRQTNHWR 
+VLKDPIRVTQRQVDEMHRLLKERIASVDDPLASCEPDTAAKVDENDPTKISVARPVMETRSTHYKVFCEC +EDWRSKFPEDVEWCKKGLQDRLFNHPYNFETDGF* +>jgi|Phatr2|55029|estExt_Phatr1_ua_kg.C_chr_210026|CA3_PTRI +MSLSGIVCSRAKWFLLSIALPTLGLGLNKTAFSYNKKDEYSPDNWYRLDIAGNVCRGPRNSPIALESTPC +DAYEGYGLYSGTCTLNDLDFQLTELGVKIKYPKDGSCDINTLTVPGVSGNFRLLEVTIHGGSEHSIDGNF +SGAEIQLVHEKINSQEGHLAVLAILVEPEGPKDNLFFGTLLDEWRAVRADSTASCAKAGYDVPTLYWLAS +GTPVNTRHSYVRSYFTSPRFNAYSLLPTNTSFYRYYGGLTTPPCSEIVWWSVADTVMRISTGQYAELMTM +ITTGYVNVTDEAGCEPWSVASPSGSTSRPLQARNGRPVDRICPV* +>jgi|Phatr2|54251|estExt_Phatr1_ua_kg.C_chr_40037|CA7_PTRI +MRSFLLWSLVASFATAQEGSNLDRFNYRGTEGTDYGPEDWDQVSCSDTENCLGWPDAFEASRGWSLKDNF +CRWCPAGSSSCGTHHQSPIDLQRNRAVPGDPDENECIDVHWMAYYDSTCTWDTLKELNAFSVERHALKVV +QPITETTSGEWEIACRDDSGKRFGRIDFSKGFSQWWFLSHMDFHVPSEHTQEGKRYDGELHMYHFYSVTG +AEAGIDNEMASVAFFLEAYDDIPDYPMLNRLICQWREAEEKTREECGLPSILTEYPGCFFYNRGHTDSAV +TTQSISNGQRKLRTTSRNLRPKVKSVHDIILQNHEQMQSNATFKPHKLILSEDDHAEADPDFDWGAFVAE +QVAKSTSSQEHRELMNYDHVGPWFNYFPLVDVRTEYYYRYSGSQTVPPCYGRHIGGSRKQTNHWRFMKDP +LRVTQRQIDEMHRLLKERIAPLDDPLASCQPDTAAKVNEDDPTKISVARPLMETRDTHYKVFCECIDWPS +KWPEDRAWCEQGFMDRLYTHPYNFQTDGF* +>gbi|AQL05019.1|Alpha_carbonic_anhydrase_7|Zea_mays +MHALVRPWDTLPVLLLSRLCMVLLDALRAGWLGSVDEDEEDFSYRRNAGNGPARWGLIRREWATCNVGLL +QSPIGLSDTLAGLADRSGRLGRSYRPAAASLVNRGHSIMVRFNSNPGGVVIDGVAYRLRQMHWHAPSEHA +INGRRYALELQMVHQSDTNRYAVVSQLYRISRRRPDRTIHRLERYIRRIARRKNHEELIDEEVDPRRPGT +RSNRRPLQEANGRAITFYYTSPAHGRGANGD +>gi|OMO73707.1|Alpha_carbonic_anhydrase|Corchorus_olitorius +MKHQSKPIFVSAFLIIFAVLFLSHSASVSAQEVEDEREFDYLEKSGKGPKHWGDLKQEWAACKNGDLQSP +IDMSSLRVKVIKKSGEMKKRYKPCHAVVKNRGHDISLQWLDNDAGSIKINGTEYFLQQAHWHSPSEHTIN +GRRYALELHMVHQSKDPNLKNNLAVVGLLYKFGAPDSFISKLISNITSMNDHVQERYMGVIDPSAIKMGG +KKYYRYMGSLTVPPCTEGVIWTMNKKVRTVSRDQVRALRIAVHDYAEANARPVQPLNRREVELYGPNPGD +VSN + diff --git a/CA_beta.txt b/CA_beta.txt new file mode 100644 index 0000000..bad26ed --- /dev/null +++ b/CA_beta.txt @@ -0,0 +1,24 @@ +>jgi|Phatr2|45443|estExt_fgenesh1_pg.C_chr_70069|CA5_beta 
+MKFATAATVTLLALSTVDALNVKKLFRFGKTSLPKDSSPKPAAKGGYDLDVSELFDGNNKFIADKLAGDP +AYFDTLGTVHSPKYLYIGCVDARAPPNMIMGTEAGTMLTVRNIANMVVNNDLAVMSAIQFGINVLKIPNV +ILCGHYECGGVRASVANVDHAPPLSIWLRNIRDVYRLHAKELDAIKDPEERHRRLVDLNVIEQCVNLFKT +GVIQAKRIESYKDGGVAIPQVHPVVFDPKTGEVKKLKVDFDKYMAEINGIYDLYDLENAKVPM +>jgi|Phatr2|51305|estExt_fgenesh1_kg.C_chr_10001|CA4_beta +MKFLSASIALLACATSVEAFNANKAFRFGAKAMPEVSSESATSALSAGGAEKKSYDLDITEIFDGNKKFI +ETKKAQDAAYFDTLGTVHSPKYLYIGCVDARAPPNMIMGTEAGTMLTVRNIANMVVNNDLAVMSAIQFGI +NVLKIPHVIVCGHYECGGVRASVANVDHAPPLSIWLRNIRDVYRLHARELDAIKDPEDRHRRLVDLNVIE +QCVNLYKTGVIQAKRIESYQEGAPAAIPRVHPIVFDPKTGAIRKLQVDFDKYMSELDAIYDLYELENAKI +PA* +>gi|ONM39907.1|BetaCA_4|Zea_mays +MAVERLKTGFEQFKADVYDKKPELFEPLKAHQSPKYMVFACSDSRVCPSVTLGLHPGEAFAVRNIASMVP +PYDKTKYAGVGSAIEYAVCALKVEVIVVIGHSRCGGIKALLSLEDGAPDKFHFVEEWVRVGAPAKSKVLA +DHASAPFEDQCSILEKEAVNVSLENLKSYPFVKEGLEKGTLKLVGGHYDFVNGKFETWEP +>gi|SIT99918.1|beta-carbonic_anhydrase|Mycobacterium_bovis_AF2122/97 +MTVTDDYLANNVDYASGFKGPLPMPPSKHIAIVACMDARLDVYRMLGIKEGEAHVIRNAGCVVTDDVIRS +LAISQRLLGTREIILLHHTDCGMLTFTDDDFKRAIQDETGIRPTWSPESYPDAVEDVRQSLRRIEVNPFV +TKHTSLRGFVFDVATGKLNEVTP +>gi|XP_014177286.1|betaCA|Trichosporon_asahii_var.asahii_CBS_2479 +MSNYLQETHDRVFAQNKEWAAKQRAKDPEFFTRLAAGQSPEYLWIGCSDSRMPAEMITGLEPGEAFIHRN +IANMVNNLDLSAMAVINYAVRHLKVKHIIVCGHYGCGGVQAAMTPKDLGILNPWLRNIRDVYRLHEKELD +AIADDEKRYERLVELNVVEQCRNVIKTAAVQQSYAENEYPIVHGWVFDFRTGLLKDLEIDYAKVLKDIQK +IYNLTE diff --git a/CA_delta.txt b/CA_delta.txt new file mode 100644 index 0000000..79048d7 --- /dev/null +++ b/CA_delta.txt @@ -0,0 +1,61 @@ +>jgi|Thaps3|262009|thaps1_ua_kg.chr_4000019|CA7_delta_TPS +LTKKTARDWVWDPLEPGYILHFWAYSGSTTEPPCFEGVNWRIFDVPMKISPGQYQQLQRLMFDHVDPDTC +KLTSTHYNESNARPVQPYRGGANYRCRRSGYVSDKERKASGLRRGFKDPADWRGVDLLPWIEGEFPNV* +>jgi|Thaps3|233|fgenesh1_pm.C_chr_2000003|CA4_delta_TPS +MGDITPNTKPYFQSSMCPVNVHWHLGSEHYSYGEFDENGNGPHGNVARPSWANRDLATDGAAVADGFRCH +HYDENDPKFTTKYDWKHCHGMEVGETYEVHWPHSAAGACGTVNQYQTPFYDGVFCNLPMESFTTLGGQDI 
+ANAVGVHGQVFTIVNDESYFYPDMIRGMIVEPEMNMGQDIAMYTGSTTGDSRSNEMCSQYAPITWQVDRK +CHMISASSFDKLCYDMKMQRDDMSDDLHAHGSRELVKDEYVANNQANRNLRA +>jgi|Thaps3|814|fgenesh1_pm.C_chr_19a_19000002|CA5_delta_TPS +MVNNVDCVHTPGPQAGANVTKGYKGGMEVDYVPNTKPYFQSSMCPVNVHWHLGTEHYSAGEYDEFGTGPN +SVNNNLPQNQQVRPGYRCRHFDKSQPMFTNEYRWEFCVGMQVGETYEVHWPHSAAGACGTPDQYQTPFYD +GVFCNLDEEKFSTLSAQDVADAVGVQAQVFTVVNDERYFYPDLMRGFIKDGEYGKDIAMYTGSTTGTTRS +NEVCSSYAPITWQVDRKCHLISASSFDRLCETMRLQRDNMTLDMHAHGSRELVKDSLVANNQANRRLGGH +DHHHHHHGHDHADHLWADGHGHLHEEWF +>jgi|Thaps3|34125|e_gw1.5.359.1|CA6_delta_TPS +VPGPQAGGNVTKGYVGELDVGDLTPNTKQYFQSSMCPVNVHWHLGSEHYSYGEFDENGDGPHGNIPRPDW +ANRDLAGAGESVPDGFRCHHFDETDAKFTTKYEWKHCEGMEVGETYEVHWPHSAAGACGTVNQYQTPFYD +GVFCNLPMETFVTLGAQDIASAVGVHGQVFTVVNDESYFYPDMIRGMIVDPDMNMGQDVAMYTGSTTGDS +RSNEMCSQYAPITWQVDRKCHMISASSFDKLCYDMKMQRDDMSDDLHAHGSRELVMDSLVANNQAN* +>gi|OEU09193.1|delta_carbonic_anhydrase|Fragilariopsis_cylindrus_CCMP1102 +MTFYQAAVVALLASTVNNAVNAEEDCTSIVDLACGTEGFSTLCSVLTDVAPALDPDVVSSLKTVFAPTDD +AFAAVKFDLVTEEALLDILGYHLSTFELTGECGSLIEMADGKDTRTLCNKDKEPVFQKGWANSRAVMPQF +DPTAGIAVCGDATVYVIDSVLIPKDYFVDEEGEVVEDNVQEVIDAPDPNDGKDYFKELLIAKGTVTEGSN +TCANTNPQFPNINCLGEDGTVDVGPQAAANVTKGYVGGMEVDIVPITKSYYQAGLCPVNVHWHLGSEHFS +AGEFDCEDPKKCGPYHAADDAAHDDDGHTDDAGEGDSRRQLAGDARKGYQCNYYDEDDSKFTAPYDWQFC +DKTMEVGQTYEIHWPHSSAGACGTPNQYQTPFYDGVFCNLPLDVFQTLSAQDIASNVGVQAQVFTIVNDE +AYYYPNLFGGMIVDGDFGADMAIYTGSTTGTSRDNEVCSQYAPITWQVDRKCHMISASSFDKMCADMMAQ +RDDMTDDLYAHGAREVTADIITADNQQTRGRGLRLRKNNKN +>gi|XP_005772538.1|delta_CA|Emiliania_huxleyi_CCMP1516 +MSQADWLEQNVERISKDDLTETPTTAEALEGQPNEKAVIIGAAKATGSDIAYKLSHLLALVVAGVIALLA +SAALADGRSVIKLKDTSNLPRLTALTATLDGETINLKDHGLDYRADELLGPQYGVGLHHDSSGYGWGKAG +ARETLQEYIDELGLLQVIAAVPSVIATDGLKHPAHFLECTELKKAGLSAMSLAIIAEVASAVMIIFHGLA +LVGLLPLSAKLAKGFAGLVWFTLTAGFLIVVCLPIGVYETEWTCNKDFVPAIRLWDHFVYNWAFPVGYLG +YACSLLVFSVVLCFPSLEEGAQEFDKKKTKLGLVKVVAGLFVGLVIAASVSVGIAASQDAFKDPEVDPSV +NPCKAQKPYHAAPGDNYFRNIECMKDNLVQHLSEGQYDYHGTGPAYNSTNSTKDLYANHVHGYHDRDAED 
+YVSKEEYYESKKNKGDPYADDGKKKKEKKEWTERLGLRCHHYDDEHEMFKTVATGAKKPYEWKHCVEMMV +GETYEVPWPHSAAGACGTEWQYPDALLRRRLLQEGVVNILTPLNTYEKIGVQGQVFTIVNSDEEQYQYEN +LIDGAWMDGKDKWVDVAKYTGSTTGTTRNNEMCSRYAPITWQVDRTCHMISAKSFDKLCYDMKQKKDDMG +GDLYPHGAREIVADYLVANNQQSRK +>gb|ABS87870.1|delta_carbonic_anhydrase2|Lingulodinium_polyedrum +MVARLMLAASVLLVRAWGTGCPDDPEVDLCSETTTDESGTGTGTEEVNVNGAMRTRTSLMPMLXLAGVFR +SKNALFALPLLGXPLAAEAAAAAGTSGPSTCGAVKDMYKEQGCCGRPDKELDVVIVPKPTKRLFGANICE +GKQPVHATPGDNYFKNVDCLNGTTLQVLEQAGANVTLGYRGRLDASSRTPILTPYWQNGLCPVNVHWHLG +TEHYSKGQFDEHGTGPDIAAEEDAEGEADSRRLAVARRGYRCSKYDAKDAKFTTEYNWQHCEGMHVGETY +EVHWPHSAAGACGTPYQYQTPFYDGVFCVDGIVSLSPLNTYMKIGVQSQVYTIVNDETYYYPEMIKGMIV +DGHYGQDIAKYTGSTTGTSRDNEVCSRYTPITWQVDRKCHLISASSFDKMCADMKNQHDDMSSDLHAHGS +RVLVDRNFTGNNFHRRM +>ABG37687.1 delta-carbonic anhydrase [Emiliania huxleyi] +MSQADWLEHNVERISKDDLTETPTTAEALEGQPNAKAVTIGAAKATGSDIAYKLSHLLALVVAGVIALLA +SAALADGRSVIKLKDTSNLPRLTALTATLDGKTINLKDHGLDYRADELLGPQYGVGLHHDSSGYGWGKAG +ARETLQEYIDELGLLQVIAAVPSVIATDGLKHPAHFLECTELKKAGLSAMSLAIIAEVASAVMIIFHGLA +LVGLLPLSAKLAKGFAGLVWFTLTAGFLIVVCLAIGVYETEWTCNNDFVPAIRLSDHFVYNWAFPVGYLG +YACSLLVFSVVLCFTSLEEGAQEFDKKKTKLGLVTVVAGLFVGLVIAASVSVGIAASQDAFKEVEVDPSV +NPCKAQKPYHAAPGDNYFRNIECMKDNLVQVLEQAGANVTRGYVGGLDAGNWRTPILDHYDDTDLCTVNV +HWHLGAEHLSEGQYDYHGTGPAYNSTNSTKDLYANHVHGYHDRDAEDYVSKEEYYESKKNKGDPYADDGK +KKKEKKEWTERLGLRCHHYDDEHEMFKTAATGAKKPYEWKHCVEMMVGETYEVHWPHSAAGACGTEWQYQ +TPFYDGVFCKEGVVNILTPLNTYEKIGVQGQVFTIVNSDEEQYQYENLIDGAWMDGKDKWVDVAKYTGST +TGTTRNNEMCSRYAPITWQVDRTCHMISAKSFDKLCYDMKQKKDDMGGDLYPHGAREIVADYLVANNQQS +RK + + diff --git a/CA_zeta.txt b/CA_zeta.txt new file mode 100644 index 0000000..1b47915 --- /dev/null +++ b/CA_zeta.txt @@ -0,0 +1,10 @@ +>gi|XP_002295227.1|TPSE|CA3_zeta +MCMHVDLQVAMSSILSKLTGKDDTSAPPLTPKDIVAALQSRGWEAEIISASSISQDMVEVDPAGILKCVD +GRGSDNTRMAGPKMPGGIYAIAHNRGTTSVDGLKEITKEVASKGHVPSVHGDHSADMLGCGFFRLWVTGE +FDSMGYPRPEFDADQGAAAVKESGGVIEMHHGSHTEKVVYINLVENKTLEPDENDQRFIVDGWAAIKFNL +DVVKFLVAAAATVEMLGGPRIAKIVVA +>pdb|3BOH|Tweisflo|CA_zeta 
+SHMSLTPDQIVAALQERGWQAEIVTEFSLLNEMVDVDPQGILKCVDGRGSDNTQFCGPKMPGGIYAIAHN +RGVTTLEGLKQITKEVASKGHVPSVHGDHSSDMLGCGFFKLWVTGRFDDMGYPRPQFDADQGAKAVENAG +GVIEMHHGSHAEKVVYINLVENKTLEPDEDDQRFIVDGWAAGKFGLDVPKFLIAAAATVEMLGGPKKAKI +VIP diff --git a/GCL.txt b/GCL.txt new file mode 100644 index 0000000..1556797 --- /dev/null +++ b/GCL.txt @@ -0,0 +1,75 @@ +>jgi|Thaps3|35164|e_gw1.6.74.1|GCL_TPS +TFKGETAAHIIYSKLLEHGTEVVNGYSGGAILPLLDQFHQNHPRHGDKKKIRWITNSNESSAGHVAEGIA +KSSTEPDGKLAAGIIVATSGPGATNLVTPITDAMCDGVPLIVLCGQAATTAPQDAFQSCPAVEIMKPCTK +WSYQIKNAAEVPFAMDYAFYLARNGRPGPVFIDLPKDLQIQQLNDEVIGNFLDGLGLYTEDESYNVEHDN +EFMVDLIKNAKRPFIIAGQGANDSHEELMELAETLQIPVATTLHALGTFDERHPLATNMLGMHGHATPNY +LIQDCDLLLCIGSRFDDRITGRPSDFIPAARQAAKEGRGGVIHVDVRFSENAKQVKPTYFVHSTGKKFLQ +AVNSAIRANPPKDTSRTKQWIEKKKELEKEYPIRITKEVTQTNMNCQSVIAEMNRQLVESGKIDDTIFST +GVGIHQMAAAQLITWTQPRQMLSSGSLGTMGVSLGYCMGAKLANPKKWCISVDGDGSFNMTFTELKTIGE +EKIPVKLMILDNESQMMVEYWQRLFHDERYIAVRNKSPKYTTLASAFDIKSIYCECAEELEEKMRSFLFD +YDDEPVLFHVRIERTPCLPMVAPGQPLDNMILVDEDFEVDKSAAPS* +>jgi|Phatr2|56476|AGR_Contig7213|GCL_PTRI +MKFSTAALIFAVSATASTAFVPHAFVSPKSPRPALFSTELRKTDVTADLKNGVTVNPFDQSALAAGVSPL +TETGTATTSSSQHWDPQADAELAKLAAIEARAGAAAYMGQYEAQSGASLIYSKLVEHGVTVVNGFSGGAV +LPLLDQFHEGHPRHETSGVTPIRWITNSNEASSGHIAEGYAKSMPINGPHKPVGVAVATSGPGVTNLITP +LQDAICDGVPLVVLCGQAATVAPEDAFQSSPAVDLTRPCTKWSYQIKSAAEIPLVMDYAFYIARNGRPGP +VFVDLPKDLLNQILTGDLINSFIDAENPGDETSFARLQKMYRPDGEVFQALHLGTGGKGLPFEIYKDEAA +PQNTPTYKLKPVTHANTVDSYHADHHPSDRVIRTGKVVAGEHLPNEQGPLQVGGEMTKKITDLIMKAKKP +VIIAGQGCNDASAELKIFADRLQIPVATTLHGLGCFDERSELALNMVGMHGHPTPNFMVQEADLIICVGS +RFDDRITGRMSDFVPEARVAEEEGRGGVIHVDIRLTENAKQISPTFFVHSTGKKFLETMIEFLAGMDSKP +NTSAWIKRMKELQKEYPVKIPSFPSETVSVTNEDGSTTETTRTRASAQSVVAELDRQLLAADAMDDAIFT +TGVGIHQMVAAQLITWTQPRQMLSSGSLGTMGVALGYSIGAKLANADKMVIAVDGDGSFNMTFTELKTLA +EQGIPVKIMILDNDGQMMVEYWQRLFHDNRLIAVRNSANPDYSTLAKAFGIKSVYCDCEEDLEARMKEFL +FDDPDEPVLFHVRIERTPCLPLVAPGQPLQDMILEDVEVDVDKSAAPS* +>gi|OOC01793.1|glyoxylate_carboligase|Amycolatopsis_azurea_DSM43854 
+MPRIPAMQAVVDVLVSEGVDTAFGCPGAAILPLYHAMQDSGIEHLIVRHEEGATHMADGWARTTGNVGVA +IGTSGPAGTNMITGLYTAQADSIPILCITGQADSRKLHTEAFQAVDIVEIAKPVTKWAVQVKEAAQLPWV +FREAFRIARSGRPGPVLIDLPIDVQRQEIEWDSSIDSPLPVIRTTPSPARVERALDLLLAAERPLILAGG +GVVLGGASDRLRTAAELLGVPVGVTLMGKGTFPEDHELFAGMAGIQTSQRWANAAFLEADLVLALGARFG +DRHTGDLDVYRGSRKFIHVDIEPTQLGKVFGPDLGIVSDTGAFLDALIEAASKRSPARDRAWPRRIGELK +ESLPRREDFEDTPIKAPRVFKEINEFYGEDAYFVTAIGLYQIWSGQFQRAHKPRHYQVCGQAGPLGWEIP +AAIGVKKAKPEAEVVGVVGDYSFQFLVEELAVAAQYDVGFVLIMLNNEYLGLIRQAETGYEMNFEVDIHY +DKNGTDNVKVMEAYGCSGTRVTEPGEIRTSLEWARKEAERTSRPVLVEIMIEREGNAAMGKALDSVVEFE +PIAG +>gi|SJM69470.1|Glyoxylate_carboligase|Gulosibacter_sp.10 +MAKMRAVDAAVLILEKEGATQAFGLPGAAINPFYSAMRAHGGIKHVLARHVEGASHMAEGYTRTRPGNIG +VCIGTSGPAGTDMITGLYSASADSIPILCITGQAPVAKLDKEDFQAVDIASIAKPVTKLAKTVLEAGQVP +GVFQEAFRLMRSGRPGPVLIDLPIDVQQTEIEFDIDSYEPSPVAKPAATRAQLERALELIEGAERPLLVA +GGGILNAAAEADFRALAEELGIPVVPTLMGWGIIPDDHPLHAGMVGLQTSHRYGNENLLASDLVFGIGNR +WANRHTGDVDTYRKGRTIIHADIEPTQIGRVFAPDYGIVSDAGELIRGLLELVRERSGSLRDRSGWAAEC +QDRKARLQRKTNFDNVPIKPQRVYQEMNRAFGEDARYVTTIGLSQIAGAQMLHVFKPRHWINAGQAGPLG +WTLPAALGAAVAEPETPVVALSGDYDFQFLIEELAVGAQHRIPYVHVVVNNSYLGLIRQAQRGFEMDFEV +SLAFENINSSLEVQGETVKGYGVDHVKVAEGLGCKAIRVEDPSKLQEAFAQAQELAAEHRVPVVVEVILE +RVTNISMAGASIDAVNEFEEIAETAEDAPTAILPIGSREQAAAPVASGA +>gb|SJN08827.1|Glyoxylate_carboligase|Leucobacter_sp.7(1) +MALMRAVDAAVLILEKEGATQAFGLPGAAINPFYSAMRAHGGIKHVLARHVEGASHMAEGFTRAEPGNIG +ICIGTSGPAGTDMITGLYSAAADSIPILCITGQAPVAKLDKEDFQAVDIASIAKPVTKMAKTVLEAGQVP +GVFQQAFYLMRSGRPGPVLIDLPIDVQQTQIEFDIDLYEPLPIAKPTASQAQIDGIFALLDAAERPVIVA +GGGIINADASAEFVTLAETLGVPVIPTLMGWGTIPDDHELMAGMVGLQTQHRYGNENLLASDLVIGLGNR +WANRHTGTLDVYTEGRKFVHIDIEPTQIGRVFSPDLGIVSDAGAAIAGLLATATERKAQGTLPDRSAWVA +ETQERKGSLQRKTNFDNVPIKPQRVYQEMNRAFGRDTRYVTTIGLSQIAGAQMLHVYKPRHWINCGQAGP +LGWTLPAALGVVAADPKTPVVALSGDYDFQFMIEELAVGAQFKLPYIHVVVNNSYLGLIRQAQRGFEMDY +HVSLAFDNINSPETEGYGVDHIKVAEGLGCKAIRVREADDLAASFQRAKDLMQEFQVPVVVEVILERITN +IAMSGAGIDAINEFEDLAEGPDDAPTATIPLKQPAEAAR +>gb|SJN20588.1|Glyoxylate_carboligase|Vibrio_sp.JB196 
+MAKMKAIEAAVEVLRKEGVDIAFGVPGAAINPFYAAMKKVGGIDHVLARHVEGASHMAEGYTRTNDNNIG +VCVGTSGPAGTDMITGLYSASADSIPILCITGQAPRARLHKEDFQAVDIESIAKPVTKWATTVLEPALVP +RAFQQAFHIMRSGRPGPVLIDLPIDVQLAEIEFDIDTYEPLQPYQPTATRAQVEKALTMLTESEKPLIVS +GGGVINAGASAELQELAELLNVPVIPTLMGWGTIPDDHELMAGMVGLQTSHRYGNATMLASDFVLGIGNR +WANRHTGSVDVYTQGRKFVHVDIEPTQIGRVFCPDLGIVSDAKSALTLFLDVAKEMKASGKLKNTSDWVS +ECIERKASMLRKTHYEEVPMKPMRVYEEMNKAFGEDTCYVSTIGLSQIAAAQFLHVYKPRHWINCGQAGP +LGWTIPAALGVRAADPKRPIVAISGDYDFQFMIEELAVGAQFKLPYIHVVVNNSYLGLIRQAQRQFDIDY +CVQLAFDNQNAPEMEGYGVDHVAVVEGLGCKAIRVRNPEDAPAAFAQAKELMAKHQVPVVVEFILERVTN +ISMGVEIDGVNEFESLALDPNDAPTAITFNQ +>gb|WP_077156852.1|glyoxylate_carboligase|Burkholderia_sp.KK1 +MPKMRAVDAAVLVLEKEGIDTAFGVPGAAINPFYSAMKKSGGISHVLARHVEGASHMAEGYTRAAPGNIG +VCIGTSGPAGTDMITGLYSAQADSIPILAITGQAPRARLYKEDFQAVDIESIAKPVTKWAVTVREPALVP +RVFQQAFHLMRSGRPGPVLVDLPIDVQLAEIEFDIDTYEPLPVYKPKATRAQIEKALTMLNDAEKPLIVS +GGGVLNAAAEDLLVQFAETLGVPVIPTLMSWGAIPDDHPLMAGMVGLQTSHRYGNATMLASDFVLGIGNR +WANRHTGSVEVYTKGRKFVHVDIEPTQIGRVFGPDLGIVSDAKAALELFVEVAKEWKAAGKLKDRSAWVS +DCQQRKRTLQRKTHFDNVPMKPQRVYEEMNLAFDRDTCFVTTIGLSQIAGAQFLHVFKARNWINCGQAGP +LGWTIPAALGVRAADPQRKIVALSGDYDFQFMIEELAVGAQFKLPYVHVVVNNSYLGLIRQAQRGFDMDY +CVQLAFDNINAPELEGYGVDHVAVAEGLGCKALRVHKPEDIAPALKQAQALAAEHQVPVVVEMILERVTN +IAMGTEIDAINEFEELAETKADAPTAVTPLD + + diff --git a/GDCT.txt b/GDCT.txt new file mode 100644 index 0000000..d391257 --- /dev/null +++ b/GDCT.txt @@ -0,0 +1,57 @@ +>jgi|Thaps3|36208|e_gw1.9.80.1|GDCT_TPSE +MLKSTATALLRRAKRTSILPSTSSRSLASSTNEEPLVKTSLYNLHKELGGDMVPFAGYELPVLYKGDNGG +VMKEHLWCRSDGKASLFDVSHMGQIRWRGRDRAAFLEKIVVGDIAGLSEGSGCLSLVTNVNGGIIDDTVI +TNAGDYIYMVVNGATKFGDMKHFKEQMESFDGDVNMEYLEDSMQLLAIQGPGAAEAVSKLLPGAFDLTKM +AFMTGVDTTLDGVDGCRITRCGYTGEDGFEIAMPAEHAVSIASKLLSDPSVNPTGLGARDSLRLEAGLCL +YGHDLDENTNPIEATLGWTMGGPKSRRRTEGGFLGAEHILKPDGKFQKVARKRVGIKGMKAPAREHAEIF +DANGETKIGEVTSGTFSPCLKAPIAMGYVETELAKAGTEVNVQIRGKMQKAEIVRMPFVESRYYRIPE* +>jgi|Phatr2|56477|AGR_jgi|Phatr1|28288|GDCT_PTRI 
+MKRLSCVRGLRLRKGRVHLRCASSQTVPERANVVVVGGGIIGTSVAYHLAKAGVEDVLLLERDRLTSGTT +WHAAGLMNSFGSMSSTSTWSRQYTQELYRDILPTETGLETGYMGIGFIELACDADRLEAFRRIAAFNRFL +GVDVAEISPEQVKDLFPLCETSDVLSGFWVENDGRANPTDATMALAKGARLHGANIIEQCHVAGVTTSKP +NGNYRAKVTGVRLENETVIAANIVVNCAGMWARQFGEACGVYNIPNQAAEHYYLITEPMKEIDPSWPVIE +DSSKCVYIRPEGKGLMLGFFEWEGAAWKPEGVPLDFSFGELDPDWDRMMPYVEQAMKRVPAAENVGVKAL +FCGPESFTPDNRPIVGESPELRNYYIAAGLNSIGILTGGGIGKILAQWIQQGCSPHDVDVTAIDASRFQR +YQSNITYRNDRTGEALGNTYKVHYPDHQPTTCRNAKQSVLHERLVNANAFFQETSGWESPSWYAPHGTNP +KVETESFGRENWFLHWEAEHISCRNNVALFDMSFMSKFHVQGNDAGKFLNRLSTANVDGDWGMITYTQWL +DEQGYMAADLTITKMAENHFMVVATDTMLNKVYSHMLDRLVHGEHVFVTDVTGRYAQLNLQGPRSRELLQ +GLTSVDLNNFAFRRAEEIDIGLARVLCIRITYVGELGYELFVPVEQARHVYDCIVELGREFSLSHAGLKA +LGSLRMEKGYRDYGHDMDNTDRLLDCGLGFTCDFEKEGGFIGQKHVLAQKDAAKERGGLLKRIVNVLVLD +PAPLLHHGEILWKDGRRISDIRAASYGHTVGGAVGLSMLTRDIPVKKNWLDGSDWEVEVGSRKHPCRLSI +RPMYDPASVRVKDA* +>gi|OEU06768.1|GDCT|Fragilariopsis_cylindrus_CCMP1102 +MLLSSFRRSAATAAFRSPYVVASASAATRRTMMAAAAAEPLMKTSLNEWHKELGGEMVPFAGYELPVLYK +GDNVKNGGVMKEHLWCRSEGKASLFDVSHMGQIRWHGKDRAKFVEKMVVGDIQGLDTNHGCLSLITNDQG +GIKDDTVIVNAGDYIHMVVNGSMKFSDMAHFQKHLDDYDGDVTMEYLEDDMQLLALQGSGSADVLSKLLP +EGFNLKTMAFMTGLDTTIDGIENCRITRCGYTGEDGFEISTPSGISTIQIASKLCSDPNVNPTGLGARDS +LRLEAGLCLYGNDLNETINPIMGTLAWTLGKGGPNARRRQEQDFTGASTFLKEDGKLKKQARKRIGIIGM +KAPARQYTEIYDVDGIQLIGEITSGTFSPCLKKPIAMGYIDTIMSKNDTPIKLKIRNKMVDAHVTKMPFV +ESNYYRVPE +>gi|EWM26666.1|GDCT|Nannochloropsis_gaditana +MAPRESDWFCGGAYMSAGGINLASPLDLMTPHQKLSSDDRALEKTALFDMHLEMKGKMVPFAGYELPVLY +EMPEWGGIVKEHLHCRAKASVFDVSHMGQIKWHGKDRVKFLETLVVGDVAGLGVGEARLSLLTNKDGGII +DDTIITNAGDYTYMIVNGATKGGDMAHFKEQMESFKGDVCFEYFHEQQLLALQGPSAAETLQALLPADVD +LSKVNFMTGFDTTVGGLQARVTRCGYTGEDGFEVSVAWKDARALAELFLEGPGIRLAGLGARDSLRLESG +LCLYGNDIDATITPVEAALGWTMGGPKGRRRKEQGFLGAEKFLSPEGKFLPISRKRVGLAGFKAPARAHT +EIFDPSGVNKIGEVTSGTFSPSLNKPIAMGYVAKEFSAEGSKVAVKVRGKLQGADVTKMPFVTQHYYKAP +>jgi|Emihu1|422537|estExtDG_fgenesh_newKGs_pm.C_760029 +MPAESELRKTPLHAEHLALGAKMGPFGGWDMPIQYPDGIMKSHLFTRAKAGLFDVSHMLGVVVRGADRAL 
+PDGSGTLSVLTNEAGGIIDDMIITNAARGPSEKRKRGDHLYMVINAGHEDKDLPHMEAQLSKFDASVETL +PNNGILALQGPAAAEVLQALTPVDLSQMPFMSARPMEVAGEQCFVARSGYTGEDGFEIAVPPGGGSQHAV +RGLWSTLLEREDVTPVGLGARDSLRLEAGLCLYGERHASSARNDLDDTTSPVEGALAWVIAKRRRAEDGS +FCGSDRILAELRDKSRTRARCGFVVDQGAPVREGTALRDESGAEVGIVTSGGFSPCLKKGIGMCYVTPGR +NKSGTKLLAEVRGKTQSLTVTKMPFVEQRYYRGP* +>gi|AAL33597.1|glycine_cleavage_complex_T-protein_partial|Zea_mays +MRGGLWQLGQSVTRRLAQAEKKVIARRCFASEADLKKTALYDFHVANGGKMVPFAGWSMPIQYKDSIMDS +TINCRENGSLFDVAHMCGLSLKGKDCIPFLEKLVVGDIAGLAPGTGTLSVLTNEKGGAIDDTVITKVTDD +HIYLVVNAGCREKDLAHIEEHMKAFKAKGGDVSWHIHDERSLLALQGPLAAPVLQHLTKEDLSQVYFGQF +TFLDINGFPCYLTRTGYTGEDGFEISVPNEYAVDLAKAMLEKSEGKVRLTGLGARDSLRLEAGLCLYGND +LEQHITPIEAGLTWAVGKRRRAEGGFLGAEVILKQIADGPPQRRVGFISSGPPARGHSEIQNEKGESIGE +ITSGGFSPCLKKNIAMGYVKSGNHKAGTKVNILVRGKPYEGVVTKMPFVPT +>gi|NP_172650.1|Glycine_cleavage_T-protein_family|Arabidopsis_thaliana +MRGGSLWQLGQSITRRLAQSDKKVVSRRYFASEADLKKTALYDFHVAHGGKMVPFAGWSMPIQYKDSIMD +STVNCRENGSLFDVAHMCGLSLKGKDCVPFLETLVVADVAGLAPGTGSLTVFTNEKGGAIDDSVITKVTD +EHIYLVVNAGCRDKDLAHIEEHMKAFKSKGGDVSWHIHDERSLLALQGPLAAPVLQHLTKEDLSKLYFGN +FQILDINGSTCFLTRTGYTGEDGFEISVPDEHAVDLAKAILEKSEGKVRLTGLGARDSLRLEAGLCLYGN +DMEQHISPVEAGLTWAIGKRRRAEGGFLGADVILQQLKDGPTIRRVGFFSSGPPARSHSEVHDESGNKIG +EITSGGFSPNLKKNIAMGYVKSGQHKTGTKVKILVRGKPYEGSITKMPFVATKYYKPT \ No newline at end of file diff --git a/GK.txt b/GK.txt new file mode 100644 index 0000000..a8f69f7 --- /dev/null +++ b/GK.txt @@ -0,0 +1,31 @@ +>XP_002288791.1|GK|Thalassiosira_pseudonana_CCMP1335 +ASEILAEAIHKYTTTTKLEGVVIVKDDHATPNEIETLKKHNIVVRSASHPVPDVRSVSGANEILQSASNS +DEHTLVIACISGGGSALFCSPRDPLTLEELMATNAALLSSGMSVEKMNVIRKRLENGKGGKLAAAAYPAT +VLTLVLSDIIGDPLDLIASGPTVPDVSSWMDACQLVDEYGLELDDTPKSSHPAFSNMPSHDQLQSETILV +GNNHAAVMAAADMAEKLGYVPVVLGTRVDGEASVVAGVYTSMAEMLTQQRKNDGGKYPIAPLPAALIAGG +ETTVTLPPKCSGKGGRNQELALAAALKLQEMSLRDVVLVSVGTDGTDGPTDAAGAIVDGASITRIEQNNK +NKLSAKETLRNHDAYNFFDSDGDISLIRTGATGTNVADVCITLV +>OEU14163.1|glycerate_kinase|Fragilariopsis_cylindrus_CCMP1102 
+MLSCKYLARTTVVASTYFGTFLPFVSSYSHVVKVLQVGRMSSSFNNYGHDNNLKYPAITTMSSSSSSSSS +SKIRQFSSSSNQEEHMTKDAMQIIHDAIRAVNPYTAIGSNFVRVNDTLKITNKEQQLEYNLPEDYDEIVI +VAFGKASTSMATAVVQQIFPKIKNNGDCDDSTSSGHNIPCRGVVICKDEHITANEREVLTDHGIEAYEAS +HPVPDARSSNAADKLLQMVSSRASPRTLVICCISGGGSSLFCRPTPPLTLQDLQQVNSVLLANGMDIQEM +NVLRKRLEQGKGGRLAAACFPSHVVALILSDVIGDPLDLIASGPTVPDTSTWEDGWRILQQYNLKDKLPK +VVVDMLQNGKNGRLEDSPSADHPVFENTKNILVGNNALAVEAASNTARSLGYNPVVLGTEIEGEAKEIAN +VYTAMASYLQNAFSQKTTKNSITQEQQYMITQSLPTAIIAGGETTVTLTPNSGKGGRNQELALSAALKLE +SLELRNVVLASVGTDGGDGPTDAAGAVVDATTIAGTRSQALEALANHNAYPYLDGLKGTTEWPPLIKTGP +TGTNVADICVTLIKAKPE +>KPK23911.1|glycerate_kinase|Nitrospira_bacterium_SG8_3 +MSVQQSILEEMRNQALEIFQAALRAVEPVEAILKHVKMEGESLLIGKRRMELSKFDRILVVGAGKADAPM +AQAVESLLGERVSDGIIVVKDGHGLPLQRVKVHEASHPVPDERGLGGTEEILSLVSGAGERDLVICLISG +GGSALLVAPAQGVTLKDKQQVTQLLLACGASIHEINTVRKHLSRVKGGGLAHAAHPATLVSLILSDVIGD +DLDTIASGPTVPDSTTFHQAGQILERYGIWDQVPGSVRMYVKKGVKGEIAETPKPGDPSFQRDAWELVGT +NLQALKAARKEAERLGYRTMILSGMMEGETREVAKAHAAIAKEVLNSENPIAPPACVLSGGETTVTLQGD +GKGGRNTEFALASAIALEGVEHVIVLSGGTDGTDGLTDAAGAFADGKTVVRARQGELDPTDYIRRNDSYT +FFETLGGLVITGPTRTNVMDVCVMLVRR +>BAH57057.1|GK|Arabidopsis thaliana] +MVHDYATTTNGTSKRCSALPTTNTVDVSSVSDLFEFICSGPLVNKIGITPQRVGQSIDKWLLYGSQLCRL +FQLNELKLTIPQKARLYHYYIPVFIWCEDQIALHNSKFKDGDDVPPLVIGFSAPQGCGKTTLVFALDYLF +KTTKKKSATISVDDFYLTAEGQAELRKKNPGNALLEYRGNAGSHDLKLSVETLEALSKLTKEGLKMKVPR +YNKSAYSGRGDRADSSTWPEVEGPLSVILFEGWMLGFKPLPADVVKAVDPQLEVVNKNLEAYYDAWDKYI +DAWVVIKIQDPSYVYRWRLQVCLSHNKTKQFLMRFFYTNTVLFLV \ No newline at end of file diff --git a/GOX.txt b/GOX.txt new file mode 100644 index 0000000..aecae2f --- /dev/null +++ b/GOX.txt @@ -0,0 +1,49 @@ +>jgi|Thaps3|406|fgenesh1_pm.C_chr_4000047|GOX_TPS +MHVKICNAGDYQRVARSILPTPLYEYLASGTDDEQTLSENESAFKAWYLRPRVMRPVGSISTVTTLFGQR +LSMPVFVSPAGVHALCDEVHGECAAARACGKVGTIFGLSQHATRSIEQVAEATQGNTNLWYQSYILKDRE +MTLRLARRAAKAGYRGIFLTVDSVRFGFREADARNNFSSLPEPHRLVNYDDEVSQAQHPKKAWVAPEASV +DKSKIYSGQEEAWDQNTEQLFEQNPSWEDVRWLKREVCRDLPLIVKGIMTAEDAIEAKKAGADGVMVSNH 
+GGRGLDSALPTIDVLPEIVAAVGDQFPVLLDSGIRRGTDVLKALALGATAVGIGKPLFFALSVGGEDAVL +NLLQMFQRETEAAMAICGCKSVSDVTRQLVTRHPSGSGRVGKYERSKL +>jgi|Thaps3|3353|fgenesh1_pg.C_chr_3000287|GOX2_TPS +MALNPHKLGVHKILKHIPLNALFDAPSKARLRKAVNIADLRLCAKQRAHKMVFDYLDAGADDEISLRRGK +DAYSELEMHFHILSGLKPPLDLSTKIFGQDVKLPFFGCPTAGNRMFHWEGETAAAKAAQHHGTLYGLSSL +ATTGITEIGKLTDGPKVFQLYVWKDRELVKEVLAKAKEGGFNAMALTVDFTWYGNRERDIRNDFSIPPKY +SMAQIVEAIRKPAWTYDFLSHEPYTYACINTDVPADSLAAFVNSQLCPEFDWRDAEWLLGEWNMPSAVKG +VCRPDDAIKAVETGFTTMWVSNHGARQLETSPATIDVLPSIREAVGPDVEIILDGGVQRGTDICKALALG +ADSVGVGKPYLYGLAAGGTEGVIKAYDILKVELDRAMGLLGAGTVDELKKRGPGLIKRRHASARDYPDRY +AYERGYGGGVI* +>jgi|Phatr2|22568|estExt_gwp_gw1.C_chr_180099|GOX_PTRI +MLEESEKRNLLNVDDYQVLAKTKLPHSLYEYLASGTADATTLRENRDAFARWYLRPRAMRPVGRISTRMV +LFGQGLSMPVFCSPAGVHALCHPDGECATARVCQDLGLLFGLSQHATKSIEQVAAAAPQSHRYYQAYILK +DRSITARLVQRAIQAGYSGIFLTVDSVRFGYREADARNGFDALPSPHRLANYDEVRQQNLDQTYNAKTHL +AWDQNSELLFEQNVSWKDVTWLKEEVCGGLPLIVKGIMTAEDAVLAIEAGADAIMVSNHGGRQLDTCLGS +IDVLPEVVMAVGGRVPVLLDGGVRRGTDVVKALALGAAAVGLGKPLFFALACGGESSLKDMLEILQTEIE +VAMALCGCETISDIQSSHITRHPGGHFQSRL* +>jgi|Phatr2|50804|estExt_fgenesh1_pm.C_chr_40021|GOX2_PTRI +MIFNPHKLGLHKILKHIPLNAIFDAPYKRKLARAVNIADLRLIAKSRAHKMVFDYLDAGADDEISLRRGK +DAYSEFEMHYKVLAGIKPPLDLSTKIFGQDVTLPFFGCPTAGNRMFHWEGETAAAKAAEHHGTMYGLSSL +ATTGITEIGELFNGPKVFQLYVWKDRELVKDVLAKAKEGGFNALALTVDFTWYGNRERDIRNDFSIPPKY +NITQTIEAIRKPAWTYDFLSHEPYTYACINTDVPADSLAAFVNSQLSPEFSWSDAEWLLGEWNGPAAPKG +VVRPEDAKKAIEIGFSSIWVSNHGARQLETSPATIDVLPSIRAAVGPDVEIIMDGGVQRGTDICKALALG +ADAVGVGKPYLWGLAAGGTAGVIKAYDILKVELDRAMGLLGTPTVAALKKEGPSLIKRRPGSARDYPDMY +AYERGYGGGVV* +>gb|Ectocarpus_siliculosus|CBN75171.1|Glycolate_Oxidase_(2-Hydroxyacid_Oxidase) +MGSPEKKVPVDLSRCISLDDFQRQAKPILGKALYEYVASGTDDEQTLSENRQAFKRMFLLPRMMRVVSDI +DLRLDVFGQRLSMPVFVSPAGVHKLMHPEGECATARACAEAGTLMGVSQHATVSLEDVAAAAPRCARWFQ +LYILKDRELTAGILRRSEKAGYTAICLTVDSVRFGSREADWRNNFNGLPPGVTLANYPTQDGYNDRVKDA +WDQNTEKLFDERATWSDIAWLKSLTSLPILVKGILTAQDAVSAVEAGASGVIVSNHGGRALDGSLSSIES 
+LAPVVKAVRSVPTGANVPIFLDSGVRRGTDVLKALALGATAVLLGRPMFFSLAVGGQEGVQRMLSIIRDE +LEAAMALCGCQRLQDITKDLVTDFREGGSTFHRPRL +>jgi|Emihu1|99212|fgeneshEH_pg.18__10 +MRLARRAFSSVPPFAAAVDDALLTRLERAAHRVTTNASICDRHGDDESHHRSVPPSAVVYAHSTEEVQAV +VRVCAETRTPLISFGAGTSLEGHIQAVQGGVCLDLSEMNAVLEVNPEDLDCRVQAGITRKSLNDHLRDTG +LTFPVDPGADASLGGMAACGASGTTAVKYGTMRENCLGLTAVLASGEVVRTGGRARKSSAGYDLTRLLVG +SEGTLAVLTEVQLKLYPLPAAVSAATCSFPTLSDAARAVAGLLQCGVPVSRSELLDASAIAAFNKYSTEV +ADLQEAPTLFLEVEGVSEAAVEAAAAVARECCADSGGGEFQWATSESERRRLWAARHATYYASLALRPGS +RGVVTDAVVPLSRLAEVMGETAADVAEAGVVGPIFGHAGDGNFHCILLLRDEDPPDYVERLSQLNDRLIR +RTLAAGGSCTGEHGVGVGKKQYLAREFGEGAVEMMRTVKRSLDPLGILNPGKVVDVSKHEAVL* +>gi|T002129.1|Arabidopsis_thaliana|glycolate_oxidase_translation|Arabidopsis_thaliana +MEITNVTEYDAIAKAKLPKMVYDYYASGAEDQWTLQENRNAFARILFRPRILIDVNKIDMATTVLGFKISMPIMVAPTAFQKMAHPDGEYATARAASAAGTIMTLSSWATSSVEEVASTGPGIRFFQLYVYKNRKVVEQLVRRAEKAGFKAIALTVDTPRLGRRESDIKNRFTLPPNLTLKNFEGLDLGKMDEANDSGLASYVAGQIDRTLSWKDIQWLQTITNMPILVKGVLTGEDARIAIQAGAAGIIVSNHGARQLDYVPATISALEEVVKATQGRVPVFLDGGVRRGTDVFKALALGASGIFIGRPVVFALAAEGEAGVKKVLQMLRDEFELTMALSGCRSLSEITRNHIVTEWDTPRHLPRL +>gi|BAA82872.1|GOX|Homo_Sapien +mlprlicindyeqhaksvlpksiydyyrsgandeetladniaafsrwklyprmlrnvaetdlstsvlgqrvsmpicvgatamqrmahvdgelatvracqslgtgmmlsswatssieevaeagpealrwlqlyiykdrevtkklvrqaekmgykaifvtvdtpylgnrlddvrnrfklppqlrmknfetstlsfspeenfgddsglaayvakaidpsiswedikwlrrltslpivakgilrgddareavkhglngilvsnhgarqldgvpatidvlpeiveavegkvevfldggvrkgtdvlkalalgakavfvgrpivwglafqgekgvqdvlxilkeefrlamalsgcqnvkvidktlvrknplavski diff --git a/GlcDH.txt b/GlcDH.txt new file mode 100644 index 0000000..73f3e45 --- /dev/null +++ b/GlcDH.txt @@ -0,0 +1,72 @@ +>sp|P0AEP9.1|GLCD_ECOLI|Glycolate_oxidase_subunit_GlcD +MSILYEERLDGALPDVDRTSVLMALREHVPGLEILHTDEEIIPYECDGLSAYRTRPLLVVLPKQMEQVTA +ILAVCHRLRVPVVTRGAGTGLSGGALPLEKGVLLVMARFKEILDINPVGRRARVQPGVRNLAISQAVAPH +NLYYAPDPSSQIACSIGGNVAENAGGVHCLKYGLTVHNLLKIEVQTLDGEALTLGSDALDSPGFDLLALF +TGSEGMLGVTTEVTVKLLPKPPVARVLLASFDSVEKAGLAVGDIIANGIIPGGLEMMDNLSIRAAEDFIH 
+AGYPVDAEAILLCELDGVESDVQEDCERVNDILLKAGATDVRLAQDEAERVRFWAGRKNAFPAVGRISPD +YYCMDGTIPRRALPGVLEGIARLSQQYDLRVANVFHAGDGNMHPLILFDANEPGEFARAEELGGKILELC +VEVGGSISGEHGIGREKINQMCAQFNSDEITTFHAVKAAFDPDGLLNPGKNIPTLHRCAEFGAMHVHHGH +LPFPELERF +>sp|P52073.1|GLCE_ECOLI|Glycolate_oxidase_subunit_GlcE +MLRECDYSQALLEQVNQAISDKTPLVIQGSNSKAFLGRPVTGQTLDVRCHRGIVNYDPTELVITARVGTP +LVTIEAALESAGQMLPCEPPHYGEEATWGGMVACGLAGPRRPWSGSVRDFVLGTRIITGAGKHLRFGGEV +MKNVAGYDLSRLMVGSYGCLGVLTEISMKVLPRPRASLSLRREISLQEAMSEIAEWQLQPLPISGLCYFD +NALWIRLEGGEGSVKAARELLGGEEVAGQFWQQLREQQLPFFSLPGTLWRISLPSDAPMMDLPGEQLIDW +GGALRWLKSTAEDNQIHRIARNAGGHATRFSAGDGGFAPLSAPLFRYHQQLKQQLDPCGVFNPGRMYAEL +>sp|Q55124|Q55124_SYNY3|Glycolate_oxidase_subunit_GlcD|Synechocystis +MAIFSPVNAVTDIIPQLEKIVGQDGVIKRKDELFTYECDGLTGYRQRPALVVLPRTTEQVATIVKLCHDR +QIPWIARGAGTGLSGGALPGADSLLIVTTRMRQILAVDYDNQTIVVQPGVVNNWVTQTVSGAGFYYAPDP +SSQIVCSIGGNIAENSGGVHCLKYGTTTNHVLGLKLVIPDGSIVEVGGQVPETPGYDLTGLFVGSEGTLG +IATEITLKILKTPESICVVLADFLSLEATAQSVADIIAAGIVPAGMEIMDNFSINAVEDVVATNCYPRDA +AAILLVELDGLPIEVELNQAKVEEICRNNGARNTAIAYDQETRLKMWKGRKAAFAAAGKLSPSYFVQDGV +VPRTQLVQILSDINDLSKKYGFAIANVFHAGDGNLHPLILYDQKVPGAWEKVEELGGEILKRCVELGGSL +SGEHGIGIDKNCFMPNMFNEVDLETMQWVRQCFNPDNLANPGKLFPTPRSCGEVANAQRLNLGQDKKMEE +IY +>gi|BAA18106.1|slr0806|Synechocystis +MDWSAIAASLTTQGLEVIQDPQQRKKLSTDYAHFSPILMAQLEGKQADLVVLARSEPEAIAVIRCCVANQ +IPLTVRGAGTGNYGQCVPLEGGIVLDLSPMQRIISLEPGRAVVEPGVKLGKLEQQAKQMGWELRLLPSTY +QTATVGGFVSGGSTGMGAVNYGTLFDPGNVQSLTVLTMEAEPQRLILSGEAAQPVIHGYGTNGIITEITL +PLTPALPWREAIVSFTNLSSAIAFAQNLAHQDGIVSKEISIQADPIPQYFSSLKSYYQPGAHWVMVIVSE +LDWLAFTQLAKASKGEIIFEQDPQSPGKKINLIEFNWNHTTLLARAVDPSLTYLQVFFYRDVEQILALAK +LFKDEIMFHIEIMRIQGQMCLAGFPLVKFINGDRLEEIMAAHQNLGARIANPHTYSLAGGSVQPLPESQL +IFKRQVDPLNLLNPGKLTD +>gi|BAA16857.1|glycolate_oxidase_subunit_GlcE|Synechocystis +MPMAVVSLPFSPQNFPHSSSCSVQDLPPHQQMAIAQALAEPEHAPSHWVAPESQQELQCLLSECDRNNWP +VIPCGNQSKLAWGGLAKPVQLLVSSAGLNRIVDHAVADLTVTVEAGVKLKDLQAILQPHQQFLPLNPLYH +DQATVGGIMATGCAGPWQQRYGGVRDLVLGFSFVRWDGQLAKAGGRVVKNVAGYDLMKLFVGSYGTLGFI 
+SQITFRLYPLPSHSQTVFLTGDTNQLAKLSQALRRSGLAPTAALICSPALVQALNLGEELGLLVRFQNLE +PVVQAQIDEVKKLAQTLTLASQSFDNQAESELWQRWENAMAGQGTTETILCKFGLLPAKAADFLQQLPGL +GHVQLGNGIGWVRFGQLDREKLNQQRQICQNYGGYVTVLEASPECKKHWDVWGQSGHGLAMMGRLKNQFD +PHNTFSPGRFVGGF +>sp|Q0ZAZ1|Q0ZAZ1_CHLRE|Glycolate_dehydrogenase|Chlamydomonas +MPRGQGKRLAQLLGAQLKQYAAEVRGISTAGGASRGGARGPASPSSLEQQTRQVAQVAVQQSTQQAVKVV +VPAIKVDLVGAVSSVSESDKVEPGVFKNVDGHRFEDGRYAAFVEEITKFIPKERQYSDPVRTFAYGTDAS +FYRLNPKLVVKVHNEDEVRRIMPIAERLQVPITFRAAGTSLSGQAITDSVLIKLSHTGKNFRNFTVHGDG +SVITVEPGLIGGEVNRILAAHQKKNKLPIQYKIGPDPSSIDSCMIGGIVSNNSSGMCCGVSQNTYHTLKD +MRVVFVDGTVLDTADPNSCTAFMKSHRSLVDGVVSLARRVQADKELTALIRRKFAIKCTTGYSLNALVDF +PVDNPIEIIKHLIIGSEGTLGFVSRATYNTVPEWPNKASAFIVFPDVRAACTGASVLRNETSVDAVELFD +RASLRECENNEDMMRLVPDIKGCDPMAAALLIECRGQDEAALQSRIEEVVRVLTAAGLPFGAKAAQPMAI +DAYPFHHDQKNAKVFWDVRRGLIPIVGAAREPGTSMLIEDVACPVDKLADMMIDLIDMFQRHGYHDASCI +GHALEGNLHLVFSQGFRNKEEVQRFSDMMEEMCHLVATKHSGSLKGEHGTGRNVAPFVEMEWGNKAYELM +WELKALFDPSHTLNPGVILNRDQDAHIKFLKPSPAASPIVNRCIECGFCESNCPSRDITLTPRQRISVYR +EMYRLKQLGPGASEEEKKQLAAMSSSYAYDGEQTCAADGMCQEKCPVKINTGDLIKSMRAEHMKEEKTAS +GMADWLAANFGVINSNVPRFLNIVNAMYSVVGSAPLSAISRALNAATNHFVPVWNPYMPKGAAPLKVPAP +PAPAAAEASGIPRKVVYMSSCVTRMMGPAASDTETAAVHEKVMSLFGKAGYEVIIPEGVASQCCGMMFNS +RGFKDAAASKGAELEAALLKASDNGKIPIVIDTSPCLAQVKSQISEPSLRFALYEPVEFIRHFLVDKLEW +KKVRDQVAIHVPCSSKKMGIEESFAKLAGLCANEVVPSGIPCCGMAGDRGMRFPELTGASLQHLNLPKTC +KDGYSTSRTCEMSLSNHAGINFRGLVYLVDEATAPKKQAAAAKTA +>gb|XP_002178591.1|glycolate_oxidase|Phaeodactylum_tricornutum +MIFNPHKLGLHKILKHIPLNAIFDAPYKRKLARAVNIADLRLIAKSRAHKMVFDYLDAGADDEISLRRGK +DAYSEFEMHYKVLAGIKPPLDLSTKIFGQDVTLPFFGCPTAGNRMFHWEGETAAAKAAEHHGTMYGLSSL +ATTGITEIGELFNGPKVFQLYVWKDRELVKDVLAKAKEGGFNALALTVDFTWYGNRERDIRNDFSIPPKY +NITQTIEAIRKPAWTYDFLSHEPYTYACINTDVPADSLAAFVNSQLSPEFSWSDAEWLLGEWNGPAAPKG +VVRPEDAKKAIEIGFSSIWVSNHGARQLETSPATIDVLPSIRAAVGPDVEIIMDGGVQRGTDICKALALG +ADAVGVGKPYLWGLAAGGTAGVIKAYDILKVELDRAMGLLGTPTVAALKKEGPSLIKRRPGSARDYPDMY +AYERGYGGGVV +>gb|XP_002183215.1|glycolate_oxidase|Phaeodactylum_tricornutum 
+MLEESEKRNLLNVDDYQVLAKTKLPHSLYEYLASGTADATTLRENRDAFARWYLRPRAMRPVGRISTRMV +LFGQGLSMPVFCSPAGVHALCHPDGECATARVCQDLGLLFGLSQHATKSIEQVAAAAPQSHRYYQAYILK +DRSITARLVQRAIQAGYSGIFLTVDSVRFGYREADARNGFDALPSPHRLANYDEVRQQNLDQTYNAKTHL +AWDQNSELLFEQNVSWKDVTWLKEEVCGGLPLIVKGIMTAEDAVLAIEAGADAIMVSNHGGRQLDTCLGS +IDVLPEVVMAVGGRVPVLLDGGVRRGTDVVKALALGAAAVGLGKPLFFALACGGESSLKDMLEILQTEIE +VAMALCGCETISDIQSSHITRHPGGHFQSRL diff --git a/HR.txt b/HR.txt new file mode 100644 index 0000000..9804d90 --- /dev/null +++ b/HR.txt @@ -0,0 +1,28 @@ +>jgi|Thaps3|2846|fgenesh1_pg.C_chr_2000801|HR +MAAAATRFLMRRPTAIFLNSSRLDYDKALDFSLLSRLTDLTLNNVDSISSVDEIVQKVVDSKAEIVITKE +MEVPLEALERLPTSVKLWCEAGTGYNNIPIAQARKQSIDVVNIPTYSTASVAHMVITYIMSFSSAIFKQA +KMLHDGDQTNFRVFQHPIYEITAKKLGLIGGSGTIGTAVIDVALPLGMDVLVSSRSGKLPSGHKYESNPR +VKVVSLDELLSTSDYVSINCPLNSDTRHSIGEREIRLMKPTAFLINTARGAIINEAELIQCMKENVIAGA +GLDTQEMEPPKPDSDLWKLDNVFLTPHIGWRRLETRQRLVDMTTDNIDHYIKGELQNVVN* +>jgi|Phatr2|56499|AGR_Contig1088|HR_PTRI +MRTRIVSATAILPLFHNLGRRRFAERKRVGLILALFSTPLSNCNGRRRAVSAFLSPTNTLSSTKTPSKSF +LYYSTKSRTSRSVTELLRLRASSLCLRSPKSTRTLSVMSLGAVSTESSERLVTTVTDHARTMADAAIHSV +DPVTAVRDHVRKLVDLSSTAAANHTSKPGTKATLLHIGIDPHNMVNLSLSDYDHILVVAFGKASSAMATA +LLERLTEGQPATNQLPSISGLVIVKDGHATPQQLEILQQSRYNISVREASHPVPDQRGVDASRKLLDLVH +TYASPRTLVFALLSGGGSALFCAPHESLTLLDLQQTNQALLQSGWSITDMNVVRKRLETGKGGRLAAAAH +PGTVVSLILSDVLGDPLDLIASGPTVPDTSTWSDAWALAETLPEKALPDAVRRLMRAGVDGHLPDSPSPS +HGVFARAVTCLVGNNAKAVTAAATTAQRLGYHPVILGTRTEGEARQVARWLVQLAQHLALPETPSKQFSL +ASLPAALICGGETTVTLPEQSQKHGKGGRNQELALAAALELQRVGLNSKNDVVVVVASVGTDGTDGPTDA +AGAIVDGHTVDRLPGDALLALETHNAYPYLAQTDANGRSPLLKTGPTGTNVADVYLVLIQKSRLK* +>gb|SJN46695.1|Hydroxypyruvate_reductase|Pseudoalteromonas_sp.JB197 +MKITILDNATLAKTSLDCIAQLGELTVHELTSAEQVVAHSKNADVLITNKAVVNRETMSQLKSLKLICVS +ATGTNNVDLVAAKELGIAVTNVAGYSTPSVVQHTFSLITNLLGNTHRYQADCQQGAWQKSEMFCRLDYSF +NDLQDKTFAIIGGGTLGSAVATVASAFGANVITAERKGAQCREGRIPFEQAIKTADIISVHCPLTDETRD +LITLNELKIMKPSSIIINTARGGIINEADLATALEQNLIAGAGVDVLTKEPAELTNPLANYKGNNLLLTP +HIAWASTESIVRLVNEVSLNIMAFTQQQSRNRLV 
+>Q9CA90.1|HPR2|Arabidopsis_thaliana +MESIGVLMMCPMSSYLENELEKRFNLLRFWTSPEKSVLLETHRNSIRAVVGNASAGADAQLISDLPNLEI +VSSFSVGLDKIDLGKCKEKGIRVTNTPDVLTEDVADLAIGLILALLRRLCECDRYVRSGKWKQGEFQLTT +KFSGKSVGIIGLGRIGTAIAKRAEAFSCPINYYSRTIKPDVAYKYYPTVVDLAQNSDILVVACPLTEQTR +HIVDRQVMDALGAKGVLINIGRGPHVDEQELIKALTEGRLGGAALDVFEQEPHVPEELFGLENVVLLPHV +GSGTVETRNAMADLVVGNLEAHFSGKSLLTPVV \ No newline at end of file diff --git a/ICL.txt b/ICL.txt new file mode 100644 index 0000000..c58f67a --- /dev/null +++ b/ICL.txt @@ -0,0 +1,38 @@ +>jgi|Thaps3|35523|e_gw1.7.84.1|ICL_TPS +MRDIAIIEQWWRDPRWKGTKRTYSASDVASLRNSSEARGSSFVNPKCSYSNRSSRKLYQLLTSLHAAGGY +SHTFGALDPVQVVQMAPHLSSIYISGWQCSSTASSTNEPGPDFADYPMNTVPLKCDQLVRAQLHHDRRQS +EERASAILSNKTPAPKVDYLTPIVADGDTGHGGLSAVMKLVKLFVEAGAAGVHFEDQKPGTKKCGHMGGK +VLVSTQEHVDRLVAARLAADVLGVELVIVARTDAEAATLLDSNVDGRDHPFILGATVPGTIPMNEAMKSA +SASGGGNAANMENEWNAKARPMTFGEAVLDKILSSGVSQRKKDEMSRMWYASDPDTLSNANARRIADSIF +GAKNSIYFDWEACRVREGYYRVKPGIEYCIQRARAYAPYADLIWMETATPGIPDARKFSEGVKKVYPNQM +LAYNLSPSFNWDASGMTDDELARFNDDLGRLGYTWQFITLAGFHSNGLVVTKLARSFGDEGMLAYVREIQ +RQEKEEEVELLKHQKWSGAELVDRMVNVASGGQSSTAAMGAGVTEDQFGKH* +>jgi|Phatr2|51088|estExt_fgenesh1_pm.C_chr_150006|ICL_PTRI +MKFTVASISGSSSASPTSNGATKEDVAKPKIPRGLRALPAAAASSRVSPGSELGMLRSEASSIDQWWKDP +RWKNTTRVYSSTDVACLRPSAQARNNLRQAPGVSFSSQQSDKLWSLLVQLQARKGYSHTFGALDPVQVTQ +MAPHLSSIYVSGWQCSSTASSTNEPGPDFADYPMNTVPNKVDQLVRAQLHHDRRQQQERSEALLAGKDPG +QPVDYLRPIVADADTGHGGLSAVMKLTKLMVEAGAAGMHLEDQKPGTKKCGHMGGKVLVSTQEHIDRLVA +SRLASDILGVNLILVARTDAEAATLLDSNIDGRDHPFILGVTTPGMPTLQDAVAKAPAGQANQVTTEWTK +QANLMTFGEAVLATIQRSSKPAYQKRQMEQRWMASNPNTLSNAQARRMADEILGQANAVNFDWESCRVRE +GYYQLRPGIEYCIQRAIAYAPYADLIWMETKIPAIDDAAQFSRGVHAVHPHQMLAYNLSPSFNWDASGMT +DSQIASFNDDLGRLGYVWQFITLAGFHGNGLVMTKLARAYGDRGMIAYVEQIQRQERIHKVELLTHQKWS +GAELVDQMVNVASGGVSSTAAMGAGVTEAQFGH* +>gi|Q9SE26.1|Isocitrate_lyase|Dendrobium_crumenatum +MASSSVPPMITEEEARFEAEVSAVESWWRTDRFRLTRRPYSARDVVSLRGTLHHSYASDQMAKKLWRTLK +SHQSAGTASRTFGALDPVQVTMMAKHLDTIYVSGWQCSSTHTATNEPGPDLADYPYNTVPNKVEHLFFAQ 
+LYHDRKQHEARVSMTREQRAKTPYVDYLRPIIADGDTGFGGATATVKLCKLFVERGAAGVHIEDQSSVTK +KCGHMAGKVLVAVSEHINRLVAARLQFDVMGVETVLVARTDAVAATLIQSNVDLRDHQFILGATNPDFKR +RSLAAVLSAAMAAGKTGAVLQAIEDDWLSRAGLMTFSDAVINGINRQNLPEYEKQRRLNEWAAATEYSKC +VSNEQGREIAERLGAGEIFWDWDIARTREGFYRFRGSVEAAVVRGRAFAPHADLIWMETSSPDLVECGKF +AQGMKASHPEIMLAYNLSPSFNWDAAGMTDEEMRDFIPRIAKMGFCWQFITLGGFHADALVTDTFAREFA +KQGMLAYVERIQREERNNGVDTLAHQKWSGANYYDRYLKTVQGGISSTAAMGKGVTEEQFKEESRTGTRG +LDRGGITVNAKSRL +>gb|AAA33976.1|glyoxysomal_isocitrate_lyase, partial|Glycine_max +EAEVAEVQAWWNSERFRLTKRPYTARDVVSLRGNLRQTYASNEMAKKLWRLLKNHQANGTASRTFGALDP +VQVTQMAKHLDTIYVSGWQCSATHTTSNEPGPDLADYPYDTVPNKVEHLFFAQQYHDRKQKEERMRMSRE +ERARTPYVDYLRPIIADGDTGFGGTTATVKLCKLFVERGAAGIHIEDQSSVTKKCGHMAGKVLVAISEHI +NRLVAARLQFDVMGVETVLVARTDAEAANLIQSNIDTRDHQFILGVTNPNLKGKSLATLMQQGMAAGKNG +AELQALEDEWLSKAQLKTLSEAVVEAIERQNNIGEEEKRRKLNEWMHHSSYERCLSNEEGREIAEKLGVR +NLFWDWDLPRTREGFYRFKGSVTASVVRGCAFSPHADVIWMETASPNVVECTEFSEGVRSKHPQMMLGYN +LSPSFNWDASGMSDEQMKDFIPKIAKLGYVWQFITVGGLHSNALITSTFARDFANRGMLAYVERIQREER +NNGVDTLAHQKWAGANYYDRYLKTVQGGVASTAAMGKGVTEEQFKESWTRSGAVNIDRGSIVVAKARM diff --git a/MAGIC_HMM.sh b/MAGIC_HMM.sh new file mode 100644 index 0000000..74cb720 --- /dev/null +++ b/MAGIC_HMM.sh @@ -0,0 +1,10 @@ +#! 
/usr/bin/env bash +# + +for i in SLC4 Bestrophin CA_beta CA_delta CA_zeta CA_alpha GOX GDCT PGP GCL HR SPT TSR ICL PK PEPC PEPCK MDH OMT ME PPDK PYC SHMT MS GlcDH ALAT_GGAT GK + do + #mafft "$i".txt> "$i"_aln.txt + #./hmmbuild "$i".hmm "$i"_aln.txt + ./hmmsearch -o "$1"_"$i"_hmmout.csv --tblout "$1"_"$i"_HMM.csv "$i".hmm "$1" + done + \ No newline at end of file diff --git a/MASTER_pepTOhmm.ipynb b/MASTER_pepTOhmm.ipynb new file mode 100644 index 0000000..84f0bc7 --- /dev/null +++ b/MASTER_pepTOhmm.ipynb @@ -0,0 +1,2097 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 1, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import sys\n", + "from ftplib import FTP #import the ftp library\n", + "import re \n", + "import os\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 2, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "#cwd = os.getcwd()\n", + "#print cwd" + ] + }, + { + "cell_type": "code", + "execution_count": 3, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "## different classes\n", + "#t='Dinophyceae'\n", + "#t='Bacillariophyta'\n", + "t='Haptophyta'\n", + "#t='Raphidophyceae'\n", + "need='mmetsp_taxonomy.txt'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### MAFFT and HMMbuild in shell" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "###run mafft and hmmbuild first to make script faster\n", + "#os.system('./mafft_hmmbuil.sh')" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Species list for HMMFUNCTION" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "will not retrive files, but will create a list with names that we can pass to bash and other functions" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + 
"collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "set(['Exanthemachrysis', 'Gephyrocapsa', 'Chrysochromulina', 'Isochrysis', 'Pleurochrysis', 'Pavlova', 'Phaeocystis', 'Coccolithus', 'Imantonia', 'Unidentified eukaryote', 'Prymnesium', 'Chrysoculter', 'Scyphosphaera', 'Emiliania', 'Calcidiscus'])\n" + ] + } + ], + "source": [ + "mt=open('mmetsp_taxonomy.txt','r')\n", + "g=[] #make an empty list to store genus names\n", + "for line in mt:\n", + " if re.search(t,line): #if taxa name in line\n", + " g= g+line.split('\\t')[7:8]#pull out the 8th field should be genus, keeping as list\n", + "\n", + "g=set(g) #keep only unique genus names\n", + "print g\n", + "#close the taxonomy file\n", + "mt.close()" + ] + }, + { + "cell_type": "code", + "execution_count": 5, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Gephyrocapsa-oceanica-RCC1303.pep.fa.gz', 'Chrysochromulina-polylepis-CCMP1757.pep.fa.gz', 'Isochrysis-galbana-CCMP1323.pep.fa.gz', 'Isochrysis-sp-CCMP1244.pep.fa.gz', 'Isochrysis-sp-CCMP1324.pep.fa.gz', 'Pleurochrysis-carterae-CCMP645.pep.fa.gz', 'Pavlova-sp-CCMP459.pep.fa.gz', 'Prymnesium-parvum-Texoma1.pep.fa.gz', 'Emiliania-huxleyi-374.pep.fa.gz', 'Emiliania-huxleyi-379.pep.fa.gz', 'Emiliania-huxleyi-CCMP370.pep.fa.gz', 'Emiliania-huxleyi-PLYM219.pep.fa.gz']\n" + ] + } + ], + "source": [ + "ftp= FTP('ftp.imicrobe.us') #set home ftp server\n", + "ftp.login() #log in\n", + "ftp.cwd('camera/combined_assemblies') #ch\n", + "\n", + "files=ftp.nlst() #make a list of all files and directories in wd\n", + "delimiter=' '\n", + "all=delimiter.join(files)\n", + "\n", + "names=[]\n", + "\n", + "for genus in g:\n", + " string= genus+\"\\S*.pep.fa.gz\"\n", + " taxafiles=re.findall(string, all)\n", + " #print \"{} files matching genus=\".format(len(taxafiles))+genus\n", + " #print taxafiles\n", + " if len(taxafiles) > 0:\n", + " for filex in 
taxafiles:\n", + " command = \"RETR \"+filex\n", + " outfile = filex\n", + " #ftp.retrbinary(command, open(outfile, 'wb').write)\n", + " names.append(outfile)\n", + " \n", + "ftp.quit()\n", + "\n", + "print names\n" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Gephyrocapsa-oceanica-RCC1303.pep.fa.gz', 'Chrysochromulina-polylepis-CCMP1757.pep.fa.gz']\n" + ] + } + ], + "source": [ + "names=names[:2]\n", + "print names" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### Retrieve PEP.fa in shell" + ] + }, + { + "cell_type": "code", + "execution_count": 147, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/plain": [ + "0" + ] + }, + "execution_count": 147, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "###run retriver in shell\n", + "os.system('python ./MMETSP_sample_import.py {} {}'.format(t,need))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "#### Fix error on Dinos" + ] + }, + { + "cell_type": "code", + "execution_count": 6, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "30\n", + "27\n" + ] + } + ], + "source": [ + "##the last species is not available for some reason so I removed it\n", + "print len(names)\n", + "\n", + "if t=='Dinophyceae':\n", + " names.remove('Durinskia-baltica-CSIRO_CS-38.pep.fa.gz')\n", + " names.remove('Oxyrrhis-marina-CCMP1795.pep.fa.gz')\n", + " names.remove('Alexandrium-fundyense-CCMP1719.pep.fa.gz')\n", + " \n", + "print len(names)" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### GET counts" + ] + }, + { + "cell_type": "code", + "execution_count": 115, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "names=[i.split('.')[0] for i in names] #removes 
.pep.fa.gz from the names\n", + "\n", + "ftp= FTP('ftp.imicrobe.us') #set ftp server\n", + "ftp.login() #log in\n", + "ftp.cwd('camera/combined_assemblies') #change directory\n", + "\n", + "t='/Users/maria_hernandez/Documents/Big_Data3050/CMM_MoreSP/' #location for files\n", + "for ID in names:\n", + " #change to taxa directory/readcounts\n", + " ripdir= ID+\"/readcounts\"\n", + " ftp.cwd(ripdir) #change directory\n", + " savefile= t+ID+\"_cds_counts.txt\" #saves files with unique names\n", + " ftp.retrbinary('RETR cds.dat', open(savefile, 'wb').write)\n", + " ftp.cwd(\"~/camera/combined_assemblies\") #change directory to restart loop in right place\n", + " \n", + "ftp.quit() " + ] + }, + { + "cell_type": "code", + "execution_count": 16, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "names='Chrysochromulina-polylepis-CCMP1757.pep.fa.gz'" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### RUN HMM in shell" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "for i in names:\n", + " os.system('./MAGIC_HMM.sh {}'.format(i))" + ] + }, + { + "cell_type": "code", + "execution_count": 22, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "for i in names:\n", + " os.system('./MAGIC_one.sh {}'.format(i))" + ] + }, + { + "cell_type": "markdown", + "metadata": {}, + "source": [ + "### HMM READ" + ] + }, + { + "cell_type": "code", + "execution_count": 8, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def readHMM(Organism,gene_name):\n", + " \"\"\" Takes in organism and gene_name from HMM results and makes a table.\n", + " HMM results from --tblout that have the following name organism_genename_HMM.csv\n", + " Note: pep.fa files differ in structure and it can affect how the HMM output is written. 
If you can't read the file in \n", + " modify the fuction\"\"\"\n", + "\n", + " hold=pd.read_csv('{}_{}_HMM.csv'.format(Organism,gene_name),delim_whitespace=True,engine='python',skiprows=2,skipfooter=10)\n", + " #empty files have 12 rows so the following if statement will only work on files that are not empty\n", + " \n", + " if hold.shape[0]!=0:\n", + " readX=pd.read_csv('{}_{}_HMM.csv'.format(Organism,gene_name),delim_whitespace=True,engine='python',skiprows=3,\n", + " header=None ,skipfooter=10)\n", + " \n", + " samplenames=[]\n", + " for i in np.arange(1,readX.shape[1]+1):\n", + " samplenames.append(str(i))\n", + "\n", + " readX.columns=samplenames\n", + " \n", + " new=pd.DataFrame()\n", + " \n", + " new['CAMPEPid']=readX['1']\n", + " new['contig']=readX['19']\n", + " new['Evalue']=readX['5']\n", + " new['Annotation']='{}'.format(gene_name)\n", + "\n", + " new.contig=new.contig.str.split(\"|\").str[1]\n", + " new.contig=new.contig.str.split(\"_\").str[0]\n", + " return new\n", + " #return readX" + ] + }, + { + "cell_type": "code", + "execution_count": 9, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CAMPEPidcontigEvalueAnnotation
0CAMPEP_01937945061699641.600000e-125Bestrophin
1CAMPEP_01937836681409361.900000e-44Bestrophin
2CAMPEP_01937954301704281.400000e-43Bestrophin
3CAMPEP_01937888501559111.300000e-25Bestrophin
4CAMPEP_0193725542155314.000000e-24Bestrophin
\n", + "
" + ], + "text/plain": [ + " CAMPEPid contig Evalue Annotation\n", + "0 CAMPEP_0193794506 169964 1.600000e-125 Bestrophin\n", + "1 CAMPEP_0193783668 140936 1.900000e-44 Bestrophin\n", + "2 CAMPEP_0193795430 170428 1.400000e-43 Bestrophin\n", + "3 CAMPEP_0193788850 155911 1.300000e-25 Bestrophin\n", + "4 CAMPEP_0193725542 15531 4.000000e-24 Bestrophin" + ] + }, + "execution_count": 9, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "readHMM('Chrysochromulina-polylepis-CCMP1757.pep.fa.gz','Bestrophin').head()" + ] + }, + { + "cell_type": "code", + "execution_count": 30, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def makeHMM(Organism,gene_type):\n", + " \"\"\"Combines the plots from each gene_type into a table\"\"\"\n", + " if gene_type=='CF':\n", + " a=readHMM(Organism,'PK')\n", + " b=readHMM(Organism,'PEPC')\n", + " c=readHMM(Organism,'PEPCK')\n", + " d=readHMM(Organism,'PK')\n", + " e=readHMM(Organism,'MDH')\n", + " f=readHMM(Organism,'OMT')\n", + " g=readHMM(Organism,'PYC')\n", + " h=readHMM(Organism,'PPDK')\n", + " i=readHMM(Organism,'ME')\n", + " \n", + " frames = [a,b,c,d,e,f,g,i,h]\n", + " result = pd.concat(frames)\n", + " return result\n", + " if gene_type=='PR':\n", + " a=readHMM(Organism,'SHMT')\n", + " b=readHMM(Organism,'GOX')\n", + " c=readHMM(Organism,'GDCT')\n", + " d=readHMM(Organism,'PGP')\n", + " e=readHMM(Organism,'ICL')\n", + " f=readHMM(Organism,'GCL')\n", + " g=readHMM(Organism,'HR')\n", + " h=readHMM(Organism,'SPT')\n", + " i=readHMM(Organism,'TSR')\n", + " j=readHMM(Organism,'MS')\n", + " k=readHMM(Organism,'GlcDH')\n", + " l=readHMM(Organism,'ALAT_GGAT')\n", + " m=readHMM(Organism,'GK')\n", + " \n", + " frames = [a,b,c,d,e,f,g,h,i,j,k,l,m]\n", + " result = pd.concat(frames)\n", + " return result\n", + " if gene_type=='BP':\n", + " a=readHMM(Organism,'CA_alpha')\n", + " b=readHMM(Organism,'CA_delta')\n", + " c=readHMM(Organism,'CA_beta')\n", + " #d=readHMM(Organism,'Ca_zeta')\n", + 
" e=readHMM(Organism,'Bestrophin')\n", + " f=readHMM(Organism,'SLC4')\n", + "\n", + " frames = [a,b,c,e,f]\n", + " result = pd.concat(frames)\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 13, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
CAMPEPidcontigEvalueAnnotation
0CAMPEP_019371699810130_18.300000e-164CA_delta
1CAMPEP_019372011412139_12.500000e-144CA_delta
2CAMPEP_019374095025073_12.000000e-112CA_delta
3CAMPEP_019374421027355_11.700000e-97CA_delta
0CAMPEP_019373163219523_17.400000e-17CA_beta
1CAMPEP_019373904823878_14.500000e-07CA_beta
0CAMPEP_0193794506169964_11.600000e-125Bestrophin
1CAMPEP_0193783668140936_11.900000e-44Bestrophin
2CAMPEP_0193795430170428_11.400000e-43Bestrophin
3CAMPEP_0193788850155911_11.300000e-25Bestrophin
4CAMPEP_019372554215531_14.000000e-24Bestrophin
5CAMPEP_01937052541979_12.600000e-23Bestrophin
6CAMPEP_0193801006173214_15.100000e-23Bestrophin
7CAMPEP_01937124286963_17.900000e-23Bestrophin
8CAMPEP_019374279826324_11.400000e-21Bestrophin
9CAMPEP_019372768016768_17.400000e-21Bestrophin
10CAMPEP_019374114625209_14.600000e-20Bestrophin
11CAMPEP_019373791223215_13.300000e-18Bestrophin
12CAMPEP_019373153419445_11.400000e-12Bestrophin
13CAMPEP_01937049421794_13.800000e-12Bestrophin
14CAMPEP_01937069603047_15.300000e-12Bestrophin
15CAMPEP_01937148208398_15.500000e-12Bestrophin
16CAMPEP_019376449856150_11.500000e-11Bestrophin
17CAMPEP_019373524621696_12.000000e-09Bestrophin
18CAMPEP_019376148251549_12.400000e-09Bestrophin
19CAMPEP_0193790988168202_13.800000e-09Bestrophin
20CAMPEP_0193785494145252_11.300000e-08Bestrophin
21CAMPEP_0193779272125944_11.300000e-08Bestrophin
22CAMPEP_0193779030125485_11.500000e-08Bestrophin
23CAMPEP_01937097645146_11.600000e-08Bestrophin
...............
44CAMPEP_01937083784061_14.700000e-06Bestrophin
45CAMPEP_0193783502140102_15.000000e-06Bestrophin
46CAMPEP_019374295426440_15.900000e-06Bestrophin
47CAMPEP_019372194213485_18.700000e-06Bestrophin
48CAMPEP_019374684831034_12.400000e-05Bestrophin
49CAMPEP_0193796456170945_13.800000e-05Bestrophin
50CAMPEP_019372492415193_16.700000e-05Bestrophin
51CAMPEP_0193792018168721_17.800000e-05Bestrophin
52CAMPEP_01937087284339_18.100000e-05Bestrophin
53CAMPEP_019373901023853_13.300000e-04Bestrophin
54CAMPEP_0193782824136333_15.700000e-04Bestrophin
55CAMPEP_0193800510172967_17.400000e-04Bestrophin
56CAMPEP_019376622261043_18.100000e-04Bestrophin
57CAMPEP_019373275220295_11.300000e-03Bestrophin
58CAMPEP_019376043450060_16.400000e-03Bestrophin
59CAMPEP_01937036741074_16.800000e-03Bestrophin
60CAMPEP_019376037449940_11.000000e-02Bestrophin
61CAMPEP_019375968648915_12.100000e-02Bestrophin
62CAMPEP_019375298839679_14.200000e-02Bestrophin
63CAMPEP_019376475256822_16.800000e-02Bestrophin
64CAMPEP_019374160825516_19.000000e-02Bestrophin
65CAMPEP_0193800210172820_12.000000e-01Bestrophin
66CAMPEP_019373898423837_11.300000e+00Bestrophin
0CAMPEP_019373432621185_11.100000e-131SLC4
1CAMPEP_0193793958169690_11.200000e-86SLC4
2CAMPEP_019373932024061_16.400000e-79SLC4
3CAMPEP_019374993635400_11.900000e-17SLC4
4CAMPEP_0193785558145391_15.900000e-14SLC4
5CAMPEP_01937093164808_13.300000e+00SLC4
6CAMPEP_0193780838129005_13.500000e+00SLC4
\n", + "

80 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " CAMPEPid contig Evalue Annotation\n", + "0 CAMPEP_0193716998 10130_1 8.300000e-164 CA_delta\n", + "1 CAMPEP_0193720114 12139_1 2.500000e-144 CA_delta\n", + "2 CAMPEP_0193740950 25073_1 2.000000e-112 CA_delta\n", + "3 CAMPEP_0193744210 27355_1 1.700000e-97 CA_delta\n", + "0 CAMPEP_0193731632 19523_1 7.400000e-17 CA_beta\n", + "1 CAMPEP_0193739048 23878_1 4.500000e-07 CA_beta\n", + "0 CAMPEP_0193794506 169964_1 1.600000e-125 Bestrophin\n", + "1 CAMPEP_0193783668 140936_1 1.900000e-44 Bestrophin\n", + "2 CAMPEP_0193795430 170428_1 1.400000e-43 Bestrophin\n", + "3 CAMPEP_0193788850 155911_1 1.300000e-25 Bestrophin\n", + "4 CAMPEP_0193725542 15531_1 4.000000e-24 Bestrophin\n", + "5 CAMPEP_0193705254 1979_1 2.600000e-23 Bestrophin\n", + "6 CAMPEP_0193801006 173214_1 5.100000e-23 Bestrophin\n", + "7 CAMPEP_0193712428 6963_1 7.900000e-23 Bestrophin\n", + "8 CAMPEP_0193742798 26324_1 1.400000e-21 Bestrophin\n", + "9 CAMPEP_0193727680 16768_1 7.400000e-21 Bestrophin\n", + "10 CAMPEP_0193741146 25209_1 4.600000e-20 Bestrophin\n", + "11 CAMPEP_0193737912 23215_1 3.300000e-18 Bestrophin\n", + "12 CAMPEP_0193731534 19445_1 1.400000e-12 Bestrophin\n", + "13 CAMPEP_0193704942 1794_1 3.800000e-12 Bestrophin\n", + "14 CAMPEP_0193706960 3047_1 5.300000e-12 Bestrophin\n", + "15 CAMPEP_0193714820 8398_1 5.500000e-12 Bestrophin\n", + "16 CAMPEP_0193764498 56150_1 1.500000e-11 Bestrophin\n", + "17 CAMPEP_0193735246 21696_1 2.000000e-09 Bestrophin\n", + "18 CAMPEP_0193761482 51549_1 2.400000e-09 Bestrophin\n", + "19 CAMPEP_0193790988 168202_1 3.800000e-09 Bestrophin\n", + "20 CAMPEP_0193785494 145252_1 1.300000e-08 Bestrophin\n", + "21 CAMPEP_0193779272 125944_1 1.300000e-08 Bestrophin\n", + "22 CAMPEP_0193779030 125485_1 1.500000e-08 Bestrophin\n", + "23 CAMPEP_0193709764 5146_1 1.600000e-08 Bestrophin\n", + ".. ... ... ... 
...\n", + "44 CAMPEP_0193708378 4061_1 4.700000e-06 Bestrophin\n", + "45 CAMPEP_0193783502 140102_1 5.000000e-06 Bestrophin\n", + "46 CAMPEP_0193742954 26440_1 5.900000e-06 Bestrophin\n", + "47 CAMPEP_0193721942 13485_1 8.700000e-06 Bestrophin\n", + "48 CAMPEP_0193746848 31034_1 2.400000e-05 Bestrophin\n", + "49 CAMPEP_0193796456 170945_1 3.800000e-05 Bestrophin\n", + "50 CAMPEP_0193724924 15193_1 6.700000e-05 Bestrophin\n", + "51 CAMPEP_0193792018 168721_1 7.800000e-05 Bestrophin\n", + "52 CAMPEP_0193708728 4339_1 8.100000e-05 Bestrophin\n", + "53 CAMPEP_0193739010 23853_1 3.300000e-04 Bestrophin\n", + "54 CAMPEP_0193782824 136333_1 5.700000e-04 Bestrophin\n", + "55 CAMPEP_0193800510 172967_1 7.400000e-04 Bestrophin\n", + "56 CAMPEP_0193766222 61043_1 8.100000e-04 Bestrophin\n", + "57 CAMPEP_0193732752 20295_1 1.300000e-03 Bestrophin\n", + "58 CAMPEP_0193760434 50060_1 6.400000e-03 Bestrophin\n", + "59 CAMPEP_0193703674 1074_1 6.800000e-03 Bestrophin\n", + "60 CAMPEP_0193760374 49940_1 1.000000e-02 Bestrophin\n", + "61 CAMPEP_0193759686 48915_1 2.100000e-02 Bestrophin\n", + "62 CAMPEP_0193752988 39679_1 4.200000e-02 Bestrophin\n", + "63 CAMPEP_0193764752 56822_1 6.800000e-02 Bestrophin\n", + "64 CAMPEP_0193741608 25516_1 9.000000e-02 Bestrophin\n", + "65 CAMPEP_0193800210 172820_1 2.000000e-01 Bestrophin\n", + "66 CAMPEP_0193738984 23837_1 1.300000e+00 Bestrophin\n", + "0 CAMPEP_0193734326 21185_1 1.100000e-131 SLC4\n", + "1 CAMPEP_0193793958 169690_1 1.200000e-86 SLC4\n", + "2 CAMPEP_0193739320 24061_1 6.400000e-79 SLC4\n", + "3 CAMPEP_0193749936 35400_1 1.900000e-17 SLC4\n", + "4 CAMPEP_0193785558 145391_1 5.900000e-14 SLC4\n", + "5 CAMPEP_0193709316 4808_1 3.300000e+00 SLC4\n", + "6 CAMPEP_0193780838 129005_1 3.500000e+00 SLC4\n", + "\n", + "[80 rows x 4 columns]" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "makeHMM('Chrysochromulina-polylepis-CCMP1757.pep.fa.gz','BP')" + ] + }, + { + 
def HMMcmp(Organism, gene_type, min_log2cpm=-16):
    """Pair every sufficiently expressed contig with its HMM hit, if any.

    Parameters
    ----------
    Organism : str
        Peptide FASTA file name such as 'Name-strain.pep.fa.gz'. It is passed
        unchanged to ``makeHMM``; with the trailing '.pep.fa.gz' removed it
        also names the tab-separated counts table '<prefix>_cds_counts.txt'
        that is read from the working directory.
    gene_type : str
        Gene-group code forwarded to ``makeHMM``.
    min_log2cpm : float, optional
        A contig is kept only when its log2CPM is strictly greater than this
        value. Defaults to -16, the cutoff that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        One row per kept contig with the columns 'Contig', 'Evalue',
        'Annotation' and 'log2CPM'. 'Evalue' and 'Annotation' are missing
        for contigs that ``makeHMM`` did not report.

    Notes
    -----
    'log2CPM' is log2 of the row sum of the counts table divided by
    1,000,000. Contigs with a zero row sum (log2 = -inf) are dropped: the
    previous condition also compared the value against the strings "-inf"
    and "inf", which can never equal a float, so those contigs were never
    kept before either.
    """
    # makeHMM is defined elsewhere in the notebook; its result must expose
    # the columns contig, Evalue and Annotation.
    hits = makeHMM(Organism, gene_type)
    contig_EV = dict(zip(hits.contig, hits.Evalue))
    contig_Annot = dict(zip(hits.contig, hits.Annotation))

    # Escape the dots and anchor the pattern so that only a literal trailing
    # '.pep.fa.gz' is removed (an unescaped '.' matches any character).
    prefix = re.sub(r'\.pep\.fa\.gz$', '', Organism)

    expression = pd.read_csv('{}_cds_counts.txt'.format(prefix),
                             delimiter='\t', index_col=0)

    # log2(0) is -inf by design here; silence the divide-by-zero warning.
    with np.errstate(divide='ignore'):
        log2cpm = np.log2(expression.sum(axis=1) / 1000000)

    # Row labels look like '<something>|<contig>|...'; keep the second field.
    contigs = expression.index.str.split("|").str[1]

    # Filter row-wise with a mask so each row keeps its own value even if
    # two rows share a contig id. NaN and -inf compare False and are dropped.
    keep = (log2cpm > min_log2cpm).values
    kept_contigs = list(contigs[keep])

    return pd.DataFrame(
        {
            'Contig': kept_contigs,
            'Evalue': [contig_EV.get(contig) for contig in kept_contigs],
            'Annotation': [contig_Annot.get(contig) for contig in kept_contigs],
            'log2CPM': log2cpm.values[keep],
        },
        columns=['Contig', 'Evalue', 'Annotation', 'log2CPM'],
    )
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " 
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ContigEvalueAnnotationlog2CPM
03_1NaNNone-6.713308
11_1NaNNone-6.339929
24_1NaNNone-9.924541
36_1NaNNone-14.844106
45_1NaNNone-9.441721
58_1NaNNone-10.127438
67_1NaNNone-11.431723
72_1NaNNone-7.236558
89_1NaNNone-15.609640
910_1NaNNone-15.231129
1011_1NaNNone-10.400187
1116_1NaNNone-8.201098
1215_1NaNNone-6.864807
1318_1NaNNone-12.488625
1417_1NaNNone-10.552190
1520_1NaNNone-14.761644
1623_1NaNNone-12.823044
1714_1NaNNone-7.118589
1822_1NaNNone-7.907814
1913_1NaNNone-10.756643
2024_1NaNNone-9.447753
2126_1NaNNone-11.150209
2225_1NaNNone-13.439715
2328_1NaNNone-15.609640
2427_1NaNNone-10.931569
2530_1NaNNone-11.055052
2631_1NaNNone-12.061204
2733_1NaNNone-10.766662
2835_1NaNNone-14.761644
2936_1NaNNone-10.797142
...............
49525173613_1NaNNone-9.884445
49526173610_1NaNNone-8.018679
49527173614_1NaNNone-9.277828
49528173615_1NaNNone-9.524301
49529173616_1NaNNone-9.744216
49530173617_1NaNNone-8.363612
49531173594_1NaNNone-10.517941
49532173619_1NaNNone-11.000831
49533173618_1NaNNone-9.690777
49534173620_1NaNNone-10.180025
49535173621_1NaNNone-7.328406
49536173623_1NaNNone-10.746693
49537173622_1NaNNone-9.873577
49538173624_1NaNNone-10.124214
49539173625_1NaNNone-5.489727
49540173628_1NaNNone-9.125825
49541173626_1NaNNone-8.247258
49542173629_1NaNNone-10.190102
49543173631_1NaNNone-7.165661
49544173632_1NaNNone-11.231129
49545173633_1NaNNone-9.676540
49546173634_1NaNNone-6.513189
49547173627_1NaNNone-10.186735
49548173635_1NaNNone-10.196859
49549173636_1NaNNone-9.434715
49550173637_1NaNNone-8.188417
49551173638_1NaNNone-5.760235
49552173639_1NaNNone-2.661282
49553173554_1NaNNone-7.592275
49554173630_1NaNNone-8.716036
\n", + "

49555 rows × 4 columns

\n", + "
" + ], + "text/plain": [ + " Contig Evalue Annotation log2CPM\n", + "0 3_1 NaN None -6.713308\n", + "1 1_1 NaN None -6.339929\n", + "2 4_1 NaN None -9.924541\n", + "3 6_1 NaN None -14.844106\n", + "4 5_1 NaN None -9.441721\n", + "5 8_1 NaN None -10.127438\n", + "6 7_1 NaN None -11.431723\n", + "7 2_1 NaN None -7.236558\n", + "8 9_1 NaN None -15.609640\n", + "9 10_1 NaN None -15.231129\n", + "10 11_1 NaN None -10.400187\n", + "11 16_1 NaN None -8.201098\n", + "12 15_1 NaN None -6.864807\n", + "13 18_1 NaN None -12.488625\n", + "14 17_1 NaN None -10.552190\n", + "15 20_1 NaN None -14.761644\n", + "16 23_1 NaN None -12.823044\n", + "17 14_1 NaN None -7.118589\n", + "18 22_1 NaN None -7.907814\n", + "19 13_1 NaN None -10.756643\n", + "20 24_1 NaN None -9.447753\n", + "21 26_1 NaN None -11.150209\n", + "22 25_1 NaN None -13.439715\n", + "23 28_1 NaN None -15.609640\n", + "24 27_1 NaN None -10.931569\n", + "25 30_1 NaN None -11.055052\n", + "26 31_1 NaN None -12.061204\n", + "27 33_1 NaN None -10.766662\n", + "28 35_1 NaN None -14.761644\n", + "29 36_1 NaN None -10.797142\n", + "... ... ... ... 
def HMMclean(Organism, max_evalue=.00001):
    """Count significant HMM hits per annotation for one organism.

    Runs ``HMMcmp`` for the three gene groups 'BP', 'CF' and 'PR', stacks the
    results and keeps only rows whose e-value is strictly below *max_evalue*.

    Parameters
    ----------
    Organism : str
        Peptide FASTA file name, forwarded unchanged to ``HMMcmp``.
    max_evalue : float, optional
        Exclusive upper bound on the e-value of a hit. Defaults to 1e-5, the
        cutoff that used to be hard-coded.

    Returns
    -------
    pandas.DataFrame
        A single row labelled *Organism* with one column per annotation that
        has at least one surviving hit (columns in sorted order); each cell
        is the number of surviving rows carrying that annotation.
    """
    hits = pd.concat([HMMcmp(Organism, group) for group in ('BP', 'CF', 'PR')])

    # A missing e-value (NaN) compares False, so contigs without any HMM hit
    # are removed by this filter as well.
    hits = hits[hits['Evalue'] < max_evalue]

    # Tally once; the previous loop recomputed value_counts() for every gene.
    counts = hits['Annotation'].value_counts()

    Genes = np.unique(hits['Annotation'])
    table = pd.DataFrame(0, index=[Organism], columns=Genes)
    for gene in Genes:
        table[gene] = counts[gene]

    return table
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ALAT_GGATBestrophinCA_betaCA_deltaGCLGDCTGlcDHHRICLMDH...OMTPEPCPEPCKPGPPKPYCSHMTSLC4SPTTSR
Chrysochromulina-polylepis-CCMP1757.pep.fa.gz84824214932...4711119104537
\n", + "

1 rows × 21 columns

\n", + "
def gen_type(gene_type, organisms=None):
    """Build a gene-by-organism table of HMM hit counts for one gene group.

    Parameters
    ----------
    gene_type : str
        One of 'BP', 'PR' or 'CF'; selects the fixed list of gene names that
        become the rows of the result.
    organisms : iterable of str, optional
        Peptide FASTA file names to process. When omitted, the notebook-level
        list ``names`` is used, as before.

    Returns
    -------
    pandas.DataFrame
        Rows are the genes of the chosen group (in their fixed order) and
        columns are the organisms, labelled by the part of the file name
        before the first '.'. Genes without a hit reported by ``HMMclean``
        stay at 0.

    Raises
    ------
    ValueError
        If *gene_type* is not one of the known group codes.
    """
    gene_groups = {
        'BP': ['SLC4', 'Bestrophin', 'CA_alpha', 'CA_beta', 'CA_delta',
               'CA_zeta'],
        'PR': ['PGP', 'GOX', 'SPT', 'ALAT_GGAT', 'GDCT', 'SHMT', 'HR', 'GK',
               'GlcDH', 'MS', 'ICL', 'GCL', 'TSR'],
        'CF': ['PK', 'PEPC', 'PEPCK', 'MDH', 'OMT', 'ME', 'PPDK', 'PYC'],
    }
    try:
        fixed = gene_groups[gene_type]
    except KeyError:
        # Previously an unknown code surfaced as an unbound-variable error.
        raise ValueError('unknown gene_type {!r}; expected one of {}'.format(
            gene_type, sorted(gene_groups)))

    if organisms is None:
        organisms = names
    organisms = list(organisms)

    table = pd.DataFrame(0, index=organisms, columns=fixed)

    for row, organism in enumerate(organisms):
        # HMMclean is expensive, so run it once per organism rather than
        # once per (organism, gene) pair.
        counts = HMMclean(organism)
        for col, gene in enumerate(fixed):
            if gene in counts.columns:
                # HMMclean returns a single-row frame; take the scalar and
                # write it by position instead of chained indexing.
                table.iat[row, col] = counts[gene].iloc[0]

    # Shorten 'Name-strain.pep.fa.gz' to 'Name-strain' for the column labels.
    table.index = [label.split('.')[0] for label in table.index]

    return table.transpose()
"GK 0 0 \n", + "GlcDH 4 2 \n", + "MS 1 1 \n", + "ICL 4 2 \n", + "GCL 4 1 \n", + "TSR 9 3 \n", + "\n", + " Pleurochrysis-carterae-CCMP645 Pavlova-sp-CCMP459 \\\n", + "SLC4 4 4 \n", + "Bestrophin 14 19 \n", + "CA_alpha 13 0 \n", + "CA_beta 15 9 \n", + "CA_delta 0 0 \n", + "CA_zeta 0 0 \n", + "PK 9 2 \n", + "PEPC 1 0 \n", + "PEPCK 2 2 \n", + "MDH 2 6 \n", + "OMT 51 27 \n", + "ME 2 2 \n", + "PPDK 3 1 \n", + "PYC 11 9 \n", + "PGP 3 3 \n", + "GOX 1 1 \n", + "SPT 2 3 \n", + "ALAT_GGAT 10 6 \n", + "GDCT 1 1 \n", + "SHMT 5 4 \n", + "HR 8 3 \n", + "GK 1 0 \n", + "GlcDH 2 5 \n", + "MS 0 1 \n", + "ICL 2 2 \n", + "GCL 1 4 \n", + "TSR 7 5 \n", + "\n", + " Prymnesium-parvum-Texoma1 Emiliania-huxleyi-374 \\\n", + "SLC4 17 8 \n", + "Bestrophin 32 14 \n", + "CA_alpha 0 0 \n", + "CA_beta 5 0 \n", + "CA_delta 6 1 \n", + "CA_zeta 0 0 \n", + "PK 6 3 \n", + "PEPC 2 1 \n", + "PEPCK 5 1 \n", + "MDH 4 3 \n", + "OMT 51 22 \n", + "ME 1 4 \n", + "PPDK 0 0 \n", + "PYC 15 9 \n", + "PGP 4 4 \n", + "GOX 2 1 \n", + "SPT 4 1 \n", + "ALAT_GGAT 14 7 \n", + "GDCT 4 2 \n", + "SHMT 5 4 \n", + "HR 7 4 \n", + "GK 0 0 \n", + "GlcDH 5 4 \n", + "MS 1 1 \n", + "ICL 6 3 \n", + "GCL 1 1 \n", + "TSR 9 4 \n", + "\n", + " Emiliania-huxleyi-379 Emiliania-huxleyi-CCMP370 \\\n", + "SLC4 9 6 \n", + "Bestrophin 22 38 \n", + "CA_alpha 0 3 \n", + "CA_beta 5 0 \n", + "CA_delta 1 1 \n", + "CA_zeta 0 0 \n", + "PK 3 5 \n", + "PEPC 1 1 \n", + "PEPCK 1 1 \n", + "MDH 3 5 \n", + "OMT 35 73 \n", + "ME 2 2 \n", + "PPDK 0 0 \n", + "PYC 9 13 \n", + "PGP 3 8 \n", + "GOX 1 1 \n", + "SPT 0 3 \n", + "ALAT_GGAT 11 15 \n", + "GDCT 3 4 \n", + "SHMT 5 6 \n", + "HR 5 7 \n", + "GK 0 0 \n", + "GlcDH 2 4 \n", + "MS 1 1 \n", + "ICL 2 8 \n", + "GCL 3 3 \n", + "TSR 6 6 \n", + "\n", + " Emiliania-huxleyi-PLYM219 \n", + "SLC4 8 \n", + "Bestrophin 32 \n", + "CA_alpha 4 \n", + "CA_beta 0 \n", + "CA_delta 2 \n", + "CA_zeta 0 \n", + "PK 5 \n", + "PEPC 1 \n", + "PEPCK 1 \n", + "MDH 2 \n", + "OMT 77 \n", + "ME 2 \n", + "PPDK 0 \n", + "PYC 16 \n", + "PGP 7 
\n", + "GOX 1 \n", + "SPT 2 \n", + "ALAT_GGAT 12 \n", + "GDCT 3 \n", + "SHMT 5 \n", + "HR 6 \n", + "GK 0 \n", + "GlcDH 4 \n", + "MS 1 \n", + "ICL 4 \n", + "GCL 4 \n", + "TSR 9 \n" + ] + } + ], + "source": [ + "a=gen_type('BP')\n", + "b=gen_type('CF')\n", + "c=gen_type('PR')\n", + "\n", + "framesX=(a,b,c)\n", + "outFrame=pd.concat(framesX)\n", + "\n", + "outFrame.to_csv('COP_ADD.csv')\n", + "\n", + "print outFrame" + ] + }, + { + "cell_type": "code", + "execution_count": 24, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "SLC4 5\n", + "Bestrophin 48\n", + "CA_alpha 0\n", + "CA_beta 2\n", + "CA_delta 4\n", + "CA_zeta 0\n", + "PK 9\n", + "PEPC 1\n", + "PEPCK 1\n", + "MDH 2\n", + "OMT 47\n", + "ME 3\n", + "PPDK 0\n", + "PYC 10\n", + "PGP 11\n", + "GOX 0\n", + "SPT 3\n", + "ALAT_GGAT 8\n", + "GDCT 1\n", + "SHMT 4\n", + "HR 9\n", + "GK 0\n", + "GlcDH 4\n", + "MS 0\n", + "ICL 3\n", + "GCL 2\n", + "TSR 7\n", + "Name: Chrysochromulina-polylepis-CCMP1757, dtype: int64\n" + ] + } + ], + "source": [ + "print outFrame['Chrysochromulina-polylepis-CCMP1757']" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [conda root]", + "language": "python", + "name": "conda-root-py" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + "mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 1 +} diff --git a/MDH.txt b/MDH.txt new file mode 100644 index 0000000..434d77e --- /dev/null +++ b/MDH.txt @@ -0,0 +1,48 @@ +>jgi|Thaps3|20726|estExt_fgenesh1_pg.C_chr_10366|MDH1 +MIVNRLATKALTALRSTSSATLKSCFSTSTPTSAKVAVLGAAGGIGQPLSLLCKLSPEVSTLSCYDIVGT +PGVAADLSHIPTKSGTMGRLPSPVQWPMAGNGGLEETLTGADVVVIPAGVPRKPGMTRDDLFNTNASIVK +TLVEGCAQFCPDAVIAIISNPVNSTVPIAAEVLKKHGVYNPKKLAGVTTLDVCRANTFVANSQGLDPKDV 
+NVTVIGGHAGITILPLFSRVEGAKFTDEELEAITVRTQFGGDEVVAAKAGAGSATLSMAYAGYVFTENVL +KALRGEEIVQCAFVESGLTDAKYFASPVKFGKGGVEEILPLGALSAYEQGWFDKMMPELKKQIQKGEDFV +NN* +>jgi|Thaps3|25953|estExt_fgenesh1_kg.C_chr_40009|MDH2 +MKAFMKEVFLSYGVTPERAEVCSDVLIESDKRGIDSHGLGRLKPIYCDRMDDGILFPDKPIDIISESDTT +ALVDGNLGLGLYIGPHCMQMAIDKAKKHGVGFVAVRNSTHYGIAGYYATMATQQGCIGLTGTNARPSIAP +TFGVEPMMGTNPLTFGIPSSDDFPFVIDCATSVNQRGKIEKYAREGVDTPRGAVIDDQGIERTDTDGILR +DMVLGKCALTPVGGAGDKMGGYKGYGWATTVELLCTALQSGPWGEDICGVDRATGKPKPMPLGHFFLAID +IEKICPVDTFKKNSGEFLQALRDSKKAPNGPGRIWTAGEIENDARVERTAQGGMKVPIPLQKNMKALRDT +RPGLKEKYVKLLFE* +>jgi|Thaps3|41425|estExt_gwp_gw1.C_chr_70437|MDH3 +MAIQPSITRVAISGAAGNIGYALLPLLASGYVFGDDRSVELRLLEIPHAVKALAGVRMELIDCAFPCLTD +VIITTEPEEAFEGADVIVLVGGFPRKQGMERKDLIHANTKIFTTMGRAIEEVASPNVKVLVVANPANTNC +LVALNEASRIPSKNFCALTYLDHQRAKAQVAIRLGVRPNQVKNVSIWGNHSNTQYPDVLTDGYISFDSGE +DIPISTLMANDLEWTNDDFVQIVQNRGKHVIEVRGNSSALSAAQATADCLATWLVTGTKRGETISMAVYN +DKGYYGVKKGLVFSFPCECRDGDWFVKTGLELSDLAMEKLQVTENELKEEREDAEELIKQTRFRSMSTVS +TASLASSTSEMELSVPPRVLTSRI* +>jgi|Phatr2|51297|estExt_fgenesh1_pm.C_chr_310007|MDH_PTRI +MFTARSLASVASYSSASVARQMSSASKKVAVLGAAGGIGQPLSMLLKLSPAIGELACYDIVGTPGVAADL +SHIPTRARVSGCLPAAGAWPPRGNEGLGEALTGADVVVIPAGVPRKPGMTRDDLFNTNAGIVKTLIQGVA +EFCPEAVIAIISNPVNSTVPIAAEILKQKGVYNPRKLCGVTTFDVIRANTFAAAHMGVDPASVDVTVIGG +HAGITILPLYSQLEGFAPSDAEREAITVRTQFGGDEVVQAKAGSGSATLSMAYAGYLFTEKVLQGLNGEK +VTQCAYVQSDLTDCKYFASPCEFGPNGVEKVLGYGTLSAYEQAWFDKMIPDLQKQIKKGEDFVNA* +>gb|AAW79319.1_malate_dehydrogenase_partial|Isochrysis_galbana +AVLGAAGGIGQPLSLLCKLSDHIDEVACYDVVGTPGVAADLSHIPSGAKITGDLPSAGTWPPSHNAGLER +ALTGASVVVIPAGVPRKPGMTRDDLFNTNASIVKTLVEGCAKFCPDAVLAIISNPVNSTVPIAAEVLKKA +GVYNKNKVVGVTTLDVCRANTFLAEKLGKSPKDINVPVIGGHAGITILPLLSQVPGASALPADVTAALTH +RIQFGGDEVVQAKAGSGSATLSMAYAGFLFTEGLIKAMKGEEVIQCAYVESTLTPAAYFASPCKFGPEGV +KEVLGFGTLSAYEKQWFDKMVPDLQKQIAKGIDFVNSPA +>gb|XP_001418129.1|predicted_protein|Ostreococcus_lucimarinus2_CCE9901 +MPASTVDDVVTVLITGAAGQIGYALAPMVCAGAATGPGKKIALKLLDVEFASEALRGVKMEIMDCAFDAC 
+VSVDVFTDCEKACEGVDVAIMVGGFPRKQGMERKDVLGKNVAIYKQQASALASKAKKDVKIVVVANPANT +NAKILAKFAPSIPRGNVTCMTRLDHNRALAQLGERSGKATIEVKNAIIWGNHSSTQYPDVNHATIEGKPA +REVIGNDAYLDGDFVDVVRARGAAIIEARKLSSALSAASSVCDHVYDWIHGTKEGEWTSMGVISDGSYGV +PEGLVYSFPVTCTGGKWQIVQGLSIDERSRKLMDESAKELTEEFELAEQCLAESA +>gb|EKX43420.1|hypothetical_protein_GUITHDRAFT_163842|Guillardia_theta_CCMP2712 +MSKPVMTVCITGAAGQIGYALLPHLCSGKTFGPDQPVKLHLLDLNIEGVQTALNGVKMELEDATYPLLKG +VVCTGDAKVAFTGADAVIMLGAFPRKDGMERKDLLEKNCGIFKEQGELLNTVASKTVKVLVVGNPANTNC +LIAAECAPNIPRENFSALTRLDHNRAIAQLAIKASVPVEQVQNSIIWGNHSSTQYPDINAATINGKKAKE +VVNNDEWYKNEFIPCIQKRGAAIIAARKLSSALSAAQAISDHMHDWFLGTPAGQFVSMAVDSTGNKYGVA +DGLIYSFPVSCSNGKWTIKEGLEIDDFSKEKMKATEQELTDEKKTAMEILGK + + + diff --git a/ME.txt b/ME.txt new file mode 100644 index 0000000..cba49fc --- /dev/null +++ b/ME.txt @@ -0,0 +1,63 @@ +>XP_645111.1 NADP-dependent malate dehydrogenase [Dictyostelium discoideum AX4] +MQNKPSFILRNPSANKGTGFNNEEREKLGLKGLLPPKVESLQEQSDRALSQFTSFNTNLERYIFLNCLRD +RNETLFYYLLSNNLELMMPIIYTPTVGEACQKFGNEFRFAQGMYFASQDKGNIRAMMDNWPAEGVDIIVV +SDGSRILGLGDLGTNGMGIPVGKLQLYVAGAGFCPTRTLPVIIDSGTNTKKYLEDKYYLGERHPRIPDSE +YYPLVDEFLAAAFNKWPKVIVQFEDISNDHCFNLLDEYRNKYLCFNDDIQGTGSVILSGFINAVRSVQKP +IKEHRMVFLGAGSAGIGVADCIMSLFDEAGVSKEEARKSFWFVDSKGLITTTRGDELTSQKKQYAREDYT +YQLKSLLEVVRDVKPTAIIGLSGIGGSFSQEVIEEMAKHVEKPIVFALSNPTTNAECTAEQAYQWTDGRC +IFASGSPFKPVEYKGKTFVPGQGNNMYIFPGLGLAASVCEAKHVTDAMIITAAKTLASFVEDSEVLTGKI +YPGLQHIREISTRIAVKVIEKAYEEGMAQLPRPDNIEALVKSRQYVPSYDKSKN +>CAK01689.1 NAD-dependent malic enzyme [Bartonella tribocorum CIP 105476] +MSRERNNDFSSPIAELDSAALFYHQHPKPGKLEIQATKPLDNQRDLALAYSPGVAAPCLAIHEDPNLAAQ +YTSRSNLVAVISNGTAVLGLGNIGPLASKPVMEGKAVLFKKFANIDVFDIEIDASDIEQMVQTVSSLEPT +FGGINLEDIKAPECFEIEEKLRAKMNIPVFHDDQHGTAIIVSAAVLNALNLSGKKIENAKIVTSGAGAAA +LACLNLLVRLGAKVENIWLSDLEGVVYEGRKTLMDRWKVNYAQKTDARTLSDIIDNADIFLGLSAGGVLK +PEYLKKMAQNPLILALANPVPEIMPEKAHSIRPDAMICTGRSDYPNQVNNVLCFPYIFRGALDVGATAIN +EEMKMAAVHAIAALAREETSDVVARAYSKEPPNFGPDYLIPSPFDPRLILRIAPAVAKAAMATGVALRPI 
+EDMEAYYDILNRFVFLSGLTMKPVFAAAKTSKRKRVIYANGEDERVLRAAQVVIEEQTATPLLIGRPHVI +EARLKRFGLRIRPNIDFELTNPEDDPRFRDYVNLFFHYTGRRGVTPEMAKTIVRTSTTAIAALAVMREEA +DAMICGLEGRFERQLELIEQIIGLDPHVHRFSAMSLLISQQRTLFLTDTYVNENPSAEEIAEMTVLAAQE +VEAFGITPKAALLSHSNFGSKNTESARKMRRATEILAKLHPHLEADGEMHGDAALSKVFRDRVFPDSRLK +SEANLLVFPTLDSANITLNTVKSLTNALHVGPILIGAARPAHILTPSVTSRGVVNITALAVLAANRKNSL +VK +>AEE81903.1 NAD-dependent malic enzyme 2 [Arabidopsis thaliana] +MMWKNIAGLSKAAAAARTHGSRRCFSTAIPGPCIVHKRGADILHDPWFNKDTGFPLTERDRLGIRGLLPP +RVMTCVQQCDRFIESFRSLENNTKGEPENVVALAKWRMLNRLHDRNETLYYRVLIDNIKDFAPIIYTPTV +GLVCQNYSGLYRRPRGMYFSAKDKGEMMSMIYNWPAPQVDMIVITDGSRILGLGDLGVQGIGIPIGKLDM +YVAAAGINPQRVLPIMLDVGTNNEKLLQNDLYLGVRQPRLEGEEYLEIIDEFMEAAFTRWPKAVVQFEDF +QAKWAFGTLERYRKKFCMFNDDVQGTAGVALAGLLGTVRAQGRPISDFVNQKIVVVGAGSAGLGVTKMAV +QAVARMAGISESEATKNFYLIDKDGLVTTERTKLDPGAVLFAKNPAEIREGASIVEVVKKVRPHVLLGLS +GVGGIFNEEVLKAMRESDSCKPAIFAMSNPTLNAECTAADAFKHAGGNIVFASGSPFENVELENGKVGHV +NQANNMYLFPGIGLGTLLSGARIVTDGMLQAASECLASYMTDEEVQKGILYPSINNIRHITAEVGAAVLR +AAVTDDIAEGHGDVGPKDLSHMSKEDTVNYITRNMWFPVYSPLVHEK +>AEC06242.1 NAD-dependent malic enzyme 1 [Arabidopsis thaliana] +MGIANKLRLSSSSLSRILHRRILYSSAVRSFTTSEGHRPTIVHKQGLDILHDPWFNKGTAFTMTERNRLD +LRGLLPPNVMDSEQQIFRFMTDLKRLEEQARDGPSDPNALAKWRILNRLHDRNETMYYKVLINNIEEYAP +IVYTPTVGLVCQNYSGLFRRPRGMYFSAEDRGEMMSMVYNWPAEQVDMIVVTDGSRILGLGDLGVHGIGI +AVGKLDLYVAAAGINPQRVLPVMIDVGTNNEKLRNDPMYLGLQQRRLEDDDYIDVIDEFMEAVYTRWPHV +IVQFEDFQSKWAFKLLQRYRCTYRMFNDDVQGTAGVAIAGLLGAVRAQGRPMIDFPKMKIVVAGAGSAGI +GVLNAARKTMARMLGNTETAFDSAQSQFWVVDAQGLITEGRENIDPEAQPFARKTKEMERQGLKEGATLV +EVVREVKPDVLLGLSAVGGLFSKEVLEAMKGSTSTRPAIFAMSNPTKNAECTPQDAFSILGENMIFASGS +PFKNVEFGNGHVGHCNQGNNMYLFPGIGLGTLLSGAPIVSDGMLQAASECLAAYMSEEEVLEGIIYPPIS +RIRDITKRIAAAVIKEAIEEDLVEGYREMDAREIQKLDEEGLMEYVENNMWNPEYPTLVYKDD +>XP_002177890.1 predicted protein [Phaeodactylum tricornutum CCAP 1055/1] +MISSACRGSLKSLCSVQLRSNTRQHASLISCNNSPNVKRFSTAFSSQTDGEVLTVGGALSGEEPIHETGE +WAGCKRSFMIPIRISVRGTDILLDPLYNKGTAFKTGERDRLRFRGMLPHRIMNIHLQKERFLQALRAEDS 
+NIRKNVMLEDLHDRNETLYHRVLVDHIEEMAPYIYTPTVGQACMEFATRYRRPRGMYFTEEDRGHMAAMV +YNWPHRDVHVICVTDGSRILGLGDLGANGMGIPIGKLALYCAAGGIAPHRVLPVVFDAGTNNEALLQDKY +YLGVQRKRLKGAAYFRMMDEFMDAVRFRWPNVLVQFEDFSSEVAQTLLDRYRDDHLCFNDDIQGTGATTL +AGVLGALRAKGEEVTSLGDQRIVIAGAGSAGIGIAQVLMQAMEEQGRTPEEAKNAFYILDQNGLLGTDRA +NDLNAEQRVFVRSADNNLSLMDVVKKYKPTILLGVTTVGGLFTGDLIREMHSSCERPIIFPLSNPTNKAE +CTAEQAYEWTNGQCIFASGSPFDTIEFEDGRVFYPSQCNNMYVFPGLGLGASVCGAQKVTDRMLYVAAET +LANFVSKKDMEEGRLFPQLTRIREVSHRIAVAVVEEALREGLATKVKPADANDLDSFVGRKMYFPEYVPL +VEKREISI +>XP_002290550.1 NAD dependent malic enzyme [Thalassiosira pseudonana CCMP1335] +MYLNFSIPSTEILNNPLFNKSTAFKGGERDRLRFRGLLPPRRLNMKVQKQRILEEIRAEDSMIRKNMILE +EVHDRNETLYHRILVDHIEEMAPIIYTPTVGQACKEFGMRFRRPRGMYFCEEDRGHMAAMVYNWPQKDVH +VIVVTDGSRILGLGDLGANGMGIPIGKLSLYCAAGGIAPHRVMPVVLDVGTNNEELIKDPFYLGMQRPRL +QGTKYYHLVDEFIQAVRHRWPNVLIQFEDFSSDKAQKLLNKYRDEILCFNDDIQGTGATTLAGVLGGLRA +KGEQPTSLGEQRILIAGAGSAGIGVAQVLMQAMMEHGRTEEEAKKCFYIADEKGLLGTDRIHELSPEQAM +FARDEDGGLSLNEIVNKYKPTMILGMTAVGGLFTEQLIRNMAKHCERPIIFPLSNPTTKAECSAEQAFEW +TDGKCIFASGSPFEPVEMNDGRKFYPTQCNNMFVFPGIGLGVTLCGARTVSDRMLYVAAEALANYVTEDE +LAEGKVFPSINTIRDVSKKVAIAVIEEAISTGQASKLTEKDISDLDDFVSRKMYDPIYVPLIEKRTIEI + diff --git a/MS.txt b/MS.txt new file mode 100644 index 0000000..ae63d4b --- /dev/null +++ b/MS.txt @@ -0,0 +1,46 @@ +>jgi|Phatr2|54478|estExt_Phatr1_ua_kg.C_chr_80015 +MIEFRSEQVHVRVHAPANKAAEEMLTPDALRLLGLLCERFDVRRQALLAARKTHATSFDAGDVPHFLSAE +DHPAQRDPHWRCAPVPDDVQDRRVEITGPVDRKMVINGLNSGACVYMADFEDSTSPTWFNVIDGQLNLRD +AVRGTIAFTNAAGKVYTVQHASRPATLFVRPRGWHLDEAHVTVNGKVASGSLFDFAMYFFHNVHHLKEKG +TGPYFYLPKLESHKEAALWNDVFVAAQQFMGVPIGTIRATVLLETITAAFEMEEILYELRDHSLGLNCGR +WDYLFSFIKKFKHHTDKLTPDRNHLTMTTPLMEAYVKRLIYICHKRGTFAMGGMSASIPIKNDPAANDAA +MQKVADDKLREVTAGHDGSWVAHPALVKVAKDVFDEHMLTPNQITSKPGYVGSSINEQDLLRLPPIPHGK +AITSEGLARGVGIVLAYTEAWLRGIGCIPLHNAMEDAATAEISRAQIWQWRSQKASTQDDNRPITASRVA +ALVQQEVDRQCNGVAGKSKGKWRLAGNLVENMLNKDELDDFLTSVCYPHIVTTAYDDGRIAKL* +>AAP75564.1 malate synthase [Chlamydomonas reinhardtii] 
+MSVQTIPGVAILGPVTAEQASILSPEAQLFVATLHRTFNPRRKELLKRRDERQKDLDAGRLPDFLPETAA +VRADPGWKCAPPAPGLVDRRVEITGPVDRKMVINALNSGATQYMADFEDSHAPTWDGNLEGQVNMRDAVR +RAISYTGPNGKVYSLRTDGKLATLLVRPRGWHLVEAHFMVDGEPCSASLFDFGLFFFHNAAATLAAGQGP +YFYLPKMESHLEARLWNDVFNASQDMLRLPRGTVRATVLIETLLAAFEMEEILYELRDHSSGLNCGRWDY +IFSFIKKLRNHPQFVLPDRSAVTMTSPFMDAYVRLLIKTCHKRGVHAMGGMAAQIPIKDDPAANAAALAK +VRGDKEREVVAGHDGTWVAHPALVPIAMEIFNKHMPTPNQLHVRRDDVTVTAHNLLDVRGGALLAEGGIT +EKGLRDNLSVGLAYMENWLRGVGCVPIHNLMEDAATAEISRSAVWQWVRHHARTRDGRVVTAAWVNDLLA +QELDQLKSKMGAERFARSKYPLAAQLFQSTITGDAYSDFLTTLCYDHIVTKTPSRM +>jgi|Thaps3|26293|estExt_fgenesh1_pm.C_chr_60036 +MEARSPTVQIEVHAPICAAASEILTPDSLRFVGYLCNKFEDRRQALLNARKSKAMEFDSGGLPHFEKSDG +SGSGGDPHWRCASIPGDVMDRRVEITGPVDRKMVINGLNSGANVYMADFEDSTSPTWSNLTEGQRNLRDA +TRGKITYTNKQTGRVYALKEKTAVLFVRPRGWHLDEAHVTVNGRVASGSLFDFALYFYHNVHCLLQKGSR +PYFYLPKLEHYLEARLWNDVFKAAQSYFGVPYGTIRATVLLETITAAFQMEEILYELRDHSLGLNCGRWD +YLFSYIKKFKCHDDKIAPDRSHLTMTDTPLLKSYVDRLIYICHKRGTFAMGGMAAQIPIKGDPAANEAAM +ARIEKDKIREALAGHDGTWVAHPALVSLAKAVFDRYMPTPNQIDKNPGLTGKDVTEADLLRLQLVPKGTA +ITSTGLQKGVSIVLAYTEAWLRGIGCIPLNHHMEDAATAEISRAQIWQWKYHGVKTEDDGIVISASRISK +LVHDEVKRCSGGEDRGKWFLAGKLVEDMLTKDRLDDFLTTVCYPHILTTRYEGDVIPEDEPSSKL* +>NP_001190219.1 malate synthase [Arabidopsis thaliana] +MELETSVYRPNVAVYDSPDGVEVRGRYDQIFAKILTREALSFVAELQREFRGHVKYAMECRREARRRYNS +GAVPGFDPSTKFIRDGDWSCASVPPAVADRRVEITGPVERKMIINALNSGAKVFMADFEDALSPSWENLM +RGHVNLKDAVDGSITFHDKSRNRVYKLNDQTAKLFVRPRGWHLPEAHILIDGEPATGCLVDFGLYFFHNY +AKFRQTQGSGFGPFFYLPKMEHSREAKIWNSVFERAEKMAGIERGSIRATVLIETLPAVFQMNEILYELR +DHSVGLNCGRWDYIFSYVKTFQAHPDRLLPDRVLVGMGQHFMRSYSDLLIRTCHKRGVHAMGGMAAQIPI +RDDPKANEMALDLVRKDKLREVRAGHDGTWAAHPGLIPICMEAFTGHMGKSPNQIKSVKREDAAAITEED +LLQIPRGVRTLEGLRLNTRVGIQYLAAWLTGSGSVPLYNLMEDAATAEISRVQNWQWIRYGVELDGDGLG +VRVSKELFGRVVEEEMERIEKEVGKDKFKNGMYKEACKMFTKQCTAPELDDFLTLAVYNHIVAHYPINVS +RL +>gi|ONM16505.1| malate synthase1 [Zea mays] +MAASTAAPCYDAPEGVDVRGRYDREFAGILTRDALDFVAGLQREFRAAVRYAMEQRREAQRRYDAGELPR 
+FDPATTLVREGDWTCAPVPPAVADRTVEITGPAEPRKMVINALNSGAKVFMADFEDALSPTWENLMHGQV +NLRDAVAGTISFRDAARGRTYELNDRTAKLFVRPRGWHLPEAHILIDGEPAIGCLVDFGLYFFHNHAAFR +AGQGAGFGPFFYLPKMEHSREARIWNGVFQRAEKAAGIELGSIRATVLVETLPAVFQMNEILHELREHSA +GLNCGRWDYIFSYVKTFRAHPDRLLPDRALVGMAQHFMRSYSHLLIHTCHRRGVHAMGGMAAQIPIKDDA +AANEAALELVRKDKLREVRAGHDGTWAAHPGLIPAIREVFEGHLGGRPNQIGDAAGHEGASVNEEDLIQP +PRGARTVDGLRLNVRVGVQYLAAWLAGSGSVPLYNLMEDAATAEISRVQNWQWLRHGAALDAGGVEVRAT +PELLARVVEEEMARVEAEVGPDRFRKGRYAEAGRIFSRQCTAPELDDFLTLDAYNLIVAHHPGASPCKL diff --git a/OMT.txt b/OMT.txt new file mode 100644 index 0000000..b15d999 --- /dev/null +++ b/OMT.txt @@ -0,0 +1,34 @@ +>jgi|Thaps3|20731|estExt_fgenesh1_pg.C_chr_10375|OMT1_TPS MTSTTTSTSSVLTPPPLPKSVVFATSGLGGMLGWCVVHPANTIAVRMNLASMQGKKFSLNGMIKESGLMS VYDGLGAGVWRQVFYASSRFGLFETCRDKLHDIRGKTDFAGRVAVGAVTGATAAAISCPMEVATVRMSND ATLPLNERRNYKGVFDVVKRISTEEGVSALWRGVVPFAQRAALVGVFQVATLDQFKELYAHQFNQKKGSI PNVFCSAMTSGLIYSIATMPLEASKNRMASQKADVVTGKLPYTSTLQTMKSVSANEGFLALYNGFVPYYI RCGGHTVAMFIAVQLLRDQYNSMQH +>jgi|Thaps3|26366|estExt_fgenesh1_pm.C_chr_100013|OMT2 +MPESKSFAQIAEPFVCGGSAATFASIVIHPMDLAKVRMQLYGQLNPGKPVPGFTTLLTNMVKNDGIASVY +KGVDAAIGRQLVYGTARIGLHRAISDKMKEMNEGKPISFLMKTLSGMMSGSIAVCIGTPFDIALVRLQSD +SMAPVGERKNYKNVFDALTRTVSEEGAGALYKGLVPNILRGMSMNVGMLACYDQAKETVGKLLNDPMVNG +PALTTQVGASCVAGFTAALFSMPFDLIKSRLMAQKVDPVTNKLPYSGVMDCAMQVLKKEGPKGFYSGFSA +YYGRCAPHAMIILLSIESITQGYRNVLGLQK* +>jgi|Phatr2|8990|e_gw1.1.526.1|OMT1_PTRI +MVIPFPSYGFQLATSPTAGNSVSTPTKAADLSKPIIFATSGLGGCLGWAFVHPANTLAVRMNLASMSGKP +FSFPKMIQESGWMGLYDGISAGVLRQVFYATSRFGLFETFRDKLHEYRGKTDFGARIVVGATTGGIAAYL +SCPMEVAVVRMSNDSTLPMEERRNYKNVFDTASRVIKEEGPLAFWRGSNPFVIRAMMVGVFQVATLDQFK +DLYEHYLNQRRNSITNVFSAAMTSGLIYALATMPLEACKNRMASQKADKITGKLPYKTILQTLRKVSADE +GFLALYNGFLPYYIRCGGHTVSMFIIVQILRDSYMQYAL* +>jgi|Phatr2|16785|e_gw1.28.73.1|OMT2_PTRI +MSLTNNNSHSLAKTLEPFVCGGSAATFASVIIHPIDLAKVRMQLYGQLNPGKPIPSFPSIIKSIVTRDGP +LSVYKGVDAAIGRQMVYGTARIGLHRTFSDKLVELNDGKPISFLQKTLSGMLSGSIAVCIGTPFDIALVR 
+LQSDGMAEPQDRRNYKNVFDALLRTSKEEGVGALYKGLLPNILRGMSMNVGMLACYDQAKEVVAALLNDP +MTNGPSLPTRLGASATAGFTAALFSLPFDVMKSRLMAMKPNPLTGEMPYKGVVDCAVQMAKNEGPRSFFS +GFSAYYGRCAPHAMIILLSIESITNLYRQTFS* +>gb|CAA53720.1|Oxoglutarate/malate carrier protein|Caenorhabditis elegans +MAEDKTKRLGRWYFGGVAGAMAACCTHPLDLLKVQLQTQQQGKLTIGQLSLKIYKNDGILAFYNGVSASV +LRQLTYSTTRFGIYETVKKQLPQDQPLPFYQKALLAGFAGACGGMVGTPGDLVNVRMQNDSKLPLEQRRN +YKHALDGLVRITREEGFMKMFNGATMATSRAILMTIGQLSFYDQIKQTLISSGVAEDNLQTHFASSISAA +SVATVMTQPLDVMKTRMMNAAPGEFKGILDCFMFTAKLGPMGFFKGFIPAWARLAPHTVLTFIFFEQLRL +KFGYAPPVKA >gi|AFW56593.1|plastidic2-oxoglutarate/malate_transporter|Zea_mays +MASSTAASPLTCHHLGSVGARPRLPSLSISLRRRSSSSSKPTSLSHSLPSKHSLAPPPAASASSRRGLTP +VPASASAAAAPAPDPVPVPAPAPAPAPAPAAPPKKPALQGAAIKPLLASIATGVLIWLIPPPAGVPRNAW +QLLAIFLSTIVGIITQPLPLGAVALLGLGAAVLSRTLTFAAAFSAFGDPIPWLIALAFFFARGFIKTGLG +SRVAYAFVAAFGSSSLGLGYSLVFAEALLAPAIPSVSARAGGIFLPLVKSLCEACGSRAGDGTERRLGAW +LMLTCFQTSVVSSAMFLTAMAANPLSANLTAATIGEGIGWTLWAKAAIVPGLLSLVLVPLILYVIYPPEV +KASPDAPRLAKERLAKMGPMSKEETIMAGTLLLTVGLWIFGGMLNVDAVSAAILGLAVLLISGVVTWKEC +LAESVAWDTLTWFAALIAMAGYLNKFGLISWFSETVVKFVGGLGMSWQLSFGVLVLLYFYSHYFFASGAA +HIGAMFTAFLSVASALGTPSLFAAMVLSFLSNLMGGTTHYGIGSAPVFYGAGYVPLAQWWGYGFVISVVN +IIIWLGVGGFWWKIIGLW \ No newline at end of file diff --git a/PEPC.txt b/PEPC.txt new file mode 100644 index 0000000..b058965 --- /dev/null +++ b/PEPC.txt @@ -0,0 +1,97 @@ +>jgi|Phatr2|51136|estExt_fgenesh1_pm.C_chr_170007|PEPC1_PTRI MIDAASKLTATEALGVTRVFSIMLNLVNAAEVQHRNRQIRAHESTKDPSGGPLPKTEDSIRGTMETLLES KQATPEEIFAQLQKQKVEIVLTAHPTQVQRKSLLRKYRRVSEMLAYLERPDLDGFEKSSAQTSLQTILSS IWGADEIRRQKPTPQQEAAGGNAILESVLWDAVPAYLRKLDQQCRLTLGQSLPVDVCPIKFASWIGGDRD GNPNVTPEVTREVVLQQRLRAARLLLKDMYDLISELAISSRFSPAMDALADSVKDSQHKREKYRRVIGHL IKRLVKTARECELELSKLNTSASMVSQTLVEEAVDGWQDVDALDDATDLIKPLRIMYDSLVETGFGLVAD GLLVDIIRRLYVFGMSLVPLDIREESTKHTEALDAITRWLGIGSYSEWTEEARLSWLTSELSNKRPLYRI RELPKLGFNDSVLKTLNVFGTIATLRPSCLGAYVISQAQTASDVLAVMLLQKQYGMTDKNRNMMRVVPLF ETLNDLTNAPDKLEQLFSIPLYVGAVKGKQEVMVGYSDSAKDAGRLAACWAQYNSQERMVKVAAKHNIEL 
TFFHGKGGTVGRGGNPSVYRAIMSHPPNTINGRFRVTEQGEMITQNFGAPSIAERTLDIYTAGVCREAFS ERVEPSQAWRDQMQRISDVSCAEYRHLVREEPRFVPYFRQATPELELGSLNIGSRPAKRNPKGGIESLRA IPWTFAWTQTRTHLSAWLGVGAGLTTTDQSELKTLRAMYIEWPWFRETIDLIAMIVSKTDFSISKNYDDQ LVEKKEGLLKLGDEVREKMVQTRQAVLDVTESTDVAGAHVALMRGSSTIRHPYVDPVNVIQAELLKRLRV MDKKKSLLADEMEEQEILKDALIISINGIAQGMRNSG >jgi|Phatr2|27976|estExt_Genewise1.C_chr_100146|PEPC2_PTRI MLSSSCRRSFLAAKTRLRSCVTTSLSTGCPWSAISSGSTSRHIDRFFSTHSSFDEPNPSLFGASPLQAST VSSDATSIPSNEADRDIQLRADIKVMGSLLGRIIQTHEGAEVLEKVETMRGLAKTWRDQGAGRDPSTKQA ADQTFQNLAAYAKSFTDAELFTVSRAFTHFLAIANAAESHHRGRRLKQSRLLSDESSGALYPKPDSVGGV LPSLLAQGHDADAIYDALTSQTTELVLTAHPTEVNRRTILNKKRRIQRILTMADQQRQLGASSVFEQAEL NDALYREISSIWLSDEVSRIKPSPETEAEKGTLVLETVLWEAVPTFLRKLDATTREFLGKPLPLDSSPIR FASWMGGDRDGNPNVKPDTTRQVCLRNRQKAATLFARNLRTLEAELSLTTCSREVREVVGAAREPYRIFL QPMIRKMEATTDWAAQELAILQKRRSGDKSASGIASVASTNVEGIYLDQEEFRAELLTIYRSLQETGNEV AASGILTDIIRNLSSFGLTLIPLDVRQESDRHEEALDAITRYLGLGSYIQWDEQTRVSWLTTQISSKRPL LRAGVWYEHPDYFSPTAIDTLEISRMIAEQHEGSLGAYVISQATSASDVLAVLLLQLDAGVKKPLRVAPL FETLDDLNGAADTMRQLFSLPAYMGTIGGKQEVMIGYSDSAKDAGRMAATWAQYETQETLAKLAKEFGVD MTFFHGKGGTVGRGGNPQTFTAIMAHAPKTINGHFRVTEQGEMISQNFGYADRAERTMDIYTAAVLAEKL SERPKVKDEWRSMMKILSDISCEAYRQVVRKDERFVPYFRSATPELELSNLNIGSRPAKRKATGGVESLR AIPWNFAWTQTRFNLPTWLGVGDAIGQLLKSDRAPLLRELYREARAFQTMVDLVEMVLAKSEPAIAAHYD SVLVKDPKAKELGKEVRQLHMATEEAILDLTEHKKLGENNAVLQRALVVRNPYVDCLNILQVETLDRLRQ VEEGKEDKVLKDALLTTITGVANGMGNTG +>jgi|Thaps3|268546|estExt_thaps1_ua_kg.C_chr_30296|PEPC1_TPS +MGTLLGDAISTHHGRDVLEKVEALRTMAKESRRSGDSSSERLQSMVDFVSGLSATELVVVSRAFAHFLGV +ANAAEAHQRCRRLKLDLEREVSGEDVKGLLVEGAPTPEEVFKSLTSQTVEIVLTAHPTQVNRRTLLEKHG +RVQKILNDADGLRESGTPYQRKLLDDALRREIASIWQTDEVSRVKPTPQSEAERGTLVVETVLWEALPSF +LRKLDATMKWGLGEKYGLPLTASPFKFASWMGGDRDGNPNVTPDVTREVCLTNRIKAAQLLEGDVRELMG +VGTESEAMQRVRERSGDSRAPYRAYLNPVATKLANTATWAQQELRKSTTSFAPDEVYLHKDELMDELLTV +HQSLCDSGNTVVANGRLADIIRKLSSFGLTLVPLDIRQESDRHEEALNCITKYLGLGSYSQWDEGTRVSW +ITKQLQSKQPLIRDGAWNQPGNEQFFTPTSIDTLETFKMISDMHEESLGAYVISQCTSVSDILAVLLLQL 
+DAGVNKTLRVVPLFETLDDLNGAAATMEHLFSIPAYVGSLEGRKQEIMVGYSDSAKDAGRLAASWAQYET +QVTLSEVAKKHSVDVVYFHGKGGTVGRGGNPNTFEAILSHAPGTINGQFRVTEQGEMINQNFGFSDRAER +TLDIYTAALLAEQNTDRPLPTKEWKDMMDKLSQISCDAYRKIVRGDERFVPYFRAATPELELSNLNIGSR +PAKRKASGGVESLRAIPWIFAWTQTRLNLPTWLGVGEAINEVLSSPDEQTLRTMYKEWGSFRTTIDLVEM +TLSKSDSSIARHYENVLVRDPAAVALGGEIRNIHDATERAVMDLTGHKTLSEHDILLQRLMAVRNPYVDC +LNVLQAETLKRLRESEGSSEEEVLKDALLTTITGVANGMGNTG* +>jgi|Thaps3|34543|e_gw1.5.29.1|PEPC2_TPS +MDRAANPDDTAPFEEMKKLAYDINPRDTLGVMKTFSIALNLVNAAEVHHRIRLVRVSELKDDVNHIGPLP +MVEDSIRGTMEILLEGDCDDKDKLFERLTTQKCEIVLTAHPTEVNRKTIISKYRKISELLAYMERPDLHP +FERAEAVNNLRGIISAIWGADEIRRVKPTVQKEAAGGCAVIESVLWDAVPSYLRKLDAQCRVTLGKKFPV +DATPIKFASWIGGDRDGNPNCTPEVTLEVVTRQRLRAAKMFLNDLNMLYSELAISSRFSKELEALAASVK +KSDDNREKYRRVIGHLRRRLVRTVKECEAKLHTLTDTSEDVEPIIKSEELMTPLRIMYDSLVETGFELVA +DGHVSDIIRRVAVFGMTLVPLDIREESTRHTIAIDAITRHLGIGSYKEWDEEARLNWLQSELNNKRPLFR +IRDIEDNLLGLDPDNRKTLMVFKVASELDSESLGAYVISQANTASDVLLVMLLQKQFGMTEKNGKLMRVV +PLFETLTDLTNSPAQLERLFSITNYLGAINGKQEVMVGYSDSAKDAGRLAACWAQYTAQEAMANVADRYG +VELTFFHGKGGTVGRGGNPALYRAILSHPPNTINGRFRVTEQGEMIRQNFGSLEIAERTLDIYTAALLRE +SFTKRVEPKQEWRDQMERVSEVSCAAYRHTVRDDPRFVPYFRQATPELELGRLNIGSRPAKRNPKGGVDS +LRAIPWTFAWAQTRMHLSAWLGVGDGLRSDVSDQCRYMKTLQEMYEQWPWFREIISLISMLVSKTDFSIT +KNYDDLLVDSNLRSLGDEVRNKLVETRQAVIDVSGATDISGPHVQLMRASSTIRNPYVDSINVVQAEILK +VLR +>gb|Q6R2V6.1|Chlamydomonas_reinhardtii|Phosphoenolpyruvate_carboxylase +MTDSTYDFGAVRDDLTPLEDDCKLLGSLLDDCLRVEIGETMFKKIERIRALAQCASNLSIKGDAGASDML +SHRLAEELMNLDMDEAVPLTRACGHYLNLSGIAELHHGVRRDRATREPNPNSCDAVFARLITEGVDPEEL +YRAVSEQNVEVVLTAHPTQVNRRTLQYKHTRIAALLQQHDRSDLTAEERRNMVSELQREVAALWQTDELR +RQKPTPLDEARGGLHIVEQSLWAAVPQYMRRLSAALKKHTGHDLPLQATPFRFGSWMGGDRDGNPNVTAK +VTAHVTALARWMAADLYLREIDTLRFELSMNQCSAAVWKMARRIIAEGHTKRAGVVRAKAAAALHQTATD +AASHGGSAASAAAAAAAGGDVVADGTSGGGAAAAAGPAAAAAADDAFTFSRLGRPRPERPSTDVRSVGVL +AGGEGAAFPGGMILGTQPVSAHTAAEVSVPHELPGQDVEGGSEMDFNESRRASDAGDLGASQHPMLGGPS +AGASAEPTAHGYTTTATAAAAAADGTQPEPEVPGTPSYADPGTPDRLGALPGPFTPGPTPFREAANAAMS 
+TAASGGAGGGGGGGANRAASGLGGDPTFTRRSLMAQRLGTSSVQFARAHEHPGFHPYRIVLGHVRDRLAA +TRRRMEDLLSGREPAGEAHGGVGAGGGGGGGAAPWYESEDELAEPLMACYWSLWECGGGVIADGRLLDLI +RRVYTFGMCLMKLDLRQESTRHAEALDAVTSYLGLGSYLEWSEDQKIEWLTKELQGRRPLIPADMPMSAE +VREVLDTFKVAAHLGRDNLGAYVISMTKGASDVMAVELLQREARMQVGAEAGGRGGGGPEDGGSLRVVPL +FETLEDLDAAEDVMTRLLTNPWYREHLRAVHGDAQEVMLGYSDSGKDAGRLAANWALYKCQERLVAITKA +NNVKLTLFHGRGGTVGRGGGPTHIAIQSQPPGSVEGTFRITEQGEMVQAKFGISGVALSQLETYTTAVLL +ATMRPPSPPRREEWRAVMEMLSRVSCESYRNIVHHSPLFLRYFKHATPEAELGNLYIGSRPARRRNKDAS +ISTLRAIPWIFAWTQNRLILPSWLGIGAALTAAMTQGHLPTLQAMYREWPFFGSTVDLIEMILAKTDPRI +AALYEEVLVNDPEEKKLGAELRERLQRCQGAILKVTGHENLLSNNPTLSKLISMRSPFVDPINILQVEVL +RRLRQDPNNMRLRDALLISINGIAAGMRNTG +>gb|XP_003055786.1|predicted_protein|Micromonas_pusilla_CCMP1545 +MPTAMNLLKDAKFKELVADSHYLSGSGETHDFDHHEVLQESEDLMRALFFSIVRETTGTEFDDSLEAVYA +LSEQFHKSNDPADFAALTTKLGSLSDEETVMLASAFSNVLNLHNVSEHVAAAMEERHARLDDIPRGPAKT +TNGAIKGLIANGVSKETIYEALAEQEVDLVFTAHPTQALRRSMLKNFARIRQCLLDLQARRLSGYERAEI +LASMSSAIQAAWRTDEIRRNPPKPQDEMRAGLSYFNDTLFEGLPKFVRRIDTALINQGLPRIPLDKSIMK +FSSWMGGDRDGNPNVDSHCTKDAVYLARSKAADLYFDAIQNLIFSLSMWRCSESFKARAAARHAFVLSQQ +DDGALYAERKRRNYVDFWHALPLSQPYRVVLSEVRDRLYNTKEAIKDVIAGRVDALNPDDASIFTSKEQL +LEPLLACYNSLIDVGDKSVADGYLLDLIRQVNCFGLSLVRLDIRQESDRHADAMDAITKHIGLGSYNEWD +EEKKCAFLVAELEGKRPLVPRDLECTAEVQEVIDTFHMAGHLQRVCPGSLGTYVISMATVASDVLAVVLL +QRECGGHEDRLLRVAPLFERLDDLRDGPAQLRRLFSVPWYHKHIDGFQEVMIGYSDSGKDAGRMAAAWAL +YEGQENATHVGNEFGVKLTLFHGRGGTVGRGGGPSHLAIMSQPPATINGRLRVTVQGEVIEQNFGEHENC +FHTLDLYTAATLEHSLKPPTSPQTEWRDVMNVMSEESCEKYRKVVFETPEFIRYFAQATPAQELGSLNIG +SRPAKRKANPTVTALRAIPWIFAWTQSRFHLPVWLGMAEGFQKLKDDGKLPMLRAMYKDWPFFRVTMDLI +EMVLAKADFNVAEYYEKVLVEPGLHAFGATLRQQLVNTVKIVLEITEHPDLLTPQSDSKGQSSSTFLAEK +LSMRSTYITPLNIIQVENLKRLRAIESGEVSEEFMAKYAPSMPWSKEMLSLHGKNNWYHATVSDTLIITM +KGIAAGMQNTG +>gb|XP_001420862.1|predicted_protein|Ostreococcus_lucimarinus_CCE9901 +MLKAVFSGKDKAKITSHKRTGSLFASEEGEALDALARSSSYLSGRGETKEFNAHDVIEECDELLRTIFFA +VVRETAGDKFLGQLKSVYEASEKFGSSHDPKDFDAMQAMLETMEVDESLQFASAYSNLLNLHNISEQVAN 
+AMEERHRRLDDIPRGPAKTTNGAIKGLLRAGKSTEEIYSALAVQHVDLVLTAHPTQALRRSMLKSFGIIR +EKLLQLQRFRLSRYERAEVLDEIRSKVASAWRTDEIRRTPPKPQDEMRAGLTYFQQTIWDGIPTFMRRVD +TSLLANGCPRLPLDRSIVTFGSWMGGDRDGNPYVTASCTRDVVLLARVQGVNLLFRAIQRLIFDLSMWRC +NDAVKALAKDILENSETDNFTIFEERKKRNYDDFWKAIPEHEPYRVILAELRDKLYNTREALQRCIADND +VNIDMNDETIIRSKDELFAPLVVCYESLIEVGDAQIANAYLLDVIRQVQCFGLGLVKLDIRQESDRHAEA +LDAVTRYIGLGSYLEWSEEQKIEFLTRELESKRPLLPSDLECSDDVREVLDTCKMIAHLQQTCPGALGTY +VISMATSASDVLAVVLLQRECGCRKQDLLRVAPLFERLDDLNDAPRVLRQLFSVKWYHDHIAGFQEVMIG +YSDSGKDAGRMAAAWALYDGQERVVAAGKEFDVALTLFHGRGGTVGRGGGPAHIAMLSQPPGTVNGSIRV +TVQGEVIETDFGEKENCFHTLDLYTASVLEHTLKPPAHPRDEWRRVMDRMSEYSCAHYRKTVFETPDFVG +YFAQATPGAELGSLNIGSRPAKRKPSAGVTALRAIPWIFAWTQSRFHLPVWLGISTSFRRLIDEGELETL +RDMYKSWPFFEVTIDLVEMVLAKADPVVVAYYERALVDPKLHDFGASLRGELQESIDCILAVSEHIGLLA +KPEKVEANEAVQVHKKLAHKLHKRSLYITPLNVCQVRYLIAARALENEEDGDKLSMQKVKITLLEGYPFQ +DYNYKGAVNDVLKITMKGIAAGMQNTG +>gb|EKX31868.1|hypothetical_protein|GUITHDRAFT_121941|Guillardia_theta1_CCMP2712 +MQPLLQDTIVLRGLFFDALRHGMKGSSEKKRFMNQPSCHIEEHGYNTRNIDFNAASTNLECIEAFSENVK +KILSLSEKYAEQHTSKTLQDIVDVVERSSIEEVKIVARIFAVLLSQINIAERHHRYRRWSMYKRREIAIL +HFSDGQHHQADDCFKMLKENGFTPQKIHDSLCKQNLELVFTAHPTQSERRSILKKFAALDAALEALDLHG +ESQTPLQKELIFMRLQQTLLAIWRTNNMRTIKPSPEDEARYGLSVVEETLWDAVPQHYRIVDDSLRRLGQ +PPLPLNCNLITLGSWMGGDRDGNPYVTHDITKKIIYLSMMRACRLYYNEVEKLLWALSMTGEPSSEVLEW +LAEHQNDYHNETVEVEGRGGHQTKNWDFYRSEQTVEEPYRQVLVIVREMLERTIIRAEALSHGREPPPVD +GKCFRTTEELMKPFALIYDSLEQSGDHLVCHGKLKDLIRRIRTFGLYLVKLDIRQESSKHEEVMDAITSH +LGLGQYSTWSEETKIQWLTNELLSKRPLLGSQDFNCSPMVREVLDTFKVIMSCNAEPFGAYVISMTQSAS +DVLEVHLLQKEAGCRQHLRVAPLFETKEDLINAPKALLALYKNEWYRNHFDTVNTKYQEVMLGYSDSAKD +AGRLTSVWELYKAQESLVQISAEHKIPLNLFHGRGGSVGRGGGPQYLAILSQPAGSINGSLRVTIQGEVI +ENYFGSHRSCELTFERYTTAILKATLTPPAPPSDLFRDVMQRMSETSCAAYKKIVYDTPGFVDYFRAITP +EQELKTLNFGSRPSKRAKGGIETLRAIPWMFAWTQMRLHMPVWFGVGSAFKSEIDAGNLDTLREMYAKWP +FFQSTVELVEAVLSKVDVEITRLYEKMLVPADVLYIGEMIYKELDMSIECVKMITGRENLLSNNPIIKRL +YDIRRPMTDPLNILQAKVLRDMRMHENPPQELQESFAATVQGIAAGMGWTG + + diff --git a/PEPCK.txt 
b/PEPCK.txt new file mode 100644 index 0000000..e567884 --- /dev/null +++ b/PEPCK.txt @@ -0,0 +1,48 @@ +>jgi|Phatr2|23074|estExt_gwp_gw1.C_chr_210075|PEPCK_PTRI +MLLTTGAARVFLRSAAVSKSAVKTFAARAVLGSGRLSSPSSLYHGCSVQTFTSLPNDASTSCKEGREAYN +VSQTHKGTDACLKVGIDKLGITGPSTIYRNLNYDEIFEHEVKNGEGVVAKAEYGDTFCVDTGKFTGRSPK +DKWIVLNKGSETEANIDWNSINQATKPEVFDELYDKAVDYFNQRESCYVADVYCGANPSTRKKIRFLFDK +AWQQHFVTNMFIRPSDEAELDGFDPDFTVINCCAQVDDDWERHGLHSDTAVVFNIEKKTAVVFGTWYGGE +NKKGIFSLMNYWLPMQGHLPMHCSANVGKEGDVALFFGLSGTGKTTLSADPHRALIGDDEHGWDHDGIFN +FEGGCYAKTINLSEATEPDIYRAIHKDALLENVAIRDDGTPDYSNVSKTENGRVSYPIFNIPGYHKEQMA +GHPSNIIFLSCDAFGVMPPVARLSSGQAMYHFLSGYTAKVAGTERGITEPSATFSTCFGAAFMTMHPTVY +ADLLQEKLDKHGSHAYLVNSGWSGGAYGTGKRMSIKTTRTCIDAILDGSIHDAEFQVDPIFGYEVPKSLP +GLDDLLLDPKSTWDNQDAYDETAAKLAKMYSDNFKQYEGKGSIDYTKFGPKI* >jgi|Thaps3||gi|18203370|sp|Q9PP01|PEPCK_TPS +MKKFDKLGLDNIKEIFHNLSYDELNAHEKANNEGLSTDNDTFCVDTGIFTGRSPKDKYFVKQDPSSKYIA +WGKVNQPITKELFDKLLTKAKQELSGKKIYVQDVFCGASLQSRKAVRFVTEIAWQAHFVKNMFIRPSQEE +LENFKADFIVYNACKCINEDYKQDGLNSEVFVIFNVEENIAVIGGTWYGGEMKKGIFSMMNYWLPLENKL +SMHCSANVGEKDDVALFFGLSGTGKTTLSTDPKRRLIGDDEHGWDDEGVFNFEGGCYAKTINLDPEHEPE +IYGAIKRNALLENVVLRADKSVDYADASKTENTRVSYPIEHIENHEPSLKAGHPKNIIFLSADAFGILPP +VSKLSKEQAMYYFLSGYTAKVAGTERGITEPQATFSACFGEPFMPLHPTVYARLLGEKIEKHEVNVYLVN +TGWSGGSYGVGKRMSIKATRACINAILDGSITKCEFENFEVFDLAIPKALEGVESVLLNPINTWLDKNAY +IATRDKLAHMFIQNFKRYEDVKEGIEFSKFGPKI +>gb|EKX34557.1|phosphoenolpyruvate_carboxykinase|Guillardia_theta_CCMP2712 +MASSTSNDFTINTKNMFNVRQESCDNVVEQTLKLATYTTSKLVEPAVPFNFLDHPGCHRNMSFEDVRAKI +LARGEGSLVQESEALLVDTGKFTGRCPKDRYIVNAGEAASKVGWGDINRGITEEVFDKVMEGSAKRLCGL +EEVFVFDGFVGASRSSRKAVRVVTELAWHHHFCTNMFIRPTEEELASFKPDITILNSRYLFEGWKEAGLR +SETCVALDLNRGLSVITGTEYSGEMKKGAFSMMNYYLPLQNIMSMHCSATVGKGGDTAIFFGLSGTGKTT +LSADTTRFLIGDDEHGWDEEGIFNLEGGCYAKMIDLDPSKEPLIHAAIKENAILENIVLDQHGRPDYKDI +SKTENTRGSYPMWHIPNYQPSGTAPPPRNVIFLTCDAFGVLPPVSRLSTEQALYHLVCGYTSKVAGTEMG +ITEPTPTFSICFGGAFMPLPARVYAQLFRSKIEQHGCQVFLVNTGWSGGSYGTGRRMDINTTRAIVAAIL 
+DGSIEEATFSSPDPCFQLSVPLALPGVDAHVLNPRNTWASQEEYEATSRKLMGMYQANWQQFASDPFMAQ +LSRFGPGGDRST +>gb|XP_001694964.1|phosphoenolpyruvate_carboxykinase,splice_variant|Chlamydomonas_reinhardtii +MALLSSRSSANCTGRSVRRATVAPAPVVKPHSSVAMRFTNNTKQDAAVPAPCDMQFVLDSKFTRESGLQP +RVVFRNLTTPQLYEMALAHEPGTHITSSGALATLSGEKTGRSPKDKRVVRDPETEKDLWWGPYSPNYVMD +DRTFLTNRERAIDYLNTLDRVYVVDAFVNWDPESRLKVRVVTSRAYHALFMSNMLIKPTEEELKTFGEPD +FVIYNAGAFPANKYTQFMTSQTSIDLSLKHKEMVILGTMYAGEMKKGVFTLMHYLMPMQGKLSLHSGCNV +GADDDVTMFFGLSGTGKTTLSADPKRPLIGDDEHVWSDKGVFNIEGGCYAKCIGLKATTEPEIWNAIKFG +TVLENVDYNPVTREVDYESERLTENTRASYPIEFMNNARIPCVGPHPKNVVLLACDAFGALPPVSRLTLE +QAMYHFISGYTAKVAGTEMGVTEPTATFSACFGSAFLMLHPYKYATMLAEKMKAHGTTAWLINTGWTGGK +YGVGKRISLKHTRAIIDAIHSGELDKAEYVTTPIFGLQVPKAISGVPAEILSPENVWPNKDEFAMCLNSL +GHMFIRNFEHFNDGEQFVGKDTAARILTGGPQPIAKEDVEKKGFGAFKTQ +>gb|AAC98698.1|phosphoenolpyruvate_carboxykinase|Rattus_norvegicus +MPPQLHNGLDFSAKVIQGSLDSLPQEVRKFVEGNAQLCQPEYIHICDGSEEEYGRLLAHMQEEGVIRKLK +KYDNCWLALTDPRDVARIESKTVIITQEQRDTVPIPKSGQSQLGRWMSEEDFEKAFNARFPGCMKGRTMY +VIPFSMGPLGSPLAKIGIELTDSPYVVASMRIMTRMGTSVLEALGDGEFIKCLHSVGCPLPLKKPLVNNW +ACNPELTLIAHLPDRREIISFGSGYGGNSLLGKKCFALRIASRLAKEEGWLAEHMLILGITNPEGKKKYL +AAAFPSACGKTNLAMMNPTLPGWKVECVGDDIAWMKFDAQGNLRAINPENGFFGVAPGTSVKTNPNAIKT +IQKNTIFTNVAETSDGGVYWEGIDEPLAPGVTITSWKNKEWRPQDEEPCAHPNSRFCTPASQCPIIDPAW +ESPEGVPIEGIIFGGRRPAGVPLVYEALSWQHGVFVGAAMRSEATAAAEHKGKVIMHDPFAMRPFFGYNF +GKYLAHWLSMAHRPAAKLPKIFHVNWFRKDKNGKFLWPGFGENSRVLEWMFGRIEGEDSAKLTPIGYVPK +EDALNLKGLGDVNVEELFGISKEFWEKEVEEIDKYLEDQVNADLPYEIERELRALKQRISQM diff --git a/PEP_HMM_1.py b/PEP_HMM_1.py new file mode 100644 index 0000000..591d8f0 --- /dev/null +++ b/PEP_HMM_1.py @@ -0,0 +1,95 @@ +#!/usr/bin/python +#MMETSP_sample_import.py +#inputs: (1) taxonomic classification of interest, (2) mmetsp taxa file with path +#outputs: rips and writes peptide fasta files and count data for each taxa + +#import libraries +import sys +from ftplib import FTP #import the ftp library +import re +import os +import numpy as np + +######Get 
arguments from command line######## +t=sys.argv[1] #full or partial taxa name +need='mmetsp_taxonomy.txt' #mmetsp taxa file with path + +####Build files for HMM########## +#you only need to run this once +os.system('./mafft_hmmbuild.sh') + +###########RETRIEVE NAMES!!########## +mt=open('mmetsp_taxonomy.txt','r') +g=[] #make an empty list to store genus names +for line in mt: + if re.search(t,line): #if taxa name in line + g= g+line.split('\t')[7:8]#pull out the 8th field should be genus, keeping as list + +g=set(g) #keep only unique genus names +print g +#close the taxonomy file +mt.close() + +ftp= FTP('ftp.imicrobe.us') #set home ftp server +ftp.login() #log in +ftp.cwd('camera/combined_assemblies') #ch + +files=ftp.nlst() #make a list of all files and directories in wd +delimiter=' ' +all=delimiter.join(files) + +names=[] + +for genus in g: + string= genus+"\S*.pep.fa.gz" + taxafiles=re.findall(string, all) + print "{} files matching genus=".format(len(taxafiles))+genus + print taxafiles + if len(taxafiles) > 0: + for filex in taxafiles: + command = "RETR "+filex + outfile = filex + #ftp.retrbinary(command, open(outfile, 'wb').write) + names.append(outfile) + +ftp.quit() + +print 'Part 1' + +###########RETRIEVE PEP.FA########## +os.system('python ./MMETSP_sample_import.py {} {}'.format(t,need)) + +###########RETREIVE COUNTS########## +names2=[i.split('.')[0] for i in names] #removes .pep.fa.gz from the names + +ftp= FTP('ftp.imicrobe.us') #set ftp server +ftp.login() #log in +ftp.cwd('camera/combined_assemblies') #change directory + +#location for files +t=os.getcwd() +for ID in names2: + #change to taxa directory/readcounts + ripdir= ID+"/readcounts" + ftp.cwd(ripdir) #change directory + savefile= ID+"_cds_counts.txt" #saves files with unique names + ftp.retrbinary('RETR cds.dat', open(savefile, 'wb').write) + ftp.cwd("~/camera/combined_assemblies") #change directory to restart loop in right place + +ftp.quit() + +print 'Part 2' + +#if t=='Dinophyceae': + 
#names.remove('Durinskia-baltica-CSIRO_CS-38.pep.fa.gz') + #names.remove('Oxyrrhis-marina-CCMP1795.pep.fa.gz') + #names.remove('Alexandrium-fundyense-CCMP1719.pep.fa.gz') + +print names +print 'Part 2B' + +#########RUN HMM ######### +for i in names: + os.system('./MAGIC_HMM.sh {}'.format(i)) + +print 'COMPLETE' \ No newline at end of file diff --git a/PEP_HMM_PART2.ipynb b/PEP_HMM_PART2.ipynb new file mode 100644 index 0000000..827603f --- /dev/null +++ b/PEP_HMM_PART2.ipynb @@ -0,0 +1,869 @@ +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 40, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "import sys\n", + "from ftplib import FTP #import the ftp library\n", + "import re \n", + "import os\n", + "import pandas as pd\n", + "import numpy as np" + ] + }, + { + "cell_type": "code", + "execution_count": 57, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "#t='Dinophyceae'\n", + "#t='Bacillariophyta'\n", + "#t='Haptophyta'\n", + "t='Raphidophyceae'" + ] + }, + { + "cell_type": "code", + "execution_count": 58, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Chattonella-subsalsa-CCMP2191.pep.fa.gz', 'Heterosigma-akashiwo-CCMP2393.pep.fa.gz', 'Heterosigma-akashiwo-CCMP3107.pep.fa.gz', 'Heterosigma-akashiwo-CCMP452.pep.fa.gz', 'Heterosigma-akashiwo-NB.pep.fa.gz']\n" + ] + } + ], + "source": [ + "##retrive files\n", + "mt=open('mmetsp_taxonomy.txt','r')\n", + "g=[] #make an empty list to store genus names\n", + "for line in mt:\n", + " if re.search(t,line): #if taxa name in line\n", + " g= g+line.split('\\t')[7:8]#pull out the 8th field should be genus, keeping as list\n", + "\n", + "g=set(g) #keep only unique genus names\n", + "#print g\n", + "#close the taxonomy file\n", + "mt.close()\n", + "\n", + "ftp= FTP('ftp.imicrobe.us') #set home ftp server\n", + "ftp.login() #log in\n", + "ftp.cwd('camera/combined_assemblies') #ch\n", + 
"\n", + "files=ftp.nlst() #make a list of all files and directories in wd\n", + "delimiter=' '\n", + "all=delimiter.join(files)\n", + "\n", + "names=[]\n", + "\n", + "for genus in g:\n", + " string= genus+\"\\S*.pep.fa.gz\"\n", + " taxafiles=re.findall(string, all)\n", + " #print \"{} files matching genus=\".format(len(taxafiles))+genus\n", + " #print taxafiles\n", + " if len(taxafiles) > 0:\n", + " for filex in taxafiles:\n", + " command = \"RETR \"+filex\n", + " outfile = filex\n", + " #ftp.retrbinary(command, open(outfile, 'wb').write)\n", + " names.append(outfile)\n", + " \n", + "ftp.quit()\n", + "\n", + "if t=='Dinophyceae':\n", + " names.remove('Durinskia-baltica-CSIRO_CS-38.pep.fa.gz')\n", + " names.remove('Oxyrrhis-marina-CCMP1795.pep.fa.gz')\n", + " names.remove('Alexandrium-fundyense-CCMP1719.pep.fa.gz')\n", + "\n", + "print names" + ] + }, + { + "cell_type": "code", + "execution_count": 4, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "['Karenia-brevis-CCMP2229.pep.fa.gz', 'Karenia-brevis-SP1.pep.fa.gz', 'Karenia-brevis-SP3.pep.fa.gz']\n" + ] + } + ], + "source": [ + "#names=names[:3]\n", + "#print names" + ] + }, + { + "cell_type": "code", + "execution_count": 59, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "def readHMM(Organism,gene_name):\n", + " \"\"\" Takes in organism and gene_name from HMM results and makes a table.\n", + " HMM results from --tblout that have the following name organism_genename_HMM.csv\n", + " Note: pep.fa files differ in structure and it can affect how the HMM output is written. 
If you can't read the file in \n", + " modify the fuction\"\"\"\n", + "\n", + " hold=pd.read_csv('{}_{}_HMM.csv'.format(Organism,gene_name),delim_whitespace=True,engine='python',skiprows=2,skipfooter=10)\n", + " #empty files have 12 rows so the following if statement will only work on files that are not empty\n", + " \n", + " if hold.shape[0]!=0:\n", + " readX=pd.read_csv('{}_{}_HMM.csv'.format(Organism,gene_name),delim_whitespace=True,engine='python',skiprows=3,\n", + " header=None ,skipfooter=10)\n", + " \n", + " samplenames=[]\n", + " for i in np.arange(1,readX.shape[1]+1):\n", + " samplenames.append(str(i))\n", + "\n", + " readX.columns=samplenames\n", + " \n", + " new=pd.DataFrame()\n", + " new['CAMPEPid']=readX['1']\n", + " new['contig']=readX['19']\n", + " new['Evalue']=readX['5']\n", + " new['Annotation']='{}'.format(gene_name)\n", + "\n", + " new.contig=new.contig.str.split(\"|\").str[1]\n", + " return new\n", + " #return readX" + ] + }, + { + "cell_type": "code", + "execution_count": 60, + "metadata": { + "collapsed": true + }, + "outputs": [], + "source": [ + "##MAKEHMM\n", + "def makeHMM(Organism,gene_type):\n", + " \"\"\"Combines the plots from each gene_type into a table\"\"\"\n", + " if gene_type=='CF':\n", + " a=readHMM(Organism,'PK')\n", + " b=readHMM(Organism,'PEPC')\n", + " c=readHMM(Organism,'PEPCK')\n", + " d=readHMM(Organism,'PK')\n", + " e=readHMM(Organism,'MDH')\n", + " f=readHMM(Organism,'OMT')\n", + " g=readHMM(Organism,'PYC')\n", + " h=readHMM(Organism,'PPDK')\n", + " i=readHMM(Organism,'ME')\n", + " \n", + " frames = [a,b,c,d,e,f,g,i,h]\n", + " result = pd.concat(frames)\n", + " return result\n", + " if gene_type=='PR':\n", + " a=readHMM(Organism,'SHMT')\n", + " b=readHMM(Organism,'GOX')\n", + " c=readHMM(Organism,'GDCT')\n", + " d=readHMM(Organism,'PGP')\n", + " e=readHMM(Organism,'ICL')\n", + " f=readHMM(Organism,'GCL')\n", + " g=readHMM(Organism,'HR')\n", + " h=readHMM(Organism,'SPT')\n", + " i=readHMM(Organism,'TSR')\n", + " 
j=readHMM(Organism,'MS')\n", + " k=readHMM(Organism,'GlcDH')\n", + " l=readHMM(Organism,'ALAT_GGAT')\n", + " m=readHMM(Organism,'GK')\n", + " \n", + " frames = [a,b,c,d,e,f,g,h,i,j,k,l,m]\n", + " result = pd.concat(frames)\n", + " return result\n", + " if gene_type=='BP':\n", + " a=readHMM(Organism,'CA_alpha')\n", + " b=readHMM(Organism,'CA_delta')\n", + " c=readHMM(Organism,'CA_beta')\n", + " d=readHMM(Organism,'Ca_zeta')\n", + " e=readHMM(Organism,'Bestrophin')\n", + " f=readHMM(Organism,'SLC4')\n", + "\n", + " frames = [a,b,c,d,e,f]\n", + " result = pd.concat(frames)\n", + " return result" + ] + }, + { + "cell_type": "code", + "execution_count": 61, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "###COUNTS\n", + "def HMMcmp(Organism,gene_type):\n", + " result1=makeHMM(Organism,gene_type)\n", + " contig_EV=dict(zip(result1.contig,result1.Evalue))\n", + " contig_Annot=dict(zip(result1.contig,result1.Annotation))\n", + " \n", + " import re\n", + " Organism = re.sub('.pep.fa.gz', '', Organism)\n", + " \n", + " expression=pd.read_csv('{}_cds_counts.txt'.format(Organism),delimiter='\\t',index_col=0)\n", + " \n", + " #treatments=len(list(expression))\n", + " #print treatments\n", + " \n", + " expression['log2CPM']=np.log2(expression.sum(axis=1)/1000000)\n", + " expression.index= expression.index.str.split(\"|\").str[1]\n", + " contig_CMP=dict(zip(expression.index,expression.log2CPM))\n", + " \n", + " Contig=[]\n", + " Evalue=[]\n", + " Annotation=[]\n", + " log2CPM=[]\n", + " \n", + " for i in expression.index:\n", + " a=contig_CMP.get(i)\n", + " if a>-16 or a==\"-inf\" or a==\"inf\":\n", + " Contig.append(i)\n", + " Evalue.append(contig_EV.get(i))\n", + " Annotation.append(contig_Annot.get(i))\n", + " log2CPM.append(a)\n", + " \n", + " out=pd.DataFrame()\n", + " out['Contig']=Contig\n", + " out['Evalue']=Evalue\n", + " out['Annotation']=Annotation\n", + " out['log2CPM']=log2CPM\n", + " \n", + " return out" + ] + }, + { + "cell_type": 
"code", + "execution_count": 62, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ContigEvalueAnnotationlog2CPM
0267033_1NaNNone-12.137153
13814_1NaNNone-9.180025
277900_1NaNNone-11.266233
3163778_1NaNNone-13.609640
4156264_1NaNNone-15.231129
\n", + "
" + ], + "text/plain": [ + " Contig Evalue Annotation log2CPM\n", + "0 267033_1 NaN None -12.137153\n", + "1 3814_1 NaN None -9.180025\n", + "2 77900_1 NaN None -11.266233\n", + "3 163778_1 NaN None -13.609640\n", + "4 156264_1 NaN None -15.231129" + ] + }, + "execution_count": 62, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "HMMcmp('Heterosigma-akashiwo-CCMP2393.pep.fa.gz','BP').head()" + ] + }, + { + "cell_type": "code", + "execution_count": 63, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "###CLEAN\n", + "def HMMclean(Organism):\n", + " '''Will create an HMM table and filter out values that do not meet an e-value'''\n", + " aX=HMMcmp(Organism,'BP')\n", + " bX=HMMcmp(Organism,'CF')\n", + " cX=HMMcmp(Organism,'PR')\n", + " \n", + " frames=[aX,bX,cX]\n", + " HMM= pd.concat(frames)\n", + " \n", + " HMM=HMM[HMM['Evalue'] < .0001]\n", + " \n", + " #return HMM.drop_duplicates()\n", + " \n", + " Genes= np.unique(HMM['Annotation'])\n", + " AX=pd.DataFrame(0,index=[Organism],columns=Genes)\n", + " \n", + " for j in Genes:\n", + " counts=HMM.Annotation.value_counts()[j]\n", + " AX[j]= counts\n", + " \n", + " return AX\n", + " " + ] + }, + { + "cell_type": "code", + "execution_count": 23, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
ALAT_GGATBestrophinCA_alphaCA_betaGCLGDCTGKGOXGlcDHHR...MEOMTPEPCKPGPPKPYCSHMTSLC4SPTTSR
Heterosigma-akashiwo-CCMP2393.pep.fa.gz13659111425...4621393154316
\n", + "

1 rows × 22 columns

\n", + "
" + ], + "text/plain": [ + " ALAT_GGAT Bestrophin CA_alpha \\\n", + "Heterosigma-akashiwo-CCMP2393.pep.fa.gz 13 6 5 \n", + "\n", + " CA_beta GCL GDCT GK GOX GlcDH \\\n", + "Heterosigma-akashiwo-CCMP2393.pep.fa.gz 9 1 1 1 4 2 \n", + "\n", + " HR ... ME OMT PEPCK PGP PK \\\n", + "Heterosigma-akashiwo-CCMP2393.pep.fa.gz 5 ... 4 62 13 9 3 \n", + "\n", + " PYC SHMT SLC4 SPT TSR \n", + "Heterosigma-akashiwo-CCMP2393.pep.fa.gz 15 4 3 1 6 \n", + "\n", + "[1 rows x 22 columns]" + ] + }, + "execution_count": 23, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "HMMclean('Heterosigma-akashiwo-CCMP2393.pep.fa.gz')" + ] + }, + { + "cell_type": "code", + "execution_count": 64, + "metadata": { + "collapsed": false + }, + "outputs": [], + "source": [ + "###GENE_TYPE\n", + "def gen_type(gene_type):\n", + " \"\"\" Allows you to extract specific gene_types\"\"\"\n", + " \n", + " if gene_type=='BP':\n", + " fixed=['SLC4','Bestrophin','CA_alpha','CA_beta','CA_delta','CA_zeta']\n", + " if gene_type=='PR':\n", + " fixed=['PGP','GOX','SPT','ALAT_GGAT','GDCT','SHMT','HR','GK','GlcDH','MS','ICL','GCL','TSR']\n", + " if gene_type=='CF':\n", + " fixed=['PK','PEPC','PEPCK','MDH','OMT','ME','PPDK','PYC']\n", + " \n", + " A=pd.DataFrame(0, index=names, columns=fixed)\n", + " \n", + " for i,j in enumerate(names):\n", + " for k in fixed:\n", + " B=HMMclean(j)\n", + " #print B\n", + " if k in list(B):\n", + " A[k][i]=B[k]\n", + " \n", + " A.index = A.index.str.split('.').str[0] \n", + " ####the count is the number of transcriptomes taken into account\n", + " ###I just added gene_name to teh count to make teh values unique for when teh different gene_type frames are concatenated\n", + " A['count{}'.format(gene_type)]=0\n", + " \n", + " for i,j in enumerate(A.index):\n", + " expression=pd.read_csv('{}_cds_counts.txt'.format(j),delimiter='\\t',index_col=0)\n", + " A['count{}'.format(gene_type)][i]=len(list(expression))\n", + " \n", + " C=A.transpose()\n", + " 
C.to_csv('{}_{}_GeneCountHMM.csv'.format(t,gene_type))\n", + " \n", + " return A.transpose()" + ] + }, + { + "cell_type": "code", + "execution_count": 38, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "data": { + "text/html": [ + "
\n", + "\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + " \n", + "
Karenia-brevis-CCMP2229Karenia-brevis-SP1Karenia-brevis-SP3Karenia-brevis-WilsonPeridinium-aciculiferum-PAER_2Karlodinium-micrum-CCMP2283Prorocentrum-minimum-CCMP1329Prorocentrum-minimum-CCMP2233Symbiodinium-kawagutii-CCMP2468Symbiodinium-sp-C1...Alexandrium-monilatum-CCMP3105Alexandrium-temarense-CCMP1771Azadinium-spinosum-3D9Ceratium-fusus-PA161109Oxyrrhis-marina-LB1974Oxyrrhis-marinaGlenodinium-foliaceum-CCAP1116_3Scrippsiella-hangoei-SHTV5Scrippsiella-hangoei_like-SHHI_4Scrippsiella-trochoidea-CCMP3099
SLC412191432232212...41065012320
Bestrophin1321399012827784950032...8685628754597161572
CA_alpha2019191925207607...1923192416173529300
CA_beta765418000022...887209151823280
CA_delta9787065701...6720001000
CA_zeta0000000000...0000000000
countBP4224234342...4432342333
\n", + "

7 rows × 27 columns

\n", + "
" + ], + "text/plain": [ + " Karenia-brevis-CCMP2229 Karenia-brevis-SP1 Karenia-brevis-SP3 \\\n", + "SLC4 12 19 14 \n", + "Bestrophin 132 139 90 \n", + "CA_alpha 20 19 19 \n", + "CA_beta 7 6 5 \n", + "CA_delta 9 7 8 \n", + "CA_zeta 0 0 0 \n", + "countBP 4 2 2 \n", + "\n", + " Karenia-brevis-Wilson Peridinium-aciculiferum-PAER_2 \\\n", + "SLC4 32 2 \n", + "Bestrophin 128 27 \n", + "CA_alpha 19 25 \n", + "CA_beta 4 18 \n", + "CA_delta 7 0 \n", + "CA_zeta 0 0 \n", + "countBP 4 2 \n", + "\n", + " Karlodinium-micrum-CCMP2283 Prorocentrum-minimum-CCMP1329 \\\n", + "SLC4 3 2 \n", + "Bestrophin 78 49 \n", + "CA_alpha 20 7 \n", + "CA_beta 0 0 \n", + "CA_delta 6 5 \n", + "CA_zeta 0 0 \n", + "countBP 3 4 \n", + "\n", + " Prorocentrum-minimum-CCMP2233 Symbiodinium-kawagutii-CCMP2468 \\\n", + "SLC4 2 1 \n", + "Bestrophin 50 0 \n", + "CA_alpha 6 0 \n", + "CA_beta 0 0 \n", + "CA_delta 7 0 \n", + "CA_zeta 0 0 \n", + "countBP 3 4 \n", + "\n", + " Symbiodinium-sp-C1 ... \\\n", + "SLC4 2 ... \n", + "Bestrophin 32 ... \n", + "CA_alpha 7 ... \n", + "CA_beta 22 ... \n", + "CA_delta 1 ... \n", + "CA_zeta 0 ... \n", + "countBP 2 ... 
\n", + "\n", + " Alexandrium-monilatum-CCMP3105 Alexandrium-temarense-CCMP1771 \\\n", + "SLC4 4 10 \n", + "Bestrophin 86 85 \n", + "CA_alpha 19 23 \n", + "CA_beta 8 8 \n", + "CA_delta 6 7 \n", + "CA_zeta 0 0 \n", + "countBP 4 4 \n", + "\n", + " Azadinium-spinosum-3D9 Ceratium-fusus-PA161109 \\\n", + "SLC4 6 5 \n", + "Bestrophin 62 87 \n", + "CA_alpha 19 24 \n", + "CA_beta 7 20 \n", + "CA_delta 2 0 \n", + "CA_zeta 0 0 \n", + "countBP 3 2 \n", + "\n", + " Oxyrrhis-marina-LB1974 Oxyrrhis-marina \\\n", + "SLC4 0 1 \n", + "Bestrophin 54 59 \n", + "CA_alpha 16 17 \n", + "CA_beta 9 15 \n", + "CA_delta 0 0 \n", + "CA_zeta 0 0 \n", + "countBP 3 4 \n", + "\n", + " Glenodinium-foliaceum-CCAP1116_3 Scrippsiella-hangoei-SHTV5 \\\n", + "SLC4 2 3 \n", + "Bestrophin 71 61 \n", + "CA_alpha 35 29 \n", + "CA_beta 18 23 \n", + "CA_delta 1 0 \n", + "CA_zeta 0 0 \n", + "countBP 2 3 \n", + "\n", + " Scrippsiella-hangoei_like-SHHI_4 Scrippsiella-trochoidea-CCMP3099 \n", + "SLC4 2 0 \n", + "Bestrophin 57 2 \n", + "CA_alpha 30 0 \n", + "CA_beta 28 0 \n", + "CA_delta 0 0 \n", + "CA_zeta 0 0 \n", + "countBP 3 3 \n", + "\n", + "[7 rows x 27 columns]" + ] + }, + "execution_count": 38, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "gen_type('BP')" + ] + }, + { + "cell_type": "code", + "execution_count": 65, + "metadata": { + "collapsed": false + }, + "outputs": [ + { + "name": "stdout", + "output_type": "stream", + "text": [ + "COMPLETE\n" + ] + } + ], + "source": [ + "a=gen_type('BP')\n", + "b=gen_type('CF')\n", + "c=gen_type('PR')\n", + "\n", + "framesX=(a,b,c)\n", + "outFrame=pd.concat(framesX)\n", + "\n", + "outFrame.to_csv('{}_GeneCountHMM.csv'.format(t))\n", + "\n", + "print 'COMPLETE'" + ] + } + ], + "metadata": { + "anaconda-cloud": {}, + "kernelspec": { + "display_name": "Python [default]", + "language": "python", + "name": "python2" + }, + "language_info": { + "codemirror_mode": { + "name": "ipython", + "version": 2 + }, + "file_extension": ".py", + 
"mimetype": "text/x-python", + "name": "python", + "nbconvert_exporter": "python", + "pygments_lexer": "ipython2", + "version": "2.7.12" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} diff --git a/PGP.txt b/PGP.txt new file mode 100644 index 0000000..9fd4a2c --- /dev/null +++ b/PGP.txt @@ -0,0 +1,27 @@ +>jgi|Thaps3|25544|estExt_fgenesh1_pg.C_chr_19c_290042|PGP_TPS +MSMRPKDLLPGVDVFIFDCDGVIWRGDSVIPGIPQTLEKLRALGKKMYFVTNNSTKSRAGYKKKFDSLGL +NVPAEEIFSSSFAAAAYLEQSKFKETGKKVYVVGEVGIQEELDLIGVPHFGGPEDANKQPDMGPGCMVEH +DEDVGAVVVGFDRNINYYKIQYAQLCINENPGCEFIATNTDAVTHLTDAQEWAGNGSMVGAIKGCTGREP +TVVGKPSPLMIDYLCDKLGLDRGRICMVGDRLDTDILFGSDNGLKSLLVLSGVTTEEKLLSQENVITPDY +YADSIVDFFVDENAKVGA* +>jgi|Phatr2|48026|estExt_fgenesh1_pg.C_chr_150301|PGP_PTRI +MSLRMAGTLALFVTGATTRALGRSGGTSKRPFSLRLPLQTGTQPTLALSLSSSASANKPTLSLTEQMRKE +SEAELAKLAHHYEDRARNDPAFADLAPIIWKTLDEATAFVNDHIETIMFDCDGVVYRTPDECPGAKECIQ +RLLDKGKRVFFVTNNAASNRSQLRAKLSEILAIENLTDDMMVPSSYSCARFLQREILDRKGRGRLFVIGS +RGLCDELEQTGFEVLTGNGPLDSDASMTREDLATYPFSEHPVDAVVVANVLLQMNPDAPLVATNKDAFDL +VGVDGRHIPGNGCAVVALEHSSKRTAINVGKPSATLADLIAADHGINPSRTMFVGDRLDTDIQFGVENGM +HSVLVMTGVTTADSMVQLGNGTNDEPLPNIVIPHIGLLY* +>gi|EOD14003.1|2-phosphoglycolate_phosphatase|EHUX_CCMP1516 +MGLSLSSSLRIGLLSAAAGTAAGFSFGAAAASGVRRTASTGLRPASLAAQSSAAAVSFKDAGLCKKLDVP +ADLLEKVDVFIFDCDGVIWKGDSLIDKVPAVLAMLRAAGKKVFFVTNNSTKSRKGYLGKFKSLGLEETQP +EEIFSSSFAAAAYLEQTKFKESGKKVYIIGEVGIEEELDMIGVPWIGGGSDAGKKIALKSGYALPHDSDV +GAVIVGFDREINYHKIQYAQLCINENPGCEFIATNLDAVTHLTDAQEWAGNGAMAGAIKGCTGREPTLVG +KPSPLMIDYMVEKFGIERGRICMVGDRLDTDILFGQNNGLLSCLTLSGVTTEEKLLSPENEIKPDFYVDS +IADFL +>gi|NP_001119316.1|2-phosphoglycolate_phosphatase1|Arabidopsis_thaliana +MLSRSVASAVTPVSSSSLLPNSKPIFCLKTLSGYRSSSFCGGCIRKINHKPLRMTSSNITPRAMATQQLE +NADQLIDSVETFIFDCDGVIWKGDKLIEGVPETLDMLRAKGKRLVFVTNNSTKSRKQYGKKFETLGLNVN +EEEIFASSFAAAAYLQSINFPKDKKVYVIGEEGILKELELAGFQYLGGPDDGKRQIELKPGFLMEHDHDV +GAVVVGFDRYFNYYKIQYGTLCIRENPGCLFIATNRDAVTHLTDAQEWAGGGSMVGALVGSTQREPLVVG 
+KPSTFMMDYLADKFGIQKSQICMVGDRLDTDILFGQNGGCKTLLVLSGVTSISMLESPENKIQPDFYTSK +ISDFLSPKAATV diff --git a/PK.txt b/PK.txt new file mode 100644 index 0000000..fd194ac --- /dev/null +++ b/PK.txt @@ -0,0 +1,111 @@ +>jgi|Thaps3|22345|estExt_fgenesh1_pg.C_chr_40571|PK_TPS +MISNTSDVPLLAGGYINLDTVKATNNIGSRRTKIICTLGPACWDVSQLEELIESGMNVARFNFSHGDHDG +HKACLDRLRQAAKNMNQNVAVLLDTKGPEIRTGFFADGAKSINLVKGEELILTSDYAYKGDSKKLACSYE +KLASSVNPGQSILVADGSLVLTVVSCDETTGEVVTRVENNAKIGERKNMNLPGVVVDLPTLTEKDVDDIV +NWGIKHDVDYIAASFVRKASDVLFIRKILAENGGSGIKIISKIENQEGLQNYLEILQATDGIMVARGDLG +MEIPPEKVFLAQKYMIREANIAGKPVITATQMLESMITNPRPTRAECSDVANACYDGTDAVMLSGETANG +CYYRQAVEIMARTCAEAETSVNWNELYQSVRNSVRKRYQLSSSESLASSAVKTAVDVGAKVIVVYSESGA +TARHIAKFRPGMPVAVLTPSEQVARQSFGLLKGSYAFVVDTLEDTHKLDKEVMRECRVAGIAQAGDPVVI +VCGSTFGTGATNQIKVEFVQSDDGDADDGKAHLDNNNAEYNGCTIC* +>jgi|Phatr2|22404|estExt_gwp_gw1.C_chr_170082|PK1_PTRI +MKLSLLALTFALGHAFVPPSFLASPSSRKVLSSSRSASVAANAADVLAKTTSSSSTPSSLMPKETTVAAV +PKVAQRWRKSTKQVVTLGPASSNKEMIEKLFLAGADVFRLNFSHGSQEQKKELLIMIREVEEKYSHPIGI +LGDLQGPKLRVGEFSKPEGEFLELGQSFRLDLDNAKGDNKRVQLPHPEIIKASELGHALLVDDGKVKLVV +TAKGDDYLECRVDVAGMIKDRKGVNTPDSVLEISPLTPKDRSDLEYMLGIGVDWVALSFVQTPADMVEIH +ALIDEKLPSGQFKPAVMAKIEKPSCFYDDNLQRIVGLCNGIMVARGDLGVECPPEDVPLLQKEIIDECRN +QGRPVIVATQMLESMIEVPTPTRAEASDVATAIYDGADAIMLSAESAAGKFPEESVAMQQRIINRVEGDK +HYRSYLKQNEPDPENTPTDAIITAARQVAKTIGAKSIVCFSLRGSTVLRASKSRPGVPILALCPFKETSR +QLALSWGVYSDLPKAGSYGYTVSEEDMFNYDRPMVEKSTDDFDLVLKNACRAALKKGLVSDPDDLLVVTA +GLPFGTPGAANIIRVVPAAGPSCWDGVCRVD* +>jgi|Phatr2|49098|estExt_fgenesh1_pg.C_chr_200225|PK2_PTRI +MTASQTKITASGPELRGANITLDTIMKKTDVSTRQTKIVCTLGPACWEVEQLESLIDAGLSIARFNFSHG +DHEGHKACLDRLRQAADHKKKHVAVMLDTKGPEIRSGFFADGAKKISLVKGETIVLTSDYSFKGDKHKLA +CSYPVLAKSVTPGQQILVADGSLVLTVLSCDEAAGEVSCRVENNAGIGERKNMNLPGVIVDLPTLTDKDI +DDIQNWGIVNDIDFIAASFVRKASDVHKIREVLGEKGKGIKIICKIENQEGMDNYDEILEATDAIMVARG +DLGMEIPPEKVFLAQKMMIRQANIAGKPVVTATQMLESMITNPRPTRAECSDVANAVLDGTDCVMLSGET +ANGEYPTAAVTIMSETCCEAEGAQNTNMLYQAVRNSTLSQYGILSTSESIASSAAKTAIDVGAKAIIVCS 
+ESGMTATQVAKFRPGRPIHVLTHDVRVARQCSGYLRGASVEVISSMDQMDPAIDAYIECCKANGKAVAGD +AFVVVTGTVAQRGATNLMRVMYA* +>jgi|Phatr2|56445|AGR_estExt_Phatr1_ua_kg.C_chr_200033|PK3_PTRI +MSLSQSSDVPILAGGFITLDTVKHPTNTINRRTKIVCTIGPACWNVDQLEILIESGMNVARFNFSHGDHA +GHGAVLERVRQAAQNKGRNIAILLDTKGPEIRTGFFANGASKIELVKGETIVLTSDYKFKGDQHKLACSY +PALAQSVTQGQQILVADGSLVLTVLQTDEAAGEVSCRIDNNASMGERKNMNLPGVKVDLPTFTEKDVDDI +VNFGIKHKVDFIAASFVRKQSDVANLRQLLAENGGQQIKICCKIENQEGLENYDEILQATDSIMVARGDL +GMEIPPAKVFLAQKMMIREANIAGKPVITATQMLESMINNPRPTRAECSDVANAVLDGTDCVMLSGETAN +GPYFEEAVKVMARTCCEAENSRNYNSLYSAVRSSVMAKYGSVPPEESLASSAVKTAIDVNARLILVLSES +GMTAGYVSKFRPGRAIVCLTPSDAVARQTGGILKGVHSYVVDNLDNTEELIAETGVEAVKAGIASVGDLM +VVVSGTLYGIGKNNQVRVSVIEAPEGTVKETAAAMKRLVSFVYAADEIPGNAD* +>jgi|Phatr2|45997|estExt_fgenesh1_pg.C_chr_80378|PK4a_PTRI +MLSSTSTIPKLDGEVVTLSVIKKPTETKKRRTKIICTLGPACWSEEGLGQLMDAGMNVARFNFSHGDHEG +HGKVLERLRKVAKEKKRNIAVLLDTKGPEIRTGFFADGIDKINLSKGDTIVLTTDYDFKGDSKRLACSYP +TLAKSVTQGQAILIADGSLVLTVLSIDTANNEVQCRVENNASIGERKNMNLPGVVVDLPTFTERDVNDIV +NFGIKSKVDFIAASFVRKGSDVTNLRKLLADNGGPQIKIICKIENQEGLENYGDILEHTDAIMVARGDLG +MEIPSSKVFLAQKYMIREANVAGKPVVTATQMLESMVTNPRPTRAECSDVANAVYDGTDAVMLSGETANG +PHFEKAVLVMARTCCEAESSRNYNLLFQSVRNSIVIARGGLSTGESMASSAVKSALDIEAKLIVVMSETG +KMGNYVAKFRPGLSVLCMTPNETAARQASGLLLGMHTVVVDSLEKSEELVEELNYELVQSNFLKPGDKMV +VIAGRMAGMKEQLRIVTLDEGKSYGHIVSGTSFFFERTRLLDFND* +>jgi|Phatr2|27502|estExt_Genewise1.C_chr_80291|PK4b_PTRI +MLSSTSTIPKLDGEVVTLSIIKKPTETKKRRTKIICTLGPACWSEEGLGQLMDAGMNVARFNFSHGDHEG +HGKVLERLRKVAKEKKRNIAVLLDTKGPEIRTGFFADGIDKINLSKGDTIVLTTDYDFKGDSKRLACSYP +TLAKSVTQGQAILIADGSLVLTVLSIDTANNEVQCRVENNASIGERKNMNLPGVVVDLPTFTERDVNDIV +NFGIKNKVDFIAASFVRKGSDVTNLRKLLADNGGPQIKIICKIENQEGLENYGDILEHTDAIMVARGDLG +MEIPSSKVFLAQKYMIREANVAGKPVVTATQMLESMVTNPRPTRAECSDVANAVYDGTDAVMLSGETANG +PHFEKAVLVMARTCCEAESSRNYNLLFQSVRNSIVIARGGLSTGESMASSAVKSALDIEAKLIVVMSETG +KMGNYVAKFRPGLSVLCMTPNETAARQASGLLLGMHTVVVDSLEKSEELVEELNYELVQSNFLKPGDKMV +VIAGRMAGMKEQLRIVTLDEGKSYGHIVSGTSFFFERTRLLDFND* 
+>jgi|Phatr2|49002|estExt_fgenesh1_pg.C_chr_200111|PK5_PTRI +MMRSFLRHAHRRACAQQLRTIGTLRLNQMPVTGANTKIVCTIGPASDQAESLGQLVTYGMSVARLNFSHA +GDDYTYSEANMALLRNAVGKHHHLATGSSTDLPKNLRAILVDTKGPEIRTGILPGDVEIMDIPVGATVML +CIEDVSQEVLAEGEFKIHVDYESIAKTVKIGDKVLLDDGLIELEVMEVHPGSGTVLTSALNGGPIKKNKG +VNLPGVQLDLPALTDKDKRDLDWACRVGADFVAASFIRTPANVRSVIAYLDRCISKLPDVNGMKPLRPLV +ISKIESKEGVDNFDEILEESDGIMVARGDLGVEIPYSKVFAAQRMMVHKCNEIGKPVIVATQMLDSMMRN +PRPTRAEVTDVGTAVMDGADAVMLSGETAAGKYPIESIRAMASVAWEADQIVNSKSSIVWNEDLHEKMDL +MEQELDAVAASAVRSAQDMGAKMIVLITMSGRVARAVARHRPTVPVLAYCTDVQVARRLQLHRSIIPIML +QSEADPGDSSTRMGYLRAEAVRTAKELGFAHSGDRIIMVDRTVGKSHDMHEFSHNMKVVTLRDS* +>jgi|Phatr2|56172|estExt_Phatr1_ua_pm.C_chr_230011|PK6_PTRI +MFRRAVLSLSTRAIRTPVPCSVARGGASQVRSLAQTTFYLPDPADRSQDVHNRGNLQLSKIVATIGPTSE +QEEPLRLVTDAGMRIMRLNFSHATKEEVELRITNLALAQKALQPPGTLEMQDVRALLLDTKGPEIRSGKL +AHDESGHATVTLQQGQRIELFNDASRQQQSGSTEQALYIDYPGLHRCLHPSMKVLLDDGAITLTVQSVNV +EAATVSCVVDNAGELRSRAGVNLPLADTSDLPAMSDKDKQDIKYGMTMDIDYVAASFVQTAEGVNEIRGY +IQQCAQELGWDDSHPLPLIISKIETAGALQHFDAILAASDGIMVARGDLGVEIPLTQVTNAQKEMVAACN +AVGKPVIVATQMLESMAKSPRPTRAEVSDVTNAIYDGADCVMLSGETAKGKFPTEAVRTMNEIILAAERY +TTSGALGHSYHRPAFVGPKTADSAVAKAAVTASVERDCAAILVLTQHGSLPPLVSAYRPRVPIFAFCPTP +KLARQLQVYRGIHPIVDSTLTDGNDCKRPEQAVQEAKDMGLLQSGDEVVVVSMDGTTATMKIAIVS* +>gb|CAM77898.1_pyruvate_kinase|Magnetospirillum_gryphiswaldenseMSR1 +MRRTRKAKIIATLGPASSTPQAIESLFRAGADVFRLNFSHGSHADHQARYDTIRALEQKIGRPIGVLADL +QGPKLRVGKFADGKIKLETGATFRLDLSPELGTGVRAPLLHPEVFAAMNVGTELLLDDGKLRLRVEQHGG +DFAETRVIVGGELSNHKGVNVPNVVLPISPLTDKDKADLDFAVDMGADWIALSFVQRPGDVLEARKLIAR +KVGSRVRLLSKLEKPSAIDYLEEIVELSDAVMVARGDLGVECPPESVPILQKRIIKCCRSAGKPVVVATQ +MLDSMVHSPSPTRAEASDVATAIYDGADAVMLSAETASGDYPVDAVTMMDRIINRVEEDDQYTVITDASR +SQPENTTRDAISAAARQVAHTLKAAAVVTFTSSGSTTLRAARERPQQPIISLTSGIEVARQLALVWGAHC +VPTQEVRSFAEMVQTAAKAAQDESFAHPGDRIVITAGVPFGCSGTTNILRVAEIEESGEVL +>gb|AAU81896.1_pyruvate_kinase|Achlya_bisexualis +MAGAINLHKQGVELEAIMGDNEGQTRRSKIFCTIGPACWSVEKLTELIDAGMNVARFNFSHGDHKTHSEV 
+LNRLRTAIASRPHRHVAIMLDTKGPEIRTGFLATEDKKVHIEKDSIIEFTTDYEFLGDETKLACSYEDLP +TSVKVGGPILVADGSLVLEVTEILETGVKARALNSATLGERKNMNLPGAKVTLPTLTERDEDDLINWGLV +QGVDFIAASFVRCGQDIDNIRAVLGPRGRAIKIIAKIENQEGLENFDDILEKTDGIMVARGDLGMEIAPE +KVFLAQKMMIRKANIAGKPVVTATQMLESMIHNPRPTRAECTDVANAVLDGSDAVMLSGETANGDYPVEA +VRMMHKTCLQAEGAIHYDELYQALRNSVLETNGKMSTQEAIASSAVKTAIDMGAKMIVVLTETGTTARLI +AKYRPACPILVLTALGETARQCEGFLKGSYCRVMGSMIGTDSILYRATDLGKQFGWIKKGDAVVAIHGMM +EARSGSTNMLKVLVCD +>gb|AAU81895_1_pyruvate_kinase|Achlya_bisexualis +MLARSLRSRAVRSFARGLSNKPSKNDAFSMTKIVGTVGPVSENAKTTQELTNAGLKIMRINFSHATYDEA +HLRMSHLRASKGVHAKHTGKEFNVRAVLLDTQGPEIRGGAFPEKKINLTKGDMITLTTDVQYKEASTKDM +LYVTYEQLPATVKVGDTVLLDDGLISLTVKSIDVASGQVRCLIENSEVLGSRKGVNLPGLVVDLPALTAK +DKQDVEFGVEHDMDFIAVSFVRKPEDVNDVKDFVNSVMPKYWPAGHPAPLIISKIENYEGVSNFDRILEV +SDGIMVARGDLGVEIPMQEVLTCQKDMVSKCNAAGKPVIVATQMLESMIRNPRPTRAEILDVGNAVLDGA +DAVMLSGEVAQGKWPVESVKTMMSVIKEADAYVKREQYKKEALSQKEAVACAVATTAKSLHAAMIVVMTA +SGEVARLVSKHKPSVPVMCYTTSQKVGRQLQIHRGLYPIVAPTPCKMNLQEAISTAKKLGWLHNGDQVVM +LSSETPTGVVGQQYIMRVATVGEDIAH +>gi|NP_001193725.1|pyruvate_kinase_PKM_isoform_c|Homo_sapiens +MQWSSERGERLLTPGACSSEVPSAVPSRSGGSPGHTVFSSERSLLVRPRSHPEPKGEHYVTGSPTPENQR +TSAAMSKPHSEAGTAFIQTQQLHAAMADTFLEHMCRLDIDSPPITARNTGIICTIGPASRSVETLKEMIK +SGMNVARLNFSHGTHEYHAETIKNVRTATESFASDPILYRPVAVALDTKGPEIRTGLIKGSGTAEVELKK +GATLKITLDNAYMEKCDENILWLDYKNICKVVEVGSKIYVDDGLISLQVKQKGADFLVTEVENGGSLGSK +KGVNLPGAAVDLPAVSEKDIQDLKFGVEQDVDMVFASFIRKASDVHEVRKVLGEKGKNIKIISKIENHEG +VRRFDEILEASDGIMVARGDLGIEIPAEKVFLAQKMMIGRCNRAGKPVICATQMLESMIKKPRPTRAEGS +DVANAVLDGADCIMLSGETAKGDYPLEAVRMQHLIAREAEAAMFHRKLFEELVRASSHSTDLMEAMAMGS +VEASYKCLAAALIVLTESGRSAHQVARYRPRAPIIAVTRNPQTARQAHLYRGIFPVLCKDPVQEAWAEDV +DLRVNFAMNVGKARGFFKKGDVVIVLTGWRPGSGFTNTMRVVPVP +>potatoPKCYT1|S53332 +MANIDIAGIMKDLPNDGRIPKTKIVCTLGPSSRTVPMLEKLLRAGMNVARFNFSHGTHEYHQETLDNLKIAMQNT QILCAVMLDTKGPEIRTGFLTDGKPIQLKEGQEITVSTDYTIKGNEEMISMSYKKLVMDLKPGNTILCADGTITL TVLSCDPPSGTVRCRCENTATLGERKNVNLPGVVVDLPTLTEKDKEDILEWGVPNNIDMIALSFVRKGSDLVNVR 
KVLGPHAKRIQLMSKVENQEGVINFDEILRETDSFMVARGDLGMEIPVEKIFLAQKMMIYKCNLAGKAVVTATQM LESMIKSPAPTRAEATDVANAVLDGTDCVMLSGESAAGAYPELAVKIMSRICIEAESSLDNEAIFKEMIRCTPLP MSPLESLASSAVRTANKARAKLIVVLTRGGSTAKLVAKYRPAVPILSVVVPVLTTDSFDWSISDETPARHSLVYR GLIPLLGEGSAKATDSESTEVILEAALKSAVTRGLCKPGDAVVALHRIGSASVIKICVVK \ No newline at end of file diff --git a/PPDK.txt b/PPDK.txt new file mode 100644 index 0000000..555d84e --- /dev/null +++ b/PPDK.txt @@ -0,0 +1,74 @@ +>jgi|Thaps3|5500|fgenesh1_pg.C_chr_5000542|PPDK_TPS +MVINSQYIYPFGGSAPKPTVDPDKQIVGGKGLGLQVMFKIGVDVPPGFTLTTPLCQVYAKQNDLPADVWK +GVRENVRRIEIDMEKEFGSNENPLLFSCRSGAAMSMPGMMDTVLNIGLNDITVDGLAKATGNDRFAWDSY +RRLLDMFGEVVLGISHEDFEKRFDKVKEAANAKSDVDLGVEDLKKLCDEYKQVYLEEGKVFPMDPYEQLY +ACVKAVFGSWMTPRAVKYREINNIKNLIGTATNIQTMVFGNMGDDSGTGVAFSRNPSTGENLMYGEYLIN +AQGEDVVAGIRTPQPISQMQEVLPDAYAKFLENVDKLEHYFKDMQDVEFTVEKGQLWMLQCRNGKRTGVA +AIRIATELVEEGICTKSEALLKVEPRHVEQLLHPTFSPEALKSVAYTEGIVAKGLPGSPGAAVGKLVFNP +KQAEDERAKGESVILVRETTSPEDVGGMWAAAGILTARGGMTSHAAVVARGWGKPCVCGCSDIVVSEMDE +TVTVKSSNLVFKSGDVISINGGTGEVINAVIEVKTPDLKGDLSVLLGWADEEEGVMQVLANADSGPDASQ +AANNGAMGIGLCRTEHMFFAPERLPIVRRWIFHTECLDDLDHIKHFQRSDFKDLFVAMNGKHVTIRLLDP +PLHEFLPRPEQVHEKTAEELGFGKDVKRMLARIDSMHEENPMLGLRGCRLGIVKPEFTQMQVEAIMSAAA +DFMEEAPDTAKVHPRIMIPLIGSISEYKNQALIIKREAERIKVERGLDIPYEIGTMIEVPRAAVVSDKIA +ALVDDEDSKPLCTFFSYGTNDLTQMTMGISRDDSNGFIPKYLELGILEDDPFQTIDVEGVGYMVKHSATL +GKVVNPNLSLSVCGEHGGDPKSIEFFDSLGLNYVSCSPFRVPVARLAAGQIRVKRRMEEAKLAETAARAV +LIKAAEKTTALNVGGQTNVLVHQ* +>jgi|Phatr2|21988|estExt_gwp_gw1.C_chr_150095|PPDK_PTRI MKFSSAAATTVGLLLSGHAPMIFSFVTPPSRFASGHQASGSERSIISHSTESSSSLTSHNTNDSRSISSK MQFPLFMTATYSEAGKKKTSTISTTGSNSMVSEQEDSKHGIVPFGGSAAHVKTPDKQILGGKGLGLQEMS YVGIDVPPGFTLTTPLCQVFQENGDLPEEMWRQVEAAIQRVEQDMDRKFGDPSYPLLFSCRSGAAISMPG MMDTVLNVGLNKETVEGLAKATGNKRFAYDSFRRLLDMFGDVVLGIPHESFEDKLKGLKAKVGVQDDIDL TAEHLKELCDLYEKVYDEHGKEFPHNPMDQLKACIKAVFGSWNSGRAIKYREVQGITSLLGTACNIQTMV FGNLGPTSGTGVAFSRDPGTGKAVLNGEYLVNAQGEDVVAGIRTPEPISTMEKGFPKAYEQFIRNVHTLE 
QHFKDMQDVEFTVENERLWMLQCRSGKRTGQAALKIACDLVDEGICTPEEALLKVEPDHVKQVLHPTFSA EALESAVYKENVVAVGLAGGPGAAVGKVVFSTETAEEMTKEGVILVRETTSPEDVGGMWASRGIVTCRGG VTSHAAVVARGWGKPCVVGCDDIDVDMKTKTMTIKETGEEFKEGDVISINGSTGEIVRVAIETTVPALEG EFGKLLGWADEVPDVCRVMANADSGADAQKARDLGAQGIGLCRTEHMFFSPSRLPVVRRWILRDEGLEQV QEFQREDFREIMHVMDGKPVTIRLLDPPLHEFLPHSSEINEKLSKQLGYDDSQALASDIEAMHEENPMLG LRGCRLGIVREGLTAMQTEAIIHAAADLIEKNNDAKPYPRIMVPLIGSVAEFKNQALLIKRTAERVKKER GIDVPYEIGTMIEVPRAALVSDQIAGVTDPEDGKRLCEFFSYGTNDLTQMTLGISRDDAGAFLQVYKDLG IMEEDPFKSIDTEGVGFLLHLSAAKGRMVNPELSLSICGEHGGDAASIKFFDKVGLDYVSCSPFRVPVAR LAAGQASVKRRKDEIEPMSRKDRVVKTSPMA +>gb|BAA21653.1_pyruvate_orthophosphate_dikinase|Eleocharis_vivipara +MERVCLHAIHGACKPDMDGNIRFGRKKRHLYKRLSSCRVRAMKLDQSGFEASRKQSSYALKAIATPMAVT +TKKRVFTFGKNKTEGNKGMKELLGGKGANLAEMSSIGLSVPPGFTVSTEACQQYQESGHKMPPGLWDEII +DGLKWVQQDMGARLGDPEKPLLVSVRSGAAVSMPGMMDTVLNLGLNDEVVSGLAKKSGERFAYDSYRRFI +DMFGDVVMGISHEHFGDKLEEMKATKGVKNDTDLSANDLKELVVQYKEVYAKAKGEPFPTDPMKQLSLAV +LAVFDSWDSPRAKKYRSINKITGLKGTAVNVQCMVFGNMGNTSGTGVLFTRNPSTGEKKLYGEFLVNAQG +EDVVAGIRTPQELETMKDYFPQAYQELVDNCKILESHYKDMMDIEFTVQENRLWMLQCRTGKRTGKAAVK +IAVDLVSEGLVDTRTAIKMVDPGHLDQLLHPQFENPKAYKDKVIASGLPASPGAAVGQVVFTAEDAEMWH +AQGKAVILVRTETSPEDVGGMHAAAGILTARGGMTSHAAVVARGWGKCCVSGCSDIRVNDAEKVLLVGDK +KLQEGEWISLNGSTGEVIMGKQPLSPPALSGDLGTFMAWVDEVRQIGVMANADTPEDALAARNNGAQGIG +LCRTEHMFFASDERIKAVRQMIMSGTVEQRQKALDRLLPYQRSDFEGIFRAMDGLPVTIRLLDPPLHEFL +PEGNIEDIVREMASETGSAEEEVFSRVEKLSEVNPMLGFRGCRLGISYPELTEMQARAIFEAAITMSNQG +VKVLPEIMVPLVGTPQELGHQVSLIRQVADKVFSATGTSVSYKVGTMIEIPRAALVADEIAEHAEFFSFG +TNDLTQMTFGYSRDDVGKFLPIYIAHGILQNDPFEVLDQKGVGELVKLATERGRKTRPDLKVGICGEHGG +EPSSVAFFAKSGLNYVSCSPFRVPIARLAGAQVVVQK +>gb|AOL23586.1|pyruvate,orthophosphate_dikinase|Erythrobacter litoralis +MTTATATQTGTDTELRTVYRFGGDAPHDDPRQRDKVVTGGKGANLAEMASIGLPVPPGFTITTEECVRYL +QAGEAFRDELRAEVAEALAHVERAVGKKFGDAADPLLVSVRSGARVSMPGMMDTVLNLGLNDETVEGLAK +VSEDERFAWDSYRRFIQMYSDVVLGLDHGLFEEALEIAKEDQGYYNDTEMSADDWRSLVREYKRIVREEQ 
+DEPFPQDVNDQLWGAIAAVFGSWDSERAKVYRRLNDIPADWGTAVNVQAMVFGNMGDTSATGVAFTRDPS +TGDRSYYGEYLINAQGEDVVAGIRTPQYLTRAAREAAGAKPLSMEEALPQAYEELARVFDLLESHYRDMQ +DIEFTVERGKLWMLQTRTGKRTAKAALKMATDMVDEGLIDRAEAVRRIDPMALDQLLHPTLDPDAERDVM +TTGLPASPGAAAGKIVLDADTAEQWANRGDKVILVRVETSPEDIHGMHAAQGILTARGGMTSHAAVVARG +MGRPCVSGASGISIDRTERTLRIGSQELKEGDTITLDGANGQVMLGEVPTVEPELAGDFATLMEWADELR +RMRVRTNAETPEDCRMARQFGAEGIGLCRTEHMFFDAGRIKAVRQMILAEDEAGRRKALDQLLPEQRADF +TAIFEVMAGLPCTIRLLDPPLHEFLPTRDEDFADLSDATGLGVDHLRRRANELHEFNPMLGHRGCRLGIT +YPEIYEMQARAIFEAACAVEAESGDAPLPEIMIPLVATKKELSLLRALVDRVAEEVFGEKGTRIAYLVGT +MIELPRAALLAGEIAEEGEFFSFGTNDLTQTTLGLSRDDAGRFLGTYVDKGIFPRDPFVSLDVDGVGQLV +ELAATRGRATRPEIKLGICGEHGGDPASIGFCENVGLDYVSASPYRVPIARLAAAQAALAVSK +>gb|CAK06583.1|pyruvate,phosphate orthophosphate dikinase (ec 2.7.9.1)|[Rhizobium leguminosarum bv. viciae 3841] +MTKWVYRFGDGQAEGRARDHEVLGGKGANLAEMCALGLPVPPGLTIVSDACNTYYKNGRHIEDQVKAEVR +AGIAAIEAITGRRFGSVSQPLLLSVRSGARVSMPGMMDTVLNLGLNDETVQALGHDAGDARFAWDSYRRF +IQMYADVVMGLGNDAFEEILEDEKAKLGHEFDTELSASEWQHIVSLYKKLIEEELEQEFPQDPEVQLWGA +VGAVFASWMSARAVTYRHLHNIPEGWGTAINIQAMVFGNLGNASATGVAFTRNPSTGERALYGEFLVNAQ +GEDVVAGIRTPQSITEEGRISSGSEKPSMEKLMPEAFRELCRICTELEIHYRDMQDIEFTIERGKLWMLQ +TRSAKRSTRAAMKIAVDMVDEGVITEDEAVLRIEPSSLDQLLHPTIDPRVTRQVIGSGLPASPGAATGAI +VFTAEEAVEAESEGRKVILLRVETSPEDIHGMHAAEGILTTRGGMTSHAAVVARGMGIPCVVGAGTMRID +VRNERLLGVGVTLKKGDIITIDGSAGQVLKGEVPMIQPALSGDFGRIMGWADRARRMTVRTNADTPADAL +AARSFGAEGIGLCRTEHMFFEGERIHVMREMILAVDEKGRRVALDKLLPMQRLDFTGLFTVMHGLPVTIR +LLDPPLHEFLPKTDDEVAEVAFAMGMEASVLRQRVDALHEFNPMLGHRGCRLAISYPEIVEMQARAIFEA +AVAAAKETGAAVVPEIMVPLVGLRTELDYVKARIDEVAGAVMNEAGMKIDYLVGTMIELPRAALRAHVIA +EAAEFFSFGTNDLTQTTFGISRDDASAFIPTYQRKGIIEHDPFISLDFDGVGELISIAAERGRRTRNDMK +LGICGEHGGDPASIRFCETIGLDYVSCSPFRVPIARLAAAQAVIAGSLEDVRRGPKDLRASV +>gi|NP_001105738.2|pyruvate, phosphate dikinase 1, chloroplastic precursor [Zea mays] +MAASVSRAICVQKPGSKCTRDREATSFARRSVAAPRPPHAKAAGVIRSDSGAGRGQHCSPLRAVVDAAPI +QTTKKRVFHFGKGKSEGNKTMKELLGGKGANLAEMASIGLSVPPGFTVSTEACQQYQDAGCALPAGLWAE 
+IVDGLQWVEEYMGATLGDPQRPLLLSVRSGAAVSMPGMMDTVLNLGLNDEVAAGLAAKSGERFAYDSFRR +FLDMFGNVVMDIPRSLFEEKLEHMKESKGLKNDTDLTASDLKELVGQYKEVYLSAKGEPFPSDPKKQLEL +AVLAVFNSWESPRAKKYRSINQITGLRGTAVNVQCMVFGNMGNTSGTGVLFTRNPNTGEKKLYGEFLVNA +QGEDVVAGIRTPEDLDAMKNLMPQAYDELVENCNILESHYKEMQDIEFTVQENRLWMLQCRTGKRTGKSA +VKIAVDMVNEGLVEPRSAIKMVEPGHLDQLLHPQFENPSAYKDQVIATGLPASPGAAVGQVVFTAEDAEA +WHSQGKAAILVRAETSPEDVGGMHAAVGILTERGGMTSHAAVVARGWGKCCVSGCSGIRVNDAEKLVTIG +GHVLREGEWLSLNGSTGEVILGKQPLSPPALSGDLGTFMAWVDDVRKLKVLANADTPDDALTARNNGAQG +IGLCRTEHMFFASDERIKAVRQMIMAPTLELRQQALDRLLPYQRSDFEGIFRAMDGLPVTIRLLDPPLHE +FLPEGNIEDIVSELCAETGANQEDALARIEKLSEVNPMLGFRGCRLGISYPELTEMQARAIFEAAIAMTN +QGVQVFPEIMVPLVGTPQELGHQVTLIRQVAEKVFANVGKTIGYKVGTMIEIPRAALVADEIAEQAEFFS +FGTNDLTQMTFGYSRDDVGKFIPVYLAQGILQHDPFEVLDQRGVGELVKFATERGRKARPNLKVGICGEH +GGEPSSVAFFAKAGLDYVSCSPFRVPIARLAAAQVLV \ No newline at end of file diff --git a/PYC.txt b/PYC.txt new file mode 100644 index 0000000..754578c --- /dev/null +++ b/PYC.txt @@ -0,0 +1,152 @@ +>jgi|Phatr2|30519|estExt_Genewise1.C_chr_230092|PYC1_PTRI +MYRTRVLRRATATTRTRGAGSGSLSPSRFRLSRWSTTFAAPPLACGLLSCAAGLSQQCHQPIGRGLHTFH +TPDEQLPKSGGPVGVDTPPFTKLLAANRGEIATRINRAAAELGISTAGIYSYEDRFTQHRYKCDQAFELD +TSKSPVAQYLDIDKIVDICVKNKVQAVHPGYGFLSENETFAKKLDNAGIIFVGPTVQNLQAFGDKTAARN +MAIACNVPVVAGSHDAFATAKEASAWINDPANQCDYPVIVKALMGGGGRGIRIVPTEKDLNAMFQQASNE +AASAFGDGRCFVEKYVEKPRHVEVQCLGDGTGNVIHLWDRDCSVQRRHQKVVELAPAEGLSEDGRNQILN +DAVRLLQNANYRNAGTVEFLVDKNGKHYFMEVNPRVQVEHTVTEEITGVDIVQSQILIASGKTLPELNLT +QESIPSPMGVAMQCRVTTEDPAQDFRPDTGTINVFRMPAGMGIRLDDGPGFPGARITPHYDSLLVKITAK +ARNRKEAAAKLIRALKEFRVRGVKTNKSFLLNVLKHPDFLEGVVDTGFIAANPHLLAPLREQDRAQKLLY +YIANVVVNGTPKELGATGAPPSTVDPIIPVVEPNSGQQKKPSLKKIFDADGPDAFAKAVRNNKGLLITDT +TWRDAHQSLLATRLRTKDMLNIAPATTVALANAYSLECWGGATFDVSLRFLRECPWERLSALREAVPDIP +FQMLLRGANAVGYTSYPDNVVYEFCQMAKDTGMDVFRIFDSVNYIENMKLGIDAVGAAGGIVEAAVCYTG +DVSNPNRGMYNLEYYLGFVRQLHGLGIHVLAIKDMAGLLKPEAGTMLVNAIRQEFPDLPIHVHTHDTAGT +GVASMLACAKAGADAVDAAADAMSGTTSQPSLGALVASTQGTQWDTGLDLNQVQAVNDYWEEARGLYAPF 
+ESGQKTGSSDVYEHEMPGGQYTNLLFQSSQLGLTGQWSKVKKAYAAANRLLGDIIKVTPSSKVTGDLAQF +LVANDLTEKEVIEKAETLSFPKSVVEYFQGYLGIPPFGFPEPLRTKVLKGQTIEGYEGLTCFEGRPGADL +KPMDMEAVRSKLEEKWGGQADHGVRNVDILSHAMYPAVFDEYKEFKNVFGKLDFLDTRTFLTGMRVNQEL +RVEIEPGKQLVIKLDSVSEPDKDGLVTLQFELNGTLRTVQIQDKSVDSEKAVRPKAMAAVAGSVGAPMPG +VVVETKVKKGDVVEQGDPLLSLSAMKMETTVSAPVSGTVVFFEVTAGDQVEAGDLLVEIEDE* +>jgi|Phatr2|49339|estExt_fgenesh1_pg.C_chr_210250|PYC2_PTRI +MRRVAVFVLVLSMPSMAAAFAPRRSWTTATTPAGIETAATMARRSLLSSLLRVSTGTDSEKDTNASTDSI +QTDSVVTEASARNSKLVRNVPPFQRILAANRAEIAVRIMRAATELNAGTVAMYTHEDRYSQHRWGADQSF +LLDKKNPTSSPISAYLDIPQIIRLALDAGVDAIHPGYGFLSESPEFAQACADASITFVGPTVENLQRFSD +KTSARQAAIEADVPVVPGSDGALETEADVTAFVEANGLPVILKAAMGGGGKGMRVVRRMEDLIPFFQAAS +SEALASFGDGAVFVERFVERPRHIEVQIIGDGTGNVVHLWERDCSIQRRHQKVIEMAPAWTLPDELRAQL +HEYAVRLTSQAKYKNAGTVEFLIDAELRPYFIEVNPRIQVEHTVTEEVTGIDLVQAQIKIAAGATLEEVG +LVQANIQPRGVAIQCRVTTENPERDFAPDTGTVTLYRHSAGKGVRMDGIGYSGMTITPYFDSMIVKYTAL +GANFPETVARMKRVLQECRIRGVKTNVGFLLNVLSHPEFETGIVTTSFIDENPQLKQTSMSMYDFASEEQ +ADPRKTFATERLVRYLANLAVNGQPPELGADSQKLTRTTAIADIPAPEIRSEGNAAVPSDESPNQPGWRH +LLLEQGPKAYAKAVREHQGLLITDTTWRDAHQSLLATRMRTQELIKSADYTNMALANAFSLEMWGGATFD +VAMRFLRECPWERLEALREKVPNVPFQMLLRGANAVGYTNYADNVVHKFCKQAHDSGVDVFRVFDSLNYI +ENLQLGVDAAGEAGGFVEGAMSYTGDVADPTKGKYSLEYYMNLASELVDMGVHSLAIKDMAGLLTPKAST +LLVSALREAHPDIPIHVHTHDTAGSGVASMLAAAQAGADIVDSSMDAFSGMTSQPSLGALVANLAGTERD +TGIQLSNLPPLNSYWEDVRSLYAPFESGQLSGSSDVYFHEIPGGQYTNLLFQSKQLGLSDRWTEIKTKYA +EANIILGDIPKVTPSSKVVGDLAQFLVSQNLEANEVLEKADTLAFPDSVINYLKGDIGVPPGGFPEPLRN +KVLQSRNLEPIEGRPGKFLPDYNFDKERELLEKRFGKANIDEKDCLSYALYPDVFTEWKDFQALYGDVGK +LPTRLFLNPMQVGDEVEIEIAKGQTLIVELVSIQDVKEDGTRTVIFEVNGEPWYMPVTDQNLLGDSAVRE +KAVAPGQVGASMPGVVVGLKVKAGDTVQEGETVATLSAMKMETSIPATASGVIKRVLVNVGDKVNGDDLI +LEIE* +>jgi|Thaps3|11075|fgenesh1_pg.C_chr_19a_19000018|PYC_TPS +MTIMQNPQACINPPIRMDASIINNKAPSVEVKIPPAAVIAPSTNKTLAELCSFKKVMAANRGEIAVRICR +GATEFNLKTATIYAYEDRNSAHRWDSDESFLLPASGTPVGAYLNITNIINIAKENGVDAIHPGYGFLSES +AEFAQACEDNGITFVGPSVENLVTFGDKTKARELAIKADVSVVPGTSEPLTTTEAAVAFVEEYGLPVIIK 
+AAKGGGGKGMRVVNKKEDLIPLFEAASSEALASFGDGGCFVERYVTNAKHVEVQVIGDGKGNVVHLWERD +CSVQRRHQKIVEIAPAVHHSMEVRKAVLEDALKITKACNYKNAGTVEFLVDDQGRHYFMEVNPRVQVEHT +VTEQVTGLDIVQSTFLIAGGASLEDIGLVQENIIPRGVAMQCRITAEDPERDFAPDTGMLDVCRHSVGPG +IRVDGYAYPGMVVQPYFDSLLVKYTASHKDWDGAIRRMRRALHDNHIRGVKTNIPFLLNVMDHPDFIAGS +FDLNFIQDNPELLLNLPGTLSAQKGTLGQRYDHIEGYLKYIANLAVNGHPKSLGANDALVRIIDNCDIPA +PDKNEIEAILSKKKKSSPHWRKILREQGPKALAKAVRDHQNVLVTDTTWRDAHQSLLATRMRTADLLKAA +EATNTAFNGTSDVFSLEMWGGATFDVSMNFLRECPWKRLEELREAAPDMLFQMLLRGANAVGYTVYPDNV +VYEFCKQAYKSGNDIFRVFDSLNYVDNMELGIKAAAASGGFVEAAICYTGDVTSSDPSNKYNLKYYLDFA +TQLVDLGAHALAIKDMAGLLTPRAATLLVSELRSAFPDVPIHLHTHDTAGMGVAAMFAGAEAGADIVDGA +IDAMSGLSSQPCLGALVSALGDKSNVDLDALQVLNEYWESVRHQYNPFEVQALNAAIGSNVYKHEIPGGQ +YTNLLFQSKQLGLSGRFAEVKKAYALANKLLGDIPKVTPSSKTVGDLAQFIVGLKISGDELVENAATLPL +PNSVVEYMQGALGPPPGGYPEPFRTNVLKGRPLKDGRSMFTARPGAELPDYDFVEAEKNLKEAYGNSRIG +FKEVLSHAMYPQVFKDYLAFEKVYGDVEKLPTHMFLRPMTVGEESHLHLGPGKDYYIRLAAIDQFDEDLG +TRTVTLEVNGEKWFIRTPDTVTTLESATAGGPAPKRREKKDPTEKGSIGTPMPGQIVAVNVEEGDEVKEG +QTLFKLSAMKMETEIKAPISGTITRVLVSQSDSVEGDDLLAVVMAE* +>gb|CAB02872.1_Pyruvate_carboxylase_1|Caenorhabditi_selegans +MRFSRIPPIFANVVRQTHYRNYANGVIKPREFNKVMVANRGEIAIRVFRALTELNKTSVAIYAEQDKNSM +HRLKADEAYLVGKGLPPVAAYLTIDQIIETALKHNIDAIHPGYGFLSERSDFAAACQNAGIVFIGPSPDV +MARMGDKVAARQAAIEAGVQVVPGTPGPITTADEAVEFAKQYGTPIILKAAYGGGGRGIRRVDKLEEVEE +AFRRSYSEAQAAFGDGSLFVEKFVERPRHIEVQLLGDHHGNIVHLYERDCSVQRRHQKVVEIAPAPALPE +GVREKILADALRLARHVGYQNAGTVEFLVDQKGNYYFIEVNARLQVEHTVTEEITGVDLVQAQIRIAEGK +SLDDLKLSQETIQTTGSAIQCRVTTEDPAKGFQPDSGRIEVFRSGEGMGIRLDSASAFAGSVISPHYDSL +MVKVIASARNHPNAAAKMIRALKKFRIRGVKTNIPFLLNVLRQPSFLDASVDTYFIDEHPELFQFKPSQN +RAQKLLNYLGEVKVNGPTTPLATDLKPAVVSPPIPYIPAGAKPPTGLRDVLVQRGPTEFAKEVRSRPGCM +ITDTTFRDAHQSLLATRVRTYDMAAISPFVAQSFNGLFSLENWGGATFDVSMRFLHECPWERLQTLRKLI +PNIPFQCLLRGANAMGYSNYPDNVIYKFCELAVKNGMDVFRVFDSLNYLPNLLVGMEAVGKAGGVVEAAI +AYTGDVTDKSRDKYDLKYYLNLADQLVKAQAHILSIKDMAGVLKPEAAKLLIGALRDKFPDIPIHVHTHD +TSGAGVAAMLECAKAGADVVDAAVDSMSGMTSQPSMGAIVASLQGTKHDTGLSLDDISKYSAYWESTRQL 
+YAPFECATTMKSGNADVYKHEIPGGQYTNLQFQAFSLGLGPQFDEVKRMYREANLVLGDIIKVTPSSKIV +GDLAQFMVQNNLTRETLVDRADDLSFPKSVVDFMQGNVGQPPYGFPEPLRTKVLRGKPKVDGRPGENAKP +VDLDAVKVELEEKHGRTLSEEDVMSYSMFPTVFDEFETFRQQYGPVDKLPTRLFLTGLEIAEEVDVEIES +GKTLAIQLLAEGKLNKRGEREVFFDLNGQMRSIFVVDKEASKEIVTRPRALPGVRGHIGAPMPGDVLELK +IKEGDKVTKKQPLFVLSAMKMEMVIDSPIAGTVKAIHAPQGTKCSAGDLVVEVEP +>gb|EAL26409.1_uncharacterized_protein_Dpse_GA13539_isoform_A|Droso_pse +MFIPVAQSAFKALRSAQPRVRLYFVSKNAYSSQVEYKPIRSVLVANRGEIAIRVFRACTELGIKSVAVYS +EQDKMHMHRQKADESYLVGKGLPPVEAYLNIPEIIRVCKENDVDAVHPGYGFLSERSDFAQAVIDAGLRF +IGPSPKVVQNMGDKVAARVAAIEAGVPIVPGTDGPVTTKEEALEFCKMHGLPVIFKAAYGGGGRGMRVVR +KMEEVEESFQRASSEAKAAFGNGAMFIEKFIERPRHIEVQLLGDKAGNVVHLYERDCSVQRRHQKVVEIA +PAPRLPIELRDKMTEAAVRLARHVGYENAGTVEFLCDESGNFYFIEVNARLQVEHTVTEEITGIDLVQSQ +IRIAEGMTLPELGYTQENIQPRGYAIQCRVTTEDPANDFQPNTGRLEVFRSGEGMGIRLDSASAYAGAII +SPYYDSLLVKVIAHAGDLQSSAAKMNRALREFRIRGVKTNIPFLLNVLENQKFLNGVLDTYFIDEHPQLF +KFRPTQNRAQKLLNYLGEVLVNGPQTPLATTLKPAEVSPHVPAIPLDLSPEALEREERGEAKVTEPPCGL +RDILVRQGPEAFAKEVRSRKNLMLMDTTFRDAHQSLLATRVRSHDLLKISPYVAHKFNNLYALENWGGAT +FDVALRFLHECPWERLEEMRKRIPNIPFQMLLRGANAVGYTSYPDNVVYKFCELAVQTGMDIFRVFDSLN +YLPNLILGMEAAGKAGGVVEAAISYTGDVSDPKRTKYDLKYYTNLADELVKAGTHVLCIKDMAGLLKPEA +ATLLITAIRDKHPDIPIHIHTHDTSGAGVASMLACAQAGADVVDVAVDSMSGMTSQPSMGAVVASLQGTP +LDTGVDLRVVSEYSAYWEQTRTLYAPFECTTTMRSGNADVYLNEIPGGQYTNLQFQAFSLGLGDFFEDVK +KAYREANLLLGDIIKVTPSSKVVGDLAQFMVQNNLTADQVLEKAEELSFPKSVVEFLQGHIGIPHGGFPE +PLRSRVLKDMPRIEGRPGAALEPLDFDKLKQDLKESHPNITDRDVMSSALYPQVTNEYLFFREKFGPVDK +LDTRIFLTGPKVGEEFEVTLERGKTLSLKAMAMAADLKPNGDREVFFEMNGQLRTVHILDKEAVKEIHVH +PKANKAVKSEVGAPMPGTVIDIRVAVGDKVEKGQPLVVLSAMKMEMVVQAPQAGVVKKLEIANGMKLEGD +DLLMIIE +>gi|BAH22705.1|pyruvate_carboxylase|Ehux +MTKILLMLALALGAAGLRWPAAVPQRRATSGRAAGARLERAVGPVAVAPVEAPSVSRSSESAVDAMRGAA +EAPSPFKKLMAANRAEIAVRIMRAATELNVATVAIYGYEDRFSQHRWGADQSFQLEKKDPADAAVRAYLD +IEQIVALAKREGVDAIHPGYGFLSESPEFAQACSDAGITFVGPTVANLKTFSDKTTARVAAIAADVPVVP +GTDEPVTTEAGARAFVEEYGLPVIIKAAMGGGGKGMRLVRDMEELGASFASASTEAEAAFGDGSVFLERY 
+IESPRHIEVQIIGDGKGGAVHLCERDCSVQRRYQKVVEIAPAWSLDPALRNKLHEDSLRLMRSAKYLNAG +TVEFLVDGEGRHYFIEVNPRIQVEHTVTEEVTGIDLVQAQMRIASGASFEEVGLVQDQIQARGIAVQCRV +TTENPERNFAPDTGTLSVYRHSAGYGMRQDGIGYSGMTVTPYYDSLLVKYTARGSNWGEVIRRMTRALQE +ARIRGVKTNIPFLLNVLTHPEFKAGVVTTGFIDEHPELLQVTGKNWDFANVHQADQEKVMQVEKLLRYLA +NLAVNGHPKELGANPARLRTAPQPQVKPPRVLIPGKDDAPTAGRRPGGWRSLLLAEGPAAYAKAVREHKG +LLVMDTTWRDAHQSLLATRMRTADLVKAGAATNAALSNAFSLEMWGGATFDVAMRFLHECPWQRLERLRE +EVPDVPFQMLLRGANAVGYTNYPDNLVYRFCKQAAASGIDVFRVFDSLNYLENLKLGIEAAGEAGGFVEA +AICYTGDITDPSKGKYTLDYYLEYARQLAQLGVHSIAIKDMAGLLKPRAAALLVGAIRKELPDMLIHVHS +HDTAGNSLASMLSAAEAGADVVDVAIDSMSGITSQPSLGALAAATAGSELDIGVRPQDLEPLNSYWEQVR +SLYAPFESGQLSGSSDVYRNEIPGGQYTNLLFQASQLGLGDQWVEVKRKYAQANLLLGDIPKVTPSSKVV +GDLAQLMVAQKLEPDQLIEQAESLAFPDSVVSYFQGGIGLPPGGFPEPLRSKVLKGRSLEDGRAAYDGRP +GATMKPYDFDKELGLLQASYPSNKGERDALSYALYPQVFRDWQEHRAVYGEVEALPTEAFLHPMAVGDEV +EFATEPGRSWIVKLVSVPKPDENGQTQVIMELNGERWFVPVTDNSVQSATAREKAGGSPGSVGSPMPGVV +VDVKVKPGDTIREGEPLVVLSAMKMETAIPAPASGVVERLLVSAGDKVEGDDLLAQIGEGAPKEEGGSSA +KGGLFSSLFKGSGE +>gi|CAA96765.1|PYC1|Saccharomyces_cerevisiae +MSQRKFAGLRDNFNLLGEKNKILVANRGEIPIRIFRTAHELSMQTVAIYSHEDRLSTHKQKADEAYVIGE +VGQYTPVGAYLAIDEIISIAQKHQVDFIHPGYGFLSENSEFADKVVKAGITWIGPPAEVIDSVGDKVSAR +NLAAKANVPTVPGTPGPIETVEEALDFVNEYGYPVIIKAAFGGGGRGMRVVREGDDVADAFQRATSEART +AFGNGTCFVERFLDKPKHIEVQLLADNHGNVVHLFERDCSVQRRHQKVVEVAPAKTLPREVRDAILTDAV +KLAKECGYRNAGTAEFLVDNQNRHYFIEINPRIQVEHTITEEITGIDIVAAQIQIAAGASLPQLGLFQDK +ITTRGFAIQCRITTEDPAKNFQPDTGRIEVYRSAGGNGVRLDGGNAYAGTIISPHYDSMLVKCSCSGSTY +EIVRRKMIRALIEFRIRGVKTNIPFLLTLLTNPVFIEGTYWTTFIDDTPQLFQMVSSQNRAQKLLHYLAD +VAVNGSSIKGQIGLPKLKSNPSVPHLHDAQGNVINVTKSAPPSGWRQVLLEKGPAEFARQVRQFNGTLLM +DTTWRDAHQSLLATRVRTHDLATIAPTTAHALAGRFALECWGGATFDVAMRFLHEDPWERLRKLRSLVPN +IPFQMLLRGANGVAYSSLPDNAIDHFVKQAKDNGVDIFRVFDALNDLEQLKVGVDAVKKAGGVVEATVCF +SGDMLQPGKKYNLDYYLEIAEKIVQMGTHILGIKDMAGTMKPAAAKLLIGSLRAKYPDLPIHVHTHDSAG +TAVASMTACALAGADVVDVAINSMSGLTSQPSINALLASLEGNIDTGINVEHVRELDAYWAEMRLLYSCF 
+EADLKGPDPEVYQHEIPGGQLTNLLFQAQQLGLGEQWAETKRAYREANYLLGDIVKVTPTSKVVGDLAQF +MVSNKLTSDDVRRLANSLDFPDSVMDFFEGLIGQPYGGFPEPFRSDVLRNKRRKLTCRPGLELEPFDLEK +IREDLQNRFGDVDECDVASYNMYPRVYEDFQKMRETYGDLSVLPTRSFLSPLETDEEIEVVIEQGKTLII +KLQAVGDLNKKTGEREVYFDLNGEMRKIRVADRSQKVETVTKSKADMHDPLHIGAPMAGVIVEVKVHKGS +LIKKGQPVAVLSAMKMEMIISSPSDGQVKEVFVSDGENVDSSDLLVLLEDQVPVETKA +>gi|AAA82937.1|pyruvate_carboxylase_precursor|Homo_sapiens +MLKFRTVHGGLRLLGIRRTSTAPAASPNVRRLEYKPIKKVMVANRGEIAIRVFRACTELGIRTVAIYSEQ +DTGQMHRQKADEAYLIGRGLAPVQAYLHIPDIIKVAKENNVDAVHPGYGFLSERADFAQACQDAGVRFIG +PSPEVVRKMGDKVEARAIAIAAGVPVVPGTDAPITSLHEAHEFSNTYGFPIIFKAAYGGGGRGMRVVHSY +EELEENYTRAYSEALAAFGNGALFVEKFIEKPRHIEVQILGDQYGNILHLYERDCSIQRRHQKVVEIAPA +AHLDPQLRTRLTSDSVKLAKQVGYENAGTVEFLVDRHGKHYFIEVNSRLQVEHTVTEEITDVDLVHAQIH +VSEGRSLPDLGLRQENIRINGCAIQCRVTTEDPARSFQPDTGRIEVFRSGEGMGIRLDNASAFQGAVISP +HYDSLLVKVIAHGKDHPTAATKMSRALAEFRVRGVKTNIAFLQNVLNNQQFLAGTVDTQFIDENPELFQL +RPAQNRAQKLLHYLGHVMVNGPTTPIPVKASPSPTDPVVPAVPIGPPPAGFRDILLREGPEGFARAVRNH +PGLLLMDTTFRDAHQSLLATRVRTHDLKKIAPYVAHNFSKLFSMENWGGATFDVAMRFLYECPWRRLQEL +RELIPNIPFQMLLRGANAVGYTNYPDNVVFKFCEVAKENGMDVFRVFDSLNYLPNMLLGMEAAGSAGGVV +EAAISYTGDVADPSRTKYSLQYYMGLAEELVRAGTHILCIKDMAGLLKPTACTMLVSSLRDRFPDLPLHI +HTHDTSGAGVAAMLACAQAGADVVDVAADSMSGMTSQPSMGALVACTRGTPLDTEVPMERVFDYSEYWEG +ARGLYAAFDCTATMKSGNSDVYENEIPGGQYTNLHFQAHSMGLGSKFKEVKKAYVEANQMLGDLIKVTPS +SKIVGDLAQFMVQNGLSRAEAEAQAEELSFPRSVVEFLQGYIGVPHGGFPEPFRSKVLKDLPRVEGRPGA +SLPPLDLQALEKELVDRHGEEVTPEDVLSAAMYPDVFAHFKDFTATFGPLDSLNTRLFLQGPKIAEEFEV +ELERGKTLHIKALAVSDLNRAGQRQVFFELNGQLRSILVKDTQAMKEMHFHPKALKDVKGQIGAPMPGKV +IDIKVVAGAKVAKGQPLCVLSAMKMETVVTSPMEGTVRKVHVTKDMTLEGDDLILEIE + diff --git a/Proteorhodopsins.txt b/Proteorhodopsins.txt new file mode 100644 index 0000000..e158926 --- /dev/null +++ b/Proteorhodopsins.txt @@ -0,0 +1,43 @@ +>jgi|Fracy1|267528|estExt_fgenesh2_kg.C_10531|Fragilariopsis_cylindrus +MISGTQFTIVYDVLSFSFATMMATTIFLWMRVPSVHEKYKSALIISGLVTFIASYHYLRMFNSWTEAYEW +TGEGELAKTGSPFNDAYRYMDWLLTVPLLLIEIILVMKLPADESKSKATTLGIASAAMIAIGYPGELFMS 
+EDNLGGRWVYWIGAMLPFLYIVQTLLVGLNDATQSEADPAVRKLIKGVQWWTVIAWCTYPVVYIFPMMGI +SGSNAIVGIQLGYSVSDIISKCGVGLLIYQITIAKSLALKNGNEETP* +>gi|OLV16852.1|Proteorhodopsin|Deinococcus_marmoris +MRQRFTPLTWIIATLAVLLGTALAQSQNAPVEAAKLSLSSGQFGLVYQMFSITIAAMGAGFIFFVLAQQN +LSPKYRPAMVVSALVVAIACYHYFRIFNSWNESYALTAGAYVATAVPFNDAYRYADWILTVPLLLVEAVA +VLALATNVASGMIWRLALAAFVMIATGYPGEISGDTTTRLIWGTISTIPFIYIVYTLFVELGKSIDRQPP +RVQVLTRNLRLLLFASWGFYPIAYLLPIFLGGGGLSASGVVGLQVGYSIADILAKVGFGTLIYFIALEKT +AHDRSMGVTEDSTTPPATELPTRPV +>gi|APE26978.1|Proteorhodopsin|Erythrobacter_gangjinensis +MPTIENFVEYAVWQYDMVRHAFAFTVAVFAAGLVYFAMTAYQTHPAFRATSIISAVVMVSAALEIGQLWM +LWNESFAFNPATQTFQVVDGERFSNGYRYMNWMIDVPLLMTQLVVVAGFTGAALFKKWGLLTFTGIAMII +TGYVGQYFEPAAAGIAGYENGEQLWIWGAISTVFMIWMILVLANAVRDPQGEASNEVRKGLINCFWFLVI +TWAIYPIAYMWPVIDGSATGVVVRQTLYTVADVTSKLVFGVMLSQVALRRSAELGYRAAGVAMMVHTPSR +NQLTADEREENVLDEDRSRTGSV +>gi|KOO22837.1|rhodopsin|Chrysochromulina_sp.CCMP291 +MMFPVTAGQFDLVYNALSFTLASMMASTIFFWIRMGSVSEKYKSAMTITGLVTFIAAYHYIRIFNSWNES +YHYPEAADGVVQDPVITGQPFNDAYRYMDWMLTVPLLMIEIIFVMGLSPEETAAKATSLGVAAGLMIVLG +YPGELIIEGDLNVRWMWWTLAMIPFLYVVHTLLIGLQGAIKEEKNEEVAKKLNMVCWATVVSWCTYPIVY +VFPMLGLDGPSAVVAIQLGYCVSDIISKCGVGFLIYNITIAKSNPEGYAQVH +>gi|AKG94905.1|rhodopsin|Prorocentrum_donghaiense +MVMYPMSDMQYQAVYNTLSFALASMMATTMYLWSRSTAVRDQFKSAVLISGLVTFIAAYHYIRIFNSWVE +AYEYSAGKPDPELTGVPFNDAYRYMDWLLTVPLLLIEILLVMKLDEATYNVKSKTLGVGSALMIVSGYYG +EPTVTGDLTPRWICWFVSMCFFLYIVFELLVGLKAAIESETDPTIKGKIQLAQVMTVISWCTYPVVYLFP +MLGITASNAVVAIQIGYCVSDIISKCGVGLVIYQVTYAKSNKDGALLA +>gi|AIN36550.1|rhodopsin|Alexandrium_fundyense +MAPIPDGFSYGQWSVVYNALSFGIAAMGSATIFFWLQLPNVSKSYRTALTITGIVTFIATYHYFRIFNSW +VEAFNVTNSGGGDYTVKLTGAPFNDAYRYVDWLLTVPLLLVELILVMKLPAEQTTSMSWKLGFASALMVA +LGYPGEIQDDLTVRWVWWGLAMIPFCYVVYELVVGLNDATKRQASATVSSLISSARYLTVISWCTYPFVY +IVKNIGLSGPTATMYEQVGYSVADVVAKAVFGVMIWAIASEKSKLEEQGSLMSS +>gi|EGF32634.1|Proteorhodopsin|Oxalobacteraceae_bacterium_IMCC9480 +MIIGESFMEAVTLGQYELVYNAFSFAIAVMGAATIFFFLGRSQVASAYRTALTITGLVTLIAAYHYLRIF 
+NSWEAAFVITGDQIKASGIKFNDAYRYVDWLLTVPLLLIELILVMRLPRAETIAKSTKLGLLAALMVVLG +YPGEISADGGTRWMWWGLAMIPFLIIVYDLFVGLKKSIDSQPAAARGLISTARWVTVISWCFYPVVFVFP +MIGFTGSSAATAVQVGYTVSDIVAKAMFGVLIYMIAVRKSEAEGQHA +>gi|ADY17807.1|rhodopsin_type_II|Oxyrrhis_marina +MAPLTGDFSYGEWNAVYNALSFGIAAMGSATVFFWLQLGNVSKNYRTALTITGIVTWIATYHYFRIFNSW +VEAFEVNEVGGAYAVKVSGTPFNDAYRYVDWLLTVPLLLIELILVMKLPAGETAALSTKLGVASAVMVAL +GYPGEIQENLAVRWFWWALAMIPFAYVVFSLLVGLGAATAKQPESVAGLVSAARYLTAVSWLTYPFVYII +KNVGLAGPTATMYEQIGYSVADVMAKAVFGVLIWAIANEKSRLEGEGKLLR + diff --git a/SHMT.txt b/SHMT.txt new file mode 100644 index 0000000..7b76b62 --- /dev/null +++ b/SHMT.txt @@ -0,0 +1,83 @@ +>jgi|Thaps3|269942|estExt_thaps1_ua_kg.C_chr_180031|SMHT3_TPS +_TPSMMSLRSSLPALRRAAATQSARIALPSAINTCTDLHQHHNHANVRTLSSSSSSGASLNQRLTQVDPTLSTL +IEQEKARQRSSLVLIASENFTSRAVLDALGSVLSNKYSEGYPGARYYGGNENIDRVELLCQERALETFGL +SGEEWGVNVQSLSGSPANFQVYTALLETHDRILSLDLPHGGHLSHGFQTPTKKISAVSRYFESMPYRLNS +TTGQIDYDEMERSAELFRPKLIVAGASAYSRLIDYERIREIADKVGAYVMADMAHISGLIAAEVIPSCFP +YADVVTTTTHKSLRGPRGAMIFFRKGKKGETKKGEPIMYDLEEKINFAVFPGLQGGPHNHTIGALAVALK +QANTPEFVEYQKQVLKNCARLNSELQSLGYEIVSGGTDNHLVLVNVKSSKGIDGARVERVLELACIASNK +NTVPGDTSALNPGGIRMGTPALTSRGFMEEDFAKVAHYFDRAVSIANKLKNTEEGKKMKGFREMCAVGPS +VDPELVQLRKEVSEFASSFPTVGFEESEMEFKGEYNVDFVA* +>jgi|Thaps3|26190|estExt_fgenesh1_pm.C_chr_40017|SHMT1_TPS +MDASLSSAYAEAVQASTSSPSLTTSDPDISRLIVLEEDRQRYGLELIASENFVSRAVKEALGSCLTNKYS +EGQVGKRYYGGNEYIDEIETICMERALSLFGLDPSEWGVNVQPYSGSPANFAAYTALLQPHDRIMGLDLP +SGGHLTHGFQTPKKKVSATSVYFESMPYVVNPTTGLVDYDDMERRAKMFMPKLLIAGGSAYTREWNYARM +RTIADSVGAYLMVDMAHISGLVAGKVVANPFEYADLVTSTTHKTLRGPRSGMIFAKLDMMESINQAVFPM +LQGGPHNHQIGALAVALREASSPEFVQYARDVVANANALGKGLVKRGHKLVTGGTDNHIVLWDVKSTTGL +TGSKVERLLELASITANKNSIPGDTSAVNPGGVRLGSPALTSRGLKEEDFDKVAEFLHRGCELAVKVQAV +AKVKSDDGKVLMRFFEATLKEDDALREELDVLKKDVESFAGKFEMPGF* +>jgi|Thaps3|26031|estExt_fgenesh1_kg.C_chr_140007|SHMT2_TPS +MSDSNKRAKMTSFKDSEFTGLKPLSEHDPLLFDLIEKEKLRQYTSLELIASENFTSRAVMDCLGSALTNK +YSEGLPHARYYGGNEIVDQVEELCQKRALEAYGLDEKEWGVNVQPYSGSPANFAVYTGLLRPHDRIMGLD 
+LPSGGHLTHGFYTYSKKEGTRKAVSATSVYFESLPYQVDQTTGIINYDQLERDASLFKPAMIIAGGSAYP +RDWDYARFRKIADENGALLIMDMAHISGLVATKEQKSPFEYCDVVTTTTHKSLRGPRAGMIFFRRDERGF +EHKINQAVFPALQGGPHEHQIAGVATQLLEVMTPEFHQYSAQVRKNAQALGNKLISLGYSLATGGTENHL +VLWDLKPQKLTGSKFEKVCDAVSITLNKNCVPGDRSAVTPGGVRIGAPALTTRKMVEADFEQIAMFLHEA +LTIALKIQEESGPKLVDFVKCLEQNGEVEGLRKRVNEFASGFPMPGFDPKEMKYKL* +>jgi|Phatr2|18665|estExt_gwp_gw1.C_chr_30286|SHMT1_PTRI +MTSFKDQEFRGLLSLEEHDPELFDLIEQEKSRQWRSLELIASENFTSRAVMDCLGSALTNKYAEGLPGAR +YYGGNEVVDQVEALCQKRALEAYGLDPEKWGVNVQPYSGSPANFAVYTALLKPHDRIMGLDLPSGGHLTH +GFYTYSKKEGTRKAVSATSVYFESLPYRVHPETGYIDYDQLERDAGLFKPAMIIAGGSAYPRDYDYKRFR +EIADANGALLMMDMAHTSGLVATGELDSPFEYADVVTTTTHKSLRGPRAGMIFFRKDERGFESRINQAVF +PALQGGPHEHQIAGVATQLKEVCSPDFKVYSQQVKKNAKALADKLTSMGYSMASGGTENHLVLWDLKPQG +ITGSKFEKVCDAVSITLNKNCVPGDVSAVTPGGVRIGTPALTTRTMVESDFEQIGQFLHEALEITLAIQE +KSGPKLKDFLPLLEKNADIEALKVRVHDFATTFPMPGFDPATMKYKNPAGPSH* +>jgi|Phatr2|54015|estExt_Phatr1_ua_kg.C_chr_10105|SHMT2_PTRI +MLSVRSTLAPAIRRIATRTFAAGADLNKTLLETDPELSQLIEQEKARQRNSLVLIASENFTSKAVLDALG +SVLSNKYSEGYPGARYYGGNENIDQVELLCQKRALEAFHLDPAEWGVNVQSLSGSPANFQVYTALLETHA +RILALDLPHGGHLSHGYQTATKKISMVSRYFESMPYRLDESTGTIDYDQMEKSADLFRPKMIVAGASAYS +RLIDYERIRKIADGVGAYVMSDMAHISGLVAAQVIPSCFEYSDVVTTTTHKSLRGPRGAMIFYRKGQKGT +DKKGNPIMYDLEEKINFTVFPGLQGGPHNHTIGALATCLKQAATADFVVYQKQVLKNSSRLAEELNKLGY +TLVSGGTDNHLVLIDVKSSAKIDGARVERILELACIATNKNTVPGDTSALMPGGIRMGTPALTSRGFKED +DFTKVAHFFDRAVKIAVKLKNTDQGAKLKGFREMCAVGPSVDADLVQLRHDVSEFACLFPTVGFNEDEMT +FEGEYNVDFVA* +>jgi|Phatr2|17456|estExt_gwp_gw1.C_chr_10370|SHMT3_PTRI +MGSYTVRLIATWPIILLLLLSIFESVRAFSLTTHPRGGSQLHISMQDAKTKRIERSMEDFDPEIARMIGS +EERRQRVGLELIASENFASKAVRQVLGSCLTNKYSEGNVGRRYYGGNAFIDQIETLCMKRALDLYELDTE +EWGVNVQPYSGSPANFAVYTALLNPHDRIMGLDLPSGGHLTHGFQTPKKKVSATSVYFESMPYVVSADTG +LVNYDDMEKRAKMFLPKLLIAGGSAYPREWDYSRMRQIADSVGAKLMVDMAHISGLVAGKVAESPFPYAD +VVTSTTHKTLRGPRSGMIFARREYIDAVNSAVFPSLQGGPHNQQIGALAVALKEATEPDFLKYTKDVIAN +AKALAAGLEKRGHVLATGGTDNHLMLWNVRQLGLTGSKVEKVLDLASITTNKNSIPGDTSALNPGGVRLG 
+TPALTSRGMSENDFEKVAEFLHRGSEIALKAEHVAELELDRDNGQSKVLLKHFVAVLELDRDVRNQIDDL +RKDVENFASQFEMPGSDL* +>gb|CAJ03206.1|serine_hydroxymethyltranferase|Leishmania_major_strain_Friedlin +MASLIPTLTEQDPELANMIELEMGRQFRGLEMIASENLTSKAVLECLGSALTNKYAEGEPGNRYYGGTVF +VDMVENLAKKRALAAFGLDPGEWGVNVQPYSGSPANFAVYTALLEPHSRIMGLDLPSGGHLTHGFYTPKK +KVSATSIYFESFPYHVKEDGLIDYDALESVALVFRPKMIITGASAYARDFDYERFRHVCDEVGSLLFMDM +AHTAGLIAGGVLKSPFPYADVVTTTTHKSLRGPRAGMIFYRKKDRQGKPTDHESRINQAVFPGCQGGPHE +HQIAAIATQMREVCSQEWKAYARQVQSNARALAAALSSKGHVFVSGGTDNHLLLWNVRVHGLTGSKVEKL +LDAVSISVNKNTIPGDKSAMTPGGIRVGTLALTSRGMVEADMSTVAEFLDRAIVLAKQIQAAMNAVKLSD +FVEALQTHAGAAALRKDVEAFATTFAMPSFDVERIKYKDGLPEEQ +>gb|P50433.1|Serine_hydroxymethyltransferase|Solanum_tuberosum +MAMAIALRRLSATVDKPVKSLYNGGSLYYMSSLPNEAVYDKEKSGVAWPKQLNAPLEVVDPEIADIIEHE +KARQWKGLELIPSENFTSVSVMQAVGSVMTNKYSEGYPGARYYGGNEYIDMAETLCQKRALEAFRLDPAK +WGVNVQPLSGSPANFQVYTALLKPHERIMALDLPHGGHLSHGYQTDTKKISAVSIFFETMPYRLDESTGY +IDYDQLEKSATLFRPKLIVAGASAYARLYDYDRIRKVCNKQKAILLADMAHISGLVAAGVIPSPFDYADV +VTTTTHKSLRGPRGAMIFYRKGVKEVNKQGKEVFYDYEDKINQAVFPGLQGGPHNHTITGLAVALKQATT +PEYRAYQEQVLSNSSKFAQALGEKGYELVSGGTDNHLVLVNMKNKGIDGSRVEKVLEAVHIAANKNTVPG +DVSAMVPGGIRMGTPALTSRGFLEEDFVKVADFFDAAVKIAVKVKAETQGTKLKDFVATLESSAPIKSEI +AKLRHDVEEYAKQFPTIGFEKETMKYKN +>gb|NP_193129.1|serine_hydroxymethyltransferase_4|Arabidopsis_thaliana +MEPVSSWGNTSLVSVDPEIHDLIEKEKRRQCRGIELIASENFTSFAVIEALGSALTNKYSEGIPGNRYYG +GNEFIDEIENLCRSRALEAFHCDPAAWGVNVQPYSGSPANFAAYTALLQPHDRIMGLDLPSGGHLTHGYY +TSGGKKISATSIYFESLPYKVNFTTGYIDYDKLEEKALDFRPKLLICGGSAYPRDWDYARFRAIADKVGA +LLLCDMAHISGLVAAQEAANPFEYCDVVTTTTHKSLRGPRAGMIFYRKGPKPPKKGQPEGAVYDFEDKIN +FAVFPALQGGPHNHQIGALAVALKQANTPGFKVYAKQVKANAVALGNYLMSKGYQIVTNGTENHLVLWDL +RPLGLTGNKVEKLCDLCSITLNKNAVFGDSSALAPGGVRIGAPAMTSRGLVEKDFEQIGE +>jgi|Thaps3|262555|thaps1_ua_kg.chr_5000194|SLA_LP_TPS +ASNSQTKALFHHKRLPDHGWTDVQIQRLLLELSVLDTNCEESVKWTGAGEREGRIYAPLVSQRHFGFGHG +IGRSGDVMEAQPKAVGSSALLRLTLRLTLDAVRRGAGLNGTLGKGDSRNGPASFGTLLPVCTGMSMALVL 
+SGLRDRARTLDSASIGTEHVNTERNIVLWSRIDQKSCYKSILSAGLKCVVLPTKKHPDTDEVSTDLEALK +EALDSFGNSILAVLTTTSCFCPRVPDEVDQVAKMIMSAGVSHVVNHAYGLQCQTTNKLLNRACIIGRVDA +IICSTDKNFLVPVGGALILSPDSNVIETISKNYPGRASSSPMVDLFITLLSMGLNGYKGILEERKRLTEL +FGQSLQRVATVFGETVLNCPRNTISFGMTLDNLATINGSDDELNSLITKFGSMLFTRCISGTRVVPRGST +KTISGHTFEGFGSSNDDYPYAYMTSACAVGMGEEEMNEFFVRLEKSWIDYRKKLEQ diff --git a/SLC4.txt b/SLC4.txt new file mode 100644 index 0000000..27d4890 --- /dev/null +++ b/SLC4.txt @@ -0,0 +1,62 @@ +>jgi|Phatr2|54405|estExt_Phatr1_ua_kg.C_chr_70011|SLC4_3_PTRI +MPPKHESQEDLKMSTSKQDEDEVRTIDFLDHDDGNQGNGWGRGIVKDFRKTVGTHWVNEMTNFNQKSIAV +SFFIFFAAVAPAITFGAVYSKTTNDAIGAVEMLIATAWCGIVYALIGGQPIMINGGTGPVLAFSAVLFDI +ADNMDVNFLTLNAWTGLWVAGFLIIAAFVDLNRLMKHATRFTDEIFALLIASIFVIDALGSPFSDVGIYW +YFTRSHDSHDEFEDQEDYSYMATAFLSAVLCLGTTWLAFFLRDIKFSPYFPNDSWRTLISDFAVVASILI +WTLIANGLFDNVEVERLNVPDSITPTQICCTADCMTSFPDDCPDITPYGRRSWIVDLGAVNGKSWIPFFA +AIPALLAFILVFLDDGITWHLINHPSNKLTHGDAYNWDTVVIAAMIAVNSMLGLPWLVAATVRSLTHVNA +LAERSENGKIISVQETRLTHLGIHLLVLAALFALDVLKLIPVPVLYGVFLYMGVASLASNQFFQRFLMFF +MQPSKYPHEPHTKYMAPKRMHLFTGIQLGLFVILTVFRSISVIAIAFPIVIKACIPVRMYILPRYF +>jgi|Thaps3|13887|gw1.8.48.1|SLC4_1_TPS +NGKPEKFFQLFTGIRTDLTTRLLPYYKSDWSRPKSIFTVINAIVFAFVVQLIPALIFAELMDRETKGNLA +AAETLLSAGIIGIIYAIISGQPLTLLGITGPVAILLGTSYGLAEQFDSEYWPFFWWLCIWTAILHFLTAI +TGLVNFVWHISPFTTQIFEFFIGCSFVFESIRDLVEPLHLGKNTYASLVIGMLAFAICWRLHFAETWTLF +SRQVRTFLTSYNMAITVIIVTADQKDSNSHGIERVHVRAPWDWQPSVDRPWLIDPTEGISTKGIFGALFP +AFMLYLLFFIDHNISSILTQAPKYNLKKPASYHWDFFCLGLTIVPCGLLGLPPGSGLIPQAPLHTRALAT +RKILERHGVKQEVTVHVEEQRWSALGQASLMFVALSLFTVISWIPKGALFGVFLYLGVGALHGNEIWHHI +TLSFMYAKKRPPVPIVANVKWSTVQLYTLVQVCCAAAIFGVAQFASVGYIFPALVAALVPIRSYFVAWCF +SENDLQYLD +>jgi|Phatr2|45656|estExt_fgenesh1_pg.C_chr_70326|SLC4_1_PTRI +MTNLVSRAYIVALLCMSSCWHSAAFHTTSFGKTSLGLKISSSRSPTFSSLKKAKVIASVTTKPLTKLSDS +MSVVSPPVDERENNKDDETLFEGPFKGIIRDYKARLPLFASDIKDGLNVQCLAATMFLFFACLAPAVGFG +GLFDVATGGAIGTVEMVSSTALCGLIYAITSAQPLTIIGSTGPVLAFVACLAQLAKMLNLPFLQLYSWTG 
+LWTSAILFVSSITSASNLVKYLTRFTDEIFSLLISCIFVFEAVSDVGRTFSSPASTFTKALLTLTCAAST +FTIATLLKGLRKTSLFPSRVRNTISNFAPTIGVVTASLIARWARVVHGTKLAGLPSLSIPAVFGTTSGRP +WLVPILDFPVWARWAAFLPALMATVLLFLDQNITVRLVNNPRWKMEKGRRKNNVLDGMHADMFIVSILTA +AQSLVGIPWLVAATVRSLSHVGALSKYDKEGKVVGTIEQRMTGISIHSLIGCAVLFSKPRKLLTQVPLPV +LMGLFMYLGTSSLPGNEMWERVTGLFKDKTVAPKQRWSDKVPDKVTSTFTLIQVACLGAMFWVKESPFGV +LFPVVIAMLAPLRFALEKQGIIKKEYMDVLDEE* +>jgi|Emihu1|99943|fgeneshEH_pg.21__120|Bicarbonate_transporter_SLC4family +MSKREEYPGDDNYSSVNHVADALETPATAGHDAEADPFAEETGSEPKSVPETPDQESGMVGAATKSSGAK +RRKGKKPISELAPLEFSGRFAGGLRADLLRRVPLYVSDWTEAFTGGNCMKTTASICFLFFACLSPAVTFG +AAFADATDNQLGVIETIISSGMSGLIYSFLSGQPLCILGATGPELAFPVVFYEICQWGGMEVDFLAARVW +QALWCSLFTIIVALFDLSACMKVCTRFTEEIFSFLISIIFIVGAFTTLIKLYLADPDVEGDDPAAPANRA +KAFLGTLLGLFTYFTAMWCRAFPKRNETTPLVRKLVANYGVTLSILLYSGINYGFRDVDVPCLDMPDEIV +PTATLNGTGESRGWFVNPFGDETASGYDTPGVGFIFFAAVPALGLAVLGYLDQNLTTLLINRKDHNLKKG +GAYHLDLLVCGIFIYPICGFFGLPFTHAATVRSMSHLMSLMTREDSTNEHGQTVSKVTNVVEQRVTHLGI +HCLLLAALGLSAVLTKIPKVVLAGVFLYMGVTALPGVQLYERLWLWLIWDPKKYPQYDYVTQVARKPLHL +YTLFQFSCLAVLYALTKVPNPYISVIFPFFIAFLPLIRKLVPKCFPSVWSKEDLKALDK +>jgi|Thaps3|267979|estExt_thaps1_ua_kg.C_chr_10120|SLC4_2_TPS +MRNDFARRRKWYISDWTDAFKKKRQVIPAVLFLYFACLAPAVSFGTIASEITNGSIGVVEFLLSSGMAGM +LYSITCGQPMAFLAPTGLTLAFISGLFRFCTLRNLPFFPVYAWVGLWSSAFMMILGLSGSSKLIRYCTRF +TDEVFNGLLSVNFIYEAFSSLRRNFVNADPMNLTMPFVALSMALGTFFSTMKVVKFESSKFFNTKVRGVI +KNFGPVSVILFFTLVNLLPWFQKFHVPTLSVPDTFQLAGGRSFLVSLKEIPVKVRWLCALPAWLLTCLFF +MDQNISVRLVNNPDNKLKKGEAYNMDMVALGGITGVLSVLGLPWMCGATVQSMNHVRAMSEMKVNEETGE +TEVEVTETRLTGFTIHALLASTVLLLPWIKKIPIPVVSGVFLFLGRKLMTGNTFFKRVTDAFAESKRLRE +DHPINLLGRKKMNAFTGIQVLCLLGLFAFKQIPSITIFFPAMILFLMFIRSFVLPKYFSEEEFVALEDPT +PS* +>jgi|Phatr2|32359|fgenesh1_pg.C_chr_1000960|SLC4_2_PTRI +MKQSSKRHRKDGALQHTVLWIGILSAFCTTGSAFTSSALGRTKPSSLHLVPGSAAVLNLGRRPGKRSNYL +RLSLPADRRTSVGSSKNKDNTDSTNNDATQSIEGTKEDVKEKIQFSPSYLEQIDRMRGYRRKRQWKRVLE +EYSNGNSTETTAQKHAKNLFDTIVSQEMRDDIRRRKKVYWSDWEDGFKNKRKVIPAILFLYFACLSPAVS 
+FGTIASEITQGSIGIVEFLLSSGLSGMAYAMMCGQPMAFIAPTGLTLAFISGLYRFCMVKALPFFPIYAW +VGLWTSFFFVLLGLGGSSQLIRFCTRFTDEVFNALLSVNFIYEAVASLKRNFDLADPMNLTMPFVSLAMA +LSTFWCTAKVAAFESSKYLNQKIRSIVKDFGPVTIFILMSIFNQRAWMKKFKVPTLTVPSSFQLSGGRNF +LINLNAIPLNIKLACVLPAILLTSLFFMDQNISVRVVNNPDNKLKKGAAYNLDMVALGLITSCLSLVGLP +WMCGATVQSLNHVRALTETRFNERTGEPEIIGVTETRVTGFAVHALICSTLAILPLLRFVPIPVVAGVFL +FLGRKLMSGNSFLQRIRDCFVEKSRLPADHPIRYIGRKKTNIFTVTQIGCLGGLWFFKQNSTTAIFFPSV +IGLLMLIRAFVLPKVFTEDELIDLGDPSPN* +>gi|NM_003040.3_translation|SLC4_2|Homo_sapien +MSSAPRRPAKGADSFCTPEPESLGPGTPGFPEQEEDELHRTLGVERFEEILQEAGSRGGEEPGRSYGEEDFEYHRQSSHHIHHPLSTHLPPDARRRKTPQGPGRKPRRRPGASPTGETPTIEEGEEDEDEASEAEGARALTQPSPVSTPSSVQFFLQEDDSADRKAERTSPSSPAPLPHQEATPRASKGAQAGTQVEEAEAEAVAVASGTAGGDDGGASGRPLPKAQPGHRSYNLQERRRIGSMTGAEQALLPRVPTDEIEAQTLATADLDLMKSHRFEDVPGVRRHLVRKNAKGSTQSGREGREPGPTPRARPRAPHKPHEVFVELNELLLDKNQEPQWRETARWIKFEEDVEEETERWGKPHVASLSFRSLLELRRTLAHGAVLLDLDQQTLPGVAHQVVEQMVISDQIKAEDRANVLRALLLKHSHPSDEKDFSFPRNISAGSLGSLLGHHHGQGAESDPHVTEPLMGGVPETRLEVERERELPPPAPPAGITRSKSKHELKLLEKIPENAEATVVLVGCVEFLSRPTMAFVRLREAVELDAVLEVPVPVRFLFLLLGPSSANMDYHEIGRSISTLMSDKQFHEAAYLADEREDLLTAINAFLDCSVVLPPSEVQGEELLRSVAHFQRQMLKKREEQGRLLPTGAGLEPKSAQDKALLQMVEAAGAAEDDPLRRTGRPFGGLIRDVRRRYPHYLSDFRDALDPQCLAAVIFIYFAALSPAITFGGLLGEKTQDLIGVSELIMSTALQGVVFCLLGAQPLLVIGFSGPLLVFEEAFFSFCSSNHLEYLVGRVWIGFWLVFLALLMVALEGSFLVRFVSRFTQEIFAFLISLIFIYETFYKLVKIFQEHPLHGCSASNSSEVDGGENMTWAGARPTLGPGNRSLAGQSGQGKPRGQPNTALLSLVLMAGTFFIAFFLRKFKNSRFFPGRIRRVIGDFGVPIAILIMVLVDYSIEDTYTQKLSVPSGFSVTAPEKRGWVINPLGEKSPFPVWMMVASLLPAILVFILIFMETQITTLIISKKERMLQKGSGFHLDLLLIVAMGGICALFGLPWLAAATVRSVTHANALTVMSKAVAPGDKPKIQEVKEQRVTGLLVALLVGLSIVIGDLLRQIPLAVLFGIFLYMGVTSLNGIQFYERLHLLLMPPKHHPDVTYVKKVRTLRMHLFTALQLLCLALLWAVMSTAASLAFPFILILTVPLRMVVLTRIFTDREMKCLDANEAEPVFDEREGVDEYNEMPMPV +>gi|NM_000342 _translation|SLC4_1|Homo_sapien 
+MEELQDDYEDMMEENLEQEEYEDPDIPESQMEEPAAHDTEATATDYHTTSHPGTHKVYVELQELVMDEKNQELRWMEAARWVQLEENLGENGAWGRPHLSHLTFWSLLELRRVFTKGTVLLDLQETSLAGVANQLLDRFIFEDQIRPQDREELLRALLLKHSHAGELEALGGVKPAVLTRSGDPSQPLLPQHSSLETQLFCEQGDGGTEGHSPSGILEKIPPDSEATLVLVGRADFLEQPVLGFVRLQEAAELEAVELPVPIRFLFVLLGPEAPHIDYTQLGRAAATLMSERVFRIDAYMAQSRGELLHSLEGFLDCSLVLPPTDAPSEQALLSLVPVQRELLRRRYQSSPAKPDSSFYKGLDLNGGPDDPLQQTGQLFGGLVRDIRRRYPYLSDITDAFSPQVLAAVIFIYFAALSPAITFGGLLGEKTRNQMGVSELLISTAVQGILFALLGAQPLLVVGFSGPLLVFEEAFFSFCETNGLEYIVGRVWIGFWLILLVVLVVAFEGSFLVRFISRYTQEIFSFLISLIFIYETFSKLIKIFQDHPLQKTYNYNVLMVPKPQGPLPNTALLSLVLMAGTFFFAMMLRKFKNSSYFPGKLRRVIGDFGVPISILIMVLVDFFIQDTYTQKLSVPDGFKVSNSSARGWVIHPLGLRSEFPIWMMFASALPALLVFILIFLESQITTLIVSKPERKMVKGSGFHLDLLLVVGMGGVAALFGMPWLSATTVRSVTHANALTVMGKASTPGAAAQIQEVKEQRISGLLVAVLVGLSILMEPILSRIPLAVLFGIFLYMGVTSLSGIQLFDRILLLFKPPKYHPDVPYVKRVKTWRMHLFTGIQIICLAVLWVVKSTPASLALPFVLILTVPLRRVLLPLIFRNVELQCLDADDAKATFDEEEGRDEYDEVAMPV diff --git a/SPT.txt b/SPT.txt new file mode 100644 index 0000000..dd80eb8 --- /dev/null +++ b/SPT.txt @@ -0,0 +1,59 @@ +>gi|223999211|ref|XP_002289278.1|T.pse|SPT_AGT_TPS +MSSSMRAASSLLRSIPRATSFATVSSKPSHATQLISRNTNHLISPIAATTSLSSSATSSHRFFSSTGPPE +EDELHYTSVAKGDMGEFQEYSVIFTNRALNLMSKPFQQVMRDLNMLLKKTYNADKVAIMPGSGTFGMEAV +ARQFATDKHVMVIRNGWFSFRWTEIFDMGGHNHTIPSSHTVLKAQPVEPEDPNCPHMQYAPYPIDEVVAK +VMEERPAVLFAPHVETSTGMILPDDYIRKAAKAVHDVGGLFVLDCIASGAIWADMKDLGVDSIISAPQKG +WTGPACCALVMLSERAAEVMAETQETSFSMSLKRWCAIMDTYEKGGFGYHTTMPTDGLRDFHEISVETLN +FGLPELKQAQYKLGAVARELLDSRGLTSVAAPGFQAPGVLVYYSPLGQDNPAMMNKFKVHGLQIAMGVPW +RIDEPDGLKTFRLGLFGLDKMGDIPKCVGTLQKSLDAVLAESGHSIPEKKAA +>jgi|Phatr2|40344|fgenesh1_pg.C_chr_23000065|SGAT_PTRI +MFRSVASLALRGSIGTGRGVAQSPRVVPFGSAVTVRHSSNSHTNSSSHTPERLRYNVIPKSDFGAFKEYS +VIHTDRSLNLMSDPFQRVMRDLNELLKVTYNADKVVILPGSGTFGMEAVARQFAQNEHVMVIRNGWFSYR +WTEIFEMGSSEPGVEAGGVGAGIPTSHTVLKAQPVPVPGNDTGSSNTKTTHFAPHPIQDVVSRIHQERPA +VLFAPHVETSTGMMLPDEYIQKAAQAMHDIGGLFVLDCIASGTVWVDMKALGVDVLISAPQKGWTGPPCA +ALVMMSDRAVARMSQTSETSFSMSLKRWAALMDTYEKGGFAYHTTMPTDALRDFHEISVETLRFGLPELK 
+TAQLNLGWWARGTLDRKGLVSVAAPGFQAPGVLVYYSPSQTDNPVMMSSFKAQGLQIAMGVPWKIDEPEG +LKTFRIGLFGLDKLGKPDETIRVMEEALDQVLDSVGHTAKSKKVA* +>gi|XP_003064521.1|SGAT|Micromonas_pusilla CCMP1545 +MKCPHDPSKPFHKSPLPLDHAGGLLEYSVVYTDRAMNHMSAPFCKIMNDIDATMKEAYNCSATIVMPGSG +SYGMEAVARQWATNKKVLVLRNGYFSYRWTDIFEQTGIPSETIVLKGQPADNSSNPQFMPHDIEEVCAAI +AREKPAVVFAPHVETSTGIILPDEYISRVSRAVHDVGGLFVLDCIASGTVWVDMKATGVDAILSAPQKGW +TGPACSSLMMLSERGEHATRNTTSTSMVINMRKWLEIMDSYTNGGFAYYTTMPTDALGLFRDAALETKEI +GFAKTKRMAWDLGDECRDMMKSKGLKTVSADGYEAPGVSVWYTPEPDMFNKFKKEGFQIAAGVPFMINEP +PGNFTFRIGLFGLDKICNKDNTIKTLEGTLEKILASSAGGAKAAAA +>gi|NP_495885.1|Serine--pyruvate_aminotransferase|Caenorhabditis_elegans +MISTRFLRPSVSIFGFGIKSSMSSRAPPKALLQDMVVPPRQLFGPGPSNMADSIAETQSRNLLGHLHPEF +VQIMADVRLGLQYVFKTDNKYTFAVSGTGHSGMECAMVNLLEPGDKFLVVEIGLWGQRAADLANRMGIEV +KKITAPQGQAVPVEDIRKAIADYKPNLVFVCQGDSSTGVAQPLETIGDACREHGALFLVDTVASLGGTPF +AADDLKVDCVYSATQKVLNAPPGLAPISFSDRAMEKIRNRKQRVASFYFDAIELGNYWGCDGELKRYHHT +APISTVYALRAALSAIAKEGIDESIQRHKDNAQVLYATLKKHGLEPFVVDEKLRLPCLTTVKVPEGVDWK +DVAGKMMTNGTEIAGGLGATVGKIWRIGTFGINSNSTKIENVVELLSKSIGEKSK +>gu|WP_033827098.1|serine--pyruvate_aminotransferase|Bacillus_andreraoultii +MRNKELLLIPGPTPVADSIYDAMVQETWGHTDLRFAKMYKESIEATKQMLKTDGEVFVISGSGTLAMEMA +LVNTVASGEKLLVISHGYFGDRFIKLGQAYGIEIDVVQSEWGKHIDVTEVDKKLSENKYKAVTITHADTS +TGVASNLDLLVPLIKKHGALVILDGVCATGAIEEDMSKTYGSPDAKIDVVLTGSQKAIGVPPGLAIVAFN +QTALAAREELDRVPAYYCDIKNWLPIMHDPTKYFATPAVNMIYGYREGMRLVLEEGMEKRYIRHKQYGQA +VRSSLREYGMKPLADEGVAAATLSCILYPDGVDDAEFRSSLAKKGVIVAGALAHLSGKAFRIGHMGNTTK +EMLAEAIERIGETLIELGLAANIDRALEQFEESFNVTIN +>jgi|Emihu1|123208|fgeneshEH_pg.1949__1 +MRAANASEAIVERIIATHLRPGIANLAPGTAHWSPPERLVEAALRGDGGYGDIRGEPALLAALREEHGRE +HVMVTPGANQAFVHALLSTCDVGDEVLLWRPYYFSHLVALQLLGLVPVFADCDERGEPTHDLHQCHQQCS +LCSPLRVGLPTPGAAYEHFTYGAAEHASAAELCEAGGGELLLCLRTFSKSYGLAAWRVGHLSYPHQLHDA +MLKARDHARSPEIARDCTTRCSRRWDAPPTRTRGGWCSNGWWMCGQALEGLGEAWVREQASGEIWGDLGR +SGEVATLEPARAMLWEALAPLRGGGDALQPAGAFYYFVRLPGCSSPGARVLGEGGEGGEAAEEEPSGLLH 
+ASRLADCEAEEEAVRRLAAEHELLTLPGSAFGRPGHLRLSYGRLASAEEAEPIAERLFRAAEALRARWGE +T* +>gi|NP_178969.1|alanine:glyoxylate aminotransferase [Arabidopsis thaliana] +MDYMYGPGRHHLFVPGPVNIPEPVIRAMNRNNEDYRSPAIPALTKTLLEDVKKIFKTTSGTPFLFPTTGT +GAWESALTNTLSPGDRIVSFLIGQFSLLWIDQQKRLNFNVDVVESDWGQGANLQVLASKLSQDENHTIKA +ICIVHNETATGVTNDISAVRTLLDHYKHPALLLVDGVSSICALDFRMDEWGVDVALTGSQKALSLPTGLG +IVCASPKALEATKTSKSLKVFFDWNDYLKFYKLGTYWPYTPSIQLLYGLRAALDLIFEEGLENIIARHAR +LGKATRLAVEAWGLKNCTQKEEWISNTVTAVMVPPHIDGSEIVRRAWQRYNLSLGLGLNKVAGKVFRIGH +LGNVNELQLLGCLAGVEMILKDVGYPVVMGSGVAAASTYLQHHIPLIPSRI +>gi|ONM02788.1| Serine--glyoxylate aminotransferase [Zea mays] +MVDYVYGPGRTHLFVPGPVNIPDPVIRAMNRQNEDYRSPAVPALTKVLLEDVKKIFKTTTGTPLMIPTTG +TGAWESALINTLSSGDRVVSFLIGQFSLLWIDQQRRLGFDVDAVESEWGQGADLAALERRLRDDAPRHAI +KAVAIVHNETATGVTNDLAAVRALLDKHAHPALLLVDGVSSICALDFRMDEWGVDVALTGSQKALSMPTG +MGIVCASPRALEASKTARFYDMGTYWPYTPSIQLLYGLRTALDLIFEEGLDNVVRRHNRLGTATRLAVEA +WGLSNCCQKEEWFSDIVTAVVVPPNIDSAEVVRHAWKRYNLSLGLGLNKVAGKVFRIGHLGNLNELQLLG +CLSGVEMVLKDVGYPVKLGSGVAAAAAYLSNSTPLIPSRI diff --git a/TSR.txt b/TSR.txt new file mode 100644 index 0000000..617ba7e --- /dev/null +++ b/TSR.txt @@ -0,0 +1,26 @@ +>jgi|Thaps3|413|fgenesh1_pm.C_chr_4000054|TSR_TPS +MSSVAFVGLGNMGRNMAMNLARNKPSVVTLLTVHDSHEPTLSSFMEQAKINGLSVSSTPNLASFADSNPD +VIITSLPSCEASAAVVGEIVESLSPSREAIFIDTSTISVTTSRKLHELVTSTSTKFDYVDAPVSGGVKGA +TDASLTFMVGCSSLATLSSVQPILQRMGKDIIPCGGPGSGSAVKLCNNAALAAQMLGVCEAMNLGDKLGV +DPAVLAGVMNVSTAKSWSSTVNNPHPVAARGIGSGASANEYEGGFGTSLMLKDLNLAIDTAEEEHVSMPV +TSLARELYRIADSHGYGKKDFGVMLQFLRGRDGSGDTR* +>jgi|Phatr2|45141|estExt_fgenesh1_pg.C_chr_60132|TSR_PTRI +MGLTAISVRRLSAFARFHGRRLALQYTCIRYYEDEAYTNAVVGFIGLGNMGLPMARNLAKKNKILAFDTN +PDARHAASISIMEVSDTISHLKDCSMIFTMLPGCQVVDQVMSDLHNVVDHQNTIIVDCSTVSPTTSRRWH +DAWKVNGCAMLDAPVSGGTKGAMEGTLTFMVGYDDKMRFEQAKPFLYCMGDRIIPCGGPGTGAATKLCNN +VALAAQMVGICEAMNLGESLGVDPVLLAEVMNTSTASCWSSKVNNPHPSVARASGSPASQDYVGGFSARL +MLKDLGLAAQAAEDNGVALPLVATSRELYKLAGLRGMADRDFGIMLQLLRGK* 
+>gb|ELX81395.1|TSR|Salmonella_enterica_subsp._enterica_serovar_Dublin +MKLGFIGLGIMGSPMAINLARAGHQLHVTTIGPVADELLSLGAVNVETARQVTEFADIIFIMVPDTPQVE +EVLFGEHGCAKTSLQGKTIVDMSSISPIETKRFAQRVNEMGADYLDAPVSGGEIGAREGTLSIMVGGEQK +VFDRVKPLFDILGKNITLVGGNGDGQTCKVANQIIVALNIEAVSEALVFASKAGADPVRVRQALMGGFAS +SRILEVHGERMINRTFEPGFKIALHQKDLNLALQSAKALALNLPNTATCQELFNTCAANGGSQLDHSAMV +QALELMANHKLS +>gb|AKK40661.1|TSR|Escherichia_coli_APEC_O2-211 +MIDMTMKVGFIGLGIMGKPMSKNLLKAGYSLVVADRNPEAIADVIAAGAETASTAKAIAEQCDVIITMLP +NSPHVKEVALGENGIIEGAKPGTVLIDMSSIAPLASREISEALKAKGIDMLDAPVSGGEPKAIDGTLSVM +VGGDKAIFDKYYDLMKAMAGSVVHTGEIGAGNVTKLANQVIVALNIAAMSEALTLATKAGVNPDLVYQAI +RGGLAGSTVLDAKAPMVMDRNFKPGFRIDLHIKDLANALDTSHGVGAQLPLTAAVMEMMQALRADGLGTA +DHSALACYYEKLAKVEVTR + + diff --git a/mafft_hmmbuild.sh b/mafft_hmmbuild.sh new file mode 100644 index 0000000..7918829 --- /dev/null +++ b/mafft_hmmbuild.sh @@ -0,0 +1,8 @@ +#! /usr/bin/env bash +# + +for i in SLC4 Bestrophin CA_beta CA_delta CA_alpha CA_zeta GOX GDCT PGP GCL HR SPT TSR ICL PK PEPC PEPCK MDH OMT ME PPDK PYC SHMT MS GlcDH ALAT_GGAT GK + do + mafft "$i".txt> "$i"_aln.txt + ./hmmbuild "$i".hmm "$i"_aln.txt + done \ No newline at end of file diff --git a/rip_counts_MMETSP.py b/rip_counts_MMETSP.py new file mode 100644 index 0000000..c07d4dd --- /dev/null +++ b/rip_counts_MMETSP.py @@ -0,0 +1,39 @@ +#!/usr/bin/python +#rip_counts_annot_MMETSP +#inputs: directory with .pep.fa files from MMETSP, outputs count and annot files for each taxa + +#import libraries +import sys +import os +from ftplib import FTP #import ftp library +import re #import regular expression tools + +# taxa directory +t='/Users/maria_hernandez/Documents/Big_Data3050/CMM_MoreSP' +files= os.listdir(t) + +#Pull out taxa and strian ID from .pep.fa files +delimiter=' ' +all=delimiter.join(files) +taxa=re.findall('(\S*).pep.fa.gz',all) +print taxa + +#access ftp +ftp= FTP('ftp.imicrobe.us') #set ftp server +ftp.login() #log in +ftp.cwd('camera/combined_assemblies') #change to main working 
# Download the read-count file of every taxon and save it under <t>/counts.
#
# This section relies on names created earlier in the script:
#   t    -- local taxa directory (string path)
#   taxa -- list of taxon/strain IDs parsed from the *.pep.fa.gz file names
#   ftp  -- logged-in ftplib.FTP session, already moved into the main
#           combined_assemblies working directory

# Remember the main working directory so every iteration can return to it
# explicitly; the previous code used "~/camera/combined_assemblies", which
# depends on the server expanding "~".
base_dir = ftp.pwd()

# Make sure the local output directory exists before opening files in it.
counts_dir = t + "/counts"
if not os.path.isdir(counts_dir):
    os.makedirs(counts_dir)

try:
    for ID in taxa:
        # Change to <taxon>/readcounts on the server.
        ftp.cwd(ID + "/readcounts")
        try:
            # RETR must name the file on the *server*. The previous code
            # sent the local absolute path, which cannot exist remotely.
            # NOTE(review): assumes the remote file carries the same name as
            # the local copy -- confirm against the server's readcounts
            # directory listing.
            remote_name = ID + "_cds_counts.txt"
            savefile = counts_dir + "/" + remote_name
            # "with" guarantees the local file is flushed and closed even if
            # the transfer fails part-way.
            with open(savefile, 'wb') as out:
                ftp.retrbinary("RETR " + remote_name, out.write)
        finally:
            # Always go back to the main directory so the next relative
            # cwd() starts from the right place.
            ftp.cwd(base_dir)
finally:
    # Close the FTP connection, also when a download raised.
    ftp.quit()