Skip to content

Latest commit

 

History

History
548 lines (450 loc) · 16.2 KB

151124.md

File metadata and controls

548 lines (450 loc) · 16.2 KB
---[p.69]------------------

Pairwise sequence alignment
Type Algorithm Sequence1='VSPAGMASGYD' Sequence2='IPGKASYD'
Global alignment Needleman-Wunsch algorithm(*) image
Local alignment Smith-Waterman algorithm(*) image
(*)動的計画法を使って配列アライメントを行っている
参考資料
https://www.ibm.com/developerworks/jp/java/library/j-seqalign/
http://www.jst.go.jp/nbdc/bird/jinzai/literacy/streaming/h21_3_1.pdf
http://demonstrations.wolfram.com/GlobalAndLocalSequenceAlignmentAlgorithms/
----------global alignment実行-------------------------

> source("http://Bioconductor.org/biocLite.R")
Bioconductor version 3.1 (BiocInstaller 1.18.5), ?biocLite for help
A newer version of Bioconductor is available for this version of R, ?BiocUpgrade
  for help
> biocLite("Biostrings")
BioC_mirror: http://bioconductor.org
Using Bioconductor version 3.1 (BiocInstaller 1.18.5), R version 3.2.2.

略
> library(Biostrings)
 要求されたパッケージ BiocGenerics をロード中です 
 要求されたパッケージ parallel をロード中です 

略

 要求されたパッケージ XVector をロード中です 
> 
> 
> s1<-"GAATTCGGCTA"
> s2<-"GATTACCTA"
> 
> score1<-nucleotideSubstitutionMatrix(match=1,mismatch=-1,baseOnly=TRUE)
> 
> 
> score1

   A  C  G  T
A  1 -1 -1 -1
C -1  1 -1 -1
G -1 -1  1 -1
T -1 -1 -1  1
> 
> gapOpen<- 2
> gapExtend<- 1
> 
> a1<-pairwiseAlignment(s1,s2,substitutionMatrix=score1,gapOpening=gapOpen,gapExtension=gapExtend,type="global",scoreOnly=FALSE)
> 
> 
> a1
Global PairwiseAlignmentsSingleSubject (1 of 1)
pattern: [1] GAATTCGGCTA 
subject: [1] GATTAC--CTA 
score: 1 
> 
> 
> data(package="Biostrings")
> 
> 
----別windowで表示---------------

Data sets in package �eBiostrings�f:

BLOSUM100               Scoring matrices
BLOSUM45                Scoring matrices
BLOSUM50                Scoring matrices
BLOSUM62                Scoring matrices
BLOSUM80                Scoring matrices
HNF4alpha               Known HNF4alpha binding sequences
PAM120                  Scoring matrices
PAM250                  Scoring matrices
PAM30                   Scoring matrices
PAM40                   Scoring matrices
PAM70                   Scoring matrices
phiX174Phage            Versions of bacteriophage phiX174 complete
                        genome and sample short reads
quPhiX174 (srPhiX174)   Versions of bacteriophage phiX174 complete
                        genome and sample short reads
srPhiX174               Versions of bacteriophage phiX174 complete
                        genome and sample short reads
wtPhiX174 (srPhiX174)   Versions of bacteriophage phiX174 complete
                        genome and sample short reads
yeastSEQCHR1            An annotation data file for CHR1 in the yeastSEQ
                        package
-----protein sequence-------------------------

> data(BLOSUM62)
> 
> subMat<-"BLOSUM62"
> 
> sp1<-"PAWHEAE"
> sp2<-"HEAGAWGHE"
> 
> a2<-pairwiseAlignment(sp1,sp2,substitutionMatrix=subMat,gapOpening=gapOpen,gapExtension=gapExtend,type="global",scoreOnly=FALSE)
> 
> a2
Global PairwiseAlignmentsSingleSubject (1 of 1)
pattern: [1] P---AW-HE 
subject: [1] HEAGAWGHE 
score: 14 

---showでも確認可能--------------


> show(a1)
Global PairwiseAlignmentsSingleSubject (1 of 1)
pattern: [1] GAATTCGGCTA 
subject: [1] GATTAC--CTA 
score: 1 
> 
> 
> 
> show(a2)
Global PairwiseAlignmentsSingleSubject (1 of 1)
pattern: [1] P---AW-HE 
subject: [1] HEAGAWGHE 
score: 14 
> 

----------local alignment実行-------------------------


> 
> a2_local<-pairwiseAlignment(sp1,sp2,substitutionMatrix=subMat,gapOpening=gapOpen,gapExtension=gapExtend,type="local",scoreOnly=FALSE)
> 
> 
> a2_local
Local PairwiseAlignmentsSingleSubject (1 of 1)
pattern: [2] AW-HE 
subject: [5] AWGHE 
score: 25 
> 

---------dot plot-----------------------------

注:HumanChimp.fastaをダウンロードしてください

> mseq<-read.fasta(file="HumanChimp.fasta")
> 
> mseq
$`sp|P68871|HBB_HUMAN`
  [1] "m" "v" "h" "l" "t" "p" "e" "e" "k" "s" "a" "v" "t" "a" "l" "w" "g"
 [18] "k" "v" "n" "v" "d" "e" "v" "g" "g" "e" "a" "l" "g" "r" "l" "l" "v"
 [35] "v" "y" "p" "w" "t" "q" "r" "f" "f" "e" "s" "f" "g" "d" "l" "s" "t"
 [52] "p" "d" "a" "v" "m" "g" "n" "p" "k" "v" "k" "a" "h" "g" "k" "k" "v"
 [69] "l" "g" "a" "f" "s" "d" "g" "l" "a" "h" "l" "d" "n" "l" "k" "g" "t"
 [86] "f" "a" "t" "l" "s" "e" "l" "h" "c" "d" "k" "l" "h" "v" "d" "p" "e"
[103] "n" "f" "r" "l" "l" "g" "n" "v" "l" "v" "c" "v" "l" "a" "h" "h" "f"
[120] "g" "k" "e" "f" "t" "p" "p" "v" "q" "a" "a" "y" "q" "k" "v" "v" "a"
[137] "g" "v" "a" "n" "a" "l" "a" "h" "k" "y" "h"
attr(,"name")
[1] "sp|P68871|HBB_HUMAN"
attr(,"Annot")
[1] ">sp|P68871|HBB_HUMAN"
attr(,"class")
[1] "SeqFastadna"

$`sp|P68873|HBB_PANTR`
  [1] "m" "v" "h" "l" "t" "p" "e" "e" "k" "s" "a" "v" "t" "a" "l" "w" "g"
 [18] "k" "v" "n" "v" "d" "e" "v" "g" "g" "e" "a" "l" "g" "r" "l" "l" "v"
 [35] "v" "y" "p" "w" "t" "q" "r" "f" "f" "e" "s" "f" "g" "d" "l" "s" "t"
 [52] "p" "d" "a" "v" "m" "g" "n" "p" "k" "v" "k" "a" "h" "g" "k" "k" "v"
 [69] "l" "g" "a" "f" "s" "d" "g" "l" "a" "h" "l" "d" "n" "l" "k" "g" "t"
 [86] "f" "a" "t" "l" "s" "e" "l" "h" "c" "d" "k" "l" "h" "v" "d" "p" "e"
[103] "n" "f" "r" "l" "l" "g" "n" "v" "l" "v" "c" "v" "l" "a" "h" "h" "f"
[120] "g" "k" "e" "f" "t" "p" "p" "v" "q" "a" "a" "y" "q" "k" "v" "v" "a"
[137] "g" "v" "a" "n" "a" "l" "a" "h" "k" "y" "h"
attr(,"name")
[1] "sp|P68873|HBB_PANTR"
attr(,"Annot")
[1] ">sp|P68873|HBB_PANTR"
attr(,"class")
[1] "SeqFastadna"

$`sp|P02112|HBB_CHICK`
  [1] "m" "v" "h" "w" "t" "a" "e" "e" "k" "q" "l" "i" "t" "g" "l" "w" "g"
 [18] "k" "v" "n" "v" "a" "e" "c" "g" "a" "e" "a" "l" "a" "r" "l" "l" "i"
 [35] "v" "y" "p" "w" "t" "q" "r" "f" "f" "a" "s" "f" "g" "n" "l" "s" "s"
 [52] "p" "t" "a" "i" "l" "g" "n" "p" "m" "v" "r" "a" "h" "g" "k" "k" "v"
 [69] "l" "t" "s" "f" "g" "d" "a" "v" "k" "n" "l" "d" "n" "i" "k" "n" "t"
 [86] "f" "s" "q" "l" "s" "e" "l" "h" "c" "d" "k" "l" "h" "v" "d" "p" "e"
[103] "n" "f" "r" "l" "l" "g" "d" "i" "l" "i" "i" "v" "l" "a" "a" "h" "f"
[120] "s" "k" "d" "f" "t" "p" "e" "c" "q" "a" "a" "w" "q" "k" "l" "v" "r"
[137] "v" "v" "a" "h" "a" "l" "a" "r" "k" "y" "h"
attr(,"name")
[1] "sp|P02112|HBB_CHICK"
attr(,"Annot")
[1] ">sp|P02112|HBB_CHICK"
attr(,"class")
[1] "SeqFastadna"

$`sp|Q90486|HBB1_DANRE`
  [1] "m" "v" "e" "w" "t" "d" "a" "e" "r" "t" "a" "i" "l" "g" "l" "w" "g"
 [18] "k" "l" "n" "i" "d" "e" "i" "g" "p" "q" "a" "l" "s" "r" "c" "l" "i"
 [35] "v" "y" "p" "w" "t" "q" "r" "y" "f" "a" "t" "f" "g" "n" "l" "s" "s"
 [52] "p" "a" "a" "i" "m" "g" "n" "p" "k" "v" "a" "a" "h" "g" "r" "t" "v"
 [69] "m" "g" "g" "l" "e" "r" "a" "i" "k" "n" "m" "d" "n" "v" "k" "n" "t"
 [86] "y" "a" "a" "l" "s" "v" "m" "h" "s" "e" "k" "l" "h" "v" "d" "p" "d"
[103] "n" "f" "r" "l" "l" "a" "d" "c" "i" "t" "v" "c" "a" "a" "m" "k" "f"
[120] "g" "q" "a" "g" "f" "n" "a" "d" "v" "q" "e" "a" "w" "q" "k" "f" "l"
[137] "a" "v" "v" "v" "s" "a" "l" "c" "r" "q" "y" "h"
attr(,"name")
[1] "sp|Q90486|HBB1_DANRE"
attr(,"Annot")
[1] ">sp|Q90486|HBB1_DANRE"
attr(,"class")
[1] "SeqFastadna"

$`sp|P02062|HBB_HORSE`
  [1] "v" "q" "l" "s" "g" "e" "e" "k" "a" "a" "v" "l" "a" "l" "w" "d" "k"
 [18] "v" "n" "e" "e" "e" "v" "g" "g" "e" "a" "l" "g" "r" "l" "l" "v" "v"
 [35] "y" "p" "w" "t" "q" "r" "f" "f" "d" "s" "f" "g" "d" "l" "s" "n" "p"
 [52] "g" "a" "v" "m" "g" "n" "p" "k" "v" "k" "a" "h" "g" "k" "k" "v" "l"
 [69] "h" "s" "f" "g" "e" "g" "v" "h" "h" "l" "d" "n" "l" "k" "g" "t" "f"
 [86] "a" "a" "l" "s" "e" "l" "h" "c" "d" "k" "l" "h" "v" "d" "p" "e" "n"
[103] "f" "r" "l" "l" "g" "n" "v" "l" "v" "v" "v" "l" "a" "r" "h" "f" "g"
[120] "k" "d" "f" "t" "p" "e" "l" "q" "a" "s" "y" "q" "k" "v" "v" "a" "g"
[137] "v" "a" "n" "a" "l" "a" "h" "k" "y" "h"
attr(,"name")
[1] "sp|P02062|HBB_HORSE"
attr(,"Annot")
[1] ">sp|P02062|HBB_HORSE"
attr(,"class")
[1] "SeqFastadna"

> dotPlot(mseq[[1]],mseq[[2]],col=c("gray","red"),xlab="Human",ylab="Chimp")

image

---[p.75]------------------------------------------------------------------------------------------

Multiple sequence alignment

注:fastaMSA.fastaをダウンロードしてください


> install.packages("muscle")
Installing package into ‘C:/Users/ekaminuma/Documents/R/win-library/3.2’
(as ‘lib’ is unspecified)
Warning in install.packages :
  package ‘muscle’ is not available (for R version 3.2.2)

> library(muscle)


> msa<-read.fasta(file="fastaMSA.fasta")
> 
> msa
$`sp|P00403|COX2_HUMAN`
 略
attr(,"name")
[1] "sp|P00403|COX2_HUMAN"
attr(,"Annot")
[1] ">sp|P00403|COX2_HUMAN"
attr(,"class")
[1] "SeqFastadna"

$`sp|P68530|COX2_BOVIN`
略 attr(,"name")
[1] "sp|P68530|COX2_BOVIN"
attr(,"Annot")
[1] ">sp|P68530|COX2_BOVIN"
attr(,"class")
[1] "SeqFastadna"

$`sp|P00406|COX2_RAT`
略
attr(,"name")
[1] "sp|P00406|COX2_RAT"
attr(,"Annot")
[1] ">sp|P00406|COX2_RAT"
attr(,"class")
[1] "SeqFastadna"

$`sp|P24894|COX2_CAEEL`
  [1] "m" "n" "n" "f" "f" "q" "g" "y" "n" "l" "l" "f" "q" "h"
略
[225] "w" "c" "f" "g" "t" "m" "e"
attr(,"name")
[1] "sp|P24894|COX2_CAEEL"
attr(,"Annot")
[1] ">sp|P24894|COX2_CAEEL"
attr(,"class")
[1] "SeqFastadna"

$`sp|P48660|COX2_HORSE`
略
attr(,"name")
[1] "sp|P48660|COX2_HORSE"
attr(,"Annot")
[1] ">sp|P48660|COX2_HORSE"
attr(,"class")
[1] "SeqFastadna"

$`sp|Q38PR9|COX2_MAMPR`
  [1] "m" "a" "y" "p" "l" "q" "l" "g" "f" "q" "d" "a" "t" "s"
略
[225] "s" "l" "a"
attr(,"name")
[1] "sp|Q38PR9|COX2_MAMPR"
attr(,"Annot")
[1] ">sp|Q38PR9|COX2_MAMPR"
attr(,"class")
[1] "SeqFastadna"

$`sp|Q9TA26|COX2_LOXAF`
  [1] "m" "a" "y" "p" "l" "q" "l" "g" "f" "q" "d" "a" "t" "s"
略
[225] "s" "l" "a" "q"
attr(,"name")
[1] "sp|Q9TA26|COX2_LOXAF"
attr(,"Annot")
[1] ">sp|Q9TA26|COX2_LOXAF"
attr(,"class")
[1] "SeqFastadna"

$`sp|P26456|COX2_GORGO`
  [1] "m" "a" "h" "a" "a" "q" "v" "g" "l" "q" "d" "a" "t" "s"
略
[225] "f" "a" "l"
attr(,"name")
[1] "sp|P26456|COX2_GORGO"
attr(,"Annot")
[1] ">sp|P26456|COX2_GORGO"
attr(,"class")
[1] "SeqFastadna"

$`sp|P48890|COX2_FELCA`
  [1] "m" "a" "y" "p" "f" "q" "l" "g" "f" "q" "d" "a" "t" "s"
 略
225] "s" "m" "l"
attr(,"name")
[1] "sp|P48890|COX2_FELCA"
attr(,"Annot")
[1] ">sp|P48890|COX2_FELCA"
attr(,"class")
[1] "SeqFastadna"

$`sp|P00405|COX2_MOUSE`
  [1] "m" "a" "y" "p" "f" "q" "l" "g" "l" "q" "d" "a" "t" "s"
 略
[225] "s" "m" "i"
attr(,"name")
[1] "sp|P00405|COX2_MOUSE"
attr(,"Annot")
[1] ">sp|P00405|COX2_MOUSE"
attr(,"class")
[1] "SeqFastadna"


> msa<-readAAStringSet("fastaMSA.fasta",format="fasta")
> 
> msa
  A AAStringSet instance of length 10
     width seq                        names               
 [1]   227 MAHAAQVGLQDA...KIFEMGPVFTL sp|P00403|COX2_HUMAN
 [2]   227 MAYPMQLGFQDA...KYFEKWSASML sp|P68530|COX2_BOVIN
 [3]   227 MAYPFQLGLQDA...KYFENWSASMI sp|P00406|COX2_RAT
 [4]   231 MNNFFQGYNLLF...NFKSWCFGTME sp|P24894|COX2_CAEEL
 [5]   227 MAYPFQLGFQDA...KHFEEWSASML sp|P48660|COX2_HORSE
 [6]   227 MAYPLQLGFQDA...KYFESWSASLA sp|Q38PR9|COX2_MAMPR
 [7]   228 MAYPLQLGFQDA...YFENWSASLAQ sp|Q9TA26|COX2_LOXAF
 [8]   227 MAHAAQVGLQDA...KIFEMGPVFAL sp|P26456|COX2_GORGO
 [9]   227 MAYPFQLGFQDA...TYFEKWSASML sp|P48890|COX2_FELCA
[10]   227 MAYPFQLGLQDA...KYFENWSASMI sp|P00405|COX2_MOUSE
 


 
> msa_aln<-muscle(msa,out=NULL,quiet=FALSE)
Option -out must have value
Invalid option "out"

MUSCLE v3.8.31 by Robert C. Edgar

http://www.drive5.com/muscle
This software is donated to the public domain.
Please cite: Edgar, R.C. Nucleic Acids Res 32(5), 1792-97.


Basic usage

    muscle -in <inputfile> -out <outputfile>

Common options (for a complete list please see the User Guide):

    -in <inputfile>    Input file in FASTA format (default stdin)
    -out <outputfile>  Output alignment in FASTA format (default stdout)
    -diags             Find diagonals (faster for similar sequences)
    -maxiters <n>      Maximum number of iterations (integer, default 16)
    -maxhours <h>      Maximum time to iterate in hours (default no limit)
    -html              Write output in HTML format (default FASTA)
    -msf               Write output in GCG MSF format (default FASTA)
    -clw               Write output in CLUSTALW format (default FASTA)
    -clwstrict         As -clw, with 'CLUSTAL W (1.81)' header
    -log[a] <logfile>  Log to file (append if -loga, overwrite if -log)
    -quiet             Do not write progress messages to the screen
    -version           Display version information and exit

Without refinement (very fast, avg accuracy similar to T-Coffee): -maxiters 2
Fastest possible (amino acids): -maxiters 1 -diags -sv -distance1 kbit20_3
Fastest possible (nucleotides): -maxiters 1 -diags
file1d10f04a91 10 seqs, max length 141, avg  length 133
184 MB(1%)00:00:00                Iter   1  100.00%  K-mer dist pass 1
184 MB(1%)00:00:00                Iter   1  100.00%  K-mer dist pass 2
184 MB(1%)00:00:00                Iter   1  100.00%  Align node       
184 MB(1%)00:00:00                Iter   1  100.00%  Root alignment
184 MB(1%)00:00:00                Iter   2  100.00%  Refine tree   
184 MB(1%)00:00:00                Iter   2  100.00%  Root alignment
1806 MB(14%)00:00:00                Iter   2  100.00%  Root alignment
1806 MB(14%)00:00:00                Iter   3  100.00%  Refine biparts
1806 MB(14%)00:00:00                Iter   4  100.00%  Refine biparts00:00:00                Iter   4   76.47%  Refine biparts
1806 MB(14%)00:00:00                Iter   5  100.00%  Refine biparts
1806 MB(14%)00:00:00                Iter   6  100.00%  Refine biparts
1806 MB(14%)00:00:00                Iter   7  100.00%  Refine biparts
1806 MB(14%)00:00:00                Iter   8  100.00%  Refine biparts
1806 MB(14%)00:00:00                Iter   9  100.00%  Refine biparts
1806 MB(14%)00:00:00                Iter  10  100.00%  Refine biparts
1806 MB(14%)00:00:00                Iter  10  100.00%  Refine biparts


注:以下、本と違います

> AAStr<-as(msa_aln,"AAStringSet") 
> writeXStringSet(AAStr,file="msa_aln.fa")

保存したmsa_aln.faファイルの中身

image



---[p.77]-------------------------------------------------------------------------------------------

Phylogenetic analysis and Tree plotting

> install.packages("ape")
Installing package into ‘C:/Users/ekaminuma/Documents/R/win-library/3.2’
略
> library(ape)
略

> myset<-c("U15717","U15718","U15719","U15720","U15721","U15722","U15723","U15724")
> 
> seqall<-read.GenBank(myseq)
Error in file(file, "r") : cannot open the connection
In addition: Warning message:
In file(file, "r") : cannot open: HTTP status was '400 Bad Request'
> 
> 
> seqall<-read.GenBank(myset)
> 
> seqall
8 DNA sequences in binary format stored in a list.

All sequences of same length: 1045 

Labels: U15717 U15718 U15719 U15720 U15721 U15722 ...

Base composition:
    a     c     g     t 
0.267 0.351 0.134 0.247 
> 
> mydist<-dist.dna(seqall)
> 
> mydist
             U15717       U15718       U15719       U15720       U15721
U15718 0.0963968720                                                    
U15719 0.0519601191 0.0821667377                                       
U15720 0.0155113115 0.0932071967 0.0489261666                          
U15721 0.0624466459 0.0797534702 0.0498979851 0.0551246867             
U15722 0.0164965334 0.0920721224 0.0478790458 0.0009578547 0.0540676038
U15723 0.0699980300 0.0885445872 0.0647927035 0.0636627810 0.0396102719
U15724 0.0722326361 0.0842641651 0.0562718695 0.0615634135 0.0397830924
             U15722       U15723
U15718                          
U15719                          
U15720                          
U15721                          
U15722                          
U15723 0.0625875118             
U15724 0.0604916017 0.0417080298
> 
> phylo<-triangMtd(mydist)
> 
> phylo

Phylogenetic tree with 8 tips and 6 internal nodes.

Tip labels:
	U15717, U15718, U15719, U15720, U15721, U15722, ...

Unrooted; includes branch lengths.



> plot(phylo,type="phylogram",edge.color="red",main="A:Phylogram")
> 

image



> plot(phylo,type="cladogram",edge.color="red",main="B:Cladogram")
> 

image




> plot(phylo,type="fan",edge.color="red",main="C:Fan")
> 

image



> plot(phylo,type="unrooted",edge.color="red",main="D:Unrooted")
> 

image