Skip to content

Latest commit

 

History

History
401 lines (308 loc) · 13 KB

160615.md

File metadata and controls

401 lines (308 loc) · 13 KB
========[p.241]====================

Reading alignment data

> download.file(url="http://genome.ucsc.edu/goldenPath/help/examples/bamExample.bam",destfile="bamExample.bam")
trying URL 'http://genome.ucsc.edu/goldenPath/help/examples/bamExample.bam'
Content type 'application/octet-stream' length 2281952 bytes (2.2 MB)
downloaded 2.2 MB

> library(Rsamtools)
 要求されたパッケージ S4Vectors をロード中です 
 要求されたパッケージ stats4 をロード中です 
Creating a generic function for ‘nchar’ from package ‘base’ in package ‘S4Vectors’
 要求されたパッケージ IRanges をロード中です 

 次のパッケージを付け加えます: ‘IRanges’ 

 以下のオブジェクトは ‘package:R.oo’ からマスクされています: 

     trim 

 要求されたパッケージ GenomeInfoDb をロード中です 
 要求されたパッケージ GenomicRanges をロード中です 
 要求されたパッケージ XVector をロード中です 
 要求されたパッケージ Biostrings をロード中です 

 次のパッケージを付け加えます: ‘Biostrings’ 

 以下のオブジェクトは ‘package:graph’ からマスクされています: 

     complement 

> bam <- scanBam("bamExample.bam")

読み込みできないので、DRAのAnalysisからダウンロード
(589kB)
ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/DRA002/DRA002949/DRZ006812/provisional/Im2653_comp20761_c0_seq1.bam

> bam <- scanBam("Im2653_comp20761_c0_seq1.bam") 


> names(bam[[1]]) 
 [1] "qname"  "flag"   "rname"  "strand" "pos"    "qwidth"
 [7] "mapq"   "cigar"  "mrnm"   "mpos"   "isize"  "seq"   
[13] "qual"  



> bam

: (長いので略)
[870]  123 -281 -142 -313 -376  169 -154 -234 -123 -306 -305
[881]  115 -194 -169 -115 -169 -270 -322  174  109 -202  128
[892] -211 -189 -109 -215 -241 -293 -169 -128 -265 -285 -174

[[1]]$seq
  A DNAStringSet instance of length 902
      width seq
  [1]   101 CACACACACACACACACATTTGGC...CCGTGAGTTGCTCCACGCAGTGC
  [2]   101 CACACACACACACACACATTTGGC...CCGTGAGTTGCTCCACGCAGTGC
  [3]   101 CACACACACACATACATTTGGCCC...GTGAGTTGCTCCACGCAGTGCCA
  [4]   101 CACACATTTGGCCCTGCTGTCTGA...CCACGCAGTGCCAAGCTGAGTAC
  [5]   101 CTGTCTGATGCAGCAGGGCCAAAA...TGAGTACCTAGCCAGATGCCCAT
  ...   ... ...
[898]   101 TTCATGCGAAACATACGACATGCC...GAGGCCATGAGCAGGACTCCTGG
[899]   101 TACGACATGCCGAGCACAGTTTCA...GGACTCCTGGTGGTTGGGCCCTG
[900]   101 ACGACATGCCGAGCACAGTTTCAC...GACTCCTGGTGGTTGGGCCCTGG
[901]   101 ACAGTTTCACTGCACTGCAAATGG...GGGCCCTGGTTCATTGATTCAAC
[902]   101 TGGCCTAACAATCATCCTAGCATA...ACTAGACTATCCCGGCTAAAAAC

[[1]]$qual
  A PhredQuality instance of length 902
      width seq
  [1]   101 @CCFFFFFHHHHHJBHIIJIJIJI...DDBBBBDADDDDDCCDDBDB8CC
  [2]   101 @@?DDDDDDFABFGGGFHEHEG<E...>@'8?824>3:::4>?BB<B<>@
  [3]   101 CCCFFFFFHHHHHJJJJJJJIJJJ...DDDDDDEDDDDDDDDDDDDDDDD
  [4]   101 CCCFFFFFHHHHHJJJJJJJIJJJ...DDDDDDBBBDDDCDDDDDCDCDD
  [5]   101 CCCFFFFFGHHHHDFGIIJJJIJI...DDDDCCACDDCDDDDDDCCDDDC
  ...   ... ...
[898]   101 DDDDDDDDEEDDDEDDEEFFFHHJ...JJJJJJJJJJHHHHHFFFFFBCB
[899]   101 <[email protected]@@B
[900]   101 8BBBBBBBBBBBBA;BA>@;;>;D...:1C3BGEIGEFFFFFDDDDD??<
[901]   101 ADDDDEDDCDDDDDDECEEEEDEE...IIGBJIJJJJHGGHHFFFFFCC@
[902]   101 8A:3@4(3CACA:@:9@A@CDB>>...GFAHDGEHAGFFHDF>DD?;@@@

 
> countBam("Im2653_comp20761_c0_seq1.bam") 
  space start end width                         file records
1    NA    NA  NA    NA Im2653_comp20761_c0_seq1.bam     902
  nucleotides
1       91102




属性情報の取得 <what>
> what <- c("rname", "strand", "pos", "qwidth", "seq") 
> 
> param <- ScanBamParam(what=what)
> 
> bam2 <- scanBam("Im2653_comp20761_c0_seq1.bam", param=param) 
> 
> names(bam2[[1]]) 
[1] "rname"  "strand" "pos"    "qwidth" "seq"   



> 

> 

> bam_df <- do.call("DataFrame", bam[[1]]) 
> 
> bam_df
DataFrame with 902 rows and 13 columns
                                         qname      flag
                                   <character> <integer>
1   HWI-ST917:264:C3FU7ACXX:7:1206:13646:90191       163
2   HWI-ST917:264:C3FU7ACXX:7:2111:16640:59593       163
3   HWI-ST917:264:C3FU7ACXX:7:2208:14945:63984       163
4     HWI-ST917:264:C3FU7ACXX:7:1205:1669:5278        99
5   HWI-ST917:264:C3FU7ACXX:7:1216:19870:55898        99
...                                        ...       ...
898 HWI-ST917:264:C3FU7ACXX:7:2310:14647:34385       147
899  HWI-ST917:264:C3FU7ACXX:7:1113:5351:60085       147
900   HWI-ST917:264:C3FU7ACXX:7:1314:9257:5965       147
901  HWI-ST917:264:C3FU7ACXX:7:2103:8877:36268        83
902 HWI-ST917:264:C3FU7ACXX:7:2116:20880:93056        83
                rname   strand       pos    qwidth      mapq
             <factor> <factor> <integer> <integer> <integer>
1   comp20761_c0_seq1        +         1       101       100
2   comp20761_c0_seq1        +         1       101       100
3   comp20761_c0_seq1        +         3       101       100
4   comp20761_c0_seq1        +        13       101       100
5   comp20761_c0_seq1        +        29       101       100
...               ...      ...       ...       ...       ...
898 comp20761_c0_seq1        -      2731       101       100
899 comp20761_c0_s

: 以下略
> table(bam_df$rname == '21' & bam_df$flag == 16) 

FALSE 
  902 

-------[p.244]-------------------------

Preprocessing the raw NGS data

<準備>
ftp://ftp.ddbj.nig.ac.jp/ddbj_database/dra/fastq/SRA000/SRA000241/SRX000122/より直接、次の2つのSRRデータをダウンロードする。

SRR000657.fastq.bz2
SRR000648.fastq.bz2

> library(R.utils) 

<bz2を解凍>
> bunzip2(filename=list.files(pattern = ".fastq.bz2$")[1]) 

> library(ShortRead) 

> myFiles <- list.files(getwd(), "fastq", full=TRUE) 
> 
> myFiles
[1] "C:/Users/ekaminuma/Documents/SRR000648.fastq"    
[2] "C:/Users/ekaminuma/Documents/SRR000657.fastq.bz2"
[3] "C:/Users/ekaminuma/Documents/SRR351672.fastq.gz" 





参考
http://bi.biopapyrus.net/transcriptome/qc/shortread.html



> help(package="ShortRead")
> 
> 
> ?readFastq
No documentation for ‘readFastq’ in specified packages and libraries:
you could try ‘??readFastq’


> file <- system.file("extdata", "SRR000648.fastq", package = "Biostrings")
> 


> install.packages("C:/Users/ekaminuma/Documents/lambda.r_1.1.7")
Installing package into ‘C:/Users/ekaminuma/Documents/R/win-library/3.2’
(as ‘lib’ is unspecified)
Warning in install.packages :
  package ‘C:/Users/ekaminuma/Documents/lambda.r_1.1.7’ is not available (for R version 3.2.5)

> 




---------------------------------------
---
Analyzing RNAseq data with the edgeR package


edgeR:参考URL
http://bi.biopapyrus.net/transcriptome/de-analysis/examples/2g-edger.html

> source("http://bioconductor.org/biocLite.R")
Bioconductor version 3.1 (BiocInstaller 1.18.5), ?biocLite for help
A newer version of Bioconductor is available for this version of R, ?BiocUpgrade for help
> 
> biocLite("edgeR")
BioC_mirror: http://bioconductor.org

> biocLite("goseq")
BioC_mirror: http://bioconductor.org
> library(edgeR)
 要求されたパッケージ limma をロード中です 
> library(goseq)


> dat1 <- read.table(system.file("extdata","Li_sum.txt",package='goseq'),sep='\t',header=TRUE,stringsAsFactors=FALSE,row.names=1)
> 
> head(dat1)
                lane1 lane2 lane3 lane4 lane5 lane6 lane8
ENSG00000215688     0     0     0     0     0     0     0
ENSG00000215689     0     0     0     0     0     0     0
ENSG00000220823     0     0     0     0     0     0     0
ENSG00000242499     0     0     0     0     0     0     0
ENSG00000224938     0     0     0     0     0     0     0
ENSG00000239242     0     0     0     0     0     0     0

> 

> dim(dat1)
[1] 49506     7

> 

> summary(dat1)
     lane1              lane2              lane3         
 Min.   :    0.00   Min.   :    0.00   Min.   :    0.00  
 1st Qu.:    0.00   1st Qu.:    0.00   1st Qu.:    0.00  
 Median :    0.00   Median :    0.00   Median :    0.00  
 Mean   :   23.81   Mean   :   27.98   Mean   :   34.67  
 3rd Qu.:    3.00   3rd Qu.:    3.00   3rd Qu.:    4.00  
 Max.   :20459.00   Max.   :23220.00   Max.   :25287.00  
     lane4              lane5              lane6         
 Min.   :    0.00   Min.   :    0.00   Min.   :    0.00  
 1st Qu.:    0.00   1st Qu.:    0.00   1st Qu.:    0.00  
 Median :    0.00   Median :    0.00   Median :    0.00  
 Mean   :   35.71   Mean   :   42.98   Mean   :   43.27  
 3rd Qu.:    4.00   3rd Qu.:    4.00   3rd Qu.:    4.00  
 Max.   :25154.00   Max.   :29335.00   Max.   :29815.00  
     lane8         
 Min.   :    0.00  
 1st Qu.:    0.00  
 Median :    0.00  
 Mean   :   16.49  
 3rd Qu.:    1.00  
 Max.   :16361.00  



> treat1 <- factor(rep(c("Control","Treatment"),times=c(4,3)))
> treat1
[1] Control   Control   Control   Control   Treatment Treatment Treatment
Levels: Control Treatment
> 

Treatment/Controlラベルと共に変数作成
> myDG <- DGEList(dat1,lib.size=colSums(dat1),group=treat1)

> 
> 
> myDG
An object of class "DGEList"
$counts
                lane1 lane2 lane3 lane4 lane5 lane6 lane8
ENSG00000215688     0     0     0     0     0     0     0
ENSG00000215689     0     0     0     0     0     0     0
ENSG00000220823     0     0     0     0     0     0     0
ENSG00000242499     0     0     0     0     0     0     0
ENSG00000224938     0     0     0     0     0     0     0
49501 more rows ...

$samples
          group lib.size norm.factors
lane1   Control  1178832            1
lane2   Control  1384945            1
lane3   Control  1716355            1
lane4   Control  1767927            1
lane5 Treatment  2127868            1
lane6 Treatment  2142158            1
lane8 Treatment   816171            1


> 
分散を推定する
> disp1<-estimateCommonDisp(myDG)
> disp1
An object of class "DGEList"
$counts
                lane1 lane2 lane3 lane4 lane5 lane6 lane8
ENSG00000215688     0     0     0     0     0     0     0
ENSG00000215689     0     0     0     0     0     0     0
ENSG00000220823     0     0     0     0     0     0     0
ENSG00000242499     0     0     0     0     0     0     0
ENSG00000224938     0     0     0     0     0     0     0
49501 more rows ...

$samples
          group lib.size norm.factors
lane1   Control  1178832            1
lane2   Control  1384945            1
lane3   Control  1716355            1
lane4   Control  1767927            1
lane5 Treatment  2127868            1
lane6 Treatment  2142158            1
lane8 Treatment   816171            1

$common.dispersion
[1] 0.05688364

$pseudo.counts
                       lane1        lane2        lane3        lane4
ENSG00000215688 6.938894e-17 6.938894e-17 6.938894e-17 6.938894e-17
ENSG00000215689 6.938894e-17 6.938894e-17 6.938894e-17 6.938894e-17
ENSG00000220823 6.938894e-17 6.938894e-17 6.938894e-17 6.938894e-17
ENSG00000242499 6.938894e-17 6.938894e-17 6.938894e-17 6.938894e-17
ENSG00000224938 6.938894e-17 6.938894e-17 6.938894e-17 6.938894e-17
                       lane5        lane6        lane8
ENSG00000215688 6.938894e-17 6.938894e-17 6.938894e-17
ENSG00000215689 6.938894e-17 6.938894e-17 6.938894e-17
ENSG00000220823 6.938894e-17 6.938894e-17 6.938894e-17
ENSG00000242499 6.938894e-17 6.938894e-17 6.938894e-17
ENSG00000224938 6.938894e-17 6.938894e-17 6.938894e-17
49501 more rows ...

$pseudo.lib.size
[1] 1516319

$AveLogCPM
[1] 0.330418 0.330418 0.330418 0.330418 0.330418
49501 more elements ...


> 

> dim(disp1)
[1] 49506     7
> 



Fisher正確確率検定で、Treatment/Control間の発現量差統計量を遺伝子毎に求める。
> test1<- exactTest(disp1)
> 

pvalueでソートしてトップ10を返す
> res1 <- topTags(test1,sort.by="PValue")

> res1
Comparison of groups:  Treatment-Control 
                    logFC   logCPM       PValue          FDR
ENSG00000127954 11.557868 6.680748 2.574972e-80 1.274766e-75
ENSG00000151503  5.398963 8.499530 1.781732e-65 4.410321e-61
ENSG00000096060  4.897600 9.446705 7.983756e-60 1.317479e-55
ENSG00000091879  5.737627 6.282646 1.207655e-54 1.494654e-50
ENSG00000132437 -5.880436 7.951910 2.950042e-52 2.920896e-48
ENSG00000166451  4.564246 8.458467 7.126763e-52 5.880292e-48
ENSG00000131016  5.254737 6.607957 1.066807e-51 7.544766e-48
ENSG00000163492  7.085400 5.128514 2.716461e-45 1.681014e-41
ENSG00000113594  4.051053 8.603264 9.272066e-44 5.100255e-40
ENSG00000116285  4.108522 7.864773 6.422468e-43 3.179507e-39

> 
以下、normalize処理を行う場合を計算してみる。

> myDG_N <- calcNormFactors(myDG)
> disp2<-estimateCommonDisp(myDG_N)
> test2<- exactTest(disp2)
> res2 <- topTags(test2,sort.by="PValue")

> res2
Comparison of groups:  Treatment-Control 
                    logFC   logCPM       PValue          FDR
ENSG00000127954 11.607362 6.734264 1.595901e-79 7.900668e-75
ENSG00000151503  5.497511 8.552671 3.162115e-65 7.827182e-61
ENSG00000096060  4.996257 9.498881 1.114759e-59 1.839575e-55
ENSG00000091879  5.832994 6.330300 7.662689e-55 9.483727e-51
ENSG00000166451  4.662131 8.508101 6.705155e-52 6.638908e-48
ENSG00000131016  5.348436 6.655286 1.155963e-51 9.537853e-48
ENSG00000132437 -5.775718 7.913259 9.102128e-50 6.437285e-46
ENSG00000163492  7.172754 5.170649 3.028147e-45 1.873893e-41
ENSG00000113594  4.145879 8.649581 5.891177e-44 3.240540e-40
ENSG00000116285  4.209285 7.914680 2.604681e-43 1.289474e-39