-
Notifications
You must be signed in to change notification settings - Fork 6
/
Copy pathlncRNA_next500.R
194 lines (153 loc) · 7.42 KB
/
lncRNA_next500.R
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
#
#
# fastqc -t 8 -f fastq -o lncRNA_QC /mnt/date/Sequencing/FastQ/totalRNA_2017_03_14/A*.fastq
library(gplots)
library(xlsx)
library(Rsubread)
library(edgeR)
library(limma)
library(org.Hs.eg.db)
library(DESeq2)
library(gplots)
library(genefilter)
library(RColorBrewer)
library(org.Rn.eg.db)
library(cluster)
library(factoextra)
library(clusterProfiler)
library(pathview)
library(sva)
library(systemPipeR)
library(rtracklayer)
library(stringr)
library(GenomicFeatures)
setwd('/home/zhenyisong/biodata/wanglab/wangdata/lncRNAClinic/rsubread')
setwd('D:\\wangli_data\\Rdata')
load('lincRNA_SE.Rdata')
human.genome_ref.path <- "/mnt/date/igenomes/Homo_sapiens/UCSC/hg19/Sequence/WholeGenomeFasta/genome.fa"
human.hg19.lncGene.GTF <- '/mnt/date/genomelib/annotation/human_hg19_GenePusLncRNA.gtf'
human.hg19.lncRNA.GTF <- '/mnt/date/genomelib/annotation/NONCODE2016.hg19.gtf'
human.hg38.lncRNA.fpkm <- '/mnt/date/biodata/wanglab/wangdata/lncRNAClinic/annotation/humanLncGene.fpkm'
setwd('/mnt/date/Sequencing/FastQ/totalRNA_2017_03_14')
reads.files.names <- list.files(pattern = '*.fastq$')[1:16]
raw.data.path <- '/mnt/date/Sequencing/FastQ/totalRNA_2017_03_14'
setwd('/home/zhenyisong/biodata/wanglab/wangdata/lncRNAClinic')
#unlink('rsubread', force = TRUE, recursive = TRUE)
#dir.create('rsubread')
output.path <- '/home/zhenyisong/biodata/wanglab/wangdata/lncRNAClinic/rsubread'
setwd('/home/zhenyisong/biodata/wanglab/wangdata/lncRNAClinic/rsubread')
human.base <- 'h19_index'
read.path.1 <- reads.files.names[grep("R1",reads.files.names )]
lincRNA.outputs.files <- paste0(output.path,'/', read.path.1,'.bam')
read.path.1 <- paste0(raw.data.path, '/',read.path.1)
read.path.2 <- reads.files.names[grep("R2",reads.files.names )]
read.path.2 <- paste0(raw.data.path, '/',read.path.2)
# generate QC report
name.vector.args <- c(read.path.1, read.path.2)
sample.names <- sub('_001.fastq','',basename(name.vector.args))
names(name.vector.args) <- sample.names
#rawdata.list <- seeFastq(fastq = name.vector.args, batchsize = 10000, klength = 8)
#pdf("fastqReport.pdf", height = 18, width = 4 * length(rawdata.list))
#seeFastqPlot(rawdata.list )
#dev.off()
#
#buildindex( basename = human.base, reference = human.genome_ref.path )
#align( index = human.base,
# readfile1 = read.path.1,
# readfile2 = read.path.2,
# input_format = "FASTQ",
# type = 'rna',
# output_file = lincRNA.outputs.files,
# output_format = "BAM",
# PE_orientation = 'fr',
# nthreads = 8,
# indels = 1,
# maxMismatches = 3,
# phredOffset = 33,
# unique = T )
#
align( index = human.base,
readfile1 = read.path.1,
input_format = "FASTQ",
type = 'rna',
output_file = lincRNA.outputs.files,
output_format = "BAM",
nthreads = 15,
indels = 1,
maxMismatches = 3,
phredOffset = 33,
unique = T )
hg19.lncGene <- featureCounts( lincRNA.outputs.files, useMetaFeatures = TRUE,
annot.ext = human.hg19.lncGene.GTF, isGTFAnnotationFile = TRUE,
nthreads = 15, strandSpecific = 2, allowMultiOverlap = TRUE)
gene.counts <- hg19.lncGene$counts
gene.exprs <- DGEList(counts = gene.counts)
gene.exprs <- calcNormFactors(gene.exprs)
colnames(gene.counts) <- c('70OM_A','70PM_A','7OM_A','7PM_A','70OM_B','70PM_B','7OM_B','7PM_B')
write.csv(gene.counts, file = 'lncRNA_PEmodel.csv')
dge.tmm <- t(t(gene.exprs$counts) * gene.exprs$samples$norm.factors)
log.counts <- log(dge.tmm + 1.0)
colnames(log.counts) <- c('70OM_A','70PM_A','7OM_A','7PM_A','70OM_B','70PM_B','7OM_B','7PM_B')
write.csv(log.counts, file = 'lncRNA_PEmodel.logTrans.csv')
log.genes <- cbind( log.counts[,1] - log.counts[,3], log.counts[,1]- log.counts[,2],
log.counts[,3] - log.counts[,4], log.counts[,2] - log.counts[,4],
log.counts[,5] - log.counts[,7], log.counts[,5] - log.counts[,6],
log.counts[,7] - log.counts[,8], log.counts[,6] - log.counts[,8])
colnames(log.genes) <- c('70OM-TOM/A','70OM-70PM/A','TOM-TPM/A','70PM-TPM/A','70OM-TOM/B','70OM-70PM/B','TOM-TPM/B','70PM-TPM/B')
write.csv(log.genes, file = 'lncRNA_PEmodel.LFC.csv')
# post-processing QC
#---
##hg19.lnRNA.GTF.transcript <- import.gff( human.hg19.lncRNA.GTF, format = "gtf", genome = "hg19",
## feature.type = "transcript", sequenceRegionsAsSeqinfo = TRUE)
##hg19.lncRNA.gene.db <- makeTxDbFromGFF(human.hg19.lncRNA.GTF, format = 'gtf')
##gene.len <- width( ranges(hg19.lnRNA.GTF) )
lncRNA.index <- grepl( 'NONHSAG', rownames(gene.counts), ignore.case = FALSE)
hg19.lncRNA.names <- sub("\\.\\d+$","", rownames(gene.counts)[lncRNA.index])
hg19.lncRNA.len <- hg19.lncGene$annotation$Length[lncRNA.index]
hg19.lncRNA.exprs <- gene.counts[lncRNA.index,]
hg19.lncRNA.rpkm <- rpkm(hg19.lncRNA.exprs, gene.length = hg19.lncRNA.len )
hg38.lncRNA.pub.fpkm <- read.table(human.hg38.lncRNA.fpkm, header = F)
# this is tedous long time
# should use the do.call instead of
#----
results <- hg38.lncRNA.pub.fpkm$V3
hg38.lncRNA.fpkm.matrix <- NULL
for( i in (1:length(results))) {
hg38.lncRNA.fpkm.matrix <- rbind(hg38.lncRNA.fpkm.matrix , as.double(str_split(results[i],",")[[1]]) )
}
rownames(hg38.lncRNA.fpkm.matrix) <- hg38.lncRNA.pub.fpkm$V1
match.index <- match(hg19.lncRNA.names,hg38.lncRNA.pub.fpkm$V1)
hg19.lncRNA.allTissue.fpkm <- hg38.lncRNA.fpkm.matrix[match.index,]
setwd('/home/zhenyisong/biodata/wanglab/wangdata/lncRNAClinic/annotation')
tissues.colnames <- read.table(file = 'lncRNA_tissue.txt', header = F, comment.char = "#")
samplePlusAll.fpkm <- cbind(hg19.lncRNA.allTissue.fpkm, hg19.lncRNA.rpkm)
mean(is.na(hg19.lncRNA.allTissue.fpkm))
# 0.005390896
# http://stackoverflow.com/questions/3798998/cor-shows-only-na-or-1-for-correlations-why
cor(log(samplePlusAll.fpkm + 1), method = 'spearman', use = "complete.obs")
#setwd('/home/zhenyisong/biodata/wanglab/wangdata/lncRNAClinic/rsubread')
#save.image(file = 'lincRNA_SE.Rdata')
#quit('no')
#
# branch -- see the code at
setwd('E:\\FuWai\\wangli.lab\\lncRNAClinic')
hisat2.counts <- read.csv( file = 'transcript_count_matrix.csv',
header = TRUE, row.names = 1)
#group.A.counts <- hisat2.counts[,1:4] + 1
#group.B.counts <- hisat2.counts[,5:8] + 1
gene.exprs <- DGEList(counts = hisat2.counts)
gene.exprs <- calcNormFactors(gene.exprs)
norm.counts <- t(t(gene.exprs$counts) * gene.exprs$samples$norm.factors)
norm.counts <- norm.counts + 1
keytypes(org.Hs.eg.db)
gene.refseq <- rownames(hisat2.counts)
columns <- c("ENTREZID","SYMBOL", "REFSEQ", "GENENAME");
GeneInfo <- select( org.Hs.eg.db, keys = as.character(gene.refseq),
keytype = "REFSEQ", columns = columns);
norm.counts.ann <- cbind(norm.counts,GeneInfo )
norm.counts.log.ann <- cbind(log(norm.counts), GeneInfo)
setwd("C:\\Users\\Yisong\\Desktop")
write.csv( norm.counts.ann, file = 'lncRNAClinic.csv')
write.csv( norm.counts.ann, file = 'lncRNAClinic.log.txt')
# cannot use the limma to process the data and caculate the p.value
#---