From a70a553965c04eccfce728ef3870d1c9334e6819 Mon Sep 17 00:00:00 2001 From: Trung Nghia Vu Date: Tue, 21 Aug 2018 06:31:28 +0200 Subject: [PATCH] update v0.99.2 --- DESCRIPTION | 2 + NEWS | 4 +- R/BPglm.R | 188 ++++++++++++++++++++++++++------------------ inst/doc/BPSC.html | 6 +- man/BPSC-package.Rd | 2 +- 5 files changed, 119 insertions(+), 83 deletions(-) diff --git a/DESCRIPTION b/DESCRIPTION index e1fe3ef..ac3077f 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -10,3 +10,5 @@ Suggests: knitr License: GPL-3 LazyData: true RoxygenNote: 6.0.1 +NeedsCompilation: no +Packaged: 2018-08-21 04:05:55 UTC; nghiavtr diff --git a/NEWS b/NEWS index f4df00b..8742c17 100644 --- a/NEWS +++ b/NEWS @@ -1,5 +1,5 @@ -Date: 15 Aug 2018 -- Use t-test for weird isoform data, for example all samples of the control group are unexpressed, to avoid NA results in BPglm. Great thanks to Tian Mou for the contribution! +Date: 21 Aug 2018 +- Fix issues for weird isoform data, for example all samples of the control group are unexpressed, to avoid NA results in BPglm. Great thanks to Tian Mou for the contribution! - Use RMarkdown instead of Sweave for vignette document. Date: 24 Feb 2017 - Use quasi-Possion as the family function if the glm fitting with BPfam is not converged diff --git a/R/BPglm.R b/R/BPglm.R index 9f3fa06..8097b27 100644 --- a/R/BPglm.R +++ b/R/BPglm.R @@ -84,72 +84,89 @@ BPglm <- function (data, controlIds, design, coef=2, keepFit=FALSE,minExp=1e-4, i.converged=NA i.bpconverged=NA oo4=NA + par=NA x=data[i,] - control.x=x[controlIds] - bpstat=control.x - if (!is.null(extreme.quant)) bpstat=control.x[control.x <= quantile(control.x,prob=extreme.quant)] - bpstat[bpstat0)>0 - par=getInitParam(bpstat,para.num=4) - if (sum(bpstat>0) > 0.05*length(bpstat)){ - ##### beta-Poisson estimation for control group - oo4=estimateBP(bpstat,para.num=4,tbreak.num=tbreak.num,break.thres=break.thres) + if (!isNearZeroVariance){ + control.x=x[controlIds] + bpstat=control.x + if (!is.null(extreme.quant)) bpstat=control.x[control.x <= quantile(control.x,prob=extreme.quant)] + bpstat[bpstat0] - mypar=getInitParam(bpstat2,para.num=4) - oo.tmp=estimateBP(bpstat2,para.num=4,tbreak.num=tbreak.num,break.thres=break.thres,useExt=useExt,param0=mypar) - if (!is.na(oo.tmp$PVAL)) mypar=oo.tmp$par - oo4e=estimateBP(bpstat,para.num=4,tbreak.num=tbreak.num,break.thres=break.thres,useExt=useExt,param0=mypar) - # select the better model - if (is.na(oo4$PVAL)) oo4=oo4e - else { - if (!is.na(oo4e$PVAL) && oo4e$PVAL > oo4$PVAL && oo4e$X2>=0) oo4=oo4e - } + par=getInitParam(bpstat,para.num=4) + if (sum(bpstat>0) > 0.05*length(bpstat)){ + ##### beta-Poisson estimation for control group + oo4=estimateBP(bpstat,para.num=4,tbreak.num=tbreak.num,break.thres=break.thres) + if (estIntPar){ + bpstat2=bpstat[bpstat>0] + mypar=getInitParam(bpstat2,para.num=4) + oo.tmp=estimateBP(bpstat2,para.num=4,tbreak.num=tbreak.num,break.thres=break.thres,useExt=useExt,param0=mypar) + if (!is.na(oo.tmp$PVAL)) mypar=oo.tmp$par + oo4e=estimateBP(bpstat,para.num=4,tbreak.num=tbreak.num,break.thres=break.thres,useExt=useExt,param0=mypar) + # select the better model + if (is.na(oo4$PVAL)) oo4=oo4e + else { + if (!is.na(oo4e$PVAL) && oo4e$PVAL > oo4$PVAL && oo4e$X2>=0) oo4=oo4e + } + + } + par=oo4$par } - par=oo4$par - } - ##### start GLM - fdat=data.frame(x=x,design[,-1]) - colnames(fdat)=c("x",colnames(design)[-1]) + ##### start GLM + fdat=data.frame(x=x,design[,-1]) + colnames(fdat)=c("x",colnames(design)[-1]) - try({ - alp=par[1];bet=par[2];lam1=par[3];lam2=par[4] - fam0=do.call("BPfam", list(alp=alp, bet=bet, lam1=lam1, lam2=lam2, link = "log")) + try({ + alp=par[1];bet=par[2];lam1=par[3];lam2=par[4] + fam0=do.call("BPfam", list(alp=alp, bet=bet, lam1=lam1, lam2=lam2, link = "log")) - fit=glm(x~.,data=fdat,family=fam0) - i.pval=summary(fit)$coefficients[coef,4] - i.tval=summary(fit)$coefficients[coef,3] - i.bpconverged=fit$converged - ##### if the fitting is not converged, use quassipoisson - i.converged=i.bpconverged - if (!i.converged){ - fit=glm(x~.,data=fdat,family=quasipoisson) + fit=glm(x~.,data=fdat,family=fam0) i.pval=summary(fit)$coefficients[coef,4] i.tval=summary(fit)$coefficients[coef,3] - i.converged=fit$converged - } - }, silent=TRUE) # keep silent if errors occur + i.bpconverged=fit$converged + ##### if the fitting is not converged, use quassipoisson + i.converged=i.bpconverged + if (!i.converged){ + fit=glm(x~.,data=fdat,family=quasipoisson) + i.pval=summary(fit)$coefficients[coef,4] + i.tval=summary(fit)$coefficients[coef,3] + i.converged=fit$converged + } + }, silent=TRUE) # keep silent if errors occur + } - # Modification: if the fitting is not converged or all samples of one group are all unexpressed, use t.test via lm - x1=x; x1[x0) i.pval=NA + # Modification: if the fitting is not converged or all samples of one group are all unexpressed, use t.test with unequal variance or Analysis of Variance + if (isNearZeroVariance) i.pval=NA if(is.na(i.pval)) { design_group=design colnames(design_group)[coef]="group" fdat_norm=data.frame(x=log2(x+1)) fdat_norm=cbind(fdat_norm,design_group) fdat_norm=fdat_norm[,-2] #remove intercept - fit=lm(group~.,data=fdat_norm) + #fit=lm(group~.,data=fdat_norm) + #i.pval=summary(fit)$coefficients[2,4] + #i.tval=summary(fit)$coefficients[2,3] + #i.converged=NA #no information + gNum=length(unique(fdat_norm$group)) + if (gNum==2){ #use t-test + gtest=t.test(x ~ group,data=fdat_norm) + i.pval=gtest$p.value + i.tval=gtest$statistic + i.converged=NA #no information + }else{ #use aov + gtest=aov(x ~ group,data=fdat_norm) + i.pval=summary(gtest)[[1]][["Pr(>F)"]][1] + i.tval=summary(gtest)[[1]][["F value"]][1] + i.converged=NA #no information + } #i.pval = t.test(x ~ group)$p.value - i.pval=summary(fit)$coefficients[2,4] - i.tval=summary(fit)$coefficients[2,3] - i.converged=NA #no information - #t.test(x ~ group,data=fdat_norm)$p.value } # End modification @@ -167,16 +184,23 @@ BPglm <- function (data, controlIds, design, coef=2, keepFit=FALSE,minExp=1e-4, } # end of foreach }else{ - fitRes=ind=TVAL=PVAL=NULL; - for (i in 1:nrow(data)){ - x=data[i,] - fit=NA - i.pval=NA - i.tval=NA - i.converged=NA - i.bpconverged=NA - oo4=NA + fitRes=ind=TVAL=PVAL=NULL; + for (i in 1:nrow(data)){ + fit=NA + i.pval=NA + i.tval=NA + i.converged=NA + i.bpconverged=NA + oo4=NA + par=NA + x=data[i,] + #check if data is weird + gvar=tapply(x,design[,coef],var) + lowvar=sapply(gvar, function(z) sum(z<0.1*gvar)) + isNearZeroVariance=sum(lowvar>0)>0 + + if (!isNearZeroVariance){ control.x=x[controlIds] bpstat=control.x if (!is.null(extreme.quant)) bpstat=control.x[control.x <= quantile(control.x,prob=extreme.quant)] @@ -221,37 +245,47 @@ BPglm <- function (data, controlIds, design, coef=2, keepFit=FALSE,minExp=1e-4, i.converged=fit$converged } }, silent=TRUE) # keep silent if errors occur + } - # Modification: if the fitting is not converged or all samples of one group are all unexpressed, use t.test via lm - x1=x; x1[x0) i.pval=NA + # Modification: if the fitting is not converged or all samples of one group are all unexpressed, use t.test with unequal variance or Analysis of Variance + if (isNearZeroVariance) i.pval=NA if(is.na(i.pval)) { design_group=design colnames(design_group)[coef]="group" fdat_norm=data.frame(x=log2(x+1)) fdat_norm=cbind(fdat_norm,design_group) fdat_norm=fdat_norm[,-2] #remove intercept - fit=lm(group~.,data=fdat_norm) + #fit=lm(group~.,data=fdat_norm) + #i.pval=summary(fit)$coefficients[2,4] + #i.tval=summary(fit)$coefficients[2,3] + #i.converged=NA #no information + gNum=length(unique(fdat_norm$group)) + if (gNum==2){ #use t-test + gtest=t.test(x ~ group,data=fdat_norm) + i.pval=gtest$p.value + i.tval=gtest$statistic + i.converged=NA #no information + }else{ #use aov + gtest=aov(x ~ group,data=fdat_norm) + i.pval=summary(gtest)[[1]][["Pr(>F)"]][1] + i.tval=summary(gtest)[[1]][["F value"]][1] + i.converged=NA #no information + } #i.pval = t.test(x ~ group)$p.value - i.pval=summary(fit)$coefficients[2,4] - i.tval=summary(fit)$coefficients[2,3] - i.converged=NA #no information - #t.test(x ~ group,data=fdat_norm)$p.value } - # End modification - res=list(); - res[["PVAL"]]=i.pval - res[["TVAL"]]=i.tval - res[["CONVERGED"]]=i.converged - res[["BPCONVERGED"]]=i.bpconverged - res[["ind"]]=i - if(keepFit) res[["fit"]]=fit else res[["fit"]]=NA - res[["par"]]=par; - res[["oo4"]]=oo4; + # End modification + res=list(); + res[["PVAL"]]=i.pval + res[["TVAL"]]=i.tval + res[["CONVERGED"]]=i.converged + res[["BPCONVERGED"]]=i.bpconverged + res[["ind"]]=i + if(keepFit) res[["fit"]]=fit else res[["fit"]]=NA + res[["par"]]=par; + res[["oo4"]]=oo4; - fitRes=c(fitRes,res) - } + fitRes=c(fitRes,res) + } } ind=unlist(fitRes[which(names(fitRes)=="ind")]) diff --git a/inst/doc/BPSC.html b/inst/doc/BPSC.html index d7fc996..b072b7e 100644 --- a/inst/doc/BPSC.html +++ b/inst/doc/BPSC.html @@ -11,7 +11,7 @@ - + User guide for BPSC package version 0.99.2 @@ -124,7 +124,7 @@

User guide for BPSC package version 0.99.2

Trung Nghia Vu

-

2018-08-20

+

2018-08-21

@@ -133,7 +133,7 @@

2018-08-20

Introduction

Single-cell RNA-sequencing technology allows detection of gene expression in single-cell level. One typical feature of the data is a bimodality in the cellular distribution (see the figure below) even for highly expressed genes, primarily caused by a proportion of non-expressing cells.

hist(log2(rBP(10000,alp=0.2160247,bet=0.5989889,lam1=171.9596728,lam2=0.9094114)+1),prob=TRUE,col='grey',breaks=10, xlab="log2(expression+1)", main="")
-

+

The figure is produced by the data generated from the four-parameter beta-Poisson model in figure 1 of our recent study (Vu et al., 2016): alp=0.2160247, bet=0.5989889, lam1=171.9596728, lam2=0.9094114

The standard and the over-dispersed gamma-Poisson models that are commonly assumed in bulk-cell RNA-sequencing are not able to capture this property. In the recent study [1], we introduce a beta-Poisson mixture model to capture the bimodality of the single-cell RNA-seq gene expression distribution. We also propose a method that combines the beta-Poisson model with a generalized linear model to do differential expression analysis for single-cell RNAseq data. In this document, we present practical uses of the beta-Poisson model for differential expression analysis and model fitting.

diff --git a/man/BPSC-package.Rd b/man/BPSC-package.Rd index 80dd64d..df66bdb 100644 --- a/man/BPSC-package.Rd +++ b/man/BPSC-package.Rd @@ -13,7 +13,7 @@ Beta-Poisson model for single-cell RNA-seq data analyses Package: \tab BPSC\cr Type: \tab Package\cr Version: \tab 0.99.2\cr -Date: \tab 2018-08-20\cr +Date: \tab 2018-08-21\cr License: \tab GPL-3\cr LazyLoad: \tab yes\cr }