Skip to content

Commit

Permalink
v1.71
Browse files Browse the repository at this point in the history
- Large update to discrete missing value code
* works (for test units) on two or more variables with missingness
* Optional do_discrete_imputation argument returns samples for discrete
missing values
* some details of the algorithm in the README
  • Loading branch information
Richard McElreath committed Dec 27, 2017
1 parent 155b430 commit ca0b4ab
Show file tree
Hide file tree
Showing 4 changed files with 135 additions and 16 deletions.
4 changes: 2 additions & 2 deletions DESCRIPTION
Original file line number Diff line number Diff line change
@@ -1,8 +1,8 @@
Package: rethinking
Type: Package
Title: Statistical Rethinking book package
Version: 1.70
Date: 2017-8-15
Version: 1.71
Date: 2017-12-20
Author: Richard McElreath
Maintainer: Richard McElreath <[email protected]>
Imports: coda, MASS, mvtnorm, loo
Expand Down
69 changes: 60 additions & 9 deletions R/map2stan.r
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
##################
# map2stan itself

map2stan <- function( flist , data , start , pars , constraints=list() , types=list() , sample=TRUE , iter=2000 , warmup=floor(iter/2) , chains=1 , debug=FALSE , verbose=FALSE , WAIC=TRUE , cores=1 , rng_seed , rawstanfit=FALSE , control=list(adapt_delta=0.95) , add_unique_tag=TRUE , code , log_lik=FALSE , DIC=FALSE , declare_all_data=TRUE , ... ) {
map2stan <- function( flist , data , start , pars , constraints=list() , types=list() , sample=TRUE , iter=2000 , warmup=floor(iter/2) , chains=1 , debug=FALSE , verbose=FALSE , WAIC=TRUE , cores=1 , rng_seed , rawstanfit=FALSE , control=list(adapt_delta=0.95) , add_unique_tag=TRUE , code , log_lik=FALSE , DIC=FALSE , declare_all_data=TRUE , do_discrete_imputation=FALSE , ... ) {

if ( missing(rng_seed) ) rng_seed <- sample( 1:1e5 , 1 )
set.seed(rng_seed)
Expand Down Expand Up @@ -1138,22 +1138,40 @@ map2stan <- function( flist , data , start , pars , constraints=list() , types=l
# build Stan code

misstestcode <- "if ( "
misstests <- rep(NA,nx)
for ( j in 1:nx ) {
# build joint test for any missingness for case i
misstestcode <- concat( misstestcode , linmod$discrete_miss_list[[j]] , "[i]<0" )
if ( nx > 1 && j < nx ) # add logical OR
misstestcode <- concat( misstestcode , " || " )
# individual test for variable j for case i
misstests[j] <- concat( "if ( " , linmod$discrete_miss_list[[j]] , "[i]<0 )" )
}
misstestcode <- concat( misstestcode , " ) {" )
# save for later --- we need this test again in likelihood loop
fp[['lm']][[i]][['misstestcode']] <- misstestcode

# make version of lm with x vars swapped out for values in row of miss matrix
lm_aliased <- linmod$RHS
for ( j in 1:nx ) lm_aliased <- gsub(
concat(linmod$discrete_miss_list[[j]],"[i]") ,
concat(linmod$parameter,"_missmatrix[mmrow,",j,"]") ,
lm_aliased , fixed=TRUE )
lm_replace_txt <- concat(inden(2),mxtlm,"[i,mmrow] = ",lm_aliased,";")
lm_var_declare <- concat( inden(2) , "vector[" , nx , "] dimiproxy;" )
lm_var_assignments <- inden(2)
for ( j in 1:nx ) {
# assign local proxy variable for each variable with discrete missingness
# 1. test whether var j is missing for case i
# 2. if missing, assign dimiproxy[j] = missmatrix[mmrow,j]
# 3. if observed, assign dimiproxy[j] = var[i]
if ( j > 1 ) lm_var_assignments <- concat( lm_var_assignments , inden(4) )
lm_var_assignments <- concat( lm_var_assignments ,
"dimiproxy[",j,"] = " , linmod$discrete_miss_list[[j]] , "[i];\n" ,
inden(4) , misstests[j] ,
" dimiproxy[",j,"] = " , linmod$parameter,"_missmatrix[mmrow,",j,"];\n" )
lm_aliased <- gsub(
concat(linmod$discrete_miss_list[[j]],"[i]") ,
#concat(linmod$parameter,"_missmatrix[mmrow,",j,"]") ,
concat("dimiproxy[",j,"]") ,
lm_aliased , fixed=TRUE )
}#j
lm_replace_txt <- concat( inden(2) , mxtlm , "[i,mmrow] = " , lm_aliased , ";" )

# link function
if ( linmod$link != "identity" & linmod$use_link==TRUE ) {
Expand All @@ -1167,6 +1185,7 @@ map2stan <- function( flist , data , start , pars , constraints=list() , types=l
}

# build code to add log prob factors to terms
# these terms handle the marginalization over unknown states
terms_txt <- concat(inden(2),mxt,"[i,mmrow] = 0;\n")
for ( j in 1:nx ) {
terms_txt <- concat(terms_txt,inden(4),"if ( ",linmod$parameter,"_missmatrix[mmrow,",j,"]==1 )\n")
Expand All @@ -1179,6 +1198,8 @@ map2stan <- function( flist , data , start , pars , constraints=list() , types=l
code_lines <- c(
misstestcode,
concat(indent,"for ( mmrow in 1:rows(",linmod$parameter,"_missmatrix) ) {"),
lm_var_declare,
lm_var_assignments,
lm_replace_txt,
terms_txt,
concat(indent,"}//mmrow"),
Expand Down Expand Up @@ -1269,7 +1290,7 @@ map2stan <- function( flist , data , start , pars , constraints=list() , types=l
# add loop for non-vectorized distribution
txt1 <- concat( indent , "for ( i in 1:" , lik$N_name , " ) {\n" )
m_model_txt <- concat( m_model_txt , txt1 )
if ( has_discrete_missingness==FALSE )
if ( has_discrete_missingness==FALSE || do_discrete_imputation==TRUE )
m_gq <- concat( m_gq , txt1 )

xindent <- indent
Expand Down Expand Up @@ -1299,7 +1320,7 @@ map2stan <- function( flist , data , start , pars , constraints=list() , types=l
txt1 <- fp[['lm']][[ lms_with_dm[[1]][[2]] ]]$misstestcode # assume just one for now
txt1 <- concat( inden(2) , txt1 , "\n" )
m_model_txt <- concat( m_model_txt , txt1 )
#m_gq <- concat( m_gq , txt1 )
if ( do_discrete_imputation==TRUE ) m_gq <- concat( m_gq , txt1 )
xindent <- concat( xindent , indent )
}

Expand Down Expand Up @@ -1370,6 +1391,36 @@ map2stan <- function( flist , data , start , pars , constraints=list() , types=l
m_model_txt <- concat( m_model_txt , inden(3) , "target += log_sum_exp(" , sym , mixterms_suffix , "[i]);\n" )
# close off discrete missing branch
m_model_txt <- concat( m_model_txt , inden(2) , "} else \n" )

if ( do_discrete_imputation==TRUE ) {
# for each variable in discrete_miss_bank,
# need to compute posterior probability using log probabilities in mixture
m_gq <- concat( m_gq , inden(3) , "for ( mmrow in 1:rows(", sym , missmatrix_suffix ,") )\n" )
m_gq <- concat( m_gq , inden(4) , sym , mixterms_suffix , "[i,mmrow] = " , sym , mixterms_suffix , "[i,mmrow] + " , LLtxt , ";\n" )
ndmb <- length( discrete_miss_bank )
for ( ix in 1:ndmb ) {
var_name <- names(discrete_miss_bank)[ix]
# add vector of imputed values to start of generated quantities
m_gq <- concat( inden(1) , "vector[N] " , var_name , "_impute;\n" , m_gq )
# add to pars so the samples get returned!
pars_elect[[concat(var_name,"_impute")]] <- concat(var_name,"_impute")
# add calculation to body of generated quantities
# uses softmax to exp each term and normalize them
# dot product with missmatrix extracts terms that include variable ix
m_gq <- concat( m_gq , inden(3) , var_name , "_impute[i] = " , var_name , "[i];\n" )
m_gq <- concat( m_gq , inden(3) , misstests[[ix]] )
m_gq <- concat( m_gq , var_name ,
"_impute[i] = dot_product( softmax(to_vector(" , sym , mixterms_suffix , "[i])) , " , sym , missmatrix_suffix , "[:,",ix,"] )" , ";\n" )
}#ix
# close off discrete missing branch
m_gq <- concat( m_gq , inden(2) , "} else {\n" )
# need to fill observed values in _impute vector, or will get sampling error for undefined values
for ( ix in 1:ndmb ) {
var_name <- names(discrete_miss_bank)[ix]
m_gq <- concat( m_gq , inden(3) , var_name , "_impute[i] = " , var_name , "[i];\n" )
}#ix
m_gq <- concat( m_gq , inden(2) , "}\n" )
}
}
m_model_txt <- concat( m_model_txt , indent , xindent , outcome , " ~ " , lik$likelihood , "( " , parstxt , " )" , lik$T_text , ";\n" )

Expand Down Expand Up @@ -1426,7 +1477,7 @@ map2stan <- function( flist , data , start , pars , constraints=list() , types=l
if ( tmplt$vectorized==FALSE || has_discrete_missingness==TRUE ) {
txt1 <- concat( indent , "}//i \n" )
m_model_txt <- concat( m_model_txt , txt1 )
if ( has_discrete_missingness==FALSE )
if ( has_discrete_missingness==FALSE || do_discrete_imputation==TRUE )
m_gq <- concat( m_gq , txt1 )
}

Expand Down
70 changes: 66 additions & 4 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -219,7 +219,7 @@ What ``map2stan`` does is notice the missing values, see the distribution assign

## Semi-automated marginalization for binary discrete missing values

Binary (0/1) variables with missing values present a special obstacle, because Stan cannot sample discrete parameters. So instead of imputing binary missing values, ``map2stan`` can simply average (marginalize) over them. As in the above case, when ``map2stan`` detects missing values in a predictor variable, it will try to find a distribution for the variable containing them. If this variable is binary (0/1), then it will construct a mixture model in which each term is the log-likelihood conditional on the variables taking a particular combination of 0/1 values.
Binary (0/1) variables with missing values present a special obstacle, because Stan cannot sample discrete parameters. So instead of imputing binary missing values, ``map2stan`` can average (marginalize) over them. As in the above case, when ``map2stan`` detects missing values in a predictor variable, it will try to find a distribution for the variable containing them. If this variable is binary (0/1), then it will construct a mixture model in which each term is the log-likelihood conditional on the variables taking a particular combination of 0/1 values.

Following the example in the previous section, we can simulate missingness in a binary predictor:
```
Expand All @@ -234,14 +234,76 @@ The model definition is analogous to the previous, but also requires some care i
f6 <- alist(
y ~ dnorm( mu , sigma ),
mu <- a + b*x,
x ~ bernoulli( phi_x ),
x ~ bernoulli( phi ),
a ~ dnorm( 0 , 100 ),
b ~ dnorm( 0 , 10 ),
phi_x ~ beta( 1 , 1 ),
phi ~ beta( 1 , 1 ),
sigma ~ dcauchy(0,2)
)
m6 <- map2stan( f6 , data=list(y=y,x=x) , constraints=list(phi_x="lower=0,upper=1") )
m6 <- map2stan( f6 , data=list(y=y,x=x) , constraints=list(phi="lower=0,upper=1") )
```
The algorithm works, in theory, for any number of binary predictors with missing values. For example, with two predictors, each with missingness:
```
N <- 100
N_miss <- 10
x1 <- rbinom( N , size=1 , prob=0.5 )
x2 <- rbinom( N , size=1 , prob=0.1 )
y <- rnorm( N , 2*x1 - x2 , 1 )
x1[ sample(1:N,size=N_miss) ] <- NA
x2[ sample(1:N,size=N_miss) ] <- NA
f7 <- alist(
y ~ dnorm( mu , sigma ),
mu <- a + b1*x1 + b2*x2,
x1 ~ bernoulli( phi1 ),
x2 ~ bernoulli( phi2 ),
a ~ dnorm( 0 , 100 ),
c(b1,b2) ~ dnorm( 0 , 10 ),
phi1 ~ beta( 1 , 1 ),
phi2 ~ beta( 1 , 1 ),
sigma ~ dcauchy(0,2)
)
m7 <- map2stan( f7 , data=list(y=y,x1=x1,x2=x2) ,
constraints=list(phi1="lower=0,upper=1",phi2="lower=0,upper=1") )
```
While the unobserved values for the binary predictors are usually not of interest, they can be computed from the posterior distribution. Adding the argument ``do_discrete_imputation=TRUE`` instructs ``map2stan`` to perform these calculations automatically. Example:
```
m6 <- map2stan( f6 , data=list(y=y,x=x) , constraints=list(phi="lower=0,upper=1") ,
do_discrete_imputation=TRUE )
precis( m6 , depth=2 )
```
The output contains samples for each case with imputed probilities that ``x`` takes the value 1.

The algorithm works by constructing a list of mixture terms that are needed to to compute the probability of each observed ``y`` value. In the simplest case, with only one predictor with missing values, the implied mixture likelihood contains two terms:
```
Pr(y[i]) = Pr(x[i]=1)Pr(y[i]|x[i]=1) + Pr(x[i]=0)Pr(y[i]|x[i]=0)
```
In the parameters of our example model ``m6`` above, this is:
```
Pr(y[i]) = phi*N(y[i]|a+b,sigma) + (1-phi)*N(y[i]|a,sigma)
```
It is now a simple matter to loop over cases ``i`` and compute the above for each. Similarly the posterior probability of that ``x[i]==1`` is given as:
```
Pr(x[i]==1|y[i]) = phi*N(y[i]|a+b,sigma) / Pr(y[i])
```
When only one predictor has missingness, then this is simple. What about when there are two or more? In that case, all the possible combinations of missingness have to be accounted for. For example, suppose there are two predictors, ``x1`` and ``x2``, both with missingness on case ``i``. Now the implied mixture likelihood is:
```
Pr(y[i]) = Pr(x1=1)Pr(x2=1)*Pr(y[i]|x1=1,x2=1) + Pr(x1=1)Pr(x2=0)Pr(y[i]|x1=1,x2=0) + Pr(x1=0)Pr(x2=1)Pr(y[i]|x1=0,x2=1) + Pr(x1=0)Pr(x2=0)Pr(y[i]|x1=0,x2=0)
```
There are four combinations of unobserved values, and so four terms in the mixture likelihood. When ``x2`` is instead observed, we can substitute the observed value into the above, and then the mixture simplifies readily to our previous two-term likelihood:
```
Pr(y[i]|x2[i]==1) = Pr(x1=1)Pr(x2=1)Pr(y[i]|x1=1,x2=1) + Pr(x1=1)Pr(x2=0)Pr(y[i]|x1=1,x2=1) + Pr(x1=0)Pr(x2=1)Pr(y[i]|x1=0,x2=1) + Pr(x1=0)Pr(x2=0)Pr(y[i]|x1=0,x2=1)
= [Pr(x1=1)Pr(x2=1)+Pr(x1=1)Pr(x2=0)]Pr(y[i]|x1=1,x2=1)
+ [Pr(x1=0)Pr(x2=1)+Pr(x1=0)Pr(x2=0)]Pr(y[i]|x1=0,x2=1)
= Pr(x1=1)Pr(y[i]|x1=1,x2=1) + Pr(x1=0)Pr(y[i]|x1=0,x2=1)
```
This implies that if we loop over cases ``i`` and insert any observed values into the general mixture likelihood, we can compute the relevant mixture for the specific combination of missingness on each case ``i``. That is what ``map2stan`` does. The general mixture terms can be generated algorithmically. The code below generates a matrix of terms for ``n`` binary variables with missingness.
```
ncombinations <- 2^n
d <- matrix(NA,nrow=ncombinations,ncol=n)
for ( col_var in 1:n )
d[,col_var] <- rep( 0:1 , each=2^(col_var-1) , length.out=ncombinations )
```
Rows of ``d`` contain terms, columns contain variables, and the values in each column are the corresponding values of each variable. The algorithm builds a linear model for each row in this matrix, composes the mixture likelihood as the sum of these rows, and performs proper substitutions of observed values. All calculations are done on the log scale, for precision.

## Gaussian process

Expand Down
8 changes: 7 additions & 1 deletion man/map2stan.Rd
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,9 @@
map2stan( flist , data , start , pars , constraints=list() , types=list() ,
sample=TRUE , iter=2000 , warmup=floor(iter/2) , chains=1 , debug=FALSE ,
verbose=FALSE , WAIC=TRUE , cores=1 , rng_seed , rawstanfit=FALSE ,
control=list(adapt_delta=0.95) , add_unique_tag=TRUE , code , ... )
control=list(adapt_delta=0.95) , add_unique_tag=TRUE , code ,
log_lik=FALSE , DIC=FALSE , declare_all_data=TRUE ,
do_discrete_imputation=FALSE , ... )
}
%- maybe also 'usage' for other objects documented here.
\arguments{
Expand All @@ -32,6 +34,10 @@ map2stan( flist , data , start , pars , constraints=list() , types=list() ,
\item{control}{Optional list of control parameters for \code{stan}. Default increases target acceptance rate (\code{adapt_delta}) to 0.95.}
\item{add_unique_tag}{When \code{TRUE}, adds a comment to the Stan model code with the date-time stamp. This makes each model unique and will force Stan to recompile. Useful for avoiding segfault bugs when reusing compiled objects.}
\item{code}{Optional list of custom Stan code to insert in model. See details and example.}
\item{log_lik}{Return log likelihood of each observation in samples. Used for calculating WAIC and LOO.}
\item{DIC}{Return deviance and DIC. This is deprecated and may be removed in a later version.}
\item{declare_all_data}{When \code{TRUE}, all variables in the data list are declared in the Stan model code. When \code{FALSE}, only used variables are declared.}
\item{do_discrete_imputation}{When \code{TRUE}, samples for missing binary predictors are returned. Not necessary to marginalize over discrete missing values.}
\item{...}{Additional arguments to pass to \code{\link{stan}}}
}
\details{
Expand Down

0 comments on commit ca0b4ab

Please sign in to comment.