v1.71

- Large update to discrete missing value code * works (for test units) on two or more variables with missingness * Optional do_discrete_imputation argument returns samples for discrete missing values * some details of the algorithm in the README
rmcelreath · Dec 27, 2017 · ca0b4ab · ca0b4ab
1 parent 155b430
commit ca0b4ab
Show file tree

Hide file tree

Showing 4 changed files with 135 additions and 16 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,8 +1,8 @@
 Package: rethinking
 Type: Package
 Title: Statistical Rethinking book package
-Version: 1.70
-Date: 2017-8-15
+Version: 1.71
+Date: 2017-12-20
 Author: Richard McElreath
 Maintainer: Richard McElreath <[email protected]>
 Imports: coda, MASS, mvtnorm, loo

diff --git a/R/map2stan.r b/R/map2stan.r
@@ -11,7 +11,7 @@
 ##################
 # map2stan itself
 
-map2stan <- function( flist , data , start , pars , constraints=list() , types=list() , sample=TRUE , iter=2000 , warmup=floor(iter/2) , chains=1 , debug=FALSE , verbose=FALSE , WAIC=TRUE , cores=1 , rng_seed , rawstanfit=FALSE , control=list(adapt_delta=0.95) , add_unique_tag=TRUE , code , log_lik=FALSE , DIC=FALSE , declare_all_data=TRUE , ... ) {
+map2stan <- function( flist , data , start , pars , constraints=list() , types=list() , sample=TRUE , iter=2000 , warmup=floor(iter/2) , chains=1 , debug=FALSE , verbose=FALSE , WAIC=TRUE , cores=1 , rng_seed , rawstanfit=FALSE , control=list(adapt_delta=0.95) , add_unique_tag=TRUE , code , log_lik=FALSE , DIC=FALSE , declare_all_data=TRUE , do_discrete_imputation=FALSE , ... ) {
 
     if ( missing(rng_seed) ) rng_seed <- sample( 1:1e5 , 1 )
     set.seed(rng_seed)
@@ -1138,22 +1138,40 @@ map2stan <- function( flist , data , start , pars , constraints=list() , types=l
                 # build Stan code
 
                 misstestcode <- "if ( "
+                misstests <- rep(NA,nx)
                 for ( j in 1:nx ) {
+                    # build joint test for any missingness for case i
                     misstestcode <- concat( misstestcode , linmod$discrete_miss_list[[j]] , "[i]<0" )
                     if ( nx > 1 && j < nx ) # add logical OR
                         misstestcode <- concat( misstestcode , " || " )
+                    # individual test for variable j for case i
+                    misstests[j] <- concat( "if ( " , linmod$discrete_miss_list[[j]] , "[i]<0 )" )
                 }
                 misstestcode <- concat( misstestcode , " ) {" )
                 # save for later --- we need this test again in likelihood loop
                 fp[['lm']][[i]][['misstestcode']] <- misstestcode
 
                 # make version of lm with x vars swapped out for values in row of miss matrix
                 lm_aliased <- linmod$RHS
-                for ( j in 1:nx ) lm_aliased <- gsub( 
-                    concat(linmod$discrete_miss_list[[j]],"[i]") , 
-                    concat(linmod$parameter,"_missmatrix[mmrow,",j,"]") , 
-                    lm_aliased , fixed=TRUE )
-                lm_replace_txt <- concat(inden(2),mxtlm,"[i,mmrow] = ",lm_aliased,";")
+                lm_var_declare <- concat( inden(2) , "vector[" , nx , "] dimiproxy;" )
+                lm_var_assignments <- inden(2)
+                for ( j in 1:nx ) {
+                    # assign local proxy variable for each variable with discrete missingness
+                    # 1. test whether var j is missing for case i
+                    # 2. if missing, assign dimiproxy[j] = missmatrix[mmrow,j]
+                    # 3. if observed, assign dimiproxy[j] = var[i]
+                    if ( j > 1 ) lm_var_assignments <- concat( lm_var_assignments , inden(4) )
+                    lm_var_assignments <- concat( lm_var_assignments , 
+                        "dimiproxy[",j,"] = " , linmod$discrete_miss_list[[j]] , "[i];\n" ,
+                        inden(4) , misstests[j] , 
+                        " dimiproxy[",j,"] = " , linmod$parameter,"_missmatrix[mmrow,",j,"];\n" )
+                    lm_aliased <- gsub( 
+                        concat(linmod$discrete_miss_list[[j]],"[i]") , 
+                        #concat(linmod$parameter,"_missmatrix[mmrow,",j,"]") , 
+                        concat("dimiproxy[",j,"]") , 
+                        lm_aliased , fixed=TRUE )
+                }#j
+                lm_replace_txt <- concat( inden(2) , mxtlm , "[i,mmrow] = " , lm_aliased , ";" )
 
                 # link function
                 if ( linmod$link != "identity" & linmod$use_link==TRUE ) {
@@ -1167,6 +1185,7 @@ map2stan <- function( flist , data , start , pars , constraints=list() , types=l
                 }
 
                 # build code to add log prob factors to terms
+                # these terms handle the marginalization over unknown states
                 terms_txt <- concat(inden(2),mxt,"[i,mmrow] = 0;\n")
                 for ( j in 1:nx ) {
                     terms_txt <- concat(terms_txt,inden(4),"if ( ",linmod$parameter,"_missmatrix[mmrow,",j,"]==1 )\n")
@@ -1179,6 +1198,8 @@ map2stan <- function( flist , data , start , pars , constraints=list() , types=l
                 code_lines <- c(
                 misstestcode,
                 concat(indent,"for ( mmrow in 1:rows(",linmod$parameter,"_missmatrix) ) {"),
+                lm_var_declare,
+                lm_var_assignments,
                 lm_replace_txt,
                 terms_txt,
                 concat(indent,"}//mmrow"),
@@ -1269,7 +1290,7 @@ map2stan <- function( flist , data , start , pars , constraints=list() , types=l
                 # add loop for non-vectorized distribution
                 txt1 <- concat( indent , "for ( i in 1:" , lik$N_name , " ) {\n" )
                 m_model_txt <- concat( m_model_txt , txt1 )
-                if ( has_discrete_missingness==FALSE )
+                if ( has_discrete_missingness==FALSE || do_discrete_imputation==TRUE )
                     m_gq <- concat( m_gq , txt1 )
 
                 xindent <- indent
@@ -1299,7 +1320,7 @@ map2stan <- function( flist , data , start , pars , constraints=list() , types=l
                 txt1 <- fp[['lm']][[ lms_with_dm[[1]][[2]] ]]$misstestcode # assume just one for now
                 txt1 <- concat( inden(2) , txt1 , "\n" )
                 m_model_txt <- concat( m_model_txt , txt1 )
-                #m_gq <- concat( m_gq , txt1 )
+                if ( do_discrete_imputation==TRUE ) m_gq <- concat( m_gq , txt1 )
                 xindent <- concat( xindent , indent )
             }
 
@@ -1370,6 +1391,36 @@ map2stan <- function( flist , data , start , pars , constraints=list() , types=l
                     m_model_txt <- concat( m_model_txt , inden(3) , "target += log_sum_exp(" , sym , mixterms_suffix , "[i]);\n" )
                     # close off discrete missing branch
                     m_model_txt <- concat( m_model_txt , inden(2) , "} else \n" )
+
+                    if ( do_discrete_imputation==TRUE ) {
+                        # for each variable in discrete_miss_bank,
+                        #  need to compute posterior probability using log probabilities in mixture
+                        m_gq <- concat( m_gq , inden(3) , "for ( mmrow in 1:rows(", sym , missmatrix_suffix ,") )\n" )
+                        m_gq <- concat( m_gq , inden(4) , sym , mixterms_suffix , "[i,mmrow] = " , sym , mixterms_suffix , "[i,mmrow] + " , LLtxt , ";\n" )
+                        ndmb <- length( discrete_miss_bank )
+                        for ( ix in 1:ndmb ) {
+                            var_name <- names(discrete_miss_bank)[ix]
+                            # add vector of imputed values to start of generated quantities
+                            m_gq <- concat( inden(1) , "vector[N] " , var_name , "_impute;\n" , m_gq )
+                            # add to pars so the samples get returned!
+                            pars_elect[[concat(var_name,"_impute")]] <- concat(var_name,"_impute")
+                            # add calculation to body of generated quantities
+                            # uses softmax to exp each term and normalize them
+                            # dot product with missmatrix extracts terms that include variable ix
+                            m_gq <- concat( m_gq , inden(3) , var_name , "_impute[i] = " , var_name , "[i];\n" )
+                            m_gq <- concat( m_gq , inden(3) , misstests[[ix]] )
+                            m_gq <- concat( m_gq , var_name , 
+                                "_impute[i] = dot_product( softmax(to_vector(" , sym , mixterms_suffix , "[i])) , " , sym , missmatrix_suffix , "[:,",ix,"] )" , ";\n" )
+                        }#ix
+                        # close off discrete missing branch
+                        m_gq <- concat( m_gq , inden(2) , "} else {\n" )
+                        # need to fill observed values in _impute vector, or will get sampling error for undefined values
+                        for ( ix in 1:ndmb ) {
+                            var_name <- names(discrete_miss_bank)[ix]
+                            m_gq <- concat( m_gq , inden(3) , var_name , "_impute[i] = " , var_name , "[i];\n" )
+                        }#ix
+                        m_gq <- concat( m_gq , inden(2) , "}\n" )
+                    }
                 }
                 m_model_txt <- concat( m_model_txt , indent , xindent , outcome , " ~ " , lik$likelihood , "( " , parstxt , " )" , lik$T_text , ";\n" )
 
@@ -1426,7 +1477,7 @@ map2stan <- function( flist , data , start , pars , constraints=list() , types=l
             if ( tmplt$vectorized==FALSE || has_discrete_missingness==TRUE ) {
                 txt1 <- concat( indent , "}//i \n" )
                 m_model_txt <- concat( m_model_txt , txt1 )
-                if ( has_discrete_missingness==FALSE )
+                if ( has_discrete_missingness==FALSE || do_discrete_imputation==TRUE )
                     m_gq <- concat( m_gq , txt1 )
             }
 

diff --git a/README.md b/README.md
@@ -219,7 +219,7 @@ What ``map2stan`` does is notice the missing values, see the distribution assign
 
 ## Semi-automated marginalization for binary discrete missing values
 
-Binary (0/1) variables with missing values present a special obstacle, because Stan cannot sample discrete parameters. So instead of imputing binary missing values, ``map2stan`` can simply average (marginalize) over them. As in the above case, when ``map2stan`` detects missing values in a predictor variable, it will try to find a distribution for the variable containing them. If this variable is binary (0/1), then it will construct a mixture model in which each term is the log-likelihood conditional on the variables taking a particular combination of 0/1 values.
+Binary (0/1) variables with missing values present a special obstacle, because Stan cannot sample discrete parameters. So instead of imputing binary missing values, ``map2stan`` can average (marginalize) over them. As in the above case, when ``map2stan`` detects missing values in a predictor variable, it will try to find a distribution for the variable containing them. If this variable is binary (0/1), then it will construct a mixture model in which each term is the log-likelihood conditional on the variables taking a particular combination of 0/1 values.
 
 Following the example in the previous section, we can simulate missingness in a binary predictor:
 ```
@@ -234,14 +234,76 @@ The model definition is analogous to the previous, but also requires some care i
 f6 <- alist(
     y ~ dnorm( mu , sigma ),
     mu <- a + b*x,
-    x ~ bernoulli( phi_x ),
+    x ~ bernoulli( phi ),
     a ~ dnorm( 0 , 100 ),
     b ~ dnorm( 0  , 10 ),
-    phi_x ~ beta( 1 , 1 ),
+    phi ~ beta( 1 , 1 ),
     sigma ~ dcauchy(0,2)
 )
-m6 <- map2stan( f6 , data=list(y=y,x=x) , constraints=list(phi_x="lower=0,upper=1") )
+m6 <- map2stan( f6 , data=list(y=y,x=x) , constraints=list(phi="lower=0,upper=1") )
 ```
+The algorithm works, in theory, for any number of binary predictors with missing values. For example, with two predictors, each with missingness:
+```
+N <- 100
+N_miss <- 10
+x1 <- rbinom( N , size=1 , prob=0.5 )
+x2 <- rbinom( N , size=1 , prob=0.1 )
+y <- rnorm( N , 2*x1 - x2  , 1 )
+x1[ sample(1:N,size=N_miss) ] <- NA
+x2[ sample(1:N,size=N_miss) ] <- NA
+f7 <- alist(
+    y ~ dnorm( mu , sigma ),
+    mu <- a + b1*x1 + b2*x2,
+    x1 ~ bernoulli( phi1 ),
+    x2 ~ bernoulli( phi2 ),
+    a ~ dnorm( 0 , 100 ),
+    c(b1,b2) ~ dnorm( 0  , 10 ),
+    phi1 ~ beta( 1 , 1 ),
+    phi2 ~ beta( 1 , 1 ),
+    sigma ~ dcauchy(0,2)
+)
+m7 <- map2stan( f7 , data=list(y=y,x1=x1,x2=x2) , 
+      constraints=list(phi1="lower=0,upper=1",phi2="lower=0,upper=1") )
+```
+While the unobserved values for the binary predictors are usually not of interest, they can be computed from the posterior distribution. Adding the argument ``do_discrete_imputation=TRUE`` instructs ``map2stan`` to perform these calculations automatically. Example:
+```
+m6 <- map2stan( f6 , data=list(y=y,x=x) , constraints=list(phi="lower=0,upper=1") ,
+      do_discrete_imputation=TRUE )
+precis( m6 , depth=2 )
+```
+The output contains samples for each case with imputed probilities that ``x`` takes the value 1.
+
+The algorithm works by constructing a list of mixture terms that are needed to to compute the probability of each observed ``y`` value. In the simplest case, with only one predictor with missing values, the implied mixture likelihood contains two terms:
+```
+Pr(y[i]) = Pr(x[i]=1)Pr(y[i]|x[i]=1) + Pr(x[i]=0)Pr(y[i]|x[i]=0)
+```
+In the parameters of our example model ``m6`` above, this is:
+```
+Pr(y[i]) = phi*N(y[i]|a+b,sigma) + (1-phi)*N(y[i]|a,sigma)
+```
+It is now a simple matter to loop over cases ``i`` and compute the above for each. Similarly the posterior probability of that ``x[i]==1`` is given as:
+```
+Pr(x[i]==1|y[i]) = phi*N(y[i]|a+b,sigma) / Pr(y[i])
+```
+When only one predictor has missingness, then this is simple. What about when there are two or more? In that case, all the possible combinations of missingness have to be accounted for. For example, suppose there are two predictors, ``x1`` and ``x2``, both with missingness on case ``i``. Now the implied mixture likelihood is:
+```
+Pr(y[i]) = Pr(x1=1)Pr(x2=1)*Pr(y[i]|x1=1,x2=1) + Pr(x1=1)Pr(x2=0)Pr(y[i]|x1=1,x2=0) + Pr(x1=0)Pr(x2=1)Pr(y[i]|x1=0,x2=1) + Pr(x1=0)Pr(x2=0)Pr(y[i]|x1=0,x2=0)
+```
+There are four combinations of unobserved values, and so four terms in the mixture likelihood. When ``x2`` is instead observed, we can substitute the observed value into the above, and then the mixture simplifies readily to our previous two-term likelihood:
+```
+Pr(y[i]|x2[i]==1) = Pr(x1=1)Pr(x2=1)Pr(y[i]|x1=1,x2=1) + Pr(x1=1)Pr(x2=0)Pr(y[i]|x1=1,x2=1) + Pr(x1=0)Pr(x2=1)Pr(y[i]|x1=0,x2=1) + Pr(x1=0)Pr(x2=0)Pr(y[i]|x1=0,x2=1)
+                  = [Pr(x1=1)Pr(x2=1)+Pr(x1=1)Pr(x2=0)]Pr(y[i]|x1=1,x2=1) 
+                    + [Pr(x1=0)Pr(x2=1)+Pr(x1=0)Pr(x2=0)]Pr(y[i]|x1=0,x2=1)
+                  = Pr(x1=1)Pr(y[i]|x1=1,x2=1) + Pr(x1=0)Pr(y[i]|x1=0,x2=1)
+```
+This implies that if we loop over cases ``i`` and insert any observed values into the general mixture likelihood, we can compute the relevant mixture for the specific combination of missingness on each case ``i``. That is what ``map2stan`` does. The general mixture terms can be generated algorithmically. The code below generates a matrix of terms for ``n`` binary variables with missingness.
+```
+ncombinations <- 2^n
+d <- matrix(NA,nrow=ncombinations,ncol=n)
+for ( col_var in 1:n ) 
+    d[,col_var] <- rep( 0:1 , each=2^(col_var-1) , length.out=ncombinations )
+```
+Rows of ``d`` contain terms, columns contain variables, and the values in each column are the corresponding values of each variable. The algorithm builds a linear model for each row in this matrix, composes the mixture likelihood as the sum of these rows, and performs proper substitutions of observed values. All calculations are done on the log scale, for precision.
 
 ## Gaussian process
 

diff --git a/man/map2stan.Rd b/man/map2stan.Rd
@@ -9,7 +9,9 @@
 map2stan( flist , data , start , pars , constraints=list() , types=list() , 
   sample=TRUE , iter=2000 , warmup=floor(iter/2) , chains=1 , debug=FALSE , 
   verbose=FALSE , WAIC=TRUE , cores=1 , rng_seed , rawstanfit=FALSE , 
-  control=list(adapt_delta=0.95) , add_unique_tag=TRUE , code , ... )
+  control=list(adapt_delta=0.95) , add_unique_tag=TRUE , code , 
+  log_lik=FALSE , DIC=FALSE , declare_all_data=TRUE , 
+  do_discrete_imputation=FALSE , ... )
 }
 %- maybe also 'usage' for other objects documented here.
 \arguments{
@@ -32,6 +34,10 @@ map2stan( flist , data , start , pars , constraints=list() , types=list() ,
   \item{control}{Optional list of control parameters for \code{stan}. Default increases target acceptance rate (\code{adapt_delta}) to 0.95.}
   \item{add_unique_tag}{When \code{TRUE}, adds a comment to the Stan model code with the date-time stamp. This makes each model unique and will force Stan to recompile. Useful for avoiding segfault bugs when reusing compiled objects.}
   \item{code}{Optional list of custom Stan code to insert in model. See details and example.}
+  \item{log_lik}{Return log likelihood of each observation in samples. Used for calculating WAIC and LOO.}
+  \item{DIC}{Return deviance and DIC. This is deprecated and may be removed in a later version.}
+  \item{declare_all_data}{When \code{TRUE}, all variables in the data list are declared in the Stan model code. When \code{FALSE}, only used variables are declared.}
+  \item{do_discrete_imputation}{When \code{TRUE}, samples for missing binary predictors are returned. Not necessary to marginalize over discrete missing values.}
   \item{...}{Additional arguments to pass to \code{\link{stan}}}
 }
 \details{