1.41 - maintenance and Chapter 6 revision
- added data(Dinosaurs) and its man page
- compare() defaults to WAIC now and displays stderr
- compare plot method contains more swag
- added ensemble() to construct ensemble predictions from WAIC/DIC weights (see the usage sketch after this list)
- added mcreplicate(), a multicore replicate() built on mclapply
- various man page updates to keep up with code
- fix for simulating from map models without linear models
- map2stan defaults to computing WAIC after sampling
- sim.train.test() added, with man page
- sim method for map updated to handle pointwise loglik for WAIC calculation when ll=TRUE
- WAIC methods updated to perform pointwise calculations, allowing computation of stderr and stderr of differences in WAIC
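
For orientation, a minimal usage sketch of the revised compare() and the new ensemble() (data, model names, and priors below are illustrative placeholders, not taken from the package):

library(rethinking)
d <- data.frame( y = rnorm( 50 , 2 , 1 ) , x = rnorm( 50 ) )
m1 <- map( alist( y ~ dnorm(mu,sigma) , mu <- a , a ~ dnorm(0,10) , sigma ~ dunif(0,10) ) , data=d )
m2 <- map( alist( y ~ dnorm(mu,sigma) , mu <- a + b*x , a ~ dnorm(0,10) , b ~ dnorm(0,10) , sigma ~ dunif(0,10) ) , data=d )
ct <- compare( m1 , m2 )            # WAIC by default, with pWAIC, dWAIC, weight, SE, and dSE columns
plot( ct )                          # in-sample deviance (filled) and WAIC (open), with SE segments
e <- ensemble( m1 , m2 , data=d )   # link() and sim() predictions mixed in proportion to Akaike weights
attr( e , "weights" )               # the weights used to partition the samples among models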
Richard McElreath committed Sep 15, 2014
1 parent b1a47b5 commit 100b125
Showing 22 changed files with 734 additions and 59 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
Package: rethinking
Type: Package
Title: Statistical Rethinking book package
Version: 1.4
Version: 1.41
Date: 2014-09-02
Author: Richard McElreath
Maintainer: Richard McElreath <[email protected]>
132 changes: 119 additions & 13 deletions R/compare.r
@@ -1,14 +1,15 @@
# compare

# compare class definition and show method
setClass( "compareIC" , representation( output="data.frame" ) )
setClass( "compareIC" , representation( output="data.frame" , dSE="matrix" ) )

compare.show <- function( object ) {
print( round( object@output , 2 ) )
}
setMethod( "show" , "compareIC" , function(object) compare.show(object) )

# new compare function, defaulting to DIC
compare <- function( ... , n=1e3 , sort="DIC" , WAIC=FALSE ) {
# new compare function, defaulting to WAIC
compare <- function( ... , n=1e3 , sort="WAIC" , WAIC=TRUE , refresh=0 ) {
# retrieve list of models
L <- list(...)
if ( is.list(L[[1]]) && length(L)==1 )
@@ -18,14 +19,28 @@ compare <- function( ... , n=1e3 , sort="DIC" , WAIC=FALSE ) {
mnames <- match.call()
mnames <- as.character(mnames)[2:(length(L)+1)]

dSE.matrix <- matrix( NA , nrow=length(L) , ncol=length(L) )
if ( WAIC==FALSE ) {
DIC.list <- lapply( L , function(z) DIC( z , n=n ) )
pD.list <- sapply( DIC.list , function(x) attr(x,"pD") )
} else {
# use WAIC instead of DIC
WAIC.list <- lapply( L , function(z) WAIC( z , n=n ) )
# WAIC is processed pointwise, so can compute SE of differences, and summed later
WAIC.list <- lapply( L , function(z) WAIC( z , n=n , refresh=refresh , pointwise=TRUE ) )
pD.list <- sapply( WAIC.list , function(x) attr(x,"pWAIC") )
DIC.list <- WAIC.list
se.list <- sapply( WAIC.list , function(x) attr(x,"se") )
DIC.list <- (-2)*sapply( WAIC.list , sum )
# compute SE of differences between adjacent models from top to bottom in ranking
colnames(dSE.matrix) <- mnames
rownames(dSE.matrix) <- mnames
for ( i in 1:(length(L)-1) ) {
for ( j in (i+1):length(L) ) {
waic_ptw1 <- WAIC.list[[i]]
waic_ptw2 <- WAIC.list[[j]]
dSE.matrix[i,j] <- as.numeric( sqrt( length(waic_ptw1)*var( waic_ptw1 - waic_ptw2 ) ) )
dSE.matrix[j,i] <- dSE.matrix[i,j]
}#j
}#i
}

DIC.list <- unlist(DIC.list)
@@ -35,26 +50,72 @@

if ( WAIC==FALSE )
result <- data.frame( DIC=DIC.list , pD=pD.list , dDIC=dDIC , weight=w.DIC )
else
result <- data.frame( WAIC=DIC.list , pWAIC=pD.list , dWAIC=dDIC , weight=w.DIC )
else {
# find out which model has dWAIC==0
topm <- which( dDIC==0 )
dSEcol <- dSE.matrix[,topm]
result <- data.frame( WAIC=DIC.list , pWAIC=pD.list , dWAIC=dDIC ,
weight=w.DIC , SE=se.list , dSE=dSEcol )
}

rownames(result) <- mnames

if ( !is.null(sort) ) {
if ( WAIC==TRUE & sort=="DIC" ) sort <- "WAIC"
result <- result[ order( result[[sort]] ) , ]
if ( sort!=FALSE ) {
if ( WAIC==FALSE & sort=="WAIC" ) sort <- "DIC"
result <- result[ order( result[[sort]] ) , ]
}
}

new( "compareIC" , output=result )
new( "compareIC" , output=result , dSE=dSE.matrix )
}
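
The pointwise bookkeeping is what makes the dSE column possible; as a standalone sketch of the same calculation, with made-up pointwise WAIC vectors standing in for the real ones:

N <- 50
ptw1 <- rnorm( N , -1.2 , 0.3 )   # stand-in pointwise values for one model
ptw2 <- rnorm( N , -1.3 , 0.3 )   # stand-in pointwise values for a second model
sqrt( N * var( ptw1 - ptw2 ) )    # same form as the dSE.matrix entries filled in above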

# plot method for compareIC results shows deviance in and expected deviance out of sample, for each model, ordered top-to-bottom by rank
setMethod("plot" , "compareIC" , function(x,y,...) {
setMethod("plot" , "compareIC" , function(x,y,xlim,SE=TRUE,dSE=TRUE,weights=FALSE,...) {
dev_in <- x@output[[1]] - x@output[[2]]
dev_out <- x@output[[1]]
if ( !is.null(x@output[['SE']]) ) devSE <- x@output[['SE']]
dev_out_lower <- dev_out - devSE
dev_out_upper <- dev_out + devSE
if ( weights==TRUE ) {
dev_in <- ICweights(dev_in)
dev_out <- ICweights(dev_out)
dev_out_lower <- ICweights(dev_out_lower)
dev_out_upper <- ICweights(dev_out_upper)
}
n <- length(dev_in)
dotchart( dev_in[n:1] , labels=rownames(x@output)[n:1] , xlab="deviance" , pch=16 , xlim=c(min(dev_in),max(dev_out)) , ... )
if ( missing(xlim) ) {
xlim <- c(min(dev_in),max(dev_out))
if ( SE==TRUE & !is.null(x@output[['SE']]) ) {
xlim <- c(min(dev_in),max(dev_out_upper))
}
}
main <- colnames(x@output)[1]
set_nice_margins()
dotchart( dev_in[n:1] , labels=rownames(x@output)[n:1] , xlab="deviance" , pch=16 , xlim=xlim , ... )
points( dev_out[n:1] , 1:n )
mtext(main)
# standard errors
if ( !is.null(x@output[['SE']]) & SE==TRUE ) {
for ( i in 1:n ) {
lines( c(dev_out_lower[i],dev_out_upper[i]) , rep(n+1-i,2) , lwd=0.75 )
}
}
if ( !all(is.na(x@dSE)) & dSE==TRUE ) {
# plot differences and stderr of differences
dcol <- col.alpha("black",0.5)
abline( v=dev_out[1] , lwd=0.5 , col=dcol )
diff_dev_lower <- dev_out - x@output$dSE
diff_dev_upper <- dev_out + x@output$dSE
if ( weights==TRUE ) {
diff_dev_lower <- ICweights(diff_dev_lower)
diff_dev_upper <- ICweights(diff_dev_upper)
}
for ( i in 2:n ) {
points( dev_out[i] , n+2-i-0.5 , cex=0.5 , pch=2 , col=dcol )
lines( c(diff_dev_lower[i],diff_dev_upper[i]) , rep(n+2-i-0.5,2) , lwd=0.5 , col=dcol )
}
}
})

# AICc/BIC model comparison table
@@ -154,8 +215,53 @@ ICweights <- function( dev ) {
}

# build ensemble of samples using DIC/WAIC weights
ensemble <- function( ... ) {
ensemble <- function( ... , data , n=1e3 , WAIC=TRUE , refresh=0 ) {
# retrieve list of models
L <- list(...)
if ( is.list(L[[1]]) && length(L)==1 )
L <- L[[1]]
# retrieve model names from function call
mnames <- match.call()
mnames <- as.character(mnames)[2:(length(L)+1)]
ictab <- compare( ... , WAIC=WAIC , refresh=refresh , n=n , sort=FALSE )
rownames(ictab@output) <- mnames
weights <- ictab@output$weight
# generate simulated predictions for each model
# then compose ensemble using weights
if ( missing(data) ) {
link.list <- lapply( L , function(m) link(m , n=n , refresh=refresh ) )
sim.list <- lapply( L , function(m) sim(m , n=n , refresh=refresh ) )
} else {
link.list <- lapply( L , function(m) link(m , data=data , n=n , refresh=refresh ) )
sim.list <- lapply( L , function(m) sim(m , data=data , n=n , refresh=refresh ) )
}
#print(str(sim.list))
# compose weighted predictions
# calculate indices corresponding to each model
idx <- round( weights * n )
idx_start <- rep(1,length(idx))
idx_end <- idx
for ( i in 2:length(idx) ) {
idx_start[i] <- idx_start[i-1] + idx[i-1]
idx_end[i] <- min( idx_end[i-1] + idx[i] , n )
}
#print(idx_start)
#print(idx_end)
link_out <- link.list[[1]]
sim_out <- sim.list[[1]]
for ( i in 1:length(idx) ) {
idxrange <- idx_start[i]:idx_end[i]
link_out[idxrange,] <- link.list[[i]][idxrange,]
sim_out[idxrange,] <- sim.list[[i]][idxrange,]
}
result <- list( link=link_out , sim=sim_out )
names(weights) <- mnames
idxtab <- cbind( idx_start , idx_end )
rownames(idxtab) <- mnames
attr(result,"weights") <- weights
attr(result,"indices") <- idxtab
attr(result,"ictab") <- ictab
result
}
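
The allocation step above is just a partition of the n samples in proportion to the Akaike weights; a small numeric sketch with made-up weights:

n <- 1000
weights <- c( 0.7 , 0.2 , 0.1 )
idx <- round( weights * n )                 # 700 200 100 samples per model
idx_start <- rep( 1 , length(idx) )
idx_end <- idx
for ( i in 2:length(idx) ) {
    idx_start[i] <- idx_start[i-1] + idx[i-1]
    idx_end[i] <- min( idx_end[i-1] + idx[i] , n )
}
cbind( idx_start , idx_end )                # rows: 1-700, 701-900, 901-1000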

# tests
146 changes: 146 additions & 0 deletions R/glimmer.R
@@ -0,0 +1,146 @@
# glimmer
# translate glmer style model formulas into equivalent formula alist suitable for map2stan
# just a stub for now

glimmer <- function( formula , family="Gaussian" , ... ) {

# templates
family_liks <- list(
Gaussian = "dnorm( mu , sigma )",
Binomial = "dbinom( size , p )",
Poisson = "dpois( lambda )"
)

# check input
if ( class(formula)!="formula" ) stop( "Input must be a glmer-style formula." )

f <- formula
flist <- alist()

# get outcome
outcome <- f[[2]]

# build likelihood
flist[[1]] <- as.formula( concat( as.character(outcome) , " ~ " , family_liks[[family]] ) )

# build linear model


}


parse_glimmer_formula <- function( formula , data ) {
## take a formula and parse into fixed effect and varying effect lists
nobars <- function(term) {
if (!('|' %in% all.names(term))) return(term)
if (is.call(term) && term[[1]] == as.name('|')) return(NULL)
if (length(term) == 2) {
nb <- nobars(term[[2]])
if (is.null(nb)) return(NULL)
term[[2]] <- nb
return(term)
}
nb2 <- nobars(term[[2]])
nb3 <- nobars(term[[3]])
if (is.null(nb2)) return(nb3)
if (is.null(nb3)) return(nb2)
term[[2]] <- nb2
term[[3]] <- nb3
term
}
findbars <- function(term) {
if (is.name(term) || !is.language(term)) return(NULL)
if (term[[1]] == as.name("(")) return(findbars(term[[2]]))
if (!is.call(term)) stop("term must be of class call")
if (term[[1]] == as.name('|')) return(term)
if (length(term) == 2) return(findbars(term[[2]]))
c(findbars(term[[2]]), findbars(term[[3]]))
}
subbars <- function(term)
### Substitute the '+' function for the '|' function
{
if (is.name(term) || !is.language(term)) return(term)
if (length(term) == 2) {
term[[2]] <- subbars(term[[2]])
return(term)
}
stopifnot(length(term) >= 3)
if (is.call(term) && term[[1]] == as.name('|'))
term[[1]] <- as.name('+')
for (j in 2:length(term)) term[[j]] <- subbars(term[[j]])
term
}
hasintercept <- function(term) {
attr( terms(term) , "intercept" )==1
}

# find fixed effects list by deleting random effects and expanding
f_nobars <- nobars( formula )
# catch implied intercept error -- happens when right side of formula is only () blocks
if ( class(f_nobars)=="name" & length(f_nobars)==1 ) {
f_nobars <- nobars( as.formula( paste( deparse(formula) , "+ 1" ) ) )
}
fixef <- colnames( model.matrix( f_nobars , data ) )

# convert to all fixed effects and build needed model matrix
mdat <- model.matrix( subbars( formula ) , data )
outcome_name <- deparse( f_nobars[[2]] )
# mdat <- cbind( data[[outcome_name]] , mdat )
# colnames(mdat)[1] <- outcome_name
outcome <- model.frame( f_nobars , data )[,1]
if ( class(outcome)=="matrix" ) {
# fix outcome name
outcome_name <- colnames( outcome )[1]
}

# check for any varying effects
if ( formula == nobars(formula) ) {
# no varying effects
ranef <- list()
} else {
# find varying effects list
var <- findbars( formula )
ranef <- list()
for ( i in 1:length(var) ) {
name <- all.vars( var[[i]][[3]] )

if ( FALSE ) { # check index variables?
if ( TRUE ) {
# check that grouping vars are class integer
if ( class( data[[name]] )!="integer" ) {
stop( paste( "Grouping variables must be integer type. '" , name , "' is instead of type: " , class( data[[name]] ) , "." , sep="" ) )
}
# check that values are contiguous
if ( min(data[[name]]) != 1 )
stop( paste( "Group variable '" , name , "' doesn't start at index 1." , sep="" ) )
ulist <- unique( data[[name]] )
diffs <- ulist[2:length(ulist)] - ulist[1:(length(ulist)-1)]
if ( any(diffs != 1) )
stop( paste( "Group variable '" , name , "' is not contiguous." , sep="" ) )
} else {
# new code to coerce grouping vars to type integer
# first, save the labels by converting to character
#glabels <- as.character( data[[name]] )
# second, coerce to factor then integer to make contiguous integer index
mdat[,name] <- as.integer( as.factor( mdat[,name] ) )
}
}

# parse formula
v <- var[[i]][[2]]
if ( class(v)=="numeric" ) {
# just intercept
ranef[[ name ]] <- "(Intercept)"
} else {
# should be class "call" or "name"
# need to convert formula to model matrix headers
f <- as.formula( paste( "~" , deparse( v ) , sep="" ) )
ranef[[ name ]] <- colnames( model.matrix( f , data ) )
}
}
}

# return sufficient information to build Stan model code
list( y=outcome , yname=outcome_name , fixef=fixef , ranef=ranef , dat=as.data.frame(mdat) )
}
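
Since glimmer() is only a stub, the intended translation is easiest to show by hand. For a glmer-style formula y ~ x + (1 | g) with the Gaussian template above, a map2stan-ready alist of the kind it should eventually emit might look like the following (structure and priors are illustrative placeholders, not produced by this code):

f <- alist(
    y ~ dnorm( mu , sigma ),
    mu <- a + a_g[g] + b*x,
    a ~ dnorm( 0 , 10 ),
    b ~ dnorm( 0 , 10 ),
    a_g[g] ~ dnorm( 0 , sigma_g ),
    sigma_g ~ dcauchy( 0 , 2 ),
    sigma ~ dcauchy( 0 , 2 )
)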

24 changes: 22 additions & 2 deletions R/map-predict.r
@@ -24,6 +24,13 @@ function( fit , data , n=1000 , post , probs=NULL , refresh=0.1 , flatten=TRUE ,
n <- length(post[[1]])

nlm <- length(fit@links)
f_do_lm <- TRUE
if ( nlm==0 ) {
# no linear models!
# so need to flag to skip lm insertions into eval environment for ll
nlm <- 1
f_do_lm <- FALSE
}

link_out <- vector(mode="list",length=nlm)

@@ -32,8 +39,21 @@
ref_inc <- floor(n*refresh)
ref_next <- ref_inc

parout <- fit@links[[i]][[1]]
lm <- fit@links[[i]][[2]]
if ( f_do_lm==TRUE ) {
parout <- fit@links[[i]][[1]]
lm <- fit@links[[i]][[2]]
} else {
parout <- "ll"
lm <- "0"
# hacky solution -- find density function and insert whatever expression in typical link spot
flik <- as.character(fit@formula[[1]][[3]][[1]])
# mu for Gaussian
if ( flik=="dnorm" ) lm <- as.character( fit@formula[[1]][[3]][[2]] )
# p for binomial -- assume in third spot, after size
if ( flik=="dbinom" ) lm <- as.character( fit@formula[[1]][[3]][[3]] )
# lambda for poisson
if ( flik=="dpois" ) lm <- as.character( fit@formula[[1]][[3]][[2]] )
}
# empty matrix to hold samples-by-cases values of linear model
value <- matrix(NA,nrow=n,ncol=length(data[[1]]))
# for each sample
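
The formula surgery above can be checked in isolation with a stand-in for fit@formula (hypothetical, not package code):

f <- alist( y ~ dnorm( mu , sigma ) )      # stand-in for fit@formula
as.character( f[[1]][[3]][[1]] )           # "dnorm" -- the density name (flik)
as.character( f[[1]][[3]][[2]] )           # "mu" -- the expression used in place of a linear model
g <- alist( y ~ dbinom( size , p ) )
as.character( g[[1]][[3]][[3]] )           # "p" -- third slot for binomial, after size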
6 changes: 4 additions & 2 deletions R/map2stan-class.r
@@ -121,8 +121,10 @@ setMethod("show", "map2stan", function(object){

if ( !is.null(attr(object,"WAIC")) ) {
waic <- attr(object,"WAIC")
cat("\nWAIC: ")
cat( round(as.numeric(waic),2) , "\n" )
use_waic <- waic
if ( length(waic)>1 ) use_waic <- (-2)*sum(waic)
cat("\nWAIC (SE): ")
cat( concat(round(as.numeric(use_waic),2) , " (" , round(as.numeric(attr(waic,"se")),1) , ")" , "\n" ) )

cat("pWAIC: ")
cat( round(as.numeric(attr(waic,"pWAIC")),2) , "\n" )