1.41 - maintenance and Chapter 6 revision
- added data(Dinosaurs) and its man page
- compare() defaults to WAIC now and displays stderr
- compare plot method contains more swag
- added ensemble() to construct ensemble predictions from WAIC/DIC weights (see the usage sketch after this list)
- added mcreplicate(), a multicore replicate() built on mclapply
- various man page updates to keep up with code
- fix for simulating from map models without linear models
- map2stan defaults to computing WAIC after sampling
- sim.train.test() added, with man page
- sim method for map updated to handle pointwise loglik for WAIC calculation when ll=TRUE
- WAIC methods updated to perform pointwise calculations, allowing computation of stderr and stderr of differences in WAIC
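
For orientation, a minimal usage sketch of the revised compare() and the new ensemble() (data, model names, and priors below are illustrative placeholders, not taken from the package):

library(rethinking)
d <- data.frame( y = rnorm( 50 , 2 , 1 ) , x = rnorm( 50 ) )
m1 <- map( alist( y ~ dnorm(mu,sigma) , mu <- a , a ~ dnorm(0,10) , sigma ~ dunif(0,10) ) , data=d )
m2 <- map( alist( y ~ dnorm(mu,sigma) , mu <- a + b*x , a ~ dnorm(0,10) , b ~ dnorm(0,10) , sigma ~ dunif(0,10) ) , data=d )
ct <- compare( m1 , m2 )            # WAIC by default, with pWAIC, dWAIC, weight, SE, and dSE columns
plot( ct )                          # in-sample deviance (filled) and WAIC (open), with SE segments
e <- ensemble( m1 , m2 , data=d )   # link() and sim() predictions mixed in proportion to Akaike weights
attr( e , "weights" )               # the weights used to partition the samples among models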
Richard McElreath committed Sep 15, 2014
1 parent b1a47b5 commit 100b125
Showing 22 changed files with 734 additions and 59 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
@@ -1,7 +1,7 @@
Package: rethinking
Type: Package
Title: Statistical Rethinking book package
Version: 1.4
Version: 1.41
Date: 2014-09-02
Author: Richard McElreath
Maintainer: Richard McElreath <[email protected]>
132 changes: 119 additions & 13 deletions R/compare.r
@@ -1,14 +1,15 @@
# compare

# compare class definition and show method
setClass( "compareIC" , representation( output="data.frame" ) )
setClass( "compareIC" , representation( output="data.frame" , dSE="matrix" ) )

compare.show <- function( object ) {
print( round( object@output , 2 ) )
}
setMethod( "show" , "compareIC" , function(object) compare.show(object) )

# new compare function, defaulting to DIC
compare <- function( ... , n=1e3 , sort="DIC" , WAIC=FALSE ) {
# new compare function, defaulting to WAIC
compare <- function( ... , n=1e3 , sort="WAIC" , WAIC=TRUE , refresh=0 ) {
# retrieve list of models
L <- list(...)
if ( is.list(L[[1]]) && length(L)==1 )
@@ -18,14 +19,28 @@ compare <- function( ... , n=1e3 , sort="DIC" , WAIC=FALSE ) {
mnames <- match.call()
mnames <- as.character(mnames)[2:(length(L)+1)]

dSE.matrix <- matrix( NA , nrow=length(L) , ncol=length(L) )
if ( WAIC==FALSE ) {
DIC.list <- lapply( L , function(z) DIC( z , n=n ) )
pD.list <- sapply( DIC.list , function(x) attr(x,"pD") )
} else {
# use WAIC instead of DIC
WAIC.list <- lapply( L , function(z) WAIC( z , n=n ) )
# WAIC is processed pointwise, so can compute SE of differences, and summed later
WAIC.list <- lapply( L , function(z) WAIC( z , n=n , refresh=refresh , pointwise=TRUE ) )
pD.list <- sapply( WAIC.list , function(x) attr(x,"pWAIC") )
DIC.list <- WAIC.list
se.list <- sapply( WAIC.list , function(x) attr(x,"se") )
DIC.list <- (-2)*sapply( WAIC.list , sum )
# compute SE of differences between adjacent models from top to bottom in ranking
colnames(dSE.matrix) <- mnames
rownames(dSE.matrix) <- mnames
for ( i in 1:(length(L)-1) ) {
for ( j in (i+1):length(L) ) {
waic_ptw1 <- WAIC.list[[i]]
waic_ptw2 <- WAIC.list[[j]]
dSE.matrix[i,j] <- as.numeric( sqrt( length(waic_ptw1)*var( waic_ptw1 - waic_ptw2 ) ) )
dSE.matrix[j,i] <- dSE.matrix[i,j]
}#j
}#i
}

DIC.list <- unlist(DIC.list)
@@ -35,26 +50,72 @@

if ( WAIC==FALSE )
result <- data.frame( DIC=DIC.list , pD=pD.list , dDIC=dDIC , weight=w.DIC )
else
result <- data.frame( WAIC=DIC.list , pWAIC=pD.list , dWAIC=dDIC , weight=w.DIC )
else {
# find out which model has dWAIC==0
topm <- which( dDIC==0 )
dSEcol <- dSE.matrix[,topm]
result <- data.frame( WAIC=DIC.list , pWAIC=pD.list , dWAIC=dDIC ,
weight=w.DIC , SE=se.list , dSE=dSEcol )
}

rownames(result) <- mnames

if ( !is.null(sort) ) {
if ( WAIC==TRUE & sort=="DIC" ) sort <- "WAIC"
result <- result[ order( result[[sort]] ) , ]
if ( sort!=FALSE ) {
if ( WAIC==FALSE & sort=="WAIC" ) sort <- "DIC"
result <- result[ order( result[[sort]] ) , ]
}
}

new( "compareIC" , output=result )
new( "compareIC" , output=result , dSE=dSE.matrix )
}
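
The pointwise bookkeeping is what makes the dSE column possible; as a standalone sketch of the same calculation, with made-up pointwise WAIC vectors standing in for the real ones:

N <- 50
ptw1 <- rnorm( N , -1.2 , 0.3 )   # stand-in pointwise values for one model
ptw2 <- rnorm( N , -1.3 , 0.3 )   # stand-in pointwise values for a second model
sqrt( N * var( ptw1 - ptw2 ) )    # same form as the dSE.matrix entries filled in above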

# plot method for compareIC results shows deviance in and expected deviance out of sample, for each model, ordered top-to-bottom by rank
setMethod("plot" , "compareIC" , function(x,y,...) {
setMethod("plot" , "compareIC" , function(x,y,xlim,SE=TRUE,dSE=TRUE,weights=FALSE,...) {
dev_in <- x@output[[1]] - x@output[[2]]
dev_out <- x@output[[1]]
if ( !is.null(x@output[['SE']]) ) devSE <- x@output[['SE']]
dev_out_lower <- dev_out - devSE
dev_out_upper <- dev_out + devSE
if ( weights==TRUE ) {
dev_in <- ICweights(dev_in)
dev_out <- ICweights(dev_out)
dev_out_lower <- ICweights(dev_out_lower)
dev_out_upper <- ICweights(dev_out_upper)
}
n <- length(dev_in)
dotchart( dev_in[n:1] , labels=rownames(x@output)[n:1] , xlab="deviance" , pch=16 , xlim=c(min(dev_in),max(dev_out)) , ... )
if ( missing(xlim) ) {
xlim <- c(min(dev_in),max(dev_out))
if ( SE==TRUE & !is.null(x@output[['SE']]) ) {
xlim <- c(min(dev_in),max(dev_out_upper))
}
}
main <- colnames(x@output)[1]
set_nice_margins()
dotchart( dev_in[n:1] , labels=rownames(x@output)[n:1] , xlab="deviance" , pch=16 , xlim=xlim , ... )
points( dev_out[n:1] , 1:n )
mtext(main)
# standard errors
if ( !is.null(x@output[['SE']]) & SE==TRUE ) {
for ( i in 1:n ) {
lines( c(dev_out_lower[i],dev_out_upper[i]) , rep(n+1-i,2) , lwd=0.75 )
}
}
if ( !all(is.na(x@dSE)) & dSE==TRUE ) {
# plot differences and stderr of differences
dcol <- col.alpha("black",0.5)
abline( v=dev_out[1] , lwd=0.5 , col=dcol )
diff_dev_lower <- dev_out - x@output$dSE
diff_dev_upper <- dev_out + x@output$dSE
if ( weights==TRUE ) {
diff_dev_lower <- ICweights(diff_dev_lower)
diff_dev_upper <- ICweights(diff_dev_upper)
}
for ( i in 2:n ) {
points( dev_out[i] , n+2-i-0.5 , cex=0.5 , pch=2 , col=dcol )
lines( c(diff_dev_lower[i],diff_dev_upper[i]) , rep(n+2-i-0.5,2) , lwd=0.5 , col=dcol )
}
}
})

# AICc/BIC model comparison table
@@ -154,8 +215,53 @@ ICweights <- function( dev ) {
}

# build ensemble of samples using DIC/WAIC weights
ensemble <- function( ... ) {
ensemble <- function( ... , data , n=1e3 , WAIC=TRUE , refresh=0 ) {
# retrieve list of models
L <- list(...)
if ( is.list(L[[1]]) && length(L)==1 )
L <- L[[1]]
# retrieve model names from function call
mnames <- match.call()
mnames <- as.character(mnames)[2:(length(L)+1)]
ictab <- compare( ... , WAIC=WAIC , refresh=refresh , n=n , sort=FALSE )
rownames(ictab@output) <- mnames
weights <- ictab@output$weight
# generate simulated predictions for each model
# then compose ensemble using weights
if ( missing(data) ) {
link.list <- lapply( L , function(m) link(m , n=n , refresh=refresh ) )
sim.list <- lapply( L , function(m) sim(m , n=n , refresh=refresh ) )
} else {
link.list <- lapply( L , function(m) link(m , data=data , n=n , refresh=refresh ) )
sim.list <- lapply( L , function(m) sim(m , data=data , n=n , refresh=refresh ) )
}
#print(str(sim.list))
# compose weighted predictions
# calculate indices corresponding to each model
idx <- round( weights * n )
idx_start <- rep(1,length(idx))
idx_end <- idx
for ( i in 2:length(idx) ) {
idx_start[i] <- idx_start[i-1] + idx[i-1]
idx_end[i] <- min( idx_end[i-1] + idx[i] , n )
}
#print(idx_start)
#print(idx_end)
link_out <- link.list[[1]]
sim_out <- sim.list[[1]]
for ( i in 1:length(idx) ) {
idxrange <- idx_start[i]:idx_end[i]
link_out[idxrange,] <- link.list[[i]][idxrange,]
sim_out[idxrange,] <- sim.list[[i]][idxrange,]
}
result <- list( link=link_out , sim=sim_out )
names(weights) <- mnames
idxtab <- cbind( idx_start , idx_end )
rownames(idxtab) <- mnames
attr(result,"weights") <- weights
attr(result,"indices") <- idxtab
attr(result,"ictab") <- ictab
result
}
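
The allocation step above is just a partition of the n samples in proportion to the Akaike weights; a small numeric sketch with made-up weights:

n <- 1000
weights <- c( 0.7 , 0.2 , 0.1 )
idx <- round( weights * n )                 # 700 200 100 samples per model
idx_start <- rep( 1 , length(idx) )
idx_end <- idx
for ( i in 2:length(idx) ) {
    idx_start[i] <- idx_start[i-1] + idx[i-1]
    idx_end[i] <- min( idx_end[i-1] + idx[i] , n )
}
cbind( idx_start , idx_end )                # rows: 1-700, 701-900, 901-1000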

# tests
146 changes: 146 additions & 0 deletions R/glimmer.R
@@ -0,0 +1,146 @@
# glimmer
# translate glmer style model formulas into equivalent formula alist suitable for map2stan
# just a stub for now

glimmer <- function( formula , family="Gaussian" , ... ) {

# templates
family_liks <- list(
Gaussian = "dnorm( mu , sigma )",
Binomial = "dbinom( size , p )",
Poisson = "dpois( lambda )"
)

# check input
if ( class(formula)!="formula" ) stop( "Input must be a glmer-style formula." )

f <- formula
flist <- alist()

# get outcome
outcome <- f[[2]]

# build likelihood
flist[[1]] <- as.formula( concat( as.character(outcome) , " ~ " , family_liks[[family]] ) )

# build linear model


}


parse_glimmer_formula <- function( formula , data ) {
## take a formula and parse into fixed effect and varying effect lists
nobars <- function(term) {
if (!('|' %in% all.names(term))) return(term)
if (is.call(term) && term[[1]] == as.name('|')) return(NULL)
if (length(term) == 2) {
nb <- nobars(term[[2]])
if (is.null(nb)) return(NULL)
term[[2]] <- nb
return(term)
}
nb2 <- nobars(term[[2]])
nb3 <- nobars(term[[3]])
if (is.null(nb2)) return(nb3)
if (is.null(nb3)) return(nb2)
term[[2]] <- nb2
term[[3]] <- nb3
term
}
findbars <- function(term) {
if (is.name(term) || !is.language(term)) return(NULL)
if (term[[1]] == as.name("(")) return(findbars(term[[2]]))
if (!is.call(term)) stop("term must be of class call")
if (term[[1]] == as.name('|')) return(term)
if (length(term) == 2) return(findbars(term[[2]]))
c(findbars(term[[2]]), findbars(term[[3]]))
}
subbars <- function(term)
### Substitute the '+' function for the '|' function
{
if (is.name(term) || !is.language(term)) return(term)
if (length(term) == 2) {
term[[2]] <- subbars(term[[2]])
return(term)
}
stopifnot(length(term) >= 3)
if (is.call(term) && term[[1]] == as.name('|'))
term[[1]] <- as.name('+')
for (j in 2:length(term)) term[[j]] <- subbars(term[[j]])
term
}
hasintercept <- function(term) {
attr( terms(term) , "intercept" )==1
}

# find fixed effects list by deleting random effects and expanding
f_nobars <- nobars( formula )
# catch implied intercept error -- happens when right side of formula is only () blocks
if ( class(f_nobars)=="name" & length(f_nobars)==1 ) {
f_nobars <- nobars( as.formula( paste( deparse(formula) , "+ 1" ) ) )
}
fixef <- colnames( model.matrix( f_nobars , data ) )

# convert to all fixed effects and build needed model matrix
mdat <- model.matrix( subbars( formula ) , data )
outcome_name <- deparse( f_nobars[[2]] )
# mdat <- cbind( data[[outcome_name]] , mdat )
# colnames(mdat)[1] <- outcome_name
outcome <- model.frame( f_nobars , data )[,1]
if ( class(outcome)=="matrix" ) {
# fix outcome name
outcome_name <- colnames( outcome )[1]
}

# check for any varying effects
if ( formula == nobars(formula) ) {
# no varying effects
ranef <- list()
} else {
# find varying effects list
var <- findbars( formula )
ranef <- list()
for ( i in 1:length(var) ) {
name <- all.vars( var[[i]][[3]] )

if ( FALSE ) { # check index variables?
if ( TRUE ) {
# check that grouping vars are class integer
if ( class( data[[name]] )!="integer" ) {
stop( paste( "Grouping variables must be integer type. '" , name , "' is instead of type: " , class( data[[name]] ) , "." , sep="" ) )
}
# check that values are contiguous
if ( min(data[[name]]) != 1 )
stop( paste( "Group variable '" , name , "' doesn't start at index 1." , sep="" ) )
ulist <- unique( data[[name]] )
diffs <- ulist[2:length(ulist)] - ulist[1:(length(ulist)-1)]
if ( any(diffs != 1) )
stop( paste( "Group variable '" , name , "' is not contiguous." , sep="" ) )
} else {
# new code to coerce grouping vars to type integer
# first, save the labels by converting to character
#glabels <- as.character( data[[name]] )
# second, coerce to factor then integer to make contiguous integer index
mdat[,name] <- as.integer( as.factor( mdat[,name] ) )
}
}

# parse formula
v <- var[[i]][[2]]
if ( class(v)=="numeric" ) {
# just intercept
ranef[[ name ]] <- "(Intercept)"
} else {
# should be class "call" or "name"
# need to convert formula to model matrix headers
f <- as.formula( paste( "~" , deparse( v ) , sep="" ) )
ranef[[ name ]] <- colnames( model.matrix( f , data ) )
}
}
}

# return sufficient information to build Stan model code
list( y=outcome , yname=outcome_name , fixef=fixef , ranef=ranef , dat=as.data.frame(mdat) )
}
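
Since glimmer() is only a stub, the intended translation is easiest to show by hand. For a glmer-style formula y ~ x + (1 | g) with the Gaussian template above, a map2stan-ready alist of the kind it should eventually emit might look like the following (structure and priors are illustrative placeholders, not produced by this code):

f <- alist(
    y ~ dnorm( mu , sigma ),
    mu <- a + a_g[g] + b*x,
    a ~ dnorm( 0 , 10 ),
    b ~ dnorm( 0 , 10 ),
    a_g[g] ~ dnorm( 0 , sigma_g ),
    sigma_g ~ dcauchy( 0 , 2 ),
    sigma ~ dcauchy( 0 , 2 )
)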

24 changes: 22 additions & 2 deletions R/map-predict.r
@@ -24,6 +24,13 @@ function( fit , data , n=1000 , post , probs=NULL , refresh=0.1 , flatten=TRUE ,
n <- length(post[[1]])

nlm <- length(fit@links)
f_do_lm <- TRUE
if ( nlm==0 ) {
# no linear models!
# so need to flag to skip lm insertions into eval environment for ll
nlm <- 1
f_do_lm <- FALSE
}

link_out <- vector(mode="list",length=nlm)

@@ -32,8 +39,21 @@
ref_inc <- floor(n*refresh)
ref_next <- ref_inc

parout <- fit@links[[i]][[1]]
lm <- fit@links[[i]][[2]]
if ( f_do_lm==TRUE ) {
parout <- fit@links[[i]][[1]]
lm <- fit@links[[i]][[2]]
} else {
parout <- "ll"
lm <- "0"
# hacky solution -- find density function and insert whatever expression in typical link spot
flik <- as.character(fit@formula[[1]][[3]][[1]])
# mu for Gaussian
if ( flik=="dnorm" ) lm <- as.character( fit@formula[[1]][[3]][[2]] )
# p for binomial -- assume in third spot, after size
if ( flik=="dbinom" ) lm <- as.character( fit@formula[[1]][[3]][[3]] )
# lambda for poisson
if ( flik=="dpois" ) lm <- as.character( fit@formula[[1]][[3]][[2]] )
}
# empty matrix to hold samples-by-cases values of linear model
value <- matrix(NA,nrow=n,ncol=length(data[[1]]))
# for each sample
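
The formula surgery above can be checked in isolation with a stand-in for fit@formula (hypothetical, not package code):

f <- alist( y ~ dnorm( mu , sigma ) )      # stand-in for fit@formula
as.character( f[[1]][[3]][[1]] )           # "dnorm" -- the density name (flik)
as.character( f[[1]][[3]][[2]] )           # "mu" -- the expression used in place of a linear model
g <- alist( y ~ dbinom( size , p ) )
as.character( g[[1]][[3]][[3]] )           # "p" -- third slot for binomial, after size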
6 changes: 4 additions & 2 deletions R/map2stan-class.r
@@ -121,8 +121,10 @@ setMethod("show", "map2stan", function(object){

if ( !is.null(attr(object,"WAIC")) ) {
waic <- attr(object,"WAIC")
cat("\nWAIC: ")
cat( round(as.numeric(waic),2) , "\n" )
use_waic <- waic
if ( length(waic)>1 ) use_waic <- (-2)*sum(waic)
cat("\nWAIC (SE): ")
cat( concat(round(as.numeric(use_waic),2) , " (" , round(as.numeric(attr(waic,"se")),1) , ")" , "\n" ) )

cat("pWAIC: ")
cat( round(as.numeric(attr(waic,"pWAIC")),2) , "\n" )