1.92 test test

- adding tests for book code - WAIC/PSIS refactored to return matrices - added Laffer data example - sim() method for ulam() refactored - still needs some tests
rmcelreath · Nov 11, 2019 · f1a47a3 · f1a47a3
1 parent 2e01a81
commit f1a47a3
Show file tree

Hide file tree

Showing 27 changed files with 1,705 additions and 158 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -1,11 +1,12 @@
 Package: rethinking
 Type: Package
 Title: Statistical Rethinking book package
-Version: 1.91
+Version: 1.92
 Date: 2019-10-25
 Author: Richard McElreath
 Maintainer: Richard McElreath <[email protected]>
 Imports: coda, MASS, mvtnorm, loo, shape
 Depends: rstan (>= 2.10.0), parallel, methods, stats, graphics, dagitty
+Suggests: testthat
 Description: Utilities for fitting and comparing models
 License: GPL (>= 3)
diff --git a/R/ICnumeric_class.R b/R/ICnumeric_class.R
@@ -0,0 +1,11 @@
+# IC class and show method
+
+setClass( "ICnumeric" , contains="numeric" )
+
+setMethod( "show" , "ICnumeric" , function(object) {
+    cat( round( object , 1 ) )
+    if ( !is.null( attr(object,"se") ) ) {
+        cat( concat( " ±" , round( attr(object,"se") , 1 ) ) )
+    }
+    cat("\n")
+} )
diff --git a/R/compare.r b/R/compare.r
@@ -14,7 +14,7 @@ setMethod( "show" , "compareIC" , function(object) {
 } )
 
 # new compare function, defaulting to WAIC
-compare <- function( ... , n=1e3 , sort="WAIC" , func=WAIC , WAIC=TRUE , refresh=0 ) {
+compare <- function( ... , n=1e3 , sort="WAIC" , func=WAIC , WAIC=TRUE , refresh=0 , warn=TRUE ) {
 
     # retrieve list of models
     L <- list(...)
@@ -32,65 +32,48 @@ compare <- function( ... , n=1e3 , sort="WAIC" , func=WAIC , WAIC=TRUE , refresh
 
     # check class of fit models and warn when more than one class represented
     classes <- as.character(sapply( L , class ))
-    if ( any(classes!=classes[1]) ) {
+    if ( any(classes!=classes[1]) & warn==TRUE ) {
         warning("Not all model fits of same class.\nThis is usually a bad idea, because it implies they were fit by different algorithms.\nCheck yourself, before you wreck yourself.")
     }
 
     # check nobs for all models
     # if different, warn
     nobs_list <- try( sapply( L , nobs ) )
-    if ( any(nobs_list != nobs_list[1]) ) {
+    if ( any(nobs_list != nobs_list[1]) & warn==TRUE ) {
         nobs_out <- paste( mnames , nobs_list , "\n" )
         nobs_out <- concat(nobs_out)
         warning(concat(
-            "Different numbers of observations found for at least two models.\nInformation criteria only valid for comparing models fit to exactly same observations.\nNumber of observations for each model:\n",nobs_out))
+            "Different numbers of observations found for at least two models.\nModel comparison is valid only for models fit to exactly the same observations.\nNumber of observations for each model:\n",nobs_out))
     }
 
     dSE.matrix <- matrix( NA , nrow=length(L) , ncol=length(L) )
     # deprecate WAIC==TRUE/FALSE flag
     # catch it and convert to func intent
     if ( WAIC==FALSE ) func <- DIC # assume old code that wants DIC
+
     if ( the_func=="DIC" ) {
         IC.list <- lapply( L , function(z) DIC( z , n=n ) )
         p.list <- sapply( IC.list , function(x) attr(x,"pD") )
     }
-    if ( the_func=="WAIC" ) {
-        # use WAIC
-        # WAIC is processed pointwise, so can compute SE of differences, and summed later
-        WAIC.list <- lapply( L , function(z) WAIC( z , n=n , refresh=refresh , pointwise=TRUE ) )
-        p.list <- sapply( WAIC.list , function(x) sum(attr(x,"pWAIC")) )
-        se.list <- sapply( WAIC.list , function(x) attr(x,"se") )
-        IC.list <- sapply( WAIC.list , sum )
-        # compute SE of differences between adjacent models from top to bottom in ranking
-        colnames(dSE.matrix) <- mnames
-        rownames(dSE.matrix) <- mnames
-        for ( i in 1:(length(L)-1) ) {
-            for ( j in (i+1):length(L) ) {
-                waic_ptw1 <- WAIC.list[[i]]
-                waic_ptw2 <- WAIC.list[[j]]
-                dSE.matrix[i,j] <- as.numeric( sqrt( length(waic_ptw1)*var( waic_ptw1 - waic_ptw2 ) ) )
-                dSE.matrix[j,i] <- dSE.matrix[i,j]
-            }#j
-        }#i
-    }
-    if ( the_func=="LOO" | the_func=="PSIS" ) {
-        # use LOO
-        LOO.list <- lapply( L , function(z) PSIS( z , n=n , refresh=refresh , pointwise=TRUE ) )
-        p.list <- sapply( LOO.list , function(x) sum(attr(x,"pPSIS")) )
-        se.list <- sapply( LOO.list , function(x) attr(x,"se") )
-        IC.list <- sapply( LOO.list , sum )
+
+    if ( the_func %in% c("WAIC","PSIS","LOO") ) {
+        IC.list.pw <- lapply( L , function(z) do.call( the_func , list( z , n=n , refresh=refresh , pointwise=TRUE , warn=warn ) ) )
+        p.list <- sapply( IC.list.pw , function(x) sum( x$penalty ) )
+        se.list <- sapply( IC.list.pw , function(x) x$std_err[1] )
+        IC.list <- sapply( IC.list.pw , function(x) sum( x[[1]] ) )
         # compute SE of differences between adjacent models from top to bottom in ranking
         colnames(dSE.matrix) <- mnames
         rownames(dSE.matrix) <- mnames
         for ( i in 1:(length(L)-1) ) {
             for ( j in (i+1):length(L) ) {
-                loo_ptw1 <- LOO.list[[i]]
-                loo_ptw2 <- LOO.list[[j]]
-                dSE.matrix[i,j] <- as.numeric( sqrt( length(loo_ptw1)*var( loo_ptw1 - loo_ptw2 ) ) )
+                ic_ptw1 <- IC.list.pw[[i]][[1]]
+                ic_ptw2 <- IC.list.pw[[j]][[1]]
+                dSE.matrix[i,j] <- as.numeric( sqrt( length(ic_ptw1)*var( ic_ptw1 - ic_ptw2 ) ) )
                 dSE.matrix[j,i] <- dSE.matrix[i,j]
             }#j
         }#i
     }
+
     if ( !(the_func %in% c("DIC","WAIC","LOO","PSIS")) ) {
         # unrecognized IC function; just wing it
         IC.list <- lapply( L , function(z) func( z ) )
@@ -103,6 +86,7 @@ compare <- function( ... , n=1e3 , sort="WAIC" , func=WAIC , WAIC=TRUE , refresh
 
     if ( the_func=="DIC" )
         result <- data.frame( DIC=IC.list , pD=p.list , dDIC=dIC , weight=w.IC )
+
     if ( the_func=="WAIC" ) {
         # find out which model has dWAIC==0
         topm <- which( dIC==0 )
@@ -116,6 +100,7 @@ compare <- function( ... , n=1e3 , sort="WAIC" , func=WAIC , WAIC=TRUE , refresh
         result <- data.frame( PSIS=IC.list , pPSIS=p.list , dPSIS=dIC , 
                               weight=w.IC , SE=se.list , dSE=dSEcol )
     }
+
     if ( !(the_func %in% c("DIC","WAIC","LOO","PSIS")) ) {
         result <- data.frame( IC=IC.list , dIC=dIC , weight=w.IC )
     }

diff --git a/R/sim.r b/R/sim.r
@@ -6,55 +6,11 @@ function( fit , data , n=1000 , ... ) {
 }
 )
 
-setMethod("sim", "map",
-function( fit , data , n=1000 , post , vars , ll=FALSE , refresh=0 , replace=list() , debug=FALSE , ... ) {
-    # when ll=FALSE, simulates sampling, one sample for each sample in posterior
-    # when ll=TRUE, computes loglik of each observation, for each sample in posterior
-
-    # vars argument optional
-    # when missing, simulations only first observed variable in formula
-    # when present, should be a character vector naming variables to simulate, in order to simulate them
-    # order matters, because some may be needed for the others to simulate right
-
-    ########################################
-    # check arguments
-    if ( class(fit)!="map" ) stop("Requires map/quap fit")
-    if ( missing(data) ) {
-        data <- fit@data
-    } else {
-        # make sure it's a list, otherwise can't hold sample matrices from sims
-        data <- as.list(data)
-        # check for vars to sim that are not in data list
-        if ( !missing(vars) ) {
-            for ( i in 1:length(vars) ) {
-                if ( !(vars[i] %in% names(data)) ) {
-                    # insert dummy for var - will get simulated over later
-                    data[[ vars[i] ]] <- rep( 0 , length(data[[1]]) )
-                }
-            }#i
-        }
-    }
-
-    if ( missing(post) ) {
-        post <- extract.samples(fit,n=n)
-    } else {
-        n <- dim(post[[1]])[1]
-        if ( is.null(n) ) n <- length(post[[1]])
-    }
-
-    # variables to sim
-    if ( missing(vars) ) {
-        # extract likelihood, assuming it is first element of formula
-        lik <- flist_untag(fit@formula)[[1]]
-        # discover outcome
-        vars <- as.character(lik[[2]])
-    } else {
-        # vars listed
-        if ( debug==TRUE ) print(vars)
-    }
+# function at heart of all sim methods
+sim_core <- function( fit , data , post , vars , n , refresh=0 , replace=list() , debug=FALSE , ll=FALSE , ... ) {
 
-    # loop over vars
     sim_vars <- list()
+
     for ( var in vars ) {
 
         # find a distribtional assignment with var on left
@@ -206,6 +162,60 @@ function( fit , data , n=1000 , post , vars , ll=FALSE , refresh=0 , replace=lis
         data[[ var ]] <- sim_out # make available to subsequent vars
 
     }#var
+
+    return(sim_vars)
+
+}
+
+setMethod("sim", "map",
+function( fit , data , n=1000 , post , vars , ll=FALSE , refresh=0 , replace=list() , debug=FALSE , ... ) {
+    # when ll=FALSE, simulates sampling, one sample for each sample in posterior
+    # when ll=TRUE, computes loglik of each observation, for each sample in posterior
+
+    # vars argument optional
+    # when missing, simulations only first observed variable in formula
+    # when present, should be a character vector naming variables to simulate, in order to simulate them
+    # order matters, because some may be needed for the others to simulate right
+
+    ########################################
+    # check arguments
+    if ( class(fit)!="map" ) stop("Requires map/quap fit")
+    if ( missing(data) ) {
+        data <- fit@data
+    } else {
+        # make sure it's a list, otherwise can't hold sample matrices from sims
+        data <- as.list(data)
+        # check for vars to sim that are not in data list
+        if ( !missing(vars) ) {
+            for ( i in 1:length(vars) ) {
+                if ( !(vars[i] %in% names(data)) ) {
+                    # insert dummy for var - will get simulated over later
+                    data[[ vars[i] ]] <- rep( 0 , length(data[[1]]) )
+                }
+            }#i
+        }
+    }
+
+    if ( missing(post) ) {
+        post <- extract.samples(fit,n=n)
+    } else {
+        n <- dim(post[[1]])[1]
+        if ( is.null(n) ) n <- length(post[[1]])
+    }
+
+    # variables to sim
+    if ( missing(vars) ) {
+        # extract likelihood, assuming it is first element of formula
+        lik <- flist_untag(fit@formula)[[1]]
+        # discover outcome
+        vars <- as.character(lik[[2]])
+    } else {
+        # vars listed
+        if ( debug==TRUE ) print(vars)
+    }
+
+    # loop over vars
+    sim_vars <- sim_core( fit=fit , data=data , post=post , vars=vars , n=n , refresh=refresh , replace=replace , debug=debug , ll=ll , ... )
 
     # result
     if ( length(sim_vars)==1 ) sim_vars <- sim_vars[[1]]

diff --git a/R/ulam-function.R b/R/ulam-function.R
@@ -28,9 +28,17 @@ ulam <- function( flist , data , pars , pars_omit , start , chains=1 , cores=1 ,
         # pre-scan for character variables and remove
         x_to_remove <- c()
         for ( i in 1:length(data) ) {
-            if ( class(data[[i]]) %in% c("character","factor") ) {
+            if ( class(data[[i]]) =="character" ) {
                 x_to_remove <- c( x_to_remove , names(data)[i] )
             }
+            if ( class(data[[i]]) =="factor" ) {
+                if ( coerce_int==FALSE )
+                    x_to_remove <- c( x_to_remove , names(data)[i] )
+                else {
+                    # coerce factor to index variable
+                    data[[i]] <- as.integer( data[[i]] )
+                }
+            }
         }#i
         if ( length(x_to_remove)>0 ) {
             if ( messages==TRUE ) {
@@ -43,7 +51,7 @@ ulam <- function( flist , data , pars , pars_omit , start , chains=1 , cores=1 ,
         # pre-scan for index variables (integer) that are numeric by accident
         for ( i in 1:length(data) ) {
             if ( class(data[[i]])!="character" ) {
-                if ( all( as.integer(data[[i]])==data[[i]] ) ) {
+                if ( all( as.integer(data[[i]])==data[[i]] , na.rm=TRUE ) ) {
                     #data[[i]] <- as.integer(data[[i]])
                     if ( class(data[[i]])!="integer" & class(data[[i]])!="matrix" ) {
                         if ( coerce_int==TRUE ) {

diff --git a/R/ulam-link-sim.R b/R/ulam-link-sim.R
@@ -102,7 +102,7 @@ setMethod( "link" , "ulam" , function(fit ,data ,...) link_ulam(fit,data,...) )
 
 # sim method assumes outcome var is first var on left in formula line 1
 # can use variable argument to specify any other variable
-sim_ulam <- function( fit , data , post , variable , n=1000 , replace=list() , ... ) {
+sim_ulam <- function( fit , data , post , vars , variable , n=1000 , replace=list() , ... ) {
 
     ########################################
     # check arguments
@@ -211,6 +211,61 @@ sim_ulam <- function( fit , data , post , variable , n=1000 , replace=list() , .
 
     return(sim_out)
 }
-setMethod( "sim" , "ulam" , function(fit,data,...) sim_ulam(fit,data,...) )
+
+sim_ulam_new <- function( fit , data , post , vars , variable , n=1000 , replace=list() , debug=FALSE , ll=FALSE , refresh=0 , ... ) {
+
+    ########################################
+    # check arguments
+    if ( missing(data) ) {
+        data <- fit@data
+    } else {
+        # make sure it's a list, otherwise can't hold sample matrices from sims
+        data <- as.list(data)
+        # check for vars to sim that are not in data list
+        if ( !missing(vars) ) {
+            for ( i in 1:length(vars) ) {
+                if ( !(vars[i] %in% names(data)) ) {
+                    # insert dummy for var - will get simulated over later
+                    data[[ vars[i] ]] <- rep( 0 , length(data[[1]]) )
+                }
+            }#i
+        }
+    }
+
+    if ( n==0 ) {
+        n <- stan_total_samples(fit@stanfit)
+    } else {
+        tot_samples <- stan_total_samples(fit@stanfit)
+        n <- min(n,tot_samples)
+    }
+
+    if ( missing(post) ) {
+        post <- extract.samples(fit,n=n)
+    } else {
+        n <- dim(post[[1]])[1]
+        if ( is.null(n) ) n <- length(post[[1]])
+    }
+
+    # variables to sim
+    if ( missing(vars) ) {
+        # extract likelihood, assuming it is first element of formula
+        lik <- flist_untag(fit@formula)[[1]]
+        # discover outcome
+        vars <- as.character(lik[[2]])
+    } else {
+        # vars listed
+        if ( debug==TRUE ) print(vars)
+    }
+
+    # loop over vars
+    sim_vars <- sim_core( fit=fit , data=data , post=post , vars=vars , n=n , refresh=refresh , replace=replace , debug=debug , ll=ll , ... )
+
+    # result
+    if ( length(sim_vars)==1 ) sim_vars <- sim_vars[[1]]
+    return( sim_vars )
+
+}
+
+setMethod( "sim" , "ulam" , function(fit,data,...) sim_ulam_new(fit,data,...) )