Update documentation of lsmeans

insightsengineering · Mar 20, 2024 · aa939cd · aa939cd
1 parent edbf35f
commit aa939cd
Show file tree

Hide file tree

Showing 7 changed files with 140 additions and 69 deletions.
diff --git a/DESCRIPTION b/DESCRIPTION
@@ -18,7 +18,7 @@ LazyData: true
 Roxygen: list(markdown = TRUE)
 URL: https://insightsengineering.github.io/rbmi/, https://github.com/insightsengineering/rbmi
 BugReports: https://github.com/insightsengineering/rbmi/issues
-RoxygenNote: 7.2.3
+RoxygenNote: 7.3.1
 Suggests: 
     dplyr,
     tidyr,

diff --git a/NAMESPACE b/NAMESPACE
@@ -1,6 +1,9 @@
 # Generated by roxygen2: do not edit by hand
 
 S3method(as.data.frame,pool)
+S3method(collapse_values,default)
+S3method(collapse_values,factor)
+S3method(collapse_values,numeric)
 S3method(draws,approxbayes)
 S3method(draws,bayes)
 S3method(draws,bmlmi)
@@ -19,6 +22,7 @@ S3method(print,imputation_list_df)
 S3method(print,imputation_list_single)
 S3method(print,imputation_single)
 S3method(print,pool)
+S3method(repr,numeric)
 S3method(validate,analysis)
 S3method(validate,bmlmi)
 S3method(validate,bootstrap)

diff --git a/R/dataclasses.R b/R/dataclasses.R
@@ -285,6 +285,7 @@ repr <- function(x, ...) {
     UseMethod("repr")
 }
 
+#' @export
 repr.numeric <- function(x, ...) {
     paste0("c(", paste0(round(x, 2), collapse = ", "), ")")
 }
diff --git a/R/lsmeans.R b/R/lsmeans.R
@@ -2,30 +2,34 @@
 #' Least Square Means
 #'
 #'
-#' Estimates the least square means from a linear model. This is done by
-#' generating a prediction from the model using an hypothetical observation
-#' that is constructed by averaging the data. See details for more information.
+#' Estimates the least square means from a linear model. The exact implementation
+#' / interpretation depends on the weighting scheme; see details for more information.
 #'
 #' @details
 #'
-#' The lsmeans are obtained by calculating hypothetical patients
-#' and predicting their expected values. These hypothetical patients
-#' are constructed by expanding out all possible combinations of each
-#' categorical covariate and by setting any numerical covariates equal
-#' to the mean.
+#' For `.weights = "proportional"` (the default) the lsmeans are obtained by
+#' taking the average of the fitted values for each patient.
+#' In terms of the interactions between two continuous covariates 
+#' this is equivalent to constructing a hypothetical patient whose
+#' interaction term is `mean(X * Y)`. This is opposed to the
+#' `emmeans` package which calculates the interaction term
+#' of the hypothetical patient as `mean(X) * mean(Y)`. The approach
+#' outlined here is equivalent to standardization or g-computation.
+#' 
+#' For `.weights = "equal"` the lsmeans are obtained by taking the model fitted
+#' value of a hypothetical patient whose covariates are defined as follows:
+#' - Continuous covariates are set to `mean(X)`
+#' - Dummy categorical variables are set to `1/N` where `N` is the number of levels
+#' - Continuous * continuous interactions are set to `mean(X) * mean(Y)`
+#' - Continuous * categorical interactions are set to `mean(X) * 1/N`
 #'
-#' A final lsmean value is calculated by averaging these hypothetical
-#' patients. If `.weights` equals `"proportional"` then the values are weighted
-#' by the frequency in which they occur in the full dataset. If `.weights`
-#' equals `"equal"` then each hypothetical patient is given an equal weight
-#' regardless of what actually occurs in the dataset.
+#' Regardless of the weighting scheme any named arguments passed via `...` will
+#' fix the value of the covariate to the specified value.
+#' For example, `lsmeans(model, trt = "A")` will fix the dummy variable `trtA` to 1
+#' for all patients (real or hypothetical) when calculating the lsmeans.
 #'
-#' Use the `...` argument to fix specific variables to specific values.
-#'
-#' See the references for identical implementations as done in SAS and
-#' in R via the `emmeans` package. This function attempts to re-implement the
-#' `emmeans` derivation for standard linear models but without having to include
-#' all of it's dependencies.
+#' See the references for similar implementations as done in SAS and
+#' in R via the `emmeans` package.
 #'
 #' @param model A model created by `lm`.
 #' @param ... Fixes specific variables to specific values i.e.
@@ -67,7 +71,7 @@ lsmeans <- function(model, ..., .weights = c("proportional", "equal")) {
         proportional = ls_design_proportional
     )
 
-    design <- design_fun(data, frm, covars, fix)
+    design <- design_fun(data, frm, fix)
     beta <- matrix(coef(model), nrow = 1)
     list(
         est = as.vector(beta %*% design),
@@ -88,41 +92,61 @@ lsmeans <- function(model, ..., .weights = c("proportional", "equal")) {
 #'
 #' @param data A data.frame
 #' @param frm Formula used to fit the original model
-#' @param covars a character vector of variables names that exist in
-#' `data` which should be extracted (`ls_design_equal` only)
 #' @param fix A named list of variables with fixed values
 #' @name ls_design
-ls_design_equal <- function(data, frm, covars, fix) {
-    collection <- list()
-    for (var in covars) {
-        values <- data[[var]]
-        if (is.factor(values)) {
-            lvls <- levels(values)
-            if (var %in% names(fix)) {
-                collection[[var]] <- factor(fix[[var]], levels = lvls)
-            } else {
-                collection[[var]] <- factor(lvls, levels = lvls)
-            }
-        } else if (is.numeric(values)) {
-            if (var %in% names(fix)) {
-                collection[[var]] <- fix[[var]]
-            } else {
-                collection[[var]] <- mean(values)
-            }
-        } else {
-            stop("invalid data type")
+ls_design_equal <- function(data, frm, fix) {
+    assert_that(
+        inherits(frm, "formula"),
+        is.data.frame(data),
+        is.list(fix),
+        all(names(fix) %in% colnames(data))
+    )
+    frm2 <- update(frm, NULL ~ .)
+    data2 <- model.frame(frm2, data)
+    collection <- lapply(as.list(data2), collapse_values)
+
+    for (var in names(fix)) {
+        if (is.numeric(data2[[var]])) {
+            collection[[var]] <- fix[[var]]
+        } else if (is.factor(data2[[var]])) {
+            collection[[var]] <- factor(fix[[var]], levels = levels(data2[[var]]))
         }
     }
-    design <- matrix(
-        apply(model.matrix(frm, expand.grid(collection)), 2, mean),
+
+    all_combinations <- expand.grid(collection)
+
+    design_matrix <- model.matrix(frm2, all_combinations)
+
+    result <- matrix(
+        apply(design_matrix, 2, mean),
         ncol = 1
     )
-    return(design)
+    return(result)
+}
+
+
+collapse_values <- function(x) {
+    UseMethod("collapse_values")
+}
+
+#' @export
+collapse_values.factor <- function(x) {
+    return(factor(levels(x), levels(x)))
+}
+
+#' @export
+collapse_values.numeric <- function(x) {
+    return(mean(x))
+}
+
+#' @export
+collapse_values.default <- function(x) {
+    stop("invalid data type")
 }
 
 
 #' @rdname ls_design
-ls_design_proportional <- function(data, frm, covars, fix) {
+ls_design_proportional <- function(data, frm, fix) {
     data2 <- data
     for (var in names(fix)) {
         value <- data[[var]]

diff --git a/man/ls_design.Rd b/man/ls_design.Rd
diff --git a/man/lsmeans.Rd b/man/lsmeans.Rd
diff --git a/tests/testthat/test-lsmeans.R b/tests/testthat/test-lsmeans.R
@@ -216,6 +216,44 @@ test_that("LSmeans (proportional) from rbmi equals means of average prediction",
 })
 
 
+test_that("LSmeans(proportional) returns equivalent results to 'counterfactual'", {
+    set.seed(2412)
+    n <- 10000
+    # Main goal here is to provide a more involved test of interactions between
+    # cont * cont, cont * cont * cont model terms
+    dat <- tibble(
+        v1 = rnorm(n),
+        v2 = rnorm(n),
+        v3 = rnorm(n),
+        c1 = sample(c("A", "B"), size = n, replace = TRUE, prob = c(0.8, 0.2)),
+        c2 = sample(c("X", "Y"), size = n, replace = TRUE, prob = c(0.6, 0.4)),
+        error = rnorm(n, 0, 4),
+        outcome = 30 +
+            5 * v1 +
+            3 * v2 +
+            2 * v3 +
+            8 * v1 * v2 +
+            9 * v1 * v3 +
+            10 * v2 * v3 +
+            12 * v1 * v2 * v3 +
+            4 * (c1 == "B") +
+            6 * (c2 == "Y") +
+            7 * (c1 == "B" & c2 == "Y") +
+            13 * (c1 == "B") * v1 +
+            error
+    )
+    mod <- lm(outcome ~ v1 * v2 * v3 + c1 * c2 + v1 * c1, data = dat)
+
+    expect_equal(
+        lsmeans(mod, c1 = "A", .weights = "proportional")$est,
+        mean(predict(mod, newdata = dat |> mutate(c1 = "A")))
+    )
+
+    expect_equal(
+        lsm2 <- lsmeans(mod, c1 = "B", c2 = "Y", .weights = "proportional")$est,
+        mean(predict(mod, newdata = dat |> mutate(c1 = "B", c2 = "Y")))
+    )
+})
 
 
 #### Toy code to experiment with
@@ -263,3 +301,4 @@ test_that("LSmeans (proportional) from rbmi equals means of average prediction",
 
 
 
+