From cbf145f54aeff9b957a56d2d85c02a9879f89a06 Mon Sep 17 00:00:00 2001
From: jgilber2 <jgilber2@its.jnj.com>
Date: Wed, 14 Aug 2024 12:41:13 -0700
Subject: [PATCH] New branch for testing patch

---
 DESCRIPTION                              |   2 +-
 R/RunDiagnostics.R                       | 104 ++++++++++++-----------
 tests/testthat/test-1-ResultsDataModel.R |   4 +-
 tests/testthat/test-2-againstCdm.R       |   6 +-
 4 files changed, 61 insertions(+), 55 deletions(-)

diff --git a/DESCRIPTION b/DESCRIPTION
index 52d02780e..e13a4810d 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -38,7 +38,7 @@ Imports:
   SqlRender (>= 1.9.0),
   stringr,
   tidyr (>= 1.2.0),
-  CohortGenerator (>= 0.8.0),
+  CohortGenerator (>= 0.10.0),
   remotes,
   scales
 Suggests:
diff --git a/R/RunDiagnostics.R b/R/RunDiagnostics.R
index 0f0f8cab8..d93c02ba7 100644
--- a/R/RunDiagnostics.R
+++ b/R/RunDiagnostics.R
@@ -136,11 +136,11 @@ getDefaultCovariateSettings <- function() {
 #' @param incremental                 Create only cohort diagnostics that haven't been created before?
 #' @param incrementalFolder           If \code{incremental = TRUE}, specify a folder where records are kept
 #'                                    of which cohort diagnostics has been executed.
-#' @param runOnSample                 Logical. If TRUE, the function will operate on a sample of the data.
+#' @param runFeatureExtractionOnSample Logical. If TRUE, only the feature extraction (cohort characterization) step will operate on a sample of the data.
 #'                                    Default is FALSE, meaning the function will operate on the full data set.
 #'
-#' @param sampleN                     Integer. The number of records to include in the sample if runOnSample is TRUE.
-#'                                    Default is 1000. Ignored if runOnSample is FALSE.
+#' @param sampleN                     Integer. The number of records to include in the sample if runFeatureExtractionOnSample is TRUE.
+#'                                    Default is 1000. Ignored if runFeatureExtractionOnSample is FALSE.
 #'
 #' @param seed                        Integer. The seed for the random number generator used to create the sample.
 #'                                    This ensures that the same sample can be drawn again in future runs. Default is 64374.
@@ -228,7 +228,7 @@ executeDiagnostics <- function(cohortDefinitionSet,
                                irWashoutPeriod = 0,
                                incremental = FALSE,
                                incrementalFolder = file.path(exportFolder, "incremental"),
-                               runOnSample = FALSE,
+                               runFeatureExtractionOnSample = FALSE,
                                sampleN = 1000,
                                seed = 64374,
                                seedArgs = NULL,
@@ -250,7 +250,7 @@ executeDiagnostics <- function(cohortDefinitionSet,
       incremental = callingArgs$incremental,
       temporalCovariateSettings = callingArgs$temporalCovariateSettings
     ) %>%
-    RJSONIO::toJSON(digits = 23, pretty = TRUE)
+      RJSONIO::toJSON(digits = 23, pretty = TRUE)
 
   exportFolder <- normalizePath(exportFolder, mustWork = FALSE)
   incrementalFolder <- normalizePath(incrementalFolder, mustWork = FALSE)
@@ -279,25 +279,25 @@ executeDiagnostics <- function(cohortDefinitionSet,
   errorMessage <- checkmate::makeAssertCollection()
   checkmate::assertList(cohortTableNames, null.ok = FALSE, types = "character", add = errorMessage, names = "named")
   checkmate::assertNames(names(cohortTableNames),
-    must.include = c(
-      "cohortTable",
-      "cohortInclusionTable",
-      "cohortInclusionResultTable",
-      "cohortInclusionStatsTable",
-      "cohortSummaryStatsTable",
-      "cohortCensorStatsTable"
-    ),
-    add = errorMessage
+                         must.include = c(
+                           "cohortTable",
+                           "cohortInclusionTable",
+                           "cohortInclusionResultTable",
+                           "cohortInclusionStatsTable",
+                           "cohortSummaryStatsTable",
+                           "cohortCensorStatsTable"
+                         ),
+                         add = errorMessage
   )
   checkmate::assertDataFrame(cohortDefinitionSet, add = errorMessage)
   checkmate::assertNames(names(cohortDefinitionSet),
-    must.include = c(
-      "json",
-      "cohortId",
-      "cohortName",
-      "sql"
-    ),
-    add = errorMessage
+                         must.include = c(
+                           "json",
+                           "cohortId",
+                           "cohortName",
+                           "sql"
+                         ),
+                         add = errorMessage
   )
 
   cohortTable <- cohortTableNames$cohortTable
@@ -474,17 +474,17 @@ executeDiagnostics <- function(cohortDefinitionSet,
     sort()
   cohortTableColumnNamesExpected <-
     getResultsDataModelSpecifications() %>%
-    dplyr::filter(.data$tableName == "cohort") %>%
-    dplyr::pull(.data$columnName) %>%
-    SqlRender::snakeCaseToCamelCase() %>%
-    sort()
+      dplyr::filter(.data$tableName == "cohort") %>%
+      dplyr::pull(.data$columnName) %>%
+      SqlRender::snakeCaseToCamelCase() %>%
+      sort()
   cohortTableColumnNamesRequired <-
     getResultsDataModelSpecifications() %>%
-    dplyr::filter(.data$tableName == "cohort") %>%
-    dplyr::filter(.data$isRequired == "Yes") %>%
-    dplyr::pull(.data$columnName) %>%
-    SqlRender::snakeCaseToCamelCase() %>%
-    sort()
+      dplyr::filter(.data$tableName == "cohort") %>%
+      dplyr::filter(.data$isRequired == "Yes") %>%
+      dplyr::pull(.data$columnName) %>%
+      SqlRender::snakeCaseToCamelCase() %>%
+      sort()
 
   expectedButNotObsevered <-
     setdiff(x = cohortTableColumnNamesExpected, y = cohortTableColumnNamesObserved)
@@ -549,23 +549,6 @@ executeDiagnostics <- function(cohortDefinitionSet,
     }
   }
 
-  if (runOnSample & !isTRUE(attr(cohortDefinitionSet, "isSampledCohortDefinition"))) {
-    cohortDefinitionSet <-
-      CohortGenerator::sampleCohortDefinitionSet(
-        connection = connection,
-        cohortDefinitionSet = cohortDefinitionSet,
-        tempEmulationSchema = tempEmulationSchema,
-        cohortDatabaseSchema = cohortDatabaseSchema,
-        cohortTableNames = cohortTableNames,
-        n = sampleN,
-        seed = seed,
-        seedArgs = seedArgs,
-        identifierExpression = sampleIdentifierExpression,
-        incremental = incremental,
-        incrementalFolder = incrementalFolder
-      )
-  }
-
   ## CDM source information----
   timeExecution(
     exportFolder,
@@ -871,17 +854,40 @@ executeDiagnostics <- function(cohortDefinitionSet,
       cohortIds,
       parent = "executeDiagnostics",
       expr = {
+
+        feCohortDefinitionSet <- cohortDefinitionSet
+        feCohortTable <- cohortTable
+
+        if (runFeatureExtractionOnSample & !isTRUE(attr(cohortDefinitionSet, "isSampledCohortDefinition"))) {
+          feCohortTable <- cohortTableNames$cohortSampleTable
+          feCohortDefinitionSet <-
+            CohortGenerator::sampleCohortDefinitionSet(
+              connection = connection,
+              cohortDefinitionSet = cohortDefinitionSet,
+              tempEmulationSchema = tempEmulationSchema,
+              cohortDatabaseSchema = cohortDatabaseSchema,
+              cohortTableNames = cohortTableNames,
+              n = sampleN,
+              seed = seed,
+              seedArgs = seedArgs,
+              identifierExpression = sampleIdentifierExpression,
+              incremental = incremental,
+              incrementalFolder = incrementalFolder
+            )
+        }
+
+
         executeCohortCharacterization(
           connection = connection,
           databaseId = databaseId,
           exportFolder = exportFolder,
           cdmDatabaseSchema = cdmDatabaseSchema,
           cohortDatabaseSchema = cohortDatabaseSchema,
-          cohortTable = cohortTable,
+          cohortTable = feCohortTable,
           covariateSettings = temporalCovariateSettings,
           tempEmulationSchema = tempEmulationSchema,
           cdmVersion = cdmVersion,
-          cohorts = cohortDefinitionSet,
+          cohorts = feCohortDefinitionSet,
           cohortCounts = cohortCounts,
           minCellCount = minCellCount,
           instantiatedCohorts = instantiatedCohorts,
diff --git a/tests/testthat/test-1-ResultsDataModel.R b/tests/testthat/test-1-ResultsDataModel.R
index 07d71a30e..9c34d976c 100644
--- a/tests/testthat/test-1-ResultsDataModel.R
+++ b/tests/testthat/test-1-ResultsDataModel.R
@@ -123,7 +123,7 @@ VALUES ('Synthea','Synthea','OHDSI Community','SyntheaTM is a Synthetic Patient
           incremental = TRUE,
           incrementalFolder = file.path(folder, "incremental"),
           temporalCovariateSettings = temporalCovariateSettings,
-          runOnSample = TRUE
+          runFeatureExtractionOnSample = TRUE
         )
       },
       "CDM Source table has more than one record while only one is expected."
@@ -149,7 +149,7 @@ VALUES ('Synthea','Synthea','OHDSI Community','SyntheaTM is a Synthetic Patient
       incremental = TRUE,
       incrementalFolder = file.path(folder, "incremental"),
       temporalCovariateSettings = temporalCovariateSettings,
-      runOnSample = TRUE
+      runFeatureExtractionOnSample = TRUE
     )
   }
 
diff --git a/tests/testthat/test-2-againstCdm.R b/tests/testthat/test-2-againstCdm.R
index fa2ffeb15..ccac99aac 100644
--- a/tests/testthat/test-2-againstCdm.R
+++ b/tests/testthat/test-2-againstCdm.R
@@ -45,7 +45,7 @@ test_that("Cohort diagnostics in incremental mode", {
       incremental = TRUE,
       incrementalFolder = file.path(folder, "incremental"),
       temporalCovariateSettings = temporalCovariateSettings,
-      runOnSample = TRUE
+      runFeatureExtractionOnSample = TRUE
     )
   )
 
@@ -76,7 +76,7 @@ test_that("Cohort diagnostics in incremental mode", {
       incremental = TRUE,
       incrementalFolder = file.path(folder, "incremental"),
       temporalCovariateSettings = temporalCovariateSettings,
-      runOnSample = TRUE
+      runFeatureExtractionOnSample = TRUE
     )
   )
   # generate sqlite file
@@ -123,7 +123,7 @@ test_that("Cohort diagnostics in incremental mode", {
         incremental = FALSE,
         incrementalFolder = file.path(folder, "incremental"),
         temporalCovariateSettings = temporalCovariateSettings,
-        runOnSample = TRUE
+        runFeatureExtractionOnSample = TRUE
       )
     })