fix: Use better low-frequency replacement values (#115)

Use `9.99` as the replacement value for both the public and internal versions, setting the threshold at `10` for both. Also use better rounding for nicer display of results.
SAFEHR-data · Dec 4, 2024 · c0a2d8c · c0a2d8c
1 parent a1e6e0d
commit c0a2d8c
Show file tree

Hide file tree

Showing 10 changed files with 19 additions and 12 deletions.
diff --git a/.env.sample b/.env.sample
@@ -13,8 +13,8 @@ PREPROCESS_DB_CDM_SCHEMA=           # Schema name in the database to connect the
 PREPROCESS_SUMMARISE_LEVEL=monthly  # Level to summarise record counts at (monthly or quarterly)
 
 # Low-frequency replacement
-LOW_FREQUENCY_THRESHOLD=5
-LOW_FREQUENCY_REPLACEMENT=2.5
+LOW_FREQUENCY_THRESHOLD=10
+LOW_FREQUENCY_REPLACEMENT=9.999999
 
 # For testing
 TEST_DB_PATH=./data-raw/test_db/eunomia
diff --git a/app/R/mod_datatable.R b/app/R/mod_datatable.R
@@ -64,12 +64,21 @@ mod_datatable_server <- function(id, selected_dates, bundle_concepts) {
 
   moduleServer(id, function(input, output, session) {
     concepts_with_counts <- reactive({
+      low_freq_threshold <- as.numeric(Sys.getenv("LOW_FREQUENCY_THRESHOLD"))
+
       join_counts_to_concepts(all_concepts, monthly_counts, selected_dates()) |>
         # Reorder and select the columns we want to display
         dplyr::select(
           "concept_id", "concept_name",
           "total_records", "mean_persons",
           "domain_id", "vocabulary_id", "concept_class_id"
+        ) |>
+        # Conditionally round numbers for better display
+        dplyr::mutate(
+          dplyr::across(
+            dplyr::where(is.double),
+            function(x) ifelse(x > low_freq_threshold, round(x), round(x, 2))
+          )
         )
     })
     output$datatable <- DT::renderDT(concepts_with_counts(),
@@ -78,8 +87,8 @@ mod_datatable_server <- function(id, selected_dates, bundle_concepts) {
       colnames = c(
         "ID" = "concept_id",
         "Name" = "concept_name",
-        "Records" = "total_records",
-        "Patients" = "mean_persons",
+        "Total Records" = "total_records",
+        "Average Patients" = "mean_persons",
         "Domain ID" = "domain_id",
         "Vocabulary ID" = "vocabulary_id",
         "Concept Class ID" = "concept_class_id"
@@ -111,10 +120,10 @@ join_counts_to_concepts <- function(concepts, monthly_counts, selected_dates) {
     filter_dates(selected_dates) |>
     dplyr::group_by(.data$concept_id) |>
     dplyr::summarise(
-      # round to avoid decimal values in in total_records because of low-req replacement
-      total_records = sum(round(.data$record_count)),
-      mean_persons = round(mean(.data$person_count, na.rm = TRUE), 2),
-      mean_records_per_person = round(mean(.data$records_per_person, na.rm = TRUE), 2)
+      total_records = sum(.data$record_count),
+      # Note that we can only calculate the average number of persons per month here
+      # as we cannot identify unique patients across months
+      mean_persons = mean(.data$person_count, na.rm = TRUE),
     )
   # Use inner_join so we only keep concepts for which we have counts in the selected dates
   dplyr::inner_join(concepts, summarised_counts, by = "concept_id")

diff --git a/app/tests/testthat/test-mod_datatable.R b/app/tests/testthat/test-mod_datatable.R
@@ -66,13 +66,12 @@ test_that("Adding records and patients counts to concepts table works", {
   concepts_with_counts <- join_counts_to_concepts(mock_selection_data, mock_monthly_counts, selected_dates)
 
   expect_in(
-    c("concept_id", "concept_name", "total_records", "mean_persons", "mean_records_per_person"),
+    c("concept_id", "concept_name", "total_records", "mean_persons"),
     names(concepts_with_counts)
   )
   expect_equal(nrow(concepts_with_counts), 3)
   expect_equal(concepts_with_counts$total_records, c(100, 200, 300))
   expect_equal(concepts_with_counts$mean_persons, c(10, 10, 10))
-  expect_equal(concepts_with_counts$mean_records_per_person, c(10, 10, 10))
 })
 
 test_that("Added counts depends on selected dates", {
@@ -81,7 +80,6 @@ test_that("Added counts depends on selected dates", {
 
   expect_equal(concepts_with_counts$total_records, c(100, 100, 100))
   expect_equal(concepts_with_counts$mean_persons, c(10, 10, 10))
-  expect_equal(concepts_with_counts$mean_records_per_person, c(10, 10, 10))
 })
 
 test_that("Only concepts with data for the selected date range are kept", {

diff --git a/data/test_data/internal/omopcat_concepts.parquet b/data/test_data/internal/omopcat_concepts.parquet
diff --git a/data/test_data/internal/omopcat_monthly_counts.parquet b/data/test_data/internal/omopcat_monthly_counts.parquet
diff --git a/data/test_data/internal/omopcat_summary_stats.parquet b/data/test_data/internal/omopcat_summary_stats.parquet
diff --git a/data/test_data/public/omopcat_concepts.parquet b/data/test_data/public/omopcat_concepts.parquet
diff --git a/data/test_data/public/omopcat_monthly_counts.parquet b/data/test_data/public/omopcat_monthly_counts.parquet
diff --git a/data/test_data/public/omopcat_summary_stats.parquet b/data/test_data/public/omopcat_summary_stats.parquet
diff --git a/public.env.sample b/public.env.sample
@@ -14,7 +14,7 @@ PREPROCESS_SUMMARISE_LEVEL=quarterly  # Level to summarise record counts at (mon
 
 # Low-frequency replacement
 LOW_FREQUENCY_THRESHOLD=10
-LOW_FREQUENCY_REPLACEMENT=5
+LOW_FREQUENCY_REPLACEMENT=9.99
 
 # For testing
 TEST_DB_PATH=./data-raw/test_db/eunomia