Skip to content

Commit

Permalink
Merge remote-tracking branch 'origin/rel-1724247361'
Browse files Browse the repository at this point in the history
  • Loading branch information
Crunch.io Jenkins Account committed Aug 21, 2024
2 parents 1ae4568 + 0e7b92f commit 9489964
Show file tree
Hide file tree
Showing 57 changed files with 4,783 additions and 1,622 deletions.
2 changes: 1 addition & 1 deletion DESCRIPTION
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ Suggests:
VignetteBuilder: knitr
Language: en-US
Encoding: UTF-8
RoxygenNote: 7.2.3
RoxygenNote: 7.3.2
Roxygen: list(markdown = TRUE)
LazyData: true
Collate:
Expand Down
180 changes: 138 additions & 42 deletions R/as-data-frame.R
Original file line number Diff line number Diff line change
Expand Up @@ -29,9 +29,9 @@
#' levels matching the Crunch Variable's categories (the default), or, if
#' `categorical.mode` is specified as "id" or "numeric", a numeric vector of
#' category ids or numeric values, respectively
#' * Array variables (Categorical Array, Multiple Response) are decomposed into
#' their constituent categorical subvariables. An array with three subvariables,
#' for example, will result in three columns in the `data.frame`.
#' * Array variables (Categorical Array, Multiple Response) can be decomposed into
#' their constituent categorical subvariables or put in 'packed' data.frame columns,
#' see the `array_strategy` argument.
#'
#' Column names in the `data.frame` are the variable/subvariable aliases.
#'
Expand All @@ -46,6 +46,14 @@
#' @param categorical.mode what mode should categoricals be pulled as? One of
#' factor, numeric, id (default: factor)
#' @param include.hidden logical: should hidden variables be included? (default: `TRUE`)
#' @param array_strategy Strategy to import array variables: "alias" (the default)
#' reads them as flat variables with the subvariable aliases, unless there are duplicate
#' aliases, in which case they are qualified in brackets after the array alias,
#' like "array_alias\[subvar_alias\]". "qualified_alias" always uses the bracket notation.
#' "packed" reads them as what the tidyverse calls "packed" data.frame columns, with the
#' alias from the array variable, and subvariables as the columns of the data.frame.
#' @param verbose Whether to output a message to the console when subvariable aliases
#' are qualified when array_strategy="alias" (defaults to TRUE)
#' @param ... additional arguments passed to `as.data.frame` (default method).
#' @return When called on a `CrunchDataset`, the method returns an object of
#' class `CrunchDataFrame` unless `force = TRUE`, in which case the return is a
Expand All @@ -70,7 +78,7 @@ as.data.frame.CrunchDataset <- function(x,
include.hidden = include.hidden
)
if (force) {
out <- as.data.frame(out)
out <- as.data.frame(out, ...)
}
return(out)
}
Expand All @@ -82,63 +90,151 @@ as.data.frame.CrunchDataFrame <- function(x,
row.names = NULL,
optional = FALSE,
include.hidden = attr(x, "include.hidden"),
array_strategy = c("alias", "qualified_alias", "packed"),
verbose = TRUE,
...) {
array_strategy <- match.arg(array_strategy)
ds <- attr(x, "crunchDataset")
tmp <- tempfile()
on.exit(unlink(tmp))
write.csv(ds, tmp, categorical = "id", include.hidden = include.hidden)
# TODO: use variableMetadata to provide all `colClasses`?
# meta <- variableMetadata(ds)
ds_out <- read.csv(tmp, stringsAsFactors = FALSE, check.names = FALSE)
return(csvToDataFrame(ds_out, x))
write.csv(
ds,
tmp,
categorical = "id",
header_field = "qualified_alias",
missing_values = "",
include.hidden = include.hidden
)

parsing_info <- csvColInfo(ds, verbose = verbose && array_strategy == "alias")

# guessing has been good enough (and distinguishes between Date and POSIXct class for us)
# except for text variables, so continue to guess the parsing info for all columns besides text
col_classes <- setNames(
ifelse(parsing_info$var_type == "text", "character", NA_character_),
parsing_info$qualified_alias
)

ds_out <- read.csv(
tmp,
stringsAsFactors = FALSE,
check.names = FALSE,
colClasses = col_classes,
na.strings = ""
)
dup_csv_names <- duplicated(names(ds_out))
if (any(dup_csv_names)) {
stop(
"csv has duplicate column headers, cannot parse: ",
paste0(unique(names(ds_out)[dup_csv_names]), collapse = ", ")
)
}
return(csvToDataFrame(ds_out, x, parsing_info, array_strategy, categorical.mode = attr(x, "mode")))

Check warning on line 132 in R/as-data-frame.R

View workflow job for this annotation

GitHub Actions / test-coverage

file=R/as-data-frame.R,line=132,col=101,[line_length_linter] Lines should not be more than 100 characters. This line is 103 characters.
}

#' Build csv column-parsing info for a dataset export
#'
#' Returns one row per (non-array-container) exported column, with the raw
#' alias, the parent array alias (NA for top-level variables), the fully
#' qualified "parent[sub]" alias, the conditionally qualified alias (only
#' qualified when the raw alias collides), and the variable type. The
#' unflattened variable metadata is attached as attr "meta" for downstream
#' use by csvToDataFrame.
csvColInfo <- function(ds, verbose = TRUE) {
  # Metadata for exactly the variables in the export, in dataset order
  var_meta <- variableMetadata(ds)[urls(allVariables(ds))]
  flat_meta <- flattenVariableMetadata(var_meta)

  raw_aliases <- aliases(flat_meta)
  parents <- vapply(
    flat_meta,
    function(v) v$parent_alias %||% NA_character_,
    character(1)
  )

  # Subvariables get the bracketed "parent[sub]" form; top-level variables
  # keep their own alias
  has_parent <- !is.na(parents)
  bracketed <- raw_aliases
  bracketed[has_parent] <- paste0(parents[has_parent], "[", raw_aliases[has_parent], "]")

  # Only qualify an alias when it collides with another alias in the dataset
  is_dupe <- raw_aliases %in% raw_aliases[duplicated(raw_aliases)]
  conditional <- raw_aliases
  conditional[is_dupe] <- bracketed[is_dupe]

  info <- data.frame(
    orig_alias = raw_aliases,
    parent_alias = parents,
    qualified_alias = bracketed,
    cond_qualified_alias = conditional,
    var_type = types(flat_meta)
  )
  # Array containers themselves are not csv columns; only their subvariables are
  info <- info[!info$var_type %in% ARRAY_TYPES, ]

  if (verbose) {
    renamed <- info$cond_qualified_alias != info$orig_alias
    if (any(renamed)) {
      mapping <- paste0(info$orig_alias[renamed], " -> ", info$cond_qualified_alias[renamed])
      message(
        "Some column names are qualified because there were duplicate aliases ",
        "in dataset:\n", paste0(mapping, collapse = ", ")
      )
    }
  }

  # Callers need the unflattened metadata too (e.g. for category lookups)
  attr(info, "meta") <- var_meta
  info
}

csvToDataFrame <- function(csv_df, crdf) {
ds <- attr(crdf, "crunchDataset")
mode <- attr(crdf, "mode")
## Use `variableMetadata` to avoid a GET on each variable entity for
## categories and subvariables
## Subset variableMetadata on the urls of the variables in the ds in case
## `ds` has only a subset of variables
ds@variables <- variableMetadata(ds)[urls(allVariables(ds))]
csvToDataFrame <- function(csv_df,
cr_data,
parsing_info,
array_strategy = c("alias", "qualified_alias", "packed"),
categorical.mode = "factor") {
array_strategy <- match.arg(array_strategy)
meta <- attr(parsing_info, "meta")
## CrunchDataFrames contain both server variables and local variables.
## Iterate over the names of crdf to preserve the desired order.
## Nest individual columns in a list and then unlist all because array
## variables can return multiple columns
out <- unlist(lapply(names(crdf), function(a) {
v <- ds[[a]]
var_order <- if (inherits(cr_data, "CrunchDataFrame")) names(cr_data) else aliases(allVariables(cr_data))

Check warning on line 182 in R/as-data-frame.R

View workflow job for this annotation

GitHub Actions / test-coverage

file=R/as-data-frame.R,line=182,col=101,[line_length_linter] Lines should not be more than 100 characters. This line is 109 characters.
## Iterate over the names of cr_data to preserve the desired order.
## Nest everything in an extra layer of lists because one layer is removed
out <- unlist(lapply(var_order, function(a) {
meta_idx <- match(a, aliases(meta))
v <- if (!is.na(meta_idx)) meta[[meta_idx[1]]] else NULL
if (is.null(v)) {
## Not in the dataset, so it exists only in the CRDF. Get it there.
return(structure(list(crdf[[a]]), .Names = a))
} else if (is.Array(v)) {
return(structure(list(cr_data[[a]]), .Names = a))
} else if (type(v) %in% ARRAY_TYPES) {
## Find the subvar columns in the csv_df and parse them as categorical
if (is.NumericArray(v)) {
cp <- columnParser("numeric")
if (type(v) == "numeric_array") {
cp <- numericCsvParser
} else {
cp <- columnParser("categorical")
}
sub_a <- aliases(subvariables(v))
return(structure(lapply(csv_df[sub_a], cp, v, mode), .Names = sub_a))
} else if (is.Numeric(v)) {
# When data is downloaded using write.csv it includes the name of
# the No Data category instead of a missing value, and this is read
# into R as a character vector. The data needs to be downloaded in
# this form to preserve the missing categories for categorical data.
# We use as.numeric to convert this to numeric and coerce the "No
# Data" elements to NA. So c("1", "No Data", "2.7") becomes c(1, NA,
# 2.7). as.numeric issues a warning when coercion creates NAs, and
# because we expect that, we suppress the warning.
df_vect <- suppressWarnings(as.numeric(csv_df[[a]]))
return(structure(list(df_vect), .Names = a))
subvar_info <- parsing_info[!is.na(parsing_info$parent_alias) & parsing_info$parent_alias == alias(v), ]

Check warning on line 198 in R/as-data-frame.R

View workflow job for this annotation

GitHub Actions / test-coverage

file=R/as-data-frame.R,line=198,col=101,[line_length_linter] Lines should not be more than 100 characters. This line is 116 characters.
cols <- csv_df[, subvar_info$qualified_alias]
if (array_strategy == "alias"){
return(structure(lapply(cols, cp, v, categorical.mode), .Names = subvar_info$cond_qualified_alias))

Check warning on line 201 in R/as-data-frame.R

View workflow job for this annotation

GitHub Actions / test-coverage

file=R/as-data-frame.R,line=201,col=101,[line_length_linter] Lines should not be more than 100 characters. This line is 115 characters.
} else if (array_strategy == "qualified_alias") {
return(structure(lapply(cols, cp, v, categorical.mode), .Names = subvar_info$qualified_alias))

Check warning on line 203 in R/as-data-frame.R

View workflow job for this annotation

GitHub Actions / test-coverage

file=R/as-data-frame.R,line=203,col=101,[line_length_linter] Lines should not be more than 100 characters. This line is 110 characters.
} else { # array_strategy==packed
# Extra list layer to hold the array variable's alias
return(structure(
list(
structure(
lapply(cols, cp, v, categorical.mode),
class = "data.frame",
.Names = subvar_info$orig_alias,
row.names = c(NA, -nrow(csv_df))
)
),
.Names = alias(v)
))
}
} else {
cp <- columnParser(type(v))
return(structure(list(cp(csv_df[[a]], v, mode)), .Names = a))
type <- type(v)
cp <- switch(type, "numeric" = numericCsvParser, "text" = textCsvParser, columnParser(type))

Check warning on line 220 in R/as-data-frame.R

View workflow job for this annotation

GitHub Actions / test-coverage

file=R/as-data-frame.R,line=220,col=101,[line_length_linter] Lines should not be more than 100 characters. This line is 104 characters.
return(structure(list(cp(csv_df[[a]], v, categorical.mode)), .Names = a))
}
}), recursive = FALSE)

## Wrap that list of columns in a data.frame structure
return(structure(out, class = "data.frame", row.names = c(NA, -nrow(ds))))
return(structure(out, class = "data.frame", row.names = c(NA, -nrow(csv_df))))
}

# Because `missing_values = ""` is passed to the csv export, numeric columns
# arrive clean (no category labels such as "No Data" embedded in them), so
# this parser is simply the identity; the extra parser arguments that other
# column parsers accept (variable, mode) are ignored via `...`.
numericCsvParser <- function(col, ...) {
  col
}

# Text read from the csv is already a plain character vector (unlike the JSON
# column parser's input, which can be a list carrying missing reasons), so no
# transformation is needed; extra parser arguments (variable, mode) are
# accepted and ignored via `...`.
textCsvParser <- function(col, ...) {
  col
}


#' as.data.frame method for catalog objects
#'
#' This method gives you a view of a catalog, such as a `VariableCatalog`, as a
Expand Down
1 change: 1 addition & 0 deletions R/variable-metadata.R
Original file line number Diff line number Diff line change
Expand Up @@ -56,6 +56,7 @@ flattenVariableMetadata <- function(vm) {
## Add the parent ref
x$parent <- u
x$parent_alias <- this$alias
x$type <- if (this$type == "numeric_array") "numeric" else "categorical"
return(x)
})
return(out)
Expand Down
2 changes: 2 additions & 0 deletions R/variable-type.R
Original file line number Diff line number Diff line change
Expand Up @@ -65,6 +65,8 @@ is.subvariable <- function(x) {

CASTABLE_TYPES <- c("numeric", "text", "categorical") ## Add datetime when server supports

# Crunch variable types that contain subvariables ("array" containers); these
# are not themselves csv columns and are handled specially when reading a
# dataset as a data.frame (see as-data-frame.R).
ARRAY_TYPES <- c("categorical_array", "multiple_response", "numeric_array")

#' Change Crunch variable types
#'
#' Numeric, text, and categorical variables can be cast to one another by
Expand Down
1 change: 1 addition & 0 deletions R/variable.R
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
setMethod("tuple", "VariableTuple", function(x) x)
setMethod("tuple", "CrunchVariable", function(x) x@tuple)
setMethod("tuple<-", "CrunchVariable", function(x, value) {
x@tuple <- value
Expand Down
111 changes: 111 additions & 0 deletions dev-misc/fixture-creation/dup-dataset.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,111 @@
# Fixture-creation script (run manually by developers; not part of the package
# build). It creates a Crunch dataset whose subvariable aliases collide with
# top-level variable aliases ("dup test"), captures httptest fixtures for it,
# and writes a companion csv fixture used to test alias qualification.
# NOTE(review): requires live Crunch API credentials -- confirm the "team"
# auth profile is configured locally before running.
library(crunch)
library(here)
library(fs)
library(httptest)
library(purrr)

setupCrunchAuth("team")

# Provides response_redactor(), request_redactor(), stabilize_json_files()
source(here("dev-misc/fixture-creation/redactors.R"))

# Make a dataset with duplicate aliases (in subvariables)
ds <- newDataset(data.frame(
  x1 = 1:3,
  x2 = 2:4,
  y1 = factor(letters[1:3], levels = letters[1:5]),
  y2 = factor(letters[2:4], levels = letters[1:5]),
  z = factor(letters[11:13], levels = letters[11:15])
), "dup test")

# Numeric array "x": its subvariable alias "x1" duplicates the top-level "x1"
ds$x <- deriveArray(
  list(
    VarDef(ds$x1, name = "x1", alias = "x1"),
    VarDef(ds$x2, name = "x2_derived", alias = "x2_derived")
  ),
  name = "x",
  numeric = TRUE
)

# Categorical array "y": its subvariable alias "z" duplicates the top-level "z"
ds$y <- deriveArray(
  list(
    VarDef(ds$y1, name = "y1", alias = "y1"),
    VarDef(ds$y2, name = "z", alias = "z")
  ),
  name = "y",
  numeric = FALSE
)

# Move the dataset into the fixtures project, then refresh to pick up its URL
mv(projects()[["Vegetables fixture"]], ds, projects()[["Vegetables fixture"]])
ds <- refresh(ds)
ds_url <- self(ds)

# Capture fixtures ----
set_redactor(response_redactor(ds, "dup"))
set_requester(request_redactor(ds, "dup"))
## Capture dataset fixtures ----
### General dataset capture ----
temp_dir <- tempfile()
httpcache::clearCache()
dir_create(temp_dir)
start_capturing(temp_dir)

ds <- loadDataset(ds_url)
aliases(allVariables(ds))
# Don't actually export because httptest doesn't get it right
# but we do need the export views and metadata
exporters <- crGET(shojiURL(ds, "views", "export"))
var_meta <- variableMetadata(ds)

stop_capturing()

# Scrub volatile fields (user names, timestamps, org palettes) so the captured
# fixtures are stable across runs
stabilize_json_files(
  temp_dir,
  list(
    "app.crunch.io/api/datasets/dup.json",
    list(list("body", "current_editor_name"), "User"),
    list(list("body", "owner_name"), "User"),
    list(list("body", "creation_time"), "2024-01-01T21:25:59.791000"),
    list(list("body", "modification_time"), "2024-01-01T21:26:43.038000"),
    list(list("body", "access_time"), "2024-01-01T21:26:43.038000"),
    list(
      # --- Only keep the palettes from the project folder so changes to crunch org
      # --- don't affect fixtures. Maybe it'd be better to ask for a rcrunch test
      # --- account, but this is okay for now
      list("body", "palette", "analysis"),
      function(x) {
        purrr::keep(x, ~.$name %in% c("Default green palette for fixture", "purple palette for fixture"))
      }
    ),
    list(list("urls", "owner_url"), "https://app.crunch.io/api/projects/pid/")
  )
)

# Replace the previously committed mocks with the fresh capture
dir_delete(here("mocks/app.crunch.io/api/datasets/dup/"))
file_copy(
  path(temp_dir, "app.crunch.io/api/datasets/dup.json"),
  here("mocks/app.crunch.io/api/datasets/dup.json"),
  overwrite = TRUE
)
dir_copy(
  path(temp_dir, "app.crunch.io/api/datasets/dup/"),
  here("mocks/app.crunch.io/api/datasets/dup/"),
  overwrite = TRUE
)


# Companion csv fixture (the commented-out arg documents the intended call
# once the server-side feature ships)
write.csv(
  ds,
  here("mocks", "dataset-fixtures", "dup.csv"),
  categorical = "id",
  include.hidden = TRUE,
  missing_values = ""#,
  # header_field = "qualified_alias" # This will only work after #188045851 ships
)

# Mock what header_field="qualified_alias" will look like after #188045851 ships
lines <- readLines(here("mocks", "dataset-fixtures", "dup.csv"))
lines[1] <- "x1,x2,y1,y2,z,x[x1],x[x2_derived],y[y1],y[z]"
writeLines(lines, here("mocks", "dataset-fixtures", "dup.csv"))


# Clean up the server-side dataset now that fixtures are on disk
with_consent(delete(ds))
5 changes: 4 additions & 1 deletion dev-misc/fixture-creation/redactors.R
Original file line number Diff line number Diff line change
Expand Up @@ -178,6 +178,8 @@ ids_from_ds <- function(ds, desired_ds_id) {


stable_var_alias_order <- function(ds) {
if (name(ds) != "Vegetables example") return(aliases(allVariables(ds)))

saved_order_path <- here::here("dev-misc/fixture-creation/var_order.csv")
saved_var_order <- suppressWarnings(try(read.csv(saved_order_path, stringsAsFactors = FALSE)[[1]], silent = TRUE))
if (inherits(saved_var_order, "try-error")) {
Expand All @@ -203,6 +205,7 @@ ids_from_folders <- function(ds) {
)

out <- unlist(out)
if (length(out) == 0) return()
setNames(out, sprintf("vdir_%02d", seq_along(out)))
}

Expand All @@ -214,7 +217,7 @@ ids_below <- function(folder) {
}

ids_from_decks <- function(ds) {
if (length(decks(ds)) == 0) return
if (length(decks(ds)) == 0) return()
deck_ids <- lapply(seq_along(decks(ds)), function(deck_num) {
deck <- refresh(decks(ds)[[deck_num]])
slide_ids <- lapply(seq_along(refresh(slides(deck))), function(slide_num) {
Expand Down
Loading

0 comments on commit 9489964

Please sign in to comment.