-
Notifications
You must be signed in to change notification settings - Fork 0
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
- Loading branch information
0 parents
commit 9631edd
Showing
11 changed files
with
761 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,4 @@ | ||
.Rproj.user | ||
data-raw | ||
.DS_Store | ||
.Rhistory |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,13 @@ | ||
Version: 1.0 | ||
|
||
RestoreWorkspace: Default | ||
SaveWorkspace: Default | ||
AlwaysSaveHistory: Default | ||
|
||
EnableCodeIndexing: Yes | ||
UseSpacesForTab: Yes | ||
NumSpacesForTab: 2 | ||
Encoding: UTF-8 | ||
|
||
RnwWeave: Sweave | ||
LaTeX: pdfLaTeX |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,126 @@ | ||
|
||
|
||
library(tidyverse) | ||
library(readxl) | ||
library(qxl) | ||
|
||
# Rewrite variable references inside an expression string (e.g. the `relevant`
# column of an ODK/Kobo dictionary), replacing each old variable name that
# appears between curly braces with its new name.
#
# x    : character vector of expressions containing references such as ${var}
# swap : named character vector; values are the old variable names to look
#        for, names are the new names to substitute in
#
# Returns `x` with every `{old}` occurrence rewritten to `{new}`. Elements of
# `x` that are NA stay NA, and an empty `swap` returns `x` unchanged.
update_relevant <- function(x, swap) {
  for (j in seq_along(swap)) {
    x <- gsub(
      # Lookbehind/lookahead restrict the match to a name directly enclosed in
      # braces. \Q...\E makes PCRE treat the old name literally, so that regex
      # metacharacters in a name (e.g. "." in "a.b") cannot match a different
      # variable such as "axb".
      paste0("(?<=\\{)\\Q", as.character(swap[j]), "\\E(?=\\})"),
      names(swap[j]),
      x,
      perl = TRUE
    )
  }

  x
}
|
||
|
||
## recreate mortality survey data and kobo dict, but with shorter variable names | ||
dict_survey <- readxl::read_xls("data-raw/KoboMortalitySurvey.xls", sheet = 1) | ||
dict_choices <- readxl::read_xls("data-raw/KoboMortalitySurvey.xls", sheet = 2) | ||
dict_settings <- readxl::read_xls("data-raw/KoboMortalitySurvey.xls", sheet = 3) | ||
|
||
# Lookup vector used for renaming: values are the original (long) variable | ||
# names, names are the corresponding short names; rows of the survey sheet | ||
# with a missing `name` are dropped from both | ||
names_shorten <- setNames( | ||
dict_survey$name[!is.na(dict_survey$name)], | ||
dict_survey$name_short[!is.na(dict_survey$name)] | ||
) | ||
|
||
# Replace the long `name` column with `name_short`, and rewrite the {variable} | ||
# references inside each `relevant` expression so they use the short names too | ||
dict_survey_out <- dict_survey %>% | ||
select(-name) %>% | ||
rename(name = name_short) %>% | ||
mutate(relevant = map_chr(relevant, update_relevant, swap = names_shorten)) | ||
|
||
dat_hh <- readxl::read_xlsx("data-raw/MortalitySurveyData.xlsx", sheet = 1) | ||
dat_mb <- readxl::read_xlsx("data-raw/MortalitySurveyData.xlsx", sheet = 2) | ||
|
||
# Rename data columns from their long to their short names (new = old pairs | ||
# taken from `names_shorten`); any_of() is used so that names not present in a | ||
# given sheet are skipped rather than raising an error | ||
dat_hh_short <- dat_hh %>% rename(!!!any_of(names_shorten)) | ||
dat_mb_short <- dat_mb %>% rename(!!!any_of(names_shorten)) | ||
|
||
qxl::qxl( | ||
list(`Mortality Survey` = dat_hh_short, hh_member = dat_mb_short), | ||
file = "data/mortality_survey_data.xlsx" | ||
) | ||
|
||
qxl::qxl( | ||
list(survey = dict_survey_out, choices = dict_choices, settings = dict_settings), | ||
file = "data/mortality_survey_kobo.xlsx" | ||
) | ||
|
||
rm(dict_survey, dict_choices, dat_hh, dat_mb) | ||
|
||
|
||
|
||
|
||
## create a simpler version of mortality survey dataset by merging a few cols of | ||
# household-level data with member-level data | ||
|
||
dict_survey <- readxl::read_xlsx("data/mortality_survey_kobo.xlsx", sheet = 1) | ||
dict_choices <- readxl::read_xlsx("data/mortality_survey_kobo.xlsx", sheet = 2) | ||
dict_settings <- readxl::read_xlsx("data/mortality_survey_kobo.xlsx", sheet = 3) | ||
|
||
dat_hh <- readxl::read_xlsx("data/mortality_survey_data.xlsx", sheet = 1) | ||
dat_mb <- readxl::read_xlsx("data/mortality_survey_data.xlsx", sheet = 2) | ||
|
||
# One row per household member: keep the member id, the link to the parent | ||
# household, and the member-level variables, then join the household sheet | ||
# (member `_parent_index` = household `_index`) and keep a subset of columns | ||
dat_simple_prep <- dat_mb %>% | ||
select( | ||
id = `_index`, | ||
`_parent_index`, | ||
sex:cause_death_other | ||
) %>% | ||
left_join(dat_hh, by = c("_parent_index" = "_index")) %>% | ||
select( | ||
id, | ||
date, | ||
location, | ||
cluster, | ||
source_water, | ||
source_water_other, | ||
sex:cause_death_other | ||
) | ||
|
||
set.seed(59402910) | ||
|
||
# Random subsample of member rows (n = 1000); `id` is replaced by "PID" plus the | ||
# zero-padded row number, and `cluster` is overwritten with a random draw from | ||
# 1-5 (stored as character) | ||
# NOTE(review): str_pad(width = 3) does not truncate, so if 1000 rows are | ||
# sampled the last id is "PID1000", one character longer than the others -- | ||
# confirm this is acceptable | ||
dat_simple <- dat_simple_prep %>% | ||
slice_sample(n = 1000) %>% | ||
mutate(id = paste0("PID", stringr::str_pad(1:n(), width = 3, pad = "0"))) %>% | ||
mutate(cluster = as.character(sample(1:5, n(), replace = TRUE))) | ||
|
||
dat_simple %>% | ||
count(location, cluster) | ||
|
||
dat_hh %>% | ||
count(source_water, source_water_other) | ||
|
||
dat_simple %>% | ||
count(source_water, source_water_other) | ||
|
||
dat_simple %>% | ||
count(arrived, departed, born, died) | ||
|
||
dat_simple %>% | ||
count(died, cause_death, cause_death_other) | ||
|
||
|
||
qxl::qxl( | ||
list(`Mortality Survey` = dat_simple), | ||
"data/mortality_survey_simple_data.xlsx" | ||
) | ||
|
||
# Keep dictionary rows for variables present in the simplified dataset, and | ||
# extract the trailing run of word characters preceded by a space in `type` | ||
# into a helper column `list_name` (placed right after `type`) | ||
dict_survey_simple_prep <- dict_survey %>% | ||
filter(name %in% names(dat_simple)) %>% | ||
mutate(list_name = stringr::str_extract(type, pattern = "(?<= )\\w*$"), .after = type) | ||
|
||
# Keep only the choice rows whose `list_name` is referenced by a retained | ||
# survey row | ||
dict_choices_simple <- dict_choices %>% | ||
semi_join(dict_survey_simple_prep, by = "list_name") | ||
|
||
# Drop the helper column again to get the final survey sheet | ||
dict_survey_simple <- dict_survey_simple_prep %>% | ||
select(-list_name) | ||
|
||
qxl::qxl( | ||
list(survey = dict_survey_simple, options = dict_choices_simple, settings = dict_settings), | ||
"data/mortality_survey_simple_kobo.xlsx" | ||
) | ||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,55 @@ | ||
|
||
library(tidyverse) | ||
library(rio) | ||
library(datadict) | ||
|
||
|
||
## Load data | ||
|
||
# import dataset | ||
dat <- rio::import("data/mortality_survey_simple_data.xlsx", setclass = "tbl") | ||
|
||
# import ODK dictionary (note the main dictionary and multiple-choice options are in separate sheets) | ||
odk_survey <- rio::import("data/mortality_survey_simple_kobo.xlsx", sheet = "survey", setclass = "tbl") | ||
odk_choices <- rio::import("data/mortality_survey_simple_kobo.xlsx", sheet = "options", setclass = "tbl") | ||
|
||
|
||
## Exercise 1 | ||
dict_dat <- datadict::dict_from_data(dat) | ||
dict_odk <- datadict::dict_from_odk(odk_survey, odk_choices) | ||
|
||
|
||
## Exercise 2 | ||
# Convert columns whose names start with "date" to class Date, and the | ||
# age_months, age_years and muac columns to integer, before building a new | ||
# dictionary template from the reclassed data | ||
dat_reclass <- dat %>% | ||
mutate( | ||
across(starts_with("date"), as.Date), | ||
across(c(age_months, age_years, muac), as.integer) | ||
) | ||
|
||
dict_dat_reclass <- datadict::dict_from_data(dat_reclass) | ||
|
||
|
||
## Exercise 3 | ||
dict_odk_options <- datadict::coded_options(dict_odk) | ||
dict_dat_options <- datadict::coded_options(dict_dat_reclass) | ||
|
||
|
||
## Exercise 4 | ||
datadict::valid_dict(dict_dat_reclass) | ||
|
||
dict_nonvalid <- dict_dat_reclass | ||
dict_nonvalid$variable_name[6] <- "source_water" # duplicate variable name | ||
dict_nonvalid$type[10] <- NA_character_ # missing variable type | ||
|
||
datadict::valid_dict(dict_nonvalid) | ||
|
||
|
||
## Exercise 5 | ||
datadict::valid_data(dat_reclass, dict_dat_reclass) | ||
|
||
dat_nonvalid <- dat_reclass[,-12] # remove column present in dict | ||
dat_nonvalid$age_years[5] <- "5yrs" # non-valid value of numeric variable | ||
|
||
datadict::valid_data(dat_nonvalid, dict_dat_reclass) | ||
|
||
|
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,3 @@ | ||
|
||
Materials for OCA data sharing training on producing a data dictionary and | ||
pseudonymisation. |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,139 @@ | ||
--- | ||
title: "OCA Data Sharing Practical: Data dictionaries" | ||
author: "Patrick Barks" | ||
date: "Generated `r format(Sys.time(), format = '%Y-%m-%d')`" | ||
output: | ||
html_document: | ||
toc: false | ||
theme: united | ||
highlight: tango | ||
editor_options: | ||
chunk_output_type: console | ||
always_allow_html: true | ||
--- | ||
|
||
```{r setup, include=FALSE} | ||
knitr::opts_chunk$set() | ||
``` | ||
|
||
## Contents | ||
- [Objectives](#objectives) | ||
- [Background](#background) | ||
- [Dictionary format](#format) | ||
- [The `datadict` package](#datadict) | ||
- [Exercises](#exercises) | ||
|
||
## <a name="objectives"></a>Objectives | ||
|
||
Learn to: | ||
|
||
- prepare an OCA-style data dictionary, starting from either a raw dataset or an ODK/Kobo dictionary | ||
- compare datasets and corresponding data dictionaries to ensure completeness and validity | ||
|
||
## <a name="background"></a>Background | ||
|
||
Documenting the variables in a dataset is a crucial part of data management and | ||
ensures that a dataset is interpretable by researchers who were not directly | ||
involved in study design or data collection. Whereas data files will generally | ||
contain codenames for variables (e.g. `patagegrp` for “Patient’s age group”), | ||
and sometimes also coded data values, the data dictionary describes each | ||
variable and the set of possible values in plain language that is more broadly | ||
interpretable. | ||
|
||
Data collection platforms such as Kobo, REDCap, and OpenClinica have their own | ||
specialized data dictionary format. To facilitate dataset and data dictionary | ||
validation, the OCA Data Sharing Platform uses its own standardized dictionary | ||
format, defined in the next section. | ||
|
||
## <a name="format"></a>Dictionary format | ||
|
||
Every dataset shared or archived on the OCA platform must have an accompanying | ||
data dictionary that includes, at a minimum, the fields described below. Each | ||
variable in the dataset (i.e. each column) must have a dictionary entry for each | ||
of the required fields. | ||
|
||
| Required field | Description | Example entry | | ||
| -------------- | ----------- | ------------- | | ||
| variable_name | Variable name (i.e. exact column name within the corresponding dataset) | "sample_type" | | ||
| short_label | Short phrase describing the variable in words | "Type of laboratory sample collected" | | ||
| origin | Was the variable a part of the original data collection instrument (option "original"), or was it later derived (option "derived") | "original" | | ||
| type | Variable type (options: "Numeric", "Date", "Time", "Datetime", "Coded list", or "Free text") | "Coded list" | | ||
| choices | The list of options (pairs of codes and labels) corresponding to a variable of type "Coded list". | "1, Blood \| 2, Nasal swab \| 3, Throat swab \| 4, Other" | | ||
|
||
## <a name="datadict"></a>The `datadict` package | ||
|
||
The R package [`datadict`](https://github.com/epicentre-msf/datadict) contains a | ||
variety of functions to aid in the preparation of an OCA-style data dictionary: | ||
|
||
- `dict_from_data()`: prepare a dictionary template from a raw dataset | ||
- `dict_from_odk()`: prepare a dictionary template from an ODK/Kobo dictionary | ||
- `dict_from_redcap()`: prepare a dictionary template from a REDCap dictionary | ||
- `valid_dict()`: verify that a dictionary is consistent with the OCA format | ||
- `valid_data()`: verify that a dataset corresponds to its associated data dictionary | ||
|
||
Note that dictionary templates produced by the `dict_from_` functions may still | ||
require further processing by the user (e.g. with additional R scripts, or by | ||
hand in Excel). | ||
|
||
## <a name="exercises"></a>Exercises | ||
|
||
This repository includes an example dataset based on a mortality survey. Load | ||
the dataset and corresponding ODK data dictionary using the example code below, | ||
and work through the following exercises using functions from the `datadict` | ||
package where possible. | ||
|
||
```{r, message=FALSE} | ||
library(rio) | ||
library(datadict) | ||
# import dataset | ||
dat <- rio::import("data/mortality_survey_simple_data.xlsx", setclass = "tbl") | ||
# import ODK dictionary (note the main dictionary and multiple-choice options are in separate sheets) | ||
odk_survey <- rio::import("data/mortality_survey_simple_kobo.xlsx", sheet = "survey", setclass = "tbl") | ||
odk_choices <- rio::import("data/mortality_survey_simple_kobo.xlsx", sheet = "options", setclass = "tbl") | ||
``` | ||
|
||
#### Exercise 1: | ||
|
||
With the `datadict` package we can prepare a dictionary template *either* from | ||
the raw dataset (using `dict_from_data()`) or from the ODK dictionary (using | ||
`dict_from_odk()`). Try both approaches separately and compare the resulting | ||
dictionary templates. What are some differences? Can you guess why these | ||
differences are occurring? | ||
|
||
#### Exercise 2: | ||
|
||
When producing a dictionary template using `dict_from_data()`, the variable type | ||
is determined by the class of the original column (e.g. character, numeric, | ||
Date). The column classes that are read in by e.g. `rio::import()` might not | ||
always correspond to the variable types that we have in mind (e.g. numbers, | ||
dates, and times are sometimes read in as class "character"). Where necessary, | ||
transform the columns of `dat` using functions like `as.numeric()` or | ||
`as.Date()` and then produce another dictionary template using | ||
`dict_from_data()`. What are the differences that remain between this dictionary | ||
template and the template derived from the ODK dictionary? | ||
|
||
#### Exercise 3: | ||
|
||
Examine the options for the 'Coded list' type variables in the dictionary | ||
produced in exercise 2, and compare these to the corresponding options produced | ||
by `dict_from_odk()`. Why does the dictionary produced by `dict_from_data()` | ||
have fewer 'Coded list' options for some variables? Does this matter in terms of | ||
data sharing? | ||
|
||
Hint: check out the function `datadict::coded_options()` to extract a long-form | ||
table of Coded list variables and corresponding options from a dictionary. | ||
|
||
#### Exercise 4: | ||
|
||
Use the function `valid_dict()` to check that the dictionary you produced in | ||
Exercise 2 complies with the OCA standard. Assuming it does, edit the dictionary | ||
so that it fails at least two of the checks implemented by `valid_dict()`. | ||
|
||
#### Exercise 5: | ||
|
||
Use the function `valid_data()` to check for consistency between the dataset and | ||
dictionary produced in Exercise 2. Assuming all checks pass, edit the dataset so | ||
that it fails at least two of the checks implemented by `valid_data()`. | ||
|
Large diffs are not rendered by default.
Oops, something went wrong.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.