Skip to content

Commit

Permalink
Initial commit
Browse files Browse the repository at this point in the history
  • Loading branch information
patrickbarks committed Mar 8, 2022
0 parents commit 9631edd
Show file tree
Hide file tree
Showing 11 changed files with 761 additions and 0 deletions.
4 changes: 4 additions & 0 deletions .gitignore
Original file line number Diff line number Diff line change
@@ -0,0 +1,4 @@
.Rproj.user
data-raw
.DS_Store
.Rhistory
13 changes: 13 additions & 0 deletions OCA-training.Rproj
Original file line number Diff line number Diff line change
@@ -0,0 +1,13 @@
Version: 1.0

RestoreWorkspace: Default
SaveWorkspace: Default
AlwaysSaveHistory: Default

EnableCodeIndexing: Yes
UseSpacesForTab: Yes
NumSpacesForTab: 2
Encoding: UTF-8

RnwWeave: Sweave
LaTeX: pdfLaTeX
126 changes: 126 additions & 0 deletions R/data_prep.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,126 @@


library(tidyverse)
library(readxl)
library(qxl)

#' Rename variable references inside `{...}` placeholders
#'
#' For every entry of `swap`, each occurrence of `{old_name}` in `x` is
#' replaced by `{new_name}`. Text outside of braces is never modified.
#'
#' @param x Character vector of expressions (e.g. the `relevant` column of a
#'   survey dictionary). `NA` elements are returned as `NA`.
#' @param swap Named vector: values are the current variable names, names are
#'   the replacement names. Entries with a missing current or replacement name
#'   are skipped.
#' @return `x` with the matching references renamed.
update_relevant <- function(x, swap) {
  if (length(swap) == 0) {
    return(x)
  }

  new_names <- names(swap)
  if (is.null(new_names)) {
    stop("`swap` must be a named vector", call. = FALSE)
  }
  old_names <- as.character(swap)

  for (j in seq_along(old_names)) {
    # without both names there is nothing to rename from or to
    if (is.na(old_names[j]) || is.na(new_names[j])) {
      next
    }

    # fixed = TRUE matches the braces and the name literally, so regex
    # metacharacters in a variable name (e.g. ".") cannot match other names
    x <- gsub(
      paste0("{", old_names[j], "}"),
      paste0("{", new_names[j], "}"),
      x,
      fixed = TRUE
    )
  }

  x
}


## recreate mortality survey data and kobo dict, but with shorter variable names
path_kobo_raw <- "data-raw/KoboMortalitySurvey.xls"
path_data_raw <- "data-raw/MortalitySurveyData.xlsx"

# the three sheets of the raw dictionary workbook
dict_survey <- readxl::read_xls(path_kobo_raw, sheet = 1)
dict_choices <- readxl::read_xls(path_kobo_raw, sheet = 2)
dict_settings <- readxl::read_xls(path_kobo_raw, sheet = 3)

# lookup of the form c(short_name = "original_name"), built from the
# dictionary rows that have a variable name
has_name <- !is.na(dict_survey$name)
names_shorten <- setNames(
  dict_survey$name[has_name],
  dict_survey$name_short[has_name]
)

# replace `name` by `name_short`, and apply the same renaming to the variable
# references inside the `relevant` expressions
dict_survey_out <- dict_survey %>%
  select(-name) %>%
  rename(name = name_short) %>%
  mutate(relevant = map_chr(relevant, update_relevant, swap = names_shorten))

dat_hh <- readxl::read_xlsx(path_data_raw, sheet = 1)
dat_mb <- readxl::read_xlsx(path_data_raw, sheet = 2)

# any_of() takes the named lookup directly and ignores entries that are not
# columns of the sheet being renamed
dat_hh_short <- dat_hh %>% rename(any_of(names_shorten))
dat_mb_short <- dat_mb %>% rename(any_of(names_shorten))

qxl::qxl(
  list(`Mortality Survey` = dat_hh_short, hh_member = dat_mb_short),
  file = "data/mortality_survey_data.xlsx"
)

qxl::qxl(
  list(
    survey = dict_survey_out,
    choices = dict_choices,
    settings = dict_settings
  ),
  file = "data/mortality_survey_kobo.xlsx"
)

# drop the objects read from data-raw/ so that later code cannot pick them up
# by accident
rm(dict_survey, dict_choices, dict_settings, dat_hh, dat_mb)




## create a simpler version of mortality survey dataset by merging a few cols of
# household-level data with member-level data

path_kobo <- "data/mortality_survey_kobo.xlsx"
path_data <- "data/mortality_survey_data.xlsx"

dict_survey <- readxl::read_xlsx(path_kobo, sheet = 1)
dict_choices <- readxl::read_xlsx(path_kobo, sheet = 2)
dict_settings <- readxl::read_xlsx(path_kobo, sheet = 3)

dat_hh <- readxl::read_xlsx(path_data, sheet = 1)
dat_mb <- readxl::read_xlsx(path_data, sheet = 2)

# join household columns onto the member rows (member `_parent_index` is
# matched to household `_index`), then keep only the columns of interest
dat_simple_prep <- dat_mb %>%
  select(
    id = `_index`,
    `_parent_index`,
    sex:cause_death_other
  ) %>%
  left_join(dat_hh, by = c("_parent_index" = "_index")) %>%
  select(
    id,
    date,
    location,
    cluster,
    source_water,
    source_water_other,
    sex:cause_death_other
  )

# fixed seed so that the row sample and the cluster draw are reproducible
set.seed(59402910)

# NOTE(review): width = 3 pads ids to three digits, so with 1000 sampled rows
# the last id is "PID1000" while the others look like "PID001" -- confirm
# whether a uniform width is wanted before changing the generated data
dat_simple <- dat_simple_prep %>%
  slice_sample(n = 1000) %>%
  mutate(
    id = paste0("PID", stringr::str_pad(seq_len(n()), width = 3, pad = "0"))
  ) %>%
  mutate(cluster = as.character(sample(1:5, n(), replace = TRUE)))

# quick tabulations for eyeballing the result when run interactively
dat_simple %>%
  count(location, cluster)

dat_hh %>%
  count(source_water, source_water_other)

dat_simple %>%
  count(source_water, source_water_other)

dat_simple %>%
  count(arrived, departed, born, died)

dat_simple %>%
  count(died, cause_death, cause_death_other)


qxl::qxl(
  list(`Mortality Survey` = dat_simple),
  file = "data/mortality_survey_simple_data.xlsx"
)

# keep only dictionary rows for variables present in the simple dataset;
# `list_name` (the last word of `type`) is added temporarily so the choices
# sheet can be filtered to the lists still in use
dict_survey_simple_prep <- dict_survey %>%
  filter(name %in% names(dat_simple)) %>%
  mutate(
    list_name = stringr::str_extract(type, pattern = "(?<= )\\w*$"),
    .after = type
  )

dict_choices_simple <- dict_choices %>%
  semi_join(dict_survey_simple_prep, by = "list_name")

dict_survey_simple <- dict_survey_simple_prep %>%
  select(-list_name)

# the choices sheet is written under the name "options"
qxl::qxl(
  list(
    survey = dict_survey_simple,
    options = dict_choices_simple,
    settings = dict_settings
  ),
  file = "data/mortality_survey_simple_kobo.xlsx"
)

55 changes: 55 additions & 0 deletions R/exercises.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,55 @@

library(tidyverse)
library(rio)
library(datadict)


## Load data

# dataset, read in as a tibble
dat <- rio::import(
  "data/mortality_survey_simple_data.xlsx",
  setclass = "tbl"
)

# ODK dictionary: questions live in the "survey" sheet and the multiple-choice
# options in the "options" sheet
odk_survey <- rio::import(
  "data/mortality_survey_simple_kobo.xlsx",
  sheet = "survey",
  setclass = "tbl"
)
odk_choices <- rio::import(
  "data/mortality_survey_simple_kobo.xlsx",
  sheet = "options",
  setclass = "tbl"
)


## Exercise 1
# dictionary templates from the data and from the ODK dictionary
dict_dat <- datadict::dict_from_data(dat)
dict_odk <- datadict::dict_from_odk(odk_survey, odk_choices)


## Exercise 2
# convert date columns to Date and the age / muac columns to integer before
# deriving the template again
dat_reclass <- mutate(
  dat,
  across(starts_with("date"), as.Date),
  across(c(age_months, age_years, muac), as.integer)
)

dict_dat_reclass <- datadict::dict_from_data(dat_reclass)


## Exercise 3
# long-form tables of the coded options in each dictionary
dict_odk_options <- datadict::coded_options(dict_odk)
dict_dat_options <- datadict::coded_options(dict_dat_reclass)


## Exercise 4
datadict::valid_dict(dict_dat_reclass)

# break the dictionary on purpose in two ways
dict_nonvalid <- dict_dat_reclass
dict_nonvalid[["variable_name"]][6] <- "source_water" # duplicate variable name
dict_nonvalid[["type"]][10] <- NA_character_ # missing variable type

datadict::valid_dict(dict_nonvalid)


## Exercise 5
datadict::valid_data(dat_reclass, dict_dat_reclass)

# break the dataset on purpose in two ways
dat_nonvalid <- dat_reclass[, -12] # remove column present in dict
dat_nonvalid[["age_years"]][5] <- "5yrs" # non-valid value of numeric variable

datadict::valid_data(dat_nonvalid, dict_dat_reclass)


3 changes: 3 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@

Materials for OCA data sharing training on producing a data dictionary and
pseudonymisation.
139 changes: 139 additions & 0 deletions Rmd/practical_data_dict.Rmd
Original file line number Diff line number Diff line change
@@ -0,0 +1,139 @@
---
title: "OCA Data Sharing Practical: Data dictionaries"
author: "Patrick Barks"
date: "Generated `r format(Sys.time(), format = '%Y-%m-%d')`"
output:
html_document:
toc: false
theme: united
highlight: tango
editor_options:
chunk_output_type: console
always_allow_html: true
---

```{r setup, include=FALSE}
# called without arguments, so no chunk options are changed here
knitr::opts_chunk$set()
```

## Contents
- [Objectives](#objectives)
- [Background](#background)
- [Dictionary format](#format)
- [The `datadict` package](#datadict)
- [Exercises](#exercises)

## <a name="objectives"></a>Objectives

Learn to:

- prepare an OCA-style data dictionary, starting from either a raw dataset or an ODK/Kobo dictionary
- compare datasets and corresponding data dictionaries to ensure completeness and validity

## <a name="background"></a>Background

Documenting the variables in a dataset is a crucial part of data management and
ensures that a dataset is interpretable by researchers who were not directly
involved in study design or data collection. Whereas data files will generally
contain codenames for variables (e.g. `patagegrp` for “Patient’s age group”),
and sometimes also coded data values, the data dictionary describes each
variable and the set of possible values in plain language that is more broadly
interpretable.

Data collection platforms such as Kobo, REDCap, and OpenClinica each have their
own specialized data dictionary format. To facilitate dataset and data dictionary
validation, the OCA Data Sharing Platform uses its own standardized dictionary
format, defined in the next section.

## <a name="format"></a>Dictionary format

Every dataset shared or archived on the OCA platform must have an accompanying
data dictionary that includes, at a minimum, the fields described below. Each
variable in the dataset (i.e. each column) must have a dictionary entry for each
of the required fields.

| Required field | Description | Example entry |
| -------------- | ----------- | ------------- |
| variable_name | Variable name (i.e. exact column name within the corresponding dataset) | "sample_type" |
| short_label | Short phrase describing the variable in words | "Type of laboratory sample collected" |
| origin | Was the variable a part of the original data collection instrument (option "original"), or was it later derived (option "derived") | "original" |
| type | Variable type (options: "Numeric", "Date", "Time", "Datetime", "Coded list", or "Free text") | "Coded list" |
| choices | The list of options (pairs of codes and labels) corresponding to a variable of type "Coded list". | "1, Blood \| 2, Nasal swab \| 3, Throat swab \| 4, Other" |

## <a name="datadict"></a>The `datadict` package

The R package [`datadict`](https://github.com/epicentre-msf/datadict) contains a
variety of functions to aid in the preparation of an OCA-style data dictionary:

- `dict_from_data()`: prepare a dictionary template from a raw dataset
- `dict_from_odk()`: prepare a dictionary template from an ODK/Kobo dictionary
- `dict_from_redcap()`: prepare a dictionary template from a REDCap dictionary
- `valid_dict()`: verify that a dictionary is consistent with the OCA format
- `valid_data()`: verify that a dataset corresponds to its associated data dictionary

Note that dictionary templates produced by the `dict_from_` functions may still
require further processing by the user (e.g. with additional R scripts, or by
hand in Excel).

## <a name="exercises"></a>Exercises

This repository includes an example dataset based on a mortality survey. Load
the dataset and corresponding ODK data dictionary using the example code below,
and work through the following exercises using functions from the `datadict`
package where possible.

```{r, message=FALSE}
library(rio)
library(datadict)
# import the example dataset
dat <- rio::import("data/mortality_survey_simple_data.xlsx", setclass = "tbl")
# import the ODK dictionary (note that the main dictionary and the multiple-choice
# options are in separate sheets, named "survey" and "options")
odk_survey <- rio::import("data/mortality_survey_simple_kobo.xlsx", sheet = "survey", setclass = "tbl")
odk_choices <- rio::import("data/mortality_survey_simple_kobo.xlsx", sheet = "options", setclass = "tbl")
```

#### Exercise 1:

With the `datadict` package we can prepare a dictionary template *either* from
the raw dataset (using `dict_from_data()`) or from the ODK dictionary (using
`dict_from_odk()`). Try both approaches separately and compare the resulting
dictionary templates. What are some differences? Can you guess why these
differences are occurring?

#### Exercise 2:

When producing a dictionary template using `dict_from_data()`, the variable type
is determined by the class of the original column (e.g. character, numeric,
Date). The column classes that are read in by e.g. `rio::import()` might not
always correspond to the variable types that we have in mind (e.g. numbers,
dates, and times are sometimes read in as class "character"). Where necessary,
transform the columns of `dat` using functions like `as.numeric()` or
`as.Date()` and then produce another dictionary template using
`dict_from_data()`. What are the differences that remain between this dictionary
template and the template derived from the ODK dictionary?

#### Exercise 3:

Examine the options for the 'Coded list' type variables in the dictionary
produced in exercise 2, and compare these to the corresponding options produced
by `dict_from_odk()`. Why does the dictionary produced by `dict_from_data()`
have fewer 'Coded list' options for some variables? Does this matter in terms of
data sharing?

Hint: check out the function `datadict::coded_options()` to extract a long-form
table of Coded list variables and corresponding options from a dictionary.

#### Exercise 4:

Use the function `valid_dict()` to check that the dictionary you produced in
Exercise 2 complies with the OCA standard. Assuming it does, edit the dictionary
so that it fails at least two of the checks implemented by `valid_dict()`.

#### Exercise 5:

Use the function `valid_data()` to check for consistency between the dataset and
dictionary produced in Exercise 2. Assuming all checks pass, edit the dataset so
that it fails at least two of the checks implemented by `valid_data()`.

421 changes: 421 additions & 0 deletions Rmd/practical_data_dict.html

Large diffs are not rendered by default.

Binary file added data/mortality_survey_data.xlsx
Binary file not shown.
Binary file added data/mortality_survey_kobo.xlsx
Binary file not shown.
Binary file added data/mortality_survey_simple_data.xlsx
Binary file not shown.
Binary file added data/mortality_survey_simple_kobo.xlsx
Binary file not shown.

0 comments on commit 9631edd

Please sign in to comment.