# inst/doc/c-explore-and-prepare-input-datasets-and-data-dictionaries.R

## ----include = FALSE----------------------------------------------------------
# Global knitr chunk options for this vignette: merge each chunk's source and
# output into a single block, and prefix every output line with "#>".
knitr::opts_chunk$set(collapse = TRUE, comment = "#>")

## ----eval=FALSE---------------------------------------------------------------
# # Load relevant packages
# library(Rmonize)
# library(madshapR) # Additional functions to work with data dictionaries
# library(fabR) # Additional functions to help with importing data
# 
# library(tidyverse) # Collection of R packages for data science

## ----eval=FALSE---------------------------------------------------------------
# # Get original datasets that were read from different file formats
# 
# # Datasets from .sav files that contain variable metadata
# original_dataset_study1 <- Rmonize_examples$original_dataset_study1
# original_dataset_study2 <- Rmonize_examples$original_dataset_study2
# 
# # Dataset from .csv file, with no data dictionary
# original_dataset_study3 <- Rmonize_examples$original_dataset_study3
# 
# # Datasets from .xlsx files, with data dictionaries in separate files
# original_dataset_study4 <- Rmonize_examples$original_dataset_study4
# original_dataset_study5 <- Rmonize_examples$original_dataset_study5

## ----eval=FALSE---------------------------------------------------------------
# # Existing metadata (e.g., from datasets from .sav files) can be extracted as data dictionaries
# original_dd_study1 <- data_dict_extract(original_dataset_study1)
# original_dd_study2 <- data_dict_extract(original_dataset_study2)
# 
# # If data dictionaries are provided as separate files, assign them separately
# original_dd_study4 <- Rmonize_examples$original_data_dictionary_study4
# original_dd_study5 <- Rmonize_examples$original_data_dictionary_study5
# 
# # A minimal data dictionary can be extracted from any dataset, but it provides very limited information
# extracted_dd_study3 <- data_dict_extract(original_dataset_study3)

## ----eval=FALSE---------------------------------------------------------------
# # Confirm that the object is a valid dataset (a data frame or tibble)
# is_dataset(original_dataset_study4) # TRUE
# 
# # Evaluate dataset only
# dataset_evaluate(original_dataset_study4)

## ----eval=FALSE---------------------------------------------------------------
# # Make the data dictionary a list containing a data frame named 'Variables'
# modified_dd_study4 <- list(
#   "Variables" = original_dd_study4)
# 
# # Rename columns using standardized names
# modified_dd_study4$Variables <-
#   modified_dd_study4$Variables %>%
#   rename(name = Variable,
#          label = Label,
#          valueType = `Data type`)
# 
# # Evaluate data dictionary only
# data_dict_evaluate(modified_dd_study4)

## ----eval=FALSE---------------------------------------------------------------
# # Get the accepted 'valueType' values and their R equivalents
# madshapR::valueType_list
# 
# # Recode variable valueTypes
# compatible_dd_study4 <- modified_dd_study4
# 
# compatible_dd_study4$Variables <-
#   compatible_dd_study4$Variables %>%
#   mutate(valueType = case_match(
#     valueType,
#     "character" ~ "text",
#     "numeric" ~ "decimal",
#     "integer" ~ "integer"
#   ))
# 
# # Use function data_dict_expand() to create `Categories`
# ?madshapR::data_dict_expand() # See the function documentation
# 
# # Rename column `Category codes` with a standardized name for easy processing
# compatible_dd_study4$Variables <-
#   compatible_dd_study4$Variables %>%
#   rename(`Categories::label` = `Category codes`)
# 
# # Create 'Categories' data frame
# compatible_dd_study4 <-
#   data_dict_expand(
#   data_dict = compatible_dd_study4) %>%
#   as_data_dict_mlstr() # Ensure correct formatting
# 
# # Correctly code categories that indicate types of missing values
# compatible_dd_study4$Categories <-
#   compatible_dd_study4$Categories %>%
#   mutate(
#     missing = ifelse(
#       label %in% c("Don't know", "Prefer not to answer"), TRUE, FALSE))
# 
# # Rerun the data dictionary evaluation to confirm the corrections were made
# data_dict_evaluate(compatible_dd_study4)

## ----eval=FALSE---------------------------------------------------------------
# # Format the dataset
# compatible_dataset_study4 <-
#   as_dataset(original_dataset_study4, col_id = "ID")
# 
# # Evaluate the dataset and data dictionary together, as separate objects
# # Tip: Assign the output to an object for easier viewing.
# dataset_evaluation_study4 <- dataset_evaluate(
#   dataset = compatible_dataset_study4,
#   data_dict = compatible_dd_study4)
# 
# # View the new informational messages in the RStudio viewer
# View(dataset_evaluation_study4[["Dataset assessment"]])
# 
# # Note: the informational messages about duplicated rows are a side effect
# # of a synthesized dataset with few variables. Rows are more likely to have
# # the same values.

## ----eval=FALSE---------------------------------------------------------------
# # Correct a variable name in the data dictionary to match the dataset
# formatted_dd_study4 <- compatible_dd_study4
# formatted_dd_study4$Variables <-
#   formatted_dd_study4$Variables %>%
#   mutate(name = ifelse(name == "marital", "marital_v1", name))
# formatted_dd_study4$Categories <-
#   formatted_dd_study4$Categories %>%
#   mutate(variable = ifelse(variable == "marital", "marital_v1", variable))
# 
# # Adjust two variable valueTypes in the dataset to match the data dictionary
# formatted_dataset_study4 <- compatible_dataset_study4 %>%
#   mutate(
#     # First convert "NA" strings to missing values (NA)
#     across(c(drink_four_preg_v1, drink_four_preg_v3), ~ na_if(.x, "NA")),
#     # Convert the variables to integer
#     across(c(drink_four_preg_v1, drink_four_preg_v3), as_any_integer))

## ----eval=FALSE---------------------------------------------------------------
# # Associate a dataset with its data dictionary (only possible when there are no
# # errors in the previous evaluation)
# dataset_with_dd_study4 <- data_dict_apply(
#   dataset = formatted_dataset_study4,
#   data_dict = formatted_dd_study4)
# 
# # If you want, evaluate the dataset and data dictionary together
# # (confirms that there are no errors, but otherwise provides same information)
# dataset_evaluate(dataset_with_dd_study4)

## ----eval=FALSE---------------------------------------------------------------
# # Summarize a dataset with an associated data dictionary
# summary_dataset_with_dd_study4 <- dataset_summarize(
#   dataset = dataset_with_dd_study4)
# 
# # View the summary outputs
# View(summary_dataset_with_dd_study4)
# summary_dataset_with_dd_study4$`Numerical variable summary` %>% View()
# 
# # WARNING: This script creates a folder 'tmp'.
# output_path <- paste0('tmp/',basename(tempdir()))
# # recursive = TRUE also creates the parent folder 'tmp' if it does not exist
# dir.create(output_path, recursive = TRUE)
# write_excel_allsheets(
#   summary_dataset_with_dd_study4, paste0(output_path,"/summary_dataset_with_dd_study4.xlsx"))

## ----eval=FALSE---------------------------------------------------------------
# # Produce a visual report of the dataset and variables
# # You must specify a folder to contain the visual report files, and the folder name must not already exist.
# # WARNING: This script creates a folder 'tmp'.
# bookdown_path <- paste0('tmp/',basename(tempdir()))
# # unlink() is needed here: file.remove() cannot delete a non-empty folder
# if(dir.exists(bookdown_path)) unlink(bookdown_path, recursive = TRUE)
# 
# dataset_visualize(
#   dataset = dataset_with_dd_study4,
#   bookdown_path = bookdown_path,
#   dataset_summary = summary_dataset_with_dd_study4)
# 
# # Open the visual report in a browser.
# bookdown_open(bookdown_path)
# # Or open 'bookdown_path/docs/index.html'.

## ----eval=FALSE---------------------------------------------------------------
# # Prepare a version of the cleaned validated input dataset (with associated data dictionary)
# input_dd_study4 <- formatted_dd_study4
# input_dataset_study4 <- formatted_dataset_study4 %>%
#   # Erroneous values noted in summary reports can be removed
#   mutate(age_v1 = ifelse(age_v1>100, NA, age_v1)) %>%
#   # If desired, specify the column that provides unique IDs
#   as_dataset(col_id = "ID") %>%
#   # Associate the data dictionary
#   data_dict_apply(data_dict = input_dd_study4)
# 
# # Save the cleaned, validated input dataset as an .rds file
# saveRDS(input_dataset_study4, paste0(output_path,"/input_dataset_study4.rds"))

# Try the Rmonize package in your browser

# Any scripts or data that you put into this service are public.

# Rmonize documentation built on July 1, 2025, 1:10 a.m.