epidict: Epidemiology data dictionaries and random data generators

Documented in gen_data

#' Generate random linelist or survey data
#'
#' Based on a dictionary generator like [msf_dict()] or [msf_dict_survey()],
#' this function will generate a randomized data set based on values defined in
#' the dictionaries. The randomized dataset produced should mimic an excel
#' export from DHIS2 for outbreaks and a Kobo export for surveys.
#'
#' @param dictionary Specify which dictionary you would like to use.
#'
#' @param varnames Specify name of column that contains variable names.
#'   If `dictionary` is a survey, `varnames` needs to be `"name"`.
#'
#' @param numcases Specify the number of cases you want (default is 300)
#'
#' @param org the organization the dictionary belongs to. Currently, only MSF
#'   exists. In the future, dictionaries from WHO and other organizations may
#'   become available.
#'
#' @return a data frame with cases in rows and variables in columns. The number
#'   of columns will vary from dictionary to dictionary, so please use the
#'   dictionary functions to generate a corresponding dictionary.
#' @export
#' @examples
#'
#' if (require("dplyr") && require("matchmaker")) {
#'   withAutoprint({
#'
#'     # You will often want to use MSF dictionaries to translate codes to human-
#'     # readable variables. Here, we generate a data set of 20 cases:
#'     dat <- gen_data(
#'       dictionary = "Cholera",
#'       varnames = "data_element_shortname",
#'       numcases = 20,
#'       org = "MSF"
#'     )
#'     print(dat)
#'
#'     # We want the expanded dictionary, so we will select `compact = FALSE`
#'     dict <- msf_dict(disease = "Cholera", long = TRUE, compact = FALSE, tibble = TRUE)
#'     print(dict)
#'
#'     # Now we can use matchmaker to translate the option codes in the data
#'     # into their human-readable option names:
#'     dat_clean <- matchmaker::match_df(dat, dict,
#'       from = "option_code",
#'       to = "option_name",
#'       by = "data_element_shortname",
#'       order = "option_order_in_set"
#'     )
#'     print(dat_clean)
#'
#'   })
#' }
gen_data <- function(dictionary, varnames = "data_element_shortname", numcases = 300, org = "MSF") {

  # Look up the requested dictionary name for this organization. The lookup
  # result is flattened into the name handed to the dictionary and generator
  # functions below; exactly one entry under `survey` flags a survey request.
  matched <- get_dictionary(dictionary, org)
  disease_name <- unlist(matched, use.names = FALSE)
  survey_requested <- length(matched$survey) == 1

  # Organization-specific helpers are looked up by name in the epidict
  # namespace. Their names embed the lowercase organization name:
  #   <org>_dict_survey  dictionary function used for surveys
  #   <org>_dict         dictionary function used otherwise
  #   gen_<org>_data     data generator
  org_prefix <- tolower(org)
  epidict_ns <- asNamespace("epidict")
  dict_template <- if (survey_requested) "%s_dict_survey" else "%s_dict"
  fetch_dictionary <- get(sprintf(dict_template, org_prefix), envir = epidict_ns)

  # Resolve the generator before building the dictionary so that a missing
  # generator is reported before any dictionary work is done.
  make_data <- get(sprintf("gen_%s_data", org_prefix), envir = epidict_ns)
  dict_table <- fetch_dictionary(
    disease = disease_name,
    tibble = FALSE,
    compact = TRUE
  )

  make_data(
    disease_name,
    dict_table,
    survey_requested,
    varnames = varnames,
    numcases = numcases
  )
}