ox: Reads OpenClinica xml export files

#' Data for a group of items, as a tidy dataframe
#'
#' Returns a tidy dataframe from a \code{ox_all} object, containing data for all
#' items of the specified \code{group}. In the resulting dataframe, each item is
#' a variable, and each row is an observation; dataframe variables are
#' identified by their \code{item_oid} (or optionally, by their \code{item_name}),
#' and are one of the following classes: \code{Date}, \code{numeric}, or
#' \code{character}. Optionally, items with codelists can be defined as
#' \code{factor}.
#'
#' @param ox_obj An object of class \code{ox_all}, as returned by \code{ox::ox_all()}.
#'
#' @param group A group of items, as \code{character} value. Must be one
#' of the \code{group_oid} values in \code{ox_obj$metadata$group_def}.
#'
#' @param define_factors A \code{logical} value. When \code{TRUE}, items
#' with codelists are defined as factors using the codelist. Defaults to
#' \code{FALSE}.
#'
#' @param use_item_names A \code{logical} value. When \code{TRUE},
#' \code{item_name} in \code{ox_obj$metadata$item_def} are used as variable
#' names in the resulting dataframe. Othewise, \code{item_oid} are used.
#' Defaults to \code{FALSE}.
#'
#' @return A dataframe with study, subject, event, form, group and group repeat keys,
#' plus all items in \code{group} as variables.
#'
#' @export
#'
#' @examples
#' # The example odm1.3 xml file address
#' my_file <- system.file("extdata",
#'                        "odm1.3_full_example.xml",
#'                        package = "ox",
#'                        mustWork = TRUE)
#'
#' # Parsing the xml file
#' library(XML)
#' doc <- xmlParse(my_file)
#'
#' # Create ox_all object
#' my_study <- ox_all(doc)
#'
#' # Item groups
#' unique(my_study$metadata$group_def$group_oid)
#'
#' # Extract data for a group
#' demo <- ox_xtract_group(my_study,
#'                         group = "IG_DEMO_DEMOGRAPHICDATA")
#' head(demo)
#'
#' # Same, using item names to identify vars, and
#' # defining factors for items with codelist
#' demo_2 <- ox_xtract_group(my_study,
#'                           group = "IG_DEMO_DEMOGRAPHICDATA",
#'                           define_factors = TRUE,
#'                           use_item_names = TRUE)
#' head(demo_2)
#'
ox_xtract_group <- function (ox_obj, group,
                             define_factors = FALSE,
                             use_item_names = FALSE) {

  if ( class(ox_obj)[1] != "ox_all") {
    stop("ox_obj should be an object of class ox_all", call. = FALSE)
  }

  if ( class(group) != "character" | length(group) != 1) {
    stop("group should be character of length 1", call. = FALSE)
  }

  if ( !group %in% ox_obj$metadata$group_def$group_oid ) {
    stop("group is none of the group_oid in ox_obj", call. = FALSE)
  }


  if ( class(define_factors) != "logical" | length(define_factors) > 1) {
    stop("define_factors should be logical of length 1", call. = FALSE)
  }

  if ( class(use_item_names) != "logical" | length(use_item_names) > 1) {
    stop("use_item_names should be logical of length 1", call. = FALSE)
  }

  # to denormalize the group data ----
  ox_obj$data %>%
    dplyr::filter(group_oid == group) %>%
    dplyr::select(study_oid,
                  subject_key,
                  dplyr::contains("subject_id"),
                  event_oid,
                  dplyr::contains("event_repeat_key"),    # because NOT always present!
                  form_oid,
                  group_oid,
                  item_oid,
                  dplyr::contains("group_repeat_key"),    # just in case
                  value) %>%
    tidyr::spread(item_oid , value) -> k

  # to define the var order of non-key vars
  ox_obj$metadata$item_ref %>%
    dplyr::filter(group_oid == group) %>%
    dplyr::arrange(item_order_number) %>%
    dplyr::pull(item_oid) -> vars_in_order

  # to identify key vars
  key_vars <- names(k)[!(names(k) %in% vars_in_order)]

  # basic output ----
  k %>%
    dplyr::select(dplyr::one_of(key_vars),
                  dplyr::one_of(vars_in_order)) -> res


  # all item values are character; some codelist values may be character
  # the match would be problematic if we change the item value type before
  # defining factors. That's why factor definition comes first.

  # define_factors ----
  if (define_factors == TRUE) {
    # res <- define_factors(res)
    ox_obj$metadata$codelist_ref %>%
      dplyr::left_join(ox_obj$metadata$codelist_item) %>%
      dplyr::select(item_oid, codelist_oid, coded_value, code_label) -> dic

    # identify vars with codelist
    vars_with_cl <- names(res)[names(res) %in% unique(dic$item_oid)]

    for (i in vars_with_cl) {

      # subset codelist for var
      dic %>%
        dplyr::filter(item_oid == i) -> var_dic

      # define factor
      res[[i]] <- factor(res[[i]],
                         levels = var_dic$coded_value,
                         labels = var_dic$code_label)

    }
  }

  # define vartypes ----
  ox_obj$metadata$item_def %>%
    dplyr::select(item_oid, item_name, item_data_type, item_significant_digits) %>%
    dplyr::filter(item_oid %in% names(res)[7:length(res)]) -> item_info

  # dates
  item_info %>%
    dplyr::filter(item_data_type == "date") %>%
    dplyr::pull(item_oid) -> dates

  if (length(dates) > 0) {
    for (i in 1:length(dates)) {
      res[[dates[i]]] <- as.Date(res[[dates[i]]])
    }
  }

  # numerics
  item_info %>%
    dplyr::filter(item_data_type %in% c("integer","float")) %>%
    dplyr::pull(item_oid) -> numerics

  lapply(res, class)[7:length(res)] -> resvar_class
  resvar_class[!resvar_class %in% c("factor", "Date")] -> k
  names(k) -> no_factor_or_date

  intersect(numerics, no_factor_or_date) -> convert_to_numeric

  if (length(convert_to_numeric) > 0) {
    for (i in 1:length(convert_to_numeric)) {
      res[[convert_to_numeric[i]]] <- as.numeric(res[[convert_to_numeric[i]]])
    }
  }

  # use item_names ----
  if (use_item_names == TRUE) {
    res_names <- names(res)[7:length(res)]

    for (i in 1:length(res_names)) {
      new_name <- item_info$item_name[item_info$item_oid == res_names[i]]
      res_names[i] <- ifelse(res_names[i] %in% item_info$item_oid,
                             item_info$item_name[item_info$item_oid == res_names[i]],
                             res_names[i])
    }
    names(res)[7:length(res)] <- res_names
  }

  # return ----
  res
}