R/util_combine_missing_lists.R

Defines functions CAN_THIS_BE_REMOVED_util_combine_missing_lists

Documented in CAN_THIS_BE_REMOVED_util_combine_missing_lists

#' Combine missing-lists for a set of variables to be displayed in the same
#' heat-map
#'
#' @param resp_vars [variable list] the name of the measurement variables
#' @param study_data [data.frame] the data frame that contains the measurements
#' @param meta_data [data.frame] the data frame that contains metadata
#'                               attributes of study data
#' @param label_col [variable attribute] the name of the column in the metadata
#'                                       with labels of variables
#' @param cause_label_df [data.frame] missing code table. If missing codes have
#'                                    labels the respective data frame can be
#'                                    specified here, see [cause_label_df]
#' @param assume_consistent_codes [logical] if TRUE and no labels are given and
#'                                          the same missing/jump code is used
#'                                          for more than one variable, the
#'                                          labels assigned for this code will
#'                                          be the same for all variables.
#' @param expand_codes [logical] if TRUE, code labels are copied from other
#'                               variables, if the code is the same and the
#'                               label is set somewhere
#' @param suppressWarnings [logical] warn about consistency issues with missing
#'                                   and jump lists
#' @param include_sysmiss [logical] Optional, if TRUE system missingness (NAs)
#'                                  is evaluated in the summary plot
#'
#' @return a [list] with:
#'   - `ModifiedStudyData`: data frame with re-coded (if needed) study data
#'   - `cause_label_df`: data frame with re-coded missing codes suitable for all
#'                       variables
#'
#' @family metadata_management
#' @concept robustness
#' @keywords internal


# FIXME
CAN_THIS_BE_REMOVED_util_combine_missing_lists <-
  function(resp_vars, study_data, meta_data, label_col,
           include_sysmiss, cause_label_df, assume_consistent_codes = TRUE,
           expand_codes = assume_consistent_codes,
           suppressWarnings = FALSE) {

    validated <-
      util_validate_missing_lists(meta_data = meta_data,
                                cause_label_df = cause_label_df,
                                assume_consistent_codes =
                                  assume_consistent_codes,
                                expand_codes = expand_codes,
                                label_col = label_col,
                                suppressWarnings = suppressWarnings)
    cause_label_df <- validated$cause_label_df

    prep_prepare_dataframes(.replace_missings = FALSE)

    # correct variable use?
    util_correct_variable_use("resp_vars",
                              allow_more_than_one = TRUE,
                              allow_null          = TRUE,
                              allow_any_obs_na    = TRUE
    )

    if (missing(resp_vars) || length(resp_vars) == 0)
      resp_vars <- colnames(ds1)

    if (missing(cause_label_df) || !prod(dim(cause_label_df))) {
      mc_lab <- data.frame(
        CODE_VALUE = NA,
        CODE_LABEL = NA,
        AUTO = NA,
        stringsAsFactors = FALSE
      )[FALSE, , FALSE]
    } else {
      mc_lab <- cause_label_df
    }

    overall_max_val <- max(
      suppressWarnings(as.numeric(as.matrix(study_data))), na.rm = TRUE) # TODO: Here may be a problem, if we have DATETIME missing codes

    mc_labs <- lapply(setNames(nm = resp_vars), function(rv) {
      ml <-
        util_get_code_list(rv, MISSING_LIST,
                           split_char = SPLIT_CHAR,
                           mdf = meta_data,
                           label_col = label_col,
                           warning_if_no_list = FALSE)
      jl <-
        util_get_code_list(rv, JUMP_LIST,
                           split_char = SPLIT_CHAR,
                           mdf = meta_data,
                           label_col = label_col,
                           warning_if_no_list = FALSE)
      #### Remove all codes unused by the current variable ####
      mc_lab <- mc_lab[mc_lab$CODE_VALUE %in% c(ml, jl), , drop = FALSE]
      mc_lab$CODE_CLASS[mc_lab$CODE_VALUE %in% ml] <- "MISSING"
      mc_lab$CODE_CLASS[mc_lab$CODE_VALUE %in% jl] <- "JUMP"

      #### Amend all missing codes ####
      if (any(!(ml %in% mc_lab$CODE_VALUE))) {
        code_label <- names(ml)
        if (any(is.na(ml == names(ml)))) { # nocov start
          util_warning(c("Internal problem with %s (for missing list)",
                         "This should not have happened, sorry. Please",
                         "report this issue."),
                       dQuote("util_get_code_list"),
                       applicability_problem = FALSE)
        } # nocov end
        if (assume_consistent_codes) {
          code_label[ml == names(ml)] <- paste("MISSING",
                                               ml[!(ml %in% mc_lab$CODE_VALUE)],
                                               sep = " ")
        } else {
          code_label[ml == names(ml)] <- paste("MISSING", rv,
                                               ml[!(ml %in% mc_lab$CODE_VALUE)],
                                               sep = " ")
        }
        mc_lab <- rbind.data.frame(mc_lab, data.frame(
          CODE_VALUE = ml[!(ml %in% mc_lab$CODE_VALUE)],
          CODE_LABEL = code_label[!(ml %in% mc_lab$CODE_VALUE)],
          resp_vars = rep(rv, sum(!(ml %in% mc_lab$CODE_VALUE), na.rm = TRUE)),
          CODE_CLASS = rep("MISSING",
                           sum(!(ml %in% mc_lab$CODE_VALUE), na.rm = TRUE)),
          AUTO = rep(TRUE, sum(!(ml %in% mc_lab$CODE_VALUE), na.rm = TRUE)),
          stringsAsFactors = FALSE
        ))
      }
      if (any(!(jl %in% mc_lab$CODE_VALUE))) {
        code_label <- names(jl)
        if (any(is.na(jl == names(jl)))) { # nocov start
          util_warning(c("Internal problem with %s (for jump list)",
                         "This should not have happened, sorry. Please",
                         "report this issue."),
                       dQuote("util_get_code_list"),
                       applicability_problem = FALSE)
        } # nocov end
        if (assume_consistent_codes) {
          code_label[jl == names(jl)] <- paste("JUMP",
                                               jl[!(jl %in% mc_lab$CODE_VALUE)],
                                               sep = " ")
        } else {
          code_label[jl == names(jl)] <- paste("JUMP", rv,
                                               jl[!(jl %in% mc_lab$CODE_VALUE)],
                                               sep = " ")
        }
        mc_lab <- rbind.data.frame(mc_lab, data.frame(
          CODE_VALUE = jl[!(jl %in% mc_lab$CODE_VALUE)],
          CODE_LABEL = code_label[!(jl %in% mc_lab$CODE_VALUE)],
          resp_vars = rep(rv, sum(!(jl %in% mc_lab$CODE_VALUE), na.rm = TRUE)),
          CODE_CLASS = rep("JUMP",
                           sum(!(jl %in% mc_lab$CODE_VALUE), na.rm = TRUE)),
          AUTO = rep(TRUE,
                     sum(!(jl %in% mc_lab$CODE_VALUE), na.rm = TRUE)),
          stringsAsFactors = FALSE
        ))
        mc_lab <- rbind.data.frame(mc_lab, data.frame(
          CODE_VALUE = jl[!(jl %in% mc_lab$CODE_VALUE)],
          CODE_LABEL = code_label[!(jl %in% mc_lab$CODE_VALUE)],
          resp_vars = rep(rv, sum(!(ml %in% mc_lab$CODE_VALUE), na.rm = TRUE)),
          CODE_CLASS = rep("JUMP",
                           sum(!(ml %in% mc_lab$CODE_VALUE), na.rm = TRUE)),
          AUTO = rep(TRUE,
                     sum(!(ml %in% mc_lab$CODE_VALUE), na.rm = TRUE)),
          stringsAsFactors = FALSE
        ))
      }

      if (nrow(mc_lab)) {
        mc_lab$resp_vars <- rv
      } else {
        mc_lab <- data.frame(
          CODE_VALUE = NA,
          CODE_LABEL = NA,
          resp_vars = rv,
          AUTO = NA,
          stringsAsFactors = FALSE
        )[FALSE, , FALSE]
      }

      mc_lab
    })

    # prepare a generator for globally unused codes

    overall_max_val <- max(overall_max_val,# TODO: Here may be a problem, if we have DATETIME missing codes
                           vapply(mc_labs, function(x) {
                             if (is.data.frame(x) && any(!is.finite(
                               x$CODE_VALUE)))
                               max(x$CODE_VALUE, na.rm = TRUE)
                             else
                               NA_integer_
                             }, FUN.VALUE = numeric(1)), na.rm = TRUE)

    get_code_generator <- function(init = overall_max_val + 1) {
      props <- new.env(parent = emptyenv())
      props$cnt <- force(init)
      function(...) {
        r <- props$cnt
        props$cnt <- props$cnt + 1
        r
      }
    }

    code_generator <- get_code_generator()

    large_list <- do.call(rbind.data.frame, mc_labs)

    if (expand_codes) {
      if (assume_consistent_codes) {
        is_auto_code <- function(x) {
          r <- any(x[["CODE_LABEL"]] %in%
                     paste(c("MISSING", "JUMP"), x[["CODE_VALUE"]]))
          r
        }
      } else {
        is_auto_code <- function(x) {
          r <- any(x[["CODE_LABEL"]] %in%
                     paste(c("MISSING", "JUMP"), x[["resp_vars"]], x[["CODE_VALUE"]]))
          r
        }
      }
      detect_auto_codes <- function(dfr) {
        dfr$AUTO <- apply(dfr, 1, is_auto_code)
        dfr
      }
      if (nrow(large_list) > 0) {
        auto_codes <- detect_auto_codes(large_list)
      } else {
        auto_codes <- large_list
        auto_codes <- cbind(auto_codes, data.frame(AUTO = logical(0)))
      }
      l <- split(auto_codes, list(large_list$CODE_VALUE, large_list$CODE_CLASS))
      expand_code <- lapply(l, function(x) {sum(!x$AUTO) == 1})
      l[unlist(expand_code)] <- lapply(l[unlist(expand_code)], function(dfr) {
        dfr$CODE_LABEL <- dfr$CODE_LABEL[!dfr$AUTO]
        dfr
      })
      if (length(l) > 0) {
        large_list <-
          unsplit(l, list(large_list$CODE_VALUE, large_list$CODE_CLASS))
      }
      large_list$AUTO <- NULL
    }
    # browser()
    duplicated_codes <-
      vapply(
        split(large_list, list(large_list$CODE_VALUE, large_list$CODE_CLASS)),
        function(mc_lab) {
        length(unique(mc_lab$CODE_LABEL)) >= 2
    }, FUN.VALUE = logical(1))

    recode <- unlist(lapply(setNames(nm = names(duplicated_codes[duplicated_codes])),
      function(mc) {
        v <- setNames(nm = unique(
          large_list[large_list$CODE_VALUE == mc, "CODE_LABEL", drop = TRUE]))
        lapply(v, function(lab) {
          list(harmonized_code = code_generator(),
               variables = unique(large_list[large_list$CODE_VALUE == mc &
                                             large_list$CODE_LABEL == lab,
                                             "resp_vars", drop = TRUE]),
               specific_code = mc)
        })
    }), recursive = FALSE) # these are re-coded if not auto-created

    recoded_study_data <- study_data

    for (rc in recode) {
      cn <- util_map_labels(rc$variables,
                            meta_data,
                            from = label_col,
                            to = VAR_NAMES)
      recoded_study_data[, cn] <-
        unlist(lapply(recoded_study_data[, cn], function(v) {
          v[v == rc$specific_code] <- rc$harmonized_code
          v
        }), recursive = FALSE)
      large_list[large_list$CODE_VALUE == rc$specific_code &
                 large_list$resp_vars %in% rc$variables, "CODE_VALUE"] <-
        rc$harmonized_code
    }

    cause_label_df <- large_list[, c("CODE_VALUE", "CODE_LABEL"), drop = FALSE]
    rownames(cause_label_df) <- NULL

    sm_code <- code_generator()
    cause_label_df <- rbind.data.frame(data.frame(
      CODE_VALUE = sm_code,
      CODE_LABEL = .SM_LAB,
      stringsAsFactors = FALSE
    ), cause_label_df)
    for (v in colnames(recoded_study_data)) {
      if (inherits(recoded_study_data[[v]], "POSIXt") ||
          inherits(recoded_study_data[[v]], "POSIXct") ||
          inherits(recoded_study_data[[v]], "POSIXlt") ||
          inherits(recoded_study_data[[v]], "Date")) {
        recoded_study_data[is.na(recoded_study_data[[v]]), v] <-
          as.POSIXct.numeric(sm_code,
                             origin = min(as.POSIXct(Sys.Date()), 0))
      } else {
        recoded_study_data[is.na(recoded_study_data[[v]]), v] <-
          sm_code
      }
    }

    cause_label_df <- cause_label_df[!duplicated(cause_label_df), , FALSE]


  list(ModifiedStudyData = recoded_study_data,
       cause_label_df = cause_label_df)
}

Try the dataquieR package in your browser

Any scripts or data that you put into this service are public.

dataquieR documentation built on May 29, 2024, 7:18 a.m.