MetaConIdentifier: Metadata Confounding Identifier

Documented in standardize_metadata

#' @title standardize_metadata
#'
#' @description A function to standardize metadata by truncating it to a
#' subset of clinically relevant variables, specifying their variable types,
#' and converting missing values into a single format.
#'
#' @details The steps are applied in this order: (1) validate the input,
#' (2) resolve the sample identifiers, (3) subset the requested variables,
#' (4) upper-case and trim every character column, (5) tag each column with
#' its variable type by setting the column's class, (6) replace the declared
#' missing values with NA, and (7) tag the result with the metaStandard class.
#'
#' @param metadata The corresponding metadata for a gene count matrix. Must
#' be a data.frame.
#' @param first_column_as_id Boolean value specifying whether the first
#' column in the metadata is the identifier/key. If TRUE, the first column
#' must contain unique values; it is moved into the row names and removed
#' from the columns. If FALSE, it is assumed that the row names are the
#' identifiers.
#' @param variable_subset A character vector of the metadata variables that
#' the user wishes to subset. This should be the most clinically relevant
#' and population relevant variables such as age, sex, and race. Names are
#' matched case-sensitively against the column names of the metadata.
#' @param variable_type_vec A named character vector specifying the type
#' of each variable. There are 3 types: categorical, numeric, and ordinal
#' (case-insensitive). Names must equal the subsetted variable names, in the
#' same order.
#' @param missing_value_lst A named list specifying the missing value(s),
#' if any exist, in each variable. Names must be subsetted variable names.
#' Because character columns are upper-cased and trimmed before matching,
#' character missing values are matched both as given and in their
#' upper-cased, trimmed form. Use NULL (the default) when only existing NA
#' values should count as missing.
#'
#' @return A data.frame object of the cleaned metadata, with classes of each
#' column specifying the variable type and all missing values converted to NA.
#' The returned object has the classes data.frame and metaStandard.
#'
#' @examples
#' # Using the TCGA metadata shipped with the package.
#' library(MetaConIdentifier)
#' tcga_meta_new <- standardize_metadata(tcga_meta_original,
#' first_column_as_id = FALSE, variable_subset = tcga_variable_subset,
#' variable_type_vec = tcga_variable_type_vec, missing_value_lst = NULL)
#'
#' # The clean metadata should contain 2 classes: data.frame and metaStandard.
#' class(tcga_meta_new)
#'
#'
#' @export
#'
standardize_metadata <- function(metadata, first_column_as_id = TRUE,
                                 variable_subset, variable_type_vec,
                                 missing_value_lst = NULL){

  # START WITH VALIDATION CHECKS.

  # 1) Start by checking whether metadata is a data.frame object.
  if (!is.data.frame(metadata)){
    stop("Metadata is not a data.frame object. Cannot proceed further.")
  }

  # ============================================================================

  # 2) Verify if first column acts as the identifier/key.
  if (first_column_as_id){
    message("Using first column as the identifiers/keys.")

    ids <- metadata[[1]]
    if (anyDuplicated(ids) > 0){
      stop("First column contains duplicate IDs. Cannot proceed further.")
    }

    # Promote the IDs to row names and remove the ID column. drop = FALSE
    # keeps a data.frame even when only one other column remains.
    rownames(metadata) <- ids
    metadata <- metadata[ , -1, drop = FALSE]
  } else{
    message("Using row names as the identifiers/keys.")

    if (anyDuplicated(rownames(metadata)) > 0){
      stop("Row names contains duplicate IDs. Cannot proceed further.")
    }
  }

  # ============================================================================

  # 3) Verify vector of subsetted variable names.
  if (!is.vector(variable_subset)){
    stop("A subset of variable names was not chosen in vector format. Please
         choose a subset of clinically relevant and population type variables
         such as age, sex, and race.")
  } else if (!all(variable_subset %in% colnames(metadata))){
    stop("At least one variable in the subset does not exist as part of
         the metadata. This check is case sensitive.")
  }

  # drop = FALSE so that a single-variable subset stays a data.frame.
  sub_metadata <- metadata[ , variable_subset, drop = FALSE]

  # ============================================================================

  # 4) Normalize variables by uppercasing all character columns + removing
  # any leading or trailing whitespaces.
  # lapply over the selected columns keeps one vector per column regardless
  # of how many rows or character columns there are.
  is_chr <- vapply(sub_metadata, is.character, logical(1))
  if (any(is_chr)){
    sub_metadata[is_chr] <- lapply(
      sub_metadata[is_chr],
      function(column) trimws(toupper(column))
    )
  }

  # IMPORTANT: Some manual normalization by the user may be required (e.g.
  # converting temperature to Celsius or Fahrenheit for consistency).

  # ============================================================================

  # 5) Identify each variable in the metadata as categorical, numeric,
  # or ordinal.
  if (!is.vector(variable_type_vec) || is.null(names(variable_type_vec))){
    stop("variable_type_vec is not a named vector.")
  } else if ((length(variable_type_vec) != ncol(sub_metadata)) ||
             any(names(variable_type_vec) != colnames(sub_metadata))){
    stop("variable_type_vec names do not match the subsetted variable names.")
  } else if (!all(tolower(unique(variable_type_vec)) %in%
             c("categorical", "numeric", "ordinal"))){
    stop("variable_type_vec values contain types other than categorical,
         numeric, and ordinal.")
  } else{
    var_names <- names(variable_type_vec)

    # The variable type is recorded as the class of each column.
    for (i in seq_along(variable_type_vec)){
      class(sub_metadata[[var_names[i]]]) <- tolower(variable_type_vec[[i]])
    }
  }

  # ============================================================================

  # 6) Convert all missing values to a standardized format: NA value.
  # The missing value will depend on each column in the metadata.
  if (is.null(missing_value_lst)){
    # Only existing NA's will be evaluated.
    invisible()
  } else if (is.null(names(missing_value_lst))){
    # Names are required whether a list or an atomic vector was supplied.
    stop("missing_value_lst is not a named list.")
  } else if (!all(names(missing_value_lst) %in% colnames(sub_metadata))){
    # Checked against the subset (not the full metadata), since only the
    # subsetted columns exist from here on.
    stop("At least 1 missing_value_lst name is not in the subsetted
         variable names.")
  } else{
    col_names <- names(missing_value_lst)

    for (i in seq_along(missing_value_lst)){
      column <- sub_metadata[[col_names[i]]]
      targets <- missing_value_lst[[i]]

      # Character columns were upper-cased and trimmed in step 4, so also
      # match the normalized form of each declared missing value.
      if (is.character(column) && is.character(targets)){
        targets <- union(targets, trimws(toupper(targets)))
      }

      column[column %in% targets] <- NA
      sub_metadata[[col_names[i]]] <- column
    }
  }

  # ============================================================================

  # 7) Give the clean metadata the metaStandard class so that run_ca() can
  # identify metadata that have undergone standardization.
  class(sub_metadata) <- c("data.frame", "metaStandard")

  sub_metadata
}

# [END]