#' @title standardize_metadata
#'
#' @description A function to standardize metadata by truncating it to a
#' subset of clinically relevant variables, specifying their variable types,
#' and converting missing values into a single format.
#'
#' @details The function runs the following steps in order:
#' \enumerate{
#'   \item Validate that \code{metadata} is a data.frame.
#'   \item Resolve the identifiers: either promote the first column to row
#'   names (and drop that column) or keep the existing row names.
#'   \item Subset the metadata to \code{variable_subset}.
#'   \item Upper-case and trim whitespace in every character column.
#'   \item Set the class of each column to its declared type. Note that
#'   setting the class to \code{"numeric"} coerces the column to numeric.
#'   \item Replace the declared missing values with \code{NA}. Character
#'   missing values are matched both as given and in their upper-cased,
#'   trimmed form, because the data were normalized in step 4.
#'   \item Tag the result with the \code{metaStandard} class.
#' }
#'
#' @param metadata The corresponding metadata for a gene count matrix. Must
#' be a data.frame.
#' @param first_column_as_id Boolean value specifying whether the first
#' column in the metadata is the identifier/key. If not, it is assumed that
#' the row names are.
#' @param variable_subset A character vector of the metadata variables that
#' the user wishes to subset. This should be the most clinically relevant
#' and population relevant variables such as age, sex, and race. Every entry
#' must be a column name of \code{metadata} (case sensitive).
#' @param variable_type_vec A named character vector specifying the type
#' of each variable. There are 3 types: categorical, numeric, and ordinal.
#' The names must equal \code{variable_subset}, in the same order.
#' @param missing_value_lst A named character list specifying the missing
#' value(s), if it exists, in each variable. Names must be among the
#' subsetted variable names. \code{NULL} (the default) leaves the data as is.
#'
#' @return A data.frame object of the cleaned metadata, with classes of each
#' column specifying the variable type and all missing values converted to NA.
#'
#' @examples
#' # Using tcga_metadata from package.
#' library(MetaConIdentifier)
#' tcga_meta_new <- standardize_metadata(tcga_meta_original,
#' first_column_as_id = FALSE, variable_subset = tcga_variable_subset,
#' variable_type_vec = tcga_variable_type_vec, missing_value_lst = NULL)
#'
#' # The clean metadata should contain 2 classes: data.frame and metaStandard.
#' class(tcga_meta_new)
#'
#'
#' @export
#'
standardize_metadata <- function(metadata, first_column_as_id = TRUE,
                                 variable_subset, variable_type_vec,
                                 missing_value_lst = NULL) {
  # START WITH VALIDATION CHECKS.
  # 1) Start by checking whether metadata is a data.frame object.
  if (!is.data.frame(metadata)) {
    stop("Metadata is not a data.frame object. Cannot proceed further.")
  }
  # ==========================================================================
  # 2) Verify if first column acts as the identifier/key.
  if (first_column_as_id) {
    message("Using first column as the identifiers/keys.")
    ids <- metadata[[1]]
    if (anyDuplicated(ids) > 0) {
      stop("First column contains duplicate IDs. Cannot proceed further.")
    }
    # Promote the IDs to row names and drop the ID column. drop = FALSE keeps
    # a data.frame even when only one column remains.
    rownames(metadata) <- ids
    metadata <- metadata[, -1, drop = FALSE]
  } else {
    message("Using row names as the identifiers/keys.")
    if (anyDuplicated(rownames(metadata)) > 0) {
      stop("Row names contains duplicate IDs. Cannot proceed further.")
    }
  }
  # ==========================================================================
  # 3) Verify vector of subsetted variable names.
  if (!is.vector(variable_subset)) {
    stop(paste0(
      "A subset of variable names was not chosen in vector format. Please\n",
      "choose a subset of clinically relevant and population type variables\n",
      "such as age, sex, and race."
    ))
  }
  if (!all(variable_subset %in% colnames(metadata))) {
    stop(paste0(
      "At least one variable in the subset does not exist as part of\n",
      "the metadata. This check is case sensitive."
    ))
  }
  # drop = FALSE so that a single-variable subset stays a data.frame.
  sub_metadata <- metadata[, variable_subset, drop = FALSE]
  # ==========================================================================
  # 4) Normalize variables by uppercasing all character columns + removing
  # any leading or trailing whitespaces.
  is_chr <- vapply(sub_metadata, is.character, logical(1))
  if (any(is_chr)) {
    # lapply keeps one vector per column regardless of how many rows or
    # character columns there are (sapply would change shape).
    sub_metadata[is_chr] <- lapply(
      sub_metadata[is_chr],
      function(column) trimws(toupper(column))
    )
  }
  # IMPORTANT: Some manual normalization by the user may be required (e.g.
  # converting temperature to Celsius or Fahrenheit for consistency).
  # ==========================================================================
  # 5) Identify each variable in the metadata as categorical, numeric,
  # or ordinal.
  type_names <- names(variable_type_vec)
  if (!is.vector(variable_type_vec) || is.null(type_names)) {
    stop("variable_type_vec is not a named vector.")
  }
  if (length(variable_type_vec) != ncol(sub_metadata) ||
        any(type_names != colnames(sub_metadata))) {
    stop("variable_type_vec names do not match the subsetted variable names.")
  }
  types <- tolower(unname(variable_type_vec))
  # %in% never returns NA, so an NA type is rejected here as well.
  if (!all(types %in% c("categorical", "numeric", "ordinal"))) {
    stop(paste0(
      "variable_type_vec values contain types other than categorical,\n",
      "numeric, and ordinal."
    ))
  }
  for (i in seq_along(types)) {
    # "numeric" coerces the column; the other two types become S3 class tags.
    class(sub_metadata[[type_names[i]]]) <- types[[i]]
  }
  # ==========================================================================
  # 6) Convert all missing values to a standardized format: NA value.
  # The missing value will depend on each column in the metadata.
  # When missing_value_lst is NULL only existing NA's will be evaluated.
  if (!is.null(missing_value_lst)) {
    missing_names <- names(missing_value_lst)
    if (is.null(missing_names)) {
      stop("missing_value_lst is not a named list.")
    }
    # Validate against the subsetted columns, which is what gets indexed.
    if (!all(missing_names %in% colnames(sub_metadata))) {
      stop(paste0(
        "At least 1 missing_value_lst name is not in the subsetted\n",
        "variable names."
      ))
    }
    for (i in seq_along(missing_value_lst)) {
      column <- missing_names[i]
      markers <- missing_value_lst[[i]]
      if (is.character(markers)) {
        # Character data were upper-cased and trimmed in step 4, so also
        # match the markers in that normalized form.
        markers <- union(markers, trimws(toupper(markers)))
      }
      is_missing <- sub_metadata[[column]] %in% markers
      sub_metadata[[column]][is_missing] <- NA
    }
  }
  # ==========================================================================
  # 7) Give the clean metadata the metaStandard class so that run_ca() can
  # identify metadata that have undergone standardization.
  class(sub_metadata) <- c("data.frame", "metaStandard")
  sub_metadata
}
# [END]
# Add the following code to your website.
# For more information on customizing the embed code, read Embedding Snippets.