#' Impute missing values with a random forest.
#'
#' This is a short convenient wrapper for the \code{missForest::missForest} function.
#'
#' @param input An input dataset (of class "data.frame" from base or "tbl_df" from tibble) with missing values.
#' @param id_variable_names A character vector of variable names containing identifier data that will be ignored.
#'
#' @return A list containing the imputed dataset (observation_level; of class "tbl_df"), out of bag errors (model_level; of class "tbl_df"), and the result object returned by the internal missForest::missForest.
#'
#' @importFrom magrittr "%>%"
#' @export
learn_imputation_randomforest <- function(input, id_variable_names) {
imputable_variables <- input %>%
dplyr::select(-id_variable_names) %>%
as.matrix()
identifier_variables <- dplyr::select(input, id_variable_names)
result <- missForest::missForest(
xmis = imputable_variables,
maxiter = 10,
ntree = 100,
mtry = floor(sqrt(ncol(imputable_variables))),
variablewise = TRUE,
replace = TRUE,
classwt = NULL,
cutoff = NULL,
strata = NULL,
sampsize = NULL,
nodesize = NULL,
maxnodes = NULL,
parallelize = "no"
)
# tidy results
imputed_dataset <- identifier_variables %>%
dplyr::bind_cols(tibble::as_tibble(result$ximp))
out_of_bag_errors <- tibble::tibble(
variable = colnames(imputable_variables),
out_of_bag_error = result$OOBerror
)
list(
observation_level = imputed_dataset,
model_level = out_of_bag_errors,
model = result
)
}
#' Remove values outside 3 standard deviations from the median.
#'
#' @param x A numeric vector.
#'
#' @return A numeric vector with outliers mutated to NA_real_
#'
#' @export
remove_outliers_3sd_median <- function(x) {
x_sd <- sd(x, na.rm = TRUE)
low_limit <- median(x, na.rm = TRUE) - 3*x_sd
high_limit <- median(x, na.rm = TRUE) + 3*x_sd
x[x > high_limit | x < low_limit] <- NA_real_
return(x)
}
#' Recode values of variables using metadata that maps old values to new values.
#'
#' For iterating through variables and using each variable's metadata to recode the values
#' from codes to descriptive values. Finished if actually needed.
#'
#' @param data The original dataframe to be recoded.
#' @param metadata A dataframe with old and new values.
#' @param code An unquoted name of the metadata variable that contains old values.
#' @param value An unquoted name of the metadata variable that contains new values.
#'
#' @return A dataframe with recoded values.
recode_with <- function(data, metadata, variable_name, code, value) {
variable_name <- enquo(variable_name)
value <- enquo(value)
code <- enquo(code)
map <- metadata %>%
pull(!!value) %>%
as.list() %>%
set_names(metadata %>% pull(!!code))
}
#' Conceal all values in a dataframe by replacing them with random values.
#'
#' For concealing raw data that shouldn't be shown as it is. Re-use not recommended yet.
#' This is a quick hack that works properly only on certain types of data.
#' In non-id variables, the function preserves only the variable names, range of continuous variables,
#' unique values of discrete variables, and number of rows. Identifiers that are specified
#' in the argument `id` are replaced with random codes. The same id gets the same code so
#' that dependent observations (e.g. time series of people) don't falsely turn into independent
#' observations for a larger population of people.
#'
#' @param dataset A dataframe to be concealed.
#' @param id A character vector of names of the identifier variables.
#'
#' @return A dataframe with the same basic structure (rows, columns, unique values, minimums, maximums) but completely randomized to conceal sensitive data.
conceal_all <- function(dataset, id) {
rnd_start <- runif(n = 1, min = 1L, max = 230L) %>%
round(0) %>%
as.integer()
rnd_unif <- function(x) {
length(x) %>%
runif(min(x, na.rm = TRUE), max(x, na.rm = TRUE)) %>%
signif(5) %>%
round(5) %>%
inset(is.na(x), NA) %>%
sample()
}
rnd_disc <- function(x) {
unique(x) %>%
sample(size = length(x), replace = TRUE) %>%
inset(is.na(x), NA) %>%
sample()
}
rnd_ids <- function(x) {
x %>%
# not safe!
openssl::sha224(key = "3489g348j53g89j") %>%
str_sub(4, 7)
}
dataset_id <- dataset %>%
select(one_of(id)) %>%
mutate_all(rnd_ids)
dataset %>%
select_at(vars(-one_of(id))) %>%
mutate_if(is.numeric, rnd_unif) %>%
mutate_if(negate(is.numeric), rnd_disc) %>%
bind_cols(dataset_id, .) %>%
sample_frac()
}
#' Calculate sums of TVS lipids within lipid classes.
#'
#' @param lipids The TVS lipids data object returned by import_data or import_lipids.
#' @param variables The TVS variables metadata object.
#'
#' @importFrom magrittr "%>%"
#' @export
classify_lipids <- function(lipids, variables, summary_function = "sum") {
lipid_class_map <- variables %>%
filter(class_main == "Lipid") %>%
select(name_snakecase, class_hierarchy) %>%
separate(
col = class_hierarchy,
into = str_c("class", as.character(1:4)),
sep = " > ",
fill = "right"
) %>%
gather(level, class, -name_snakecase) %>%
select(-level) %>%
filter(!is.na(class)) %>%
mutate(class = str_to_lower(class)) %>%
mutate(class = str_replace_all(class, "-", "_")) %>%
mutate(class = str_replace_all(class, " ", "_")) %>%
mutate(class = str_replace_all(class, "__", "_"))
# initiate results dataframe
lipids_classes <- lipids %>%
select(patient_id, sample_id, sample_type)
# compute sums and bind results iteratively
for (c in unique(lipid_class_map$class)) {
columns <- lipid_class_map %>%
filter(class == c) %>%
pull(name_snakecase)
# summarize over columns
if (summary_function == "sum") {
new_lipid_class <- lipids %>%
select(columns) %>%
transmute(class = rowSums(., na.rm = TRUE)) %>%
set_colnames(c)
} else {
stop(stringr::str_c("summary measure *", summary_function, "* hasn't been implemented yet"))
}
lipids_classes <- lipids_classes %>%
bind_cols(new_lipid_class)
}
lipids_classes <- lipids_classes %>%
arrange(patient_id, sample_id)
return(lipids_classes)
}
#' Classify TVS mRNA probes to genes.
#'
#' @param lipids The TVS mRNA data object returned by import_data or import_mrna.
#' @param variables The TVS variables metadata object.
#'
#' @importFrom magrittr "%>%"
#' @export
classify_mrna_probes <- function(mrna_probes, variables, summary_function = "max") {
if (summary_function != "max") {
stop(stringr::str_c("summary measure *", summary_function, "* hasn't been implemented yet"))
}
probe_gene_map <- variables %>%
filter(class_main == "mRNA") %>%
select(name_snakecase, class_hierarchy) %>%
separate(
col = class_hierarchy,
into = c("chromosome", "gene"),
sep = " > ",
fill = "right"
) %>%
select(-chromosome) %>%
distinct() %>%
mutate(gene = str_to_lower(gene)) %>%
# just in case
mutate(gene = str_replace_all(gene, "-", "_")) %>%
mutate(gene = str_replace_all(gene, " ", "_")) %>%
mutate(gene = str_replace_all(gene, "__", "_"))
# initiate results dataframe
mrna_genes <- mrna_probes %>%
select(patient_id, sample_id, sample_type)
sorted_genes <- probe_gene_map %>%
pull(gene) %>%
unique() %>%
sort()
pb <- progress::progress_bar$new(
format = " iterating genes [:bar] :percent eta: :eta",
total = length(sorted_genes),
clear = FALSE,
width = 60
)
# compute sums and bind results iteratively
for (g in sorted_genes) {
pb$tick()
selected_probe_names <- probe_gene_map %>%
filter(gene == g) %>%
pull(name_snakecase)
selected_probes <- mrna_probes %>%
colnames() %>%
is_in(selected_probe_names)
# summarize over columns
new_gene <- mrna_probes %>%
magrittr::extract(selected_probes) %>%
mutate(index = dplyr::row_number()) %>%
gather("probe", "value", -index) %>%
group_by(index) %>%
summarise(max_value = max(value, na.rm = TRUE)) %>%
magrittr::set_colnames(c("index", stringr::str_c(summary_function, "_", g))) %>%
arrange(index) %>%
select(-index)
# base::cbind seems to be a bit faster than dplyr::bind_cols
mrna_genes <- cbind(mrna_genes, new_gene)
}
mrna_genes <- mrna_genes %>%
as_tibble() %>%
arrange(patient_id, sample_id)
return(mrna_genes)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.