R/mutaters.R

Defines functions learn_imputation_randomforest remove_outliers_3sd_median recode_with conceal_all

Documented in conceal_all recode_with

#' Impute missing values with a random forest.
#' 
#' This is a short convenient wrapper for the \code{missForest::missForest} function.
#' 
#' @param input An input dataset (of class "data.frame" from base or "tbl_df" from tibble) with missing values.
#' @param id_variable_names A character vector of variable names containing identifier data that will be ignored.
#' 
#' @return A list containing the imputed dataset (observation_level; of class "tbl_df"), out of bag errors (model_level; of class "tbl_df"), and the result object returned by the internal missForest::missForest.
#' 
#' @importFrom magrittr "%>%"
#' @export

learn_imputation_randomforest <- function(input, id_variable_names) {
    
    imputable_variables <- input %>%
        dplyr::select(-id_variable_names) %>%
        as.matrix()
    
    identifier_variables <- dplyr::select(input, id_variable_names)
    
    result <- missForest::missForest(
        xmis = imputable_variables,
        maxiter = 10, 
        ntree = 100, 
        mtry = floor(sqrt(ncol(imputable_variables))),
        variablewise = TRUE,
        replace = TRUE,
        classwt = NULL, 
        cutoff = NULL, 
        strata = NULL,
        sampsize = NULL, 
        nodesize = NULL, 
        maxnodes = NULL,
        parallelize = "no"
    )
    
    # tidy results
    imputed_dataset <- identifier_variables %>%
        dplyr::bind_cols(tibble::as_tibble(result$ximp))
    
    out_of_bag_errors <- tibble::tibble(
        variable = colnames(imputable_variables),
        out_of_bag_error = result$OOBerror
    )
    
    list(
        observation_level = imputed_dataset, 
        model_level = out_of_bag_errors,
        model = result
    )
}


#' Remove values outside 3 standard deviations from the median.
#' 
#' @param x A numeric vector.
#' 
#' @return A numeric vector with outliers mutated to NA_real_
#' 
#' @export

remove_outliers_3sd_median <- function(x) {
    
    x_sd <- sd(x, na.rm = TRUE)
    
    low_limit <- median(x, na.rm = TRUE) - 3*x_sd
    high_limit <- median(x, na.rm = TRUE) + 3*x_sd
    
    x[x > high_limit | x < low_limit] <- NA_real_
    
    return(x)
}


#' Recode values of variables using metadata that maps old values to new values.
#'
#' For iterating through variables and using each variable's metadata to recode the values
#' from codes to descriptive values. Finished if actually needed.
#'
#' @param data The original dataframe to be recoded.
#' @param metadata A dataframe with old and new values.
#' @param code An unquoted name of the metadata variable that contains old values.
#' @param value An unquoted name of the metadata variable that contains new values.
#'
#' @return A dataframe with recoded values.

recode_with <- function(data, metadata, variable_name, code, value) {

    variable_name <- enquo(variable_name)
    value <- enquo(value)
    code <- enquo(code)

    map <- metadata %>%
        pull(!!value) %>%
        as.list() %>%
        set_names(metadata %>% pull(!!code))

}

#' Conceal all values in a dataframe by replacing them with random values.
#'
#' For concealing raw data that shouldn't be shown as it is. Re-use not recommended yet.
#' This is a quick hack that works properly only on certain types of data.
#' In non-id variables, the function preserves only the variable names, range of continuous variables,
#' unique values of discrete variables, and number of rows. Identifiers that are specified
#' in the argument `id` are replaced with random codes. The same id gets the same code so
#' that dependent observations (e.g. time series of people) don't falsely turn into independent
#' observations for a larger population of people.
#'
#' @param dataset A dataframe to be concealed.
#' @param id A character vector of names of the identifier variables.
#'
#' @return A dataframe with the same basic structure (rows, columns, unique values, minimums, maximums) but completely randomized to conceal sensitive data.

conceal_all <- function(dataset, id) {

    rnd_start <- runif(n = 1, min = 1L, max = 230L) %>%
        round(0) %>%
        as.integer()

    rnd_unif <- function(x) {
        length(x) %>%
            runif(min(x, na.rm = TRUE), max(x, na.rm = TRUE)) %>%
            signif(5) %>%
            round(5) %>%
            inset(is.na(x), NA) %>%
            sample()
    }

    rnd_disc <- function(x) {
        unique(x) %>%
            sample(size = length(x), replace = TRUE) %>%
            inset(is.na(x), NA) %>%
            sample()
    }

    rnd_ids <- function(x) {
        x %>%
            # not safe!
            openssl::sha224(key = "3489g348j53g89j") %>%
            str_sub(4, 7)
    }

    dataset_id <- dataset %>%
        select(one_of(id)) %>%
        mutate_all(rnd_ids)

    dataset %>%
        select_at(vars(-one_of(id))) %>%
        mutate_if(is.numeric, rnd_unif) %>%
        mutate_if(negate(is.numeric), rnd_disc) %>%
        bind_cols(dataset_id, .) %>%
        sample_frac()

}

#' Calculate sums of TVS lipids within lipid classes.
#' 
#' @param lipids The TVS lipids data object returned by import_data or import_lipids.
#' @param variables The TVS variables metadata object.
#' 
#' @importFrom magrittr "%>%"
#' @export

classify_lipids <- function(lipids, variables, summary_function = "sum") { 
    
    lipid_class_map <- variables %>%
        filter(class_main == "Lipid") %>%
        select(name_snakecase, class_hierarchy) %>%
        separate(
            col = class_hierarchy, 
            into = str_c("class", as.character(1:4)), 
            sep = " > ",
            fill = "right"
        ) %>%
        gather(level, class, -name_snakecase) %>%
        select(-level) %>%
        filter(!is.na(class)) %>%
        mutate(class = str_to_lower(class)) %>%
        mutate(class = str_replace_all(class, "-", "_")) %>%
        mutate(class = str_replace_all(class, " ", "_")) %>%
        mutate(class = str_replace_all(class, "__", "_"))
    
    # initiate results dataframe
    lipids_classes <- lipids %>%
        select(patient_id, sample_id, sample_type)
    
    # compute sums and bind results iteratively
    for (c in unique(lipid_class_map$class)) {
        
        columns <- lipid_class_map %>%
            filter(class == c) %>%
            pull(name_snakecase)
        
        # summarize over columns
        if (summary_function == "sum") {
            
            new_lipid_class <- lipids %>%
                select(columns) %>%
                transmute(class = rowSums(., na.rm = TRUE)) %>%
                set_colnames(c)
            
        } else {
            
            stop(stringr::str_c("summary measure *", summary_function, "* hasn't been implemented yet"))
            
        }
        
        lipids_classes <- lipids_classes %>%
            bind_cols(new_lipid_class)
    }
    
    lipids_classes <- lipids_classes %>%
        arrange(patient_id, sample_id)
    
    return(lipids_classes)
    
}

#' Classify TVS mRNA probes to genes.
#' 
#' @param lipids The TVS mRNA data object returned by import_data or import_mrna.
#' @param variables The TVS variables metadata object.
#' 
#' @importFrom magrittr "%>%"
#' @export

classify_mrna_probes <- function(mrna_probes, variables, summary_function = "max") { 
    
    if (summary_function != "max") {
        
        stop(stringr::str_c("summary measure *", summary_function, "* hasn't been implemented yet"))
        
    }    
    
    probe_gene_map <- variables %>%
        filter(class_main == "mRNA") %>%
        select(name_snakecase, class_hierarchy) %>%
        separate(
            col = class_hierarchy, 
            into = c("chromosome", "gene"), 
            sep = " > ",
            fill = "right"
        ) %>%
        select(-chromosome) %>%
        distinct() %>%
        mutate(gene = str_to_lower(gene)) %>%
        # just in case
        mutate(gene = str_replace_all(gene, "-", "_")) %>%
        mutate(gene = str_replace_all(gene, " ", "_")) %>%
        mutate(gene = str_replace_all(gene, "__", "_"))
    
    # initiate results dataframe
    mrna_genes <- mrna_probes %>%
        select(patient_id, sample_id, sample_type)
    
    sorted_genes <- probe_gene_map %>%
        pull(gene) %>%
        unique() %>%
        sort()
    
    pb <- progress::progress_bar$new(
        format = "  iterating genes [:bar] :percent eta: :eta",
        total = length(sorted_genes), 
        clear = FALSE, 
        width = 60
    )
    
    # compute sums and bind results iteratively
    for (g in sorted_genes) {
        
        pb$tick()
        
        selected_probe_names <- probe_gene_map %>%
            filter(gene == g) %>%
            pull(name_snakecase)
        
        selected_probes <- mrna_probes %>%
            colnames() %>%
            is_in(selected_probe_names)
        
        # summarize over columns
        new_gene <- mrna_probes %>%
            magrittr::extract(selected_probes) %>%
            mutate(index = dplyr::row_number()) %>%
            gather("probe", "value", -index) %>%
            group_by(index) %>%
            summarise(max_value = max(value, na.rm = TRUE)) %>%
            magrittr::set_colnames(c("index", stringr::str_c(summary_function, "_", g))) %>%
            arrange(index) %>%
            select(-index)
        
        # base::cbind seems to be a bit faster than dplyr::bind_cols
        mrna_genes <- cbind(mrna_genes, new_gene)
    }
    
    mrna_genes <- mrna_genes %>%
        as_tibble() %>%
        arrange(patient_id, sample_id)
    
    return(mrna_genes)
    
}
eteppo/tvs-project documentation built on Aug. 13, 2019, 8:53 a.m.