R/models.R

Defines functions learn_pca_svd

#' Learn principal components of a dataset with singular value decomposition.
#' 
#' This is a short tidy wrapper for the \code{pcaMethods::pca} function. It removes missing observations, 
#' imputes missing values with the \code{tvsproject::learn_imputation_randomforest} function, applies 
#' mean centering and unit variance scaling, and finally computes the singular value decomposition.
#' 
#' @param input An input dataset of class "data.frame" from base or "tbl_df" from tibble.
#' @param id_variable_names A character vector of variable names containing identifier data that will be ignored.
#'
#' @return A list containing principal components (observation_level), available model parameters and statistics (model_level), and the original result object (model).
#'   
#' @importFrom magrittr "%>%"
#' @export

learn_pca_svd <- function(input, id_variable_names) {
    
    # preprocess missing values
    imputed_input <- input %>%
        # include if at least one non-id variable has an observed value (warning!)
        filter_at(vars(-id_variable_names), any_vars(!is.na(.))) %>%
        learn_imputation_randomforest(id_variable_names = id_variable_names) %>%
        pluck("observation_level")
    
    pca_object <- imputed_input %>%
        select(-id_variable_names) %>%
        pcaMethods::pca(
            nPcs = 2,
            center = TRUE, # preprocessing: substract mean
            scale = "uv", # preprocessing: divide by sd
            method = "svd", # optimization & evaluation: singular value decomposition
            cv = "none" # evaluation: no cross-validation
        )
    
    # tidy results (no broom:: methods available)
    pca_features <- pca_object@scores %>%
        as_tibble(.name_repair = ~ c("pc1", "pc2"))
    
    pca_obs <- imputed_input %>%
        select(id_variable_names) %>%
        bind_cols(pca_features) %>%
        full_join(select(input, id_variable_names), by = id_variable_names)
    
    pca_sd <- pca_object@sDev %>%
        enframe("unit", "sd") %>%
        mutate(unit = str_to_lower(unit)) %>%
        gather("feature", "value", -unit) %>%
        mutate(value = as.character(value))
    
    pca_r2 <- pca_object@R2 %>%
        enframe(name = "unit", value = "r_squared") %>%
        mutate(unit = c("pc1", "pc2")) %>%
        gather("feature", "value", -unit) %>%
        mutate(value = as.character(value))
    
    pca_loadings <- pca_object@loadings %>%
        as_tibble(rownames = "unit") %>%
        rename(pc1_loading = PC1, pc2_loading = PC2) %>%
        gather("feature", "value", -unit) %>%
        mutate(value = as.character(value))
    
    pca_mod <- bind_rows(pca_r2, pca_sd, pca_loadings)
    
    list(
        observation_level = pca_obs,
        model_level = pca_mod,
        model = pca_object
    )
}

#' Learn a uniform manifold approximation and projection of a dataset.
#' 
#' This function is a convenient wrapper for the \code{umap::umap} function. It removes missing
#' observations, imputes missing values with the \code{tvsproject::learn_imputation_randomforest} function,
#' runs the UMAP algorithm for each supplied neighbor count, extracts the results, and returns them in a tidy format.
#' 
#' @param input An input dataset of class "data.frame" from base or "tbl_df" from tibble.
#' @param id_variable_names A character vector of variable names containing identifier data that will be ignored.
#' @param n_neighbors A numeric vector of neighbor counts to use in the UMAP algorithm.
#' 
#' @return A list containing a dataset of two projected features learned by UMAP for each neighbor count (observation_level) and a list of the original result objects. An empty model-level object is given for consistency across the package. 
#' 
#' @importFrom magrittr "%>%"
#' @export

learn_umap <- function(input, id_variable_names, n_neighbors) {
    
    # preprocess missing values
    imputed_input <- input %>%
        # include if at least one non-id variable has an observed value (warning!)
        filter_at(vars(-id_variable_names), any_vars(!is.na(.))) %>%
        learn_imputation_randomforest(id_variable_names = id_variable_names) %>%
        pluck("observation_level")
    
    # umap hyperparameters (umap::umap.defaults)
    #            n_neighbors: n_neighbors
    #           n_components: 2
    #                 metric: euclidean
    #               n_epochs: 200
    #                  input: data
    #                   init: spectral
    #               min_dist: 0.1
    #       set_op_mix_ratio: 1
    #     local_connectivity: 1
    #              bandwidth: 1
    #                  alpha: 1
    #                  gamma: 1
    #   negative_sample_rate: 5
    #                      a: NA
    #                      b: NA
    #                 spread: 1
    #           random_state: NA
    #        transform_state: NA
    #            knn_repeats: 1
    #                verbose: FALSE
    #        umap_learn_args: NA 
    
    learn_one_umap <- function(imputed_input, id_variable_names, n_neighbors) {
        
        custom_settings <- umap::umap.defaults
        custom_settings$n_neighbors <- n_neighbors
        
        umap_object <- imputed_input %>%
            select(-id_variable_names) %>%
            umap::umap(
                config = custom_settings,
                method = "naive" # calls the R implemention instead of python's umap-learn
            )
        
        return(umap_object)
    }
    
    umap_objects <- purrr::map(
        n_neighbors, 
        learn_one_umap, 
        imputed_input = imputed_input, 
        id_variable_names = id_variable_names
    )
    
    get_umap_obs <- function(umap_object) {
        
        umap_features <- umap_object$layout %>%
            as_tibble(.name_repair = ~ c("umap1", "umap2"))
        
        umap_obs <- imputed_input %>%
            select(id_variable_names) %>%
            bind_cols(umap_features) %>%
            full_join(select(input, id_variable_names), by = id_variable_names) %>%
            mutate(n_neighbors = umap_object$config$n_neighbors)
        
        return(umap_obs)
    }
    
    umap_obs_all <- purrr::map_dfr(umap_objects, get_umap_obs)
    
    list(
        observation_level = umap_obs_all,
        model_level = tibble::tibble(.rows = 0),
        model = umap_objects
    )
    
}


#' Learn a cluster hierarchy of a dataset or a pre-computed similarity matrix.
#' 
#' This is a short convenient wrapper for the \code{stats::hclust} function.
#' 
#' @param input An input dataset of class "data.frame" from base or "tbl_df" from tibble.
#' 
#' @return A list containing observation- and model-level results and the model object.

learn_cluster_hierarchy <- function(input) {
    
    # TODO: tidy results, possibly change to hclust
    
    agnes_object <- cluster::agnes(
        input, # data or pre-computed dissimilarity matrix
        diss = inherits(input, "dist"),
        metric = "euclidean" # if input is data, use euclidean distance as similarity
    )
    
    list(
        observation_level = NA,
        model_level = NA,
        model = agnes_object
    )
}

#' Learn a Topological Overlap Matrix of a dataset.
#'
#' This function removes missing observations, applies base 10 logarithm, mean centering, and unit variance scaling, 
#' computes an adjacency matrix using robust biweight midcorrelation raised to a power, and finally computes 
#' the Topological Overlap Matrix on the adjacency matrix.
#'   
#' @param input An input dataset of class "data.frame" from base or "tbl_df" from tibble.
#' @param id_variable_names A character vector of variable names containing identifier data that will be ignored.
#' @param power A numeric vector of powers to apply in the adjacency measure (robust biweight midcorrelation).
#' 
#' @return A list containing a tidy dataset of TOM similarities between variables for each power (model_level; of class "tbl_df") and a list of original objects returned by \code{WGCNA::TOMsimilarity} (model). The observation-level output is an empty character vector for consistency.

learn_tom <- function(input, id_variable_names, power) {
    
    # preprocessing
    preprocessed_input <- input %>%
        # include if at least one non-id variable has an observed value (warning!)
        filter_at(vars(-id_variable_names), any_vars(!is.na(.))) %>%
        # log 10 transformation
        mutate_at(vars(-id_variable_names), log10) %>%
        # mean centering
        mutate_at(vars(-id_variable_names), ~ . - mean(., na.rm = TRUE)) %>%
        # unit variance scaling
        mutate_at(vars(-id_variable_names), ~ . / sd(., na.rm = TRUE))
    
    nonid_input <- select(preprocessed_input, -id_variable_names)
    
    learn_one_tom <- function(input, power) {
        input %>%
            adjacency(
                power = power, 
                type = "unsigned", # |bicor|^power
                corFnc = "bicor", # robust biweight midcorrelation
                corOptions = list(use = "pairwise.complete.obs", robustX = TRUE, maxPOutliers = 1),
            ) %>%
            TOMsimilarity(
                TOMType = "unsigned",
                TOMDenom = "min"
            )
    }
    
    # map vector of powers to the tom function
    tom_objects <- purrr::map(power, learn_one_tom, input = nonid_input)
    
    tidy_tom_object <- function(tom_object, power, input) {
        
        tom_object %>%
            as_tibble(.name_repair = ~ colnames(input)) %>%
            mutate(from = colnames(input)) %>%
            mutate(power = power) %>%
            gather("to", "tom_similarity", -from, -power) %>%
            mutate(unit = str_c("edge_", 1:nrow(.))) %>%
            select(power, unit, from, to, tom_similarity)
    }
    
    # map list of tom objects to the tidying function
    tom_mod <- purrr::map2_dfr(tom_objects, power, tidy_tom_object, input = nonid_input)
    
    list(
        observation_level = NA_character_,
        model_level = tom_mod,
        model = tom_objects
    )
}
eteppo/tvs-project documentation built on Aug. 13, 2019, 8:53 a.m.