#' Learn principal components of a dataset with singular value decomposition.
#'
#' This is a short tidy wrapper for the \code{pcaMethods::pca} function. It removes missing observations,
#' imputes missing values with the \code{tvsproject::learn_imputation_randomforest} function, applies
#' mean centering and unit variance scaling, and finally computes the singular value decomposition.
#'
#' @param input An input dataset of class "data.frame" from base or "tbl_df" from tibble.
#' @param id_variable_names A character vector of variable names containing identifier data that will be ignored.
#'
#' @return A list containing principal components (observation_level), available model parameters and statistics (model_level), and the original result object (model).
#'
#' @importFrom magrittr "%>%"
#' @export
learn_pca_svd <- function(input, id_variable_names) {
# preprocess missing values
imputed_input <- input %>%
# include if at least one non-id variable has an observed value (warning!)
filter_at(vars(-id_variable_names), any_vars(!is.na(.))) %>%
learn_imputation_randomforest(id_variable_names = id_variable_names) %>%
pluck("observation_level")
pca_object <- imputed_input %>%
select(-id_variable_names) %>%
pcaMethods::pca(
nPcs = 2,
center = TRUE, # preprocessing: substract mean
scale = "uv", # preprocessing: divide by sd
method = "svd", # optimization & evaluation: singular value decomposition
cv = "none" # evaluation: no cross-validation
)
# tidy results (no broom:: methods available)
pca_features <- pca_object@scores %>%
as_tibble(.name_repair = ~ c("pc1", "pc2"))
pca_obs <- imputed_input %>%
select(id_variable_names) %>%
bind_cols(pca_features) %>%
full_join(select(input, id_variable_names), by = id_variable_names)
pca_sd <- pca_object@sDev %>%
enframe("unit", "sd") %>%
mutate(unit = str_to_lower(unit)) %>%
gather("feature", "value", -unit) %>%
mutate(value = as.character(value))
pca_r2 <- pca_object@R2 %>%
enframe(name = "unit", value = "r_squared") %>%
mutate(unit = c("pc1", "pc2")) %>%
gather("feature", "value", -unit) %>%
mutate(value = as.character(value))
pca_loadings <- pca_object@loadings %>%
as_tibble(rownames = "unit") %>%
rename(pc1_loading = PC1, pc2_loading = PC2) %>%
gather("feature", "value", -unit) %>%
mutate(value = as.character(value))
pca_mod <- bind_rows(pca_r2, pca_sd, pca_loadings)
list(
observation_level = pca_obs,
model_level = pca_mod,
model = pca_object
)
}
#' Learn a uniform manifold approximation and projection of a dataset.
#'
#' This function is a convenient wrapper for the \code{umap::umap} function. It removes missing
#' observations, imputes missing values with the \code{tvsproject::learn_imputation_randomforest} function,
#' runs the UMAP algorithm for each supplied neighbor count, extracts the results, and returns them in a tidy format.
#'
#' @param input An input dataset of class "data.frame" from base or "tbl_df" from tibble.
#' @param id_variable_names A character vector of variable names containing identifier data that will be ignored.
#' @param n_neighbors A numeric vector of neighbor counts to use in the UMAP algorithm.
#'
#' @return A list containing a dataset of two projected features learned by UMAP for each neighbor count (observation_level) and a list of the original result objects. An empty model-level object is given for consistency across the package.
#'
#' @importFrom magrittr "%>%"
#' @export
learn_umap <- function(input, id_variable_names, n_neighbors) {
# preprocess missing values
imputed_input <- input %>%
# include if at least one non-id variable has an observed value (warning!)
filter_at(vars(-id_variable_names), any_vars(!is.na(.))) %>%
learn_imputation_randomforest(id_variable_names = id_variable_names) %>%
pluck("observation_level")
# umap hyperparameters (umap::umap.defaults)
# n_neighbors: n_neighbors
# n_components: 2
# metric: euclidean
# n_epochs: 200
# input: data
# init: spectral
# min_dist: 0.1
# set_op_mix_ratio: 1
# local_connectivity: 1
# bandwidth: 1
# alpha: 1
# gamma: 1
# negative_sample_rate: 5
# a: NA
# b: NA
# spread: 1
# random_state: NA
# transform_state: NA
# knn_repeats: 1
# verbose: FALSE
# umap_learn_args: NA
learn_one_umap <- function(imputed_input, id_variable_names, n_neighbors) {
custom_settings <- umap::umap.defaults
custom_settings$n_neighbors <- n_neighbors
umap_object <- imputed_input %>%
select(-id_variable_names) %>%
umap::umap(
config = custom_settings,
method = "naive" # calls the R implemention instead of python's umap-learn
)
return(umap_object)
}
umap_objects <- purrr::map(
n_neighbors,
learn_one_umap,
imputed_input = imputed_input,
id_variable_names = id_variable_names
)
get_umap_obs <- function(umap_object) {
umap_features <- umap_object$layout %>%
as_tibble(.name_repair = ~ c("umap1", "umap2"))
umap_obs <- imputed_input %>%
select(id_variable_names) %>%
bind_cols(umap_features) %>%
full_join(select(input, id_variable_names), by = id_variable_names) %>%
mutate(n_neighbors = umap_object$config$n_neighbors)
return(umap_obs)
}
umap_obs_all <- purrr::map_dfr(umap_objects, get_umap_obs)
list(
observation_level = umap_obs_all,
model_level = tibble::tibble(.rows = 0),
model = umap_objects
)
}
#' Learn a cluster hierarchy of a dataset or a pre-computed similarity matrix.
#'
#' This is a short convenient wrapper for the \code{stats::hclust} function.
#'
#' @param input An input dataset of class "data.frame" from base or "tbl_df" from tibble.
#'
#' @return A list containing observation- and model-level results and the model object.
learn_cluster_hierarchy <- function(input) {
# TODO: tidy results, possibly change to hclust
agnes_object <- cluster::agnes(
input, # data or pre-computed dissimilarity matrix
diss = inherits(input, "dist"),
metric = "euclidean" # if input is data, use euclidean distance as similarity
)
list(
observation_level = NA,
model_level = NA,
model = agnes_object
)
}
#' Learn a Topological Overlap Matrix of a dataset.
#'
#' This function removes missing observations, applies base 10 logarithm, mean centering, and unit variance scaling,
#' computes an adjacency matrix using robust biweight midcorrelation raised to a power, and finally computes
#' the Topological Overlap Matrix on the adjacency matrix.
#'
#' @param input An input dataset of class "data.frame" from base or "tbl_df" from tibble.
#' @param id_variable_names A character vector of variable names containing identifier data that will be ignored.
#' @param power A numeric vector of powers to apply in the adjacency measure (robust biweight midcorrelation).
#'
#' @return A list containing a tidy dataset of TOM similarities between variables for each power (model_level; of class "tbl_df") and a list of original objects returned by \code{WGCNA::TOMsimilarity} (model). The observation-level output is an empty character vector for consistency.
learn_tom <- function(input, id_variable_names, power) {
# preprocessing
preprocessed_input <- input %>%
# include if at least one non-id variable has an observed value (warning!)
filter_at(vars(-id_variable_names), any_vars(!is.na(.))) %>%
# log 10 transformation
mutate_at(vars(-id_variable_names), log10) %>%
# mean centering
mutate_at(vars(-id_variable_names), ~ . - mean(., na.rm = TRUE)) %>%
# unit variance scaling
mutate_at(vars(-id_variable_names), ~ . / sd(., na.rm = TRUE))
nonid_input <- select(preprocessed_input, -id_variable_names)
learn_one_tom <- function(input, power) {
input %>%
adjacency(
power = power,
type = "unsigned", # |bicor|^power
corFnc = "bicor", # robust biweight midcorrelation
corOptions = list(use = "pairwise.complete.obs", robustX = TRUE, maxPOutliers = 1),
) %>%
TOMsimilarity(
TOMType = "unsigned",
TOMDenom = "min"
)
}
# map vector of powers to the tom function
tom_objects <- purrr::map(power, learn_one_tom, input = nonid_input)
tidy_tom_object <- function(tom_object, power, input) {
tom_object %>%
as_tibble(.name_repair = ~ colnames(input)) %>%
mutate(from = colnames(input)) %>%
mutate(power = power) %>%
gather("to", "tom_similarity", -from, -power) %>%
mutate(unit = str_c("edge_", 1:nrow(.))) %>%
select(power, unit, from, to, tom_similarity)
}
# map list of tom objects to the tidying function
tom_mod <- purrr::map2_dfr(tom_objects, power, tidy_tom_object, input = nonid_input)
list(
observation_level = NA_character_,
model_level = tom_mod,
model = tom_objects
)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.