R/feature_extraction.R

Defines functions subsample_dimred dimensionality_reduction

Documented in dimensionality_reduction subsample_dimred

#' Dimensionality reduction suite
#'
#' Applies multiple dimensionality reduction techniques to input data.
#'
#' Method options
#' \itemize{
#' \item "none" - Returns original data as is
#' \item "pca" - Principal Component Analysis
#' \item "tsne" - t-distributed stochastic neighbor embedding
#' \item "umap" - Uniform Manifold Approximation and Projection for Dimension Reduction
#' }
#'
#' @param x Data matrix, features on columns and samples on rows.
#' @param dimred_methods Vector of method names, see details for options.
#' @param output_dimensions Vector of dimensionalities to compute using each applicable method.
#' @param pca_dims PCA specific output dimensions.
#' @param umap_dims UMAP specific output dimensions.
#' @param tsne_perplexities Vector of t-SNE perplexity settings to generate embeddings with.
#' @param tsne_pca Whether to apply PCA before t-SNE, which massively boosts performance.
#' @param umap_neighbors UMAP parameter, affects manifold computation.
#' @param initial_dims Number of principal components used in t-SNE and UMAP.
#' @param ... Extra arguments are ignored.
#'
#' @return Returns a \code{list} of embeddings, elements are named based on methods used
#' @export
#' @importFrom FactoMineR PCA
#' @importFrom Rtsne Rtsne
#' @importFrom uwot umap
dimensionality_reduction <- function(
    x,
    dimred_methods = c("pca", "umap"), 
    output_dimensions = NULL, 
    pca_dims = c(2),
    umap_dims = c(2),
    tsne_perplexities = c(45),
    tsne_pca = TRUE, 
    umap_neighbors = 20,
    initial_dims = 50,
    ...
) {
  out <- list()
  if ("none" %in% dimred_methods) {
    out[["none"]] <- x
    dimred_methods <- dimred_methods[dimred_methods != "none"]
  }
  n_samples <- nrow(x)
  
  for (m in dimred_methods) {
    if (m %in% c("pca", "umap")) {
      if (is.null(output_dimensions)) {
        if (m == "pca") {
          dims <- pca_dims
        } else if (m == "umap") {
          dims <- umap_dims
        }
      } else {
        dims <- output_dimensions
      }
    } else if (m == "tsne") {
      # Rtsne throws error if perplexity is too high compared to data size 
      # We can remove the offending values (possibly yielding 0 t-SNE based reductions)
      dims <- tsne_perplexities[3 * tsne_perplexities < nrow(x) - 1]
    } else {
      warning(paste("Unsupported method:", m))
      dims <- c()
    }
    if (m == "pca") pca_temp <- FactoMineR::PCA(
      x,
      scale.unit = FALSE,
      ncp = max(dims),
      graph = FALSE)
    # Handle everything in the same loop
    for (d in dims) {
      if (m == "pca") {
        if (ncol(pca_temp$ind$coord) < d) {
          stop(paste(
            "Number of extracted PCs was lower than defined parameter.", 
            "Please ensure that the maximum number of PCs does not exceed", 
            "the number of non-duplicate data points in each subset."))
        }
        temp <- pca_temp$ind$coord[,1:d]
        colnames(temp) <- paste0("dim", 1:d)
      } else if (m == "tsne") {
        if (3 * d > dim(x)[1] - 1) stop("t-SNE perplexity is too high.")
        if (tsne_pca) {
          tsne_pca_temp <- FactoMineR::PCA(
            x, 
            scale.unit = FALSE, 
            ncp = min(c(initial_dims, dim(x))), 
            graph = FALSE)
          tsne_input <- tsne_pca_temp$ind$coord
        } else {
          tsne_input <- x
        }
        temp <- Rtsne::Rtsne(
          tsne_input,
          dims = 2,
          perplexity = d,
          check_duplicates = FALSE,
          pca = FALSE,
          partial_pca = FALSE,
          verbose = FALSE)$Y
        colnames(temp) <- paste0("dim", 1:2)
      } else if (m == "umap") {
        temp <- uwot::umap(
          x,
          n_neighbors = umap_neighbors,
          n_components = d,
          pca = min(c(initial_dims, dim(x))),
          n_threads = 1, 
          verbose = FALSE,
          init = "normlaplacian")
        colnames(temp) <- paste0("dim", 1:d)
      } else {
        # never run
        temp <- NA
      }
      out[[paste0(m,d)]] <- temp
    }
  }
  return(out)
}

#' Dimensionality reduction on subsampled data sets
#'
#' @param dat_list A list of data.frames or data.tables.
#' @param sub_index A data.frame indicating cv folds and runs such as returned by \code{\link{subsampling}}.
#' @param sub_split_data Can be set to FALSE if \code{dat_list} elements already contain the columns \code{"run"} and \code{"fold"}.
#' @param parallel number of parallel threads
#' @param by column names to split data by such that each split corresponds to one dataset and fold
#' @param ... Extra arguments are passed to \code{\link{dimensionality_reduction}}.
#'
#' @return list of data sets
#' @export
#'
#' @importFrom foreach foreach %dopar%
#' @importFrom data.table data.table
subsample_dimred <- function(
    dat_list, 
    sub_index, 
    sub_split_data = TRUE, 
    parallel = 1, 
    by = c("run", "fold"), 
    ...
) {
  temp_list <- list()
  if (sub_split_data) {
    for (i in 1:length(sub_index)) {
      temp <- sub_index[[i]]
      datname <- names(sub_index)[i]
      if (is.null(datname)) datname <- i
      temp$datname <- datname
      temp <- split_by_safe(temp, by)
      temp <- lapply(
        temp, 
        function(x) as.data.frame(merge(dat_list[[datname]], x, by = "id")))
      temp_list <- c(temp_list, temp)
    }
  } else {
    for (i in 1:length(dat_list)) {
      temp <- split_by_safe(dat_list[[i]], by)
      temp_list <- c(temp_list, temp)
    }
  }
  
  parallel_clust <- setup_parallelization(parallel)
  
  out <- tryCatch(
    foreach(
      i = temp_list, 
      .combine = c#,
      #.export = c("dimensionality_reduction"), #"dat_list"),
      #.packages = c("FactoMineR", "Rtsne", "uwot", "plyr")
      ) %dopar% {
        sel <- grep("^dim[0-9]+$", colnames(i))
        dr_temp <- dimensionality_reduction(i[,sel], ...)
        dr_temp <- lapply(dr_temp, function(x) cbind(i[,-sel], as.data.frame(x)))
        dr_temp
      }, 
    finally = close_parallel_cluster(parallel_clust)
  )
  return(out)
}
vittoriofortino84/COPS documentation built on Jan. 28, 2025, 3:16 p.m.