# Spectre: High-dimensional cytometry and imaging analysis.

# Documented in run.umap

#' Run the UMAP algorithm (using umap::umap())
#'
#' Method to run a UMAP dimensionality reduction algorithm.
#' A UMAP (uniform manifold approximation and projection) plot is a useful means to visualise data.
#' As it is a dimensionality reduction algorithm, some data will be lost.
#' It is good practice to validate any populations (namely through manual gating).
#' For more information on parameter choices, see ?umap::umap.defaults.
#' Uses the R package "umap" to calculate plots and "data.table" to handle data.
#'
#' @param dat NO DEFAULT. Input data.table or data.frame.
#' @param use.cols NO DEFAULT. Vector of column names or numbers to use for the UMAP calculation.
#' @param umap.x.name DEFAULT = "UMAP_X". Character. Name of UMAP x-axis.
#' @param umap.y.name DEFAULT = "UMAP_Y". Character. Name of UMAP y-axis.
#' @param umap.seed DEFAULT = 42. Numeric. Seed value for reproducibility.
#' @param neighbours DEFAULT = 15. Numeric. Number of nearest neighbours.
#' @param n_components DEFAULT = 2. Numeric. Number of dimensions for output results.
#' @param metric DEFAULT = "euclidean". Character or function. Determines how distances between data points are computed. Can also be "manhattan".
#' @param n_epochs DEFAULT = 200. Numeric. Number of iterations performed during layout optimisation.
#' @param input DEFAULT = "data". Character. Determines whether primary input argument is a data or distance matrix. Can also be "dist".
#' @param init DEFAULT = "spectral". Character or matrix. Default "spectral" computes an initial embedding using eigenvectors of the connectivity graph matrix. Can also use "random" (creates an initial layout based on random coordinates).
#' @param min_dist DEFAULT = 0.1. Numeric. Determines how close points appear in final layout.
#' @param set_op_mix_ratio DEFAULT = 1. Numeric in range [0,1]. Determines how the knn-graph is used to create a fuzzy simplicial graph.
#' @param local_connectivity DEFAULT = 1. Numeric. Used during construction of fuzzy simplicial set.
#' @param bandwidth DEFAULT = 1. Numeric. Used during construction of fuzzy simplicial set.
#' @param alpha DEFAULT = 1. Numeric. Initial value of "learning rate" of layout optimisation.
#' @param gamma DEFAULT = 1. Numeric. Together with alpha, it determines the learning rate of layout optimisation.
#' @param negative_sample_rate DEFAULT = 5. Numeric. Determines how many non-neighbour points are used per point and per iteration during layout optimisation.
#' @param a_gradient DEFAULT = NA. Numeric. Contributes to gradient calculations during layout optimisation. When left at NA, a suitable value will be estimated automatically.
#' @param b_gradient DEFAULT = NA. Numeric. Contributes to gradient calculations during layout optimisation. When left at NA, a suitable value will be estimated automatically.
#' @param spread DEFAULT = 1. Numeric. Used during automatic estimation of a_gradient/b_gradient parameters.
#' @param transform_state DEFAULT = 42. Numeric. Seed for random number generation used during predict().
#' @param knn.repeats DEFAULT = 1. Numeric. Number of times to restart knn search.
#' @param verbose DEFAULT = TRUE. Logical. Determines whether to show progress messages.
#' @param umap_learn_args DEFAULT = NA. Vector. Vector of arguments to python package umap-learn.
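#'
#' @return A data.table containing the columns of \code{dat} with the UMAP embedding columns appended (the first two named by \code{umap.x.name} and \code{umap.y.name}).
#'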
#' @usage run.umap(dat, use.cols, umap.x.name = "UMAP_X", umap.y.name = "UMAP_Y", umap.seed = 42, neighbours = 15, n_components = 2, metric = "euclidean", n_epochs = 200, input = "data", init = "spectral", min_dist = 0.1, set_op_mix_ratio = 1, local_connectivity = 1, bandwidth = 1, alpha = 1, gamma = 1, negative_sample_rate = 5, a_gradient = NA, b_gradient = NA, spread = 1, transform_state = 42, knn.repeats = 1, verbose = TRUE, umap_learn_args = NA)
#'
#' @examples
#' # Run UMAP on a subset of the demonstration dataset
#'
#' cell.dat <- do.subsample(Spectre::demo.asinh, 10000) # Subsample the demo dataset to 10000 cells
#' cell.dat <- Spectre::run.umap(dat = cell.dat,
#'                               use.cols = names(demo.asinh)[c(2:10)])
#' @author
#' Thomas Ashhurst, \email{thomas.ashhurst@@sydney.edu.au}
#' Felix Marsh-Wakefield, \email{felix.marsh-wakefield@@sydney.edu.au}
#' @export

run.umap <- function(dat,
                     use.cols,
                     umap.x.name = "UMAP_X",
                     umap.y.name = "UMAP_Y",
                     umap.seed = 42,
                     neighbours = 15,
                     n_components = 2,
                     metric = "euclidean",
                     n_epochs = 200,
                     input = "data",
                     init = "spectral",
                     min_dist = 0.1,
                     set_op_mix_ratio = 1,
                     local_connectivity = 1,
                     bandwidth = 1,
                     alpha = 1,
                     gamma = 1,
                     negative_sample_rate = 5,
                     a_gradient = NA,
                     b_gradient = NA,
                     spread = 1,
                     transform_state = 42,
                     knn.repeats = 1,
                     verbose = TRUE,
                     umap_learn_args = NA
                     )
{
  ## Overview:
  ##   1. build a umap configuration from the arguments,
  ##   2. run umap::umap() on the columns of 'dat' selected by 'use.cols',
  ##   3. return 'dat' (as a data.table) with the embedding columns appended.

  ### Test data
      # dat <- iris
      # umap.seed <- 42
      # use.cols <- c(1:4)

  ## Check that necessary packages are installed.
  ## requireNamespace() loads a package without attaching it, so the caller's
  ## search path is left untouched; all package functions below are called
  ## with an explicit 'pkg::' prefix.
  if (!requireNamespace("umap", quietly = TRUE)) stop('umap is required but not installed')
  if (!requireNamespace("data.table", quietly = TRUE)) stop('data.table is required but not installed')

  ## Start from the package defaults and overwrite each field with the
  ## corresponding argument
  custom.config <- umap::umap.defaults
  custom.config$random_state <- umap.seed

  custom.config$n_neighbors <- neighbours
  custom.config$n_components <- n_components
  custom.config$metric <- metric
  custom.config$n_epochs <- n_epochs
  custom.config$input <- input
  custom.config$init <- init
  custom.config$min_dist <- min_dist
  custom.config$set_op_mix_ratio <- set_op_mix_ratio
  custom.config$local_connectivity <- local_connectivity
  custom.config$bandwidth <- bandwidth
  custom.config$alpha <- alpha
  custom.config$gamma <- gamma
  custom.config$negative_sample_rate <- negative_sample_rate
  custom.config$a <- a_gradient
  custom.config$b <- b_gradient
  custom.config$spread <- spread
  custom.config$transform_state <- transform_state
  # The umap config field is spelled 'knn_repeats' (underscore); writing to
  # 'knn.repeats' would only add an unused entry and the argument would be ignored
  custom.config$knn_repeats <- knn.repeats
  custom.config$verbose <- verbose
  custom.config$umap_learn_args <- umap_learn_args

  ## Work on a data.table copy of the input
  dat.start <- data.table::data.table(dat)

  ## Fail early, with a readable message, on unusable column selections
  if (length(use.cols) == 0) {
    stop("use.cols must specify at least one column")
  }
  if (is.character(use.cols)) {
    missing.cols <- setdiff(use.cols, names(dat.start))
    if (length(missing.cols) > 0) {
      stop("use.cols contains column names not found in dat: ",
           paste(missing.cols, collapse = ", "))
    }
  }

  ## Columns used for the embedding. 'with = FALSE' treats use.cols as a
  ## vector of column names/numbers and always yields a new data.table
  dat.bk <- dat.start[, use.cols, with = FALSE]

  res <- umap::umap(d = dat.bk,
                    config = custom.config)

  umap.res <- as.data.frame(res$layout)

  ## Name the first two embedding columns by position, so the result does not
  ## depend on the automatic "V1"/"V2" names (and cannot collide with them)
  names(umap.res)[1] <- umap.x.name
  if (ncol(umap.res) >= 2) {
    names(umap.res)[2] <- umap.y.name
  }

  res <- cbind(dat.start, umap.res) # Merge UMAP results with data
  return(res)
}