# R/dimensional_reduction.R

#' @include generics.R
#'
NULL

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Functions
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

#' Determine statistical significance of PCA scores.
#'
#' Randomly permutes a subset of data, and calculates projected PCA scores for
#' these 'random' genes. Then compares the PCA scores for the 'random' genes
#' with the observed PCA scores to determine statistical significance. The end
#' result is a p-value for each gene's association with each principal
#' component.
#'
#' @param object Seurat object
#' @param reduction DimReduc to use. ONLY PCA CURRENTLY SUPPORTED.
#' @param assay Assay used to calculate reduction.
#' @param dims Number of PCs to compute significance for
#' @param num.replicate Number of replicate samplings to perform
#' @param prop.freq Proportion of the data to randomly permute for each
#' replicate
#' @param verbose Print progress bar showing the number of replicates
#' that have been processed.
#' @param maxit Maximum number of iterations to be performed by the irlba
#' function within RunPCA
#'
#' @return Returns a Seurat object where JS(object = object[['pca']], slot = 'empirical')
#' represents p-values for each gene in the PCA analysis. If ProjectDim is
#' subsequently run, JS(object = object[['pca']], slot = 'full') then
#' represents p-values for all genes.
#'
#' @importFrom methods new
#' @importFrom pbapply pblapply pbsapply
#' @importFrom future.apply future_lapply future_sapply
#' @importFrom future nbrOfWorkers
#'
#' @references Inspired by Chung et al, Bioinformatics (2014)
#'
#' @export
#'
#' @examples
#' \dontrun{
#' pbmc_small = suppressWarnings(JackStraw(pbmc_small))
#' head(JS(object = pbmc_small[['pca']], slot = 'empirical'))
#' }
#'
JackStraw <- function(
  object,
  reduction = "pca",
  assay = NULL,
  dims = 20,
  num.replicate = 100,
  prop.freq = 0.01,
  verbose = TRUE,
  maxit = 1000
) {
  if (reduction != "pca") {
    stop("Only pca for reduction is currently supported")
  }
  if (verbose && nbrOfWorkers() == 1) {
    my.lapply <- pblapply
    my.sapply <- pbsapply
  } else {
    my.lapply <- future_lapply
    my.sapply <- future_sapply
  }
  assay <- assay %||% DefaultAssay(object = object)
  if (dims > length(x = object[[reduction]])) {
    dims <- length(x = object[[reduction]])
    warning("Number of dimensions specified is greater than those available. Setting dims to ", dims, " and continuing", immediate. = TRUE)
  }
  if (dims > nrow(x = object)) {
    dims <- nrow(x = object)
    warning("Number of dimensions specified is greater than the number of features. Setting dims to ", dims, " and continuing", immediate. = TRUE)
  }
  loadings <- Loadings(object = object[[reduction]], projected = FALSE)
  reduc.features <- rownames(x = loadings)
  if (length(x = reduc.features) < 3) {
    stop("Too few features")
  }
  if (length(x = reduc.features) * prop.freq < 3) {
    warning(
      "Fewer than 3 variable genes would be sampled with prop.freq = ",
      prop.freq,
      ". Consider including more variable genes and/or increasing prop.freq. ",
      "Continuing with 3 genes in every random sampling."
    )
  }
  data.use <- GetAssayData(object = object, assay = assay, slot = "scale.data")[reduc.features, ]
  rev.pca <- object[[paste0('RunPCA.', assay)]]$rev.pca
  weight.by.var <- object[[paste0('RunPCA.', assay)]]$weight.by.var
  fake.vals.raw <- my.lapply(
    X = 1:num.replicate,
    FUN = JackRandom,
    scaled.data = data.use,
    prop.use = prop.freq,
    r1.use = 1,
    r2.use = dims,
    rev.pca = rev.pca,
    weight.by.var = weight.by.var,
    maxit = maxit
  )
  fake.vals <- sapply(
    X = 1:dims,
    FUN = function(x) {
      return(as.numeric(x = unlist(x = lapply(
        X = 1:num.replicate,
        FUN = function(y) {
          return(fake.vals.raw[[y]][, x])
        }
      ))))
    }
  )
  fake.vals <- as.matrix(x = fake.vals)
  jackStraw.empP <- as.matrix(
    my.sapply(
      X = 1:dims,
      FUN = function(x) {
        return(unlist(x = lapply(
          X = abs(loadings[, x]),
          FUN = EmpiricalP,
          nullval = abs(fake.vals[,x])
        )))
      }
    )
  )
  colnames(x = jackStraw.empP) <- paste0("PC", 1:ncol(x = jackStraw.empP))
  jackstraw.obj <- new(
    Class = "JackStrawData",
    empirical.p.values  = jackStraw.empP,
    fake.reduction.scores = fake.vals,
    empirical.p.values.full = matrix()
  )
  JS(object = object[[reduction]]) <- jackstraw.obj
  object <- LogSeuratCommand(object = object)
  return(object)
}
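
# A minimal usage sketch of the JackStraw workflow (guarded so nothing runs
# when this file is sourced); assumes the pbmc_small toy object shipped with
# Seurat, which already carries a 'pca' reduction:
if (FALSE) {
  pbmc_small <- suppressWarnings(JackStraw(object = pbmc_small, dims = 5, num.replicate = 50))
  pbmc_small <- ScoreJackStraw(object = pbmc_small, dims = 1:5)
  # per-gene empirical p-values for each PC
  head(x = JS(object = pbmc_small[['pca']], slot = 'empirical'))
}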

#' Significant genes from a PCA
#'
#' Returns a set of genes, based on the JackStraw analysis, that have
#' statistically significant associations with a set of PCs.
#'
#' @param object Seurat object
#' @param pcs.use PCs to use.
#' @param pval.cut P-value cutoff
#' @param use.full Use the full list of genes (from the projected PCA). Assumes
#' that \code{ProjectDim} has been run. Currently, must be set to FALSE.
#' @param max.per.pc Maximum number of genes to return per PC. Used to avoid genes from one PC dominating the entire analysis.
#'
#' @return A vector of genes whose p-values are statistically significant for
#' at least one of the given PCs.
#'
#' @export
#'
#' @seealso \code{\link{ProjectDim}} \code{\link{JackStraw}}
#'
#' @examples
#' PCASigGenes(pbmc_small, pcs.use = 1:2)
#'
PCASigGenes <- function(
  object,
  pcs.use,
  pval.cut = 0.1,
  use.full = FALSE,
  max.per.pc = NULL
) {
  # pvals.use <- GetDimReduction(object,reduction.type = "pca",slot = "jackstraw")@empirical.p.values
  empirical.use <- ifelse(test = use.full, yes = 'full', no = 'empirical')
  pvals.use <- JS(object = object[['pca']], slot = empirical.use)
  if (length(x = pcs.use) == 1) {
    pvals.min <- pvals.use[, pcs.use]
  }
  if (length(x = pcs.use) > 1) {
    pvals.min <- apply(X = pvals.use[, pcs.use], MARGIN = 1, FUN = min)
  }
  names(x = pvals.min) <- rownames(x = pvals.use)
  features <- names(x = pvals.min)[pvals.min < pval.cut]
  if (!is.null(x = max.per.pc)) {
    top.features <- TopFeatures(
      object = object[['pca']],
      dim = pcs.use,
      nfeatures = max.per.pc,
      projected = use.full,
      balanced = FALSE
    )
    features <- intersect(x = top.features, y = features)
  }
  return(features)
}

#' Project Dimensional reduction onto full dataset
#'
#' Takes a pre-computed dimensional reduction (typically calculated on a subset
#' of genes) and projects this onto the entire dataset (all genes). Note that
#' the cell loadings will remain unchanged, but now there are gene loadings for
#' all genes.
#'
#' @param object Seurat object
#' @param reduction Reduction to use
#' @param assay Assay to use
#' @param dims.print Number of dims to print features for
#' @param nfeatures.print Number of features with highest/lowest loadings to print for
#' each dimension
#' @param overwrite Replace the existing data in feature.loadings
#' @param do.center Center the dataset prior to projection (should be set to TRUE)
#' @param verbose Print top genes associated with the projected dimensions
#'
#' @return Returns Seurat object with the projected values
#'
#' @export
#'
#' @examples
#' pbmc_small
#' pbmc_small <- ProjectDim(object = pbmc_small, reduction = "pca")
#' # Vizualize top projected genes in heatmap
#' DimHeatmap(object = pbmc_small, reduction = "pca", dims = 1, balanced = TRUE)
#'
ProjectDim <- function(
  object,
  reduction = "pca",
  assay = NULL,
  dims.print = 1:5,
  nfeatures.print = 20,
  overwrite = FALSE,
  do.center = FALSE,
  verbose = TRUE
) {
  reduc <- object[[reduction]]
  assay <- assay %||% DefaultAssay(object = reduc)
  data.use <- GetAssayData(
    object = object[[assay]],
    slot = "scale.data"
  )
  if (do.center) {
    data.use <- scale(x = as.matrix(x = data.use), center = TRUE, scale = FALSE)
  }
  cell.embeddings <- Embeddings(object = reduc)
  new.feature.loadings.full <- data.use %*% cell.embeddings
  rownames(x = new.feature.loadings.full) <- rownames(x = data.use)
  colnames(x = new.feature.loadings.full) <- colnames(x = cell.embeddings)
  Loadings(object = reduc, projected = TRUE) <- new.feature.loadings.full
  if (overwrite) {
    Loadings(object = reduc, projected = FALSE) <- new.feature.loadings.full
  }
  object[[reduction]] <- reduc
  if (verbose) {
    print(
      x = reduc,
      dims = dims.print,
      nfeatures = nfeatures.print,
      projected = TRUE
    )
  }
  object <- LogSeuratCommand(object = object)
  return(object)
}
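
# The projection above reduces to one matrix product: loadings for all genes
# are the (optionally centered) scale.data matrix times the cell embeddings.
# A self-contained sketch of that core step on random data (illustrative
# only, not Seurat's API):
if (FALSE) {
  set.seed(seed = 42)
  data.all <- matrix(data = rnorm(n = 200), nrow = 20)   # 20 genes x 10 cells
  embeddings <- matrix(data = rnorm(n = 30), nrow = 10)  # 10 cells x 3 dims
  projected.loadings <- data.all %*% embeddings          # 20 genes x 3 dims
}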

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Methods for Seurat-defined generics
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

#' @param assay Name of Assay PCA is being run on
#' @param npcs Total Number of PCs to compute and store (50 by default)
#' @param rev.pca By default computes the PCA on the cell x gene matrix. Setting
#' to true will compute it on gene x cell matrix.
#' @param weight.by.var Weight the cell embeddings by the variance of each PC
#' (weights the gene loadings if rev.pca is TRUE)
#' @param verbose Print the top genes associated with high/low loadings for
#' the PCs
#' @param ndims.print PCs to print genes for
#' @param nfeatures.print Number of genes to print for each PC
#' @param reduction.key dimensional reduction key, specifies the string before
#' the number for the dimension names. PC by default
#' @param seed.use Set a random seed. By default, sets the seed to 42. Setting
#' NULL will not set a seed.
#' @param approx Use truncated singular value decomposition to approximate PCA
#'
#' @importFrom irlba irlba
#' @importFrom stats prcomp
#' @importFrom utils capture.output
#'
#' @rdname RunPCA
#' @export
#'
RunPCA.default <- function(
  object,
  assay = NULL,
  npcs = 50,
  rev.pca = FALSE,
  weight.by.var = TRUE,
  verbose = TRUE,
  ndims.print = 1:5,
  nfeatures.print = 30,
  reduction.key = "PC_",
  seed.use = 42,
  approx = TRUE,
  ...
) {
  if (!is.null(x = seed.use)) {
    set.seed(seed = seed.use)
  }
  if (rev.pca) {
    npcs <- min(npcs, ncol(x = object) - 1)
    pca.results <- irlba(A = object, nv = npcs, ...)
    total.variance <- sum(RowVar(x = t(x = object)))
    sdev <- pca.results$d/sqrt(max(1, nrow(x = object) - 1))
    if (weight.by.var) {
      feature.loadings <- pca.results$u %*% diag(pca.results$d)
    } else {
      feature.loadings <- pca.results$u
    }
    cell.embeddings <- pca.results$v
  } else {
    total.variance <- sum(RowVar(x = object))
    if (approx) {
      npcs <- min(npcs, nrow(x = object) - 1)
      pca.results <- irlba(A = t(x = object), nv = npcs, ...)
      feature.loadings <- pca.results$v
      sdev <- pca.results$d/sqrt(max(1, ncol(object) - 1))
      if (weight.by.var) {
        cell.embeddings <- pca.results$u %*% diag(pca.results$d)
      } else {
        cell.embeddings <- pca.results$u
      }
    } else {
      npcs <- min(npcs, nrow(x = object))
      pca.results <- prcomp(x = t(object), rank. = npcs, ...)
      feature.loadings <- pca.results$rotation
      sdev <- pca.results$sdev
      if (weight.by.var) {
        cell.embeddings <- pca.results$x %*% diag(pca.results$sdev[1:npcs]^2)
      } else {
        cell.embeddings <- pca.results$x
      }
    }
  }
  rownames(x = feature.loadings) <- rownames(x = object)
  colnames(x = feature.loadings) <- paste0(reduction.key, 1:npcs)
  rownames(x = cell.embeddings) <- colnames(x = object)
  colnames(x = cell.embeddings) <- colnames(x = feature.loadings)
  reduction.data <- CreateDimReducObject(
    embeddings = cell.embeddings,
    loadings = feature.loadings,
    assay = assay,
    stdev = sdev,
    key = reduction.key,
    misc = list(total.variance = total.variance)
  )
  if (verbose) {
    msg <- capture.output(print(
      x = reduction.data,
      dims = ndims.print,
      nfeatures = nfeatures.print
    ))
    message(paste(msg, collapse = '\n'))
  }
  return(reduction.data)
}
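
# A sketch (illustrative only) of the identity the approx branch relies on:
# for centered data X (cells x features), the truncated SVD X = U D V' gives
# the loadings as V and the variance-weighted embeddings as U %*% diag(D),
# which match prcomp's scores up to sign:
if (FALSE) {
  library(irlba)
  x <- matrix(data = rnorm(n = 500), nrow = 50)  # 50 cells x 10 features
  x <- scale(x = x, center = TRUE, scale = FALSE)
  sv <- irlba(A = x, nv = 5)
  scores.svd <- sv$u %*% diag(x = sv$d)
  scores.prcomp <- prcomp(x = x, rank. = 5)$x
  max(abs(abs(scores.svd) - abs(scores.prcomp)))  # ~0: components agree up to sign
}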

#' @param features Features to compute PCA on. If features=NULL, PCA will be run
#' using the variable features for the Assay. Note that the features must be present
#' in the scaled data. Any requested features that are not scaled or have 0 variance
#' will be dropped, and the PCA will be run using the remaining features.
#'
#' @rdname RunPCA
#' @export
#' @method RunPCA Assay
#'
RunPCA.Assay <- function(
  object,
  assay = NULL,
  features = NULL,
  npcs = 50,
  rev.pca = FALSE,
  weight.by.var = TRUE,
  verbose = TRUE,
  ndims.print = 1:5,
  nfeatures.print = 30,
  reduction.key = "PC_",
  seed.use = 42,
  ...
) {
  data.use <- PrepDR(
    object = object,
    features = features,
    verbose = verbose
  )
  reduction.data <- RunPCA(
    object = data.use,
    assay = assay,
    npcs = npcs,
    rev.pca = rev.pca,
    weight.by.var = weight.by.var,
    verbose = verbose,
    ndims.print = ndims.print,
    nfeatures.print = nfeatures.print,
    reduction.key = reduction.key,
    seed.use = seed.use,
    ...
  )
  return(reduction.data)
}

#' @param reduction.name dimensional reduction name, pca by default
#'
#' @rdname RunPCA
#' @export
#' @method RunPCA Seurat
#'
RunPCA.Seurat <- function(
  object,
  assay = NULL,
  features = NULL,
  npcs = 50,
  rev.pca = FALSE,
  weight.by.var = TRUE,
  verbose = TRUE,
  ndims.print = 1:5,
  nfeatures.print = 30,
  reduction.name = "pca",
  reduction.key = "PC_",
  seed.use = 42,
  ...
) {
  assay <- assay %||% DefaultAssay(object = object)
  assay.data <- GetAssay(object = object, assay = assay)
  reduction.data <- RunPCA(
    object = assay.data,
    assay = assay,
    features = features,
    npcs = npcs,
    rev.pca = rev.pca,
    weight.by.var = weight.by.var,
    verbose = verbose,
    ndims.print = ndims.print,
    nfeatures.print = nfeatures.print,
    reduction.key = reduction.key,
    seed.use = seed.use,
    ...
  )
  object[[reduction.name]] <- reduction.data
  object <- LogSeuratCommand(object = object)
  return(object)
}

#' @param assay Name of the assay that t-SNE is being run on
#' @param seed.use Random seed for the t-SNE. If NULL, does not set the seed
#' @param tsne.method Select the method to use to compute the tSNE. Available
#' methods are:
#' \itemize{
#' \item{Rtsne: }{Use the Rtsne package Barnes-Hut implementation of tSNE (default)}
# \item{tsne: }{standard tsne - not recommended for large datasets}
#' \item{FIt-SNE: }{Use the FFT-accelerated Interpolation-based t-SNE. Based on
#' Kluger Lab code found here: https://github.com/KlugerLab/FIt-SNE}
#' }
#' @param dim.embed The dimensional space of the resulting tSNE embedding
#' (default is 2). For example, set to 3 for a 3d tSNE
#' @param reduction.key dimensional reduction key, specifies the string before the number for the dimension names. tSNE_ by default
#'
#' @importFrom Rtsne Rtsne
#'
#' @rdname RunTSNE
#' @export
#' @method RunTSNE matrix
#'
RunTSNE.matrix <- function(
  object,
  assay = NULL,
  seed.use = 1,
  tsne.method = "Rtsne",
  dim.embed = 2,
  reduction.key = "tSNE_",
  ...
) {
  if (!is.null(x = seed.use)) {
    set.seed(seed = seed.use)
  }
  tsne.data <- switch(
    EXPR = tsne.method,
    'Rtsne' = Rtsne(
      X = object,
      dims = dim.embed,
      ... # PCA/is_distance
    )$Y,
    'FIt-SNE' = fftRtsne(X = object, dims = dim.embed, rand_seed = seed.use, ...),
    stop("Invalid tSNE method: please choose from 'Rtsne' or 'FIt-SNE'")
  )
  colnames(x = tsne.data) <- paste0(reduction.key, 1:ncol(x = tsne.data))
  rownames(x = tsne.data) <- rownames(x = object)
  tsne.reduction <- CreateDimReducObject(
    embeddings = tsne.data,
    key = reduction.key,
    assay = assay,
    global = TRUE
  )
  return(tsne.reduction)
}

#' @param cells Which cells to analyze (default, all cells)
#' @param dims Which dimensions to use as input features
#'
#' @rdname RunTSNE
#' @export
#' @method RunTSNE DimReduc
#'
RunTSNE.DimReduc <- function(
  object,
  cells = NULL,
  dims = 1:5,
  seed.use = 1,
  tsne.method = "Rtsne",
  dim.embed = 2,
  reduction.key = "tSNE_",
  ...
) {
  args <- as.list(x = sys.frame(which = sys.nframe()))
  args <- c(args, list(...))
  args$object <- args$object[[cells, args$dims]]
  args$dims <- NULL
  args$cells <- NULL
  args$assay <- DefaultAssay(object = object)
  return(do.call(what = 'RunTSNE', args = args))
}

#' @rdname RunTSNE
#' @export
#' @method RunTSNE dist
#'
RunTSNE.dist <- function(
  object,
  assay = NULL,
  seed.use = 1,
  tsne.method = "Rtsne",
  dim.embed = 2,
  reduction.key = "tSNE_",
  ...
) {
  args <- as.list(x = sys.frame(which = sys.nframe()))
  args <- c(args, list(...))
  args$object <- as.matrix(x = args$object)
  args$is_distance <- TRUE
  return(do.call(what = 'RunTSNE', args = args))
}

#' @param reduction Which dimensional reduction (e.g. PCA, ICA) to use for
#' the tSNE. Default is PCA
#' @param features If set, run the tSNE on this subset of features
#' (instead of running on a set of reduced dimensions). Not set (NULL) by default;
#' \code{dims} must be NULL to run on features
#' @param distance.matrix If set, runs tSNE on the given distance matrix
#' instead of data matrix (experimental)
#' @param reduction.name Name to store the dimensional reduction under in the
#' Seurat object. tsne by default
#'
#' @rdname RunTSNE
#' @export
#' @method RunTSNE Seurat
#'
RunTSNE.Seurat <- function(
  object,
  reduction = "pca",
  cells = NULL,
  dims = 1:5,
  features = NULL,
  seed.use = 1,
  tsne.method = "Rtsne",
  dim.embed = 2,
  distance.matrix = NULL,
  reduction.name = "tsne",
  reduction.key = "tSNE_",
  ...
) {
  cells <- cells %||% Cells(x = object)
  tsne.reduction <- if (!is.null(x = distance.matrix)) {
    RunTSNE(
      object = distance.matrix,
      assay = DefaultAssay(object = object),
      seed.use = seed.use,
      tsne.method = tsne.method,
      dim.embed = dim.embed,
      reduction.key = reduction.key,
      is_distance = TRUE,
      ...
    )
  } else if (!is.null(x = dims)) {
    RunTSNE(
      object = object[[reduction]],
      cells = cells,
      dims = dims,
      seed.use = seed.use,
      tsne.method = tsne.method,
      dim.embed = dim.embed,
      reduction.key = reduction.key,
      pca = FALSE,
      ...
    )
  } else if (!is.null(x = features)) {
    RunTSNE(
      object = t(x = as.matrix(x = GetAssayData(object = object)[features, cells])),
      assay = DefaultAssay(object = object),
      seed.use = seed.use,
      tsne.method = tsne.method,
      dim.embed = dim.embed,
      reduction.key = reduction.key,
      pca = FALSE,
      ...
    )
  } else {
    stop("Unknown way of running tSNE")
  }
  object[[reduction.name]] <- tsne.reduction
  object <- LogSeuratCommand(object = object)
  return(object)
}
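
# Usage sketch for the three mutually exclusive tSNE input modes above
# (guarded; assumes pbmc_small with a precomputed 'pca'; perplexity is
# lowered because the toy object has only 80 cells):
if (FALSE) {
  # on reduced dimensions (the common case)
  pbmc_small <- RunTSNE(object = pbmc_small, dims = 1:5, perplexity = 10)
  # on a feature subset; dims must be NULL
  pbmc_small <- RunTSNE(object = pbmc_small, dims = NULL,
                        features = VariableFeatures(object = pbmc_small),
                        perplexity = 10)
  # on a precomputed distance matrix (experimental)
  d <- dist(x = Embeddings(object = pbmc_small[['pca']])[, 1:5])
  pbmc_small <- RunTSNE(object = pbmc_small, distance.matrix = as.matrix(x = d),
                        perplexity = 10)
}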

#' @importFrom uwot umap
#' @importFrom future nbrOfWorkers
#'
#' @rdname RunUMAP
#' @method RunUMAP default
#' @export
#'
RunUMAP.default <- function(
  object,
  assay = NULL,
  n.neighbors = 30L,
  n.components = 2L,
  metric = 'cosine',
  n.epochs = NULL,
  learning.rate = 1.0,
  min.dist = 0.3,
  spread = 1.0,
  set.op.mix.ratio = 1.0,
  local.connectivity = 1L,
  repulsion.strength = 1,
  negative.sample.rate = 5,
  a = NULL,
  b = NULL,
  uwot.sgd = FALSE,
  seed.use = 42,
  metric.kwds = NULL,
  angular.rp.forest = FALSE,
  reduction.key = 'UMAP_',
  verbose = TRUE,
  ...
) {
  CheckDots(...)
  if (!is.null(x = seed.use)) {
    set.seed(seed = seed.use)
  }
  if (metric == 'correlation') {
    warning(
      "UWOT does not implement the correlation metric, using cosine instead",
      call. = FALSE,
      immediate. = TRUE
    )
    metric <- 'cosine'
  }
  umap.output <- umap(
    X = object,
    n_threads = nbrOfWorkers(),
    n_neighbors = as.integer(x = n.neighbors),
    n_components = as.integer(x = n.components),
    metric = metric,
    n_epochs = n.epochs,
    learning_rate = learning.rate,
    min_dist = min.dist,
    spread = spread,
    set_op_mix_ratio = set.op.mix.ratio,
    local_connectivity = local.connectivity,
    repulsion_strength = repulsion.strength,
    negative_sample_rate = negative.sample.rate,
    a = a,
    b = b,
    fast_sgd = uwot.sgd,
    verbose = verbose
  )
  colnames(x = umap.output) <- paste0(reduction.key, 1:ncol(x = umap.output))
  if (inherits(x = object, what = 'dist')) {
    rownames(x = umap.output) <- attr(x = object, "Labels")
  } else {
    rownames(x = umap.output) <- rownames(x = object)
  }
  umap.reduction <- CreateDimReducObject(
    embeddings = umap.output,
    key = reduction.key,
    assay = assay,
    global = TRUE
  )
  return(umap.reduction)
}

#' @param dims Which dimensions to use as input features, used only if
#' \code{features} is NULL
#' @param reduction Which dimensional reduction (PCA or ICA) to use for the
#' UMAP input. Default is PCA
#' @param features If set, run UMAP on this subset of features (instead of running on a
#' set of reduced dimensions). Not set (NULL) by default; \code{dims} must be NULL to run
#' on features
#' @param graph Name of graph on which to run UMAP
#' @param assay Assay to pull data for when using \code{features}.
#' @param n.neighbors This determines the number of neighboring points used in
#' local approximations of manifold structure. Larger values will result in more
#' global structure being preserved at the loss of detailed local structure. In
#' general this parameter should often be in the range 5 to 50.
#' @param n.components The dimension of the space to embed into.
#' @param metric metric: This determines the choice of metric used to measure
#' distance in the input space. A wide variety of metrics are already coded, and
#' a user defined function can be passed as long as it has been JITd by numba.
#' @param n.epochs The number of training epochs to be used in optimizing the low dimensional
#' embedding. Larger values result in more accurate embeddings. If NULL is specified, a value will
#' be selected based on the size of the input dataset (200 for large datasets, 500 for small).
#' @param learning.rate The initial learning rate for the embedding optimization.
#' @param min.dist This controls how tightly the embedding is allowed to compress points together.
#' Larger values ensure embedded points are more evenly distributed, while smaller values allow the
#' algorithm to optimise more accurately with regard to local structure. Sensible values are in
#' the range 0.001 to 0.5.
#' @param spread The effective scale of embedded points. In combination with min.dist this
#' determines how clustered/clumped the embedded points are.
#' @param set.op.mix.ratio Interpolate between (fuzzy) union and intersection as the set operation
#' used to combine local fuzzy simplicial sets to obtain a global fuzzy simplicial sets. Both fuzzy
#' set operations use the product t-norm. The value of this parameter should be between 0.0 and
#' 1.0; a value of 1.0 will use a pure fuzzy union, while 0.0 will use a pure fuzzy intersection.
#' @param local.connectivity The local connectivity required - i.e. the number of nearest neighbors
#' that should be assumed to be connected at a local level. The higher this value the more connected
#' the manifold becomes locally. In practice this should be not more than the local intrinsic
#' dimension of the manifold.
#' @param repulsion.strength Weighting applied to negative samples in low dimensional embedding
#' optimization. Values higher than one will result in greater weight being given to negative
#' samples.
#' @param negative.sample.rate The number of negative samples to select per positive sample in the
#' optimization process. Increasing this value will result in greater repulsive force being applied,
#' greater optimization cost, but slightly more accuracy.
#' @param a More specific parameters controlling the embedding. If NULL, these values are set
#' automatically as determined by min.dist and spread. Parameter of differentiable approximation of
#' right adjoint functor.
#' @param b More specific parameters controlling the embedding. If NULL, these values are set
#' automatically as determined by min.dist and spread. Parameter of differentiable approximation of
#' right adjoint functor.
#' @param uwot.sgd Set \code{uwot::umap(fast_sgd = TRUE)}; see \code{\link[uwot]{umap}} for more details
#' @param metric.kwds A dictionary of arguments to pass on to the metric, such as the p value for
#' Minkowski distance. If NULL then no arguments are passed on.
#' @param angular.rp.forest Whether to use an angular random projection forest to initialise the
#' approximate nearest neighbor search. This can be faster, but is mostly only useful for metrics that
#' use an angular style distance such as cosine, correlation etc. In the case of those metrics
#' angular forests will be chosen automatically.
#' @param reduction.name Name to store dimensional reduction under in the Seurat object
#' @param reduction.key dimensional reduction key, specifies the string before
#' the number for the dimension names. UMAP_ by default
#' @param seed.use Set a random seed. By default, sets the seed to 42. Setting
#' NULL will not set a seed
#' @param verbose Controls verbosity
#'
#' @rdname RunUMAP
#' @export
#' @method RunUMAP Seurat
#'
RunUMAP.Seurat <- function(
  object,
  dims = NULL,
  reduction = 'pca',
  features = NULL,
  graph = NULL,
  assay = 'RNA',
  n.neighbors = 30L,
  n.components = 2L,
  metric = 'cosine',
  n.epochs = NULL,
  learning.rate = 1,
  min.dist = 0.3,
  spread = 1,
  set.op.mix.ratio = 1,
  local.connectivity = 1L,
  repulsion.strength = 1,
  negative.sample.rate = 5L,
  a = NULL,
  b = NULL,
  uwot.sgd = FALSE,
  seed.use = 42L,
  metric.kwds = NULL,
  angular.rp.forest = FALSE,
  verbose = TRUE,
  reduction.name = 'umap',
  reduction.key = 'UMAP_',
  ...
) {
  CheckDots(...)
  if (sum(c(is.null(x = dims), is.null(x = features), is.null(x = graph))) < 2) {
    stop("Please specify only one of the following arguments: dims, features, or graph")
  }
  if (!is.null(x = features)) {
    data.use <- as.matrix(x = t(x = GetAssayData(object = object, slot = 'data', assay = assay)[features, , drop = FALSE]))
    if (ncol(x = data.use) < n.components) {
      stop(
        "Please provide as many or more features than n.components: ",
        length(x = features),
        " features provided, ",
        n.components,
        " UMAP components requested",
        call. = FALSE
      )
    }
  } else if (!is.null(x = dims)) {
    data.use <- Embeddings(object[[reduction]])[, dims]
    assay <- DefaultAssay(object = object[[reduction]])
    if (length(x = dims) < n.components) {
      stop(
        "Please provide as many or more dims than n.components: ",
        length(x = dims),
        " dims provided, ",
        n.components,
        " UMAP components requested",
        call. = FALSE
      )
    }
  } else if (!is.null(x = graph)) {
    data.use <- object[[graph]]
  } else {
    stop("Please specify one of dims, features, or graph")
  }
  object[[reduction.name]] <- RunUMAP(
    object = data.use,
    assay = assay,
    n.neighbors = n.neighbors,
    n.components = n.components,
    metric = metric,
    n.epochs = n.epochs,
    learning.rate = learning.rate,
    min.dist = min.dist,
    spread = spread,
    set.op.mix.ratio = set.op.mix.ratio,
    local.connectivity = local.connectivity,
    repulsion.strength = repulsion.strength,
    negative.sample.rate = negative.sample.rate,
    a = a,
    b = b,
    uwot.sgd = uwot.sgd,
    seed.use = seed.use,
    metric.kwds = metric.kwds,
    angular.rp.forest = angular.rp.forest,
    reduction.key = reduction.key,
    verbose = verbose
  )
  object <- LogSeuratCommand(object = object)
  return(object)
}
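
# Usage sketch (guarded; assumes pbmc_small with a precomputed 'pca' and the
# uwot package installed):
if (FALSE) {
  # the common case: UMAP on the first PCs
  pbmc_small <- RunUMAP(object = pbmc_small, dims = 1:5)
  # the same call on a feature subset instead of reduced dimensions
  pbmc_small <- RunUMAP(object = pbmc_small,
                        features = VariableFeatures(object = pbmc_small))
  head(x = Embeddings(object = pbmc_small[['umap']]))
}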

#' @param dims Which dimensions to examine
#' @param score.thresh Threshold to use for the proportion test of PC
#' significance (see Details)
#'
#' @importFrom stats prop.test
#'
#' @rdname ScoreJackStraw
#' @export
#' @method ScoreJackStraw JackStrawData
#'
ScoreJackStraw.JackStrawData <- function(
  object,
  dims = 1:5,
  score.thresh = 1e-5,
  ...
) {
  CheckDots(...)
  pAll <- JS(object = object, slot = "empirical.p.values")
  pAll <- pAll[, dims, drop = FALSE]
  pAll <- as.data.frame(pAll)
  pAll$Contig <- rownames(x = pAll)
  score.df <- NULL
  # pAll was subset to the requested dims above, so index columns positionally
  for (i in seq_along(along.with = dims)) {
    pc.score <- suppressWarnings(prop.test(
      x = c(
        length(x = which(x = pAll[, i] <= score.thresh)),
        floor(x = nrow(x = pAll) * score.thresh)
      ),
      n = c(nrow(pAll), nrow(pAll))
    )$p.val)
    if (length(x = which(x = pAll[, i] <= score.thresh)) == 0) {
      pc.score <- 1
    }
    if (is.null(x = score.df)) {
      score.df <- data.frame(PC = paste0("PC", i), Score = pc.score)
    } else {
      score.df <- rbind(score.df, data.frame(PC = paste0("PC", i), Score = pc.score))
    }
  }
  score.df$PC <- dims
  score.df <- as.matrix(score.df)
  JS(object = object, slot = 'overall') <- score.df
  return(object)
}
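
# The overall PC score above is a two-sample proportion test: the observed
# count of genes with empirical p-values <= score.thresh is tested against
# the count expected by chance alone. A standalone sketch of that
# calculation (illustrative numbers):
if (FALSE) {
  n.genes <- 200
  observed <- 12                        # genes passing the threshold
  expected <- floor(x = n.genes * 1e-5) # expected by chance, here 0
  suppressWarnings(prop.test(x = c(observed, expected), n = c(n.genes, n.genes))$p.value)
}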

#' @rdname ScoreJackStraw
#' @export
#' @method ScoreJackStraw DimReduc
#'
ScoreJackStraw.DimReduc <- function(object, dims = 1:5, score.thresh = 1e-5, ...) {
  JS(object = object) <- ScoreJackStraw(
    object = JS(object = object),
    dims = dims,
    score.thresh = score.thresh,
    ...
  )
  return(object)
}

#' @param reduction Reduction associated with JackStraw to score
#' @param do.plot Show plot. To return ggplot object, use \code{JackStrawPlot} after
#' running ScoreJackStraw.
#'
#' @seealso \code{\link{JackStrawPlot}}
#'
#' @rdname ScoreJackStraw
#' @export
#' @method ScoreJackStraw Seurat
#'
ScoreJackStraw.Seurat <- function(
  object,
  reduction = "pca",
  dims = 1:5,
  score.thresh = 1e-5,
  do.plot = FALSE,
  ...
) {
  object[[reduction]] <- ScoreJackStraw(
    object = object[[reduction]],
    dims = dims,
    score.thresh = score.thresh,
    ...
  )
  if (do.plot) {
    CheckDots(..., fxns = 'JackStrawPlot')
    suppressWarnings(expr = print(JackStrawPlot(
      object = object,
      reduction = reduction,
      dims = dims,
      ...
    )))
  }
  object <- LogSeuratCommand(object = object)
  return(object)
}

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Methods for R-defined generics
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Internal
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# Check that features are present and have non-zero variance
#
# @param data.use Feature matrix (features are rows)
# @param features Features to check
# @param object.name Name of object for message printing
# @param verbose Print warnings
#
# @return Returns a vector of features that is the subset of features
# that have non-zero variance
#
CheckFeatures <- function(
  data.use,
  features,
  object.name,
  verbose = TRUE
) {
  if (any(!features %in% rownames(x = data.use))) {
    missing.features <- features[!features %in% rownames(x = data.use)]
    features <- setdiff(x = features, y = missing.features)
    if (verbose) {
      warning(
        paste0(
          "The following ", length(x = missing.features),
          " features are not scaled in ",
          object.name,
          ": ",
          paste0(missing.features, collapse = ", ")
        ))
    }
  }
  if (inherits(x = data.use, what = 'dgCMatrix')) {
    features.var <- SparseRowVar(mat = data.use[features, ], display_progress = FALSE)
  } else {
    features.var <- RowVar(x = data.use[features, ])
  }
  no.var.features <- features[features.var == 0]
  if (length(x = no.var.features) > 0 && verbose) {
    warning(
     paste0(
       "The following features have zero variance in ",
       object.name,
       ": ",
       paste0(no.var.features, collapse = ", ")
    ))
  }
  features <- setdiff(x = features, y = no.var.features)
  features <- features[!is.na(x = features)]
  return(features)
}
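
# Usage sketch for this internal check (guarded; toy matrix only):
if (FALSE) {
  mat <- matrix(data = rnorm(n = 30), nrow = 3,
                dimnames = list(c("g1", "g2", "g3"), NULL))
  mat["g3", ] <- 0  # zero-variance feature
  # warns about the unscaled "g4" and zero-variance "g3"; returns c("g1", "g2")
  CheckFeatures(data.use = mat, features = c("g1", "g2", "g3", "g4"),
                object.name = "demo")
}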

#internal
EmpiricalP <- function(x, nullval) {
  return(sum(nullval > x) / length(x = nullval))
}
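
# EmpiricalP is the fraction of null (permuted) scores exceeding the observed
# score, i.e. a one-sided empirical p-value. For example:
if (FALSE) {
  EmpiricalP(x = 2.5, nullval = c(0.1, 0.4, 1.2, 2.6, 3.0))  # 2/5 = 0.4
}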

# FIt-SNE helper function for calling fast_tsne from R
#
# Based on Kluger Lab FIt-SNE v1.1.0 code on https://github.com/KlugerLab/FIt-SNE/blob/master/fast_tsne.R
# commit d2cf403 on Feb 8, 2019
#
#' @importFrom utils file_test
#
fftRtsne <- function(X,
  dims = 2,
  perplexity = 30,
  theta = 0.5,
  check_duplicates = TRUE,
  max_iter = 1000,
  fft_not_bh = TRUE,
  ann_not_vptree = TRUE,
  stop_early_exag_iter = 250,
  exaggeration_factor = 12.0,
  no_momentum_during_exag = FALSE,
  start_late_exag_iter = -1.0,
  late_exag_coeff = 1.0,
  mom_switch_iter = 250,
  momentum = 0.5,
  final_momentum = 0.8,
  learning_rate = 200,
  n_trees = 50,
  search_k = -1,
  rand_seed = -1,
  nterms = 3,
  intervals_per_integer = 1,
  min_num_intervals = 50,
  K = -1,
  sigma = -30,
  initialization = NULL,
  data_path = NULL,
  result_path = NULL,
  load_affinities = NULL,
  fast_tsne_path = NULL,
  nthreads = getOption('mc.cores', default = 1),
  perplexity_list = NULL,
  get_costs = FALSE,
  df = 1.0,
  ...
) {
  CheckDots(...)
  if (is.null(x = data_path)) {
    data_path <- tempfile(pattern = 'fftRtsne_data_', fileext = '.dat')
  }
  if (is.null(x = result_path)) {
    result_path <- tempfile(pattern = 'fftRtsne_result_', fileext = '.dat')
  }
  if (is.null(x = fast_tsne_path)) {
    # suppressWarnings(expr = fast_tsne_path <- system2(command = 'which', args = 'fast_tsne', stdout = TRUE))
    fast_tsne_path <- SysExec(progs = ifelse(
      test = .Platform$OS.type == 'windows',
      yes = 'FItSNE.exe',
      no = 'fast_tsne'
      ))
    if (length(x = fast_tsne_path) == 0) {
      stop("no fast_tsne_path specified and fast_tsne binary is not in the search path")
    }
  }
  fast_tsne_path <- normalizePath(path = fast_tsne_path)
  if (!file_test(op = '-x', x = fast_tsne_path)) {
    stop("fast_tsne_path '", fast_tsne_path, "' does not exist or is not executable")
  }
  # check fast_tsne version
  ft.out <- suppressWarnings(expr = system2(command = fast_tsne_path, stdout = TRUE))
  if (grepl(pattern = '= t-SNE v1.1', x = ft.out[1])) {
    version_number <- '1.1.0'
  } else if (grepl(pattern = '= t-SNE v1.0', x = ft.out[1])) {
    version_number <- '1.0'
  } else {
    message("First line of fast_tsne output is")
    message(ft.out[1])
    stop("Our FIt-SNE wrapper requires FIt-SNE v1.X.X, please install the appropriate version from github.com/KlugerLab/FIt-SNE and have fast_tsne_path point to it if it's not in your path")
  }
  is.wholenumber <- function(x, tol = .Machine$double.eps ^ 0.5) {
    return(abs(x = x - round(x = x)) < tol)
  }
  if (version_number == '1.0' && df != 1.0) {
    stop("This version of FIt-SNE does not support df!=1. Please install the appropriate version from github.com/KlugerLab/FIt-SNE")
  }
  if (!is.numeric(x = theta) || (theta < 0.0) || (theta > 1.0) ) {
    stop("Incorrect theta.")
  }
  if (nrow(x = X) - 1 < 3 * perplexity) {
    stop("Perplexity is too large.")
  }
  if (!is.matrix(x = X)) {
    stop("Input X is not a matrix")
  }
  if (!(max_iter > 0)) {
    stop("Incorrect number of iterations.")
  }
  if (!is.wholenumber(x = stop_early_exag_iter) || stop_early_exag_iter < 0) {
    stop("stop_early_exag_iter should be a positive integer")
  }
  if (!is.numeric(x = exaggeration_factor)) {
    stop("exaggeration_factor should be numeric")
  }
  if (!is.wholenumber(x = dims) || dims <= 0) {
    stop("Incorrect dimensionality.")
  }
  if (search_k == -1) {
    if (perplexity > 0) {
      search_k <- n_trees * perplexity * 3
    } else if (perplexity == 0) {
      search_k <- n_trees * max(perplexity_list) * 3
    } else {
      search_k <- n_trees * K * 3
    }
  }
  nbody_algo <- ifelse(test = fft_not_bh, yes = 2, no = 1)
  if (is.null(load_affinities)) {
    load_affinities <- 0
  } else {
    if (load_affinities == 'load') {
      load_affinities <- 1
    } else if (load_affinities == 'save') {
      load_affinities <- 2
    } else {
      load_affinities <- 0
    }
  }
  knn_algo <- ifelse(test = ann_not_vptree, yes = 1, no = 2)
  f <- file(description = data_path, open = "wb")
  n <- nrow(x = X)
  D <- ncol(x = X)
  writeBin(object = as.integer(x = n), con = f, size = 4)
  writeBin(object = as.integer(x = D), con = f, size = 4)
  writeBin(object = as.numeric(x = theta), con = f, size = 8) # theta
  writeBin(object = as.numeric(x = perplexity), con = f, size = 8) # perplexity
  if (perplexity == 0) {
    writeBin(object = as.integer(x = length(x = perplexity_list)), con = f, size = 4)
    writeBin(object = perplexity_list, con = f)
  }
  writeBin(object = as.integer(x = dims), con = f, size = 4) # no_dims
  writeBin(object = as.integer(x = max_iter), con = f, size = 4)
  writeBin(object = as.integer(x = stop_early_exag_iter), con = f, size = 4)
  writeBin(object = as.integer(x = mom_switch_iter), con = f, size = 4)
  writeBin(object = as.numeric(x = momentum), con = f, size = 8)
  writeBin(object = as.numeric(x = final_momentum), con = f, size = 8)
  writeBin(object = as.numeric(x = learning_rate), con = f, size = 8)
  writeBin(object = as.integer(x = K), con = f, size = 4) #K
  writeBin(object = as.numeric(x = sigma), con = f, size = 8) #sigma
  writeBin(object = as.integer(x = nbody_algo), con = f, size = 4) # nbody_algo: 2 = FFT, 1 = Barnes-Hut
  writeBin(object = as.integer(x = knn_algo), con = f, size = 4) # knn_algo: 1 = Annoy, 2 = VP tree
  writeBin(object = as.numeric(x = exaggeration_factor), con = f, size = 8) # exaggeration_factor
  writeBin(object = as.integer(x = no_momentum_during_exag), con = f, size = 4)
  writeBin(object = as.integer(x = n_trees), con = f, size = 4)
  writeBin(object = as.integer(x = search_k), con = f, size = 4)
  writeBin(object = as.integer(x = start_late_exag_iter), con = f, size = 4)
  writeBin(object = as.numeric(x = late_exag_coeff), con = f, size = 8)
  writeBin(object = as.integer(x = nterms), con = f, size = 4)
  writeBin(object = as.numeric(x = intervals_per_integer), con = f, size = 8)
  writeBin(object = as.integer(x = min_num_intervals), con = f, size = 4)
  tX <- c(t(x = X))
  writeBin(object = tX, con = f)
  writeBin(object = as.integer(x = rand_seed), con = f, size = 4)
  if (version_number != "1.0") {
    writeBin(object = as.numeric(x = df), con = f, size = 8)
  }
  writeBin(object = as.integer(x = load_affinities), con = f, size = 4)
  if (!is.null(x = initialization)) {
    writeBin(object = c(t(x = initialization)), con = f)
  }
  close(con = f)
  if (version_number == "1.0") {
    flag <- system2(
     command = fast_tsne_path,
     args = c(data_path, result_path, nthreads)
    )
  } else {
    flag <- system2(
      command = fast_tsne_path,
      args = c(version_number, data_path, result_path, nthreads)
    )
  }
  if (flag != 0) {
    stop('tsne call failed')
  }
  f <- file(description = result_path, open = "rb")
  n <- readBin(con = f, what = integer(), n = 1, size = 4)
  d <- readBin(con = f, what = integer(), n = 1, size = 4)
  Y <- readBin(con = f, what = numeric(), n = n * d)
  Y <- t(x = matrix(Y, nrow = d))
  if (get_costs) {
    tmp <- readBin(con = f, what = integer(), n = 1, size = 4)
    costs <- readBin(con = f, what = numeric(), n = max_iter, size = 8)
    Yout <- list(Y = Y, costs = costs)
  } else {
    Yout <- Y
  }
  close(con = f)
  file.remove(data_path)
  file.remove(result_path)
  return(Yout)
}
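
# Usage sketch (guarded; requires a compiled FIt-SNE 'fast_tsne' binary from
# github.com/KlugerLab/FIt-SNE on the search path or given via fast_tsne_path):
if (FALSE) {
  X <- matrix(data = rnorm(n = 5000), nrow = 500)  # 500 observations
  Y <- fftRtsne(X = X, dims = 2, perplexity = 30, rand_seed = 42)
  plot(x = Y[, 1], y = Y[, 2])
}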

#internal
#
JackRandom <- function(
  scaled.data,
  prop.use = 0.01,
  r1.use = 1,
  r2.use = 5,
  seed.use = 1,
  rev.pca = FALSE,
  weight.by.var = TRUE,
  maxit = 1000
) {
  if (!is.null(x = seed.use)) {
    set.seed(seed = seed.use)
  }
  rand.genes <- sample(
    x = rownames(x = scaled.data),
    size = nrow(x = scaled.data) * prop.use
  )
  # make sure that rand.genes is at least 3
  if (length(x = rand.genes) < 3) {
    rand.genes <- sample(x = rownames(x = scaled.data), size = 3)
  }
  data.mod <- scaled.data
  data.mod[rand.genes, ] <- MatrixRowShuffle(x = scaled.data[rand.genes, ])
  temp.object <- RunPCA(
    object = data.mod,
    assay = "temp",
    npcs = r2.use,
    features = rownames(x = data.mod),
    rev.pca = rev.pca,
    weight.by.var = weight.by.var,
    verbose = FALSE,
    maxit = maxit
  )
  return(Loadings(temp.object)[rand.genes, r1.use:r2.use])
}
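
# A sketch of a single replicate (guarded; assumes pbmc_small, whose default
# assay carries scaled data for the variable features):
if (FALSE) {
  sd.use <- GetAssayData(object = pbmc_small, slot = "scale.data")
  fake.loadings <- JackRandom(scaled.data = sd.use, prop.use = 0.05,
                              r1.use = 1, r2.use = 5,
                              rev.pca = FALSE, weight.by.var = TRUE)
  dim(x = fake.loadings)  # permuted genes x 5 PCs
}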

# Calculates the l2-norm of a vector
#
# Modified from PMA package
# @references Witten, Tibshirani, and Hastie, Biostatistics 2009
# @references \url{https://github.com/cran/PMA/blob/master/R/PMD.R}
#
# @param vec numeric vector
#
# @return returns the l2-norm.
#
L2Norm <- function(vec) {
  a <- sqrt(x = sum(vec ^ 2))
  if (a == 0) {
    a <- .05
  }
  return(a)
}
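
# For example (guarded):
if (FALSE) {
  L2Norm(vec = c(3, 4))  # 5
  L2Norm(vec = c(0, 0))  # 0.05, so downstream divisions by the norm stay finite
}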

# Prep data for dimensional reduction
#
# Common checks and preparatory steps before running certain dimensional
# reduction techniques
#
# @param object        Assay object
# @param features  Features to use as input for the dimensional reduction technique.
#                      Default is variable features
# @param verbose   Print messages and warnings
#
#
PrepDR <- function(
  object,
  features = NULL,
  verbose = TRUE
) {
  if (length(x = VariableFeatures(object = object)) == 0 && is.null(x = features)) {
    stop("Variable features haven't been set. Run FindVariableFeatures() or provide a vector of feature names.")
  }
  data.use <- GetAssayData(object = object, slot = "scale.data")
  if (nrow(x = data.use) == 0) {
    stop("Data has not been scaled. Please run ScaleData and retry")
  }
  features <- features %||% VariableFeatures(object = object)
  features.keep <- unique(x = features[features %in% rownames(x = data.use)])
  if (length(x = features.keep) < length(x = features)) {
    features.exclude <- setdiff(x = features, y = features.keep)
    if (verbose) {
      warning(paste0("The following ", length(x = features.exclude), " features requested have not been scaled (running reduction without them): ", paste0(features.exclude, collapse = ", ")))
    }
  }
  features <- features.keep
  features.var <- apply(X = data.use[features, ], MARGIN = 1, FUN = var)
  features.keep <- features[features.var > 0]
  if (length(x = features.keep) < length(x = features)) {
    features.exclude <- setdiff(x = features, y = features.keep)
    if (verbose) {
      warning(paste0("The following ", length(x = features.exclude), " features requested have zero variance (running reduction without them): ", paste0(features.exclude, collapse = ", ")))
    }
  }
  features <- features.keep
  features <- features[!is.na(x = features)]
  data.use <- data.use[features, ]
  return(data.use)
}
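
# Usage sketch (guarded; assumes pbmc_small, whose RNA assay has scaled data
# and variable features set):
if (FALSE) {
  assay.obj <- GetAssay(object = pbmc_small, assay = "RNA")
  dr.input <- PrepDR(object = assay.obj)  # scaled matrix, variable features only
  dim(x = dr.input)
}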