R/dimensional_reduction.R

#' Run Principal Component Analysis on gene expression using IRLBA
#'
#' Run a PCA dimensionality reduction. For details about stored PCA calculation
#' parameters, see \code{PrintPCAParams}.
#'
#' @param object Seurat object
#' @param pc.genes Genes to use as input for PCA. Default is object@@var.genes
#' @param pcs.compute Total Number of PCs to compute and store
#' @param use.imputed Run PCA on imputed values (FALSE by default)
#' @param rev.pca By default computes the PCA on the cell x gene matrix. Setting
#' to true will compute it on gene x cell matrix.
#' @param weight.by.var Weight the cell embeddings by the variance of each PC
#' (weights the gene loadings if rev.pca is TRUE)
#' @param do.print Print the top genes associated with high/low loadings for
#' the PCs
#' @param pcs.print PCs to print genes for
#' @param genes.print Number of genes to print for each PC
#' @param reduction.name dimensional reduction name, specifies the position in the object$dr list. pca by default
#' @param reduction.key dimensional reduction key, specifies the string before the number for the dimension names. PC by default
#' @param assay.type Data type, RNA by default. Can be changed for multimodal
#' @param \dots Additional arguments to be passed to IRLBA
#'
#'@importFrom irlba irlba
#'
#' @return Returns Seurat object with the PCA calculation stored in
#' object@@dr$pca.
#'
#' @importFrom irlba irlba
#'
#' @export
#'
RunPCA <- function(
  object,
  pc.genes = NULL,
  pcs.compute = 20,
  use.imputed = FALSE,
  rev.pca = FALSE,
  weight.by.var = TRUE,
  do.print = TRUE,
  pcs.print = 1:5,
  genes.print = 30,
  reduction.name = "pca",
  reduction.key = "PC",
  assay.type="RNA",
  ...
) {
  data.use <- PrepDR(
    object = object,
    genes.use = pc.genes,
    use.imputed = use.imputed,
    assay.type = assay.type)
  pcs.compute <- min(pcs.compute, ncol(x = data.use))
  if (rev.pca) {
    pca.results <- irlba(A = data.use, nv = pcs.compute, ...)
    sdev <- pca.results$d/sqrt(max(1, nrow(data.use) - 1))
    if(weight.by.var){
      gene.loadings <- pca.results$u %*% diag(pca.results$d)
    } else{
      gene.loadings <- pca.results$u
    }
    cell.embeddings <- pca.results$v
  }
  else {
    pca.results <- irlba(A = t(x = data.use), nv = pcs.compute, ...)
    gene.loadings <- pca.results$v
    sdev <- pca.results$d/sqrt(max(1, ncol(data.use) - 1))
    if(weight.by.var){
      cell.embeddings <- pca.results$u %*% diag(pca.results$d)
    } else {
      cell.embeddings <- pca.results$u
    }
  }
  rownames(x = gene.loadings) <- rownames(x = data.use)
  colnames(x = gene.loadings) <- paste0(reduction.key, 1:pcs.compute)
  rownames(x = cell.embeddings) <- colnames(x = data.use)
  colnames(x = cell.embeddings) <- colnames(x = gene.loadings)
  pca.obj <- new(
    Class = "dim.reduction",
    gene.loadings = gene.loadings,
    cell.embeddings = cell.embeddings,
    sdev = sdev,
    key = reduction.key
  )
  #object@dr[reduction.name] <- pca.obj
  eval(expr = parse(text = paste0("object@dr$", reduction.name, "<- pca.obj")))

  parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("RunPCA"))]
  object <- SetCalcParams(object = object, calculation = "RunPCA", ... = parameters.to.store)
  if(is.null(object@calc.params$RunPCA$pc.genes)){
    object@calc.params$RunPCA$pc.genes <- rownames(data.use)
  }
  if(do.print){
    PrintDim(object = object, dims.print = pcs.print, genes.print = genes.print,reduction.type = reduction.name)
  }
  return(object)
}


#' Run Independent Component Analysis on gene expression
#'
#' Run fastica algorithm from the ica package for ICA dimensionality reduction.
#' For details about stored ICA calculation parameters, see
#' \code{PrintICAParams}.
#'
#' @param object Seurat object
#' @param ic.genes Genes to use as input for ICA. Default is object@@var.genes
#' @param ics.compute Number of ICs to compute
#' @param use.imputed Run ICA on imputed values (FALSE by default)
#' @param rev.ica By default, computes the dimensional reduction on the cell x
#' gene matrix. Setting to true will compute it on the transpose (gene x cell
#' matrix).
#' @param print.results Print the top genes associated with each dimension
#' @param ics.print ICs to print genes for
#' @param genes.print Number of genes to print for each IC
#' @param ica.function ICA function from ica package to run (options: icafast,
#' icaimax, icajade)
#' @param seed.use Random seed to use for fastica
#' @param reduction.name dimensional reduction name, specifies the position in the object$dr list. ica by default
#' @param reduction.key dimensional reduction key, specifies the string before the number for the dimension names. IC by default
#' @param \dots Additional arguments to be passed to fastica
#'
#' @importFrom ica icafast icaimax icajade
#'
#' @return Returns Seurat object with an ICA calculation stored in
#' object@@dr$ica
#'
#' @export
#'
RunICA <- function(
  object,
  ic.genes = NULL,
  ics.compute = 50,
  use.imputed = FALSE,
  rev.ica = FALSE,
  print.results = TRUE,
  ics.print = 1:5,
  genes.print = 50,
  ica.function = "icafast",
  seed.use = 1,
  reduction.name = "ica",
  reduction.key = "IC",
  ...
) {
  data.use <- PrepDR(
    object = object,
    genes.use = ic.genes,
    use.imputed = use.imputed)
  set.seed(seed = seed.use)
  ics.compute <- min(ics.compute, ncol(x = data.use))
  ica.fxn <- eval(parse(text = ica.function))
  if (rev.ica) {
    ica.results <- ica.fxn(data.use, nc = ics.compute,...)
    cell.embeddings <- ica.results$M
  } else {
    ica.results <- ica.fxn(t(x = data.use), nc = ics.compute,...)
    cell.embeddings <- ica.results$S
  }
  gene.loadings <- (as.matrix(x = data.use ) %*% as.matrix(x = cell.embeddings))
  colnames(x = gene.loadings) <- paste0(reduction.key, 1:ncol(x = gene.loadings))
  colnames(x = cell.embeddings) <- paste0(reduction.key, 1:ncol(x = cell.embeddings))
  ica.obj <- new(
    Class = "dim.reduction",
    gene.loadings = gene.loadings,
    cell.embeddings = cell.embeddings,
    sdev = sqrt(x = ica.results$vafs),
    key = "IC"
  )

  eval(expr = parse(text = paste0("object@dr$", reduction.name, "<- ica.obj")))

  parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("ICA"))]
  object <- SetCalcParams(object = object, calculation = "ICA", ... = parameters.to.store)
  if(is.null(object@calc.params$ICA$ic.genes)){
    object@calc.params$ICA$ic.genes <- rownames(data.use)
  }
  if(print.results){
    PrintDim(object = object, dims.print = ics.print, genes.print = genes.print,reduction.type = reduction.name)
  }
  return(object)
}


#' Run t-distributed Stochastic Neighbor Embedding
#'
#' Run t-SNE dimensionality reduction on selected features. Has the option of
#' running in a reduced dimensional space (i.e. spectral tSNE, recommended),
#' or running based on a set of genes. For details about stored TSNE calculation
#' parameters, see \code{PrintTSNEParams}.
#'
#' @param object Seurat object
#' @param reduction.use Which dimensional reduction (e.g. PCA, ICA) to use for
#' the tSNE. Default is PCA
#' @param cells.use Which cells to analyze (default, all cells)
#' @param dims.use Which dimensions to use as input features
#' @param genes.use If set, run the tSNE on this subset of genes
#' (instead of running on a set of reduced dimensions). Not set (NULL) by default
#' @param seed.use Random seed for the t-SNE
#' @param do.fast If TRUE, uses the Barnes-hut implementation, which runs
#' faster, but is less flexible. TRUE by default.
#' @param add.iter If an existing tSNE has already been computed, uses the
#' current tSNE to seed the algorithm and then adds additional iterations on top
#' of this
#' @param dim.embed The dimensional space of the resulting tSNE embedding
#' (default is 2). For example, set to 3 for a 3d tSNE
#' @param \dots Additional arguments to the tSNE call. Most commonly used is
#' perplexity (expected number of neighbors default is 30)
#' @param distance.matrix If set, runs tSNE on the given distance matrix
#' instead of data matrix (experimental)
#' @param reduction.name dimensional reduction name, specifies the position in the object$dr list. tsne by default
#' @param reduction.key dimensional reduction key, specifies the string before the number for the dimension names. tSNE_ by default
#'
#' @return Returns a Seurat object with a tSNE embedding in
#' object@@dr$tsne@cell.embeddings
#'
#' @importFrom Rtsne Rtsne
#' @importFrom tsne tsne
#'
#' @export
#'
RunTSNE <- function(
  object,
  reduction.use = "pca",
  cells.use = NULL,
  dims.use = 1:5,
  genes.use = NULL,
  seed.use = 1,
  do.fast = TRUE,
  add.iter = 0,
  dim.embed = 2,
  distance.matrix = NULL,
  reduction.name = "tsne",
  reduction.key = "tSNE_",
  ...
) {
  if (! is.null(x = distance.matrix)) {
    genes.use <- rownames(x = object@data)
  }
  if (is.null(x = genes.use)) {
    data.use <- GetDimReduction(
      object = object,
      reduction.type = reduction.use,
      slot = "cell.embeddings"
    )[, dims.use]
  }
  if (! is.null(x = genes.use)) {
      data.use <- t(PrepDR(
        object = object,
        genes.use = genes.use))
  }
  set.seed(seed = seed.use)
  if (do.fast) {
    if (is.null(x = distance.matrix)) {
      data.tsne <- Rtsne(X = as.matrix(x = data.use), dims = dim.embed, ...)
    } else {
      data.tsne <- Rtsne(
        X = as.matrix(x = distance.matrix),
        dims = dim.embed,
        is_distance=TRUE
      )
    }
    data.tsne <- data.tsne$Y
  } else {
    data.tsne <- tsne(X = data.use, k = dim.embed, ...)
  }
  if (add.iter > 0) {
    data.tsne <- tsne(
      X = data.use,
      initial_config = as.matrix(x = data.tsne),
      max_iter = add.iter,
      ...
    )
  }
  colnames(x = data.tsne) <- paste0(reduction.key, 1:ncol(x = data.tsne))
  rownames(x = data.tsne) <- rownames(x = data.use)
  object <- SetDimReduction(
    object = object,
    reduction.type = reduction.name,
    slot = "cell.embeddings",
    new.data = data.tsne
  )
  object <- SetDimReduction(
    object = object,
    reduction.type = reduction.name,
    slot = "key",
    new.data = reduction.key
  )
  parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("RunTSNE"))]
  object <- SetCalcParams(object = object, calculation = "RunTSNE", ... = parameters.to.store)
  if(!is.null(GetCalcParam(object = object, calculation = "RunTSNE", parameter = "genes.use"))){
    object@calc.params$RunTSNE$genes.use <- colnames(data.use)
    object@calc.params$RunTSNE$cells.use <- rownames(data.use)
  }
  return(object)
}

#' Project Dimensional reduction onto full dataset
#'
#' Takes a pre-computed dimensional reduction (typically calculated on a subset
#' of genes) and projects this onto the entire dataset (all genes). Note that
#' the cell loadings will remain unchanged, but now there are gene loadings for
#' all genes.
#'
#'
#' @param object Seurat object
#' @param reduction.type Reduction to use
#' @param dims.print Number of dims to print genes for
#' @param dims.store Number of dims to store (default is 30)
#' @param genes.print Number of genes with highest/lowest loadings to print for
#' each PC
#' @param replace.dim Replace the existing data (overwrite
#' object@@dr$XXX@gene.loadings), not done by default.
#' @param do.center Center the dataset prior to projection (should be set to TRUE)
#' @param do.print Print top genes associated with the projected dimensions
#' @param assay.type Data type, RNA by default. Can be changed for multimodal
#' datasets (i.e. project a PCA done on RNA, onto CITE-seq data)
#'
#' @return Returns Seurat object with the projected values in
#' object@@dr$XXX@gene.loadings.full
#'
#' @export
#'
ProjectDim <- function(
  object,
  reduction.type = "pca",
  dims.print = 1:5,
  dims.store = 30,
  genes.print = 30,
  replace.dim = FALSE,
  do.center = FALSE,
  do.print = TRUE,
  assay.type = "RNA"
) {
  if (! reduction.type %in% names(x = object@dr)) {
    stop(paste(reduction.type, "dimensional reduction has not been computed"))
  }
  data.use <- GetAssayData(
    object = object,
    assay.type = assay.type,
    slot = "scale.data"
  )
  if (do.center) {
    data.use <- scale(x = as.matrix(x = data.use), center = TRUE, scale = FALSE)
  }
  cell.embeddings <- GetDimReduction(
    object = object,
    reduction.type = reduction.type,
    slot = "cell.embeddings"
  )
  new.gene.loadings.full <- FastMatMult(m1 = data.use, m2 = cell.embeddings)
  rownames(x = new.gene.loadings.full) <- rownames(x = data.use)
  colnames(x = new.gene.loadings.full) <- colnames(x = cell.embeddings)
  object <- SetDimReduction(
    object = object,
    reduction.type = reduction.type,
    slot = "gene.loadings.full",
    new.data = new.gene.loadings.full
  )
  if (replace.dim) {
    object <- SetDimReduction(
      object = object,
      reduction.type = reduction.type,
      slot = "gene.loadings",
      new.data = new.gene.loadings.full
    )
  }
  if (do.print) {
    PrintDim(
      object = object,
      reduction.type = reduction.type,
      genes.print = genes.print,
      use.full = TRUE,
      dims.print = dims.print
    )
  }
  return(object)
}


#' Project Principal Components Analysis onto full dataset
#'
#' Takes a pre-computed PCA (typically calculated on a subset of genes) and
#' projects this onto the entire dataset (all genes). Note that the cell
#' loadings remains unchanged, but now there are gene loading scores for all
#' genes.
#'
#' @param object Seurat object
#' @param do.print Print top genes associated with the projected PCs
#' @param pcs.print Number of PCs to print genes for
#' @param pcs.store Number of PCs to store (default is 30)
#' @param genes.print Number of genes with highest/lowest loadings to print for
#' each PC
#' @param replace.pc Replace the existing PCA (overwite
#' object@@dr$pca@gene.loadings), not done by default.
#' @param do.center Center the dataset prior to projection (should be set to TRUE)
#'
#' @return Returns Seurat object with the projected PCA values in
#' object@@dr$pca@gene.loadings.full
#'
#' @export
#'
ProjectPCA <- function(
  object,
  do.print = TRUE,
  pcs.print = 1:5,
  pcs.store = 30,
  genes.print = 30,
  replace.pc = FALSE,
  do.center = FALSE
) {
  return(ProjectDim(
    object,
    reduction.type = "pca",
    dims.print = pcs.print,
    genes.print = 30,
    replace.dim = replace.pc,
    do.center = do.center,
    do.print = do.print,
    dims.store = pcs.store
  ))
}

#' Perform Canonical Correlation Analysis
#'
#' Runs a canonical correlation analysis using a diagonal implementation of CCA.
#' For details about stored CCA calculation parameters, see
#' \code{PrintCCAParams}.
#'
#' @param object Seurat object
#' @param object2 Optional second object. If object2 is passed, object1 will be
#' considered as group1 and object2 as group2.
#' @param group1 First set of cells (or IDs) for CCA
#' @param group2 Second set of cells (or IDs) for CCA
#' @param group.by Factor to group by (column vector stored in object@@meta.data)
#' @param num.cc Number of canonical vectors to calculate
#' @param genes.use Set of genes to use in CCA. Default is object@@var.genes. If
#' two objects are given, the default is the union of both variable gene sets
#' that are also present in both objects.
#' @param scale.data Use the scaled data from the object
#' @param rescale.groups Rescale each set of cells independently
#' @return Returns Seurat object with the CCA stored in the @@dr$cca slot. If
#' one object is passed, the same object is returned. If two are passed, a
#' combined object is returned.
#' @export
RunCCA <- function(
  object,
  object2,
  group1,
  group2,
  group.by,
  num.cc = 20,
  genes.use,
  scale.data = TRUE,
  rescale.groups = FALSE
) {
  if (! missing(x = object2) && (! missing(x = group1) || ! missing(x = group2))) {
    warning("Both object2 and group set. Continuing with objects defining the groups")
  }
  if (! missing(x = object2)) {
    if (missing(x = genes.use)) {
      genes.use <- union(x = object@var.genes, y = object2@var.genes)
      if (length(x = genes.use) == 0) {
        stop("No variable genes present. Run MeanVarPlot and retry")
      }
    }
    if (scale.data) {
      possible.genes <- intersect(
        x = rownames(x = object@scale.data),
        y = rownames(x = object2@scale.data)
      )
      genes.use <- genes.use[genes.use %in% possible.genes]
      data.use1 <- object@scale.data[genes.use, ]
      data.use2 <- object2@scale.data[genes.use, ]
    } else {
      possible.genes <- intersect(
        x = rownames(object@data),
        y = rownames(object2@data)
      )
      genes.use <- genes.use[genes.use %in% possible.genes]
      data.use1 <- object@data[genes.use, ]
      data.use2 <- object2@data[genes.use, ]
    }
    if (length(x = genes.use) == 0) {
      stop("0 valid genes in genes.use")
    }
  } else {
    if (missing(x = group1)) {
      stop("group1 not set")
    }
    if (missing(x = group2)) {
      stop("group2 not set")
    }
    if (! missing(x = group.by)) {
      if (! group.by %in% colnames(x = object@meta.data)) {
        stop("invalid group.by parameter")
      }
    }
    if (missing(x = genes.use)) {
      genes.use <- object@var.genes
      if (length(x = genes.use) == 0) {
        stop("No variable genes present. Run MeanVarPlot and retry")
      }
    }
    if (missing(x = group.by)) {
      cells.1 <- CheckGroup(object = object, group = group1, group.id = "group1")
      cells.2 <- CheckGroup(object = object, group = group2, group.id = "group2")
    } else {
      object.current.ids <- object@ident
      object <- SetAllIdent(object = object, id = group.by)
      cells.1 <- CheckGroup(object = object, group = group1, group.id = "group1")
      cells.2 <- CheckGroup(object = object, group = group2, group.id = "group2")
      object <- SetIdent(
        object = object,
        cells.use = object@cell.names,
        ident.use = object.current.ids
      )
    }
    if (scale.data) {
      if (rescale.groups) {
        data.use1 <- ScaleData(
          object = object,
          data.use = object@data[genes.use, cells.1]
        )
        data.use1 <- data.use1@scale.data
        data.use2 <- ScaleData(
          object = object,
          data.use = object@data[genes.use, cells.2]
        )
        data.use2 <- data.use2@scale.data
      } else {
        data.use1 <- object@scale.data[genes.use, cells.1]
        data.use2 <- object@scale.data[genes.use, cells.2]
      }
    } else {
      data.use1 <- object@data[genes.use, cells.1]
      data.use2 <- object@data[genes.use, cells.2]
    }
  }
  genes.use <- CheckGenes(data.use = data.use1, genes.use = genes.use)
  genes.use <- CheckGenes(data.use = data.use2, genes.use = genes.use)
  data.use1 <- data.use1[genes.use, ]
  data.use2 <- data.use2[genes.use, ]

  cat("Running CCA\n", file = stderr())

  cca.results <- CanonCor(
      mat1 = data.use1,
      mat2 = data.use2,
      standardize = TRUE,
      k = num.cc
  )
  cca.data <- rbind(cca.results$u, cca.results$v)
  colnames(x = cca.data) <- paste0("CC", 1:num.cc)
  if (! missing(x = object2)) {
    cat("Merging objects\n", file = stderr())
    combined.object <- MergeSeurat(
      object1 = object,
      object2 = object2,
      do.scale = FALSE,
      do.center = FALSE
    )
    # to improve, to pull the same normalization and scale params as previously used
    combined.object <- ScaleData(object = combined.object)
    combined.object@scale.data[is.na(x = combined.object@scale.data)] <- 0
    combined.object@var.genes <- genes.use
    rownames(cca.data) <- colnames(combined.object@data)
    combined.object <- SetDimReduction(
      object = combined.object,
      reduction.type = "cca",
      slot = "cell.embeddings",
      new.data = cca.data
    )
    combined.object <- SetDimReduction(
      object = combined.object,
      reduction.type = "cca",
      slot = "key",
      new.data = "CC"
    )
    combined.object <- ProjectDim(
      object = combined.object,
      reduction.type = "cca",
      do.print = FALSE
    )
    combined.object <- SetDimReduction(
      object = combined.object,
      reduction.type = "cca",
      slot = "gene.loadings",
      new.data = GetGeneLoadings(
        object = combined.object,
        reduction.type = "cca",
        use.full = TRUE,
        genes.use = genes.use
      )
    )
    parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("RunCCA"))]
    combined.object <- SetCalcParams(
      object = combined.object,
      calculation = "RunCCA",
      ... = parameters.to.store
    )
    combined.object <- SetSingleCalcParam(
      object = combined.object,
      calculation = "RunCCA",
      parameter = "object.project",
      value = object@project.name
    )
    combined.object <- SetSingleCalcParam(
      object = combined.object,
      calculation = "RunCCA",
      parameter = "object2.project",
      value = object2@project.name
    )
    return(combined.object)
  } else {
    object <- SetDimReduction(
      object = object,
      reduction.type = "cca",
      slot = "cell.embeddings",
      new.data = cca.data
    )
    object <- SetDimReduction(
      object = object,
      reduction.type = "cca",
      slot = "key",
      new.data = "CC"
    )

    object <- ProjectDim(
      object = object,
      reduction.type = "cca",
      do.print = FALSE
    )
    object@scale.data[is.na(x = object@scale.data)] <- 0
    parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("RunCCA"))]
    object <- SetCalcParams(
      object = object,
      calculation = "RunCCA",
      ... = parameters.to.store
    )
    return(object)
  }
}

#' Calculate the ratio of variance explained by ICA or PCA to CCA
#'
#' @param object Seurat object
#' @param reduction.type type of dimensional reduction to compare to CCA (pca,
#' pcafast, ica)
#' @param grouping.var variable to group by
#' @param dims.use Vector of dimensions to project onto (default is the 1:number
#'  stored for cca)
#'
#' @return Returns Seurat object with ratio of variance explained stored in
#' object@@meta.data$var.ratio
#' @export
#'
CalcVarExpRatio <- function(
  object,
  reduction.type = "pca",
  grouping.var,
  dims.use
) {
  if (missing(x = grouping.var)) {
    stop("Need to provide grouping variable")
  }
  if (missing(x = dims.use)) {
    dims.use <- 1:ncol(x = GetCellEmbeddings(object = object, reduction.type = "cca"))
  }
  parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("CalcVarExpRatio"))]
  object <- SetCalcParams(object = object,
                          calculation = "CalcVarExpRatio",
                          ... = parameters.to.store)
  groups <- as.vector(x = unique(x = FetchData(
    object = object,
    vars.all = grouping.var
  )[, 1]))
  genes.use <- rownames(x = GetGeneLoadings(object = object, reduction.type = "cca"))
  var.ratio <- data.frame()
  for (group in groups) {
    cat(paste("Calculating for", group, "\n"), file = stderr())
    group.cells <- WhichCells(
      object = object,
      subset.name = grouping.var,
      accept.value = group
    )
    cat(paste("\t Separating", group, "cells\n"), file = stderr())
    group.object <- SubsetData(object = object, cells.use = group.cells)
    cat("\t Running Dimensional Reduction \n", file = stderr())
    ldp.cca <- CalcLDProj(
      object = group.object,
      reduction.type = "cca",
      dims.use = dims.use,
      genes.use = genes.use
    )
    group.object <- CalcProjectedVar(
      object = group.object,
      low.dim.data = ldp.cca,
      reduction.type = "cca",
      dims.use = dims.use,
      genes.use = genes.use
    )
    if (reduction.type == "pca") {
      temp.matrix=PrepDR(group.object,genes.use = genes.use)
      group.object <- RunPCA(
        object = group.object,
        pc.genes = genes.use,
        do.print = FALSE,
        center=rowMeans(temp.matrix)
      )
      ldp.pca <- CalcLDProj(
        object = group.object,
        reduction.type = "pca",
        dims.use = dims.use,
        genes.use = genes.use
      )
      group.object <- CalcProjectedVar(
        object = group.object,
        low.dim.data = ldp.pca,
        reduction.type = "pca",
        dims.use = dims.use,
        genes.use = genes.use
      )
      group.var.ratio <- group.object@meta.data[, "cca.var", drop = FALSE] /
        group.object@meta.data[, "pca.var", drop = FALSE]
    } else if (reduction.type == "ica") {
      group.object <- RunICA(
        object = group.object,
        ic.genes = genes.use,
        print.results = FALSE
      )
      ldp.ica <- CalcLDProj(
        object = group.object,
        reduction.type = "ica",
        dims.use = dims.use,
        genes.use = genes.use
      )
      group.object <- CalcProjectedVar(
        object = group.object,
        low.dim.data = ldp.ica,
        reduction.type = "ica",
        dims.use = dims.use,
        genes.use = genes.use
      )
      group.var.ratio <- group.object@meta.data[, "cca.var", drop = FALSE] /
        group.object@meta.data[, "ica.var", drop = FALSE]
    } else {
        stop(paste("reduction.type", reduction.type, "not supported"))
    }
    var.ratio <- rbind(var.ratio, group.var.ratio)
  }
  var.ratio$cell.name <- rownames(x = var.ratio)
  eval(expr = parse(text = paste0(
    "object@meta.data$var.ratio.",
    reduction.type,
    "<- NULL"
  )))
  colnames(x = var.ratio) <- c(
    paste0("var.ratio.", reduction.type),
    "cell.name"
  )
  object@meta.data$cell.name <- rownames(x = object@meta.data)
  object@meta.data <- merge(x = object@meta.data, y = var.ratio, by = "cell.name")
  rownames(x = object@meta.data) <- object@meta.data$cell.name
  object@meta.data$cell.name <- NULL
  return(object)
}

#' Align subspaces using dynamic time warping (DTW)
#'
#' Aligns subspaces so that they line up across grouping variable (only
#' implemented for case with 2 categories in grouping.var)
#'
#'
#' @param object Seurat object
#' @param reduction.type reduction to align scores for
#' @param grouping.var Name of the grouping variable for which to align the scores
#' @param dims.align Dims to align, default is all
#' @param num.genes Number of genes to use in construction of "metagene"
#' @param show.plots show debugging plots
#' @param \dots Additional parameters to ScaleData
#'
#' @return Returns Seurat object with the dims aligned, stored in
#'  object@@dr$reduction.type.aligned
#'
#' @importFrom dtw dtw
#' @importFrom graphics points
#' @importFrom stats quantile density
#' @importFrom pbapply pbapply
#' @importFrom graphics par plot lines
#'
#' @export
#'
AlignSubspace <- function(
  object,
  reduction.type,
  grouping.var,
  dims.align,
  num.genes = 30,
  show.plots = FALSE,
  ...
) {
  parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("AlignSubspace"))]
  object <- SetCalcParams(
    object = object,
    calculation = paste0("AlignSubspace.", reduction.type),
    ... = parameters.to.store
  )
  ident.orig <- object@ident
  object <- SetAllIdent(object = object, id = grouping.var)
  levels.split <- names(x = sort(x = table(object@ident)))
  if (length(x = levels.split) != 2) {
    stop(paste0(
      "There are not two options for ",
      grouping.var,
      ". \n Current groups include: ",
      paste(levels.split, collapse = ", ")
    ))
  }
  objects <- list(
    SubsetData(object = object, ident.use = levels.split[1]),
    SubsetData(object = object, ident.use = levels.split[2])
  )
  object@ident <- ident.orig
  cc.loadings <- list()
  scaled.data <- list()
  cc.embeds <- list()
  for (i in 1:2) {
    cat(paste0("Rescaling group ", i, "\n"), file = stderr())
    objects[[i]] <- ScaleData(object = objects[[i]], ...)
    objects[[i]]@scale.data[is.na(x = objects[[i]]@scale.data)] <- 0
    objects[[i]] <- ProjectDim(
      object = objects[[i]],
      reduction.type = reduction.type,
      do.print = FALSE
    )
    cc.loadings[[i]] <- GetGeneLoadings(
      object = objects[[i]],
      reduction.type = reduction.type,
      use.full = TRUE
    )
    cc.embeds[[i]] <- GetCellEmbeddings(
      object = objects[[i]],
      reduction.type = reduction.type
    )
    scaled.data[[i]] <- objects[[i]]@scale.data
  }
  cc.embeds.both <- GetCellEmbeddings(object = object, reduction.type = reduction.type)
  colnames(cc.embeds.both) <- paste0("A", colnames(x = cc.embeds.both))
  cc.embeds.orig <- cc.embeds.both
  for (cc.use in dims.align) {
    cat(paste0("Aligning dimension ", cc.use, "\n"), file = stderr())
    genes.rank <- data.frame(
      rank(x = abs(x = cc.loadings[[1]][, cc.use])),
      rank(x = abs(x = cc.loadings[[2]][, cc.use])),
      cc.loadings[[1]][, cc.use],
      cc.loadings[[2]][, cc.use]
    )
    genes.rank$min <- apply(X = genes.rank[,1:2], MARGIN = 1, FUN = min)

    genes.rank <- genes.rank[order(genes.rank$min, decreasing = TRUE), ]
    genes.top <- rownames(x = genes.rank)[1:200]
    bicors <- list()
    for (i in 1:2) {
      cc.vals <- cc.embeds[[i]][, cc.use]
      bicors[[i]] <- pbsapply(
        X = genes.top,
        FUN = function(x) {
          return(BiweightMidcor(x = cc.vals, y = scaled.data[[i]][x, ]))
        }
      )
    }
    genes.rank <- data.frame(
      rank(x = abs(x = bicors[[1]])),
      rank(x = abs(x = bicors[[2]])),
      bicors[[1]],
      bicors[[2]]
    )
    genes.rank$min <- apply(X = abs(x = genes.rank[, 1:2]), MARGIN = 1, FUN = min)
    genes.rank <- genes.rank[order(genes.rank$min, decreasing = TRUE), ]
    genes.use <- rownames(x = genes.rank)[1:num.genes]

    genes.rank$min2 <- apply(X = abs(x = genes.rank[, 3:4]), MARGIN = 1, FUN = min)
    genes.subset=subset(genes.rank,min2>0.15)
    nGene=nrow(genes.subset)
    print(paste0("CC ", cc.use, ": ", dim(genes.subset)[1], " genes"))

    metagenes <- list()
    multvar.data <- list()
    for (i in 1:2) {
      scaled.use <- sweep(
        x = scaled.data[[i]][genes.use, ],
        MARGIN = 1,
        STATS = sign(x = genes.rank[genes.use, i + 2]),
        FUN = "*"
      )
      scaled.use <- scaled.use[, names(x = sort(x = cc.embeds[[i]][, cc.use]))]
      metagenes[[i]] <- apply(
        X = scaled.use[genes.use, ],
        MARGIN = 2,
        FUN = mean,
        remove.na = TRUE
      )
      metagenes[[i]] <- (
        cc.loadings[[i]][genes.use, cc.use] %*% scaled.data[[i]][genes.use, ]
      )[1, colnames(x = scaled.use)]
    }

    mean.difference <- mean(x = ReferenceRange(x = metagenes[[1]])) -
      mean(x = ReferenceRange(x = metagenes[[2]]))
    metric.use <- "Euclidean"
    align.1 <- ReferenceRange(x = metagenes[[1]])
    align.2 <- ReferenceRange(x = metagenes[[2]])
    a1q <- sapply(
      X = seq(from = 0, to = 1, by = 0.001),
      FUN = function(x) {
        return(quantile(x = align.1, probs = x))
      }
    )
    a2q <- sapply(
      X = seq(from = 0, to = 1, by = 0.001),
      FUN = function(x) {
        quantile(x = align.2, probs = x)
      }
    )
    iqr <- (a1q - a2q)[100:900]
    iqr.x <- which.min(x = abs(x = iqr))
    iqrmin <- iqr[iqr.x]
    if (show.plots) {
      print(iqrmin)
    }
    align.2 <- align.2 + iqrmin
    alignment <- dtw(
      x = align.1,
      y = align.2,
      keep = TRUE,
      dist.method = metric.use
    )
    alignment.map <- data.frame(alignment$index1, alignment$index2)
    alignment.map$cc_data1 <- sort(cc.embeds[[1]][, cc.use])[alignment$index1]
    alignment.map$cc_data2 <- sort(cc.embeds[[2]][, cc.use])[alignment$index2]
    alignment.map.orig <- alignment.map
    alignment.map <- alignment.map[! duplicated(x = alignment.map$alignment.index1), ]
    cc.embeds.both[names(x = sort(x = cc.embeds[[1]][, cc.use])), cc.use] <- alignment.map$cc_data2
    if (show.plots) {
      par(mfrow = c(3, 2))
      plot(x = ReferenceRange(x = metagenes[[1]]), main = cc.use)
      plot(x = ReferenceRange(x = metagenes[[2]]))
      plot(
        x = ReferenceRange(x = metagenes[[1]])[(alignment.map.orig$alignment.index1)],
        pch = 16
      )
      points(
        x = ReferenceRange(metagenes[[2]])[(alignment.map.orig$alignment.index2)],
        col = "red",
        pch = 16,
        cex = 0.4
      )
      plot(x = density(x = alignment.map$cc_data2))
      lines(x = density(x = sort(x = cc.embeds[[2]][, cc.use])), col = "red")
      plot(x = alignment.map.orig$cc_data1)
      points(x = alignment.map.orig$cc_data2, col = "red")
    }
  }
  new.type <- paste0(reduction.type, ".aligned")
  new.key <- paste0(
    "A",
    GetDimReduction(
      object = object,
      reduction.type = reduction.type,
      slot = "key"
    )
  )
  object <- SetDimReduction(
    object = object,
    reduction.type = new.type,
    slot = "cell.embeddings",
    new.data = scale(x = cc.embeds.both)
  )
  object <- SetDimReduction(
    object = object,
    reduction.type = new.type,
    slot = "key",
    new.data = new.key
  )
  return(object)
}

#' Run diffusion map
#'
#' @param object Seurat object
#' @param cells.use Which cells to analyze (default, all cells)
#' @param dims.use Which dimensions to use as input features
#' @param genes.use If set, run the diffusion map procedure on this subset of
#' genes (instead of running on a set of reduced dimensions). Not set (NULL) by
#' default
#' @param reduction.use Which dimensional reduction (PCA or ICA) to use for the
#' diffusion map. Default is PCA
#' @param q.use Quantile to use
#' @param max.dim Max dimension to keep from diffusion calculation
#' @param scale.clip Max/min value for scaled data. Default is 3
#' @param reduction.name dimensional reduction name, specifies the position in the object$dr list. dm by default
#' @param reduction.key dimensional reduction key, specifies the string before the number for the dimension names. DM by default
#' @param ... Additional arguments to the diffuse call
#'
#' @return Returns a Seurat object with a diffusion map
#'
#' @import diffusionMap
#' @importFrom stats dist
#'
#' @export
#'
RunDiffusion <- function(
  object,
  cells.use = NULL,
  dims.use = 1:5,
  genes.use = NULL,
  reduction.use = 'pca',
  q.use = 0.01,
  max.dim = 2,
  scale.clip = 10,
  reduction.name = "dm",
  reduction.key = "DM",
  ...
) {
  cells.use <- SetIfNull(x = cells.use, default = colnames(x = object@data))
  if (is.null(x = genes.use)) {
    dim.code <- GetDimReduction(
      object = object,
      reduction.type = reduction.use,
      slot = 'key'
    )
    dim.codes <- paste0(dim.code, dims.use)
    data.use <- FetchData(object = object, vars.all = dim.codes)
  }
  if (! is.null(x = genes.use)) {
    genes.use <- intersect(x = genes.use, y = rownames(x = object@scale.data))
    data.use <- MinMax(
      data = t(x = object@data[genes.use, cells.use]),
      min = -1 * scale.clip,
      max = scale.clip
    )
  }
  parameters.to.store <- as.list(environment(), all = TRUE)[names(formals("RunDiffusion"))]
  object <- SetCalcParams(object = object,
                          calculation = "RunDiffusion",
                          ... = parameters.to.store)
  data.dist <- dist(data.use)
  data.diffusion <- data.frame(
    diffuse(
      D = data.dist,
      neigen = max.dim,
      maxdim = max.dim,
      ...
    )$X
  )
  colnames(x = data.diffusion) <- paste0(reduction.key, 1:ncol(x = data.diffusion))
  rownames(x = data.diffusion) <- cells.use
  for (i in 1:max.dim) {
    x <- data.diffusion[,i]
    x <- MinMax(
      data = x,
      min = quantile(x = x, probs = q.use),
      quantile(x = x, probs = 1-q.use)
    )
    data.diffusion[, i] <- x
  }
  object <- SetDimReduction(
    object = object,
    reduction.type = reduction.use,
    slot = "cell.embeddings",
    new.data = as.matrix(x = data.diffusion)
  )
  object <- SetDimReduction(
    object = object,
    reduction.type = reduction.use,
    slot = "key",
    new.data = "DM"
  )
  return(object)
}

#' Run Canonical Correlation Analysis (CCA) on multimodal data
#'
#' CCA finds a shared correlation structure betwen two different datasets, enabling integrated downstream analysis
#'
#' @param object Seurat object
#' @param assay.1 First assay for multimodal analysis. Default is RNA
#' @param assay.2 Second assay for multimodal analysis. Default is CITE for CITE-Seq analysis.
#' @param features.1 Features of assay 1 to consider (default is variable genes)
#' @param features.2 Features of assay 2 to consider (default is all features, i.e. for CITE-Seq, all antibodies)
#' @param num.cc Number of canonical correlations to compute and store. Default is 20, but will calculate less if either assay has <20 features.
#' @param normalize.variance Z-score the embedding of each CC to 1, so each CC contributes equally in downstream analysis (default is T)
#'
#' @importFrom PMA CCA
#'
#' @return Returns object after CCA, with results stored in dimensional reduction cca.assay1 (ie. cca.RNA) and cca.assay2. For example, results can be visualized using DimPlot(object,reduction.use="cca.RNA")
#'
#' @export
#'
MultiModal_CCA <- function(
  object,
  assay.1 = "RNA",
  assay.2 = "CITE",
  features.1 = NULL,
  features.2 = NULL,
  num.cc = 20,
  normalize.variance = TRUE
) {
  #first pull out data, define features
  data.1 <- GetAssayData(
    object = object,
    assay.type = assay.1,
    slot = "scale.data"
  )
  data.2 <- GetAssayData(
    object = object,
    assay.type = assay.2,
    slot = "scale.data"
  )
  if (is.null(x = features.1)) {
    if ((assay.1 == "RNA") && length(x = object@var.genes) > 0) {
      features.1 <- object@var.genes
    } else {
      features.1 <- rownames(x = data.1)
    }
  }
  features.2 <- SetIfNull(x = features.2, default = rownames(x = data.2))
  data.1 <- t(x = data.1[features.1, ])
  data.2 <- t(x = data.2[features.2, ])
  #data.1.var=apply(data.1,2,var)
  #data.2.var=apply(data.2,2,var)
  data.1 <- data.1[,apply(data.1,2,var)>0]
  data.2 <- data.2[,apply(data.2,2,var)>0]

  num.cc <- min(20, min(ncol(data.1), ncol(data.2)))
  cca.data <- list(data.1, data.2)
  names(x = cca.data) <- c(assay.1, assay.2)
  # now run CCA
  out <- CCA(
    x = cca.data[[1]],
    z = cca.data[[2]],
    typex = "standard",
    typez = "standard",
    K = num.cc,
    penaltyz = 1,
    penaltyx = 1
  )
  cca.output <- list(out$u, out$v)
  embeddings.cca <- list()
  for (i in 1:length(x = cca.data)) {
    assay.use <- names(x = cca.data)[i]
    rownames(x = cca.output[[i]]) <- colnames(x = cca.data[[i]])
    embeddings.cca[[i]] <- cca.data[[i]] %*% cca.output[[i]]
    colnames(x = embeddings.cca[[i]]) <- paste0(
      assay.use,
      "CC",
      1:ncol(x = embeddings.cca[[i]])
    )
    colnames(x = cca.output[[i]]) <- colnames(x = embeddings.cca[[i]])
    if (normalize.variance) {
      embeddings.cca[[i]] <- scale(x = embeddings.cca[[i]])
    }
    object <- SetDimReduction(
      object = object,
      reduction.type = paste0(assay.use, "CCA"),
      slot = "cell.embeddings",
      new.data = embeddings.cca[[i]]
    )
    object <- SetDimReduction(
      object = object,
      reduction.type = paste0(assay.use, "CCA"),
      slot = "key",
      new.data = paste0(assay.use, "CC")
    )
    object <- SetDimReduction(
      object = object,
      reduction.type = paste0(assay.use, "CCA"),
      slot = "gene.loadings",
      new.data =  cca.output[[i]]
    )
  }
  return(object)
}
mayer-lab/SeuratForMayer2018 documentation built on May 25, 2019, 9:34 p.m.