R/utilities.R

#' @include generics.R
#'
NULL

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Functions
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

#' Calculate module scores for feature expression programs in single cells
#'
#' Calculate the average expression levels of each program (cluster) at the
#' single-cell level, subtracting the aggregated expression of control feature sets.
#' All analyzed features are binned based on averaged expression, and the
#' control features are randomly selected from each bin.
#'
#' @param object Seurat object
#' @param features A list of vectors of features for expression programs; each
#' entry should be a vector of feature names
#' @param pool List of features to check expression levels against, defaults to
#' \code{rownames(x = object)}
#' @param nbin Number of bins of aggregate expression levels for all
#' analyzed features
#' @param ctrl Number of control features selected from the same bin per
#' analyzed feature
#' @param k Use feature clusters returned from DoKMeans
#' @param assay Name of assay to use
#' @param name Name for the expression programs; will append a number to the
#' end for each entry in \code{features} (e.g. if \code{features} has three
#' programs, the results will be stored as \code{name1}, \code{name2},
#' \code{name3}, respectively)
#' @param seed Set a random seed. If NULL, seed is not set.
#' @param search Search for symbol synonyms for features in \code{features} that
#' don't match features in \code{object}? Searches the HGNC's gene names
#' database; see \code{\link{UpdateSymbolList}} for more details
#' @param ... Extra parameters passed to \code{\link{UpdateSymbolList}}
#'
#' @return Returns a Seurat object with module scores added to object meta data;
#' each module is stored as \code{name#} for each module program present in
#' \code{features}
#'
#' @importFrom ggplot2 cut_number
#' @importFrom Matrix rowMeans colMeans
#'
#' @references Tirosh et al, Science (2016)
#'
#' @export
#' @concept utilities
#'
#' @examples
#' \dontrun{
#' data("pbmc_small")
#' cd_features <- list(c(
#'   'CD79B',
#'   'CD79A',
#'   'CD19',
#'   'CD180',
#'   'CD200',
#'   'CD3D',
#'   'CD2',
#'   'CD3E',
#'   'CD7',
#'   'CD8A',
#'   'CD14',
#'   'CD1C',
#'   'CD68',
#'   'CD9',
#'   'CD247'
#' ))
#' pbmc_small <- AddModuleScore(
#'   object = pbmc_small,
#'   features = cd_features,
#'   ctrl = 5,
#'   name = 'CD_Features'
#' )
#' head(x = pbmc_small[])
#' }
#'
AddModuleScore <- function(
  object,
  features,
  pool = NULL,
  nbin = 24,
  ctrl = 100,
  k = FALSE,
  assay = NULL,
  name = 'Cluster',
  seed = 1,
  search = FALSE,
  ...
) {
  if (!is.null(x = seed)) {
    set.seed(seed = seed)
  }
  assay.old <- DefaultAssay(object = object)
  assay <- assay %||% assay.old
  DefaultAssay(object = object) <- assay
  assay.data <- GetAssayData(object = object)
  features.old <- features
  if (k) {
    .NotYetUsed(arg = 'k')
    features <- list()
    for (i in as.numeric(x = names(x = table(object@kmeans.obj[[1]]$cluster)))) {
      features[[i]] <- names(x = which(x = object@kmeans.obj[[1]]$cluster == i))
    }
    cluster.length <- length(x = features)
  } else {
    if (is.null(x = features)) {
      stop("Missing input feature list")
    }
    features <- lapply(
      X = features,
      FUN = function(x) {
        missing.features <- setdiff(x = x, y = rownames(x = object))
        if (length(x = missing.features) > 0) {
          warning(
            "The following features are not present in the object: ",
            paste(missing.features, collapse = ", "),
            ifelse(
              test = search,
              yes = ", attempting to find updated synonyms",
              no = ", not searching for symbol synonyms"
            ),
            call. = FALSE,
            immediate. = TRUE
          )
          if (search) {
            tryCatch(
              expr = {
                updated.features <- UpdateSymbolList(symbols = missing.features, ...)
                names(x = updated.features) <- missing.features
                for (miss in names(x = updated.features)) {
                  index <- which(x == miss)
                  x[index] <- updated.features[miss]
                }
              },
              error = function(...) {
                warning(
                  "Could not reach HGNC's gene names database",
                  call. = FALSE,
                  immediate. = TRUE
                )
              }
            )
            missing.features <- setdiff(x = x, y = rownames(x = object))
            if (length(x = missing.features) > 0) {
              warning(
                "The following features are still not present in the object: ",
                paste(missing.features, collapse = ", "),
                call. = FALSE,
                immediate. = TRUE
              )
            }
          }
        }
        return(intersect(x = x, y = rownames(x = object)))
      }
    )
    cluster.length <- length(x = features)
  }
  if (!all(LengthCheck(values = features))) {
    warning(paste(
      'Could not find enough features in the object from the following feature lists:',
      paste(names(x = which(x = !LengthCheck(values = features)))),
      'Attempting to match case...'
    ))
    features <- lapply(
      X = features.old,
      FUN = CaseMatch,
      match = rownames(x = object)
    )
  }
  if (!all(LengthCheck(values = features))) {
    stop(paste(
      'The following feature lists do not have enough features present in the object:',
      paste(names(x = which(x = !LengthCheck(values = features)))),
      'exiting...'
    ))
  }
  pool <- pool %||% rownames(x = object)
  data.avg <- Matrix::rowMeans(x = assay.data[pool, ])
  data.avg <- data.avg[order(data.avg)]
  data.cut <- cut_number(x = data.avg + rnorm(n = length(data.avg))/1e30, n = nbin, labels = FALSE, right = FALSE)
  #data.cut <- as.numeric(x = Hmisc::cut2(x = data.avg, m = round(x = length(x = data.avg) / (nbin + 1))))
  names(x = data.cut) <- names(x = data.avg)
  ctrl.use <- vector(mode = "list", length = cluster.length)
  for (i in 1:cluster.length) {
    features.use <- features[[i]]
    for (j in 1:length(x = features.use)) {
      ctrl.use[[i]] <- c(
        ctrl.use[[i]],
        names(x = sample(
          x = data.cut[which(x = data.cut == data.cut[features.use[j]])],
          size = ctrl,
          replace = FALSE
        ))
      )
    }
  }
  ctrl.use <- lapply(X = ctrl.use, FUN = unique)
  ctrl.scores <- matrix(
    data = numeric(length = 1L),
    nrow = length(x = ctrl.use),
    ncol = ncol(x = object)
  )
  for (i in 1:length(ctrl.use)) {
    features.use <- ctrl.use[[i]]
    ctrl.scores[i, ] <- Matrix::colMeans(x = assay.data[features.use, ])
  }
  features.scores <- matrix(
    data = numeric(length = 1L),
    nrow = cluster.length,
    ncol = ncol(x = object)
  )
  for (i in 1:cluster.length) {
    features.use <- features[[i]]
    data.use <- assay.data[features.use, , drop = FALSE]
    features.scores[i, ] <- Matrix::colMeans(x = data.use)
  }
  features.scores.use <- features.scores - ctrl.scores
  rownames(x = features.scores.use) <- paste0(name, 1:cluster.length)
  features.scores.use <- as.data.frame(x = t(x = features.scores.use))
  rownames(x = features.scores.use) <- colnames(x = object)
  object[[colnames(x = features.scores.use)]] <- features.scores.use
  CheckGC()
  DefaultAssay(object = object) <- assay.old
  return(object)
}

#' Aggregated feature expression by identity class
#'
#' Returns aggregated (summed) expression values for each identity class
#'
#' If slot is set to 'data', this function assumes that the data has been log
#' normalized and therefore feature values are exponentiated prior to aggregating
#' so that sum is done in non-log space. Otherwise, if slot is set to
#' either 'counts' or 'scale.data', no exponentiation is performed prior to
#' aggregating.
#' If \code{return.seurat = TRUE} and slot is not 'scale.data', aggregated values
#' are placed in the 'counts' slot of the returned object and the log of aggregated values
#' are placed in the 'data' slot. \code{\link{ScaleData}} is then run on the default assay
#' before returning the object.
#' If \code{return.seurat = TRUE} and slot is 'scale.data', the 'counts' slot is left empty,
#' the 'data' slot is filled with NA, and 'scale.data' is set to the aggregated values.
#'
#' @param object Seurat object
#' @param assays Which assays to use. Default is all assays
#' @param features Features to analyze. Default is all features in the assay
#' @param return.seurat Whether to return the data as a Seurat object. Default is FALSE
#' @param group.by Categories for grouping (e.g., ident, replicate, celltype); 'ident' by default
#' @param add.ident (Deprecated) Place an additional label on each cell prior to pseudobulking
#' (very useful if you want to observe cluster pseudobulk values, separated by replicate, for example)
#' @param slot Slot(s) to use; if multiple slots are given, assumed to follow
#' the order of 'assays' (if specified) or object's assays
#' @param verbose Print messages and show progress bar
#' @param ... Arguments to be passed to methods such as \code{\link{CreateSeuratObject}}
#'
#' @return Returns a matrix with genes as rows, identity classes as columns.
#' If return.seurat is TRUE, returns an object of class \code{\link{Seurat}}.
#' @export
#' @concept utilities
#'
#' @examples
#' data("pbmc_small")
#' head(AggregateExpression(object = pbmc_small))
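#'
#' # A minimal additional sketch (not from the original documentation): return
#' # a Seurat object of summed counts per identity class instead of a matrix
#' \dontrun{
#' pbmc_agg <- AggregateExpression(object = pbmc_small, return.seurat = TRUE)
#' }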
#'
AggregateExpression <- function(
  object,
  assays = NULL,
  features = NULL,
  return.seurat = FALSE,
  group.by = 'ident',
  add.ident = NULL,
  slot = 'data',
  verbose = TRUE,
  ...
) {
  return(
    PseudobulkExpression(
      object = object,
      pb.method = 'aggregate',
      assays = assays,
      features = features,
      return.seurat = return.seurat,
      group.by = group.by,
      add.ident = add.ident,
      slot = slot,
      verbose = verbose,
      ...
    )
  )
}

#' Averaged feature expression by identity class
#'
#' Returns averaged expression values for each identity class
#'
#' If slot is set to 'data', this function assumes that the data has been log
#' normalized and therefore feature values are exponentiated prior to averaging
#' so that averaging is done in non-log space. Otherwise, if slot is set to
#' either 'counts' or 'scale.data', no exponentiation is performed prior to
#' averaging
#' If \code{return.seurat = TRUE} and slot is not 'scale.data', averaged values
#' are placed in the 'counts' slot of the returned object and the log of averaged values
#' are placed in the 'data' slot. \code{\link{ScaleData}} is then run on the default assay
#' before returning the object.
#' If \code{return.seurat = TRUE} and slot is 'scale.data', the 'counts' slot is left empty,
#' the 'data' slot is filled with NA, and 'scale.data' is set to the averaged values.
#'
#' @param object Seurat object
#' @param assays Which assays to use. Default is all assays
#' @param features Features to analyze. Default is all features in the assay
#' @param return.seurat Whether to return the data as a Seurat object. Default is FALSE
#' @param group.by Categories for grouping (e.g., ident, replicate, celltype); 'ident' by default
#' @param add.ident (Deprecated) Place an additional label on each cell prior to pseudobulking
#' (very useful if you want to observe cluster pseudobulk values, separated by replicate, for example)
#' @param slot Slot(s) to use; if multiple slots are given, assumed to follow
#' the order of 'assays' (if specified) or object's assays
#' @param verbose Print messages and show progress bar
#' @param ... Arguments to be passed to methods such as \code{\link{CreateSeuratObject}}
#'
#' @return Returns a matrix with genes as rows, identity classes as columns.
#' If return.seurat is TRUE, returns an object of class \code{\link{Seurat}}.
#' @export
#' @concept utilities
#'
#' @examples
#' data("pbmc_small")
#' head(AverageExpression(object = pbmc_small))
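#'
#' # An illustrative sketch (not from the original documentation): average
#' # expression per group defined by the 'groups' metadata column of pbmc_small
#' \dontrun{
#' head(AverageExpression(object = pbmc_small, group.by = "groups"))
#' }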
#'
AverageExpression <- function(
  object,
  assays = NULL,
  features = NULL,
  return.seurat = FALSE,
  group.by = 'ident',
  add.ident = NULL,
  slot = 'data',
  verbose = TRUE,
  ...
) {
  return(
    PseudobulkExpression(
      object = object,
      pb.method = 'average',
      assays = assays,
      features = features,
      return.seurat = return.seurat,
      group.by = group.by,
      add.ident = add.ident,
      slot = slot,
      verbose = verbose,
      ...
    )
  )
}

#' Match the case of character vectors
#'
#' @param search A vector of search terms
#' @param match A vector of characters whose case should be matched
#'
#' @return Values from search present in match with the case of match
#'
#' @export
#' @concept utilities
#'
#' @examples
#' data("pbmc_small")
#' cd_genes <- c('Cd79b', 'Cd19', 'Cd200')
#' CaseMatch(search = cd_genes, match = rownames(x = pbmc_small))
#'
CaseMatch <- function(search, match) {
  search.match <- sapply(
    X = search,
    FUN = function(s) {
      return(grep(
        pattern = paste0('^', s, '$'),
        x = match,
        ignore.case = TRUE,
        perl = TRUE,
        value = TRUE
      ))
    }
  )
  return(unlist(x = search.match))
}

#' Score cell cycle phases
#'
#' @param object A Seurat object
#' @param s.features A vector of features associated with S phase
#' @param g2m.features A vector of features associated with G2M phase
#' @param ctrl Number of control features selected from the same bin per
#' analyzed feature supplied to \code{\link{AddModuleScore}}.
#' Defaults to the minimum number of features
#' present in 's.features' and 'g2m.features'.
#' @param set.ident If true, sets identity to phase assignments.
#' Stashes old identities in 'old.ident'
#' @param ... Arguments to be passed to \code{\link{AddModuleScore}}
#'
#' @return A Seurat object with the following columns added to object meta data: S.Score, G2M.Score, and Phase
#'
#' @seealso \code{AddModuleScore}
#'
#' @export
#' @concept utilities
#'
#' @examples
#' \dontrun{
#' data("pbmc_small")
#' # pbmc_small doesn't have any cell-cycle genes
#' # To run CellCycleScoring, please use a dataset with cell-cycle genes
#' # An example is available at http://satijalab.org/seurat/cell_cycle_vignette.html
#' pbmc_small <- CellCycleScoring(
#'   object = pbmc_small,
#'   g2m.features = cc.genes$g2m.genes,
#'   s.features = cc.genes$s.genes
#' )
#' head(x = pbmc_small@meta.data)
#' }
#'
CellCycleScoring <- function(
  object,
  s.features,
  g2m.features,
  ctrl = NULL,
  set.ident = FALSE,
  ...
) {
  name <- 'Cell.Cycle'
  features <- list('S.Score' = s.features, 'G2M.Score' = g2m.features)
  if (is.null(x = ctrl)) {
    ctrl <- min(vapply(X = features, FUN = length, FUN.VALUE = numeric(length = 1)))
  }
  object.cc <- AddModuleScore(
    object = object,
    features = features,
    name = name,
    ctrl = ctrl,
    ...
  )
  cc.columns <- grep(pattern = name, x = colnames(x = object.cc[[]]), value = TRUE)
  cc.scores <- object.cc[[cc.columns]]
  rm(object.cc)
  CheckGC()
  assignments <- apply(
    X = cc.scores,
    MARGIN = 1,
    FUN = function(scores, first = 'S', second = 'G2M', null = 'G1') {
      if (all(scores < 0)) {
        return(null)
      } else {
        if (length(which(x = scores == max(scores))) > 1) {
          return('Undecided')
        } else {
          return(c(first, second)[which(x = scores == max(scores))])
        }
      }
    }
  )
  cc.scores <- merge(x = cc.scores, y = data.frame(assignments), by = 0)
  colnames(x = cc.scores) <- c('rownames', 'S.Score', 'G2M.Score', 'Phase')
  rownames(x = cc.scores) <- cc.scores$rownames
  cc.scores <- cc.scores[, c('S.Score', 'G2M.Score', 'Phase')]
  object[[colnames(x = cc.scores)]] <- cc.scores
  if (set.ident) {
    object[['old.ident']] <- Idents(object = object)
    Idents(object = object) <- 'Phase'
  }
  return(object)
}

#' Slim down a multi-species expression matrix, when only one species is primarily of interest.
#'
#' Valuable for CITE-seq analyses, where we typically spike in rare populations of 'negative control' cells from a different species.
#'
#' @param object A UMI count matrix. Should contain rownames that start with
#' the ensuing arguments \code{prefix} or \code{controls}
#' @param prefix The prefix denoting rownames for the species of interest.
#' Default is "HUMAN_". These rownames will have this prefix removed in the returned matrix.
#' @param controls The prefix denoting rownames for the species of 'negative
#' control' cells. Default is "MOUSE_".
#' @param ncontrols How many of the most highly expressed (average) negative
#' control features (by default, 100 mouse genes) should be kept? All other
#' rownames starting with \code{controls} are discarded.
#'
#' @return A UMI count matrix. Rownames that started with \code{prefix} have this
#' prefix discarded. For rownames starting with \code{controls}, only the
#' \code{ncontrols} most highly expressed features are kept, and the
#' prefix is kept. All other rows are retained.
#'
#' @importFrom Matrix rowSums
#'
#' @export
#' @concept utilities
#'
#' @examples
#' \dontrun{
#' cbmc.rna.collapsed <- CollapseSpeciesExpressionMatrix(cbmc.rna)
#' }
#'
CollapseSpeciesExpressionMatrix <- function(
  object,
  prefix = "HUMAN_",
  controls = "MOUSE_",
  ncontrols = 100
) {
  features <- grep(pattern = prefix, x = rownames(x = object), value = TRUE)
  controls <- grep(pattern = controls, x = rownames(x = object), value = TRUE)
  others <- setdiff(x = rownames(x = object), y = c(features, controls))
  controls <- rowSums(x = object[controls, ])
  controls <- names(x = head(
    x = sort(x = controls, decreasing = TRUE),
    n = ncontrols
  ))
  object <- object[c(features, controls, others), ]
  rownames(x = object) <- gsub(
    pattern = prefix,
    replacement = '',
    x = rownames(x = object)
  )
  return(object)
}

# Create an Annoy index
#
# @note Function exists because it's not exported from \pkg{uwot}
#
# @param name Distance metric name
# @param ndim Number of dimensions
#
# @return An nn index object
#
#' @importFrom methods new
#' @importFrom RcppAnnoy AnnoyAngular AnnoyManhattan AnnoyEuclidean AnnoyHamming
#
CreateAnn <- function(name, ndim) {
  return(switch(
    EXPR = name,
    cosine = new(Class = AnnoyAngular, ndim),
    manhattan = new(Class = AnnoyManhattan, ndim),
    euclidean = new(Class = AnnoyEuclidean, ndim),
    hamming = new(Class = AnnoyHamming, ndim),
    stop("BUG: unknown Annoy metric '", name, "'")
  ))
}

#' Run a custom distance function on an input data matrix
#'
#' @author Jean Fan
#'
#' @param my.mat A matrix to calculate distance on
#' @param my.function A function to calculate distance
#' @param ... Extra parameters to my.function
#'
#' @return A distance matrix
#'
#' @importFrom stats as.dist
#'
#' @export
#' @concept utilities
#'
#' @examples
#' data("pbmc_small")
#' # Define custom distance matrix
#' manhattan.distance <- function(x, y) return(sum(abs(x-y)))
#'
#' input.data <- GetAssayData(object = pbmc_small, assay = "RNA", slot = "scale.data")
#' cell.manhattan.dist <- CustomDistance(input.data, manhattan.distance)
#'
CustomDistance <- function(my.mat, my.function, ...) {
  CheckDots(..., fxns = my.function)
  n <- ncol(x = my.mat)
  mat <- matrix(data = 0, ncol = n, nrow = n)
  colnames(x = mat) <- rownames(x = mat) <- colnames(x = my.mat)
  for (i in 1:nrow(x = mat)) {
    for (j in 1:ncol(x = mat)) {
      mat[i,j] <- my.function(my.mat[, i], my.mat[, j], ...)
    }
  }
  return(as.dist(m = mat))
}

#' Calculate the mean of logged values
#'
#' Calculate mean of logged values in non-log space (return answer in log-space)
#'
#' @param x A vector of values
#' @param ... Other arguments (not used)
#'
#' @return Returns the mean in log-space
#'
#' @export
#' @concept utilities
#'
#' @examples
#' ExpMean(x = c(1, 2, 3))
#'
ExpMean <- function(x, ...) {
  if (inherits(x = x, what = 'AnyMatrix')) {
    return(apply(X = x, FUN = function(i) {log(x = mean(x = exp(x = i) - 1) + 1)}, MARGIN = 1))
  } else {
    return(log(x = mean(x = exp(x = x) - 1) + 1))
  }
}

#' Calculate the standard deviation of logged values
#'
#' Calculate SD of logged values in non-log space (return answer in log-space)
#'
#' @param x A vector of values
#'
#' @return Returns the standard deviation in log-space
#'
#' @importFrom stats sd
#'
#' @export
#' @concept utilities
#'
#' @examples
#' ExpSD(x = c(1, 2, 3))
#'
ExpSD <- function(x) {
  return(log1p(x = sd(x = expm1(x = x))))
}

#' Calculate the variance of logged values
#'
#' Calculate variance of logged values in non-log space (return answer in
#' log-space)
#'
#' @param x A vector of values
#'
#' @return Returns the variance in log-space
#'
#' @importFrom stats var
#'
#' @export
#' @concept utilities
#'
#' @examples
#' ExpVar(x = c(1, 2, 3))
#'
ExpVar <- function(x) {
  return(log1p(x = var(x = expm1(x = x))))
}

#' Scale and/or center matrix rowwise
#'
#' Performs row scaling and/or centering. Equivalent to using t(scale(t(mat)))
#' in R except in the case of NA values.
#'
#' @param mat A matrix
#' @param center a logical value indicating whether to center the rows
#' @param scale a logical value indicating whether to scale the rows
#' @param scale_max clip all values greater than scale_max to scale_max. Don't
#' clip if Inf.
#' @return Returns the center/scaled matrix
#'
#' @importFrom matrixStats rowMeans2 rowSds rowSums2
#'
#' @export
#' @concept utilities
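#'
#' @examples
#' # A minimal illustrative sketch (not part of the original documentation):
#' # scale a small dense matrix row-wise and check that each row has unit SD
#' mat <- matrix(data = rnorm(n = 40), nrow = 4)
#' scaled <- FastRowScale(mat = mat)
#' apply(X = scaled, MARGIN = 1, FUN = sd)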
#'
FastRowScale <- function(
  mat,
  center = TRUE,
  scale = TRUE,
  scale_max = 10
) {
  # inspired by https://www.r-bloggers.com/a-faster-scale-function/
  if (center) {
    rm <- rowMeans2(x = mat, na.rm = TRUE)
  }
  if (scale) {
    if (center) {
      rsd <- rowSds(mat, center = rm)
    } else {
      rsd <- sqrt(x = rowSums2(x = mat^2)/(ncol(x = mat) - 1))
    }
  }
  if (center) {
    mat <- mat - rm
  }
  if (scale) {
    mat <- mat / rsd
  }
  if (scale_max != Inf) {
    mat[mat > scale_max] <- scale_max
  }
  return(mat)
}


#' Get updated synonyms for gene symbols
#'
#' Find current gene symbols based on old or alias symbols using the gene
#' names database from the HUGO Gene Nomenclature Committee (HGNC)
#'
#' @details For each symbol passed, we query the HGNC gene names database for
#' current symbols that have the provided symbol as either an alias
#' (\code{alias_symbol}) or old (\code{prev_symbol}) symbol. All other queries
#' are \strong{not} supported.
#'
#' @note This function requires internet access
#'
#' @param symbols A vector of gene symbols
#' @param timeout Time to wait before cancelling query in seconds
#' @param several.ok Allow several current gene symbols for each provided symbol
#' @param verbose Show a progress bar depicting search progress
#' @param ... Extra parameters passed to \code{\link[httr]{GET}}
#'
#' @return For \code{GeneSymbolThesarus}, if \code{several.ok}, a named list
#' where each entry is the current symbol found for each symbol provided and the
#' names are the provided symbols. Otherwise, a named vector with the same information.
#'
#' @source \url{https://www.genenames.org/} \url{https://www.genenames.org/help/rest/}
#'
#' @importFrom utils txtProgressBar setTxtProgressBar
#' @importFrom httr GET accept_json timeout status_code content
#'
#' @rdname UpdateSymbolList
#' @name UpdateSymbolList
#'
#' @export
#' @concept utilities
#'
#' @seealso \code{\link[httr]{GET}}
#'
#' @examples
#' \dontrun{
#' GeneSymbolThesarus(symbols = c("FAM64A"))
#' }
#'
GeneSymbolThesarus <- function(
  symbols,
  timeout = 10,
  several.ok = FALSE,
  verbose = TRUE,
  ...
) {
  db.url <- 'http://rest.genenames.org/fetch'
  search.types <- c('alias_symbol', 'prev_symbol')
  synonyms <- vector(mode = 'list', length = length(x = symbols))
  not.found <- vector(mode = 'logical', length = length(x = symbols))
  multiple.found <- vector(mode = 'logical', length = length(x = symbols))
  names(x = multiple.found) <- names(x = not.found) <- names(x = synonyms) <- symbols
  if (verbose) {
    pb <- txtProgressBar(max = length(x = symbols), style = 3, file = stderr())
  }
  for (symbol in symbols) {
    sym.syn <- character()
    for (type in search.types) {
      response <- GET(
        url = paste(db.url, type, symbol, sep = '/'),
        config = c(accept_json(), timeout(seconds = timeout)),
        ...
      )
      if (!identical(x = status_code(x = response), y = 200L)) {
        next
      }
      response <- content(x = response)
      if (response$response$numFound != 1) {
        if (response$response$numFound > 1) {
          warning(
            "Multiple hits found for ",
            symbol,
            " as ",
            type,
            ", skipping",
            call. = FALSE,
            immediate. = TRUE
          )
        }
        next
      }
      sym.syn <- c(sym.syn, response$response$docs[[1]]$symbol)
    }
    not.found[symbol] <- length(x = sym.syn) < 1
    multiple.found[symbol] <- length(x = sym.syn) > 1
    if (length(x = sym.syn) == 1 || (length(x = sym.syn) > 1 && several.ok)) {
      synonyms[[symbol]] <- sym.syn
    }
    if (verbose) {
      setTxtProgressBar(pb = pb, value = pb$getVal() + 1)
    }
  }
  if (verbose) {
    close(con = pb)
  }
  if (sum(not.found) > 0) {
    warning(
      "The following symbols had no synonyms: ",
      paste(names(x = which(x = not.found)), collapse = ', '),
      call. = FALSE,
      immediate. = TRUE
    )
  }
  if (sum(multiple.found) > 0) {
    msg <- paste(
      "The following symbols had multiple synonyms:",
      paste(names(x = which(x = multiple.found)), collapse = ', ')
    )
    if (several.ok) {
      message(msg)
      message("Including anyway")
    } else {
      warning(msg, call. = FALSE, immediate. = TRUE)
    }
  }
  synonyms <- Filter(f = Negate(f = is.null), x = synonyms)
  if (!several.ok) {
    synonyms <- unlist(x = synonyms)
  }
  return(synonyms)
}

#' Compute the correlation of features broken down by groups with another
#' covariate
#'
#' @param object Seurat object
#' @param assay Assay to pull the data from
#' @param slot Slot in the assay to pull feature expression data from (counts,
#' data, or scale.data)
#' @param var Variable with which to correlate the features
#' @param group.assay Compute the gene groups based off the data in this assay.
#' @param min.cells Only compute for genes in at least this many cells
#' @param ngroups Number of groups to split into
#' @param do.plot Display the group correlation boxplot (via
#' \code{GroupCorrelationPlot})
#'
#' @return A Seurat object with the correlation stored in metafeatures
#'
#' @export
#' @concept utilities
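#'
#' @examples
#' # A hedged sketch (not from the original documentation): correlate scaled
#' # RNA features with the nCount_RNA covariate in pbmc_small
#' \dontrun{
#' data("pbmc_small")
#' pbmc_small <- GroupCorrelation(object = pbmc_small, assay = "RNA", do.plot = FALSE)
#' head(pbmc_small[["RNA"]][[]])
#' }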
#'
GroupCorrelation <- function(
  object,
  assay = NULL,
  slot = "scale.data",
  var = NULL,
  group.assay = NULL,
  min.cells = 5,
  ngroups = 6,
  do.plot = TRUE
) {
  assay <- assay %||% DefaultAssay(object = object)
  group.assay <- group.assay %||% assay
  var <- var %||% paste0("nCount_", group.assay)
  gene.grp <- GetFeatureGroups(
    object = object,
    assay = group.assay,
    min.cells = min.cells,
    ngroups = ngroups
  )
  data <- as.matrix(x = GetAssayData(object = object[[assay]], slot = slot))
  data <- data[rowMeans(x = data) != 0, ]
  grp.cors <- apply(
    X = data,
    MARGIN = 1,
    FUN = function(x) {
      cor(x = x, y = object[[var]])
    }
  )
  grp.cors <- grp.cors[names(x = gene.grp)]
  grp.cors <- as.data.frame(x = grp.cors[which(x = !is.na(x = grp.cors))])
  grp.cors$gene_grp <- gene.grp[rownames(x = grp.cors)]
  colnames(x = grp.cors) <- c("cor", "feature_grp")
  object[[assay]][["feature.grp"]] <- grp.cors[, "feature_grp", drop = FALSE]
  object[[assay]][[paste0(var, "_cor")]] <- grp.cors[, "cor", drop = FALSE]
  if (do.plot) {
    print(GroupCorrelationPlot(
      object = object,
      assay = assay,
      feature.group = "feature.grp",
      cor = paste0(var, "_cor")
    ))
  }
  return(object)
}

#' Calculate the variance to mean ratio of logged values
#'
#' Calculate the variance to mean ratio (VMR) in non-logspace (return answer in
#' log-space)
#'
#' @param x A vector of values
#' @param ... Other arguments (not used)
#'
#' @return Returns the VMR in log-space
#'
#' @importFrom stats var
#'
#' @export
#' @concept utilities
#'
#' @examples
#' LogVMR(x = c(1, 2, 3))
#'
LogVMR <- function(x, ...) {
  if (inherits(x = x, what = 'AnyMatrix')) {
    return(apply(X = x, FUN = function(i) {log(x = var(x = exp(x = i) - 1) / mean(x = exp(x = i) - 1))}, MARGIN = 1))
  } else {
    return(log(x = var(x = exp(x = x) - 1) / mean(x = exp(x = x) - 1)))
  }
}

#' Aggregate expression of multiple features into a single feature
#'
#' Calculates relative contribution of each feature to each cell
#' for a given set of features.
#'
#' @param object A Seurat object
#' @param features List of features to aggregate
#' @param meta.name Name of column in metadata to store metafeature
#' @param cells List of cells to use (default all cells)
#' @param assay Which assay to use
#' @param slot Which slot to take data from (default data)
#'
#' @return Returns a \code{Seurat} object with metafeature stored in object metadata
#'
#' @importFrom Matrix rowSums colMeans
#'
#' @export
#' @concept utilities
#'
#' @examples
#' data("pbmc_small")
#' pbmc_small <- MetaFeature(
#'   object = pbmc_small,
#'   features = c("LTB", "EAF2"),
#'   meta.name = 'var.aggregate'
#' )
#' head(pbmc_small[[]])
#'
MetaFeature <- function(
  object,
  features,
  meta.name = 'metafeature',
  cells = NULL,
  assay = NULL,
  slot = 'data'
) {
  cells <- cells %||% colnames(x = object)
  assay <- assay %||% DefaultAssay(object = object)
  newmat <- GetAssayData(object = object, assay = assay, slot = slot)
  newmat <- newmat[features, cells]
  if (slot == 'scale.data') {
    newdata <- Matrix::colMeans(newmat)
  } else {
    rowtotals <- Matrix::rowSums(newmat)
    newmat <- newmat / rowtotals
    newdata <- Matrix::colMeans(newmat)
  }
  object[[meta.name]] <- newdata
  return(object)
}

#' Apply a ceiling and floor to all values in a matrix
#'
#' @param data Matrix or data frame
#' @param min all values below this min value will be replaced with min
#' @param max all values above this max value will be replaced with max
#' @return Returns matrix after performing these floor and ceil operations
#' @export
#' @concept utilities
#'
#' @examples
#' mat <- matrix(data = rbinom(n = 25, size = 20, prob = 0.2 ), nrow = 5)
#' mat
#' MinMax(data = mat, min = 4, max = 5)
#'
MinMax <- function(data, min, max) {
  data2 <- data
  data2[data2 > max] <- max
  data2[data2 < min] <- min
  return(data2)
}

#' Calculate the percentage of all counts that belong to a given set of features
#'
#' This function enables you to easily calculate the percentage of all the counts belonging to a
#' subset of the possible features for each cell. This is useful when trying to compute the percentage
#' of transcripts that map to mitochondrial genes for example. The calculation here is simply the
#' column sum of the matrix present in the counts slot for features belonging to the set divided by
#' the column sum for all features times 100.
#'
#' @param object A Seurat object
#' @param pattern A regex pattern to match features against
#' @param features A defined feature set. If features provided, will ignore the pattern matching
#' @param col.name Name in meta.data column to assign. If this is not null, returns a Seurat object
#' with the proportion of the feature set stored in metadata.
#' @param assay Assay to use
#'
#' @return Returns a vector with the proportion of the feature set or, if \code{col.name} is set, a
#' Seurat object with the proportion of the feature set stored in metadata.
#' @importFrom Matrix colSums
#' @export
#' @concept utilities
#'
#' @examples
#' data("pbmc_small")
#' # Calculate the proportion of transcripts mapping to mitochondrial genes
#' # NOTE: The pattern provided works for human gene names. You may need to adjust depending on your
#' # system of interest
#' pbmc_small[["percent.mt"]] <- PercentageFeatureSet(object = pbmc_small, pattern = "^MT-")
#'
PercentageFeatureSet <- function(
  object,
  pattern = NULL,
  features = NULL,
  col.name = NULL,
  assay = NULL
) {
  assay <- assay %||% DefaultAssay(object = object)
  if (!is.null(x = features) && !is.null(x = pattern)) {
    warning("Both pattern and features provided. Pattern is being ignored.")
  }
  features <- features %||% grep(pattern = pattern, x = rownames(x = object[[assay]]), value = TRUE)
  percent.featureset <- colSums(x = GetAssayData(object = object, assay = assay, slot = "counts")[features, , drop = FALSE])/
    object[[paste0("nCount_", assay)]] * 100
  if (!is.null(x = col.name)) {
    object <- AddMetaData(object = object, metadata = percent.featureset, col.name = col.name)
    return(object)
  }
  return(percent.featureset)
}

# Pseudobulk feature expression by identity class
#
# Returns a representative expression value for each identity class
#
# @param object Seurat object
# @param pb.method Whether to 'average' (default) or 'aggregate' expression levels
# @param assays Which assays to use. Default is all assays
# @param features Features to analyze. Default is all features in the assay
# @param return.seurat Whether to return the data as a Seurat object. Default is FALSE
# @param group.by Categories for grouping (e.g., ident, replicate, celltype); 'ident' by default
# @param add.ident (Deprecated) Place an additional label on each cell prior to pseudobulking
# (very useful if you want to observe cluster pseudobulk values, separated by replicate, for example)
# @param slot Slot(s) to use; if multiple slots are given, assumed to follow
# the order of 'assays' (if specified) or object's assays
# @param verbose Print messages and show progress bar
# @param ... Arguments to be passed to methods such as \code{\link{CreateSeuratObject}}
#
# @return Returns a matrix with genes as rows, identity classes as columns.
# If return.seurat is TRUE, returns an object of class \code{\link{Seurat}}.
#
#' @importFrom Matrix rowMeans sparse.model.matrix
#' @importFrom stats as.formula
# @export
#
# @examples
# data("pbmc_small")
# head(PseudobulkExpression(object = pbmc_small))
#
PseudobulkExpression <- function(
  object,
  pb.method = 'average',
  assays = NULL,
  features = NULL,
  return.seurat = FALSE,
  group.by = 'ident',
  add.ident = NULL,
  slot = 'data',
  verbose = TRUE,
  ...
) {
  CheckDots(..., fxns = 'CreateSeuratObject')
  if (!is.null(x = add.ident)) {
    .Deprecated(msg = "'add.ident' is a deprecated argument, please use the 'group.by' argument instead")
    group.by <- c('ident', add.ident)
  }
  if (!(pb.method %in% c('average', 'aggregate'))) {
    stop("'pb.method' must be either 'average' or 'aggregate'")
  }
  object.assays <- FilterObjects(object = object, classes.keep = 'Assay')
  assays <- assays %||% object.assays
  if (!all(assays %in% object.assays)) {
    assays <- assays[assays %in% object.assays]
    if (length(x = assays) == 0) {
      stop("None of the requested assays are present in the object")
    } else {
      warning("Requested assays that do not exist in object. Proceeding with existing assays only.")
    }
  }
  if (length(x = slot) == 1) {
    slot <- rep_len(x = slot, length.out = length(x = assays))
  } else if (length(x = slot) != length(x = assays)) {
    stop("Number of slots provided does not match number of assays")
  }
  data <- FetchData(object = object, vars = rev(x = group.by))
  data <- data[which(rowSums(x = is.na(x = data)) == 0), , drop = F]
  if (nrow(x = data) < ncol(x = object)) {
    message("Removing cells with NA for 1 or more grouping variables")
    object <- subset(x = object, cells = rownames(x = data))
  }
  for (i in 1:ncol(x = data)) {
    data[, i] <- as.factor(x = data[, i])
  }
  num.levels <- sapply(
    X = 1:ncol(x = data),
    FUN = function(i) {
      length(x = levels(x = data[, i]))
    }
  )
  if (any(num.levels == 1)) {
    message(paste0("The following grouping variables have 1 value and will be ignored: ",
                   paste0(colnames(x = data)[which(num.levels <= 1)], collapse = ", ")))
    group.by <- colnames(x = data)[which(num.levels > 1)]
    data <- data[, which(num.levels > 1), drop = F]
  }
  if (ncol(x = data) == 0) {
    message("All grouping variables have 1 value only. Computing across all cells.")
    category.matrix <- matrix(
      data = 1,
      nrow = ncol(x = object),
      dimnames = list(Cells(x = object), 'all')
    )
    if (pb.method == 'average') {
      category.matrix <- category.matrix / sum(category.matrix)
    }
  } else {
    category.matrix <- sparse.model.matrix(object = as.formula(
      object = paste0(
        '~0+',
        paste0(
          "data[,",
          1:length(x = group.by),
          "]",
          collapse = ":"
        )
      )
    ))
    colsums <- colSums(x = category.matrix)
    category.matrix <- category.matrix[, colsums > 0]
    colsums <- colsums[colsums > 0]
    if (pb.method == 'average') {
      category.matrix <- Sweep(
        x = category.matrix,
        MARGIN = 2,
        STATS = colsums,
        FUN = "/")
    }
    colnames(x = category.matrix) <- sapply(
      X = colnames(x = category.matrix),
      FUN = function(name) {
        name <- gsub(pattern = "data\\[, [1-9]*\\]", replacement = "", x = name)
        return(paste0(rev(x = unlist(x = strsplit(x = name, split = ":"))), collapse = "_"))
      })
  }
  data.return <- list()
  for (i in 1:length(x = assays)) {
    data.use <- GetAssayData(
      object = object,
      assay = assays[i],
      slot = slot[i]
    )
    features.to.avg <- features %||% rownames(x = data.use)
    if (inherits(x = features, what = "list")) {
      features.to.avg <- features[i]
    }
    if (IsMatrixEmpty(x = data.use)) {
      warning(
        "The ", slot[i], " slot for the ", assays[i],
        " assay is empty. Skipping assay.", immediate. = TRUE, call. = FALSE)
      next
    }
    bad.features <- setdiff(x = features.to.avg, y = rownames(x = data.use))
    if (length(x = bad.features) > 0) {
      warning(
        "The following ", length(x = bad.features),
        " features were not found in the ", assays[i], " assay: ",
        paste(bad.features, collapse = ", "), call. = FALSE, immediate. = TRUE)
    }
    features.assay <- intersect(x = features.to.avg, y = rownames(x = data.use))
    if (length(x = features.assay) > 0) {
      data.use <- data.use[features.assay, ]
    } else {
      warning("None of the features specified were found in the ", assays[i],
              " assay.", call. = FALSE, immediate. = TRUE)
      next
    }
    if (slot[i] == 'data') {
      data.use <- expm1(x = data.use)
      if (any(data.use == Inf)) {
        warning("Exponentiation yielded infinite values. `data` may not be log-normed.")
      }
    }
    data.return[[i]] <- as.matrix(x = (data.use %*% category.matrix))
    names(x = data.return)[i] <- assays[[i]]
  }
  if (return.seurat) {
    if (slot[1] == 'scale.data') {
      na.matrix <- data.return[[1]]
      na.matrix[1:length(x = na.matrix)] <- NA
      # TODO: restore once check.matrix is in SeuratObject
      # toRet <- CreateSeuratObject(
      #   counts = na.matrix,
      #   project = if (pb.method == "average") "Average" else "Aggregate",
      #   assay = names(x = data.return)[1],
      #   check.matrix = FALSE,
      #   ...
      # )
      toRet <- CreateSeuratObject(
        counts = na.matrix,
        project = if (pb.method == "average") "Average" else "Aggregate",
        assay = names(x = data.return)[1],
        ...
      )
      toRet <- SetAssayData(
        object = toRet,
        assay = names(x = data.return)[1],
        slot = "counts",
        new.data = matrix()
      )
      toRet <- SetAssayData(
        object = toRet,
        assay = names(x = data.return)[1],
        slot = "data",
        new.data = na.matrix
      )
      toRet <- SetAssayData(
        object = toRet,
        assay = names(x = data.return)[1],
        slot = "scale.data",
        new.data = data.return[[1]]
      )
    } else {
      # TODO: restore once check.matrix is in SeuratObject
      # toRet <- CreateSeuratObject(
      #   counts = data.return[[1]],
      #   project = if (pb.method == "average") "Average" else "Aggregate",
      #   assay = names(x = data.return)[1],
      #   check.matrix = FALSE,
      #   ...
      # )
      toRet <- CreateSeuratObject(
        counts = data.return[[1]],
        project = if (pb.method == "average") "Average" else "Aggregate",
        assay = names(x = data.return)[1],
        ...
      )
      toRet <- SetAssayData(
        object = toRet,
        assay = names(x = data.return)[1],
        slot = "data",
        new.data = log1p(x = as.matrix(x = data.return[[1]]))
      )
    }
    #for multimodal data
    if (length(x = data.return) > 1) {
      for (i in 2:length(x = data.return)) {
        if (slot[i] == 'scale.data') {
          na.matrix <- data.return[[i]]
          na.matrix[1:length(x = na.matrix)] <- NA
          # TODO: restore once check.matrix is in SeuratObject
          # toRet[[names(x = data.return)[i]]] <- CreateAssayObject(counts = na.matrix, check.matrix = FALSE)
          toRet[[names(x = data.return)[i]]] <- CreateAssayObject(counts = na.matrix)
          toRet <- SetAssayData(
            object = toRet,
            assay = names(x = data.return)[i],
            slot = "counts",
            new.data = matrix()
          )
          toRet <- SetAssayData(
            object = toRet,
            assay = names(x = data.return)[i],
            slot = "data",
            new.data = na.matrix
          )
          toRet <- SetAssayData(
            object = toRet,
            assay = names(x = data.return)[i],
            slot = "scale.data",
            new.data = as.matrix(x = data.return[[i]])
          )
        } else {
          # TODO: restore once check.matrix is in SeuratObject
          # toRet[[names(x = data.return)[i]]] <- CreateAssayObject(counts = data.return[[i]], check.matrix = FALSE)
          toRet[[names(x = data.return)[i]]] <- CreateAssayObject(counts = data.return[[i]])
          toRet <- SetAssayData(
            object = toRet,
            assay = names(x = data.return)[i],
            slot = "data",
            new.data = log1p(x = as.matrix(x = data.return[[i]]))
          )
        }

      }
    }
    if (DefaultAssay(object = object) %in% names(x = data.return)) {
      DefaultAssay(object = toRet) <- DefaultAssay(object = object)
      if (slot[which(DefaultAssay(object = object) %in% names(x = data.return))[1]] != 'scale.data') {
        toRet <- ScaleData(object = toRet, verbose = verbose)
      }
    }
    if ('ident' %in% group.by) {
      first.cells <- c()
      for (i in 1:ncol(x = category.matrix)) {
        first.cells <- c(first.cells, Position(x = category.matrix[,i], f = function(x) {x > 0}))
      }
      Idents(object = toRet) <- Idents(object = object)[first.cells]
    }
    return(toRet)
  } else {
    return(data.return)
  }
}

#' Load the Annoy index file
#'
#' @param object Neighbor object
#' @param file Path to file with annoy index
#'
#' @return Returns the Neighbor object with the index stored
#' @export
#' @concept utilities
#'
LoadAnnoyIndex <- function(object, file){
  metric <- slot(object = object, name = "alg.info")$metric
  ndim <- slot(object = object, name = "alg.info")$ndim
  if (is.null(x = metric)) {
    stop("Provided Neighbor object wasn't generated with annoy")
  }
  annoy.idx <- CreateAnn(name = metric, ndim = ndim)
  annoy.idx$load(path.expand(path = file))
  Index(object = object) <- annoy.idx
  return(object)
}

#' Save the Annoy index
#'
#' @param object A Neighbor object with the annoy index stored
#' @param file Path to file to write index to
#'
#' @export
#' @concept utilities
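#'
#' @examples
#' # An illustrative round-trip sketch (not from the original documentation);
#' # 'nn.object' is a hypothetical Neighbor object holding an annoy index,
#' # e.g. built by FindNeighbors(..., return.neighbor = TRUE)
#' \dontrun{
#' SaveAnnoyIndex(object = nn.object, file = "annoy.index")
#' nn.object <- LoadAnnoyIndex(object = nn.object, file = "annoy.index")
#' }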
#'
SaveAnnoyIndex <- function(
  object,
  file
) {
  index <- Index(object = object)
  if (is.null(x = index)) {
    stop("Index for provided Neighbor object is NULL")
  }
  index$save(path.expand(path = file))
}

#' Regroup idents based on meta.data info
#'
#' For cells in each ident, set a new identity based on the most common value
#' of a specified metadata column.
#'
#' @param object Seurat object
#' @param metadata Name of metadata column
#' @return A Seurat object with the active idents regrouped
#'
#' @export
#' @concept utilities
#'
#' @examples
#' data("pbmc_small")
#' pbmc_small <- RegroupIdents(pbmc_small, metadata = "groups")
#'
RegroupIdents <- function(object, metadata) {
  for (ii in levels(x = object)) {
    ident.cells <- WhichCells(object = object, idents = ii)
    if (length(x = ident.cells) == 0) {
      next
    }
    new.ident <- names(x = which.max(x = table(object[[metadata]][ident.cells, ])))
    if (is.null(x = new.ident)) {
      stop("Cluster ", ii, " contains only cells with NA values in the '", metadata, "' metadata column.")
    }
    Idents(object = object, cells = ident.cells) <- new.ident
  }
  return(object)
}

#' @rdname UpdateSymbolList
#'
#' @return For \code{UpdateSymbolList}, \code{symbols} with updated symbols from
#' HGNC's gene names database
#'
#' @export
#' @concept utilities
#'
#' @examples
#' \dontrun{
#' UpdateSymbolList(symbols = cc.genes$s.genes)
#' }
#'
UpdateSymbolList <- function(
  symbols,
  timeout = 10,
  several.ok = FALSE,
  verbose = TRUE,
  ...
) {
  new.symbols <- suppressWarnings(expr = GeneSymbolThesarus(
    symbols = symbols,
    timeout = timeout,
    several.ok = several.ok,
    verbose = verbose,
    ...
  ))
  if (length(x = new.symbols) < 1) {
    warning("No updated symbols found", call. = FALSE, immediate. = TRUE)
  } else {
    if (verbose) {
      message("Found updated symbols for ", length(x = new.symbols), " symbols")
      x <- sapply(X = new.symbols, FUN = paste, collapse = ', ')
      message(paste(names(x = x), x, sep = ' -> ', collapse = '\n'))
    }
    for (sym in names(x = new.symbols)) {
      index <- which(x = symbols == sym)
      symbols <- append(
        x = symbols[-index],
        values = new.symbols[[sym]],
        after = index - 1
      )
    }
  }
  return(symbols)
}

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Methods for Seurat-defined generics
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Methods for R-defined generics
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

#' @inheritParams base::as.data.frame
#'
#' @return \code{as.data.frame.Matrix}: A data frame representation of the S4 Matrix
#'
#' @importFrom Matrix as.matrix
#'
#' @rdname as.sparse
#' @concept utilities
#' @export
#' @method as.data.frame Matrix
#'
as.data.frame.Matrix <- function(
  x,
  row.names = NULL,
  optional = FALSE,
  ...,
  stringsAsFactors = default.stringsAsFactors()
) {
  return(as.data.frame(
    x = as.matrix(x = x),
    row.names = row.names,
    optional = optional,
    stringsAsFactors = stringsAsFactors,
    ...
  ))
}

#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%
# Internal
#%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%

# Set a default value if an object is null
#
# @param lhs An object to set if it's null
# @param rhs The value to provide if x is null
#
# @return rhs if lhs is null, else lhs
#
# @author Hadley Wickham
# @references https://adv-r.hadley.nz/functions.html#missing-arguments
#
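# @examples
# # Illustrative sketch (not from the original docs):
# # NULL %||% 5  returns 5
# # 3 %||% 5     returns 3
#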
`%||%` <- function(lhs, rhs) {
  if (!is.null(x = lhs)) {
    return(lhs)
  } else {
    return(rhs)
  }
}

# Set a default value if an object is NOT null
#
# @param lhs An object to set if it's NOT null
# @param rhs The value to provide if x is NOT null
#
# @return lhs if lhs is null, else rhs
#
# @author Hadley Wickham
# @references https://adv-r.hadley.nz/functions.html#missing-arguments
#
`%iff%` <- function(lhs, rhs) {
  if (!is.null(x = lhs)) {
    return(rhs)
  } else {
    return(lhs)
  }
}

# Generate chunk points
#
# @param dsize How big is the data being chunked
# @param csize How big should each chunk be
#
# @return A matrix where each column is a chunk, row 1 is start points, row 2 is end points
#
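# @examples
# # Illustrative sketch (not from the original docs): 10 items in chunks of 3
# # ChunkPoints(dsize = 10, csize = 3)
# # start points are 1, 4, 7, 10 and end points are 3, 6, 9, 10
#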
ChunkPoints <- function(dsize, csize) {
  return(vapply(
    X = 1L:ceiling(x = dsize / csize),
    FUN = function(i) {
      return(c(
        start = (csize * (i - 1L)) + 1L,
        end = min(csize * i, dsize)
      ))
    },
    FUN.VALUE = numeric(length = 2L)
  ))
}

# L2 normalize the columns (or rows) of a given matrix
# @param mat Matrix to cosine normalize
# @param MARGIN Perform normalization over rows (1) or columns (2)
#
#
# @return returns l2-normalized matrix
#
#
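# @examples
# # Illustrative sketch (not from the original docs): with MARGIN = 1 every row
# # of the result has unit L2 norm, e.g.
# # rowSums(L2Norm(mat = matrix(data = 1:6, nrow = 2), MARGIN = 1) ^ 2)  # 1 1
#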
L2Norm <- function(mat, MARGIN = 1){
  normalized <- Sweep(
    x = mat,
    MARGIN = MARGIN,
    STATS = apply(
      X = mat,
      MARGIN = MARGIN,
      FUN = function(x){
        sqrt(x = sum(x ^ 2))
      }
    ),
    FUN = "/"
  )
  normalized[!is.finite(x = normalized)] <- 0
  return(normalized)
}

# Check the use of ...
#
# @param ... Arguments passed to a function that fall under ...
# @param fxns A list/vector of functions or function names
#
# @return ...
#
# @importFrom utils argsAnywhere getAnywhere
#' @importFrom utils isS3stdGeneric methods argsAnywhere isS3method
#
# @examples
#
CheckDots <- function(..., fxns = NULL) {
  args.names <- names(x = list(...))
  if (length(x = list(...)) == 0) {
    return(invisible(x = NULL))
  }
  if (is.null(x = args.names)) {
    stop("No named arguments passed")
  }
  if (length(x = fxns) == 1) {
    fxns <- list(fxns)
  }
  for (f in fxns) {
    if (!(is.character(x = f) || is.function(x = f))) {
      stop("CheckDots only works on characters or functions, not ", class(x = f))
    }
  }
  fxn.args <- suppressWarnings(expr = sapply(
    X = fxns,
    FUN = function(x) {
      x <- tryCatch(
        expr = if (isS3stdGeneric(f = x)) {
          as.character(x = methods(generic.function = x))
        } else {
          x
        },
        error = function(...) {
          return(x)
        }
      )
      x <- if (is.character(x = x)) {
        sapply(X = x, FUN = argsAnywhere, simplify = FALSE, USE.NAMES = TRUE)
      } else if (length(x = x) <= 1) {
        list(x)
      }
      return(sapply(
        X = x,
        FUN = function(f) {
          return(names(x = formals(fun = f)))
        },
        simplify = FALSE,
        USE.NAMES = TRUE
      ))
    },
    simplify = FALSE,
    USE.NAMES = TRUE
  ))
  fxn.args <- unlist(x = fxn.args, recursive = FALSE)
  fxn.null <- vapply(X = fxn.args, FUN = is.null, FUN.VALUE = logical(length = 1L))
  if (all(fxn.null) && !is.null(x = fxns)) {
    stop("None of the functions passed could be found")
  } else if (any(fxn.null)) {
    warning(
      "The following functions passed could not be found: ",
      paste(names(x = which(x = fxn.null)), collapse = ', '),
      call. = FALSE,
      immediate. = TRUE
    )
    fxn.args <- Filter(f = Negate(f = is.null), x = fxn.args)
  }
  dfxns <- vector(mode = 'logical', length = length(x = fxn.args))
  names(x = dfxns) <- names(x = fxn.args)
  for (i in 1:length(x = fxn.args)) {
    dfxns[i] <- any(grepl(pattern = '...', x = fxn.args[[i]], fixed = TRUE))
  }
  if (any(dfxns)) {
    dfxns <- names(x = which(x = dfxns))
    if (any(nchar(x = dfxns) > 0)) {
      fx <- vapply(
        X = Filter(f = nchar, x = dfxns),
        FUN = function(x) {
          if (isS3method(method = x)) {
            x <- unlist(x = strsplit(x = x, split = '\\.'))
            x <- x[length(x = x) - 1L]
          }
          return(x)
        },
        FUN.VALUE = character(length = 1L)
      )
      message(
        "The following functions and any applicable methods accept the dots: ",
        paste(unique(x = fx), collapse = ', ')
      )
      if (any(nchar(x = dfxns) < 1)) {
        message(
          "In addition, there is/are ",
          length(x = Filter(f = Negate(f = nchar), x = dfxns)),
          " other function(s) that accept(s) the dots"
        )
      }
    } else {
      message("There is/are ", length(x = dfxns), 'function(s) that accept(s) the dots')
    }
  } else {
    unused <- Filter(
      f = function(x) {
        return(!x %in% unlist(x = fxn.args))
      },
      x = args.names
    )
    if (length(x = unused) > 0) {
      msg <- paste0(
        "The following arguments are not used: ",
        paste(unused, collapse = ', ')
      )
      switch(
        EXPR = getOption(x = "Seurat.checkdots"),
        "warn" = warning(msg, call. = FALSE, immediate. = TRUE),
        "stop" = stop(msg),
        "silent" = NULL,
        stop("Invalid Seurat.checkdots option. Please choose one of warn, stop, silent")
      )
      unused.hints <- sapply(X = unused, FUN = OldParamHints)
      names(x = unused.hints) <- unused
      unused.hints <- na.omit(object = unused.hints)
      if (length(x = unused.hints) > 0) {
        message(
          "Suggested parameter: ",
          paste(unused.hints, "instead of", names(x = unused.hints), collapse = '; '),
          "\n"
        )
      }
    }
  }
}

# Call gc() to perform garbage collection
#
CheckGC <- function() {
  if (getOption(x = "Seurat.memsafe")) {
    gc(verbose = FALSE)
  }
}

# Check a list of objects for duplicate cell names
#
# @param object.list List of Seurat objects
# @param verbose Print message about renaming
# @param stop Error out if any duplicate names exist
#
# @return Returns list of objects with duplicate cells renamed to be unique
#
# @keywords internal
#
# @noRd
#
CheckDuplicateCellNames <- function(object.list, verbose = TRUE, stop = FALSE) {
  cell.names <- unlist(x = lapply(X = object.list, FUN = colnames))
  if (any(duplicated(x = cell.names))) {
    if (stop) {
      stop("Duplicate cell names present across objects provided.")
    }
    if (verbose) {
      warning("Some cell names are duplicated across objects provided. Renaming to enforce unique cell names.")
    }
    object.list <- lapply(
      X = 1:length(x = object.list),
      FUN = function(x) {
        return(RenameCells(
          object = object.list[[x]],
          new.names = paste0(Cells(x = object.list[[x]]), "_", x)
        ))
      }
    )
  }
  return(object.list)
}


# Create an empty dummy assay to replace existing assay
#
CreateDummyAssay <- function(assay) {
  cm <- as.sparse(x = matrix(
    data = 0,
    nrow = nrow(x = assay),
    ncol = ncol(x = assay)
  ))
  rownames(x = cm) <- rownames(x = assay)
  colnames(x = cm) <- colnames(x = assay)
  # TODO: restore once check.matrix is in SeuratObject
  # return(CreateAssayObject(
  #   counts = cm,
  #   check.matrix = FALSE
  # ))
  return(CreateAssayObject(
    counts = cm
  ))
}

# Extract delimiter information from a string.
#
# Parses a string (usually a cell name) and extracts fields based on a delimiter
#
# @param string String to parse.
# @param field Integer(s) indicating which field(s) to extract. Can be a vector of multiple numbers.
# @param delim Delimiter to use, set to underscore by default.
#
# @return A new string, that parses out the requested fields, and (if multiple), rejoins them with the same delimiter
#
# @export
#
# @examples
# ExtractField(string = 'Hello World', field = 1, delim = '_')
#
ExtractField <- function(string, field = 1, delim = "_") {
  fields <- as.numeric(x = unlist(x = strsplit(x = as.character(x = field), split = ",")))
  if (length(x = fields) == 1) {
    return(strsplit(x = string, split = delim)[[1]][field])
  }
  return(paste(strsplit(x = string, split = delim)[[1]][fields], collapse = delim))
}

# Resize GenomicRanges upstream and or downstream
# from https://support.bioconductor.org/p/78652/
#
Extend <- function(x, upstream = 0, downstream = 0) {
  if (any(GenomicRanges::strand(x = x) == "*")) {
    warning("'*' ranges were treated as '+'")
  }
  on_plus <- GenomicRanges::strand(x = x) == "+" | GenomicRanges::strand(x = x) == "*"
  new_start <- GenomicRanges::start(x = x) - ifelse(test = on_plus, yes = upstream, no = downstream)
  new_end <- GenomicRanges::end(x = x) + ifelse(test = on_plus, yes = downstream, no = upstream)
  IRanges::ranges(x = x) <- IRanges::IRanges(start = new_start, end = new_end)
  x <- GenomicRanges::trim(x = x)
  return(x)
}
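
# Illustrative usage (a sketch assuming GenomicRanges/IRanges are installed; coordinates are made up):
# gr <- GenomicRanges::GRanges(
#   seqnames = "chr1",
#   ranges = IRanges::IRanges(start = 100, end = 200),
#   strand = "+"
# )
# Extend(x = gr, upstream = 50, downstream = 10)  # now spans 50-210 on the + strand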

# Interleave vectors together
#
# @param ... Vectors to be interleaved
#
# @return A vector with the values from each vector in ... interleaved
#
Interleave <- function(...) {
  return(as.vector(x = t(x = as.data.frame(x = list(...)))))
}
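
# Illustrative usage (not from the original source):
# Interleave(1:3, 4:6)  # 1 4 2 5 3 6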

# Check if a matrix is empty
#
# Takes a matrix and asks if it's empty (either 0x0 or 1x1 with a value of NA)
#
# @param x A matrix
#
# @return Whether or not \code{x} is empty
#
IsMatrixEmpty <- function(x) {
  matrix.dims <- dim(x = x)
  matrix.na <- all(matrix.dims == 1) && all(is.na(x = x))
  return(all(matrix.dims == 0) || matrix.na)
}
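
# Illustrative usage (not from the original source):
# IsMatrixEmpty(x = matrix(nrow = 0, ncol = 0))    # TRUE
# IsMatrixEmpty(x = matrix(data = NA, nrow = 1))   # TRUE
# IsMatrixEmpty(x = matrix(data = 1:4, nrow = 2))  # FALSE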

# Check if externalptr is null
# From https://stackoverflow.com/questions/26666614/how-do-i-check-if-an-externalptr-is-null-from-within-r
#
is.null.externalptr <- function(pointer) {
  stopifnot(is(pointer, "externalptr"))
  .Call("isnull", pointer)
}

# Check whether an assay has been processed by sctransform
#
# @param assay assay to check
#
# @return Boolean
#
IsSCT <- function(assay) {
  if (is.list(x = assay)) {
    sct.check <- lapply(X = assay, FUN = function(x) {
      return(!is.null(x = Misc(object = x, slot = 'vst.out')) | !is.null(x = Misc(object = x, slot = 'vst.set')) | inherits(x = x, what = "SCTAssay"))
    })
    return(unlist(x = sct.check))
  }
  return(!is.null(x = Misc(object = assay, slot = 'vst.out')) | !is.null(x = Misc(object = assay, slot = 'vst.set')) | inherits(x = assay, what = "SCTAssay"))
}

# Check whether a vst.out is from sctransform
#
# @param vst.out a sct model from sctransform
#
# @return Boolean
#
IsVSTout <- function(vst.out) {
  vst.element <- c("model_str", "model_pars_fit", "cell_attr" )
   vst.check <- setdiff(x = vst.element, y = names(x = vst.out))
   if (length(x = setdiff(x = vst.element, y = names(x = vst.out))) == 0) {
     vst.check <- TRUE
   } else {
     vst.check <- FALSE
   }
  return(vst.check)
}

# Calculate the Euclidean distance between x and y, then subtract the
# nearest-neighbor distance of x to preserve local connectivity.
# Used in FindModalityWeights to calculate the within- and cross-modality distances
impute_dist <- function(x, y, nearest.dist) {
  dist <- sqrt(x = rowSums(x = (x - y)**2)) - nearest.dist
  dist <- ReLu(x = dist)
  return(dist)
}
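
# Minimal numeric sketch (made-up coordinates, not from the original source):
# x <- matrix(c(0, 0, 3, 4), nrow = 2, byrow = TRUE)  # two cells in one modality
# y <- matrix(0, nrow = 2, ncol = 2)                  # matched points in the other modality
# impute_dist(x = x, y = y, nearest.dist = c(0, 1))   # 0 and 4 (i.e. 5 - 1)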

# Check the length of components of a list
#
# @param values A list whose components should be checked
# @param cutoff A minimum value to check for
#
# @return a vector of logicals
#
LengthCheck <- function(values, cutoff = 0) {
  return(vapply(
    X = values,
    FUN = function(x) {
      return(length(x = x) > cutoff)
    },
    FUN.VALUE = logical(1)
  ))
}
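
# Illustrative usage (not from the original source):
# LengthCheck(values = list(a = 1:3, b = integer(0)))  # a = TRUE, b = FALSE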

# Function to map values in a vector `v` as defined in `from` to the values
# defined in `to`.
#
# @param v     vector of values to map
# @param from  vector of original values
# @param to    vector of values to map original values to (should be the same
#              length as from)
# @return      returns vector of mapped values
#
MapVals <- function(v, from, to) {
  if (length(x = from) != length(x = to)) {
    stop("from and to vectors are not the equal length.")
  }
  vals.to.match <- match(x = v, table = from)
  vals.to.match.idx  <- !is.na(x = vals.to.match)
  v[vals.to.match.idx] <- to[vals.to.match[vals.to.match.idx]]
  return(v)
}
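
# Illustrative usage (not from the original source):
# MapVals(v = c("a", "b", "c", "d"), from = c("a", "c"), to = c("A", "C"))
# # "A" "b" "C" "d"  (values without a match in `from` are left unchanged)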

# Independently shuffle values within each row of a matrix
#
# Creates a matrix where correlation structure has been removed, but overall values are the same
#
# @param x Matrix to shuffle
#
# @return Returns a scrambled matrix, where each row is shuffled independently
#
#' @importFrom stats runif
#
# @export
#
# @examples
# mat <- matrix(data = rbinom(n = 25, size = 20, prob = 0.2 ), nrow = 5)
# mat
# MatrixRowShuffle(x = mat)
#
MatrixRowShuffle <- function(x) {
  x2 <- x
  x2 <- t(x = x)
  ind <- order(c(col(x = x2)), runif(n = length(x = x2)))
  x2 <- matrix(
    data = x2[ind],
    nrow = nrow(x = x),
    ncol = ncol(x = x),
    byrow = TRUE
  )
  return(x2)
}

# Return the Nth-largest value of a vector. If N is larger than the length of
# the vector, N is set to the length of the vector (i.e. the smallest value is returned).
#
# @param x vector of interest
# @param N index in reversed vector
#
# @return returns element at given index
#
MaxN <- function(x, N = 2){
  len <- length(x)
  if (N > len) {
    warning('N greater than length(x).  Setting N=length(x)')
    N <- length(x)
  }
  sort(x, partial = len - N + 1)[len - N + 1]
}
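
# Illustrative usage (not from the original source):
# MaxN(x = c(5, 1, 9, 7), N = 2)  # 7, the second-largest value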

# Given a range from cut, compute the mean
#
# @param x A range from cut, as a string (e.g. "(10,20]")
# @return returns a numeric with the mean of the range
#
MeanRange <- function(x) {
  left <- gsub(pattern = "\\]", replacement = "", x = sub(pattern = "\\([[:digit:]\\.e+]*,", x = x, replacement = ""))
  right <- gsub(pattern = "\\(", replacement = "", x = sub(pattern = ",[[:digit:]\\.e+]*]", x = x, replacement = ""))
  return(mean(c(as.numeric(x = left), as.numeric(x = right))))
}
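
# Illustrative usage (a bin label as produced by cut(); not from the original source):
# MeanRange(x = "(10,20]")  # 15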

# Melt a data frame
#
# @param x A data frame
#
# @return A molten data frame
#
Melt <- function(x) {
  if (!is.data.frame(x = x)) {
    x <- as.data.frame(x = x)
  }
  return(data.frame(
    rows = rep.int(x = rownames(x = x), times = ncol(x = x)),
    cols = unlist(x = lapply(X = colnames(x = x), FUN = rep.int, times = nrow(x = x))),
    vals = unlist(x = x, use.names = FALSE)
  ))
}
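
# Illustrative usage (not from the original source):
# df <- data.frame(A = 1:2, B = 3:4, row.names = c("r1", "r2"))
# Melt(x = df)
# #   rows cols vals
# # 1   r1    A    1
# # 2   r2    A    2
# # 3   r1    B    3
# # 4   r2    B    4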

# Modify parameters in calling environment
#
# Used exclusively for helper parameter validation functions
#
# @param param name of parameter to change
# @param value new value for parameter
#
ModifyParam <- function(param, value) {
  # modify in original function environment
  env1 <- sys.frame(which = length(x = sys.frames()) - 2)
  env1[[param]] <- value
  # also modify in validation function environment
  env2 <- sys.frame(which = length(x = sys.frames()) - 1)
  env2[[param]] <- value
}

# Give hints for old parameters and their newer counterparts
#
# This is a non-exhaustive list. If your function isn't working properly based
# on the parameters you give it, please read the documentation for your function
#
# @param param A vector of parameters to get hints for
#
# @return Parameter hints for the specified parameters
#
OldParamHints <- function(param) {
  param.conversion <- c(
    'raw.data' = 'counts',
    'min.genes' = 'min.features',
    'features.plot' = 'features',
    'pc.genes' = 'features',
    'do.print' = 'verbose',
    'genes.print' = 'nfeatures.print',
    'pcs.print' = 'ndims.print',
    'pcs.use' = 'dims',
    'reduction.use' = 'reduction',
    'cells.use' = 'cells',
    'do.balanced' = 'balanced',
    'display.progress' = 'verbose',
    'print.output' = 'verbose',
    'dims.use' = 'dims',
    'reduction.type' = 'reduction',
    'y.log' = 'log',
    'cols.use' = 'cols',
    'assay.use' = 'assay'
  )
  return(param.conversion[param])
}
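
# Illustrative usage (not from the original source):
# OldParamHints(param = c("raw.data", "cells.use"))
# #  raw.data cells.use
# #  "counts"   "cells"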

# Check the existence of a package
#
# @param ... Package names
# @param error If true, throw an error if the package doesn't exist
#
# @return Invisibly returns a logical vector denoting, for each package, whether it is installed
#
PackageCheck <- function(..., error = TRUE) {
  pkgs <- unlist(x = c(...), use.names = FALSE)
  package.installed <- vapply(
    X = pkgs,
    FUN = requireNamespace,
    FUN.VALUE = logical(length = 1L),
    quietly = TRUE
  )
  if (error && any(!package.installed)) {
    stop(
      "Cannot find the following packages: ",
      paste(pkgs[!package.installed], collapse = ', '),
      ". Please install"
    )
  }
  invisible(x = package.installed)
}

# Parenting parameters from one environment to the next
#
# This function allows one to modify a parameter in a parent environment.
# The primary use of this is to ensure logging functions store correct parameters
# if they've been modified by a child function or method
#
# @param parent.find Regex pattern of name of parent function call to modify.
# For example, this can be the class name for a method that was dispatched previously
# @param ... Parameter names and values to parent; both name and value must be supplied
# in the standard \code{name = value} format; any number of name/value pairs can be specified
#
# @return No return, modifies parent environment directly
#
# @examples
# Parenting(parent.find = 'Seurat', features = features[features > 7])
#
Parenting <- function(parent.find = 'Seurat', ...) {
  calls <- as.character(x = sys.calls())
  calls <- lapply(
    X = strsplit(x = calls, split = '(', fixed = TRUE),
    FUN = '[',
    1
  )
  parent.index <- grep(pattern = parent.find, x = calls)
  if (length(x = parent.index) != 1) {
    warning(
      "Cannot find a parent environment called ",
      parent.find,
      immediate. = TRUE,
      call. = FALSE
    )
  } else {
    to.parent <- list(...)
    if (length(x = to.parent) == 0) {
      warning("Nothing to parent", immediate. = TRUE, call. = FALSE)
    } else if (is.null(x = names(x = to.parent))) {
      stop("All input must be in a key = value pair")
    } else if (length(x = Filter(f = nchar, x = names(x = to.parent))) != length(x = to.parent)) {
      stop("All inputs must be named")
    } else {
      parent.environ <- sys.frame(which = parent.index)
      for (i in 1:length(x = to.parent)) {
        parent.environ[[names(x = to.parent)[i]]] <- to.parent[[i]]
      }
    }
  }
}

# Calculate the percentage of a vector above some threshold
#
# @param x Vector of values
# @param threshold Threshold to use when calculating percentage
#
# @return Returns the fraction (between 0 and 1) of `x` values above the given threshold
#
PercentAbove <- function(x, threshold) {
  return(length(x = x[x > threshold]) / length(x = x))
}
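
# Illustrative usage (not from the original source):
# PercentAbove(x = c(0, 1, 2, 3), threshold = 1)  # 0.5 (two of four values exceed 1)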

# Generate a random name
#
# Make a name from randomly sampled lowercase letters,
# pasted together with no spaces or other characters
#
# @param length How long should the name be
# @param ... Extra parameters passed to sample
#
# @return A character with nchar == length of randomly sampled letters
#
# @seealso \code{\link{sample}}
#
RandomName <- function(length = 5L, ...) {
  CheckDots(..., fxns = 'sample')
  return(paste(sample(x = letters, size = length, ...), collapse = ''))
}


# Rectified linear units function. Calculate positive part of its argument
# The input can be a vector or a matrix
ReLu <- function(x) {
  x[x < 0] <- 0
  return(x)
}

# Remove the last field from a string
#
# Parses a string (usually a cell name) and removes the last field based on a delimiter
#
# @param string String to parse
# @param delim Delimiter to use, set to underscore by default.
#
# @return A new string sans the last field
#
RemoveLastField <- function(string, delim = "_") {
  ss <- strsplit(x = string, split = delim)[[1]]
  if (length(x = ss) == 1) {
    return(string)
  } else {
    return(paste(ss[1:(length(x = ss)-1)], collapse = delim))
  }
}
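
# Illustrative usage (hypothetical cell name, not from the original source):
# RemoveLastField(string = "AAACATACAACCAC_sample1_lane2")  # "AAACATACAACCAC_sample1"
# RemoveLastField(string = "AAACATACAACCAC")                # unchanged; no delimiter present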

# Sweep out array summaries
#
# Reimplementation of \code{\link[base]{sweep}} to maintain compatibility with
# both R 3.X and 4.X
#
# @inheritParams base::sweep
# @param x an array.
#
# @seealso \code{\link[base]{sweep}}
#
Sweep <- function(x, MARGIN, STATS, FUN = '-', check.margin = TRUE, ...) {
  if (any(grepl(pattern = 'X', x = names(x = formals(fun = sweep))))) {
    return(sweep(
      X = x,
      MARGIN = MARGIN,
      STATS = STATS,
      FUN = FUN,
      check.margin = check.margin,
      ...
    ))
  } else {
    return(sweep(
      x = x,
      MARGIN = MARGIN,
      STATS = STATS,
      FUN = FUN,
      check.margin = check.margin,
      ...
    ))
  }
}
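
# Illustrative usage (not from the original source):
# m <- matrix(data = 1:6, nrow = 2)
# Sweep(x = m, MARGIN = 2, STATS = colMeans(x = m), FUN = "-")  # centre each column at zero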

# Get program paths in a system-agnostic way
#
# @param progs A vector of program names
# @param error Throw an error if any programs are not found
# @param add.exe Add '.exe' extension to program names that don't have it
#
# @return A named vector of program paths; missing programs are returned as
# \code{NA} if \code{error = FALSE}
#
#' @importFrom tools file_ext
#
SysExec <- function(
  progs,
  error = ifelse(test = length(x = progs) == 1, yes = TRUE, no = FALSE),
  add.exe = .Platform$OS.type == 'windows'
) {
  cmd <- ifelse(
    test = .Platform$OS.type == 'windows',
    yes = 'where.exe',
    no = 'which'
  )
  if (add.exe) {
    missing.exe <- file_ext(x = progs) != 'exe'
    progs[missing.exe] <- paste0(progs[missing.exe], '.exe')
  }
  paths <- sapply(
    X = progs,
    FUN = function(x) {
      return(tryCatch(
        expr = system2(command = cmd, args = x, stdout = TRUE)[1],
        warning = function(...) {
          return(NA_character_)
        }
      ))
    }
  )
  if (error && any(is.na(x = paths))) {
    stop(
      "Could not find the following programs: ",
      paste(names(x = paths[is.na(x = paths)]), collapse = ', '),
      call. = FALSE
    )
  }
  return(paths)
}

# Try to convert x to numeric; if NAs are introduced, return x as-is
#
ToNumeric <- function(x){
  # check for x:y range
  if (is.numeric(x = x)) {
    return(x)
  }
  if (length(x = unlist(x = strsplit(x = x, split = ":"))) == 2) {
    num <- unlist(x = strsplit(x = x, split = ":"))
    return(num[1]:num[2])
  }
  num <- suppressWarnings(expr = as.numeric(x = x))
  if (!is.na(x = num)) {
    return(num)
  }
  return(x)
}
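
# Illustrative usage (not from the original source):
# ToNumeric(x = "5")     # 5
# ToNumeric(x = "2:4")   # 2 3 4
# ToNumeric(x = "PC_1")  # returned unchanged (coercion would introduce NA)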