R/celda_functions.R
In celda: CEllular Latent Dirichlet Allocation

Documented in distinctColors featureModuleTable normalizeCounts recodeClusterY recodeClusterZ retrieveFeatureIndex

.sampleLl <- function(llProbs) {
  probsSub <- exp(llProbs - max(llProbs))
  probsNorm <- probsSub / sum(probsSub)
  probsSelect <- sample.int(
    length(probsNorm),
    size = 1L,
    replace = TRUE,
    prob = probsNorm
  )
  return(probsSelect)
}


.cosineDist <- function(x) {
  x <- t(x)
  y <- (1 - .cosine(x)) / 2
  return(stats::as.dist(y))
}


.cosine <- function(x) {
  y <- x %*% t(x) / (sqrt(rowSums(x^2) %*% t(rowSums(x^2))))
  return(y)
}


.spearmanDist <- function(x) {
  y <- (1 - stats::cor(x, method = "spearman")) / 2
  return(stats::as.dist(y))
}


.hellingerDist <- function(x) {
  y <- stats::dist(t(sqrt(x)), method = "euclidean") * 1 / sqrt(2)
  return(y)
}


.normalizeLogProbs <- function(llProbs) {
  llProbs <- exp(sweep(llProbs, 1, base::apply(llProbs, 1, max), "-"))
  probs <- sweep(llProbs, 1, rowSums(llProbs), "/")
  return(probs)
}


#' @title Normalization of count data
#' @description Performs normalization, transformation, and/or scaling of a
#'  counts matrix
#' @param counts Integer matrix. Rows represent features and columns represent
#'  cells.
#' @param normalize Character.
#'  Divides counts by the library sizes for each cell. One of 'proportion',
#'  'cpm', 'median', or 'mean'. 'proportion' uses the total counts for each
#'  cell as the library size. 'cpm' divides the library size of each cell by
#'  one million to produce counts per million. 'median' divides the library
#'  size of each cell by the median library size across all cells. 'mean'
#'  divides the library size of each cell by the mean library size across all
#'  cells.
#' @param scaleFactor Numeric. Sets the scale factor for cell-level
#'  normalization. This scale factor is multiplied to each cell after the
#'  library size of each cell had been adjusted in \code{normalize}. Default
#'  \code{NULL} which means no scale factor is applied.
#' @param transformationFun Function. Applys a transformation such as
#'  \link{sqrt}, \link{log}, \link{log2}, \link{log10}, or \link{log1p}.
#'  If NULL, no transformation will be applied. Occurs after normalization.
#'  Default NULL.
#' @param scaleFun Function. Scales the rows of the normalized and transformed
#'  count matrix. For example, 'scale' can be used to z-score normalize the
#'  rows. Default NULL.
#' @param pseudocountNormalize Numeric. Add a pseudocount to counts before
#'  normalization. Default 0.
#' @param pseudocountTransform Numeric. Add a pseudocount to normalized counts
#'  before applying the transformation function. Adding a pseudocount
#'  can be useful before applying a log transformation. Default  0.
#' @return Numeric Matrix. A normalized matrix.
#' @examples
#' data(celdaCGSim)
#' normalizedCounts <- normalizeCounts(celdaCGSim$counts, "proportion",
#'   pseudocountNormalize = 1)
#' @export
normalizeCounts <- function(counts,
                            normalize = c("proportion", "cpm",
                              "median", "mean"),
                            scaleFactor = NULL,
                            transformationFun = NULL,
                            scaleFun = NULL,
                            pseudocountNormalize = 0,
                            pseudocountTransform = 0) {

  normalize <- match.arg(normalize)
  counts <- as.matrix(counts)

  if (!is.null(transformationFun) &&
    !is.function(transformationFun)) {
    stop("'transformationFun' needs to be of class 'function'")
  }
  if (!is.null(scaleFun) && !is.function(scaleFun)) {
    stop("'scaleFun' needs to be of class 'function'")
  }
  # Perform normalization
  if (normalize == "proportion") {
    norm <- fastNormProp(counts, pseudocountNormalize)
  } else {
    counts <- counts + pseudocountNormalize
    cs <- .colSums(counts, nrow(counts), ncol(counts))
    norm <- switch(
      normalize,
      "proportion" = sweep(counts, 2, cs, "/"),
      "cpm" = sweep(counts, 2, cs / 1e6, "/"),
      "median" = sweep(counts, 2, cs / stats::median(cs), "/"),
      "mean" = sweep(counts, 2, cs / mean(cs), "/")
    )
  }

  if (!is.null(scaleFactor)) {
      norm <- norm * scaleFactor
  }

  if (!is.null(transformationFun)) {
    norm <- do.call(
      transformationFun,
      list(norm + pseudocountTransform)
    )
  }
  if (!is.null(scaleFun)) {
    norm <- t(base::apply(norm, 1, scaleFun))
  }
  colnames(norm) <- colnames(counts)
  rownames(norm) <- rownames(counts)
  return(norm)
}


#' @title Recode cell cluster labels
#' @description Recode cell subpopulaton clusters using a mapping in the
#'  \code{from} and \code{to} arguments.
#' @param sce \linkS4class{SingleCellExperiment} object returned from
#'  \link{celda_C} or \link{celda_CG}. Must contain column
#'  \code{celda_cell_cluster} in
#'  \code{\link{colData}(altExp(sce, altExpName))}.
#' @param from Numeric vector. Unique values in the range of
#'  \code{seq(celdaClusters(sce, altExpName = altExpName))} that correspond to
#'  the original cluster
#'  labels in \code{sce}.
#' @param to Numeric vector. Unique values in the range of
#'  \code{seq(celdaClusters(sce, altExpName = altExpName))} that correspond to
#'  the new cluster labels.
#' @param altExpName The name for the \link{altExp} slot
#'  to use. Default "featureSubset".
#' @return \linkS4class{SingleCellExperiment} object with recoded cell
#'  cluster labels.
#' @examples
#' data(sceCeldaCG)
#' sceReorderedZ <- recodeClusterZ(sceCeldaCG, c(1, 3), c(3, 1))
#' @importFrom plyr mapvalues
#' @export
recodeClusterZ <- function(sce, from, to, altExpName = "featureSubset") {
    if (length(setdiff(from, to)) != 0) {
        stop("All values in 'from' must have a mapping in 'to'")
    }
    if (is.null(celdaClusters(sce, altExpName = altExpName))) {
        stop("Provided 'sce' argument does not have a 'celda_cell_cluster'",
            " column in 'colData(altExp(sce, altExpName))'")
    }
    celdaClusters(sce, altExpName = altExpName) <- plyr::mapvalues(
        celdaClusters(sce, altExpName = altExpName), from, to)
    return(sce)
}


# for deprecated celda model objects
.recodeClusterZ <- function(celdaMod, from, to) {
    if (length(setdiff(from, to)) != 0) {
        stop("All values in 'from' must have a mapping in 'to'")
    }
    if (is.null(celdaClusters(celdaMod)$z)) {
        stop("Provided celdaMod argument does not have a z attribute")
    }
    celdaMod@clusters$z <- plyr::mapvalues(celdaClusters(celdaMod)$z,
        from, to)
    return(celdaMod)
}


#' @title Recode feature module labels
#' @description Recode feature module clusters using a mapping in the
#'  \code{from} and \code{to} arguments.
#' @param sce \linkS4class{SingleCellExperiment} object returned from
#'  \link{celda_G} or \link{celda_CG}. Must contain column
#'  \code{celda_feature_module} in
#'  \code{\link{rowData}(altExp(sce, altExpName))}.
#' @param from Numeric vector. Unique values in the range of
#'  \code{seq(celdaModules(sce))} that correspond to the original module labels
#'  in \code{sce}.
#' @param to Numeric vector. Unique values in the range of
#'  \code{seq(celdaModules(sce))} that correspond to the new module labels.
#' @param altExpName The name for the \link{altExp} slot
#'  to use. Default "featureSubset".
#' @return @return \linkS4class{SingleCellExperiment} object with recoded
#'  feature module labels.
#' @examples
#' data(sceCeldaCG)
#' sceReorderedY <- recodeClusterY(sceCeldaCG, c(1, 3), c(3, 1))
#' @export
recodeClusterY <- function(sce, from, to, altExpName = "featureSubset") {
    if (length(setdiff(from, to)) != 0) {
        stop("All values in 'from' must have a mapping in 'to'")
    }
    if (is.null(celdaModules(sce, altExpName = altExpName))) {
        stop("Provided 'sce' argument does not have a 'celda_feature_module'",
            " column in 'rowData(altExp(sce, altExpName))'")
    }
    celdaModules(sce, altExpName = altExpName) <- plyr::mapvalues(
        celdaModules(sce, altExpName = altExpName), from, to)
    return(sce)
}


# for deprecated celda model objects
.recodeClusterY <- function(celdaMod, from, to) {
    if (length(setdiff(from, to)) != 0) {
        stop("All values in 'from' must have a mapping in 'to'")
    }
    if (is.null(celdaClusters(celdaMod)$y)) {
        stop("Provided celdaMod argument does not have a y attribute")
    }
    celdaMod@clusters$y <- plyr::mapvalues(celdaClusters(celdaMod)$y,
        from, to)
    return(celdaMod)
}


#' @title Check count matrix consistency
#' @description Checks if the counts matrix is the same one used to generate
#'  the celda model object by comparing dimensions and MD5 checksum.
#' @param counts Integer matrix. Rows represent features and columns represent
#'  cells.
#' @param celdaMod A \code{celdaModel} or \code{celdaList} object.
#' @param errorOnMismatch Logical. Whether to throw an error in the event of
#'  a mismatch. Default TRUE.
#' @param ... Ignored. Placeholder to prevent check warning.
#' @return Returns TRUE if provided count matrix matches the one used in the
#'  celda object and/or \code{errorOnMismatch = FALSE}, FALSE otherwise.
#' @export
setGeneric("compareCountMatrix", function(counts, celdaMod, ...) {
    standardGeneric("compareCountMatrix")})


#' @rdname compareCountMatrix
#' @examples
#' data(celdaCGSim, celdaCGMod)
#' compareCountMatrix(celdaCGSim$counts, celdaCGMod, errorOnMismatch = FALSE)
#' @export
setMethod("compareCountMatrix",
    signature(celdaMod = "celdaModel"),
    function(counts, celdaMod, errorOnMismatch = TRUE) {

        if ("y" %in% names(celdaClusters(celdaMod))) {
            if (nrow(counts) != length(celdaClusters(celdaMod)$y)) {
                stop(
                    "The provided celda object was generated from a counts",
                    " matrix with a different number of features than the one",
                    " provided."
                )
            }
        }

        if ("z" %in% names(celdaClusters(celdaMod))) {
            if (ncol(counts) != length(celdaClusters(celdaMod)$z)) {
                stop(
                    "The provided celda object was generated from a counts",
                    " matrix with a different number of cells than the one",
                    " provided."
                )
            }
        }
        celdaChecksum <- params(celdaMod)$countChecksum

        counts <- .processCounts(counts)
        # Checksums are generated in celdaGridSearch and model after processing
        count.md5 <- .createCountChecksum(counts)
        res <- isTRUE(count.md5 == celdaChecksum)
        if (res) {
            return(TRUE)
        }
        if (!res && errorOnMismatch) {
            stop(
                "There was a mismatch between the provided count matrix and",
                " the count matrix used to generate the provided celda result."
            )
        } else if (!res && !errorOnMismatch) {
            warning("There was a mismatch between the provided count matrix",
                " and the count matrix used to generate the provided celda",
                " result.")
            return(FALSE)
        }
    }
)


#' @rdname compareCountMatrix
#' @examples
#' data(celdaCGSim, celdaCGGridSearchRes)
#' compareCountMatrix(celdaCGSim$counts, celdaCGGridSearchRes,
#'     errorOnMismatch = FALSE)
#' @export
setMethod("compareCountMatrix",
    signature(celdaMod = "celdaList"),
    function(counts, celdaMod, errorOnMismatch = TRUE) {

        if ("y" %in% names(celdaMod@resList[[1]]@clusters)) {
            if (nrow(counts) != length(celdaMod@resList[[1]]@clusters$y)) {
                stop(
                    "The provided celda object was generated from a counts",
                    " matrix with a different number of features than the one",
                    " provided."
                )
            }
        }

        if ("z" %in% names(celdaMod@resList[[1]]@clusters)) {
            if (ncol(counts) != length(celdaMod@resList[[1]]@clusters$z)) {
                stop(
                    "The provided celda object was generated from a counts",
                    " matrix with a different number of cells than the one",
                    " provided."
                )
            }
        }
        celdaChecksum <- celdaMod@countChecksum

        counts <- .processCounts(counts)
        # Checksums are generated in celdaGridSearch and model after processing
        count.md5 <- .createCountChecksum(counts)
        res <- isTRUE(count.md5 == celdaChecksum)
        if (res) {
            return(TRUE)
        }
        if (!res && errorOnMismatch) {
            stop(
                "There was a mismatch between the provided count matrix and",
                " the count matrix used to generate the provided celda result."
            )
        } else if (!res && !errorOnMismatch) {
            warning("There was a mismatch between the provided count matrix",
                " and the count matrix used to generate the provided celda",
                " result.")
            return(FALSE)
        }
    }
)


.logMessages <- function(...,
                         sep = " ",
                         logfile = NULL,
                         append = FALSE,
                         verbose = TRUE) {
  if (isTRUE(verbose)) {
    if (!is.null(logfile)) {
      if (!is.character(logfile) || length(logfile) > 1) {
        stop(
          "The log file parameter needs to be a single character",
          " string."
        )
      }
      cat(paste(..., "\n", sep = sep),
        file = logfile,
        append = append
      )
    } else {
      message(paste(..., sep = sep))
    }
  }
}


#' @title Create a color palette
#' @description Generate a palette of `n` distinct colors.
#' @param n Integer. Number of colors to generate.
#' @param hues Character vector. Colors available from `colors()`. These will
#'  be used as the base colors for the clustering scheme in HSV. Different
#'  saturations and values will be generated for each hue. Default c("red",
#'  "cyan", "orange", "blue", "yellow", "purple", "green", "magenta").
#' @param saturationRange Numeric vector. A vector of length 2 denoting the
#'  saturation for HSV. Values must be in [0,1]. Default: c(0.25, 1).
#' @param valueRange Numeric vector. A vector of length 2 denoting the range
#'  of values for HSV. Values must be in [0,1]. Default: `c(0.5, 1)`.
#' @return A vector of distinct colors that have been converted to HEX from HSV.
#' @examples
#' colorPal <- distinctColors(6) # can be used in plotting functions
#' @importFrom grDevices colors
#' @importFrom grDevices rgb2hsv
#' @importFrom grDevices hsv
#' @export
distinctColors <- function(n,
                           hues = c(
                             "red",
                             "cyan",
                             "orange",
                             "blue",
                             "yellow",
                             "purple",
                             "green",
                             "magenta"
                           ),
                           saturationRange = c(0.7, 1),
                           valueRange = c(0.7, 1)) {
  if (!(all(hues %in% grDevices::colors()))) {
    stop(
      "Only color names listed in the 'color' function can be used in",
      " 'hues'"
    )
  }
  # Convert R colors to RGB and then to HSV color format
  huesHsv <- grDevices::rgb2hsv(grDevices::col2rgb(hues))
  # Calculate all combination of saturation/value pairs
  # Note that low saturation with low value (i.e. high darkness) is too dark
  # for all hues. Likewise, high saturation with high value (i.e. low
  # darkness) is hard to distinguish. Therefore, saturation and value are
  # set to be anticorrelated
  numVs <- ceiling(n / length(hues))
  s <- seq(
    from = saturationRange[1],
    to = saturationRange[2],
    length = numVs
  )
  v <- seq(
    from = valueRange[2],
    to = valueRange[1],
    length = numVs
  )
  # Create all combination of hues with saturation/value pairs
  list <- lapply(seq(numVs), function(x) {
    rbind(huesHsv[1, ], s[x], v[x])
  })
  newHsv <- do.call(cbind, list)
  # Convert to hex
  col <- grDevices::hsv(newHsv[1, ], newHsv[2, ], newHsv[3, ])
  return(col[seq(n)])
}


.processCounts <- function(counts) {
  counts <- as.matrix(counts)
  if (typeof(counts) != "integer") {
    counts <- round(counts)
    storage.mode(counts) <- "integer"
  }
  return(counts)
}


# Perform some simple checks on the counts matrix, to ensure celda modeling
# expectations are met
.validateCounts <- function(counts) {
  # And each row/column of the count matrix must have at least one count
  countRowSum <- rowSums(counts)
  countColSum <- colSums(counts)
  if (sum(countRowSum == 0) > 0 | sum(countColSum == 0) > 0) {
    stop(
      "Each row and column of the count matrix must have at least",
      " one count"
    )
  }
}


# Wrapper function, creates checksum for matrix.
# Feature names, cell names are not taken into account.
#' @importFrom digest digest
.createCountChecksum <- function(counts) {
  rownames(counts) <- NULL
  colnames(counts) <- NULL
  countChecksum <- digest::digest(counts, algo = "md5")
  return(countChecksum)
}


# Generate n random deviates from the Dirichlet function with shape parameters
# alpha. Adapted from gtools v3.5
.rdirichlet <- function(n, alpha) {
  l <- length(alpha)
  x <- matrix(stats::rgamma(l * n, alpha),
    ncol = l,
    byrow = TRUE
  )
  # Check for case where all sampled entries are zero due to round off
  # One entry will be randomly chosen to be one
  isZero <- rowSums(x) == 0
  assignment <- sample(seq(l), size = sum(isZero), replace = TRUE)
  x[cbind(which(isZero), assignment)] <- 1
  # Normalize
  sm <- x %*% rep(1, l)
  y <- x / as.vector(sm)
  return(y)
}


# Make sure provided sample labels are the right type,
# or generate some if none were provided
.processSampleLabels <- function(sampleLabel, numCells) {
  if (is.null(sampleLabel)) {
    sampleLabel <- as.factor(rep("Sample_1", numCells))
  } else {
    if (length(sampleLabel) != numCells) {
      stop(
        "'sampleLabel' must be the same length as the number of",
        " columns in the 'counts' matrix."
      )
    }
  }

  if (!is.factor(sampleLabel)) {
    sampleLabel <- as.factor(sampleLabel)
  }
  return(sampleLabel)
}


#' @title Output a feature module table
#' @description Creates a table that contains the list of features in
#'  each feature module.
#' @param sce A \linkS4class{SingleCellExperiment} object returned by
#'  \link{celda_G}, or \link{celda_CG}, with the matrix
#'  located in the \code{useAssay} assay slot.
#'  Rows represent features and columns represent cells.
#' @param useAssay A string specifying which \link{assay}
#'  slot to use. Default "counts".
#' @param altExpName The name for the \link{altExp} slot
#'  to use. Default "featureSubset".
#' @param outputFile File name for feature module table. If NULL, file will
#'  not be created. Default NULL.
#' @return Matrix. Contains a list of features per each column (feature module)
#' @examples
#' data(sceCeldaCG)
#' featureModuleTable(sceCeldaCG)
#' @importFrom stringi stri_list2matrix
#' @export
featureModuleTable <- function(sce, useAssay = "counts",
    altExpName = "featureSubset", outputFile = NULL) {

  factorizeMatrix <- factorizeMatrix(sce,
      useAssay = useAssay,
      altExpName = altExpName,
      type = "proportion")
  allGenes <- topRank(factorizeMatrix$proportions$module, n = nrow(sce))
  res <- as.data.frame(stringi::stri_list2matrix(allGenes$names))
  res <- apply(res, c(1, 2), function(x) {
    if (is.na(x)) {
      return("")
    } else {
      return(x)
    }
  })
  colnames(res) <- gsub(
    pattern = "V",
    replacement = "L",
    x = colnames(res)
  )
  if (is.null(outputFile)) {
    return(res)
  } else {
    utils::write.table(
      res,
      file = outputFile,
      sep = "\t",
      row.names = FALSE,
      quote = FALSE
    )
  }
}


#' @title Retrieve row index for a set of features
#' @description This will return indices of features among the rownames
#' or rowData of a data.frame, matrix, or a \linkS4class{SummarizedExperiment}
#' object including a \linkS4class{SingleCellExperiment}.
#' Partial matching (i.e. grepping) can be used by setting
#' \code{exactMatch = FALSE}.
#' @param features Character vector of feature names to find in the rows of
#' \code{x}.
#' @param x A data.frame, matrix, or \linkS4class{SingleCellExperiment}
#' object to search.
#' @param by Character. Where to search for features in \code{x}. If set to
#' \code{"rownames"} then the features will be searched for among
#' \code{rownames(x)}. If \code{x} inherits from class
#' \linkS4class{SummarizedExperiment}, then \code{by} can be one of the
#' fields in the row annotation data.frame (i.e. one of
#' \code{colnames(rowData(x))}).
#' @param exactMatch Boolean. Whether to only identify exact matches
#' or to identify partial matches using \code{\link{grep}}.
#' @param removeNA Boolean. If set to \code{FALSE}, features not found in
#' \code{x} will be given \code{NA} and the returned vector will be the same
#' length as \code{features}. If set to \code{TRUE}, then the \code{NA}
#' values will be removed from the returned vector. Default \code{FALSE}.
#' @return A vector of row indices for the matching features in \code{x}.
#' @author Yusuke Koga, Joshua Campbell
#' @seealso '\link[scater]{retrieveFeatureInfo}' from package \code{'scater'}
#' and \code{link{regex}} for how to use regular expressions when
#' \code{exactMatch = FALSE}.
#' @examples
#' data(celdaCGSim)
#' retrieveFeatureIndex(c("Gene_1", "Gene_5"), celdaCGSim$counts)
#' retrieveFeatureIndex(c("Gene_1", "Gene_5"), celdaCGSim$counts,
#'                                             exactMatch = FALSE)
#' @export
retrieveFeatureIndex <- function(features,
                                 x,
                                 by = "rownames",
                                 exactMatch = TRUE,
                                 removeNA = FALSE) {

  # Extract vector to search through
  if (by == "rownames") {
    if (is.null(rownames(x))) {
      stop("'rownames' of 'x' are 'NULL'. Please set 'rownames' or change",
           " 'by' to search a different column in 'x'.")
    }
    search <- rownames(x)
  } else if (length(ncol(x)) > 0) {
    if (inherits(x, "SummarizedExperiment")) {
      if (!(by %in% colnames(rowData(x)))) {
        stop("'by' is not a column in 'rowData(x)'.")
      }
      search <- rowData(x)[, by]
    } else {
      if (!(by %in% colnames(x))) {
        stop("'by' is not a column in 'x'.")
      }
      search <- x[, by]
    }
  } else {
    search <- as.character(x)
  }

  # Match each element of 'pattern' in vector 'search'
  if (!isTRUE(exactMatch)) {
    featuresIndices <- rep(NA, length(features))
    for (i in seq_along(features)) {
      g <- grep(features[i], search)
      if (length(g) == 1) {
        featuresIndices[i] <- g
      } else if (length(g) > 1) {
        warning(
          "Feature '", features[i], "' matched multiple items in '",
          by, "': ", paste(search[g], collapse = ","),
          ". Only the first match will be selected."
        )
        featuresIndices[i] <- g[1]
      }
    }
  } else {
    featuresIndices <- match(features, search)
  }

  if (sum(is.na(featuresIndices)) > 0) {
    if (sum(is.na(featuresIndices)) == length(features)) {
      if (isTRUE(exactMatch)) {
        stop(
          "None of the provided features had matching",
          " items in '", by, "' within 'x'. ",
          "Check the spelling or try setting",
          " 'exactMatch = FALSE'."
        )
      } else {
        stop(
          "None of the provided features had matching",
          " items in '", by, "' within 'x'. ",
          "Check the spelling and make sure 'by' is set",
          " to the appropriate place in 'x'."
        )
      }
    }
    warning(
      "The following features were not present in 'x': ",
      paste(features[which(is.na(featuresIndices))],
        collapse = ","
      )
    )
  }

  if (isTRUE(removeNA)) {
    featuresIndices <- featuresIndices[!is.na(featuresIndices)]
  }
  return(featuresIndices)
}