#' Transform/normalize compartment calls using TF-IDF
#'
#' @details
#' This function and its helpers were modeled after or taken from:
#' - http://andrewjohnhill.com/images/posts/2019-5-6-dimensionality-reduction-for-scatac-data/analysis.html
#' - https://divingintogeneticsandgenomics.rbind.io/post/clustering-scatacseq-data-the-tf-idf-way/
#'
#' @param mat n x p input matrix (n = samples/cells; p = compartments)
#' @param scale.factor Scaling factor for the term-frequency (TF)
#'
#' @return A TF-IDF transformed matrix of the same dimensions as the input
#'
#' @import Matrix
#'
#' @examples
#' m <- 1000
#' n <- 100
#' mat <- round(matrix(runif(m * n), m, n))
#' # Input needs to be a tall matrix
#' tfidf <- transformTFIDF(mat)
#'
#' @export
transformTFIDF <- function(mat, scale.factor = 1e5) {
if (!is(mat, "matrix") & !is(mat, "Matrix")) {
stop("Input needs to be a matrix.")
}
# binarize the matrix
# this assumes n x p matrix (e.g. a wide matrix)
# check and transpose as needed
# input matrix is tall
if (dim(mat)[1] > dim(mat)[2]) mat <- t(mat)
# make sparse
mat.binary <- Matrix(.binarizeMatrix(t(mat)), sparse = TRUE)
tf <- t(t(mat.binary) / Matrix::colSums(mat.binary)) # compute term-frequency
tf@x <- log1p(tf@x * scale.factor) # scale
idf <- log(1 + ncol(mat.binary) / Matrix::rowSums(mat.binary)) # inverse-document frequency smooth
tfidf <- .tfidf(tf, idf) # transform
# cast back to a matrix since things like UMAP don't like sparse matrices
tfidf <- as.matrix(tfidf)
return(t(tfidf))
}
# helper function
# set positive values to 1 and negative to 0
# open chromatin in atac or RNA is 1
# closed chromatin in bisulfite or arrays is 1
# just associate 1 with signal
.binarizeMatrix <- function(mat) {
mat[mat > 0] <- 1
mat[mat < 0] <- 0
return(mat)
}
# helper function for TF-IDF transform
# modeled after http://andrewjohnhill.com/images/posts/2019-5-6-dimensionality-reduction-for-scatac-data/analysis.html
.tfidf <- function(tf, idf) {
tf <- t(tf)
tf@x <- tf@x * rep.int(idf, diff(tf@p))
tf <- t(tf)
return(tf)
}
#' Transform/normalize compartment calls using TF-IDF on HDF5-backed objects
#'
#' @name hdf5TFIDF
#'
#' @param h5 SummarizedExperiment object, DelayedMatrix, or a normal matrix
#' @param scale.factor Scaling factor for the term-frequency (TF)
#' @param return.dense Whether to return a dense, in memory matrix
#' @param return.se Whether to return the TF-IDF matrix as a new assay in the SummarizedExperiment
#'
#' @return A TF-IDF transformed matrix of the same dimensions as the input
#' @import Matrix
#' @import DelayedMatrixStats
#' @import DelayedArray
#' @import HDF5Array
#' @importFrom methods as is
#'
#' @examples
#'
#' m <- 1000
#' n <- 100
#' mat <- round(matrix(runif(m * n), m, n))
#' # Input needs to be a tall matrix
#' tfidf <- hdf5TFIDF(mat)
#'
#' @export
hdf5TFIDF <- function(h5, scale.factor = 1e5,
return.dense = FALSE,
return.se = FALSE) {
# binarze
if (is(h5, "SummarizedExperiment")) {
if (!is(assay(h5), "DelayedMatrix")) {
assay(h5) <- as(assay(h5), "HDF5Matrix") # coerce to hdf5 backing to work with any SE
}
assay(h5)[assay(h5) > 0] <- 1
h5.mat <- assay(h5) # tall matrix
}
if (is(h5, "DelayedMatrix")) {
# make the matrix tall if needed
if (dim(h5)[1] < dim(h5)[2]) h5 <- t(h5)
h5[h5 > 0] <- 1
h5.mat <- h5
}
if (is(h5, "matrix")) {
h5[h5 > 0] <- 1
h5.mat <- as(h5, "HDF5Matrix")
}
message("Computing term frequency.")
tf <- t(t(h5.mat) / DelayedMatrixStats::colSums2(h5.mat)) # term frequency
tf <- log1p(tf * scale.factor) # scale
message("Computing inverse document frequency.")
idf <- log(1 + ncol(h5.mat) / DelayedMatrixStats::rowSums2(h5.mat)) # inverse document frequency
# TODO: fix this ugliness...
tf.mat <- as.matrix(tf)
tf.sparse <- Matrix(tf.mat, sparse = TRUE) # cast the tf matrix back to a sparse matrix
tf.sparse <- t(tf.sparse) # transpose for TF-IDF
message("TF-IDF")
tf.sparse@x <- tf.sparse@x * rep.int(idf, diff(tf.sparse@p)) # TF-IDF applied
tf.sparse <- t(tf.sparse) # transpose again
# coerce back to dense matrix
if (return.dense) {
message("WARNING: This might blow up!")
message("If you get a cholmod error: problem too large, set return.dense to FALSE.")
message("You will get a sparse matrix returned instead.")
return(as.matrix(tf.sparse))
}
if (return.se) {
if (!is(h5, "SummarizedExperiment")) {
return(tf.sparse)
}
message("Returning the TF-IDF matrix into the SummarizedExperiment.")
assays(h5)$tfidf <- tf.sparse
return(h5)
}
return(tf.sparse)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.