# Jake Yeung
# Date of Creation: 2019-07-22
# File: ~/projects/scchic_gastru/scripts/Rfunctions/LSI.R
# Functions for LSI
# Main functions ----------------------------------------------------------
require(irlba)
require(hash)
require(igraph)
require(Matrix)
NormalizeMatrix <- function(count.mat, tf.method = "sums", idf.method = "inverselog", remove.empty.features = TRUE, remove.empty.cells = TRUE, use.sweep = TRUE){
if (remove.empty.cells){
ncols.orig <- ncol(count.mat)
cols.keep <- which(Matrix::colSums(count.mat) > 0)
count.mat <- count.mat[, cols.keep]
if (ncols.orig - length(cols.keep) > 0){
print(paste("Removing", ncols.orig - length(cols.keep), "columns. They are empty"))
}
}
if (remove.empty.features){
# remove empty features
nrows.orig <- nrow(count.mat)
rows.keep <- which(Matrix::rowSums(count.mat) > 0)
count.mat <- count.mat[rows.keep, ]
if (nrows.orig - length(rows.keep) > 0){
print(paste("Removing", nrows.orig - length(rows.keep), "rows. They are empty"))
}
}
tf <- TermFreq(count.mat, method = tf.method)
idf <- InverseDocFreq(count.mat, method = idf.method)
if (use.sweep){
count.mat.norm <- sweep(count.mat,
MARGIN = 2,
STATS = tf,
FUN = "*")
# step 2: row transform (inverse document frequency)
count.mat.norm <- sweep(count.mat.norm,
MARGIN = 1,
# STATS = log10(1 + ncol(count.mat) / Matrix::rowSums(count.mat)),
STATS = idf,
FUN = "*")
} else {
count.mat.norm <- count.mat * tf
count.mat.norm <- count.mat.norm * idf
}
return(count.mat.norm)
}
RunLSI <- function(count.mat, tf.method = "sums", idf.method = "inverselog", n.components = 50, .log = FALSE, .center = FALSE, .truncated = TRUE){
# rows are genes, columns are cells
count.mat.norm <- NormalizeMatrix(count.mat, tf.method, idf.method, remove.empty.features = TRUE)
# # remove empty features
# rows.keep <- which(Matrix::rowSums(count.mat) > 0)
# count.mat <- count.mat[rows.keep, ]
#
#
# tf <- TermFreq(count.mat, method = tf.method)
# idf <- InverseDocFreq(count.mat, method = idf.method)
#
# count.mat.norm <- sweep(count.mat,
# MARGIN = 2,
# STATS = tf,
# FUN = "*")
# # step 2: row transform (inverse document frequency)
# count.mat.norm <- sweep(count.mat.norm,
# MARGIN = 1,
# # STATS = log10(1 + ncol(count.mat) / Matrix::rowSums(count.mat)),
# STATS = idf,
# FUN = "*")
if (.log){
count.mat.norm <- log2(count.mat.norm * 10^6 + 1)
}
if (.center){
count.mat.norm <- sweep(count.mat.norm,
MARGIN = 1,
STATS = Matrix::rowMeans(count.mat.norm),
FUN = "-")
}
if (.truncated){
system.time(
count.svd <- irlba(A = t(count.mat.norm), nv = n.components, scale = FALSE, center = FALSE)
)
} else {
system.time(
count.svd <- svd(x = t(count.mat.norm), nu = n.components, nv = n.components)
)
}
rownames(count.svd$u) <- colnames(count.mat.norm)
rownames(count.svd$v) <- rownames(count.mat.norm)
return(count.svd)
}
UmapLSIOutput <- function(count.svd, jsettings, cellnames){
# umap the outputs
if (missing(jsettings)){
jsettings <- umap::umap.defaults
}
if (missing(cellnames)){
cellnames <- rownames(count.svd$u)
}
umap.out <- umap::umap(count.svd$u, config = jsettings)
# return as tidy dataframe
umap.dat <- data.frame(cell = cellnames, umap1 = umap.out$layout[, 1], umap2 = umap.out$layout[, 2], stringsAsFactors = FALSE)
return(umap.dat)
}
DoLouvain <- function(topics.mat, custom.settings.louv, dat.umap.long = NULL){
# Do Louvain for clustering
dat.umap.louv <- umap(topics.mat, config = custom.settings.louv)
dat.umap.louv.long <- data.frame(umap1 = dat.umap.louv$layout[, 1], umap2 = dat.umap.louv$layout[, 2], cell = rownames(dat.umap.louv$layout),
stringsAsFactors = FALSE)
cell.indx <- hash::hash(rownames(dat.umap.louv$knn$indexes), dat.umap.louv$knn$indexes[, 1])
cell.indx.rev <- hash(dat.umap.louv$knn$indexes[, 1], rownames(dat.umap.louv$knn$indexes))
nr <- nrow(dat.umap.louv$knn$indexes)
nc <- ncol(dat.umap.louv$knn$indexes)
edgelist <- matrix(NA, nrow = nr * nc, ncol = 2)
colnames(edgelist) <- c("from", "to")
for (vertex.i in seq(nr)){
istart <- nc*(vertex.i - 1)+1
iend <- nc*vertex.i
edgelist[istart : iend, 1] <- cell.indx.rev[[as.character(vertex.i)]]
edgelist[istart : iend, 2] <- sapply(dat.umap.louv$knn$indexes[vertex.i, 1:nc], function(x) cell.indx.rev[[as.character(x)]])
# edgelist[istart : iend, 3] <- 1 / (dat.umap$knn$distances[vertex.i, 1:nc] + 0.1)
}
g <- graph_from_data_frame(edgelist, directed=FALSE)
g.out <- cluster_louvain(g, weights = NULL)
V(g)$color <- g.out$membership
clstr <- hash(g.out$names, g.out$membership)
if (is.data.frame(dat.umap.long)){
dat.umap.long$louvain <- as.character(sapply(dat.umap.long$cell, function(x) clstr[[as.character(x)]]))
} else {
dat.umap.long <- clstr
}
return(dat.umap.long)
}
# Helper functions --------------------------------------------------------
TermFreq <- function(x, method = "sums"){
if (method == "sums"){
jstats <- Matrix::colSums(x)
}
return(1 / jstats) # multiply this to matrix
}
InverseDocFreq <- function(x, method = "inverselog"){
if (method == "inverselog"){
jstats <- log10(1 + ncol(x) / Matrix::rowSums(x))
}
return(jstats) # multiply this to matrix
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.