#' Finds common features in a list of cds objects
#'
#' @description Machine learning algorithms often require features to be the same across
#' datasets. Thisfunction finds common features between a list of cell data set objects and
#' returns a list of cds's that have the same features. Note that this function uses rownames
#' of the 'fData' DataFrame to find the intersect of features common to all cds's
#'
#' @param cds_list Input cell_data_set object.
#' @export
common_features <- function(cds_list){
len<-length(cds_list)
common_features=vector()
for(i in 1:len){
if(i < 2){
common_features<-rownames(fData(cds_list[[i]]))
}else{
common_features<-unique(intersect(common_features, rownames(fData(cds_list[[i]]))))
}
}
for(i in 1:len){
cds_list[[i]]<-cds_list[[i]][match(common_features, rownames(cds_list[[i]])),]
}
return(cds_list)
}
#' Performs TF-IDF transformation on a cell_data_set
#'
#' @description Just like it sounds.
#'
#' @param cds_list Input cell_data_set object or sparse matrix.
#' @import Matrix
#' @export
tf_idf_transform <- function(input, method=1, verbose=T){
if(class(input)=="cell_data_set"){
mat<-exprs(input)
}else{
mat<-input
}
rn <- rownames(mat)
row_sums<-Matrix::rowSums(mat)
nz<-which(row_sums>0)
mat <- mat[nz,]
rn <- rn[nz]
row_sums <- row_sums[nz]
col_sums <- Matrix::colSums(mat)
#column normalize
mat <- t(t(mat)/col_sums)
if (method == 1) {
#Adapted from Casanovich et al.
if(verbose) message("Computing Inverse Document Frequency")
idf <- as(log(1 + ncol(mat) / row_sums), "sparseVector")
if(verbose) message("Computing TF-IDF Matrix")
mat <- as(Matrix::Diagonal(x = as.vector(idf)), "sparseMatrix") %*%
mat
}
else if (method == 2) {
#Adapted from Stuart et al.
if(verbose) message("Computing Inverse Document Frequency")
idf <- as( ncol(mat) / row_sums, "sparseVector")
if(verbose) message("Computing TF-IDF Matrix")
mat <- as(Matrix::Diagonal(x = as.vector(idf)), "sparseMatrix") %*%
mat
mat@x <- log(mat@x * scale_to + 1)
}else if (method == 3) {
mat@x <- log(mat@x + 1)
if(verbose) message("Computing Inverse Document Frequency")
idf <- as(log(1 + ncol(mat) /row_sums), "sparseVector")
if(verbose) message("Computing TF-IDF Matrix")
mat <- as(Matrix::Diagonal(x = as.vector(idf)), "sparseMatrix") %*%
mat
}else {
stop("LSIMethod unrecognized please select valid method!")
}
rownames(mat) <- rn
if(class(input)=="cell_data_set"){
input@assays$data$counts<-mat
return(input)
}else{
return(mat)
}
}
#' Performs TF-IDF transformation on a cell_data_set v2
#'
#' @description Just like it sounds but different.
#'
#' @param cds_list Input cell_data_set object or sparse matrix.
#' @import Matrix
#' @export
tf_idf_transform_v2 <- function(input){
if(class(input)=="cell_data_set"){
mat<-exprs(input)
}else{
mat<-input
}
colSm <- Matrix::colSums(mat)
rowSm <- Matrix::rowSums(mat)
freqs <- t(t(mat)/colSm)
idf <- as(log(1 + ncol(mat) / rowSm), "sparseVector")
tfidf <- as(Matrix::Diagonal(x=as.vector(idf)), "sparseMatrix") %*% freqs
tfidf@x[is.na(tfidf@x)] <- 0
if(class(input)=="cell_data_set"){
input@assays$data$counts<-tfidf
return(input)
}else{
return(tfidf)
}
}
#' @export
svd_lsi<-function(sp_mat, num_dim, mat_only=T){
svd <- irlba::irlba(sp_mat, num_dim, num_dim)
svdDiag <- matrix(0, nrow=num_dim, ncol=num_dim)
diag(svdDiag) <- svd$d
matSVD <- t(svdDiag %*% t(svd$v))
rownames(matSVD) <- colnames(sp_mat)
colnames(matSVD) <- seq_len(ncol(matSVD))
if(mat_only){
return(matSVD)
}else{
return(list(matSVD=matSVD, svd=svd))
}
}
Noisify <- function(data, amount=0.0001) {
if (is.vector(data)) {
noise <- runif(length(data), -amount, amount)
noisified <- data + noise
} else {
length <- dim(data)[1] * dim(data)[2]
noise <- matrix(runif(length, -amount, amount), dim(data)[1])
noisified <- data + noise
}
return(noisified)
}
#' Detects genes above minimum threshold.
#'
#' @description For each gene in a cell_data_set object, detect_genes counts
#' how many cells are expressed above a minimum threshold. In addition, for
#' each cell, detect_genes counts the number of genes above this threshold that
#' are detectable. Results are added as columns num_cells_expressed and
#' num_genes_expressed in the rowData and colData tables respectively.
#'
#' @param cds Input cell_data_set object.
#' @param min_expr Numeric indicating expression threshold
#' @param exprs_bin Boolean whether to bin genes by mean expression
#' @param exprs_cuts Numeic indicating number of bins if using exprs_bin
#' @return Updated cell_data_set object
#' @importFrom Hmisc cut2
#' @export
detect_genes <- function(cds, min_expr=0, exprs_bin=TRUE, exprs_cuts=25){
assertthat::assert_that(methods::is(cds, "cell_data_set"))
assertthat::assert_that(is.numeric(min_expr))
rowData(cds)$num_cells_expressed <- Matrix::rowSums(SingleCellExperiment::counts(cds) > min_expr)
colData(cds)$num_genes_expressed <- Matrix::colSums(SingleCellExperiment::counts(cds) > min_expr)
if(exprs_bin){
fData(cds)$exprs_bin = cut2(log(Matrix::rowMeans(normalized_counts(cds))), m=floor(nrow(fData(cds))/exprs_cuts))
}
cds
}
#' Convert a GMT File to a ragged list of genes
#'
#' @description GMT Files (See MSigDB) are a convenient means of storing genesets
#'
#' @param GMTfn GMT filename
#' @export
GMT_to_list<-function (GMTfn)
{
file.data <- readLines(GMTfn)
num.lines <- length(file.data)
output <- vector("list", num.lines)
for (i in 1:num.lines) {
vec <- unlist(strsplit(file.data[i], "\t"))
output[[i]] <- vec[3:length(vec)]
names(output)[i] <- vec[1]
}
return(output)
}
#' @export
bimodality_coefficient<-function(x, finite=TRUE,...){
if(finite==TRUE){
G=skewness(x,finite)
sample.excess.kurtosis=kurtosis(x,finite)
K=sample.excess.kurtosis
n=length(x)
B=((G^2)+1)/(K+ ((3*((n-1)^2))/((n-2)*(n-3))))
}
else{
G=skewness(x,FALSE)
K=kurtosis(x,FALSE)
B=((G^2)+1)/(K)
}
return(B)
}
#' @export
skewness<-function(x, finite=TRUE){
n=length(x)
S=(1/n)*sum((x-mean(x))^3)/(((1/n)*sum((x-mean(x))^2))^1.5)
if(finite==FALSE){
S=S
}else{
S=S*(sqrt(n*(n-1)))/(n-2)
}
return(S)
}
#' @export
kurtosis<-function(x, finite){
n=length(x)
K=(1/n)*sum((x-mean(x))^4)/(((1/n)*sum((x-mean(x))^2))^2) - 3
if(finite==FALSE){
K=K
}
else{
K=((n-1)*((n+1)*K - 3*(n-1))/((n-2)*(n-3))) +3
}
return(K)
}
#' @export
lighten_darken_color<-function(col, amt) {
if (substring(col, 1, 1)=="#") {
col = substring(col, 2)
}
num = as.hexmode(col)
r = bitwShiftR(num, 16) + amt
if (r > 255) {r = 255}
if (r < 0) {r = 0}
b = bitwAnd(bitwShiftR(num, 8), 0x00FF) + amt
if (b > 255) {b = 255}
if (b < 0) {b = 0}
g = bitwAnd(num, 0x0000FF) + amt
if (g > 255) {g = 255}
if (g < 0) {g = 0}
inter<-paste("000000", as.hexmode(bitwOr(g , bitwOr(bitwShiftL(b, 8), bitwShiftL(r, 16)))), sep="")
ret<-substr(inter, nchar(inter)-5, nchar(inter))
return(toupper(paste("#", ret, sep="")))
}
#' @export
mean_gene_expression<-function (cds, markers, group_cells_by = "cluster", reduction_method = "UMAP",
norm_method = c("log", "size_only"), lower_threshold = 0,
max.size = 10, ordering_type = c("cluster_row_col", "maximal_on_diag",
"none"), axis_order = c("group_marker", "marker_group"),
flip_percentage_mean = FALSE, pseudocount = 1, scale_max = 3,
scale_min = -3)
{
assertthat::assert_that(methods::is(cds, "cell_data_set"))
if (!is.null(group_cells_by)) {
assertthat::assert_that(group_cells_by %in% c("cluster",
"partition") | group_cells_by %in% names(colData(cds)),
msg = paste("group_cells_by must be a column in",
"the colData table."))
}
assertthat::assert_that("gene_short_name" %in% names(rowData(cds)),
msg = paste("This function requires a gene_short_name",
"column in your rowData. If you do not have", "gene symbols, you can use other gene ids",
"by running", "rowData(cds)$gene_short_name <- row.names(rowData(cds))"))
norm_method = match.arg(norm_method)
gene_ids = as.data.frame(fData(cds)) %>% tibble::rownames_to_column() %>%
dplyr::filter(rowname %in% markers | gene_short_name %in%
markers) %>% dplyr::pull(rowname)
if (length(gene_ids) < 1)
stop(paste("Please make sure markers are included in the gene_short_name\",\n \"column of the rowData!"))
if (flip_percentage_mean == FALSE) {
major_axis <- 1
minor_axis <- 2
}
else if (flip_percentage_mean == TRUE) {
major_axis <- 2
minor_axis <- 1
}
exprs_mat <- t(as.matrix(exprs(cds)[gene_ids, ]))
exprs_mat <- reshape2::melt(exprs_mat)
colnames(exprs_mat) <- c("Cell", "Gene", "Expression")
exprs_mat$Gene <- as.character(exprs_mat$Gene)
if (group_cells_by == "cluster") {
cell_group <- tryCatch({
clusters(cds, reduction_method = reduction_method)
}, error = function(e) {
NULL
})
}
else if (group_cells_by == "partition") {
cell_group <- tryCatch({
partitions(cds, reduction_method = reduction_method)
}, error = function(e) {
NULL
})
}
else {
cell_group <- colData(cds)[, group_cells_by]
}
if (length(unique(cell_group)) < 2) {
stop(paste("Only one type in group_cells_by. To use plot_genes_by_group,",
"please specify a group with more than one type. "))
}
names(cell_group) = colnames(cds)
exprs_mat$Group <- cell_group[exprs_mat$Cell]
exprs_mat = exprs_mat %>% dplyr::filter(is.na(Group) == FALSE)
ExpVal <- exprs_mat %>% dplyr::group_by(Group, Gene) %>%
dplyr::summarize(mean = mean(log(Expression + pseudocount)),
percentage = sum(Expression > lower_threshold)/length(Expression))
ExpVal$mean <- ifelse(ExpVal$mean < scale_min, scale_min,
ExpVal$mean)
ExpVal$mean <- ifelse(ExpVal$mean > scale_max, scale_max,
ExpVal$mean)
ExpVal$Gene <- fData(cds)[ExpVal$Gene, "gene_short_name"]
res <- reshape2::dcast(ExpVal[, 1:4], Group ~ Gene, value.var = colnames(ExpVal)[2 +
major_axis])
group_id <- res[, 1]
out<-t(res[, -1])
rownames(out)<-colnames(res[, -1])
colnames(out)<-res$Group
out
}
plot_genes_by_group<-function (cds, markers, group_cells_by = "cluster", reduction_method = "UMAP",
norm_method = c("log", "size_only"), lower_threshold = 0,
max.size = 10, ordering_type = c("cluster_row_col", "maximal_on_diag",
"none"), axis_order = c("group_marker", "marker_group"),
flip_percentage_mean = FALSE, scale_row=T, pseudocount = 1, scale_max = 3,
scale_min = -3)
{
assertthat::assert_that(methods::is(cds, "cell_data_set"))
if (!is.null(group_cells_by)) {
assertthat::assert_that(group_cells_by %in% c("cluster",
"partition") | group_cells_by %in% names(colData(cds)),
msg = paste("group_cells_by must be a column in",
"the colData table."))
}
assertthat::assert_that("gene_short_name" %in% names(rowData(cds)),
msg = paste("This function requires a gene_short_name",
"column in your rowData. If you do not have", "gene symbols, you can use other gene ids",
"by running", "rowData(cds)$gene_short_name <- row.names(rowData(cds))"))
norm_method = match.arg(norm_method)
gene_ids = as.data.frame(fData(cds)) %>% tibble::rownames_to_column() %>%
dplyr::filter(rowname %in% markers | gene_short_name %in%
markers) %>% dplyr::pull(rowname)
if (length(gene_ids) < 1)
stop(paste("Please make sure markers are included in the gene_short_name\",\n \"column of the rowData!"))
if (flip_percentage_mean == FALSE) {
major_axis <- 1
minor_axis <- 2
}
else if (flip_percentage_mean == TRUE) {
major_axis <- 2
minor_axis <- 1
}
exprs_mat <- t(as.matrix(exprs(cds)[gene_ids, ]))
exprs_mat <- reshape2::melt(exprs_mat)
colnames(exprs_mat) <- c("Cell", "Gene", "Expression")
exprs_mat$Gene <- as.character(exprs_mat$Gene)
if (group_cells_by == "cluster") {
cell_group <- tryCatch({
clusters(cds, reduction_method = reduction_method)
}, error = function(e) {
NULL
})
}
else if (group_cells_by == "partition") {
cell_group <- tryCatch({
partitions(cds, reduction_method = reduction_method)
}, error = function(e) {
NULL
})
}
else {
cell_group <- colData(cds)[, group_cells_by]
}
if (length(unique(cell_group)) < 2) {
stop(paste("Only one type in group_cells_by. To use plot_genes_by_group,",
"please specify a group with more than one type. "))
}
names(cell_group) = colnames(cds)
exprs_mat$Group <- cell_group[exprs_mat$Cell]
exprs_mat = exprs_mat %>% dplyr::filter(is.na(Group) == FALSE)
ExpVal <- exprs_mat %>% dplyr::group_by(Group, Gene) %>%
dplyr::summarize(mean = mean(log(Expression + pseudocount)),
percentage = sum(Expression > lower_threshold)/length(Expression))
ExpVal$mean <- ifelse(ExpVal$mean < scale_min, scale_min,
ExpVal$mean)
ExpVal$mean <- ifelse(ExpVal$mean > scale_max, scale_max,
ExpVal$mean)
ExpVal$Gene <- fData(cds)[ExpVal$Gene, "gene_short_name"]
res <- reshape2::dcast(ExpVal[, 1:4], Group ~ Gene, value.var = colnames(ExpVal)[2 +
major_axis])
group_id <- res[, 1]
res <- res[, -1]
if(scale_row){
res<-t(scale(t(res)))
}
row.names(res) <- group_id
if (ordering_type == "cluster_row_col") {
row_dist <- stats::as.dist((1 - stats::cor(t(res)))/2)
row_dist[is.na(row_dist)] <- 1
col_dist <- stats::as.dist((1 - stats::cor(res))/2)
col_dist[is.na(col_dist)] <- 1
ph <- pheatmap::pheatmap(res, useRaster = T, cluster_cols = TRUE,
cluster_rows = TRUE, show_rownames = F, show_colnames = F,
clustering_distance_cols = col_dist, clustering_distance_rows = row_dist,
clustering_method = "ward.D2", silent = TRUE, filename = NA)
ExpVal$Gene <- factor(ExpVal$Gene, levels = colnames(res)[ph$tree_col$order])
ExpVal$Group <- factor(ExpVal$Group, levels = row.names(res)[ph$tree_row$order])
}
else if (ordering_type == "maximal_on_diag") {
order_mat <- t(apply(res, major_axis, order))
max_ind_vec <- c()
for (i in 1:nrow(order_mat)) {
tmp <- max(which(!(order_mat[i, ] %in% max_ind_vec)))
max_ind_vec <- c(max_ind_vec, order_mat[i, tmp])
}
max_ind_vec <- max_ind_vec[!is.na(max_ind_vec)]
if (major_axis == 1) {
max_ind_vec <- c(max_ind_vec, setdiff(1:length(markers),
max_ind_vec))
ExpVal$Gene <- factor(ExpVal$Gene, levels = dimnames(res)[[2]][max_ind_vec])
}
else {
max_ind_vec <- c(max_ind_vec, setdiff(1:length(unique(exprs_mat$Group)),
max_ind_vec))
ExpVal$Group <- factor(ExpVal$Group, levels = dimnames(res)[[1]][max_ind_vec])
}
}
else if (ordering_type == "none") {
ExpVal$Gene <- factor(ExpVal$Gene, levels = markers)
}
if (flip_percentage_mean) {
g <- ggplot(ExpVal, aes(y = Gene, x = Group)) + geom_point(aes(colour = percentage,
size = mean)) + viridis::scale_color_viridis(name = "percentage") +
scale_size(name = "log(mean + 0.1)", range = c(0,
max.size))
}
else {
g <- ggplot(ExpVal, aes(y = Gene, x = Group)) + geom_point(aes(colour = mean,
size = percentage)) + viridis::scale_color_viridis(name = "log(mean + 0.1)") +
scale_size(name = "percentage", range = c(0, max.size))
}
if (group_cells_by == "cluster") {
g <- g + xlab("Cluster")
}
else if (group_cells_by == "partition") {
g <- g + xlab("Partition")
}
else {
g <- g + xlab(group_cells_by)
}
g <- g + ylab("Gene") + monocle3:::monocle_theme_opts() + theme(axis.text.x = element_text(angle = 30,
hjust = 1))
if (axis_order == "marker_group") {
g <- g + coord_flip()
}
g
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.