R/utils.R

Defines functions lighten_darken_color kurtosis skewness bimodality_coefficient detect_genes Noisify svd_lsi tf_idf_transform_v2 tf_idf_transform common_features

Documented in common_features detect_genes tf_idf_transform tf_idf_transform_v2

#' Finds common features in a list of cds objects
#'
#' @description Machine learning algorithms often require features to be the same across 
#' datasets.  Thisfunction finds common features between a list of cell data set objects and 
#' returns a list of cds's that have the same features.  Note that this function uses rownames 
#' of the 'fData' DataFrame to find the intersect of features common to all cds's
#'
#' @param cds_list Input cell_data_set object.
#' @export
common_features <- function(cds_list){
  len<-length(cds_list)
  common_features=vector()
  for(i in 1:len){
    if(i < 2){
      common_features<-rownames(fData(cds_list[[i]]))
    }else{
      common_features<-unique(intersect(common_features, rownames(fData(cds_list[[i]]))))
    }
  }
  for(i in 1:len){
    cds_list[[i]]<-cds_list[[i]][match(common_features, rownames(cds_list[[i]])),]
  }
  return(cds_list)
}



#' Performs TF-IDF transformation on a cell_data_set
#'
#' @description Just like it sounds.
#'
#' @param cds_list Input cell_data_set object or sparse matrix.
#' @import Matrix
#' @export
tf_idf_transform <- function(input, method=1, verbose=T){
  if(class(input)=="cell_data_set"){
    mat<-exprs(input)
  }else{
    mat<-input
  }
  rn <- rownames(mat)
  row_sums<-Matrix::rowSums(mat)
  nz<-which(row_sums>0)
  mat <- mat[nz,]
  rn <- rn[nz]
  row_sums <- row_sums[nz]
  col_sums <- Matrix::colSums(mat)
  
  #column normalize
  mat <- t(t(mat)/col_sums)
  
  
  if (method == 1) {
    #Adapted from Casanovich et al.
    if(verbose) message("Computing Inverse Document Frequency")
    idf   <- as(log(1 + ncol(mat) / row_sums), "sparseVector")
    if(verbose) message("Computing TF-IDF Matrix")
    mat <- as(Matrix::Diagonal(x = as.vector(idf)), "sparseMatrix") %*% 
      mat
  }
  else if (method == 2) {
    #Adapted from Stuart et al.
    if(verbose) message("Computing Inverse Document Frequency")
    idf   <- as( ncol(mat) / row_sums, "sparseVector")
    if(verbose) message("Computing TF-IDF Matrix")
    mat <- as(Matrix::Diagonal(x = as.vector(idf)), "sparseMatrix") %*% 
      mat
    mat@x <- log(mat@x * scale_to + 1)
  }else if (method == 3) {
    mat@x <- log(mat@x + 1)
    if(verbose) message("Computing Inverse Document Frequency")
    idf <- as(log(1 + ncol(mat) /row_sums), "sparseVector")
    if(verbose) message("Computing TF-IDF Matrix")
    mat <- as(Matrix::Diagonal(x = as.vector(idf)), "sparseMatrix") %*% 
      mat
  }else {
    stop("LSIMethod unrecognized please select valid method!")
  }
  rownames(mat) <- rn
    if(class(input)=="cell_data_set"){
    input@assays$data$counts<-mat
    return(input)
  }else{
    return(mat)
  }
}

#' Performs TF-IDF transformation on a cell_data_set v2
#'
#' @description Just like it sounds but different.
#'
#' @param cds_list Input cell_data_set object or sparse matrix.
#' @import Matrix
#' @export
tf_idf_transform_v2 <- function(input){
  if(class(input)=="cell_data_set"){
    mat<-exprs(input)
  }else{
    mat<-input
  }
  colSm <- Matrix::colSums(mat)
  rowSm <- Matrix::rowSums(mat)
  freqs <- t(t(mat)/colSm)
  idf   <- as(log(1 + ncol(mat) / rowSm), "sparseVector")
  tfidf <- as(Matrix::Diagonal(x=as.vector(idf)), "sparseMatrix") %*% freqs
  tfidf@x[is.na(tfidf@x)] <- 0
  if(class(input)=="cell_data_set"){
    input@assays$data$counts<-tfidf
    return(input)
  }else{
    return(tfidf)
  }
}

#' @export
svd_lsi<-function(sp_mat, num_dim, mat_only=T){
  svd <- irlba::irlba(sp_mat, num_dim, num_dim)
  svdDiag <- matrix(0, nrow=num_dim, ncol=num_dim)
  diag(svdDiag) <- svd$d
  matSVD <- t(svdDiag %*% t(svd$v))
  rownames(matSVD) <- colnames(sp_mat)
  colnames(matSVD) <- seq_len(ncol(matSVD))
  if(mat_only){
    return(matSVD)
  }else{
    return(list(matSVD=matSVD, svd=svd))
  }
}


Noisify <- function(data, amount=0.0001) {
  if (is.vector(data)) {
    noise <- runif(length(data), -amount, amount)
    noisified <- data + noise
  } else {
    length <- dim(data)[1] * dim(data)[2]
    noise <- matrix(runif(length, -amount, amount), dim(data)[1])
    noisified <- data + noise
  }
  return(noisified)
}


#' Detects genes above minimum threshold.
#'
#' @description For each gene in a cell_data_set object, detect_genes counts
#' how many cells are expressed above a minimum threshold. In addition, for
#' each cell, detect_genes counts the number of genes above this threshold that
#' are detectable. Results are added as columns num_cells_expressed and
#' num_genes_expressed in the rowData and colData tables respectively.
#'
#' @param cds Input cell_data_set object.
#' @param min_expr Numeric indicating expression threshold
#' @param exprs_bin Boolean whether to bin genes by mean expression
#' @param exprs_cuts Numeic indicating number of bins if using exprs_bin
#' @return Updated cell_data_set object
#' @importFrom Hmisc cut2
#' @export
detect_genes <- function(cds, min_expr=0, exprs_bin=TRUE, exprs_cuts=25){
  assertthat::assert_that(methods::is(cds, "cell_data_set"))
  assertthat::assert_that(is.numeric(min_expr))
  
  rowData(cds)$num_cells_expressed <- Matrix::rowSums(SingleCellExperiment::counts(cds) > min_expr)
  colData(cds)$num_genes_expressed <- Matrix::colSums(SingleCellExperiment::counts(cds) > min_expr)
  if(exprs_bin){
    fData(cds)$exprs_bin = cut2(log(Matrix::rowMeans(normalized_counts(cds))), m=floor(nrow(fData(cds))/exprs_cuts))
  }
  cds
}


#' Convert a GMT File to a ragged list of genes
#'
#' @description GMT Files (See MSigDB) are a convenient means of storing genesets
#'
#' @param GMTfn GMT filename
#' @export
GMT_to_list<-function (GMTfn) 
{
  file.data <- readLines(GMTfn)
  num.lines <- length(file.data)
  output <- vector("list", num.lines)
  for (i in 1:num.lines) {
    vec <- unlist(strsplit(file.data[i], "\t"))
    output[[i]] <- vec[3:length(vec)]
    names(output)[i] <- vec[1]
  }
  return(output)
}

#' @export
bimodality_coefficient<-function(x, finite=TRUE,...){
  if(finite==TRUE){
    G=skewness(x,finite)
    sample.excess.kurtosis=kurtosis(x,finite)
    K=sample.excess.kurtosis
    n=length(x)
    B=((G^2)+1)/(K+ ((3*((n-1)^2))/((n-2)*(n-3))))
  }
  else{
    G=skewness(x,FALSE)
    K=kurtosis(x,FALSE)
    B=((G^2)+1)/(K)
  }
  return(B)
}

#' @export
skewness<-function(x, finite=TRUE){
  n=length(x)
  S=(1/n)*sum((x-mean(x))^3)/(((1/n)*sum((x-mean(x))^2))^1.5)
  if(finite==FALSE){
    S=S
  }else{
    S=S*(sqrt(n*(n-1)))/(n-2)
  }
  return(S)	
}

#' @export
kurtosis<-function(x, finite){
  n=length(x)
  K=(1/n)*sum((x-mean(x))^4)/(((1/n)*sum((x-mean(x))^2))^2) - 3
  if(finite==FALSE){
    K=K
  }
  else{
    K=((n-1)*((n+1)*K - 3*(n-1))/((n-2)*(n-3))) +3
  }
  return(K)	
}

#' @export

lighten_darken_color<-function(col, amt) {
  if (substring(col, 1, 1)=="#") {
    col = substring(col, 2)
  }
  num = as.hexmode(col)
  r = bitwShiftR(num, 16) + amt
  if (r > 255) {r = 255}
  if  (r < 0) {r = 0}
  b = bitwAnd(bitwShiftR(num, 8), 0x00FF) + amt
  if (b > 255) {b = 255}
  if  (b < 0) {b = 0}
  g = bitwAnd(num, 0x0000FF) + amt
  if (g > 255) {g = 255}
  if (g < 0) {g = 0}
  inter<-paste("000000", as.hexmode(bitwOr(g , bitwOr(bitwShiftL(b, 8), bitwShiftL(r, 16)))), sep="")
  ret<-substr(inter, nchar(inter)-5, nchar(inter))
  return(toupper(paste("#", ret, sep="")))
}

#' @export
mean_gene_expression<-function (cds, markers, group_cells_by = "cluster", reduction_method = "UMAP", 
          norm_method = c("log", "size_only"), lower_threshold = 0, 
          max.size = 10, ordering_type = c("cluster_row_col", "maximal_on_diag", 
                                           "none"), axis_order = c("group_marker", "marker_group"), 
          flip_percentage_mean = FALSE, pseudocount = 1, scale_max = 3, 
          scale_min = -3) 
{
  assertthat::assert_that(methods::is(cds, "cell_data_set"))
  if (!is.null(group_cells_by)) {
    assertthat::assert_that(group_cells_by %in% c("cluster", 
                                                  "partition") | group_cells_by %in% names(colData(cds)), 
                            msg = paste("group_cells_by must be a column in", 
                                        "the colData table."))
  }
  assertthat::assert_that("gene_short_name" %in% names(rowData(cds)), 
                          msg = paste("This function requires a gene_short_name", 
                                      "column in your rowData. If you do not have", "gene symbols, you can use other gene ids", 
                                      "by running", "rowData(cds)$gene_short_name <- row.names(rowData(cds))"))
  norm_method = match.arg(norm_method)
  gene_ids = as.data.frame(fData(cds)) %>% tibble::rownames_to_column() %>% 
    dplyr::filter(rowname %in% markers | gene_short_name %in% 
                    markers) %>% dplyr::pull(rowname)
  if (length(gene_ids) < 1) 
    stop(paste("Please make sure markers are included in the gene_short_name\",\n               \"column of the rowData!"))
  if (flip_percentage_mean == FALSE) {
    major_axis <- 1
    minor_axis <- 2
  }
  else if (flip_percentage_mean == TRUE) {
    major_axis <- 2
    minor_axis <- 1
  }
  exprs_mat <- t(as.matrix(exprs(cds)[gene_ids, ]))
  exprs_mat <- reshape2::melt(exprs_mat)
  colnames(exprs_mat) <- c("Cell", "Gene", "Expression")
  exprs_mat$Gene <- as.character(exprs_mat$Gene)
  if (group_cells_by == "cluster") {
    cell_group <- tryCatch({
      clusters(cds, reduction_method = reduction_method)
    }, error = function(e) {
      NULL
    })
  }
  else if (group_cells_by == "partition") {
    cell_group <- tryCatch({
      partitions(cds, reduction_method = reduction_method)
    }, error = function(e) {
      NULL
    })
  }
  else {
    cell_group <- colData(cds)[, group_cells_by]
  }
  if (length(unique(cell_group)) < 2) {
    stop(paste("Only one type in group_cells_by. To use plot_genes_by_group,", 
               "please specify a group with more than one type. "))
  }
  names(cell_group) = colnames(cds)
  exprs_mat$Group <- cell_group[exprs_mat$Cell]
  exprs_mat = exprs_mat %>% dplyr::filter(is.na(Group) == FALSE)
  ExpVal <- exprs_mat %>% dplyr::group_by(Group, Gene) %>% 
    dplyr::summarize(mean = mean(log(Expression + pseudocount)), 
                     percentage = sum(Expression > lower_threshold)/length(Expression))
  ExpVal$mean <- ifelse(ExpVal$mean < scale_min, scale_min, 
                        ExpVal$mean)
  ExpVal$mean <- ifelse(ExpVal$mean > scale_max, scale_max, 
                        ExpVal$mean)
  ExpVal$Gene <- fData(cds)[ExpVal$Gene, "gene_short_name"]
  res <- reshape2::dcast(ExpVal[, 1:4], Group ~ Gene, value.var = colnames(ExpVal)[2 + 
                                                                                     major_axis])
  group_id <- res[, 1]
  out<-t(res[, -1])
  rownames(out)<-colnames(res[, -1])
  colnames(out)<-res$Group
  out
}


plot_genes_by_group<-function (cds, markers, group_cells_by = "cluster", reduction_method = "UMAP", 
                               norm_method = c("log", "size_only"), lower_threshold = 0, 
                               max.size = 10, ordering_type = c("cluster_row_col", "maximal_on_diag", 
                                                                "none"), axis_order = c("group_marker", "marker_group"), 
                               flip_percentage_mean = FALSE, scale_row=T, pseudocount = 1, scale_max = 3, 
                               scale_min = -3) 
{
  assertthat::assert_that(methods::is(cds, "cell_data_set"))
  if (!is.null(group_cells_by)) {
    assertthat::assert_that(group_cells_by %in% c("cluster", 
                                                  "partition") | group_cells_by %in% names(colData(cds)), 
                            msg = paste("group_cells_by must be a column in", 
                                        "the colData table."))
  }
  assertthat::assert_that("gene_short_name" %in% names(rowData(cds)), 
                          msg = paste("This function requires a gene_short_name", 
                                      "column in your rowData. If you do not have", "gene symbols, you can use other gene ids", 
                                      "by running", "rowData(cds)$gene_short_name <- row.names(rowData(cds))"))
  norm_method = match.arg(norm_method)
  gene_ids = as.data.frame(fData(cds)) %>% tibble::rownames_to_column() %>% 
    dplyr::filter(rowname %in% markers | gene_short_name %in% 
                    markers) %>% dplyr::pull(rowname)
  if (length(gene_ids) < 1) 
    stop(paste("Please make sure markers are included in the gene_short_name\",\n               \"column of the rowData!"))
  if (flip_percentage_mean == FALSE) {
    major_axis <- 1
    minor_axis <- 2
  }
  else if (flip_percentage_mean == TRUE) {
    major_axis <- 2
    minor_axis <- 1
  }
  exprs_mat <- t(as.matrix(exprs(cds)[gene_ids, ]))
  exprs_mat <- reshape2::melt(exprs_mat)
  colnames(exprs_mat) <- c("Cell", "Gene", "Expression")
  exprs_mat$Gene <- as.character(exprs_mat$Gene)
  if (group_cells_by == "cluster") {
    cell_group <- tryCatch({
      clusters(cds, reduction_method = reduction_method)
    }, error = function(e) {
      NULL
    })
  }
  else if (group_cells_by == "partition") {
    cell_group <- tryCatch({
      partitions(cds, reduction_method = reduction_method)
    }, error = function(e) {
      NULL
    })
  }
  else {
    cell_group <- colData(cds)[, group_cells_by]
  }
  if (length(unique(cell_group)) < 2) {
    stop(paste("Only one type in group_cells_by. To use plot_genes_by_group,", 
               "please specify a group with more than one type. "))
  }
  names(cell_group) = colnames(cds)
  exprs_mat$Group <- cell_group[exprs_mat$Cell]
  exprs_mat = exprs_mat %>% dplyr::filter(is.na(Group) == FALSE)
  ExpVal <- exprs_mat %>% dplyr::group_by(Group, Gene) %>% 
    dplyr::summarize(mean = mean(log(Expression + pseudocount)), 
                     percentage = sum(Expression > lower_threshold)/length(Expression))
  ExpVal$mean <- ifelse(ExpVal$mean < scale_min, scale_min, 
                        ExpVal$mean)
  ExpVal$mean <- ifelse(ExpVal$mean > scale_max, scale_max, 
                        ExpVal$mean)
  ExpVal$Gene <- fData(cds)[ExpVal$Gene, "gene_short_name"]
  res <- reshape2::dcast(ExpVal[, 1:4], Group ~ Gene, value.var = colnames(ExpVal)[2 + 
                                                                                     major_axis])
  group_id <- res[, 1]
  res <- res[, -1]
  if(scale_row){
    res<-t(scale(t(res)))
  }
  row.names(res) <- group_id
  if (ordering_type == "cluster_row_col") {
    row_dist <- stats::as.dist((1 - stats::cor(t(res)))/2)
    row_dist[is.na(row_dist)] <- 1
    col_dist <- stats::as.dist((1 - stats::cor(res))/2)
    col_dist[is.na(col_dist)] <- 1
    ph <- pheatmap::pheatmap(res, useRaster = T, cluster_cols = TRUE, 
                             cluster_rows = TRUE, show_rownames = F, show_colnames = F, 
                             clustering_distance_cols = col_dist, clustering_distance_rows = row_dist, 
                             clustering_method = "ward.D2", silent = TRUE, filename = NA)
    ExpVal$Gene <- factor(ExpVal$Gene, levels = colnames(res)[ph$tree_col$order])
    ExpVal$Group <- factor(ExpVal$Group, levels = row.names(res)[ph$tree_row$order])
  }
  else if (ordering_type == "maximal_on_diag") {
    order_mat <- t(apply(res, major_axis, order))
    max_ind_vec <- c()
    for (i in 1:nrow(order_mat)) {
      tmp <- max(which(!(order_mat[i, ] %in% max_ind_vec)))
      max_ind_vec <- c(max_ind_vec, order_mat[i, tmp])
    }
    max_ind_vec <- max_ind_vec[!is.na(max_ind_vec)]
    if (major_axis == 1) {
      max_ind_vec <- c(max_ind_vec, setdiff(1:length(markers), 
                                            max_ind_vec))
      ExpVal$Gene <- factor(ExpVal$Gene, levels = dimnames(res)[[2]][max_ind_vec])
    }
    else {
      max_ind_vec <- c(max_ind_vec, setdiff(1:length(unique(exprs_mat$Group)), 
                                            max_ind_vec))
      ExpVal$Group <- factor(ExpVal$Group, levels = dimnames(res)[[1]][max_ind_vec])
    }
  }
  else if (ordering_type == "none") {
    ExpVal$Gene <- factor(ExpVal$Gene, levels = markers)
  }
  if (flip_percentage_mean) {
    g <- ggplot(ExpVal, aes(y = Gene, x = Group)) + geom_point(aes(colour = percentage, 
                                                                   size = mean)) + viridis::scale_color_viridis(name = "percentage") + 
      scale_size(name = "log(mean + 0.1)", range = c(0, 
                                                     max.size))
  }
  else {
    g <- ggplot(ExpVal, aes(y = Gene, x = Group)) + geom_point(aes(colour = mean, 
                                                                   size = percentage)) + viridis::scale_color_viridis(name = "log(mean + 0.1)") + 
      scale_size(name = "percentage", range = c(0, max.size))
  }
  if (group_cells_by == "cluster") {
    g <- g + xlab("Cluster")
  }
  else if (group_cells_by == "partition") {
    g <- g + xlab("Partition")
  }
  else {
    g <- g + xlab(group_cells_by)
  }
  g <- g + ylab("Gene") + monocle3:::monocle_theme_opts() + theme(axis.text.x = element_text(angle = 30, 
                                                                                  hjust = 1))
  if (axis_order == "marker_group") {
    g <- g + coord_flip()
  }
  g
}

  
  
  
scfurl/viewmaster documentation built on Nov. 22, 2022, 2:59 p.m.