R/ROGUE.R

Defines functions wrapperROGUE DetermineK rogue.boxplot rogue ent.toli CalculateRogue matr.filter SEplot SE_fun entropy_fit Entropy

Documented in CalculateRogue DetermineK Entropy entropy_fit ent.toli matr.filter rogue rogue.boxplot SE_fun SEplot wrapperROGUE

#' Calculate the expression entropy of each gene
#' @name Entropy
#' @usage Entropy(expr, r = 1)
#' @param expr The expression matrix. Rows should be genes and columns should be cells.
#' @param r A small fixed value to avoid log(0) of mean gene expression levels. The default value of r is set to 1, but can also be set to other values such as 0.1 and 0.01.
#'
#' @return A tibble object with three columns 'Gene', 'mean.expr' and 'entropy'.
#' @export
#'
#' @examples
Entropy <- function(expr, r = 1){
  tmp <- log(expr+1)
  entropy <- Matrix::rowMeans(tmp)
  mean.expr <- log(Matrix::rowMeans(expr)+r)

  ent_res <- tibble(
    Gene = rownames(expr),
    mean.expr = mean.expr,
    entropy = entropy
  )

  return(ent_res)
}


#' Fit the relationship between expression entropy and mean gene expression
#' @name entropy_fit
#' @description Fit the relationship between expression entropy and mean gene expression using loess regression.
#' @usage entropy_fit(.x, span = 0.5, mt.method = c("fdr","BH"))
#' @param .x A tibble object returned from the Entropy function.
#' @param span The parameter α which controls the degree of smoothing.
#' @param mt.method The multiple testing method used in p.adjust.
#' @return A tibble object with six columns.
#' @export
#'
#' @examples ent.res <- Entropy(expr)
#' ent.res <- entropy_fit(ent.res, span = 0.3, mt.method = "fdr")
entropy_fit <- function(.x, span = 0.5, mt.method = "fdr"){
  .x <- .x %>% dplyr::filter(is.finite(mean.expr)) %>% dplyr::filter(entropy > 0)
  fit <- loess(entropy~mean.expr, data = .x, span=span)
  prd <- predict(fit, .x$mean.expr)
  .x %>%
    dplyr::mutate(fit = prd) %>%
    dplyr::mutate(ds = fit - entropy) %>%
    dplyr::mutate(pv = 1-pnorm(.$ds, mean = mean(.$ds), sd = sd(.$ds))) %>%
    dplyr::filter(pv > 0.1) -> tmp

  fit <- loess(entropy~mean.expr, data = tmp, span=span)
  prd <- predict(fit, .x$mean.expr)
  .x %>%
    dplyr::mutate(fit = prd) %>%
    dplyr::mutate(ds = fit - entropy) %>%
    dplyr::filter(is.finite(ds)) %>%
    dplyr::mutate(pv = 1-pnorm(.$ds, mean = mean(.$ds), sd = sd(.$ds))) %>%
    dplyr::filter(pv > 0.1) -> tmp

  fit <- loess(entropy~mean.expr, data = tmp, span=span)
  prd <- predict(fit, .x$mean.expr)

  .x %>%
    dplyr::mutate(fit = prd) %>%
    dplyr::mutate(ds = fit - entropy) %>%
    dplyr::filter(is.finite(ds)) -> .x

  .x <- .x %>% dplyr::mutate(p.value = 1-pnorm(.x$ds, mean = mean(.x$ds), sd = sd(.x$ds)))
  p.adj <- p.adjust(.x$p.value, method = mt.method)
  .x <- .x %>% dplyr::mutate(p.adj = p.adj) %>% dplyr::arrange(desc(ds))
}


#' Identify highly informative genes using S-E model
#' @description Use S-E curve to identify highly informative genes.
#' @param expr The expression matrix. Rows should be genes and columns should be cells.
#' @param span The parameter α which controls the degree of smoothing.
#' @param r A small fixed value to avoid log(0) of mean gene expression levels. The default value of r is set to 1, but can also be set to other values such as 0.1 and 0.01.
#' @param mt.method The multiple testing method used in p.adjust.
#' @param if.adj Whether to apply multiple testing method to adjust p.value.
#' @return A tibble object with seven columns:
#' @return * Gene, the gene name.
#' @return * mean.expr, the mean expression levels of genes.
#' @return * entropy, the expected expression entropy from a given mean gene expression.
#' @return * fit, the mean expression levels of genes.
#' @return * ds, the entropy reduction against the null expectation.
#' @return * p.value, the significance of ds.
#' @return * p.adj, adjusted P value.
#'
#' @export
#'
#' @examples ent.res <- SE_fun(expr, span = 0.1, r = 1, mt.method = "fdr")
SE_fun <- function(expr, span = 0.5, r = 1, mt.method = "fdr", if.adj = T){
  ent_res <- Entropy(expr, r = r)
  ent_res <- entropy_fit(ent_res, span = span, mt.method = mt.method)
  if(!isTRUE(if.adj)){
    ent_res <- ent_res %>% dplyr::mutate(p.adj = p.value)
  }
  return(ent_res)
}


#' S-E plot
#' @description Draws a point plot of the relationship between S and E.
#' @param .x A tibble object returned from the SE_fun or entropy_fit function.
#' @param point_size Point size for geom_point.
#' @param geom_line Logical, whether to show the expected expression entropy.
#' @param p.adj Logical, whether to highlight significantly varied genes.
#' @param cutoff The threshold (adjusted P value) for identifying significantly varied genes.
#'
#' @return A ggplot object
#' @export
#'
#' @examples ent.res <- SE_fun(expr, span = 0.1, r = 1, mt.method = "fdr")
#' SEplot(ent.res)
SEplot <- function(.x, point_size = 1, geom_line = T, p.adj = T, cutoff = 0.05){
  if(isFALSE(p.adj)){
    if(geom_line){
      .x %>%
        ggplot(aes(mean.expr, entropy)) +
        geom_point(colour = '#1E90FF', size = point_size) +
        geom_line(aes(mean.expr, fit), lwd = 0.7) +
        theme_bw() +
        theme(
          axis.title = element_text(size = 15,color="black"),
          axis.text = element_text(size = 15,color="black"),
          legend.title = element_text(size = 0),
          legend.text = element_text(size = 0),
          axis.text.y = element_text(color="black"),
          axis.text.x = element_text(color="black")
        ) +
        labs(
          x = "log(mean expression)",
          y = "expression entropy"
        ) -> p
    }
    else{
      .x %>%
        ggplot(aes(mean.expr, entropy)) +
        geom_point(colour = '#1E90FF', size = point_size) +
        #geom_line(aes(mean.expr, fit), lwd = 0.7) +
        theme_bw() +
        theme(
          axis.title = element_text(size = 15,color="black"),
          axis.text = element_text(size = 15,color="black"),
          legend.title = element_text(size = 0),
          legend.text = element_text(size = 0),
          axis.text.y = element_text(color="black"),
          axis.text.x = element_text(color="black")
        ) +
        labs(
          x = "log(mean expression)",
          y = "expression entropy"
        ) -> p
    }
  }
  if(isTRUE(p.adj)){
    .x <- .x %>% dplyr::mutate(sig = ifelse(p.adj <= cutoff, 1, 0))

    if(geom_line){
      .x %>%
        ggplot(aes(mean.expr, entropy)) +
        geom_point(aes(colour = factor(sig)), size = point_size) +
        geom_line(aes(mean.expr, fit), lwd = 0.7) +
        scale_color_manual(values = c("#1E90FF", "red")) +
        theme_bw() +
        theme(
          legend.position = "none",
          axis.title = element_text(size = 15,color="black"),
          axis.text = element_text(size = 15,color="black"),
          legend.title = element_text(size = 0),
          legend.text = element_text(size = 0),
          axis.text.y = element_text(color="black"),
          axis.text.x = element_text(color="black")
        ) +
        labs(
          x = "log(mean expression)",
          y = "expression entropy"
        ) -> p
    }

    else{
      .x %>%
        ggplot(aes(mean.expr, entropy)) +
        geom_point(aes(colour = factor(sig)), size = point_size) +
        #geom_line(aes(mean.expr, fit), lwd = 0.7) +
        scale_color_manual(values = c("#1E90FF", "red")) +
        theme_bw() +
        theme(
          legend.position = "none",
          axis.title = element_text(size = 15,color="black"),
          axis.text = element_text(size = 15,color="black"),
          legend.title = element_text(size = 0),
          legend.text = element_text(size = 0),
          axis.text.y = element_text(color="black"),
          axis.text.x = element_text(color="black")
        ) +
        labs(
          x = "log(mean expression)",
          y = "expression entropy"
        ) -> p
    }
  }
  return(p)
}


#' Filtering out low-abundance genes and low-quality cells
#'
#' @param expr The expression matrix. Rows should be genes and columns should be cells.
#' @param min.cells Include genes detected in at least this many cells.
#' @param min.genes Include cells where at least this many genes are detected.
#'
#' @return A filtered gene expression matrix.
#' @export
#'
#' @examples expr <- matrix(data = rbinom(n = 100, size = 20, prob = 0.5 ), nrow = 10)
#' expr
#' filtered.expr <- matr.filter(expr, min.cells = 3, min.genes = 3)
#' filtered.expr
matr.filter <- function(expr, min.cells = 10, min.genes = 10){
  gene_count <- colSums(expr > 0, na.rm = T)
  cell_count <- rowSums(expr > 0, na.rm = T)

  lq1 <- cell_count < min.cells
  lq2 <- gene_count < min.genes

  return(expr[!lq1, !lq2])
}


#' ROGUE calculation
#' @description  Using ROGUE to assess the purity of single cell population.
#' @param .x A tibble object returned from the SE_fun or entropy_fit function.
#' @param platform The platform ("UMI" or "full-length") used for generating the tested dataset.
#' @param cutoff The threshold (adjusted P value) for identifying significant ds. The default threshold is 0.05.
#' @param k The scaling factor for calculating ROGUE. The default value of K is set to 45 and 500 for droplet-based ("UMI") and "full-length" based datasets, respectively. When specifying a custom k value, the "platform" argument is redundant.
#' @param features Use these features to calculate ROGUE.
#' @details By taking advantage of the wide applicability of S-E model to scRNA-seq data, we introduce the statistic ROGUE to measure the purity of a cell population as:
#' @details \deqn{ROGUE=1-∑sig.ds/(∑sig.ds+K)}
#' @details where K is an important parameter that constrains the ROGUE value between 0 and 1. A cell population with no significant ds for all genes will receive a ROGUE value of 1, while a population with maximum summarization of significant ds is supposed to yield a purity score of ~0.
#' @return A value of ROGUE.
#' @export
#'
#' @examples ent.res <- SE_fun(expr, span = 0.1, r = 1, mt.method = "fdr")
#' CalculateRogue(ent.res, platform = "UMI")
#' CalculateRogue(ent.res, k = 30)
CalculateRogue <- function(.x, platform = NULL, cutoff = 0.05, k = NULL, features = NULL){
  if(is.null(k)){
    if(is.null(platform)){
      warning("Please provide a \"platform\" argument or specify a k value")
    }else if(platform == "UMI"){
      k = 45
    }else if(platform == "full-length"){
      k = 500
    }else if(!is.null(platform) & !(platform %in% c("UMI","full-length"))){
      warning("Please provide valid \"platform\" argument")
    }
  }else if(!is.null(k)){
    k <- k
  }

  if(!is.null(features)){
    .x <- .x %>% dplyr::filter(Gene %in% features)
    sig_value <- sum(abs(.x$ds))
    Rogue <- 1-sig_value/(sig_value+k)
    return(Rogue)
  }else{
    sig_value <- abs(.x$ds[.x$p.adj < cutoff & .x$p.value < cutoff])
    sig_value <- sum(sig_value)
    Rogue <- 1-sig_value/(sig_value+k)
    return(Rogue)
  }
}


#' Remove outlier cells when calculating ROGUE
#' @usage ent.toli(ent, expr, n = 2, span = 0.5, r = 1, mt.method = c("fdr","BH"))
#' @param ent A tibble object returned from the SE_fun or entropy_fit function.
#' @param expr The expression matrix. Rows should be genes and columns should be cells.
#' @param n Remove this many outlier cells.
#' @param span The parameter α which controls the degree of smoothing.
#' @param r A small fixed value to avoid log(0) of mean gene expression levels. The default value of r is set to 1, but can also be set to other values such as 0.1 and 0.01.
#' @param mt.method The multiple testing method used in p.adjust.
#'
#' @return A tibble object with seven columns as 'ent' object.
#' @export
#'
#' @examples ent.toli(ent.res, expr, n = 2, mt.method = "fdr")
ent.toli <- function(ent, expr, n = 2, span = 0.5, r = 1, mt.method = "fdr"){
  sig.gene <- ent %>% dplyr::filter(p.adj < 0.05) %>% dplyr::pull(Gene)
  ng <- length(sig.gene)
  expr <- expr[sig.gene,]

  mean.v <- c()
  entr.v <- c()
  for (i in 1:ng) {
    .x <- as.numeric(expr[i,])
    .x <- base::sort(.x, decreasing = T)
    .x <- .x[-c(1:n)]
    mean.v[i] <- log(mean(.x)+r)
    entr.v[i] <- mean(log(.x+1))
  }

  mean.cut <- min(ent$mean.expr)

  ent$mean.expr[1:ng] <- mean.v
  ent$entropy[1:ng] <- entr.v

  ent <- ent %>% dplyr::select(-p.adj) %>% dplyr::filter(mean.expr > mean.cut)
  ent <- entropy_fit(ent, span = span, mt.method = "fdr")
  return(ent)
}


#' Calculate the ROGUE value of each putative cluster for each sample.
#' @usage rogue(expr, labels, samples, platform = NULL, k= NULL, min.cell.n = 10, remove.outlier.n = 2, span = 0.5, r = 1, mt.method = c("fdr","BH"))
#' @param expr The expression matrix. Rows should be genes and columns should be cells.
#' @param labels A vector of cell cluster lables for all cells corresponding to 'expr' argument.
#' @param samples A vector of samples (e.g. patients) to which each cell belongs, corresponding to 'expr' argument.
#' @param min.cell.n Only clusters containing at least this many cells will receive ROGUE values.
#' @param remove.outlier.n Remove this many outlier cells when calculating ROGUE.
#' @param span The parameter α which controls the degree of smoothing.
#' @param r A small fixed value to avoid log(0) of mean gene expression levels. The default value of r is set to 1, but can also be set to other values such as 0.1 and 0.01.
#' @param filter Logical, whether to filter out low-abundance genes and low-quality cells.
#' @param min.cells if parameter filter is "TRUE", include genes detected in at least this many cells.
#' @param min.genes if parameter filter is "TRUE", Include cells where at least this many genes are detected.
#' @param mt.method The multiple testing method used in p.adjust.
#'
#' @return A dataframe where rows represent samples, cols represent clusters, and values represent corresponding ROGUEs.
#' @export
#'
#' @examples
rogue <- function(expr, labels, samples, platform = NULL, k = NULL, min.cell.n = 10, remove.outlier.n = 2, span = 0.5, r = 1, filter = F, min.cells = 10, min.genes = 10, mt.method = "fdr"){
  clusters <- unique(labels)
  meta <- tibble(CellID = 1:ncol(expr), ct = labels, sample = samples)
  sample.rogue <- function(meta, cluster){
    tmp <- meta %>% dplyr::filter(ct == cluster)
    s <- unique(samples)
    rogue <- c()
    for (i in 1:length(s)) {
      index1 <- tmp %>% dplyr::filter(sample == s[i]) %>% dplyr::pull(CellID)
      if(length(index1) >= min.cell.n){
        tmp.matr <- expr[,index1]
        if(isTRUE(filter)){
          print("Filtering out low-abundance genes and low-quality cells")
          tmp.matr <- matr.filter(tmp.matr, min.cells = min.cells, min.genes = min.genes)
        }else{
          tmp.matr <- tmp.matr
        }
        tmp.res <- SE_fun(tmp.matr, span = span, r = r)
        tmp.res <- ent.toli(tmp.res, tmp.matr, span = span, r = r, n = remove.outlier.n)
        rogue[i] <- CalculateRogue(tmp.res, platform = platform, k = k)
      }
      else{
        rogue[i] <- NA
      }
    }
    return(rogue)
  }

  res <- list()

  for (i in 1:length(clusters)) {
    res[[i]] <- sample.rogue(meta, clusters[i])
  }

  res.tibble <- Reduce(rbind, res) %>% as.matrix() %>% t() %>% as.data.frame()
  colnames(res.tibble) <- clusters
  rownames(res.tibble) <- unique(samples)
  return(res.tibble)
}


#' Visualize ROGUE values on a boxplot
#' @description Draws a boxplot of the ROGUE values for each cluster in different samples.
#' @param res.rogue A dataframe returned from the 'rogue' function.
#'
#' @return A ggplot object.
#' @export
#'
#' @examples res.rogue <- rogue(expr, labels, samples)
#' rogue.boxplot(res.rogue)
rogue.boxplot <- function(res.rogue){
  res.rogue %>%
    tidyr::gather(key = clusters, value = ROGUE) %>%
    ggplot(aes(clusters, ROGUE)) +
    geom_boxplot(color = "#FF3E96", outlier.shape = NA) +
    geom_point(color = "#FF3E96", size = 1.5) +
    theme_bw() +
    theme(axis.text = element_text(size = 12, colour = "black"),
          axis.title = element_text(size = 13, colour = "black")) +
    labs(
      x = "Clusters",
      y = "ROGUE"
    ) -> p
  return(p)
}


#' Calculate the value of the reference factor K
#' @description Determine the value of the reference factor K by specifying a highly heterogeneous dataset
#' @param expr The expression matrix. Rows should be genes and columns should be cells.
#' @param span The parameter α which controls the degree of smoothing.To improve the performance.
#' @param r A small fixed value to avoid log(0) of mean gene expression levels. The default value of r is set to 1, but can also be set to other values such as 0.1 and 0.01.
#' @param mt.method The multiple testing method used in p.adjust.
#' @param if.adj Whether to apply multiple testing method to adjust p.value.
#' @return The K value
#'
#' @export
#'
#' @examples k <- DetermineK(expr, span = 0.5, r = 1, mt.method = "fdr")
DetermineK <- function(expr, span = 0.5, r = 1, mt.method = "fdr", if.adj = T){
  ent_res <- Entropy(expr, r = r)
  ent_res <- entropy_fit(ent_res, span = span, mt.method = mt.method)
  if(!isTRUE(if.adj)){
    ent_res <- ent_res %>% dplyr::mutate(p.adj = p.value)
  }
  k <- ent_res %>% dplyr::filter(p.adj < 0.05) %>% dplyr::pull(ds) %>% sum()
  k <- k/2
  return(k)
}


#' Caculate ROGUE values for Cell Cluster and conclude their their heterogeneity.
#' @param object Seurat or SingleCellExperiment or SummarizedExperiment Object
#' @param slot which slot data use,default counts
#' @param useFeatures features use to caculate rogue.value
#' @param draw whether draw plot
#' @param ext if draw,which figure style to save,default pdf
#' @param k value for caculate rogue
#' @param auto when k is NULL,will auto caculate k value
#' @param platform which platform,"UMI" or "full-length"
#' @param labelCol which column in metadata as labels(celltype)
#' @param sampleCol which column in metadata as sample
#' @param outdir path to save figures
#' @return return as list of ent.res and rogue.res
#' @export
wrapperROGUE<-function(object=NULL,
                        slot="counts",
                        useFeatures=NULL,
                        draw=TRUE,
                        ext="pdf",
                        k=NULL,
                        auto=TRUE,
                        platform = "UMI",
                        labelCol="label_fine",
                        sampleCol="Sample",
                        outdir=NULL){
  stopifnot(ext%in%c("pdf","png","jpeg","jpeg","tiff", "bmp", "svg","wmf","ps","eps"))
  if(inherits(object,"Seurat")){
    count<-GetAssayData(object,slot=slot)
    metaData<-object@meta.data
  }else if(inherits(object,"SingleCellExperiment")){
    #require("SummarizedExperiment")
    count<-SummarizedExperiment::assays(object)[[slot]]
    metaData<-SummarizedExperiment::colData(object)
  }else if(inherits(object,"SummarizedExperiment")){
    count<-SummarizedExperiment::assays(object)[[slot]]
    metaData<-SummarizedExperiment::colData(object)
  }else{
    stop("Invalid Class Object,Please Check your Object Class!")
  }

  #if(!is.null(useFeatures)){
  #  count<-count[useFeatures,]
  #}
  if(is.null(outdir)){
    outdir="rogue"
  }
  makedir(outdir)

  message("run Expression entropy model ..")
  ent.res<-SE_fun(count,span = 0.1, r = 1, mt.method = "fdr")
  if(draw){
    message("plot S-E ..")
    p<-SEplot(ent.res,point_size = 1, geom_line = T, p.adj = T, cutoff = 0.05)
    ggsave(filename=file.path(outdir,paste0("SE.",ext)),plot = p,width = 12,height = 10)
  }

  message("ROGUE calculation..")
  nVars<-nrow(count)
  if(!is.null(useFeatures)){
    nVars<-length(useFeatures)
  }
  if(is.null(k)){
    if(auto){
      k <- DetermineK(count, span = 0.5, r = 1, mt.method = "fdr")
    }
  }
  rogue.value<-CalculateRogue(ent.res, platform = platform, cutoff = 0.05, k =k, features = useFeatures)
  cat(sprintf("ROGUE value of %d Variables is %f\n",nVars,rogue.value))
  cat(sprintf("ROGUE value of %d Variables is %f\n",nVars,rogue.value),file =file.path(outdir,"rogue.log"))

  message("Calculate the ROGUE value of each putative cluster for each sample ..")
  rogue.res <- rogue(count, labels=metaData[[labelCol]], samples = metaData[[sampleCol]], platform =platform, span = 0.6,k=k)
  if(draw){
    message("plot rogue label vs sample ..")
    p<-rogue.boxplot(rogue.res)
    ggsave(filename=file.path(outdir,paste0("rogue.boxplot.",ext)),plot = p,width = 12,height = 10)
  }

  return(list("ent.res"=ent.res,"rogue.res"=rogue.res))
}
RyanYip-Kat/yipCat documentation built on Dec. 18, 2021, 11:55 a.m.