R/compute_supercells_DEA.R

Defines functions plot_DEA_consistency compute_consistency_of_supercell_DEA compute_DEA_GT_bool compute_singglecell_DEA compute_supercells_DEA

Documented in compute_consistency_of_supercell_DEA compute_DEA_GT_bool compute_singglecell_DEA compute_supercells_DEA plot_DEA_consistency

#' Computes differential expression analysis for super-cells
#'
#' @param SC.list list of super-cells and other simplifications (output of \link{compute_supercell})
#' @param SC.GE.list list of super-cell gene expressions
#' @param cluster.name name of clusterig result field
#' @param ident.1 name(s) of cluster for which markers are computed, if \code{NULL} (default), markers for all clusters will be computed (running \link[SuperCell]{supercell_FindAllMarkers} instead of \link[SuperCell]{supercell_FindMarkers}
#' @param ident.2 name(s) of clusters for comparison. If \code{NULL} (defauld), then all the other clusters used
#' @param seed sandom seed to compute logFC_AUC score
#' @param DO_unweighted whether to compute not sample-weighed DEA
#' @param ... other parameters of \link[SuperCell]{supercell_FindMarkers} or \link[SuperCell]{supercell_FindAllMarkers}
#'
#' @return list of DEA results for each super-cell like structure
#'
#' @export


compute_supercells_DEA <- function(
  SC.list,
  SC.GE.list,
  cluster.name,
  ident.1 = NULL,
  ident.2 = NULL,
  seed = 12345,
  verbose = FALSE,
  pval.thresh = 0.05,
  logFC.thresh = 0,
  DO_unweighted = FALSE,
  ...
){

  res <- list()

  method.seq <- names(SC.list)
  for(meth in method.seq){
    res[[meth]] <- list()
    cur.gamma.seq <- names(SC.list[[meth]])

    for(gamma.ch in cur.gamma.seq){
      res[[meth]][[gamma.ch]] <- list()
      cur.seed.seq <- names(SC.list[[meth]][[gamma.ch]])

      for(seed.i.ch in cur.seed.seq){
        if(verbose) print(paste("Method:", meth, "Gamma:", gamma.ch, "Seed:", seed.i.ch))

        cur.SC     <- SC.list[[meth]][[gamma.ch]][[seed.i.ch]]
        cur.GE     <- SC.GE.list[[meth]][[gamma.ch]][[seed.i.ch]]

        cur.supercell_size  = cur.SC$supercell_size
        if(DO_unweighted) cur.supercell_size <- NULL

        if(cluster.name %in% names(cur.SC)){
          clusters   <- cur.SC[[cluster.name]]
        } else {
          stop(paste("Field (cluster.name)", cluster.name, "not found within SC fields, please provide valid clustering name (cluster.name)"))
        }

        if(is.null(ident.1)){ # find all genes
          cur.res <- SuperCell::supercell_FindAllMarkers(
            ge = cur.GE,
            supercell_size = cur.supercell_size,
            clusters = clusters,
            ...
          )
        } else {
          idnt.name <- paste(ident.1, collapse = "_")
          cur.res <- list()

          cur.res[[idnt.name]] <- SuperCell::supercell_FindMarkers(
            ge = cur.GE,
            supercell_size = cur.supercell_size,
            clusters = clusters,
            ident.1 = ident.1,
            ident.2 = ident.2,
            ...
          )
        }

        cur.res <- lapply(
          cur.res,
          FUN = function(x){
            r <- x
            mlt <- ifelse(as.numeric(r$adj.p.value) < pval.thresh, 1, 0)

            print(sum(mlt)/length(mlt))

            set.seed(seed)
            r$logFC_AUC <- mlt * r$logFC +
              (1 - mlt)*runif(min = 0, max = 1e-1, n = length(mlt)) * sign(r$logFC)


            ## save actual gamma
            if("gamma.actual" %in% names(cur.SC)){
              gamma.actual <- cur.SC$gamma.actual
            } else {
              gamma.actual <- as.numeric(gamma.ch)
            }

            r$gamma.actual <- gamma.actual

            return(r)
          })


        res[[meth]][[gamma.ch]][[seed.i.ch]] <- cur.res
      }
    }
  }

  return(res)
}


#' Computes DEA for single-cell data and for GT cell type annotation
#' @param sc.ge single-cell gene expression
#' @param clusters single-cell clustering result or GT cell type annotation
#' @param ... rest of the parameters of \link{compute_supercells_DEA}
#' @export
compute_singglecell_DEA <- function(
  sc.ge,
  clusters,
  ident.1 = NULL,
  ident.2 = NULL,
  seed = 12345,
  pval.thresh = 0.05,
  ...
){

  if(is.null(ident.1)){ # find all genes
    cur.res <- SuperCell::supercell_FindAllMarkers(
      ge = sc.ge,
      clusters = clusters,
      ...
    )
  } else {
    idnt.name <- paste(ident.1, collapse = "_")
    cur.res <- list()

    cur.res[[idnt.name]] <- SuperCell::supercell_FindMarkers(
      ge = sc.ge,
      clusters = clusters,
      ident.1 = ident.1,
      ident.2 = ident.2,
      ...
    )
  }

  cur.res <- lapply(
    cur.res,
    FUN = function(x){
      r <- x
      mlt <- ifelse(as.numeric(r$adj.p.value) < pval.thresh, 1, 0)

      print(sum(mlt)/length(mlt))

      set.seed(seed)
      r$logFC_AUC <- mlt * r$logFC +
        (1 - mlt)*runif(min = 0, max = 1e-1, n = length(mlt)) * sign(r$logFC)

      r$gamma.actual <- 1

      return(r)
    })

  return(cur.res)
}


#' Compute bool vector of cluster-specific DEG
#'
#' @param DEA.GT  GT differentially expressed genes (DEA for GT annotation)
#' @param logFC.thresh thereshold to consider DEG to be statistically significant
#' @param pval.thresh thereshold to consider DEG to be statistically significant (keep the same as in \link{compute_supercells_DEA} and \link{compute_singlecells_DEA})
#'
#' @export

compute_DEA_GT_bool <- function(
  DEA.GT,
  all.genes  = NULL,
  logFC.thresh = 1,
  pval.thresh = 0.05
){
  if(is.null(all.genes)){
    all.genes <- sort(unique(unlist(lapply(DEA.GT, rownames))))
    warning(paste("The entire set of genes was not provided, will use N =", length(all.genes), "genes to compute consistency of DEA"))
  }

  cell.types <- names(DEA.GT)

  DEA.GT.signif <- lapply(DEA.GT, function(x){
    rownames(x)[(x$adj.p.value < pval.thresh) & (x$logFC > logFC.thresh)]
  })

  signif.gene.cl <- c()
  for(cl in names(DEA.GT.signif)){
    signif.gene.cl <- c(signif.gene.cl, paste0(cl, "_", DEA.GT.signif[[cl]]))
  }

  res <- rep(FALSE, length(cell.types)*length(all.genes))
  names(res) <- paste(rep(cell.types, each = length(all.genes)), rep(all.genes, length(cell.types)),  sep = "_")

  res[signif.gene.cl] <- TRUE

  return(res)
}


#' Compute consistency of super-cell (meta-cell) DEA with the GT DEA (DEA for GT annotation)
#'
#'@param DEA.list list of DEA result (output of \link{compute_supercells_DEA})
#'@param GT.DEA.bool ground truth differentially expressed genes (named boolean with names beeing genes)
#'@param sc.DEA if NULL (dafault) equalt to \code{GT.DEA.bool}
#'
#'@return TPR, F1, AUC between GT and super-cell DEA
#'
#'@export

compute_consistency_of_supercell_DEA <- function(
  DEA.list,
  GT.DEA.bool,
  sc.DEA = NULL,
  verbose = FALSE
){

  `%>%` <- dplyr::`%>%`

  tpr.dea <- data.frame()

  N.clusters    <- length(names(DEA.list)) # GT number of clusters (cell types)
  N.markers     <- sum(GT.DEA.bool)

  for(meth in names(DEA.list)){

    for(gamma.ch in names(DEA.list[[meth]])){
      for(seed.i.ch in names(DEA.list[[meth]][[gamma.ch]])){
        if(verbose) print(paste(meth, "Gamma:", gamma.ch, "Seed:", seed.i.ch))

        cur.DEA           <- DEA.list[[meth]][[gamma.ch]][[seed.i.ch]]

        if(is.null(names(cur.DEA)) & length(cur.DEA) > 0) names(cur.DEA) <- as.character(1:length(cur.DEA))
        markers.logFC_AUC <- c()

        for(cl in names(cur.DEA)){
          cur.DEA.cl      <- cur.DEA[[cl]]

          cur.markers.logFC_AUC         <- cur.DEA.cl$logFC_AUC
          names(cur.markers.logFC_AUC)  <- paste(cl, rownames(cur.DEA.cl), sep = "_")
          markers.logFC_AUC             <- c(markers.logFC_AUC, cur.markers.logFC_AUC)

        }

        if(!is.null(markers.logFC_AUC)){
          markers.logFC_AUC     <- markers.logFC_AUC[names(GT.DEA.bool)]
        } else {
          markers.logFC_AUC     <- rep(NA, length(GT.DEA.bool))
        }

        markers.logFC_AUC[is.na(markers.logFC_AUC)] <- -1000
        names(markers.logFC_AUC)    <- names(GT.DEA.bool)


        cur.rocit <- ROCR::prediction(as.numeric(markers.logFC_AUC), as.numeric(GT.DEA.bool))
        cur.perf  <- ROCR::performance(cur.rocit,"tpr","fpr")

        cur.TPR   <- cur.perf@y.values[[1]]
        cur.FPR   <- cur.perf@x.values[[1]]

        cur.AUC   <- ROCR::performance(cur.rocit,"auc")
        cur.AUC   <- cur.AUC@y.values[[1]]

        cur.F1    <- ROCR::performance(cur.rocit,"f")
        cur.F1    <- cur.F1@y.values[[1]]

        cur.n.pos <- cur.rocit@n.pos.pred[[1]]

        cur.df <- data.frame(
          TPR = cur.TPR,
          FPR = cur.FPR,
          AUC = cur.AUC,
          F1  = cur.F1,
          N.pos = cur.n.pos,
          Gamma = as.numeric(gamma.ch),
          Gamma_actual = unique(cur.DEA.cl$gamma.actual),
          Seed = as.numeric(seed.i.ch),
          Method = meth
        )


        cur.df <- cur.df[-nrow(cur.df),]

        #closest N.pos to GT.N.genes
        filt.N.pos <- cur.df[["N.pos"]][which.min(abs(cur.df[["N.pos"]] - min(N.markers, max(cur.df[["N.pos"]]))))]
        cur.df <- cur.df %>%
          dplyr::filter(N.pos == filt.N.pos)

        tpr.dea <- rbind(tpr.dea, cur.df)

      }
    }
  }

  #####  for Gamma == 1
  if(verbose) print("Gamma = 1")

  if(is.null(names(sc.DEA)) & length(sc.DEA) > 0) names(sc.DEA) <- as.character(1:length(sc.DEA))

  markers.logFC_AUC <- c()
  for(cl in names(cur.DEA)){
    cur.DEA.cl      <- sc.DEA[[cl]]

    cur.markers.logFC_AUC         <- cur.DEA.cl$logFC_AUC
    names(cur.markers.logFC_AUC)  <- paste(cl, rownames(cur.DEA.cl), sep = "_")
    markers.logFC_AUC             <- c(markers.logFC_AUC, cur.markers.logFC_AUC)

  }

  if(!is.null(markers.logFC_AUC)){
    markers.logFC_AUC     <- markers.logFC_AUC[names(GT.DEA.bool)]
  } else {
    markers.logFC_AUC     <- rep(NA, length(GT.DEA.bool))
  }

  markers.logFC_AUC[is.na(markers.logFC_AUC)] <- -1000
  names(markers.logFC_AUC)    <- names(GT.DEA.bool)


  cur.rocit <- ROCR::prediction(as.numeric(markers.logFC_AUC), as.numeric(GT.DEA.bool))
  cur.perf  <- ROCR::performance(cur.rocit,"tpr","fpr")

  cur.TPR   <- cur.perf@y.values[[1]]
  cur.FPR   <- cur.perf@x.values[[1]]

  cur.AUC   <- ROCR::performance(cur.rocit,"auc")
  cur.AUC   <- cur.AUC@y.values[[1]]

  cur.F1    <- ROCR::performance(cur.rocit,"f")
  cur.F1    <- cur.F1@y.values[[1]]

  cur.n.pos <- cur.rocit@n.pos.pred[[1]]

  for(meth in names(DEA.list)){
    cur.df <- data.frame(
      TPR = cur.TPR,
      FPR = cur.FPR,
      AUC = cur.AUC,
      F1  = cur.F1,
      N.pos = cur.n.pos,
      Gamma = 1,
      Gamma_actual = 1,
      Seed = as.numeric(names(DEA.list[[meth]][[gamma.ch]]))[1],
      Method = meth
    )

    cur.df <- cur.df[-nrow(cur.df),]
    #closest N.pos to GT.N.genes
    filt.N.pos <- cur.df[["N.pos"]][which.min(abs(cur.df[["N.pos"]] - min(N.markers, max(cur.df[["N.pos"]]))))]

    cur.df <- cur.df %>%
      dplyr::filter(N.pos == filt.N.pos)

    tpr.dea <- rbind(tpr.dea, cur.df)
  }

  return(tpr.dea)
}

####### not finished !!! modoft from here

#' Plot DEA consistency
#'
#' @param clust.consistency.df output of \link{compute_consistency_of_supercell_DEA}
#' @param consistency.index.name name of the consistency index (i.e., TPR, FPR, AUC, F1)
#'
#' @param error_bars name of values used for errorbars (for subsampling, random grouping,
#' alternative clusteting of single cells and other methods with more than one clustering/simplification output).
#' \code{'extr'} for min/max, \code{'quartiles'} for quartiles and \code{'sd'} for meadin +- sd
#'
#' @export
#'

plot_DEA_consistency <- function(
  DEA.consistency.df,
  consistency.index.name = 'TPR',
  error_bars = c('extr', 'quartiles', 'sd')[1],
  ignore.gammas = c(),
  ignore.methods = c(),
  to.save.plot = TRUE,
  to.save.plot.raw = FALSE,
  asp = 0.5,
  fig.folder = './plots',
  .shapes = c("Exact"=1, "Approx"=0, "Subsampling"=2, "Random"=3,
              "Metacell_default_fp"=4, "Metacell_default_av" = 8,
              "Metacell_SC_like_fp"=4, "Metacell_SC_like_av" = 8),
  .colors = c("Exact"="darkred", "Approx"="royalblue", "Subsampling"="black", "Random"="gray",
              "Metacell_default_fp"="forestgreen", "Metacell_default_av" = "forestgreen",
              "Metacell_SC_like_fp"="gold", "Metacell_SC_like_av" = "gold"),
  verbose = FALSE,
  ...
){

  `%>%` <- dplyr::`%>%`

  if(consistency.index.name %in% colnames(DEA.consistency.df)){
    DEA.consistency.df[["Score"]] <- DEA.consistency.df[[consistency.index.name]]
  } else {
    stop(paste("consistency.index.name:", consistency.index.name, "not found in clust.consistency.df,
               available values are:", paste(colnames(DEA.consistency.df), collapse = ', ')))
  }

  DEA.consistency.df_summarized <- DEA.consistency.df %>%
    dplyr::group_by(Method, Gamma_actual) %>%
    dplyr::summarise(
      meanScore   = mean(Score),
      firstQScore = unname(summary(Score)[2]),
      thirdQScore = unname(summary(Score)[5]),
      medianScore = median(Score),
      sdScore     = ifelse(!is.na(sd(Score)), sd(Score), 0),
      minScore     = min(Score),
      maxScore     = max(Score),
      medianPsd    = min(median(Score)+ifelse(!is.na(sd(Score)), sd(Score), 0), 1),
      medianMsd    = max(median(Score)-ifelse(!is.na(sd(Score)), sd(Score), 0), 0)
    )


  if(is.null(error_bars)){
    error_bars <- 'extr'
  }
  if(!(error_bars %in% c('extr', 'quartiles', 'sd'))){
    stop(paste("Error bar name:", error_bars, "not known. Available names are 'extr', 'quartiles', 'sd' "))
  }

  switch (error_bars,

          extr = {
            min_err_name <- 'minScore'
            max_err_name <- 'maxScore'
          },

          quartiles = {
            min_err_name <- 'firstQScore'
            max_err_name <- 'thirdQScore'
          },

          sd = {
            min_err_name <- 'medianMsd'
            max_err_name <- 'medianPsd'
          },

          {
            min_err_name <- 'minScore'
            max_err_name <- 'maxScore'
          }
  )



  ## Plot across gamma
  df.to.plot <- DEA.consistency.df_summarized %>%
    dplyr::filter(
      !(Method %in% ignore.methods) & !(Gamma_actual %in% ignore.gammas))

  df.to.plot[['min_err_bar']] <- df.to.plot[[min_err_name]]
  df.to.plot[['max_err_bar']] <- df.to.plot[[max_err_name]]


  g <- ggplot2::ggplot(df.to.plot, ggplot2::aes(x = Gamma_actual, y = medianScore, color = Method, fill = Method,  shape = Method)) +
    ggplot2::geom_point() +
    ggplot2::geom_line() +
    ggplot2::geom_errorbar(
      ggplot2::aes(ymin=min_err_bar, ymax=max_err_bar), width=.0,
      position = ggplot2::position_dodge(0.02)) +
    ggplot2::scale_x_log10() +
    ggplot2::labs(x = 'Graining level', y = paste0(consistency.index.name))

  if(!is.null(.colors)){
    .colors <- .colors[unique(df.to.plot$Method)]
    g <- g + ggplot2::scale_color_manual(values = .colors) +
      ggplot2::scale_fill_manual(values = .colors)
  }

  if(!is.null(.shapes)){
    .shapes <- .shapes[unique(df.to.plot$Method)]
    g <- g +  ggplot2::scale_shape_manual(values = .shapes)
  }
  plot(g)

  if(to.save.plot){
    fig.folder.save = file.path(fig.folder, 'save')
    if(!dir.exists(fig.folder.save))
      dir.create(fig.folder.save, recursive = TRUE)

    filename = paste0(consistency.index.name, '_errbar_', error_bars)
    SCBM_saveplot(p = g, folder = fig.folder.save, name = filename, save.raw.ggplot = FALSE, asp = asp, ...)
  }
  return(df.to.plot)

}
mariiabilous/SuperCellBM documentation built on Jan. 28, 2022, 7:45 p.m.