R/DE_Seurat.R

Defines functions DE_Seurat

Documented in DE_Seurat

#' Function to perform differential expression analysis for all clusters in a Seurat object.
#'
#' This function will take a precomputed Seurat object and perform differential expression analysis using one of the differential expression tests
#' included in Seurat (default= wilcox). If you want to perform DE analysis using edgeR, please check the function DE_edgeR_Seurat()!
#' All the results will be saved in a folder above the current folder location named DE_Seurat (../DE_Seurat). The output folder can easily be
#' modified using the parameter 'output_dir'.
#'
#' @param seurat_object The S4 Seurat object which contains filtered and normalized cells in the data slot.
#' @param de_function The function that will be used to perform differential expression analysis. See ?FindMarkers in the Seurat package for all options.
#' @param output_dir The relative directory that will be used to save results.
#' @param de_groups The two group labels to use for differential expression, supplied as a vector.
#' @param min_pct The minimum percentage of cells in either group to express a gene for it to be tested. (Changes Seurats FindMarker min.pct variable)
#' @param man.logfc.threshold The minimum log-fold change a change has to have to be tested. (changes Seurats FindMarker logfc.threshold variable)
#' @param clusters_to_exclude Define a vector of clusters for which you don't want to perform DE analysis.
#' @keywords Seurat, DE, differential expression
#' @import Seurat
#' @import ggplot2
#' @import UpSetR
#' @import dplyr
#' @export
#' @examples
#' \donttest{
#' DE_Seurat()
#' }

## dependencies:
## Seurat : https://github.com/satijalab/seurat
## Plotly : https://plot.ly/r/
## ggplot2 : https://ggplot2.tidyverse.org/

DE_Seurat <- function(seurat_object,
                      de_function='wilcox',
                      output_dir= "../DE_Seurat",
                      grouping_var = "Genotype",
                      de_groups = c("WT","KO"),
                      min_pct = 0.1,
                      man.logfc.threshold = 0.25,
                      clusters_to_exclude = c(),
                      max_cells = Inf)
  {

  ## print start message
  print("Starting differential expression analysis")

  ## Initiate empty data frames and lists for comparisons of clusters
  joined_res_table <- data.frame()
  upset_Rlist_DE_genes <- list()

  ## Set cluster numbers to keep track of how many clusters have been processed
  cluster_number <- 0

  ## Get vector of clusters to test
  clusters_to_test <- sort(unique(seurat_object@ident))
  clusters_to_test <- setdiff(clusters_to_test,clusters_to_exclude)

  ## Iterate over each cluster in the @ident slot
  for(this_cluster in clusters_to_test){

    cluster_number <- cluster_number + 1

    ## Print status for which cluster identity is being processed
    print(paste("Working on cluster #",cluster_number,":",this_cluster,sep=""))

    ## Subset Seurat object to only contain cells from this cluster
    cells_in_this_cluster <- Seurat::SubsetData(seurat_object,
                                        ident.use=this_cluster)

    ## Get vector of names for both groups of cells
    cells_group_1 <- rownames(Seurat::subset(cells_in_this_cluster@meta.data,get(grouping_var) == de_groups[1]))
    cells_group_2 <- rownames(Seurat::subset(cells_in_this_cluster@meta.data,get(grouping_var) == de_groups[2]))

    ## Check whether there are cells in both groups, otherwise skip this cluster
    if(length(cells_group_1) > 1 & length(cells_group_2) > 1){

      cells_in_this_cluster <- Seurat::StashIdent(cells_in_this_cluster, save.name = "OldIdent")
      cells_in_this_cluster <- Seurat::SetAllIdent(cells_in_this_cluster, id = grouping_var)

      ## Perform differential expression test using the Seurat FindMarkers function
      this_cluster_de_genes <- data.frame()
      this_cluster_de_genes <- Seurat::FindMarkers(cells_in_this_cluster,
                                                  ident.1 = de_groups[1],
                                                  ident.2 = de_groups[2],
                                                  print.bar = TRUE,
                                                  test.use = de_function,
                                                  min.pct = min_pct,
                                                  logfc.threshold = man.logfc.threshold,
                                           max.cells.per.ident = max_cells
                                          )

      ## Filter out genes with an adjusted p value that are not significant
      #this_cluster_de_genes <- subset(this_cluster_de_genes,p_val_adj < 0.05)

      ## Write table for all differentially expressed genes containing testing results
      write.table(this_cluster_de_genes,
                  file=paste(output_dir,"/",this_cluster,"_significant_DE_genes.",de_function,".txt",sep=""),
                  sep="\t",
                  quote=FALSE,
                  row.names=TRUE,
                  col.names=TRUE)


      ## Save DE results in a joined table
      this_cluster_de_genes$cluster <- replicate(nrow(this_cluster_de_genes),this_cluster)
      this_cluster_de_genes$gene <- rownames(this_cluster_de_genes)
      rownames(this_cluster_de_genes) <- c()
      joined_res_table <- rbind(joined_res_table,this_cluster_de_genes)

      ## Add DE for this cell type to the UpsetR list
      upset_Rlist_DE_genes[[this_cluster]] <- c(this_cluster_de_genes$gene)

      ## Calculate cell type average expressions to check correlation between the two groups
      avg.cells_in_this_cluster <- log1p(Seurat::AverageExpression(cells_in_this_cluster, show.progress = FALSE))
      avg.cells_in_this_cluster$gene <- rownames(avg.cells_in_this_cluster)


      ## Make a correlation plot between the two conditions
      corr_plot <- ggplot(avg.cells_in_this_cluster, aes(get(de_groups[1]), get(de_groups[2]),text=gene)) +
        geom_point() +
        ggtitle(paste("Cluster : ",this_cluster,sep="")) +
        geom_abline(intercept = 0, slope = 1, col="red") +
        labs(x = de_groups[1],
             y= de_groups[2])

      ## Save normal png version of the plot
      ggsave(corr_plot,
             file=paste(output_dir,"/",this_cluster,"_corrplot.png",sep=""))

      # ## Also make an interactive version using plotly
      # library(plotly)
      #
      # htmlwidgets::saveWidget(as_widget(ggplotly(corr_plot)), paste(output_dir,"/",this_cluster,"_corrplot.plotly.html",sep=""))

      # ## Plot volcano plot
      # volcano_plot <- ggplot(this_cluster_de_genes,aes(logFC,-log(PValue))) +
      #   geom_point(size=2) +
      #   geom_point(data = subset(res_table,(logFC < neg_log_FC_thresh) & (PValue < q_value_thresh)),col="red") +
      #   geom_point(data = subset(res_table,(logFC > pos_log_FC_thresh ) & (PValue < q_value_thresh) ),col="green") +
      #   geom_text_repel(
      #     data = subset(res_table, (logFC > pos_log_FC_thresh | logFC < neg_log_FC_thresh) & (PValue < q_value_thresh)),
      #     aes(label = subset(res_table, (logFC > pos_log_FC_thresh | logFC < neg_log_FC_thresh) & (PValue < q_value_thresh))$gene),
      #     size = 5,
      #     box.padding = unit(0.35, "lines"),
      #     point.padding = unit(0.3, "lines")
      #   ) +
      #   ggtitle(res$comparison)+
      #   theme_light()
      #
      # volcano_plot
      # ggsave(volcano_plot,file=paste("../DE_edgeR/cluster-",this_cluster,"_volcano_plot.svg",sep=""))

      print("Finished with this cluster!")

    }
    else {
      print(paste("Cluster",this_cluster," only contains cells from one group!",sep=""))
    }

  }

  ## Check the overlap of DE genes between clusters using
  ## UpsetR: https://cran.r-project.org/web/packages/UpSetR/vignettes/basic.usage.html


  ## The two functions fromList and get_intersect_members originate from this github post:
  ## https://github.com/hms-dbmi/UpSetR/issues/85
  ## Function to run UpsetR with a list of named vectors
  fromList <- function (input) {
    # Same as original fromList()...
    elements <- unique(unlist(input))
    data <- unlist(lapply(input, function(x) {
      x <- as.vector(match(elements, x))
    }))
    data[is.na(data)] <- as.integer(0)
    data[data != 0] <- as.integer(1)
    data <- data.frame(matrix(data, ncol = length(input), byrow = F))
    data <- data[which(rowSums(data) != 0), ]
    names(data) <- names(input)
    # ... Except now it conserves your original value names!
    row.names(data) <- elements
    return(data)
  }


  ## Plot the Upet plot
    svg(paste(output_dir,"/Overlap_DE_genes.svg",sep=""),
        width = 24,
        height=20)
    upset(fromList(upset_Rlist_DE_genes),
          order.by = "freq",
          sets = names(upset_Rlist_DE_genes),
          main.bar.color = "black",
          matrix.color="#1482a5ff",
          mainbar.y.label = "Number of DE genes",
          point.size = 6,
          line.size = 2,
          group.by = "degree",
          cutoff = 2,
          text.scale= 3)
    dev.off()

    ## Function to get the members of an intersection
    get_intersect_members <- function (x, ...){
      require(dplyr)
      require(tibble)
      x <- x[,sapply(x, is.numeric)][,0<=colMeans(x[,sapply(x, is.numeric)],na.rm=T) & colMeans(x[,sapply(x, is.numeric)],na.rm=T)<=1]
      n <- names(x)
      x %>% rownames_to_column() -> x
      l <- c(...)
      a <- intersect(names(x), l)
      ar <- vector('list',length(n)+1)
      ar[[1]] <- x
      i=2
      for (item in n) {
        if (item %in% a){
          if (class(x[[item]])=='integer'){
            ar[[i]] <- paste(item, '>= 1')
            i <- i + 1
          }
        } else {
          if (class(x[[item]])=='integer'){
            ar[[i]] <- paste(item, '== 0')
            i <- i + 1
          }
        }
      }
      do.call(filter_, ar) %>% column_to_rownames() -> x
      return(x)
    }

  ## Write table for all differentially expressed genes containing testing results
  write.table(joined_res_table,
              file=paste(output_dir,"/All_DE_genes.tsv",sep=""),
              sep="\t",
              quote=FALSE,
              row.names=TRUE,
              col.names=TRUE)

  ## Count number of clusters each gene is found DE in and write a table for easy inspection
  ## Count number of occurences for each gene and which cell types its in
  genes_accumulated <- joined_res_table %>%
    count(gene)

  colnames(genes_accumulated) <- c("gene","Clusters_DE")

  ## Count number of occurences for each gene and which cell types its in
  genes_clusters <- joined_res_table %>%
    group_by(gene) %>%
    summarise(cluster_names = paste(cluster, collapse=","))

  ## Calculate mean avgLogFC as well as the SD
  genes_clusters_means <- joined_res_table %>%
    group_by(gene) %>%
    summarise(mean_avgLogFC = mean(avg_logFC),
              SD_avgLogFC = sd(avg_logFC),
              mean_pvalue_adj = mean(p_val_adj),
              SD_pvalue_adj = sd(p_val_adj),
              mean_pct1 = mean(pct.1),
              SD_pct1 = sd(pct.1),
              mean_pct2 = mean(pct.2),
              SD_pct2 = sd(pct.2))

  de_genes_count_intermediate <- merge(genes_accumulated,genes_clusters,by="gene")
  de_genes_counts_final <- merge(de_genes_count_intermediate,genes_clusters_means)

  de_genes_counts_final <- de_genes_counts_final %>%
    arrange(Clusters_DE,desc(mean_avgLogFC))

  ## Write table for all differentially expressed genes containing testing results
  write.table(de_genes_counts_final,
              file=paste(output_dir,"/DE_genes_summary.tsv",sep=""),
              sep="\t",
              quote=FALSE,
              row.names=FALSE,
              col.names=TRUE)

  return(upset_Rlist_DE_genes)

  }
FloWuenne/scFunctions documentation built on June 3, 2021, 6:42 p.m.