CVRCFunc: Functions for CVRC bioinformatics

Documented in FindMarkers

#' Differential expression testing between 2 specified groups of cells via pseudobulking and DESeq2
#' @param seurat A Seurat object
#' @param clus_ident Identity for clusters. Normally 'seurat_clusters' but can be any identity
#' @param sample_ident Sample identities. Identity class that indicates how to partition samples
#' @param group_1 Identity of cells within \code{clus_ident} to compare
#' @param group_2 Identity of cells within \code{clus_ident} to compare
#' @param expfilt_freq genes that have greater than \code{expfilt_counts} in greater than \code{expfilt_freq} fraction of cells will be kept for the DESeq2 model. 0.5 by default
#' @param expfilt_counts genes with less than \code{expfilt_counts} in \code{expfilt_freq * sample number} will be removed from DESeq2 model. 1 by default.
#' @param alpha FDR adjusted p-value threshold for significance in plotting. 0.1 by default.
#' @param out_dir Name of output directory
#' @return .csv files with marker genes per \code{clus_ident}. .pdf files with diagnostic plots
#' @import Seurat pheatmap DESeq2 reshape2 ggplot2 ggrepel stringr utils grDevices
#' @importFrom BiocGenerics t
#' @importFrom magrittr set_colnames
#' @export

FindMarkers <- function(seurat, clus_ident, group_1, group_2, sample_ident, expfilt_counts = 1, expfilt_freq = 0.5, out_dir = "FindMarkers", alpha = 0.1){
  start <- Sys.time()

  coef <- variable <- value <- NULL

  dir.create(out_dir, showWarnings = FALSE)

  Idents(seurat) <- clus_ident
  seurat <- subset(seurat, idents = c(group_1, group_2))
  clusters <- unique(Idents(seurat))
  clusters <- sort(clusters)

  # Print out the table of cells in each cluster-sample group
  seurat@meta.data[,clus_ident] <- droplevels(factor(seurat@meta.data[,clus_ident]))
  pdf(paste(out_dir,'/',group_1,"_",group_2,"_",'cells_per_group_HM.pdf',sep = ''))
  pheatmap(table(seurat@meta.data[,clus_ident], seurat@meta.data[,sample_ident]), display_numbers = T, cluster_rows = F, cluster_cols = F, fontsize_number = 8)
  dev.off()

  #Run wilcoxauc from presto for pct.in and pct.out (maybe save the stats later for comparison?)
  wilcox <- wilcoxauc(seurat, assay = 'data', seurat_assay = 'RNA', group_by = clus_ident)

  # Subset metadata to only include the cluster and sample IDs to aggregate across
  groups <- seurat@meta.data[, c(clus_ident, sample_ident)]

  # Aggregate across cluster-sample groups
  pb <- aggregate.Matrix(t(seurat@assays$RNA@counts), groupings = groups[,1:2], fun = "sum")

  # Not every cluster is present in all samples; create a vector that represents how to split samples
  splitf <- sapply(stringr::str_split(rownames(pb), pattern = "_",  n = 2), `[`, 2)

  cluster_counts <- as.data.frame(t(as.matrix(pb)))
  cluster_metadata <- data.frame(cluster = sapply(stringr::str_split(colnames(cluster_counts), pattern = "_",  n = 2), `[`, 1),
                                 sample = sapply(stringr::str_split(colnames(cluster_counts), pattern = "_",  n = 2), `[`, 2))
  cluster_metadata$cluster <- factor(cluster_metadata$cluster, levels = c(group_1, group_2))

  dds <- DESeqDataSetFromMatrix(cluster_counts, colData = cluster_metadata, design = ~ cluster)
  # Filter data
  keep <- rowSums(counts(dds) >= expfilt_counts) >= expfilt_freq*nrow(cluster_metadata)
  dds <- dds[keep,]
  # DESeq2
  vst <- varianceStabilizingTransformation(dds)
  dds <- DESeq(dds, test = "LRT", reduced = ~1)
  res <- results(dds, alpha = 0.05)

  # Shrink the log2 fold changes to be more appropriate using the apeglm method
  res_shrink <- lfcShrink(dds, coef = colnames(coef(dds))[2], res=res, type = "apeglm")
  res_shrink <- as.data.frame(res_shrink)
  res_shrink$sig <- rep('Not significant', nrow(res_shrink))
  res_shrink$sig[which(res_shrink$padj < alpha)] <- 'Significant'

  #set up data for top gene plot
  d <- plotCounts(dds, gene=which.min(res$padj), intgroup="cluster", returnData=TRUE)
  exp_gene_logfc <- res[which.min(res$padj),]$log2FoldChange
  exp_gene_padj <- res[which.min(res$padj),]$padj

  #Plots
  gg_counts <- cluster_counts[,sort(colnames(cluster_counts))]
  gg_counts <- melt(log10(gg_counts))
  pdf(paste(out_dir,"/",colnames(coef(dds))[2],"_diagnostic_plots.pdf", sep = ''))
  print(ggplot(gg_counts, aes(x = variable, y = value, fill = variable)) + geom_boxplot() +  theme_bw() + theme(axis.text.x = element_text(angle = 90), legend.position = "none") + ylab('Log10(Counts)'))
  print(DESeq2::plotPCA(vst, intgroup = "cluster") +theme_classic())
  print(DESeq2::plotPCA(vst, intgroup = "sample") +theme_classic() +geom_text_repel(aes(label = sample), show.legend = FALSE))
  plotDispEsts(dds)
  print(ggplot(res_shrink, aes(x = log2FoldChange, y = -log10(pvalue), color = sig))+geom_point(alpha = 0.7)+scale_color_manual(values = c('grey40', 'blue'))+theme_classic())
  print(ggplot(d, aes(x=cluster, y=count)) + geom_point(position=position_jitter(w=0.1,h=0)) + ggtitle(paste('Gene:', row.names(res)[which.min(res$padj)],'\nLog2FC = ',exp_gene_logfc,'\npadj = ',exp_gene_padj, sep = '')))
  plotMA(res)
  dev.off()


  #Add pct.in and pct.out and save results table
  wilcox_groups <- unique(wilcox$group)
  wilcox_sub <- wilcox[which(wilcox$group == wilcox_groups[1]),]
  res_shrink$feature <- row.names(res_shrink)
  merged_res <- merge(x = res_shrink, y = wilcox_sub, by = 'feature', sort = F)
  res_shrink[,paste('pct_in',wilcox_groups[1],sep = "_")] <- merged_res$pct_in
  res_shrink[,paste('pct_in',wilcox_groups[2],sep = "_")] <- merged_res$pct_out
  res_shrink$feature <- NULL
  write.csv(file = paste(out_dir,"/",colnames(coef(dds))[2],"_results.csv", sep = ''), res_shrink)

  print(start)
  print(Sys.time())
}