In kevincjnixon/BinfTools: Useful functions for standard analyses

knitr::opts_chunk$set(
  collapse = TRUE,
  comment = "#>",
  fig.width=8,
  fig.height=8,
  fig.align = "center"
)

Install Dependencies

#Bioconductor packages don't install automatically on BinfTools install
BiocManager::install("SAGx")
BiocManager::install("GSVA")
BiocManager::install("fgsea")
BiocManager::install("gage")

devtools::install_github("kevincjnixon/gpGeneSets") # gprofiler genesets for mouse, human, and drosophila
devtools::install_github("kevincjnixon/BinfTools")

Load

library(BinfTools)
library(gpGeneSets)

After running DESeq2 (DESeq object='dds', DESeq results = 'res')

norm_counts<-as.data.frame(counts(dds, normalized=T)) #If you want to use another count matrix, it must be a data frame with rows as genes and columns matching the order of 'cond'
cond<-as.character(dds$condition) #Use the factor used for the DESeq2 contrasts
res<-as.data.frame(res) #results object

If you used packages like limma or edger for your analysis, you can convert the results from those packages to a BinfTools compatible results object using:

### Converting Limma results:
res<-limma::topTable(fit, number=Inf)
res<-fromLimma(res)
###Converting EdgeR results:
res<-edgeR::topTags(de, n=Inf)
res<-fromEdgeR(res)

BinfTools has examples of these objects built in if you don't have data on hand

These results are from Drosophila (my PhD project) and the gene IDs are flybase IDs (equivalent to ENSEMBL IDs - hard to tell what they are from the ID)

We can run BinfTools using these original gene symbols, but if we want to more easily identify genes, we can use a gprofiler2::gconvert wrapper to convert the IDs to gene symbols for both the results and counts objects.

symRes<-getSym(object=res, #The object with rownames to convert
               obType="res", #The type of object 'res' or 'counts'
               species="dmelanogaster", #species
               target="FLYBASENAME_GENE", #What you want the gene names converted to. 'HGNC' is human gene symbols, 'MGI' for mouse symbols
               addCol=F) #Boolean, FALSE replaces rownames with gene symbols (if there are duplicates, only the highest expressed version is kept), TRUE adds a column named SYMBOL and all duplicates are kept

symCounts<-getSym(object=norm_counts,
                  obType="counts",
                  species="dmelanogaster",
                  target="FLYBASENAME_GENE",
                  addCol=F)

Now, if we want to look at specific genes, it's a little easier to identify them rather than having to look up the ENSEMBL IDs.

Let's make a bar plot to look at the expression of certain genes in each condition.

#set up the genes of interest:
genes<-c("Ldh","dac","Hsp70Bb","rut","tmod")
barGene(genes=genes,
        counts=symCounts, #Using symCounts because the rownames match the gene names that we're looking for
        conditions=cond,
        title="Genes of interest - normalized Expression",
        norm=NULL, #Set this to a character of the control condition to plot the relative expression (i.e. set this condition to a value of 1)
        eb="sd", #Error bars, 'sd' = standard devation, 'se' = standard error, a value of zero (0; no quotes) will remove error bars altogether - this may be useful if plotting relative expression
        returnDat=F, #Return the values if you want to plot this elsewhere (graphpad/excel) to match figures made by researchers
        col="Dark2", #Colour scheme, can be RColourBrewer palette name, or vector of colours in rgb(), hexadecimal, or colour names
        ord=c("WT","KO")) #If the order of the conditions in the plot is not what you wanted, you can set this to reorder the conditions in any way you want
#Mess around with it:
barGene(genes=genes,
        counts=log10(1+symCounts),
        conditions=cond,
        title="log10 (1+normCounts)",
        norm="WT",
        eb="se",
        col=c("blue","yellow"),
        ord=NULL)

Another form of quality control is investigating the corrleation between the samples and conditions:

#We can plot the correlation heatmap of all samples using corHeat
corHeat(counts=symCounts, #Counts object
        method="spearman", #Correlation method
        title="Spearman Correlation",#Plot title
        hmcol=colorRampPalette(c("red","grey","blue"))(100), #heatmap colours
        showNum=T) #Show correlation coefficients?

#Plot a gene-by-gene correlation
expCor(counts=symCounts, #Counts object
       cond=cond, #Conditions
       title="Correlation KO vs WT", #Plot title
       method="spearman", #Correlation method
       transform="log10", #Transformation of counts? "none","log10", or "log2"
       printInd=F) #Print individual correlation plots in addition to paired plot?

Now we can move into generating some summary plots (volcano / MA)

DEG<-volcanoPlot(res=symRes, #Results object
                 title="KO vs WT",
                 p=0.05, #adjusted p-value threshold for DEGs
                 pval=NULL, #unadjusted p-value threshold for DEGs (in case you don't want to use adjusted)
                 FC=log2(1.5), #log2FoldChange threshold for DEGs (can be 0)
                 lab=genes, #list of genes to label (NULL to not label any)
                 col=genes, #list of genes to colour (NULL to not colour any)
                 fclim=NULL, #x-axis (log2FoldChange) limits, genes passing this limit will be represented as triangles on the edge of the plot - good if you have some extreme outliers
                 showNum=T, #Show the numbers of genes on the plot?
                 returnDEG=T, #Return list of DEGs (Down, Up) - this is good for running GO later on
                 expScale=F, #Scale point size to mean expression?
                 upcol=NULL, #Colour value for upregulated genes, NULL will be red
                 dncol=NULL) #Colour value for downregulated genes, NULL will be blue)
MA_Plot(res=symRes,
        title="KO vs WT",
        p=0.05,
        pval=NULL,
        FC=log2(1.5),
        lab=genes,
        col=genes,
        fclim=NULL, #Same as volcano plot, but will act on y-axis, not x
        showNum=T,
        returnDEG=F,
        sigScale=F, #Scale point size to significance?
        upcol=NULL,
        dncol=NULL)

BinfTools also uses plotly to create interactive volcano and MA plots - you can hover your cursor over points to see the individual genes:

interVP(res=symRes,
        title="Volcano Plot",
        p=0.05,
        pval=NULL,
        FC=log2(1.5))

interMA(res=symRes,
        title="MA Plot",
        p=0.05,
        pval=NULL,
        FC=log2(1.5))

Now for some pathway enrichment analyses. The GO_GEM() function is a wrapper for gprofiler2::gost() and will output results in a ".GO.txt" file, a GEM file (for Cytoscape) in ".gem.txt", a pdf of the top 10 enriched and significant terms, and a list of genes in ".genes.txt". GO_GEM can take either a character vector of genes or a named list of genes (for example, the DEG list from volcanoPlot). It can export a gprofiler2 'gost' object (for use with other gprofiler2 function and the customGMT() function) and a results table for use with other functions.

dir.create("GO") #Make a directory for output
GO_res<-GO_GEM(geneList=DEG,
               species="dmelanogaster",
               bg=rownames(symRes), #A character vector of genes indicating the background for the GO analysis. Leave NULL to use all genes (if you don't have one)
               source="GO:BP", #A character indicating the source - see documentation for all of them
               corr="fdr", #How to correct the p-values
               iea=F, #Remove genes in terms 'inferred by electronic analysis' ?
               prefix="GO/", #Character for output prefix. If named list is provided as geneList, names of the list will be added to the prefix
               ts=c(10,500), #numeric of length 2 indicating minimum/maximum term size for results plots
               pdf=F, #print figures to pdf?
               fig=T, #Show figures in plots in RStudio?
               figCols=c("blue","orange"), #colours for enrichment/significance in plots
               returnGost=F, #Return gprofiler2 gost object
               writeRes=F, #Write results to ".GO.txt" file
               writeGem=F, #Write gem file?
               writeGene=F, #Write genes in query to file?
               returnRes=T) #Return the results table (only one of returnRes or returnGost can be T, not both)

GO_gost<-GO_GEM(geneList=DEG,
                species="dmelanogaster",
                bg=rownames(symRes),
                source="GO:BP",
                prefix="GO/",
                pdf=F,
                fig=F,
                returnGost=T,
                writeRes=F)

If we want to plot GO results for DEGs (up/down, specifically), we can use this function using the GO_res (must be list of length 2 with 'Down', 'Up' in that order)

combGO_plot(GOresList=GO_res, #results table from GO_GEM() (set returnRes=T)
                        title="Biological process - significant", #Plot title
                        ts=c(10,500), #min, max term sizes for filtering, respectively
                        sig=T, #Plot top significant results? FALSE means plot top enriched results
                        numTerm=10, #Number of top terms to include in the figure
                        upcols=c("lightpink","red"), #Colours for enrichment, significance respectively for entry 2 (upregulated)
                        downcols=c("lightblue","blue"),#Colours for enrichment, significance respectively for entry 1 (downregulated)
                        labs=c("Downregulated","Upregulated")) #Legend labels for entries 1 and 2, respectively

combGO_plot(GOresList=GO_res,
                        title="Biological process - enriched",
                        ts=c(10,500),
                        sig=F,
                        numTerm=10,
                        upcols=c("lightpink","red"),
                        downcols=c("lightblue","blue"))

Now, if we want to run a GSEA, we can do so, but we need to either load a gmt for GSEA into R (so that it's a named list with the term as the name and genes in each entry) or specify a .gmt file. The gpGeneSets has genesets already loaded (but they only have GO IDs, so it won't be as nice). The Bader Lab has great curated genesets for GSEA, but not for Drosophila, so we'll have to use the gpGeneSets for this example.

#First start by ranking the genes
rnk<-GenerateGSEA(res=symRes, #Results object
                  filename="GSEA.rnk", #Output rnk file name for GSEA preranked outside of R. Set to NULL to skip writing out to file
                  bystat=T, #Rank genes by stats? will use Wald statistic or if not nere, -log10(p-value) with the direction from the log2FoldChange
                  byFC=F, #Rank genes by log2FoldChange? I like to use this with the shrunken log fold change from DESEq2
                  plotRNK=T, #Plot the distribution of scores by rank - useful to see if your scoring is skewed prior to GSEA
                  retRNK=T) #Return the RNK object? yes, to run GSEA in R
library(gpGeneSets)
gsea_res<-GSEA(rnk=rnk, #Rnk object
               gmt=gp_dm, #either .gmt filename or a loaded gene set
               pval=1, #adjusted p-value threshold for terms to return, set to 1 to return all terms and filter later
               ts=c(10,600), #min/max term sizes to filter terms BEFORE analysis
               nperm=10000, #number of permuations for p-value generation
               parseBader=F) #Set to TRUE if using Bader Lab genesets - it will parse the term names so it looks neater. I'm not using gary's genesets here, so we will set to FALSE

We have the GSEA results now. And if we want an enrichment plot from the results, we can specify the row(s) of the gsea_res to make the enrichment plots for that/those term(s). Here, I'll just do the top positively and negatively enriched terms (row 1 and nrow(gsea_res))

rows<-c(1, nrow(gsea_res))
enPlot(gseaRes=gsea_res[rows,], #GSEA results table subset into the rows of interest to make a plot
       rnk=rnk, #Original rnk object used to make gsea_res
       gmt=gp_dm, #Original gmt object/filename used to make gsea_res
       title=NULL) #character vector of length (nrow(gseaRes)) for custom titles, or leave NULL for automatic titles - works better for Gary's genesets

Now that we've done some global analyses, let's look at something more focused. Let's say we have certain terms that are enriched in our GO analysis or the GSEA. We can make custom genesets (either separately into a .gmt file), or we can extract genesets related to a keyword from the gpGeneSets. For instance, in the original GO results, we see 'rhodopsin' in a number of terms, so let's focus on that. But if you are unsure as to what a good keyword might be, you can always make a word cloud to start:

#Make a word cloud, to find some ideal keywords:
PathWC(text=GO_res$Down$term_name, #Character vector of text to be converted to a word cloud, for pathway results, specify the coloumn of the results with the term/pathway names to investigate
       cols="Dark2",#Colour scheme for the word cloud
       retTerms=F, #Return a data frame with the word frequencies?
       minfreq=3, #How many times must a words show up before you include it in the cloud?
       rmwords=c("regulation","process","positive", "negative","mediated","cell","cellular","protein"))#Words not to include in the word cloud (because they don't mean anything specific, biologically - these 8 words are set to filter out by default, but you can either set to NULL to keep all words, or uses any combination of these or other words you'd like)

#Make a custom GMT from the GO_gost object and gpGeneSets
#We want it to be as unbiased as possible, so we'll combind up and downregulated GO results:
rhodopsin<-c(customGMT(gost=GO_gost$Down, #gost object from GO_GEM and returnGost=T
                     key="rhodopsin", #keyword to pull gene sets - this is a grep, so anything with this key will be pulled - the resulting geneset may require some manual curation so check the names
                     gmt=gp_dm), #The gpGeneSets object containing the complete gene sets 'gp_hs' for human, 'gp_mm' for mouse and 'gp_dm' for drosophila
             customGMT(gost=GO_gost$Up,
                       key="rhodopsin",
                       gmt=gp_dm))

#Now we have a geneset of rhodospin-related terms and if we want to write it to a gmt file, we can use:
write.gmt(geneSet=rhodopsin,
          filename="rhodopsin.gmt")

#if we have a gene set from outside R, we can read it in using qusage:

rhodopsin<-qusage::read.gmt(file="rhodopsin.gmt")

#And let's say we want to look at the overlap of the rhodopsin genes with DEGs. We can make a Venn diagram:

#plotVenn takes a named list of up to length 5:
forVenn<-list(DE_Up=DEG$Up,
              DE_Down=DEG$Down,
              Rhodopsin=unique(unlist(rhodopsin)))

plotVenn(x=forVenn, #name list for plotting Venn diagram
         title="Rhodopsin genes",
         cols="Dark2", #Colour scheme for plot
         lty="blank", #outlines for circles
         scale=F, #Scale to list sizes?
         retVals=F) #Return list of values in overlaps?

Now we have a geneset, we can run a single-sample gsea and plot the results:

gsva_plot(counts=as.matrix(symCounts), #counts object (as matrix), make sure rownames are the same nomenclature as the gene symbols in geneset
          geneset=rhodopsin,
          method="gsva", #Method for gsva plot - see documentation for options
          condition=cond,
          con="WT", #Indicate the control condition
          title="Rhodospin ssGSEA", 
          compare=NULL, #for pairwise t-tests, leave NULL to do all possible comparisons, or provide a list of vectors, length 2 indicating the conditions to compare
          col="Dark2", #Colour scheme, can be RColourBrewer palette name, or vector of rgb(), hexadecimal, or colour names
          style="violin") #If not 'violin' it will be a box plot

We can also plot the overall expression (gsva plots enrichment) of genes:

count_plot(counts=symCounts,
           scaling="none", #Can be "zscore" to emphasize differences, or 'log10', or "none"
           genes=unique(unlist(rhodopsin)), #Character vector of gene names - need to unlist the geneset for this
           condition=cond,
           con="WT",
           title="Rhodopsin Genes Expression",
           compare=NULL,
           col="Dark2",
           method="perMean", #What method to plot? "mean", "median", "perMean", "ind", "geoMean"
           pair=F, #Paired t-tests?
           pc=1, #pseudocount if scaling="log10"
           yax="Percent Mean Expression", #y-axis label if default isn't descriptive enough
           showStat=T, #Show statistics on plot?
           style="box") #Default is violin

We can also generate heatmaps using zheat():

htree<-zheat(genes=unique(unlist(rhodopsin)), #Character vector of genes to plot in heatmap, NULL will plot all genes
      counts=symCounts,
      conditions=cond,
      con="WT",
      title="Rhodopsin genes",
      labgenes="",#Character vecotr of gene names to label in heatmap, NULL labels all, "" will label none
      zscore=T, #Plot row-zscore? if FALSE, probably want to log transform counts
      rclus=T, #TRUE=hierarchical clustering, FALSE=order in decreasing expression in control condition, can also give it a dataframe with rownames=gene names and the first column with an identifier to cluster genes
      hmcol=NULL, #colorRampPalette of length 100 for custom heatmap colours (NULL=default colours)
      retClus=T) #return clustered objects if rclus=T - will be used to pull clustered genes later

Now let's say we want to pull a cluster of genes from this:

#Cut the hierarchical tree at a certain level (use outputted figure to refine where you're cutting) and output a data frame of genes belonging to each cluster
annodf<-BinfTools:::heatClus(out=htree,
                             level=3)
head(annodf)

zheat(genes=unique(unlist(rhodopsin)),
      counts=symCounts,
      conditions=cond,
      con="WT",
      title="Rhodopsin - cut tree clusters",
      labgenes=NULL,
      zscore=T,
      rclus=annodf)

Now, for one final thing (for now), we can create a shiny app to explore our data and results. This could be handy to send to collaborators in the future to easily browse their results and then tell us what they would like to focus on.

app<-exploreData(res=symRes, #Results object, or named list of results objects
                 counts=symCounts, #Normalized counts or name list of normalized counts - must be same names as res
                 cond=cond) #Conditions or named list of conditions (same names as above)

Now the app is it's own object and can be run by: