knitr::opts_chunk$set( collapse = TRUE, comment = "#>", fig.width=8, fig.height=8, fig.align = "center" )
Install Dependencies
#Bioconductor packages don't install automatically on BinfTools install BiocManager::install("SAGx") BiocManager::install("GSVA") BiocManager::install("fgsea") BiocManager::install("gage") devtools::install_github("kevincjnixon/gpGeneSets") # gprofiler genesets for mouse, human, and drosophila devtools::install_github("kevincjnixon/BinfTools")
Load
library(BinfTools) library(gpGeneSets) data(gp_dm)#Load in the gpGeneSets data object for Drosophila (other options are gp_mm for mouse and gp_hs for human)
After running DESeq2 (DESeq object='dds', DESeq results = 'res')
norm_counts<-as.data.frame(counts(dds, normalized=T)) #If you want to use another count matrix, it must be a data frame with rows as genes and columns matching the order of 'cond' cond<-as.character(dds$condition) #Use the factor used for the DESeq2 contrasts res<-as.data.frame(res) #results object
If you used packages like limma or edger for your analysis, you can convert the results from those packages to a BinfTools compatible results object using:
### Converting Limma results: res<-limma::topTable(fit, number=Inf) res<-fromLimma(res) ###Converting EdgeR results: res<-edgeR::topTags(de, n=Inf) res<-fromEdgeR(res)
BinfTools has examples of these objects built in if you don't have data on hand
These results are from Drosophila (my PhD project) and the gene IDs are flybase IDs (equivalent to ENSEMBL IDs - hard to tell what they are from the ID)
We can run BinfTools using these original gene symbols, but if we want to more easily identify genes, we can use a gprofiler2::gconvert wrapper to convert the IDs to gene symbols for both the results and counts objects.
symRes<-getSym(object=res, #The object with rownames to convert obType="res", #The type of object 'res' or 'counts' species="dmelanogaster", #species target="FLYBASENAME_GENE", #What you want the gene names converted to. 'HGNC' is human gene symbols, 'MGI' for mouse symbols addCol=F) #Boolean, FALSE replaces rownames with gene symbols (if there are duplicates, only the highest expressed version is kept), TRUE adds a column named SYMBOL and all duplicates are kept symCounts<-getSym(object=norm_counts, obType="counts", species="dmelanogaster", target="FLYBASENAME_GENE", addCol=F)
Now, if we want to look at specific genes, it's a little easier to identify them rather than having to look up the ENSEMBL IDs.
Let's make a bar plot to look at the expression of certain genes in each condition.
#set up the genes of interest: genes<-c("Ldh","dac","Hsp70Bb","rut","tmod") barGene(genes=genes, counts=symCounts, #Using symCounts because the rownames match the gene names that we're looking for conditions=cond, title="Genes of interest - normalized Expression", norm=NULL, #Set this to a character of the control condition to plot the relative expression (i.e. set this condition to a value of 1) eb="sd", #Error bars, 'sd' = standard devation, 'se' = standard error, a value of zero (0; no quotes) will remove error bars altogether - this may be useful if plotting relative expression returnDat=F, #Return the values if you want to plot this elsewhere (graphpad/excel) to match figures made by researchers col="Dark2", #Colour scheme, can be RColourBrewer palette name, or vector of colours in rgb(), hexadecimal, or colour names ord=c("WT","KO")) #If the order of the conditions in the plot is not what you wanted, you can set this to reorder the conditions in any way you want #Mess around with it: barGene(genes=genes, counts=log10(1+symCounts), conditions=cond, title="log10 (1+normCounts)", norm="WT", eb="se", col=c("blue","yellow"), ord=NULL)
Another form of quality control is investigating the corrleation between the samples and conditions:
#We can plot the correlation heatmap of all samples using corHeat corHeat(counts=symCounts, #Counts object method="spearman", #Correlation method title="Spearman Correlation",#Plot title hmcol=colorRampPalette(c("red","grey","blue"))(100), #heatmap colours showNum=T) #Show correlation coefficients? #Plot a gene-by-gene correlation expCor(counts=symCounts, #Counts object cond=cond, #Conditions title="Correlation KO vs WT", #Plot title method="spearman", #Correlation method transform="log10", #Transformation of counts? "none","log10", or "log2" printInd=F) #Print individual correlation plots in addition to paired plot? #You can also plot correlations from anything: plotCor(x=res, #X-axis data y=res_mature, #y-axis data xCol="log2FoldChange", #x-axis data column yCol="log2FoldChange", #y-axis data column xlab="log2FoldChange Juvenile", ylab="log2FoldChange Mature", title="Compare Juvenile to Mature Expression changes", scale=F) #z-score?
Now we can move into generating some summary plots (volcano / MA)
DEG<-volcanoPlot(res=symRes, #Results object title="KO vs WT", p=0.05, #adjusted p-value threshold for DEGs pval=NULL, #unadjusted p-value threshold for DEGs (in case you don't want to use adjusted) FC=log2(1.5), #log2FoldChange threshold for DEGs (can be 0) lab=genes, #list of genes to label (NULL to not label any) col=genes, #list of genes to colour (NULL to not colour any) fclim=NULL, #x-axis (log2FoldChange) limits, genes passing this limit will be represented as triangles on the edge of the plot - good if you have some extreme outliers showNum=T, #Show the numbers of genes on the plot? returnDEG=T, #Return list of DEGs (Down, Up) - this is good for running GO later on expScale=F, #Scale point size to mean expression? upcol=NULL, #Colour value for upregulated genes, NULL will be red dncol=NULL) #Colour value for downregulated genes, NULL will be blue) MA_Plot(res=symRes, title="KO vs WT", p=0.05, pval=NULL, FC=log2(1.5), lab=genes, col=genes, fclim=NULL, #Same as volcano plot, but will act on y-axis, not x showNum=T, returnDEG=F, sigScale=F, #Scale point size to significance? upcol=NULL, dncol=NULL)
Now for some pathway enrichment analyses. The GO_GEM() function is a wrapper for gprofiler2::gost() and will output results in a ".GO.txt" file, a GEM file (for Cytoscape) in ".gem.txt", a pdf of the top 10 enriched and significant terms, and a list of genes in ".genes.txt". GO_GEM can take either a character vector of genes or a named list of genes (for example, the DEG list from volcanoPlot). It can export a gprofiler2 'gost' object (for use with other gprofiler2 function and the customGMT() function) and a results table for use with other functions.
dir.create("GO") #Make a directory for output GO_res<-GO_GEM(geneList=DEG, species="dmelanogaster", bg=rownames(symRes), #A character vector of genes indicating the background for the GO analysis. Leave NULL to use all genes (if you don't have one) source="GO:BP", #A character indicating the source - see documentation for all of them corr="fdr", #How to correct the p-values iea=F, #Remove genes in terms 'inferred by electronic analysis' ? prefix="GO/", #Character for output prefix. If named list is provided as geneList, names of the list will be added to the prefix ts=c(10,500), #numeric of length 2 indicating minimum/maximum term size for results plots pdf=F, #print figures to pdf? fig=T, #Show figures in plots in RStudio? figCols=c("blue","orange"), #colours for enrichment/significance in plots returnGost=F, #Return gprofiler2 gost object writeRes=F, #Write results to ".GO.txt" file writeGem=F, #Write gem file? writeGene=F, #Write genes in query to file? returnRes=T) #Return the results table (only one of returnRes or returnGost can be T, not both) GO_gost<-GO_GEM(geneList=DEG, species="dmelanogaster", bg=rownames(symRes), source="GO:BP", prefix="GO/", pdf=F, fig=F, returnGost=T, writeRes=F)
If we want to plot GO results for DEGs (up/down, specifically), we can use this function using the GO_res (must be name list with 'Down', 'Up')
combGO_plot(GOresList=GO_res, title="Biological process - significant", ts=c(10,500), sig=T, numTerm=10, upcols=c("lightpink","red"), downcols=c("lightblue","blue")) combGO_plot(GOresList=GO_res, title="Biological process - enriched", ts=c(10,500), sig=F, numTerm=10, upcols=c("lightpink","red"), downcols=c("lightblue","blue"))
We can also use a list of GO results to make a heatmap of terms of interest
#First, if there are any GO results in the list, replace them with 'dummy' data using filtGO() GO_res<-filtGO(GO_res, replace=T) #In this case, there were no results that needed replacement #Now we need to create a list of term groups that interest us: keyList<-list(Rhodopsin=c("rhodopsin"), Muscle=c("muscle","myo")) termlist<-makeTermList(GOres=GO_res, keylist=keyList) #Always check the term list to make sure unwanted terms didn't sneak their way in there as the 'key' is never exact #Now we can make a heatmap of the significance of these GO term groups in each contrast: GOHeat(GOresList=GO_res, termList=termlist, hmcol=colorRampPalette(c("white","darkblue"))(100), width=10, #cell width height=10, #cell height maxVal=NA, #If you want to set a maximum value on the heatmap minVal=NA, #If you want to set a minimum value on the heatmap NAcol="darkgrey", #Colour for non-significant terms ret=F) #Return the table generated to make the heatmap?
Now, if we want to run a GSEA, we can do so, but we need to either load a gmt for GSEA into R (so that it's a named list with the term as the name and genes in each entry) or specify a .gmt file. The gpGeneSets has genesets already loaded (but they only have GO IDs, so it won't be as nice). The Bader Lab has great curated genesets for GSEA, but not for Drosophila, so we'll have to use the gpGeneSets for this example.
#First start by ranking the genes rnk<-GenerateGSEA(res=symRes, #Results object filename="GSEA.rnk", #Output rnk file name for GSEA preranked outside of R bystat=T, #Rank genes by stats? will use Wald statistic or if not nere, -log10(p-value) with the direction from the log2FoldChange byFC=F, #Rank genes by log2FoldChange? I like to use this with the shrunken log fold change from DESEq2 retRNK=T) #Return the RNK object? yes, to run GSEA in R library(gpGeneSets) gsea_res<-GSEA(rnk=rnk, #Rnk object gmt=gp_dm, #either .gmt filename or a loaded gene set pval=1, #adjusted p-value threshold for terms to return, set to 1 to return all terms and filter later ts=c(10,600), #min/max term sizes to filter terms BEFORE analysis nperm=10000, #number of permuations for p-value generation parseBader=F) #Set to TRUE if using Bader Lab genesets - it will parse the term names so it looks neater. I'm not using gary's genesets here, so we will set to FALSE
We have the GSEA results now. And if we want an enrichment plot from the results, we can specify the row(s) of the gsea_res to make the enrichment plots for that/those term(s). Here, I'll just do the top positively and negatively enriched terms (row 1 and nrow(gsea_res))
rows<-c(1, nrow(gsea_res)) enPlot(gseaRes=gsea_res[rows,], #GSEA results table subset into the rows of interest to make a plot rnk=rnk, #Original rnk object used to make gsea_res gmt=gp_dm, #Original gmt object/filename used to make gsea_res title=NULL) #character vector of length (nrow(gseaRes)) for custom titles, or leave NULL for automatic titles - works better for Gary's genesets
Now that we've done some global analyses, let's look at something more focused. Let's say we have certain terms that are enriched in our GO analysis or the GSEA. We can make custom genesets (either separately into a .gmt file), or we can extract genesets related to a keyword from the gpGeneSets. For instance, in the original GO results, we see 'rhodopsin' in a number of terms, so let's focus on that. But if you are unsure as to what a good keyword might be, you can always make a word cloud to start:
#Make a word cloud, to find some ideal keywords: PathWC(text=GO_res$Down$term_name, #Character vector of text to be converted to a word cloud, for pathway results, specify the coloumn of the results with the term/pathway names to investigate cols="Dark2",#Colour scheme for the word cloud retTerms=F, #Return a data frame with the word frequencies? minfreq=3, #How many times must a words show up before you include it in the cloud? rmwords=c("regulation","process","positive", "negative","mediated","cell","cellular","protein"))#Words not to include in the word cloud (because they don't mean anything specific, biologically - these 8 words are set to filter out by default, but you can either set to NULL to keep all words, or uses any combination of these or other words you'd like) #Make a custom GMT from the GO_gost object and gpGeneSets #We want it to be as unbiased as possible, so we'll combind up and downregulated GO results: rhodopsin<-c(customGMT(gost=GO_gost$Down, #gost object from GO_GEM and returnGost=T key="rhodopsin", #keyword to pull gene sets - this is a grep, so anything with this key will be pulled - the resulting geneset may require some manual curation so check the names gmt=gp_dm), #The gpGeneSets object containing the complete gene sets 'gp_hs' for human, 'gp_mm' for mouse and 'gp_dm' for drosophila customGMT(gost=GO_gost$Up, key="rhodopsin", gmt=gp_dm)) #Now we have a geneset of rhodospin-related terms and if we want to write it to a gmt file, we can use: write.gmt(geneSet=rhodopsin, filename="rhodopsin.gmt") #if we have a gene set from outside R, we can read it in using read.gmt: #rhodopsin<-read.gmt(file="rhodopsin.gmt") #And let's say we want to look at the overlap of the rhodopsin genes with DEGs. We can make a Venn diagram: #plotVenn takes a named list of up to length 5: forVenn<-list(DE_Up=DEG$Up, DE_Down=DEG$Down, Rhodopsin=unique(unlist(rhodopsin))) plotVenn(x=forVenn, #name list for plotting Venn diagram title="Rhodopsin genes", cols="Dark2", #Colour scheme for plot lty="blank", #outlines for circles scale=F, #Scale to list sizes? retVals=F) #Return list of values in overlaps?
Now we have a geneset, we can run a single-sample gsea and plot the results:
gsva_plot(counts=as.matrix(symCounts), #counts object (as matrix), make sure rownames are the same nomenclature as the gene symbols in geneset geneset=rhodopsin, method="gsva", #Method for gsva plot - see documentation for options condition=cond, con="WT", #Indicate the control condition title="Rhodospin ssGSEA", compare=NULL, #for pairwise t-tests, leave NULL to do all possible comparisons, or provide a list of vectors, length 2 indicating the conditions to compare col="Dark2", #Colour scheme, can be RColourBrewer palette name, or vector of rgb(), hexadecimal, or colour names style="violin") #If not 'violin' it will be a box plot
We can also plot the overall expression (gsva plots enrichment) of genes:
count_plot(counts=symCounts, scaling="none", #Can be "zscore" to emphasize differences, or 'log10', or "none" genes=unique(unlist(rhodopsin)), #Character vector of gene names - need to unlist the geneset for this condition=cond, con="WT", title="Rhodopsin Genes Expression", compare=NULL, col="Dark2", method="perMean", #What method to plot? "mean", "median", "perMean", "ind", "geoMean" pair=F, #Paired t-tests? pc=1, #pseudocount if scaling="log10" yax="Percent Mean Expression", #y-axis label if default isn't descriptive enough showStat=T, #Show statistics on plot? style="box") #Default is violin
We can also generate heatmaps using zheat():
htree<-zheat(genes=unique(unlist(rhodopsin)), #Character vector of genes to plot in heatmap, NULL will plot all genes counts=symCounts, conditions=cond, con="WT", title="Rhodopsin genes", labgenes="",#Character vecotr of gene names to label in heatmap, NULL labels all, "" will label none zscore=T, #Plot row-zscore? if FALSE, probably want to log transform counts rclus=T, #TRUE=hierarchical clustering, FALSE=order in decreasing expression in control condition, can also give it a dataframe with rownames=gene names and the first column with an identifier to cluster genes hmcol=NULL, #colorRampPalette of length 100 for custom heatmap colours (NULL=default colours) retClus=T) #return clustered objects if rclus=T - will be used to pull clustered genes later
Now let's say we want to pull a cluster of genes from this:
#Cut the hierarchical tree at a certain level (use outputted figure to refine where you're cutting) and output a data frame of genes belonging to each cluster annodf<-BinfTools:::heatClus(out=htree, level=3) head(annodf) zheat(genes=unique(unlist(rhodopsin)), counts=symCounts, conditions=cond, con="WT", title="Rhodopsin - cut tree clusters", labgenes=NULL, zscore=T, rclus=annodf)
We can also plot a heatmap of gene expression in groups based on gene sets
gmtHeat(counts=symCounts, cond=cond, gmt=rhodopsin, con="WT", labgenes="", avgExp=T, #plot the average expression for each condition zscore=T, hmcol=NULL, retGroups=F) #Return list of data frames of expression?
We've looked at hierarchical clustering from zheat() but there is also k-means clustering. This is useful if you have multiple contrasts and want to compare gene expression changes or if you want to compare expression from different conditions. We will look at clustering from different DESeq2 contrasts. We have data from Juvenile fly mushroom bodies, and mature fly mushroom bodies.
#Make a list of DESeq2 results objects resList<-list(Juvenile=res, Mature=res_mature) #We may also want to subset the results to genes that are differentially expressed in at least one of the contrasts - this helps to remove noise from the clustering: genes<-unique(unlist(lapply(lapply(resList, subRes, p=0.05, FC=log2(1.5)), rownames))) resList<-lapply(resList, subRes, genes=genes) #Cluster based on log2FoldChanges in multiple contrasts clusRes<-clusFigs(resList=resList, #Named list of DESeq2 results objects, in the order you want displayed in the figures numClus=8, # Number of k-means clusters to use - play around with it to optimize the number title="Clustered Results", labgenes="", #Label any genes in the heatmap? col="Dark2", #Colour palette hmcol=NULL, #heatmap colours avgExp=F, #If you want to cluster based on expression, you can set resList to avgExp(counts, cond) and then avgExp=T con="Juvenile", #Control condition or contrast - will order results showStat=T, #Show significance on boxplot? retStat=F) #Return stats in list with clustered Results? #If the clusters aren't in the order you like, you can always reorder them: clusRes2<-clusRelev(clusRes=clusRes, #output from clusFigs cluslev = c(8,7,6,5,4,3,2,1), #New order of clusters rename=F, #Rename clusters starting at 1? title="Releveled Clusters", col="Dark2", hmcol=NULL, labgenes="", avgExp=F, showStat=T, retStat=F) #You can also perform GO enrichment on the clusters clusGO<-clusGO(clusRes2, species="dmelanogaster", bg=rownames(res), source="GO:BP ", writeRes=F, pdf=F, returnRes=T, fig=F) #And we can show the results as tables, since otherwise it would be a lot of figures to show: listGO_Table(GOres=clusGO, title="Top Enriched Terms", subtitle="Clustered Results", sig=F, ts=c(10,500), colnames=c("Biological Process","Enrichment","FDR"), numTerm=10, retGT=F, printGT=T)
Now, for one final thing (for now), we can create a shiny app to explore our data and results. This could be handy to send to collaborators in the future to easily browse their results and then tell us what they would like to focus on.
NOTE: This function has not been updated recently and may no longer be compatible with other functions.
app<-exploreData(res=symRes, #Results object, or named list of results objects counts=symCounts, #Normalized counts or name list of normalized counts - must be same names as res cond=cond) #Conditions or named list of conditions (same names as above)
Now the app is it's own object and can be run by:
app
You can save it in an RData object and send (with chunk no. 1 of this Rmd file) to a collaborator to run
There are other functions available in this package that are helpful in analyses. Please take a look at the BinfTools index in R to see all available functions and their documentation.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.