#' @title Automated RNASeq Analysis Pipeline
#' @description An easy to use pipeline for analysing RNA Seq data. The package currently supports the following analysis- Differential gene expression analysis using DESeq2, Calculate the most variable genes, PCA analysis, GO enrichment of the differentially expressed genes, KEGG pathway enrichment of the differentially expressed genes, GSEA analysis.
#' @param data Un-normalized counts matrix (please note that you should NOT pass in normalized data). The counts' table should contain unique gene names as the first column. ENSEMBL ID's are also allowed but no other form of ID's are currently supported. Check example- head(example_data): \code{\link{example_data}}.
#' @param meta A CSV file with information regarding the samples. It is absolutely critical that the columns of the counts' matrix and the rows of the metadata are in the same order. The function will not make guesses as to which column of the count matrix belongs to which row of the metadata, these must be provided in a consistent order. Check example- head(example_meta): \code{\link{example_meta}}.
#' @param design The formula that expresses how the counts for each gene depend on your metadata (used to calculate the necessary data for differential gene expression analysis). Check DESeq2 documentation for designing the formula. In general, you pass in a column name (e.g. treatment) of your metadata file or a combination of column names (e.g. treatment + cell_type).
#' @param contrast Information regarding the groups between which you would like to perform differential gene expression analysis. It could be between two groups or between multiple groups and needs to follow the following format: contrast = list(A = c(" "), B= c(" ")). If you are comparing two groups (e.g. control vs treatment), the constrast argument should look like the following: contrast = list(A = c("control"), B= c("treatment")). In situations where you have multiple groups to compare- (e.g. control vs treatment1 and treatment2), you should do the following- contrast = list(A = c("control"), B= c("treatment1", "treatment2")).
#' @param species Only applies to converting ENSEMBL IDs to Gene names (not to enrichment analysis). Species you want to use. To see the different datasets available you can use do: library(biomaRt); followed by mart = useEnsembl('ENSEMBL_MART_ENSEMBL'); followed by listDatasets(mart). Default: 'hsapiens_gene_ensembl'.
#' @param variable.genes numeric: The number of most variable genes to be identified. By default, the program identifies the top 1000 most variable genes.
#' @param folder.name Custom folder name that you would like to save your results in.
#' @param save.dir Directory to save the results in. Default: Working Directory.
#' @param custom.gsea User defined gene list to perform GSEA. File need to be supplied as a dataframe with each row as a gene list
#' @param qc Logical. When passed in TRUE, the program would run the quality control modules on the entire dataset. If you are planning to perform multiple comparisons using the contrast argument, run qc = TRUE for the first time and then change it to qc = FALSE for the subsequent comparisons to speed up the analysis.
#' @param dgea Logical. Parameter to define if differential gene expression analysis is to be performed. Default: TRUE
#' @param ensemblmirror String. Values for the mirror argument are: useast, uswest, asia
#' @param kmeans int. Number of clusters/ gene modules. The Most Variable Genes and Differentially Expressed Genes will be divided into the user defined clusters. Default 10.
#' @import DESeq2
#' @import utils
#' @import plyr
#' @importFrom stats as.formula
#' @return Differentially expressed genes.
#' @examples
#' \dontrun{
#' contrast = list(A = c("control"), B= c("drug_A"))
#' arseq2 (data = example_data,meta = example_meta, design = "treatment", contrast = contrast)
#' }
#' @export
arseq <- function(data,meta,design, contrast, dds=NULL, qc= TRUE, dgea=TRUE,
species='hsapiens_gene_ensembl',
variable.genes=1000, folder.name="ARSeq", custom.gsea=NULL,
kmeans=10, save.dir=getwd(),ensemblmirror="useast"){
# Set the working directory
if (save.dir != getwd()){
setwd(save.dir)
}
# Resolve the contrasts and design
goi <- c(paste(unlist(contrast[[1]]), collapse="_"),paste(unlist(contrast[[2]]), collapse="_"))
groups <- c(unlist(contrast[[1]]),contrast[[2]])
# Figure out the contrast column programatically
colum.names <- c()
statements <- c()
for (i in 1: length(groups)){
colum.names <- c(colum.names,colnames(meta[, which(meta == groups[i], arr.ind=T)[1,][[2]], drop=FALSE]))
}
# IF the groups of interest are found in multiple colums; find if any one column that contains all groups of interest
for(i in colum.names){statements <- c(statements,all(groups %in% meta[,i]))}
for(i in colum.names){if (all(groups %in% meta[,i])){con = i}}
if (all(colum.names == colum.names[1])){
contrast.by = colum.names[1]
# Remname the column with contrast information to arseq.group
colnames(meta)[which(colnames(meta) == contrast.by)] <- "arseq.group"
}else if (any(statements)){
contrast.by = con
colnames(meta)[which(colnames(meta) == contrast.by)] <- "arseq.group"
print(paste("Meta Data column that has been chosen for contrast determination is [[",con,"]]"))
}else{stop('All of your contrast groups for Differntial expression analysis need to be listed under the same column within your metadata file')}
# resolving design
design <- gsub(contrast.by, "arseq.group", design)
design <- paste("~",design,sep = "")
design <- as.formula(design)
# Creating a folder to save results
print("Creating a folder to store your analysis results")
suppressWarnings(dir.create(folder.name))
# ENSEMBL ID to gene names
if (all(substr(row.names(data),1,3) == "ENS")){
data <- arseq.ensembl2genename(data,ensemblmirror,species)
}
# Remove genes that are completely not expressed
print("Removing genes that are not expressed: if any")
data <- data[rowSums(data) > 0, ]
# create a copy for keeping the same order for normalized data
orginal_order <- colnames(data)
# Rearrange the samples
meta <- meta[order(meta$arseq.group),]
data <- data[,row.names(meta)]
# Generate the DESEq2 data object
if(is.null(dds)){
print("Generating the DESeq object")
dds <- DESeqDataSetFromMatrix(countData = data, colData = meta, design = design)
dds <- DESeq(dds)
}
# to return
dds_raw <- dds
# Get the normalized matrix for heatmaps
print("Normalizing data and saving the normalized data")
n_data <- log2(counts(dds, normalized=TRUE)+1)
n_data <- n_data [,orginal_order]
write.csv(n_data, file = paste(folder.name,"/normalized_data",".csv",sep = ""))
# Quality Control of the data
if (isTRUE(qc)){
# Create a folder to save results
suppressWarnings(dir.create(paste(folder.name,"/Quality Control",sep = "")))
# General structure of the dataset
print("Performing Quality Control on the data")
# Calculate Euclidean distance between samples
euclid.dist <- arseq.euclid.dist (dds)
euclid.plot <- arseq.euclid.dist.plot (euclid.dist) # Euclidean distance heatmap
arseq.pheatmap(euclid.plot, filename="Euclidean distance.pdf", save.dir=paste(folder.name,"/Quality Control",sep="")) # Save Euclidean distance.pdf
# Calculate Poisson Distance between samples
poisd.dist <- arseq.poisd.dist (dds)
poisd.plot <- arseq.poisd.dist.plot (poisd.dist) # Poisson distance heatmap
arseq.pheatmap(poisd.plot, filename="Poisson distance.pdf", save.dir=paste(folder.name,"/Quality Control",sep="")) # Save Poisson distance.pdf
#PCA plot
pca.plot <- arseq.pca.plot (dds, intgroup="arseq.group")
arseq.plot.save (pca.plot, filename="PCA plot.pdf", width=7, height=7, save.dir= paste(folder.name,"/Quality Control",sep="")) # Save the PCA plot
# Eignen vectors of the PC's
pca.ev <- arseq.pca (dds)
write.csv(pca.ev, file = paste(folder.name,"/Quality Control/PCA Eigenvectors.csv",sep="")) # save the eigen vectors
# MSD plot
mds.plot <- arseq.mds.plot (dds, intgroup="arseq.group")
arseq.plot.save (mds.plot, filename="MDS plot.pdf", width=7, height=7, save.dir= paste(folder.name,"/Quality Control",sep="")) # Save the MDS plot
# Most variable genes
mvg <- arseq.mvg (dds,variable.genes=variable.genes)
write.csv(mvg, file = paste(folder.name,"/Quality Control/most variable genes.csv",sep=""))
# K-Means clustering of the most variable genes
if (nrow(mvg) > 100) {
kmeans.clusters <- arseq.kmeans (data=mvg, kmeans=kmeans)
mvg.plot <- arseq.kmeans.plot (data=mvg, clusters=kmeans.clusters, metadata=meta,save.plot=TRUE,
save.dir= paste(folder.name,"/Quality Control/",sep=""))
reactome.plot <- arseq.kmeans.reactome.plot (clusters=kmeans.clusters, save.plot=TRUE,
save.dir= paste(folder.name,"/Quality Control/",sep=""))
write.csv(data.frame(kmeans.clusters), file = paste(folder.name,"/Quality Control/most variable genes kmeans clusters.csv",sep=""))
} else {
# Heatmap of the most variable genes
mvg.plot <- arseq.mvg.plot (mvg,metadata=meta,intgroup="arseq.group",
save.plot=TRUE, save.dir=paste(folder.name,"/Quality Control/",sep=""))
}
}
if (isTRUE(dgea)){
# Create folder to save the differentially expression results
suppressWarnings(dir.create(paste(folder.name,"/",goi[1], " vs ", goi[2],sep = "")))
location <- paste(folder.name,"/",goi[1], " vs ", goi[2],"/",sep = "")
# Manipulate the dds groups to look at the contrasts of interest
dds$arseq.group <- mapvalues(dds$arseq.group, from=contrast[[1]], to=rep(paste(unlist(contrast[[1]]), collapse="_"),length(contrast[[1]])))
dds$arseq.group <- mapvalues(dds$arseq.group, from=contrast[[2]], to=rep(paste(unlist(contrast[[2]]), collapse="_"),length(contrast[[2]])))
# Subset the normalized matrix and dds based on the groups of interest
colums_to_subset <- row.names(meta[meta$arseq.group %in% groups,,drop=FALSE]) # Changed
n_data_goi <- data.frame(n_data[,colums_to_subset])
dds_subset <- dds[ , dds$arseq.group %in% goi ]
dds_subset$arseq.group <- factor(dds_subset$arseq.group)
# Differential Gene Expression Analysis
print("Performing Differential gene expression analysis between the constrast groups")
dds <- DESeq(dds)
deg <- results(dds, contrast=c("arseq.group",goi[1],goi[2]))
deg <- deg[order(deg$padj),]
# Save the DEG matrix
print("Saving the differentially expressed genes")
suppressWarnings(dir.create(paste(location,"Differential expression/",sep = "")))
write.csv(deg, file = paste(location,"Differential expression/",goi[1], " vs ", goi[2],".csv",sep = ""))
# Heatmap of the differentially expressed genes
try(deg.plot <- arseq.deg.plot (deg, data=n_data_goi, dds=dds_subset, save.plot=TRUE, save.dir=paste(location,"Differential expression/",sep="")))
# Kmeans cluster DEG to identify submodules within the differentially expressed genes
try(sig_deg <- row.names(data.frame(deg[which(deg$padj <= 0.05), ])))
try(sig_deg <- n_data[row.names(n_data) %in% sig_deg, ])
try(
if (nrow(sig_deg) > 100) {
suppressWarnings(dir.create(paste(location,"Differential expression/kmeans/",sep = "")))
kmeans.clusters.deg <- arseq.kmeans (data=sig_deg, kmeans=kmeans)
mvg.plot.deg <- arseq.kmeans.plot (data=sig_deg, clusters=kmeans.clusters.deg, metadata=meta,save.plot=TRUE,
save.dir= paste(location,"/Differential expression/kmeans/",sep=""))
reactome.plot.deg <- arseq.kmeans.reactome.plot (clusters=kmeans.clusters.deg, save.plot=TRUE,
save.dir= paste(location,"/Differential expression/kmeans/",sep=""))
write.csv(data.frame(kmeans.clusters.deg), file = paste(location,"/Differential expression/kmeans/kmeans clusters.csv",sep=""))
})
# Calculate Euclidean distance between contrast groups
suppressWarnings(dir.create(paste(location,"Sample relation plots",sep = "")))
euclid.dist.subset <- arseq.euclid.dist (dds_subset)
euclid.plot.subset <- arseq.euclid.dist.plot (euclid.dist.subset) # Euclidean distance heatmap
arseq.pheatmap(euclid.plot.subset, filename=paste("Euclidean distance- ",goi[1], " vs ", goi[2],".pdf",sep = ""),
save.dir=paste(location,"Sample relation plots",sep=""))
# Calculate Poisson Distance between samples
poisd.dist.subset <- arseq.poisd.dist (dds_subset)
poisd.plot.subset <- arseq.poisd.dist.plot (poisd.dist.subset) # Poisson distance heatmap
arseq.pheatmap(poisd.plot.subset, filename=paste("Poisson distance- ",goi[1], " vs ", goi[2],".pdf",sep = ""),
save.dir=paste(location,"Sample relation plots",sep=""))
# PCA
pca.plot.subset <- arseq.pca.plot (dds_subset, intgroup="arseq.group")
arseq.plot.save (pca.plot.subset, filename=paste("PCA plot- ",goi[1], " vs ", goi[2],".pdf",sep = ""),
width=7, height=7,
save.dir=paste(location,"Sample relation plots",sep="")) # Save the PCA plot
# Eignen vectors of the PC's
pca.ev.subset <- arseq.pca (dds_subset)
write.csv(pca.ev.subset, file = paste(location,"Sample relation plots/",goi[1], " vs ", goi[2]," PCA Eigenvectors.csv",sep = "")) # save the eigen vectors
# MSD plot
mds.plot.subset <- arseq.mds.plot (dds_subset, intgroup="arseq.group")
arseq.plot.save (mds.plot.subset, filename=paste("MDS plot- ",goi[1], " vs ", goi[2],".pdf",sep = ""),
width=7, height=7,
save.dir= paste(location,"Sample relation plots",sep="")) # Save the MDS plot
# Volcano plot
volcano.plot <- arseq.volcano.plot (deg)
arseq.plot.save (volcano.plot, filename=paste("Volcano plot- ",goi[1], " vs ", goi[2],".pdf",sep = ""),
width=14, height=14,
save.dir= paste(location,"Sample relation plots",sep="")) # Save the MDS plot
# GO Enrichment Analysis
suppressWarnings(dir.create(paste(location,"GO enrichment analysis/",sep = "")))
try(go.enrich <- arseq.go.enrich (deg,Padj=0.05))
try(write.csv(go.enrich, file = paste(location,"GO enrichment analysis/","GO Enrichment- ",goi[1], " vs ", goi[2],".csv", sep = "")))
# GO Enrichment plot
try(go.plot <- arseq.go.enrich.plot (go.enrich))
try(arseq.plot.save (go.plot, filename=paste("GO Enrichment plot- ",goi[1], " vs ", goi[2],".pdf",sep = ""),
width=6, height=20,
save.dir= paste(location,"GO enrichment analysis",sep=""))) # Save the GO plot
# Reactome Ecnrichment Analysis
suppressWarnings(dir.create(paste(location,"Reactome enrichment analysis/",sep = "")))
try(reactome.enrich <- arseq.reactome.enrich (deg,Padj=0.05,plot=TRUE,save.plot=TRUE, save.dir= paste(location,"Reactome enrichment analysis/",sep="")))
try(write.csv(reactome.enrich, file = paste(location,"Reactome enrichment analysis/","Reactome Enrichment- ",goi[1], " vs ", goi[2],".csv", sep = "")))
# Kegg Pathway Analysis for the DEG's
suppressWarnings(dir.create(paste(location,"KEGG pathway enrichment/",sep = "")))
try(kegg.enrich <- arseq.kegg.enrich (deg, kegg.compare="as.group"))
# write the reults
try(write.csv(kegg.enrich$up_regulated_pathways, file = paste(location,"KEGG pathway enrichment/","KEGG Up Regulated Pathways- ",goi[1], " vs ", goi[2],".csv",sep = "")))
try(write.csv(kegg.enrich$down_regulated_pathways, file = paste(location,"KEGG pathway enrichment/","KEGG Down Regulated Pathways- ",goi[1], " vs ", goi[2],".csv",sep = "")))
# Kegg Pathway Analysis Plot
try(kegg.plot <- arseq.kegg.enrich.plot (kegg.enrich,foldchanges=kegg.enrich$foldchanges,save.dir= paste(location,"KEGG pathway enrichment",sep="")))
try(kegg.plot.2 <- arseq.kegg.enrich.plot (kegg.enrich,foldchanges=kegg.enrich$foldchanges,pathway.plots=FALSE))
try(arseq.plot.save (kegg.plot.2, filename=paste("KEGG Enrichment plot- ",goi[1], " vs ", goi[2],".pdf",sep = ""),
width=8, height=12,
save.dir= paste(location,"KEGG pathway enrichment",sep=""))) # Save the KEGG plot
# GSEA Analysis
try(ranked.list <- arseq.gsea.preprocess (deg))
try(gsea.output <- arseq.gsea.runall (ranked.list, save=TRUE, save.dir=location, custom.gsea=custom.gsea))
}
#if (isTRUE(dgea)){return(deg)}
return (dds_raw)
# END
print(paste("Well done- your analysis is now complete. Head over to [[", getwd(), "]] to view your results"))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.