#' Calculate WGCNA module coherence
#'
#' Calculate within module pairwise gene correlations from WGCNA modules and gene expression data.
#' Test the coherence of modules built from Dataset A in data from Dataset B.
#'
#'
#' @param mods Dataframe giving module membership such as output by make_modules. Must include mods_var with module names/IDs and gene_var with gene symbols/IDs
#' @param mods_title Character string specifying name the study/dataset from which modules were built. This is used simply for labeling of outputs. Default = "STUDY1"
#' @param mods_var Character string specifying name of the column in mods df which defines modules. Defaults to "module.char" which is output by make_modules()
#' @param remove_mods Vector of character strings naming modules which you want removed from the analysis. Default includes 0,"0","00","grey"
#' @param dat limma EList output by voom( ) or dataframe contains expression data (like dat$E) in which module coherence is to be tested.
#' @param gene_var Character string specifying name of the column in mods df which gives gene IDs. Values must match rownames in expression data (dat). Defaults to "geneName" which is output by make_modules()
#' @param dat_title Character string specifying name of the study from which the data come. This is used simply for labeling of outputs. Default = "STUDY2"
#' @param return_plot Logical indicating whether plot should be printed when function runs. Defaults = TRUE.
#' @param r_cutoff Vector or single numeric value at which to draw red cutoff lines in correlation plot. Default = 0.3
#' @param p_cutoff Vector or single numeric value at which to draw red cutoff lines in significance plot. Default = 0.01
#'
#' @return List including:
#' \itemize{
#' \item{coherence_boxplot_combined} A ggplot plot object visualizing distributions of correlation and p values.
#' \item{coherence_boxplot_cor} A ggplot plot object visualizing distributions of correlation values.
#' \item{coherence_boxplot_p} A ggplot plot object visualizing distributions of correlation p values.
#' \item{subgene_correlation_df} Dataframe containing pairwise gene correlations within modules.
#' \item{GSabsent} A list of missing genes enumerating genes defined in modules but missing from expression data.
#' }#'
#' @export
#'
#' @examples
#' moduleCoherence <- calculate_module_coherence(mods = example.mods,
#' dat = example.voom, mods_title = "Example WGCNA Modules",
#' dat_title = "Example Data", r_cutoff = 0.3, p_cutoff = 0.01)
calculate_module_coherence<-
function(mods,
mods_title="STUDY1",
mods_var = "module.char",
remove_mods = c(0,"0","00","grey"),
dat,
gene_var="geneName",
dat_title="STUDY2",
return_plot=TRUE,
r_cutoff = 0.3,
p_cutoff = 0.01){
# set undefined global variables to null
X1 <- X2 <- allCorVals<- allPVals <- V3<- Cor<- P <- Set <- gene1<- gene2<- P.est<- h_label<- h_pos<- negLogP<- value<- h_line<- NULL
# convenience define module definition dataframe and check data formatting
moduleSets<- mods
if(class(dat)[1]== "EList"){voomData<-dat$E}else{voomData<-dat}
if(!is.numeric(voomData)){
print("Warning, count/expression matrix is non-numeric.
This usually arises when a non-expression column has been retained from a metadata object (eg due to grouping).
Be advised to check for stray metadata in the expression matrix.")}
#-----------------------------------------------------------------
# Calculate the gene set means (module expression)
# [this is currently not used, but may be incorporated later]
#-----------------------------------------------------------------
uniGS<-moduleSets%>%
dplyr::pull(mods_var)%>%
unique()%>%as.character()%>%gtools::mixedsort() # Pull unique modules & order correctly
# establish storage
GSsub<-c()
GSabsent<-list()
# for each geneset pull corresponding gene indices and compute means
for (i in uniGS){
curGS<-i
curIDs<-as.character(moduleSets[which(moduleSets[, mods_var]==curGS), gene_var])
matchIndex<-match(curIDs, rownames(voomData))
if (any(is.na(matchIndex))){
matchIndex<-matchIndex[-which(is.na(matchIndex))]}
GSabsent[[i]]<-curIDs[which(!(curIDs %in% rownames(voomData)))]
if (length(curIDs[which(curIDs %in% rownames(voomData))]) > 1){
GSsub<-rbind(GSsub, apply(voomData[matchIndex,], 2, mean))
}else{
GSsub<-rbind(GSsub, voomData[matchIndex,])}
}
# print message if genes are missing from genesets.
if(length(unlist(GSabsent))>0){
print(paste0(length(unique(unlist(GSabsent))), " Module Genes absent in voom object. Missing genes accesible in .$GSabsent"))}
#-------------------------------------------
# Set function for correlation p values
#-------------------------------------------
# Correlation p value function
cor2pvalue = function(r, n) {
t <- (r*sqrt(n-2))/sqrt(1-r^2)
p <- 2*stats::pt(abs(t),(n-2), lower.tail=FALSE)
se <- sqrt((1-r*r)/(n-2))
out <- list(r, n, t, p, se)
names(out) <- c("r", "n", "t", "p", "se")
return(out)
}
# remove gene sets that are zero
if(!is.null(remove_mods)){uniGS<-uniGS[-which(uniGS %in% remove_mods)]}
# Setup storage
SubGeneCorDF<-list()
SubGeneCorMedian<-c()
SubGenePMedian<-c()
# for each geneset pull the correct gene IDs and indices, calculate correlation matrices, and format results.
for (i in uniGS){
curGS<-i
curIDs<-as.character(moduleSets[which(moduleSets[, mods_var]==curGS), gene_var])
matchIndex<-match(curIDs, rownames(voomData))
if (any(is.na(matchIndex))){
matchIndex<-matchIndex[-which(is.na(matchIndex))]}
if(length(matchIndex)>1){ # run correlation calculations as long as modules contain multiple genes
setCor<-stats::cor(t(voomData[matchIndex,])) # Calculate pairwise gene correlations from expression data
n<-t(!is.na(t(voomData[matchIndex,]))) %*% (!is.na(t(voomData[matchIndex,]))) # create matrix with sample size to calculate correlation p (dim=curIDs*curIDs, value = sample size)
allCorInfo<-cor2pvalue(setCor, n) # calculate correlation p values from matrix.
setP<-allCorInfo$p
SubGeneCorDF[[i]]<- # Assemble df of gene pairs, correlations, and p values
data.frame(t(utils::combn(colnames(setCor), 2)),
allCorVals=setCor[t(utils::combn(colnames(setCor), 2))],
allPVals = setP[t(utils::combn(colnames(setP), 2))])%>%
dplyr::rename(gene1 = X1, gene2 = X2)%>%
dplyr::mutate(V3=i)} else {print(paste0("Faulty gene-set:", curGS, " only contains 1 gene present in expression data. Skipping."))}
}
# Assemble and format results dataframe
SubGeneCorDF<-
dplyr::bind_rows(SubGeneCorDF)%>%
dplyr::rename(Cor = allCorVals, P = allPVals, Set = V3)%>%
dplyr::mutate(Cor = as.numeric(as.character(Cor)))%>%
dplyr::mutate(P = as.numeric(as.character(P)))%>%
dplyr::mutate(Set = factor(Set, levels =uniGS))%>%
dplyr::select(Set, gene1, gene2, Cor, P)
#Warning if 0 exist in p-values
if(min(SubGeneCorDF$P) ==0){
print(paste(length(SubGeneCorDF$P[SubGeneCorDF$P==0]),
"p-values = 0. In plots, these will be replaced with the lowest non-zero value",
formatC(sort(unique(SubGeneCorDF$P))[2], digits=2, format="e")))
}
#Boxplot of all pairwise correlations per module
coherence_boxplot_cor<-
SubGeneCorDF%>%
ggplot2::ggplot(ggplot2::aes(y=Cor, x=Set))+
ggplot2::geom_hline(yintercept =0, color="black")+
ggplot2::geom_boxplot(outlier.shape = NA)+
ggplot2::geom_hline(yintercept = r_cutoff, linetype = "dashed", color = "red")+
ggplot2::scale_y_continuous(breaks=c(seq(-1,1,0.2), r_cutoff))+
ggplot2::ylab("Correlation Strength")+
ggplot2::xlab(paste0(mods_title, " Modules"))+
ggplot2::labs(title = paste0(mods_title, " Module Coherence in ", dat_title, " Samples"),
subtitle = "Inter-Gene PearsonCorrelation")+
ggplot2::theme_bw()+
ggplot2::theme(panel.grid.minor = ggplot2::element_blank(),
panel.grid.major.x = ggplot2::element_blank(),
plot.title = ggplot2::element_text(hjust=0.5),
plot.subtitle = ggplot2::element_text(hjust=0.5),
axis.text.x = ggplot2::element_text(angle=45, hjust=1, vjust=1))
# define plot labels
labels_df_P<-data.frame(name=rep("negLogP", length(p_cutoff)),
h_pos = -log10(p_cutoff)-0.2,
h_label = paste("p=",p_cutoff,sep=""))
#Boxplot of all pairwise correlations pvals per module
coherence_boxplot_p<-
SubGeneCorDF%>%
dplyr::mutate(P.est = ifelse(P==0, sort(unique(SubGeneCorDF$P))[2], P)) %>% #Fill in true 0 with lowest P-value in dataset
ggplot2::ggplot(ggplot2::aes(y=-log10(P.est), x=Set))+
ggplot2::geom_hline(yintercept =0, color="black")+
ggplot2::geom_boxplot(outlier.shape = NA)+
ggplot2::geom_hline(yintercept = -log10(p_cutoff), color="red", linetype="dashed")+
ggplot2::geom_text(data=labels_df_P, ggplot2::aes(label = h_label, x=3, y=h_pos), color="red")+
ggplot2::ylab("-log10(Correlation P Value)")+
ggplot2::xlab(paste0(mods_title, " Modules"))+
ggplot2::labs(title = paste0(mods_title, " Module Coherence in ", dat_title, " Samples"),
subtitle = "Inter-Gene PearsonCorrelation")+
ggplot2::theme_bw()+
ggplot2::theme(panel.grid.minor = ggplot2::element_blank(),
panel.grid.major.x = ggplot2::element_blank(),
plot.title = ggplot2::element_text(hjust=0.5),
plot.subtitle = ggplot2::element_text(hjust=0.5),
axis.text.x = ggplot2::element_text(angle=45, hjust=1, vjust=1))
# Plot equivalent with facets
# define plot labels
labels_df<-data.frame(name=c(rep("Cor", length(r_cutoff)), rep("negLogP", length(p_cutoff))),
h_line = c(r_cutoff, -log10(p_cutoff)),
h_pos = c(r_cutoff-0.02, -log10(p_cutoff)-0.2),
h_label = c(paste("r=",r_cutoff,sep=""), paste("p=",p_cutoff,sep="")))
#Boxplot of all pairwise correlations & pvals per module
coherence_boxplot_faceted<-
SubGeneCorDF%>%
dplyr::mutate(P.est = ifelse(P==0, sort(unique(SubGeneCorDF$P))[2], P)) %>% #Fill in true 0 with lowest P-value in dataset
dplyr::mutate(negLogP = -log10(P.est))%>%
tidyr::pivot_longer(cols = c(Cor, negLogP))%>%
ggplot2::ggplot(ggplot2::aes(y=value, x=Set))+
ggplot2::geom_hline(yintercept =0, color="black")+
ggplot2::geom_boxplot(outlier.shape = NA)+
ggplot2::geom_hline(data=labels_df, ggplot2::aes(yintercept = h_line), color = "red", linetype="dashed")+
ggplot2::geom_text(data=labels_df, ggplot2::aes(label = h_label, x=3, y=h_pos), color="red")+
ggplot2::xlab(paste0(mods_title, " Modules"))+
ggplot2::labs(title = paste0(mods_title, " Module Coherence in ", dat_title, " Samples"),
subtitle = "Inter-Gene PearsonCorrelation")+
ggplot2::theme_bw()+
ggplot2::theme(
axis.title.y = ggplot2::element_blank(),
strip.placement = "outside",
strip.background = ggplot2::element_blank(),
panel.grid.minor = ggplot2::element_blank(),
panel.grid.major.x = ggplot2::element_blank(),
plot.title = ggplot2::element_text(hjust=0.5),
plot.subtitle = ggplot2::element_text(hjust=0.5),
axis.text.x = ggplot2::element_text(angle=45, hjust=1, vjust=1))+
ggplot2::facet_wrap(~name, scales = "free_y", strip.position = "left",
labeller = ggplot2::labeller(name = c("Cor" = "Correlation Strength", "negLogP" = "-log10(Correlation P Value)")))
# Format results for return
if(return_plot){print(coherence_boxplot_faceted)}
if(length(unlist(GSabsent))>0){
return(list(coherence_boxplot_combined = coherence_boxplot_faceted,
coherence_boxplot_cor = coherence_boxplot_cor,
coherence_boxplot_p = coherence_boxplot_p,
subgene_correlation_df = SubGeneCorDF,
GSabsent = GSabsent))} else
return(list(coherence_boxplot_combined = coherence_boxplot_faceted,
coherence_boxplot_cor = coherence_boxplot_cor,
coherence_boxplot_p = coherence_boxplot_p,
subgene_correlation_df = SubGeneCorDF))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.