#######################################################################################
# HIPC gene-level multi-cohort analysis script
# Original Developer: Francesco Vallania, Stanford University (2014)
# Refactoring Developer: Evan Henrich, Fred Hutchinson Cancer Research Center (2017)
# purpose: perform gene-level analysis for HIPC
#######################################################################################
#######################################################################################
### HELPER FUNCTIONS ###
#######################################################################################
# ensembl_gene_connector_hs76
#~fetch list of all genes in Homo sapiens
# loading as part of pkg
# ensembl_gene_connector_hs76 <- function(){
#
# #Establish connection to EnsEMBL
# con <- dbConnect(dbDriver("MySQL"),
# username= 'anonymous',
# host = 'ensembldb.ensembl.org',
# dbname = 'homo_sapiens_core_76_38',
# password= '',
# port = 3306);
#
# #build the parts for the query string
# query <- "select gene.stable_id,
# xref.display_label
# from xref
# inner join gene
# where xref.xref_id = gene.display_xref_id";
#
# #run query and get table
# ens_table <- dbGetQuery(con,query);
#
# #Close connection
# dbDisconnect(con);
#
# #return table
# return(ens_table);
# }
# Performs any pairwise ROC comparison in one dataset -----------------------------
# input-> data.frame containint class and score as columns
multi_roc_cmp <- function(input_df,
title_name=input_df$name[1]){
#Get input classes and form pairs (combinations of n elements taken 2 at a time)
input_classes <- unique(input_df$class);
input_pairs <- t(combn(input_classes,2));
test_function <- function(sets){
# select data for test
roc_obj <- roc(subset(input_df,class %in% sets)$class,
subset(input_df,class %in% sets)$score,
lty=5,
lwd=5,
col='royalblue',
percent=T
)
return(list(roc=roc_obj,
name=paste(title_name,"\n",
subset(input_df,class==sets[1])$label[1],
"VS",
subset(input_df,class==sets[2])$label[1],sep=" ")
))
}
#apply function
res <- lapply(1:nrow(input_pairs),
function(i) test_function(input_pairs[i,]))
}
# Gets Stats -----------------------------------------------------------------------
multi_stat_cmp <- function(input_df,test.type = 't.test'){
#Get input classes and form pairs (combinations of n elements taken 2 at a time)
input_classes <- unique(input_df$class);
input_pairs <- t(combn(input_classes,2));
test_function <- function(sets){
#select data for test
set_a <- input_df$score[input_df$class==sets[1]];
set_b <- input_df$score[input_df$class==sets[2]];
#perform stats
two.sided <- NA;
if(test.type=='wilcox.test'){
two.sided <- wilcox.test(set_a,set_b)$p.value;
}
else if(test.type=='t.test'){
two.sided <- t.test(set_a,set_b)$p.value;
}
else{
warning('Unsupported test type.');
}
return(as.numeric(two.sided));
}
#apply function
res <- sapply(1:nrow(input_pairs),
function(i) test_function(input_pairs[i,]));
#return data in the form of a data.frame
return(data.frame(Var1 = input_pairs[,1],
Var2 = input_pairs[,2],
pvalue = res,
bonf = res*nrow(input_pairs)));
}
# Creates violin plot using GGPLOT2 -------------------------------------------------
violinPlotter <- function(inputDF,
labels,
colorVect,
shapeVect){
#format data into data.table
inputDT <- as.data.table(inputDF)
inputDT$label <- labels[inputDT$class+1]
inputDT$label <- factor(inputDT$label,
levels = labels,
ordered = TRUE)
#function for standard error
sterr <- function(vector){sd(vector)/sqrt(length(vector))}
#add column with mean and standard error
inputDT[, mean:=mean(score), by=class]
inputDT[,sterr:=sterr(score),by=class]
#create plot object with ggplot
finalPlot <- ggplot(inputDT,aes(y=score,x=label)) +
geom_violin(trim = F) +
geom_jitter(width = 0.2,aes(col=label,shape=label),size=4) +
scale_color_manual(values=colorVect) +
scale_shape_manual(values=shapeVect) +
geom_point(aes(y=mean),size=6,col='grey35') +
geom_errorbar(aes(ymax=mean+sterr,ymin=mean-sterr),
col='grey35',width = 0.2) + xlab("") +
ylab("Response score") +
theme(axis.text.x=element_blank(),
legend.position = "bottom",
legend.direction = "vertical",
legend.title = element_blank())
}
# Computes ROC curves from an input list of pROC objects --------------------------
rocGGMultiRoc <- function(validation_roc_list,
rocStyle = 1,
rocSize = 1.4,
newLineLegend = TRUE){
roc_data_table <- rocDT(validation_roc_list)
# generate plot
p <- ggplot(roc_data_table,
aes(x= Specificity,y = Sensitivity, colour = name)) +
geom_line(size = rocSize,linetype = rocStyle) +
scale_x_reverse() +
geom_segment(x=-100, y=0, xend=0, yend=100, linetype=3, colour="grey75") +
theme(legend.key.height=unit(2,"line"),
legend.title = element_blank(),
legend.text = element_text(size = 10),
legend.direction = "vertical",
legend.position = "bottom")
}
# reformat in a data.table -----------------------------------------------------------
rocDT <- function(validation_roc_list){
roc_data_table <- rbindlist(lapply(validation_roc_list, function(i){
final_name <- paste(i$name,
"\n",
paste("AUC = ",
round(i$roc$auc[[1]],2),
"%",
collapse=""),
paste(" CI = ",
paste(c(round(ci(i$roc)[1],2),
round(ci(i$roc)[3],2)),
collapse = "-")))
x <- data.table(name = final_name,
AUC = paste("AUC = ",
round(i$roc$auc[[1]],2),
"%",
collapse=""),
Specificity = i$roc$specificities,
Sensitivity = i$roc$sensitivities,
FPR = 100 - i$roc$specificities)
x <- x[ order(Sensitivity), ]
}))
}
# get metaIntegratorResults ----------------------------------------------------------
metaRes <- function(gems){
res <- list()
# Structure it as a list to correctly input into MetaIntegrator
# Keep copy for forest plot figure 4
res$metaObj <- mRes <- MetaIntegrator::runMetaAnalysis( list("originalData" = gems) )
# filter meta-analysis results by 10% FDR
mResFilt <- MetaIntegrator::filterGenes(mRes,
isLeaveOneOut = FALSE,
FDRThresh = 0.10,
heterogeneityPvalThresh = 0)
# store up/down regulated genes as results, filtering FDR thresholds as above
tmp <- mRes$metaAnalysis$pooledResults
tmp <- tmp[ tmp$effectSizeFDR < 0.1, ]
res$upReg <- round(tmp[ tmp$fisherFDRUp < 0.1, ], digits = 3)
res$dnReg <- round(tmp[ tmp$fisherFDRDown < 0.1, ], digits = 3)
# select set of positive genes to be used from this point on
# Keep copy of neg genes for forest plot figure 4
res$posGenes <- mResFilt$filterResults$FDR0.1_es0_nStudies1_looaFALSE_hetero0$posGeneNames
res$negGenes <- mResFilt$filterResults$FDR0.1_es0_nStudies1_looaFALSE_hetero0$negGeneNames
return(res)
}
#######################################################################################
### EXPORTED FUNCTIONS ###
#######################################################################################
# Convert eset into appropriate list type for MetaIntegrator --------------------------
#' Converts expressionSet into list for MetaIntegrator analysis
#' @param eset expressionSet object
#' @param rmMods Remove moderates, takes boolean
#' @param ens_table ensemble table of same name preloaded with pkg
#' @export
makeGEM <- function(eset, rmGrp, grpToCut, ens_table){
# rm moderates or high depending
if( rmGrp ){ eset <- eset[ , eset$fc_res_max_d30 != grpToCut ] }
#create key vector
keys <- fData(eset)$gene_symbol
names(keys) <- rownames(fData(eset))
#remove probes that do not map back to genes
keys[ !(keys %in% ens_table$display_label) ] <- NA
# create class vector and change high-responders to "1" from "2" for MetaIntegrator
# Able to do this because all moderate responders have already been removed
endPoint <- pData(eset)[["fc_res_max_d30"]]
if( rmGrp & grpToCut == 1 ){ endPoint <- gsub(2, 1, endPoint) }
classV <- as.numeric(endPoint)
names(classV) <- row.names(pData(eset))
# Ensure that exprs is numeric
em <- exprs(eset)
class(em) <- "numeric"
#create list
gem <- list(expr = em,
pheno = pData(eset),
name = sdy,
formattedName = sdy,
keys = keys,
class = classV,
comment = "created from SDY GEM matrix + demographics file")
return(gem)
}
# Need to export for use with SDY80 all timepoint plot --------------------------------
#' Generates Single Gene Analysis Scores
#' @param GEM expression matrix
#' @param genes_p up-regulated genes
#' @param genes_n down-regulated genes
#' @param scale unknown
#' @export
arithMean_comb_validate <- function(GEM,
genes_p = NULL,
genes_n = NULL,
scale = 1){
#compute score with mean
score <- rep(0, length(GEM$class) )
if( !is.null(genes_p) ){
currGenes_p <- getSampleLevelGeneData(GEM,genes_p)
score <- score + colMeans(currGenes_p, na.rm=T)
}
if( !is.null(genes_n) ){
currGenes_n <- getSampleLevelGeneData(GEM,genes_n)
score <- score - colMeans(currGenes_n, na.rm=T)
}
if( !is.null(genes_p) || !is.null(genes_n) ){
if( scale == 1 ){
score <- scale(score)[,1];
}
}
#return data.frame
return(data.frame(class = GEM$class,
score = score,
name = GEM$name));
}
# helper function SDY404 column swap -----------------------------------------
#' Swap columns by name
#' @param em expression matrix with colnames including sub1 and sub2
#' @param sub1 subject 1
#' @param sub2 subject 2
#' @export
swapCols <- function(em, sub1, sub2){
idx1 <- grep(sub1, colnames(em))
idx2 <- grep(sub2, colnames(em))
tmp1 <- em[, idx1]
em[, idx1] <- em[ , idx2]
em[ , idx2] <- tmp1
return(em)
}
#***********MAIN METHOD***************************************************************
#' Function to perform single gene meta analysis for HIPC ImmuneSignatures Project
#'
#' @import ggplot2 pROC clinfun
#' @param adjEsetList list of expressionSet objects holding expr, pheno data adjusted for cohort
#' @param cohort Study cohort, young or old
#' @export
singleGeneAnalysis <- function(adjEsetList, cohort, grpToCut){
# Create holder list for results
finalRes <- list("disc","val")
# reformat esets for use in MetaIntegrator functions
# Need to take out moderate responders here for discovery studies,
# because runMetaAnalysis only allows 1 or 0 in class aka endPoint,
# but leave for validation studies for plotting work.
gems <- lapply(names(adjEsetList), function(sdy){
rmGrp <- !(sdy %in% c("SDY67", "SDY80"))
makeGEM(eset = adjEsetList[[sdy]],
rmGrp = rmGrp,
grpToCut = grpToCut,
ens_table = ens_table)
})
names(gems) <- names(adjEsetList)
# Assign discovery set and validation
discStr <- ifelse(grpToCut == 1, "212|400|404|63", "212|404|63")
discGems <- gems[grep(discStr, names(gems))]
finalRes$disc <- metaRes(discGems)
sdyId <- ifelse(cohort == "young", "SDY80", "SDY67")
valGemUnProc <- gems[[ sdyId ]]
# Older cohort does not return any significant genes therefore not processing
# But do need metaAnalysis object for Fig8
if( sdyId == "SDY80" & grpToCut == 1 ){
# compute validation object using validation study gem
valObj <- arithMean_comb_validate(valGemUnProc, finalRes$disc$posGenes, NULL)
# ViolinPlot for Val - fig 6A in manuscript
finalRes$val$ViolinPlot <- violinPlotter(valObj,
c("Low responders",
"Moderate responders",
"High responders"),
c("deepskyblue",
"magenta",
"red"),
c(15,19,17))
# ROCPlot for Val - fig 6B in manuscript
valRoc <- multi_roc_cmp(valObj)
valRocLs <- list(valRoc[[1]], valRoc[[2]])
valRocLs[[1]]$name <- "High Responders Vs Low Responders"
valRocLs[[2]]$name <- "Moderate Responders Vs Low Responders"
finalRes$val$RocPlot <- rocGGMultiRoc(valRocLs) +
scale_colour_manual(values=c("red","magenta"))
}
return(finalRes)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.