r reportTitle
This is a fully reproducible signature development report created with the rdsmGeneSig deterministic statistical machine version r packageVersion("rdsmGeneSig")
. The machine is available from Github at https://github.com/jtleek/rdsmGeneSig.
Before beginning the analysis we load the required R packages.
## Load the packages required for the analysis.
## FIX: library() is used instead of require() so that a missing package
## fails immediately with an informative error rather than silently
## returning FALSE and causing obscure failures later in the pipeline.
library(inSilicoMerging)
library(GEOquery)
library(affy)
library(frma)
library(hthgu133afrmavecs)
library(hgu133plus2frmavecs)
library(AnnotationDbi)
library(affyio)
library(survHD)
library(survAUC)
library(healthvis)
library(sva)
library(tspreg)
## Record when the analysis started; printed in the reproducibility
## section at the end of the report.
timeStampFormat <- "_Date_%Y-%m-%d_Time_%H-%M-%S"
startTime <- format(Sys.time(), timeStampFormat)
First we load in the discovery metadata files and merge them together.
## How many discovery metadata files were supplied?
numDisc <- length(discMETA)
## Read the metadata: a list of data frames when there are several
## files, a single plain data frame otherwise.
if (numDisc > 1) {
  discMetaData <- lapply(discMETA, read.csv)
} else {
  discMetaData <- read.csv(discMETA)
}
According to your inputs we discovered r length(discMETA)
files. The files suggest that there are n=r dim(discMetaData)[1]
and m=r dim(discMetaData)[2]
variables in your data set.
Load in the discovery expression data and run frozen robust multi-array average normalization to allow for single-sample prediction. For more information on frma see McCall et al. 2009
## Load the discovery CEL files and normalize them with frozen robust
## multi-array average (frma), which permits later single-sample
## prediction. The full file path of each CEL file is prepended to the
## phenotype data of the resulting ExpressionSet.
## BUG FIX: in the multi-set branch the per-set file names were assigned
## with `discFileNames <- list.files(...)`, overwriting the preallocated
## list on every iteration; they are now stored per element with `[[i]]`.
if (numDisc > 1) {
  discExpr <- discFileNames <- vector("list", numDisc)
  for (i in seq_len(numDisc)) {
    discFileNames[[i]] <- list.files(discCEL[i], full.names = TRUE)
    discAffyBatch <- read.affybatch(filenames = discFileNames[[i]])
    discExpr[[i]] <- frma(discAffyBatch, summarize = "robust_weighted_average")
    pData(discExpr[[i]]) <- cbind(data.frame(fullFilePath = discFileNames[[i]]),
                                  pData(discExpr[[i]]))
  }
} else {
  discFileNames <- list.files(discCEL, full.names = TRUE)
  discAffyBatch <- read.affybatch(filenames = discFileNames)
  discExpr <- frma(discAffyBatch, summarize = "robust_weighted_average")
  pData(discExpr) <- cbind(data.frame(fullFilePath = discFileNames),
                           pData(discExpr))
}
We identified r dim(discExpr)[2]
samples collected from the r annotation(discExpr)
platform. We discovered r numDisc
expression sets. r if(numDisc > 1){discMergeText}else{discMergeText2}
.
## Combine the discovery data sets when more than one was supplied.
if (numDisc > 1) {
  # Full outer join of all metadata tables on their shared columns.
  mergedDiscMeta <- Reduce(function(...) base::merge(..., all = TRUE),
                           discMetaData)
  # Merge the expression sets (inSilicoMerging handles cross-study effects).
  mergedDiscExpr <- inSilicoMerging::merge(discExpr)
} else {
  mergedDiscMeta <- discMetaData
  mergedDiscExpr <- discExpr
}
The next step is to make sure that the file names match at least one of the columns and keep only those files with matching meta data.
## Match the CEL file names read from disk against the metadata and keep
## the metadata rows in the same order as the expression samples.
exprCelFiles <- rownames(pData(mergedDiscExpr))
## Fraction of CEL file names matched by each metadata column.
percentMatchedNames <- apply(mergedDiscMeta, 2,
                             function(x) { mean(exprCelFiles %in% as.character(x)) })
## The column matching the most files is taken to hold the file names.
fileNameIndex <- which.max(percentMatchedNames)
## Reorder metadata rows to line up with the expression samples.
dat <- mergedDiscMeta[match(exprCelFiles, mergedDiscMeta[, fileNameIndex]), ]
rownames(dat) <- rownames(pData(mergedDiscExpr))
## BUG FIX: the offset into the combined pData was hard-coded as +2,
## which is only correct when the pre-existing pData has exactly two
## columns; the actual column count is now used.
nExistingCols <- ncol(pData(mergedDiscExpr))
pData(mergedDiscExpr) <- cbind(pData(mergedDiscExpr), dat)
fileNameIndex <- fileNameIndex + nExistingCols
We identified the file names in your metadata with variable name r names(pData(mergedDiscExpr))[fileNameIndex]
.r if(mean(pData(mergedDiscExpr)[,fileNameIndex] == rownames(pData(mergedDiscExpr))) < 1){text <- "WARNING! File names in meta data and expression data do not perfectly match.\n"; text;}
Next we collect information on RNA quality and batch effects.
## Record the data dimensions and derive batch-effect covariates.
nSamples <- dim(mergedDiscExpr)[2]
nFeatures <- dim(mergedDiscExpr)[1]
## makeBatch() builds processing-date batches, scan batches and RIN
## values from the expression set (helper defined elsewhere in the
## machine -- TODO confirm its return field names).
batchVars <- makeBatch(mergedDiscExpr)
## Attach the batch variables to the phenotype data.
## BUG FIX: scanBatch was previously filled with batchVars$pdBatch
## (copy/paste error); it now uses batchVars$scanBatch.
pData(mergedDiscExpr) <- cbind(pData(mergedDiscExpr),
                               pdBatch = batchVars$pdBatch,
                               scanBatch = batchVars$scanBatch,
                               rinVals = batchVars$rin)
## Survival outcome used throughout the rest of the report.
phenoVar <- getPheno(mergedDiscExpr)
Next we consider possible confounding variables in the study design. We look for measures of RNA quality and batch effects in the metadata and also attempt to identify scan dates from the microarrays.
## Test each candidate confounder (RIN, processing-date batch, scan
## batch) for association with the survival outcome. The *Vals flags
## record which variables were available; the *text strings hold the
## generated report sentences (empty when the variable is absent).
scanBatchVals <- pdBatchVals <- rinVals <- FALSE
sbtext <- pdtext <- rtext <- ""
## RIN (RNA integrity number), when present in the phenotype data.
if(any(!is.na(pData(mergedDiscExpr)$rinVals))){
  rinVals <- TRUE
  tmpVar <- pData(mergedDiscExpr)$rinVals
  tmpOut <- phenoVar
  # p-value from row 2, column 4 of the Cox model analysis-of-deviance table
  rinP <- anova(coxph(tmpOut ~ tmpVar))[2,4]
  rtext <- batchOutText(tmpVar,tmpOut,1)
}
## Processing-date batch, when present.
if(any(!is.na(pData(mergedDiscExpr)$pdBatch))){
  pdBatchVals <- TRUE
  tmpVar <- pData(mergedDiscExpr)$pdBatch
  tmpOut <- phenoVar
  pdbatchP <- anova(coxph(tmpOut ~ tmpVar))[2,4]
  pdtext <- batchOutText(tmpVar,tmpOut,2)
}
## Scan-date batch, when present.
if(any(!is.na(pData(mergedDiscExpr)$scanBatch))){
  scanBatchVals <- TRUE
  tmpVar <- pData(mergedDiscExpr)$scanBatch
  tmpOut <- phenoVar
  scanbatchP <- anova(coxph(tmpOut ~ tmpVar))[2,4]
  sbtext <- batchOutText(tmpVar,tmpOut,3)
}
r rtext
r pdtext
r sbtext
Next, we divide the data into training and testing data sets. This is done by randomly subdividing the data set into a training set with 2/3 of the samples and a test set with 1/3 of the samples.
## Reproducible split of the merged discovery set: 2/3 training,
## 1/3 testing, driven by the user-supplied analysisSeed.
set.seed(analysisSeed)
nTrain <- floor(nSamples * 2 / 3)
trainIndex <- sample(1:nSamples, size = nTrain)
testIndex <- (1:nSamples)[-trainIndex]
## Subset the expression set by sample (column).
discTrain <- mergedDiscExpr[, trainIndex]
discTest <- mergedDiscExpr[, testIndex]
We created a training set of size ntrain=r length(trainIndex)
and a test set of size ntest=r length(testIndex)
.
Now we use frozen surrogate variable analysis to get batch-effect removed versions of split training data.
## Frozen surrogate variable analysis: estimate surrogate variables on
## the training expression data, then remove them from both splits so
## the test data is adjusted without being used for estimation.
trainExprs <- exprs(discTrain)
testExprs <- exprs(discTest)
## Outcome model matrix for sva (second column of the survival object).
mod <- model.matrix(~phenoVar[trainIndex, 2])
svaObj <- sva(trainExprs, mod)
fsvaObject <- fsva(trainExprs, mod, svaObj, testExprs)
cleanTrain <- fsvaObject$db   # batch-adjusted training data
cleanTest <- fsvaObject$new   # batch-adjusted test data
We fit the model to the first half of the clean training data.
## Fit the top-scoring-pairs regression, then choose how many pairs
## (1..10) to keep by maximizing the time-varying AUC on the test split.
predModel <- tspreg(cleanTrain, phenoVar[trainIndex], npair = 20, nvars = 1000)
trainPairs <- calculateTspairs(cleanTrain, predModel$index)
testPairs <- calculateTspairs(cleanTest, predModel$index)

## Evaluate candidate models with the first k pairs, k = 1..10.
auc <- rep(NA, 10)
for (k in 1:10) {
  # Cox model on the first k pair indicators.
  rhs <- paste(colnames(trainPairs$pairMat)[1:k], collapse = "+")
  candidateFit <- coxph(formula(paste0("phenoVar[trainIndex] ~", rhs)),
                        data = trainPairs$pairMat)
  # Uno's integrated time-varying AUC on the held-out test split.
  auc[k] <- AUC.uno(phenoVar[trainIndex], phenoVar[testIndex],
                    predict(candidateFit, newdata = testPairs$pairMat),
                    times = quantileSequence(phenoVar[testIndex][, 1], 10))$iauc
}
nPair <- which.max(auc)
Our analysis identified that the optimal number of pairs for prediction is r nPair
with a time varying AUC of r auc[nPair]
.
Now we clean the full training set and build the predictive model.
## Re-estimate surrogate variables on the full discovery set and refit
## the top-scoring-pairs Cox model using the selected number of pairs.
mod <- model.matrix(~phenoVar[, 2])
svaObj <- sva(exprs(mergedDiscExpr), mod)
fsvaObject <- fsva(exprs(mergedDiscExpr), mod, svaObj, exprs(mergedDiscExpr))
cleanMerged <- fsvaObject$db
predModel <- tspreg(cleanMerged, phenoVar, npair = nPair, nvars = 1000)
tsPairs <- calculateTspairs(cleanMerged, predModel$index)
## Pair indicators coded as factors for the Cox regression.
modelPredictors <- as.data.frame(sapply(tsPairs$pairMat, as.factor))
## The formula is built before the outcome columns are added, so only
## the pair-indicator columns appear on the right-hand side.
form <- formula(paste0("Surv(survivalTime,outcome)~",
                       paste(colnames(modelPredictors), collapse = "+")))
modelPredictors <- cbind(survivalTime = as.numeric(pData(mergedDiscExpr)$survivalTime),
                         outcome = pData(mergedDiscExpr)$outcome,
                         modelPredictors)
## model=TRUE keeps the model frame so survfit() works on new data later.
modelFit <- coxph(form, data = modelPredictors, model = TRUE)
In the training set we fit a cox proportional hazards regression model and obtained an estimated concordance of r summary(modelFit)$concordance[1]
. The model fit is summarized in the output chunk below.
## Print the full Cox model summary (coefficients, hazard ratios,
## concordance and global tests) into the report.
summary(modelFit)
Now we make the prediction function based on the classifier we have developed. We save the functions to the file geneSig.rda.
## Build the single-sample prediction function and persist it together
## with every object it needs (frozen sva fit, model matrix, pair model,
## Cox fit) to geneSig.rda in the working directory.
predictTsp <- function(eSet){
  # Batch-correct the new samples against the frozen discovery data.
  cleanData <- fsva(exprs(mergedDiscExpr), mod, svaObj, exprs(eSet))$new
  # Compute the selected top-scoring pairs for the new samples.
  newPairs <- calculateTspairs(cleanData, predModel$index)
  newPredictors <- as.data.frame(sapply(newPairs$pairMat, as.factor))
  # Predicted survival curves from the fitted Cox model.
  survfit(modelFit, newPredictors)
}
save(predictTsp, mergedDiscExpr, svaObj, mod, predModel, modelFit,
     file = paste0(workDir, "/geneSig.rda"))
To use the function you just type:
## Usage example (not run here): pass an ExpressionSet of new samples;
## the returned value is the survfit object of predicted survival curves
## produced by predictTsp().
load("geneSig.rda")
predictions <- predictTsp(eSet)
Now we load in the validation metadata.
## How many validation metadata files were supplied?
numValid <- length(validMETA)
## Read the metadata: a list of data frames when there are several
## files, a single plain data frame otherwise.
if (numValid > 1) {
  validMetaData <- lapply(validMETA, read.csv)
} else {
  validMetaData <- read.csv(validMETA)
}
We identified r dim(discExpr)[2]
samples collected from the r annotation(discExpr)
platform.
Load in the validation expression data and run frozen robust multi-array average normalization to allow for single-sample prediction. For more information on frma see McCall et al. 2009
## Load the validation CEL files and normalize them with frma, exactly
## mirroring the discovery-data processing so the frozen model applies.
## BUG FIX: in the multi-set branch the per-set file names were assigned
## with `validFileNames <- list.files(...)`, overwriting the preallocated
## list on every iteration; they are now stored per element with `[[i]]`.
if (numValid > 1) {
  validExpr <- validFileNames <- vector("list", numValid)
  for (i in seq_len(numValid)) {
    validFileNames[[i]] <- list.files(validCEL[i], full.names = TRUE)
    validAffyBatch <- read.affybatch(filenames = validFileNames[[i]])
    validExpr[[i]] <- frma(validAffyBatch, summarize = "robust_weighted_average")
    pData(validExpr[[i]]) <- cbind(data.frame(fullFilePath = validFileNames[[i]]),
                                   pData(validExpr[[i]]))
  }
} else {
  validFileNames <- list.files(validCEL, full.names = TRUE)
  validAffyBatch <- read.affybatch(filenames = validFileNames)
  validExpr <- frma(validAffyBatch, summarize = "robust_weighted_average")
  pData(validExpr) <- cbind(data.frame(fullFilePath = validFileNames),
                            pData(validExpr))
}
We discovered r numValid
expression sets. r if(numValid > 1){discMergeText}else{discMergeText2}
.
## Combine the validation data sets when more than one was supplied.
if (numValid > 1) {
  # Full outer join of all metadata tables on their shared columns.
  mergedValidMeta <- Reduce(function(...) base::merge(..., all = TRUE),
                            validMetaData)
  # Merge the expression sets (inSilicoMerging handles cross-study effects).
  mergedValidExpr <- inSilicoMerging::merge(validExpr)
} else {
  mergedValidMeta <- validMetaData
  mergedValidExpr <- validExpr
}
The next step is to make sure that the file names match at least one of the columns and keep only those files with matching meta data.
## Match the validation CEL file names against the metadata and keep the
## metadata rows in the same order as the expression samples.
exprCelFiles <- rownames(pData(mergedValidExpr))
## Fraction of CEL file names matched by each metadata column.
percentMatchedNames <- apply(mergedValidMeta, 2,
                             function(x) { mean(exprCelFiles %in% as.character(x)) })
## The column matching the most files is taken to hold the file names.
fileNameIndex <- which.max(percentMatchedNames)
## Reorder metadata rows to line up with the expression samples.
dat <- mergedValidMeta[match(exprCelFiles, mergedValidMeta[, fileNameIndex]), ]
rownames(dat) <- rownames(pData(mergedValidExpr))
## BUG FIX: the offset into the combined pData was hard-coded as +2,
## which is only correct when the pre-existing pData has exactly two
## columns; the actual column count is now used.
nExistingCols <- ncol(pData(mergedValidExpr))
pData(mergedValidExpr) <- cbind(pData(mergedValidExpr), dat)
fileNameIndex <- fileNameIndex + nExistingCols
Confirm that the filenames from the read CEL files correspond to the filenames in the meta data.
## Warn when the CEL file names do not all match the metadata column.
## BUG FIX: the message was emitted with print(), which displays the
## "\n" escape literally; cat() renders the newline as intended.
if(mean(pData(mergedValidExpr)[,fileNameIndex] == rownames(pData(mergedValidExpr))) < 1){
  cat("WARNING! File names in meta data and expression data do not match.\n")
}
Next we get some information about variables, including data on batch effects.
## Record the validation data dimensions and derive batch covariates,
## mirroring the discovery-data processing.
nSamples <- dim(mergedValidExpr)[2]
nFeatures <- dim(mergedValidExpr)[1]
## makeBatch() builds processing-date batches, scan batches and RIN
## values (helper defined elsewhere -- TODO confirm field names).
batchVars <- makeBatch(mergedValidExpr)
## BUG FIX: scanBatch was previously filled with batchVars$pdBatch
## (copy/paste error); it now uses batchVars$scanBatch.
pData(mergedValidExpr) <- cbind(pData(mergedValidExpr),
                                pdBatch = batchVars$pdBatch,
                                scanBatch = batchVars$scanBatch,
                                rinVals = batchVars$rin)
Next we find the phenotype variable and apply the predictor to the validation data set.
## Extract the validation outcome and push the validation data through
## the frozen pipeline: batch correction, pair computation, prediction.
validPhenoVar <- getPheno(mergedValidExpr)
validData <- fsva(exprs(mergedDiscExpr), mod, svaObj, exprs(mergedValidExpr))$new
validPairs <- calculateTspairs(validData, predModel$index)
validPredictors <- as.data.frame(sapply(validPairs$pairMat, as.factor))
validPredictors <- cbind(survivalTime = as.numeric(pData(mergedValidExpr)$survivalTime),
                         outcome = pData(mergedValidExpr)$outcome,
                         validPredictors)
## Uno's time-varying AUC of the Cox linear predictor on the
## validation set, evaluated at 10 quantiles of the event times.
aucObj <- AUC.uno(phenoVar, validPhenoVar,
                  predict(modelFit, newdata = validPredictors),
                  times = quantileSequence(validPhenoVar[, 1], 10))
This is a plot of the time varying AUC of our prediction applied to the test set. We obtained an integrated AUC of r aucObj$iauc
.
## Plot the time-varying AUC curve on the validation data.
plot(aucObj)
Here we list all versions of the software that were used for this analysis for reproducibility.
## Record when the analysis finished, in the same timestamp format as
## startTime, for the reproducibility section.
timeStampFormat <- "_Date_%Y-%m-%d_Time_%H-%M-%S"
endTime <- format(Sys.time(), timeStampFormat)
## Emit the full R session information plus the recorded start and end
## times so the environment of this run is fully documented.
sessionInfo()
cat("Analysis started \n")
startTime
cat("Analysis ended \n")
endTime
Here we list all relevant citations for the analyses that were performed to be included in any publication based on running the rdsmGeneSig machine.
## Citations for the methods used; include these in any publication
## based on a run of the rdsmGeneSig machine.
citation("inSilicoMerging")
citation("sva")
citation("frma")
citation("affy")
citation("knitr")
citation("survHD")
# NOTE(review): pROC is cited here but never loaded above; the AUC
# computations use survAUC::AUC.uno -- confirm the intended citation.
citation("pROC")
citation("healthvis")
citation("survAUC")
## Build the interactive survival visualization (healthvis) without
## plotting it, and save the object for later embedding in a web page.
plotFile <- paste0(workDir, "/dynamic-plot.rda")
hvObject <- survivalVis(modelFit, data = modelPredictors, plot = FALSE)
save(hvObject, file = plotFile)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.