This code provides options for the report.
library(knitr) # Knitr default options opts_chunk$set(dpi=150, tidy=TRUE, fig.align='center', echo=TRUE)
This code loads all functions and data needed to run elastic net.
# Set random seed set.seed(1) library(rcellminer) library(rcellminerData) library(rcellminerElasticNet) library(impute) library(glmnet) molDB <- getMolDataMatrices() drugActData <- exprs(getAct(rcellminerData::drugData))
These are the parameters used for the elastic net run, including:
# Drug NSC drug <- "<%= nsc %>" # Included datasets includeDatasets <- <%= includeDatasets %> includeFeaturesPath <- '<%= includeFeaturesPath %>' # Excluded features and cell lines excludeCellLines <- <%= excludeCellLines %> #excludeFeatures <- c("expSLFN11") excludeFeatures <- NULL ## Restrict gene set # load(file.path(.lmp, "gene_set_pathway_analysis", "data", "ddr_genelist.Rdata")) # restrictedGeneSet <- rownames(ddr.list) restrictedGeneSet <- NULL # Use intersection of features across data types in molDB. useCommonFeaturesInMolDB <- <%= useCommonFeaturesInMolDB %> # Show debugging data? verbose <- <%= verbose %> # THRESHOLDS # Features must have absolute correlation greater than corThreshold to be # included in elastic net. corThreshold <- <%= corThreshold %> # Elastic net results will be augmented to include information about input features # that are correlated with selected predictors above predictorCorFeatureThreshold. predCorFeatureThreshold <- <%= predCorFeatureThreshold %> minNumResponsiveLines <- <%= minNumResponsiveLines %> responseThreshold <- <%= responseThreshold %> minNumMutations <- <%= minNumMutations %> # Results parameters maxPlotPredictors <- <%= maxPlotPredictors %> # Determines if intercept term is to be used in predicting held out response # values during cross validation. This should be set to FALSE for leave-out-out # cross validation; TRUE setting can be evaluated for 5 or 10 fold CV. useModelYIntercept <- <%= useModelYIntercept %> filePrefix <- "<%= filePrefix %>" outputDir <- "<%= outputDir %>" outputFilePath <- "enResults.csv"
This code prepares the data used in the elastic net analysis.
tic() #----[prepare drug data]----------------------------------------------------------------- # Make sure data is available for drug. stopifnot(all(is.element(drug, rownames(drugActData)))) # Exclude selected cell lines. drugActData <- drugActData[, -which(colnames(drugActData) %in% excludeCellLines)] # Exclude cell lines w/o activity data for drug of interest drugActProfile <- setNames(as.numeric(drugActData[drug, ]), colnames(drugActData)) drugActProfile <- drugActProfile[!is.na(drugActProfile)] drugActData <- drugActData[, names(drugActProfile)] # NOTE DRUG ACTIVITY PROFILES WITH AN INSUFFICIENT NUMBER OF RESPONSIVE LINES. numResponsive <- sum(drugActProfile > responseThreshold, na.rm=TRUE) if (numResponsive < minNumResponsiveLines){ filePrefix <- paste0("invalid_", filePrefix) } #----[prepare molecular data]------------------------------------------------------------ # Restrict molecular data to drug data cell line set prepared above. molDB <- lapply(molDB, function(x) { x[, names(drugActProfile)] }) # Restrict molecular data to specified subtypes. datasetIdx <- (names(molDB) %in% includeDatasets) molDB <- molDB[datasetIdx] # Impute missing gene expression data. if ("exp" %in% names(molDB)){ # NOTE: invisible() is used to suppress function print statements molDB[["exp"]] <- invisible(impute.knn(molDB[["exp"]])$data) } # Transform to z-score data. if ("swa" %in% names(molDB)){ molDB[["swa"]] <- t(scale(t(molDB[["swa"]]))) } # Filter mutation profiles with fewer than minNumMutations. if ("mut" %in% names(molDB)){ countMutations <- function(mutPattern){ return(sum(mutPattern > 0)) } mutCounts <- apply(molDB$mut, MARGIN = 1, FUN = countMutations) molDB$mut <- molDB$mut[(mutCounts >= minNumMutations), ] # Transform to z-score data. molDB[["mut"]] <- t(scale(t(molDB[["mut"]]))) } #----[prepare elastic net feature data]-------------------------------------------------- if (includeFeaturesPath != ""){ includeFeatures <- read.table(includeFeaturesPath, header = TRUE, stringsAsFactors = FALSE) includeFeatures <- includeFeatures[,1] for (molDBType in names(molDB)) { filteredFeatures <- intersect(rownames(molDB[[molDBType]]), paste0(molDBType, includeFeatures)) molDB[[molDBType]] <- molDB[[molDBType]][filteredFeatures, ] } } if (useCommonFeaturesInMolDB){ molDBFeatures <- lapply(molDB, function(x) {unname(removeMolDataType(rownames(x)))}) commonFeatures <- Reduce(intersect, molDBFeatures) for (molDBType in names(molDB)){ molDB[[molDBType]] <- molDB[[molDBType]][paste0(molDBType, commonFeatures), ] } } # Filter datasets featureSet <- selectCorrelatedRowsFromMatrices(Y=drugActProfile, XList=molDB, corThreshold=corThreshold) # Filter features if(!is.null(restrictedGeneSet)) { featureSet <- restrictFeatureMat(geneSet=restrictedGeneSet, featureMat=featureSet, prefixSet=includeDatasets) } featureSet <- featureSet[setdiff(rownames(featureSet), excludeFeatures), ] # Included feature sets names(molDB)
# RUN ELASTIC NET # ADJUSTED CALL TO RECORD CUMULATIVE CORRELATIONS FOR EXPANDED SET OF PREDICTORS. tryCatch({ elNetResults <- elasticNet(featureMat=featureSet, responseVec=drugActProfile, id=drug, alphaVals=seq(<%= alphaValMin %>, <%= alphaValMax %>, length = 10), useLambdaMin=TRUE, cumCorNumPredictors=maxPlotPredictors, verbose=verbose, useOneStdErrRule = TRUE, standardize=TRUE, standardizeY=TRUE, fitIntercept=FALSE) }, error=function(e) { elNetResults <- list() filePrefix <- paste0("invalid_", filePrefix) save(elNetResults, file=file.path(outputDir, paste0(filePrefix, ".Rdata"))) stop(e) }) # NOTE: Add features that correlated to resulting predictors #elNetResults <- addPredictorCorrelatedFeatures(elNetResults, molDB[datasetIdx], predCorFeatureThreshold) toc()
r paste(names(molDB), collapse=", ")
r dim(featureSet)[1]
r dim(featureSet)[2]
r elNetResults$alpha
r elNetResults$lambda
The table below provides feature counts for each of the feature sets used.
table(substr(rownames(featureSet), 1, 3))
This plot shows potential models at different values of lambda:
# No longer applicable with repeats of cross-validation. # plot(elNetResults$cvOutput)
This plot shows 3 pieces of information for the resulting model:
numPlotPredictors <- min(length(elNetResults$predictorWts), maxPlotPredictors) featureAnnot <- genFeatureAnnotations(molDB, elNetResults, as.numeric(drugActData[drug, ])) featureAnnot <- featureAnnot[1:numPlotPredictors] plotElasticNet(drugName=drug, weights=elNetResults$predictorWts[1:numPlotPredictors], drugAct=drugActData, molDb=molDB, numCellLines=length(drugActProfile), numFeatures=numPlotPredictors, featureAnnotations=featureAnnot, showCellLineLabels=TRUE, pdfFilename=NULL, thresholdValues=TRUE, topMarWt=2.4, bottomMarWt=1.5, leftMarHeatmap=6, verbose=FALSE)
An explanation of the predictor annotations to the left of the above heatmap:
Cross validation results for elastic net model (feature set) selection.
iMinEstErr <- which.min(elNetResults$cvMeanSqErrVals) if (length(iMinEstErr) == 1){ maxErr <- elNetResults$cvMeanSqErrVals[iMinEstErr] + elNetResults$cvSdMeanSqErrVals[iMinEstErr] } else{ maxErr <- NA } k <- length(elNetResults$predictorWts) x <- seq_along(elNetResults$cvMeanSqErrVals) y <- elNetResults$cvMeanSqErrVals se <- elNetResults$cvSdMeanSqErrVals plot(x, y, xlab="Number of Predictors", ylab="Cross-Validation Error", main="Cross-Validation Errors for Nested Predictor Sets", ylim=c(0, max(1, max(y+se, na.rm = TRUE)))) if (!is.na(maxErr)){ abline(h = maxErr, col="red") } abline(v=k, col="green", lty=3, lwd=2.5) segments(x,y-se,x,y+se) epsilon <- 0.10 segments(x-epsilon, y-se, x+epsilon, y-se) segments(x-epsilon, y+se, x+epsilon, y+se) x <- seq_along(elNetResults$cvPredRsqVals) y <- elNetResults$cvPredRVals plot(x, y, xlab="Number of Predictors", ylab="Pearson's Correlation", main="CV-Predicted vs. Actual Response Correlation", ylim=c(0, 1)) abline(v=k, col="green", lty=3, lwd=2.5)
tryCatch({ if (length(elNetResults) > 0){ enPredsPreModSelection <- names(elNetResults$predictorWts_preModelSelection) if (length(enPredsPreModSelection) > 1){ set.seed(1) enPredMat <- t(featureSet[enPredsPreModSelection, , drop = FALSE]) ridgeCvOut <- glmnet::cv.glmnet(x = enPredMat, y = drugActProfile, alpha = 0) ridgeOut <- glmnet::glmnet(x = enPredMat, y = drugActProfile, alpha = 0) elNetResults$predictorWts_enSelectedRidge <- predict(ridgeOut, type = "coefficients", s = ridgeCvOut$lambda.min)[-1, ] } } })
Cross-validation of overall elastic net.
# Leave-one-out cross-validation. set.seed(2) cvFoldIds <- getCvFoldIds(nObs = length(drugActProfile), nFolds = length(drugActProfile)) elNetResults$fullEnCvResults <- cvElasticNet( cvFoldIds=cvFoldIds, keepEnResults=FALSE, featureMat=featureSet, responseVec=drugActProfile, id=drug, alphaVals=seq(<%= alphaValMin %>, <%= alphaValMax %>, length = 10), useLambdaMin=TRUE, cumCorNumPredictors=maxPlotPredictors, verbose=verbose, useModelYIntercept = useModelYIntercept, useOneStdErrRule = TRUE, standardize=TRUE, standardizeY=TRUE, fitIntercept=FALSE) # Correlation between full EN cross-validation predicted values and actual values. elNetResults$fullEnCvResults$cvPredR cvEnPredTab <- elNetResults$fullEnCvResults$cvFoldEnPredictorTab[ c("PREDICTOR", "FREQUENCY", "SIGN")] cvEnPredTab <- cvEnPredTab[order(cvEnPredTab$FREQUENCY, decreasing = TRUE), ] kable(cvEnPredTab, caption="Predictors Selected During CV Elastic Net Runs", row.names=FALSE, digits=2)
This code saves the results of the elastic net analysis.
save(elNetResults, file=file.path(outputDir, paste0(filePrefix, ".Rdata")))
cat("CV Run Time: ", sum(elNetResults$fullEnCvResults$enRunTimes), "\n") cat("Timestamp: ", format(Sys.time(), "%m%d%yT%H%M%S"), "\n") sessionInfo()
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.