#' Make arguments for glmnet.
#'
#' Make vectors for foldid and weights to be used for leave-one-study-out
#' cross-validation with [glmnet::glmnet()].
#'
#' @param metadata `data.frame` containing a row for each observation.
#' @param foldidColname column to use for grouping observations.
#' @param sampleColname column that contains sample names.
#'
#' @return A named list.
#' \item{foldid}{vector of integers}
#' \item{weights}{vector of weights, such that each study is weighted equally.}
#'
#' @export
makeGlmnetArgs = function(metadata, foldidColname = 'study', sampleColname = 'sample') {
foldid = as.numeric(factor(
metadata[[foldidColname]],
labels = seq_along(unique(metadata[[foldidColname]]))))
names(foldid) = metadata[[sampleColname]]
weights = length(unique(foldid)) /
do.call(c, sapply(sapply(unique(foldid), function(x) sum(foldid == x)),
function(n) rep_len(n, n), simplify = FALSE))
names(weights) = names(foldid)
return(list(foldid = foldid, weights = weights))}
#' Perform cross-validation of merged gene expression data.
#'
#' Run cross-validation to predict a response variable from gene expression data
#' across multiple studies.
#'
#' @param ematMerged matrix of gene expression for genes by samples.
#' @param sampleMetadata data.frame of sample metadata,
#' with rownames corresponding to sample names.
#' @param weights vector of weights.
#' @param alpha vector of values for alpha, the elastic net mixing parameter.
#' @param nFolds number of folds. Ignored, if `foldid` is not `NA`.
#' @param foldid vector of values specifying what fold each observation is in.
#' @param nRepeats number of times to perform cross-validation. Ignored, if
#' foldid is not `NA`.
#' @param yName column in `sampleMetadata` containing values of the response
#' variable.
#' @param addlFeatureColnames optional vector of column names containing other
#' features to be used for predicting the response variable.
#' @param ... Other arguments passed to [glmnet::cv.glmnet()].
#'
#' @return A list of `cv.glmnet` objects.
#'
#' @export
metapredictCv = function(
ematMerged, sampleMetadata, weights, alpha, nFolds = 10, foldid = NA,
nRepeats = 3, yName = 'class', addlFeatureColnames = NA, ...) {
args = list(...)
sm = mergeDataTable(colnames(ematMerged), sampleMetadata)
if (!is.null(args$family) && args$family == 'cox') {
y = as.matrix(sm[, yName, drop = FALSE])
colnames(y) = c('time', 'status')
} else {
y = sm[[yName]]}
if (is.na(addlFeatureColnames[1])) {
x = scale(t(ematMerged), center = TRUE, scale = FALSE)
} else {
addlFeatureTmp = data.frame(lapply(sm[, addlFeatureColnames], factor))
addlFeatureDummy = stats::model.matrix(~ 0 + ., data = addlFeatureTmp)
x = cbind(scale(t(ematMerged), center = TRUE, scale = FALSE), addlFeatureDummy)}
if (is.na(foldid[1])) {
cvFitList = list()
for (ii in 1:nRepeats) {
foldid = sample(rep(seq(nFolds), length = ncol(ematMerged)))
cvFitList[[ii]] = foreach(alpha = alpha) %do% {
glmnet::cv.glmnet(x, y, weights = weights[colnames(ematMerged)],
foldid = foldid, alpha = alpha, standardize = FALSE, ...)}}
} else {
cvFitList = foreach(alpha = alpha) %do% {
glmnet::cv.glmnet(x, y, weights = weights[colnames(ematMerged)],
foldid = foldid[colnames(ematMerged)],
alpha = alpha, standardize = FALSE, ...)}}
return(cvFitList)}
#' Predict the response variable in validation datasets.
#'
#' Merge the discovery datasets with each validation dataset, train a `glmnet`
#' model on the samples from the discovery datasets, then predicts the response
#' variable for the samples in the respective validation dataset.
#'
#' @param ematList Named list of expression matrices.
#' @param studyMetadata `data.frame` of study metadata.
#' @param sampleMetadata `data.frame` of sample metadata, with rownames
#' corresponding to sample names.
#' @param discoveryStudyNames vector of study names for training.
#' @param alpha value of alpha for the elastic net mixing parameter.
#' @param lambda value of regularization parameter.
#' @param weights vector of weights for training the `glmnet` model.
#' @param batchColname column in `sampleMetadata` containing batch information
#' for [sva::ComBat()].
#' @param covariateName column in `sampleMetadata` containing additional
#' covariates for [sva::ComBat()] besides batch.
#' @param className column in `sampleMetadata` containing values of the response
#' variable.
#' @param type type of prediction to make, passed to [glmnet::predict.glmnet()].
#' @param ... Other arguments passed to [glmnet::glmnet()].
#'
#' @return A named list of objects from [glmnet::predict.glmnet()].
#'
#' @export
metapredict = function(
ematList, studyMetadata, sampleMetadata, discoveryStudyNames, alpha, lambda,
weights, batchColname = 'study', covariateName = NA, className = 'class',
type = 'response', ...) {
study = validationStudyName = NULL
sampleMetadataDT = data.table(sampleMetadata)
discoverySampleNames = sampleMetadataDT[
which(study %in% discoveryStudyNames), sample]
validationStudyNames = setdiff(
sort(unique(sampleMetadataDT[, study])), discoveryStudyNames)
predsList = foreach(validationStudyName = validationStudyNames) %do% {
validationSampleNames = sampleMetadataDT[study == validationStudyName]$sample
ematListNow = ematList[c(discoveryStudyNames, validationStudyName)]
ematMergedDiscVal = mergeStudyData(
ematListNow, sampleMetadata, batchColname = batchColname,
covariateName = covariateName)
ematMergedDisc = ematMergedDiscVal[, discoverySampleNames]
yDTM = mergeDataTable(discoverySampleNames, sampleMetadataDT)
yDTM2 = yDTM[, className, with = FALSE]
y = as.matrix(yDTM2)
fitResult = glmnet::glmnet(
t(ematMergedDisc), y, alpha = alpha, lambda = lambda,
weights = weights[discoverySampleNames], standardize = FALSE, ...)
newx = data.matrix(t(ematMergedDiscVal[, validationSampleNames]))
preds = stats::predict(fitResult, newx = newx, s = lambda, type = type)}
names(predsList) = validationStudyNames
return(predsList)}
#' Make data.table of non-zero coefficients from a glmnet model.
#'
#' Make a sorted `data.table` of the non-zero coefficients of a logistic or
#' multinomial `glmnet` model.
#'
#' @param fitResult `glmnet` object.
#' @param lambda value of lambda for which to obtain coefficients.
#' @param decreasing logical passed to `order`.
#' @param classLevels order of columns in resulting `data.table`.
#'
#' @return A `data.table`.
#'
#' @export
makeCoefDt = function(fitResult, lambda, decreasing = TRUE, classLevels = NA) {
coefSparse = NULL
coefResult = glmnet::coef.glmnet(fitResult, s = lambda)
if (is.list(coefResult)) {
coefResultNonzero = foreach(coefSparse = coefResult) %do% {
x = data.table(rownames(coefSparse)[(coefSparse@i) + 1], coefSparse[(coefSparse@i) + 1])
setnames(x, 1:2, c('geneId', 'coefficient'))
x}
names(coefResultNonzero) = names(coefResult)
if (!is.na(classLevels[1])) {
coefResult = coefResult[classLevels]
coefResultNonzero = coefResultNonzero[classLevels]}
for (ii in seq_along(coefResult)) {
setnames(coefResultNonzero[[ii]], 2, names(coefResult)[ii])}
coefDt = Reduce(function(x, y) merge(x, y, by = 'geneId', all = TRUE), coefResultNonzero)
decNum = if (isTRUE(decreasing)) -1L else 1L
setorderv(coefDt, colnames(coefDt)[2:ncol(coefDt)], decNum, na.last = TRUE)
coefDt[is.na(coefDt)] = 0
} else {
coefDt = data.table(names(coefResult[(coefResult@i) + 1]), coefResult[(coefResult@i) + 1])
setnames(coefDt, c('geneId', 'coefficient'))
decNum = if (isTRUE(decreasing)) -1L else 1L
setorderv(coefDt, 'coefficient', decNum, na.last = TRUE)}
return(coefDt)}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.