#' This function performs the following tasks:
#' 1. Run IntLIM to obtain all pairwise models.
#' 2. Filter the IntLIM results to obtain a subset of models.
#' 3. Predict the phenotype of all training data given each pairwise model.
#' 4. Build the co-regulation graph using the set of pairwise models.
#' 5. Initialize model with parameters.
#' @param inputData An object with the following fields:
#' @param stype The phenotype (outcome) to predict. This can be either a categorical
#' or numeric outcome.
#' @param covar The clinical covariates to include in the model. These should be the same
#' covariates that were included when running the IntLIM linear models.
#' @param independentVarType The independent variable type (1 or 2)
#' @param outcomeType The outcome type (1 or 2)
#' @param continuous Whether or not the outcome is continuous. Default is TRUE.
#' @param metaFeatureList A list of the valid metrics to include. Valid metrics are
#' "pdf", "localerr", "globalerr", and "pathway".
#' @param k The number of nearest neighbors to consider in localerr.
#' @param eigStep The number of eigenvectors to step by during the evaluation
#' in localerr.
#' Note that this must be less than the number of samples in localerr. Default = 10.
#' @param colIdInd The ID of the column that has the analyte ID's for the
#' independent variable. If blank, then the existing ID's are used.
#' @param colIdOut The ID of the column that has the analyte ID's for the
#' outcome variable. If blank, then the existing ID's are used.
#' @param edgeTypeList List containing one or more of the following to include
#' in the line graph:
#' - "shared.outcome.analyte"
#' - "shared.independent.analyte"
#' - "analyte.chain"
#' @param learningRate Learning rate to use during training. Default is 0.2
#' @param maxIterations Maximum number of iterations. Default is 1,000.
#' @param convergenceCutoff Cutoff for convergence. Default is 0.001.
#' @param learningRate Learning rate to use during training. Default is 0.2
#' @param optimizationType Type of optimization. May be "SGD", "momentum",
#' "adagrad", or "adam". Default is "SGD".
#' @param initialMetaFeatureWeights Initial weights for model meta-features. Default
#' is 0, which results in each meta-feature being given equal weight.
#' @param corrCutoff Predictors with correlations higher than this cutoff will be clustered
#' together, and a representative predictor will be chosen.
#' @param pvalcutoff Predictors with p-values higher than this cutoff will be discarded.
#' @param rsquaredCutoff Predictors with R^2 values below this cutoff will be discarded.
#' @param coeffPercentile Predictors with interaction coefficient percentiles below
#' this cutoff will be discarded.
#' @export
DoModelSetup <- function(inputData, stype,
outcomeType = 1,
independentVarType = 2,
continuous = TRUE,
pvalcutoff = 1,
rsquaredCutoff = 0,
coeffPercentile = 0,
metaFeatureList = c("pdf","interactionpval", "interactioncoef", "analytecoef", "localerr"),
k = 2,
eigStep = 1,
colIdInd = "databaseId",
colIdOut = "databaseId",
edgeTypeList = c("shared.outcome.analyte", "shared.independent.analyte"),
learningRate = 0.2,
covar = c(),
maxIterations = 1000,
convergenceCutoff = 0.001,
optimizationType = "SGD",
initialMetaFeatureWeights = 0,
corrCutoff = 0.7){
# Run IntLIM.
myres <- IntLIM::RunIntLim(inputData = inputData,
stype=stype,
save.covar.pvals = TRUE,
outcome = outcomeType,
independent.var.type = independentVarType,
covar = covar,
continuous = continuous)
# Process the results.
myres.filt <- IntLIM::ProcessResults(inputResults = myres, inputData = inputData,
pvalcutoff = pvalcutoff, rsquaredCutoff = rsquaredCutoff,
interactionCoeffPercentile = coeffPercentile)
# Perform one-hot-encoding on the covariates.
encoding <- GENN::OneHotEncoding(covar = covar, inputData = inputData)
if(length(encoding$covar) > 0 && encoding$covar[1] != ""){
covar = encoding$covar
}
inputData@sampleMetaData <- encoding$sampleMetaData
# Limit covariates in encoding to those used in the models.
intersectCovar <- intersect(covar, colnames(myres.filt))
if(length(intersectCovar) < length(covar)){
covar = intersectCovar
sampleMetaDataColnames <- setdiff(colnames(inputData@sampleMetaData), setdiff(covar, colnames(myres.filt)))
inputData@sampleMetaData <- inputData@sampleMetaData[,sampleMetaDataColnames]
}
# Run pairwise prediction.
pred <- GENN::RunPairwisePrediction(inputResults = myres.filt,
inputData = inputData,
stype = stype,
independentVarType = independentVarType,
outcomeType = outcomeType)
coreg <- GENN::BuildCoRegulationGraph(myres.filt)
projectedGraph <- GENN::ProjectPredictionsOntoGraph(coRegulationGraph = coreg,
predictions = pred)
# Get global error correlation.
# As per https://blogs.sas.com/content/iml/2014/04/28/how-much-ram-do-i-need-to-store-that-matrix.html, a
# matrix of 40,000 x 40,000 takes up 11.92 Gb.
errorCorr <- colnames(pred)
if(ncol(pred) < 40000 && corrCutoff > 0){
errorCorr <- GENN::GetGlobalErrorCorr(predictions = pred,
inputData = inputData,
stype = stype,
corrCutoff = corrCutoff)
}else{print("This model has too many pairs to generate a correlation matrix (or you set correlation cutoff to 0). Limit is 40,000. Skipping!")}
# Compute meta features.
metafeatures <- GENN::GetMetaFeatures(predictions = pred,
stype = stype,
inputData = inputData,
metaFeatureList = metaFeatureList,
k = k, eigStep = eigStep,
modelStats = myres.filt,
colIdInd = colIdInd,
colIdOut = colIdOut,
modelsToConsider = unlist(errorCorr))
# Initialize model.
stypeClass <- "categorical"
if(continuous == TRUE){
stypeClass <- "numeric"
}
modelInput <- GENN::FormatInput(predictionGraphs = projectedGraph,
coregulationGraph = coreg,
inputData = inputData,
stype.class = stypeClass,
stype = stype,
edgeTypeList = edgeTypeList,
metaFeatures = metafeatures,
modelProperties = myres.filt,
outcome = outcomeType,
covariates = covar,
independent.var.type = independentVarType,
errorCorrelationGroupReps = unlist(errorCorr))
modelResults <- GENN::InitializeGraphLearningModel(modelInputs = modelInput,
iterations = maxIterations,
learningRate = learningRate,
optimizationType = optimizationType,
initialMetaFeatureWeights = initialMetaFeatureWeights)
# Return initialized mode,
return(modelResults)
}
#' This function performs the following tasks:
#' (IntLIM models of interest have already been run and passed in)
#' 3. Predict the phenotype of all training data given each pairwise model.
#' 4. Build the co-regulation graph using the set of pairwise models.
#' 5. Initialize model with parameters.
#' @param inputData An object with the following fields:
#' @param intLimResults A data frame including the results of the IntLIM models,
#' post-filtering.
#' @param stype The phenotype (outcome) to predict. This can be either a categorical
#' or numeric outcome.
#' @param covar The clinical covariates to include in the model. These should be the same
#' covariates that were included when running the IntLIM linear models.
#' @param independentVarType The independent variable type (1 or 2)
#' @param outcomeType The outcome type (1 or 2)
#' @param continuous Whether or not the outcome is continuous. Default is TRUE.
#' @param metaFeatureList A list of the valid metrics to include. Valid metrics are
#' "pdf", "localerr", "globalerr", and "pathway".
#' @param k The number of nearest neighbors to consider in localerr.
#' @param eigStep The number of eigenvectors to step by during the evaluation
#' in localerr.
#' Note that this must be less than the number of samples in localerr. Default = 10.
#' @param colIdInd The ID of the column that has the analyte ID's for the
#' independent variable. If blank, then the existing ID's are used.
#' @param colIdOut The ID of the column that has the analyte ID's for the
#' outcome variable. If blank, then the existing ID's are used.
#' @param edgeTypeList List containing one or more of the following to include
#' in the line graph:
#' - "shared.outcome.analyte"
#' - "shared.independent.analyte"
#' - "analyte.chain"
#' @param learningRate Learning rate to use during training. Default is 0.2
#' @param maxIterations Maximum number of iterations. Default is 1,000.
#' @param convergenceCutoff Cutoff for convergence. Default is 0.001.
#' @param learningRate Learning rate to use during training. Default is 0.2
#' @param optimizationType Type of optimization. May be "SGD", "momentum",
#' "adagrad", or "adam". Default is "SGD".
#' @param initialMetaFeatureWeights Initial weights for model meta-features. Default
#' is 0, which results in each meta-feature being given equal weight.
#' @param coeffPercentile Predictors with interaction coefficient percentiles below
#' this cutoff will be discarded.
#' @export
DoModelSetupFromIntLim <- function(inputData, intLimResults, stype,
outcomeType = 1,
independentVarType = 2,
continuous = TRUE,
metaFeatureList = c("pdf","interactionpval", "interactioncoef", "analytecoef", "localerr"),
k = 2,
eigStep = 1,
colIdInd = "databaseId",
colIdOut = "databaseId",
edgeTypeList = c("shared.outcome.analyte", "shared.independent.analyte"),
learningRate = 0.2,
covar = c(),
maxIterations = 1000,
convergenceCutoff = 0.001,
optimizationType = "SGD",
initialMetaFeatureWeights = 0,
corrCutoff = 0.7){
# Process the results.
myres.filt <- intLimResults
# Perform one-hot-encoding on the covariates.
encoding <- GENN::OneHotEncoding(covar = covar, inputData = inputData)
if(length(encoding$covar) > 0 && encoding$covar[1] != ""){
covar = encoding$covar
}
inputData@sampleMetaData <- encoding$sampleMetaData
# Limit covariates in encoding to those used in the models.
intersectCovar <- intersect(covar, colnames(myres.filt))
if(length(intersectCovar) < length(covar)){
covar = intersectCovar
sampleMetaDataColnames <- setdiff(colnames(inputData@sampleMetaData), setdiff(covar, colnames(myres.filt)))
inputData@sampleMetaData <- inputData@sampleMetaData[,sampleMetaDataColnames]
}
# Run pairwise prediction.
pred <- GENN::RunPairwisePrediction(inputResults = myres.filt,
inputData = inputData,
stype = stype,
independentVarType = independentVarType,
outcomeType = outcomeType)
coreg <- GENN::BuildCoRegulationGraph(myres.filt)
projectedGraph <- GENN::ProjectPredictionsOntoGraph(coRegulationGraph = coreg,
predictions = pred)
# Get global error correlation.
# Get global error correlation.
# As per https://blogs.sas.com/content/iml/2014/04/28/how-much-ram-do-i-need-to-store-that-matrix.html, a
# matrix of 40,000 x 40,000 takes up 11.92 Gb.
errorCorr <- colnames(pred)
if(ncol(pred) < 40000 && corrCutoff > 0){
errorCorr <- GENN::GetGlobalErrorCorr(predictions = pred,
inputData = inputData,
stype = stype,
corrCutoff = corrCutoff)
}else{print("This model has too many pairs to generate a correlation matrix (or you set correlation cutoff to 0). Limit is 40,000. Skipping!")}
# Compute meta features.
metafeatures <- GENN::GetMetaFeatures(predictions = pred,
stype = stype,
inputData = inputData,
metaFeatureList = metaFeatureList,
k = k, eigStep = eigStep,
modelStats = myres.filt,
colIdInd = colIdInd,
colIdOut = colIdOut,
modelsToConsider = unlist(errorCorr))
# Initialize model.
stypeClass <- "categorical"
if(continuous == TRUE){
stypeClass <- "numeric"
}
modelInput <- GENN::FormatInput(predictionGraphs = projectedGraph,
coregulationGraph = coreg,
inputData = inputData,
stype.class = stypeClass,
stype = stype,
edgeTypeList = edgeTypeList,
metaFeatures = metafeatures,
modelProperties = myres.filt,
outcome = outcomeType,
covariates = covar,
independent.var.type = independentVarType,
errorCorrelationGroupReps = unlist(errorCorr))
modelResults <- GENN::InitializeGraphLearningModel(modelInputs = modelInput,
iterations = maxIterations,
learningRate = learningRate,
optimizationType = optimizationType,
initialMetaFeatureWeights = initialMetaFeatureWeights)
# Return initialized mode,
return(modelResults)
}
#' Creates a Spearman correlation matrix of the prediction errors for each
#' predictor on the training data.
#' @param predictions The predictions for each predictor and sample.
#' @param inputData An IntLIMData object.
#' @param stype The phenotype name.
#' @param corrCutoff All model pairs above this correlation cutoff will be clustered together.
#' @export
GetGlobalErrorCorr <- function(predictions, inputData, stype, corrCutoff){
# Obtain true values.
trueValues <- matrix(rep(inputData@sampleMetaData[,stype], ncol(predictions)), ncol = ncol(predictions))
# Calculate errors and error correlation.
if(!is.numeric(inputData@sampleMetaData[,stype])){
catNames <- sort(unique(inputData@sampleMetaData[,stype]))
trueValuesChar <- trueValues
trueValues <- matrix(rep(0, length(trueValuesChar)), ncol = ncol(trueValuesChar),
nrow = nrow(trueValuesChar))
trueValues[which(trueValuesChar == catNames[2])] <- 1
}
errors <- predictions - trueValues
correlation <- stats::cor(errors, method = "spearman")
hist(correlation)
# Find groups of correlated predictors.
whichCor <- which(correlation > corrCutoff, arr.ind = TRUE, useNames = FALSE)
colnames(whichCor) <- c("V1", "V2")
toInvestigate <- unique(whichCor[,"V2"])
correlatedGroups <- list()
while(length(toInvestigate) > 0){
# Remove a predictor from the queue.
predictor <- toInvestigate[1]
# Create a group with all of the predictor's correlated neighbors.
whichCorrelated <- whichCor[which(whichCor[,"V2"] == predictor), "V1"]
newCorrelatedGroup <- c(whichCorrelated, predictor)
correlatedGroupNames <- colnames(predictions)[unname(unique(newCorrelatedGroup))]
correlatedGroups[[length(correlatedGroups)+1]] <- unlist(correlatedGroupNames)
# Remove the group from the queue.
toInvestigate <- setdiff(toInvestigate, newCorrelatedGroup)
}
# Compute individual performance of each predictor.
individualPerformance <- ComputeIndividualPerformance(predictions = predictions,
trueVal = inputData@sampleMetaData[,stype])
representativeModels <- lapply(correlatedGroups, function(grp){
performanceInGroup <- individualPerformance[grp]
bestPerformer <- names(performanceInGroup)[which.max(performanceInGroup)]
return(bestPerformer)
})
if(length(representativeModels) == 1){
stop(paste("Only one model is remaining with the correlation cutoff you have chosen.",
"Please select a higher correlation cutoff to remove fewer models."))
}
return(representativeModels)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.