R/01_ModelSetup.R

Defines functions GetGlobalErrorCorr DoModelSetupFromIntLim DoModelSetup

Documented in DoModelSetup DoModelSetupFromIntLim GetGlobalErrorCorr

#' This function performs the following tasks:
#' 1. Run IntLIM to obtain all pairwise models.
#' 2. Filter the IntLIM results to obtain a subset of models.
#' 3. Predict the phenotype of all training data given each pairwise model.
#' 4. Build the co-regulation graph using the set of pairwise models.
#' 5. Initialize model with parameters.
#' @param inputData An object with the following fields:
#' @param stype The phenotype (outcome) to predict. This can be either a categorical
#' or numeric outcome.
#' @param covar The clinical covariates to include in the model. These should be the same
#' covariates that were included when running the IntLIM linear models.
#' @param independentVarType The independent variable type (1 or 2)
#' @param outcomeType The outcome type (1 or 2)
#' @param continuous Whether or not the outcome is continuous. Default is TRUE.
#' @param metaFeatureList A list of the valid metrics to include. Valid metrics are
#' "pdf", "localerr", "globalerr", and "pathway".
#' @param k The number of nearest neighbors to consider in localerr.
#' @param eigStep The number of eigenvectors to step by during the evaluation
#' in localerr.
#' Note that this must be less than the number of samples in localerr. Default = 10.
#' @param colIdInd The ID of the column that has the analyte ID's for the
#' independent variable. If blank, then the existing ID's are used.
#' @param colIdOut The ID of the column that has the analyte ID's for the
#' outcome variable. If blank, then the existing ID's are used.
#' @param edgeTypeList List containing one or more of the following to include
#' in the line graph:
#' - "shared.outcome.analyte"
#' - "shared.independent.analyte"
#' - "analyte.chain"
#' @param learningRate Learning rate to use during training. Default is 0.2
#' @param maxIterations Maximum number of iterations. Default is 1,000.
#' @param convergenceCutoff Cutoff for convergence. Default is 0.001.
#' @param learningRate Learning rate to use during training. Default is 0.2
#' @param optimizationType Type of optimization. May be "SGD", "momentum",
#' "adagrad", or "adam". Default is "SGD".
#' @param initialMetaFeatureWeights Initial weights for model meta-features. Default
#' is 0, which results in each meta-feature being given equal weight.
#' @param corrCutoff Predictors with correlations higher than this cutoff will be clustered
#' together, and a representative predictor will be chosen.
#' @param pvalcutoff Predictors with p-values higher than this cutoff will be discarded.
#' @param rsquaredCutoff Predictors with R^2 values below this cutoff will be discarded.
#' @param coeffPercentile Predictors with interaction coefficient percentiles below
#' this cutoff will be discarded.
#' @export
DoModelSetup <- function(inputData, stype,
                         outcomeType = 1,
                         independentVarType = 2,
                         continuous = TRUE,
                         pvalcutoff = 1, 
                         rsquaredCutoff = 0, 
                         coeffPercentile = 0,
                         metaFeatureList = c("pdf","interactionpval", "interactioncoef", "analytecoef", "localerr"),
                         k = 2, 
                         eigStep = 1,
                         colIdInd = "databaseId",
                         colIdOut = "databaseId",
                         edgeTypeList = c("shared.outcome.analyte", "shared.independent.analyte"),
                         learningRate = 0.2,
                         covar = c(),
                         maxIterations = 1000,
                         convergenceCutoff = 0.001,
                         optimizationType = "SGD",
                         initialMetaFeatureWeights = 0,
                         corrCutoff = 0.7){
  
  # Run IntLIM.
  myres <- IntLIM::RunIntLim(inputData = inputData, 
                                stype=stype,
                                save.covar.pvals = TRUE, 
                                outcome = outcomeType, 
                                independent.var.type = independentVarType,
                                covar = covar,
                                continuous = continuous)
  
  # Process the results. 
  myres.filt <- IntLIM::ProcessResults(inputResults = myres, inputData = inputData, 
                                         pvalcutoff = pvalcutoff, rsquaredCutoff = rsquaredCutoff,
                                       interactionCoeffPercentile = coeffPercentile)
  # Perform one-hot-encoding on the covariates.
  encoding <- GENN::OneHotEncoding(covar = covar, inputData = inputData)
  if(length(encoding$covar) > 0 && encoding$covar[1] != ""){
    covar = encoding$covar
  }
  inputData@sampleMetaData <- encoding$sampleMetaData
  
  # Limit covariates in encoding to those used in the models.
  intersectCovar <- intersect(covar, colnames(myres.filt))
  if(length(intersectCovar) < length(covar)){
    covar = intersectCovar
    sampleMetaDataColnames <- setdiff(colnames(inputData@sampleMetaData), setdiff(covar, colnames(myres.filt)))
    inputData@sampleMetaData <- inputData@sampleMetaData[,sampleMetaDataColnames]
  }

  # Run pairwise prediction.
  pred <- GENN::RunPairwisePrediction(inputResults = myres.filt, 
                                                              inputData = inputData,
                                                              stype = stype,
                                                              independentVarType = independentVarType,
                                                              outcomeType = outcomeType)
  coreg <- GENN::BuildCoRegulationGraph(myres.filt)
  projectedGraph <- GENN::ProjectPredictionsOntoGraph(coRegulationGraph = coreg,
                                                                              predictions = pred)
  # Get global error correlation.
  # As per https://blogs.sas.com/content/iml/2014/04/28/how-much-ram-do-i-need-to-store-that-matrix.html, a 
  # matrix of 40,000 x 40,000 takes up 11.92 Gb.
  errorCorr <- colnames(pred)
  if(ncol(pred) < 40000  && corrCutoff > 0){
     errorCorr <- GENN::GetGlobalErrorCorr(predictions = pred,
                                                             inputData = inputData,
                                                             stype = stype,
                                                             corrCutoff = corrCutoff)
  }else{print("This model has too many pairs to generate a correlation matrix (or you set correlation cutoff to 0). Limit is 40,000. Skipping!")}
  # Compute meta features.
  metafeatures <- GENN::GetMetaFeatures(predictions = pred,
                                                                  stype = stype,
                                                                  inputData = inputData,
                                                                  metaFeatureList = metaFeatureList,
                                                                  k = k, eigStep = eigStep,
                                                                  modelStats = myres.filt,
                                                                  colIdInd = colIdInd,
                                                                  colIdOut = colIdOut,
                                                             modelsToConsider = unlist(errorCorr))

  # Initialize model.
  stypeClass <- "categorical"
  if(continuous == TRUE){
    stypeClass <- "numeric"
  }
  modelInput <- GENN::FormatInput(predictionGraphs = projectedGraph, 
                                                          coregulationGraph = coreg,
                                                          inputData = inputData, 
                                                          stype.class = stypeClass,
                                                          stype = stype,
                                                          edgeTypeList = edgeTypeList,
                                                          metaFeatures = metafeatures,
                                                          modelProperties = myres.filt,
                                                          outcome = outcomeType,
                                                       covariates = covar,
                                                          independent.var.type = independentVarType,
                                                       errorCorrelationGroupReps = unlist(errorCorr))

  modelResults <- GENN::InitializeGraphLearningModel(modelInputs = modelInput, 
                                                                          iterations = maxIterations,
                                                                          learningRate = learningRate,
                                                                          optimizationType = optimizationType,
                                                                          initialMetaFeatureWeights = initialMetaFeatureWeights)
  
  # Return initialized mode,
  return(modelResults)
}

#' This function performs the following tasks:
#' (IntLIM models of interest have already been run and passed in)
#' 3. Predict the phenotype of all training data given each pairwise model.
#' 4. Build the co-regulation graph using the set of pairwise models.
#' 5. Initialize model with parameters.
#' @param inputData An object with the following fields:
#' @param intLimResults A data frame including the results of the IntLIM models,
#' post-filtering.
#' @param stype The phenotype (outcome) to predict. This can be either a categorical
#' or numeric outcome.
#' @param covar The clinical covariates to include in the model. These should be the same
#' covariates that were included when running the IntLIM linear models.
#' @param independentVarType The independent variable type (1 or 2)
#' @param outcomeType The outcome type (1 or 2)
#' @param continuous Whether or not the outcome is continuous. Default is TRUE.
#' @param metaFeatureList A list of the valid metrics to include. Valid metrics are
#' "pdf", "localerr", "globalerr", and "pathway".
#' @param k The number of nearest neighbors to consider in localerr.
#' @param eigStep The number of eigenvectors to step by during the evaluation
#' in localerr.
#' Note that this must be less than the number of samples in localerr. Default = 10.
#' @param colIdInd The ID of the column that has the analyte ID's for the
#' independent variable. If blank, then the existing ID's are used.
#' @param colIdOut The ID of the column that has the analyte ID's for the
#' outcome variable. If blank, then the existing ID's are used.
#' @param edgeTypeList List containing one or more of the following to include
#' in the line graph:
#' - "shared.outcome.analyte"
#' - "shared.independent.analyte"
#' - "analyte.chain"
#' @param learningRate Learning rate to use during training. Default is 0.2
#' @param maxIterations Maximum number of iterations. Default is 1,000.
#' @param convergenceCutoff Cutoff for convergence. Default is 0.001.
#' @param learningRate Learning rate to use during training. Default is 0.2
#' @param optimizationType Type of optimization. May be "SGD", "momentum",
#' "adagrad", or "adam". Default is "SGD".
#' @param initialMetaFeatureWeights Initial weights for model meta-features. Default
#' is 0, which results in each meta-feature being given equal weight.
#' @param coeffPercentile Predictors with interaction coefficient percentiles below
#' this cutoff will be discarded.
#' @export
DoModelSetupFromIntLim <- function(inputData, intLimResults, stype,
                                   outcomeType = 1,
                                   independentVarType = 2,
                                   continuous = TRUE,
                                   metaFeatureList = c("pdf","interactionpval", "interactioncoef", "analytecoef", "localerr"),
                                   k = 2, 
                                   eigStep = 1,
                                   colIdInd = "databaseId",
                                   colIdOut = "databaseId",
                                   edgeTypeList = c("shared.outcome.analyte", "shared.independent.analyte"),
                                   learningRate = 0.2,
                                   covar = c(),
                                   maxIterations = 1000,
                                   convergenceCutoff = 0.001,
                                   optimizationType = "SGD",
                                   initialMetaFeatureWeights = 0,
                                   corrCutoff = 0.7){
  
  # Process the results. 
  myres.filt <- intLimResults
  
  # Perform one-hot-encoding on the covariates.
  encoding <- GENN::OneHotEncoding(covar = covar, inputData = inputData)
  if(length(encoding$covar) > 0 && encoding$covar[1] != ""){
    covar = encoding$covar
  }
  inputData@sampleMetaData <- encoding$sampleMetaData
  
  # Limit covariates in encoding to those used in the models.
  intersectCovar <- intersect(covar, colnames(myres.filt))
  if(length(intersectCovar) < length(covar)){
    covar = intersectCovar
    sampleMetaDataColnames <- setdiff(colnames(inputData@sampleMetaData), setdiff(covar, colnames(myres.filt)))
    inputData@sampleMetaData <- inputData@sampleMetaData[,sampleMetaDataColnames]
  }
  
  # Run pairwise prediction.
  pred <- GENN::RunPairwisePrediction(inputResults = myres.filt, 
                                                           inputData = inputData,
                                                           stype = stype,
                                                           independentVarType = independentVarType,
                                                           outcomeType = outcomeType)
  coreg <- GENN::BuildCoRegulationGraph(myres.filt)
  projectedGraph <- GENN::ProjectPredictionsOntoGraph(coRegulationGraph = coreg,
                                                                           predictions = pred)
  # Get global error correlation.
  # Get global error correlation.
  # As per https://blogs.sas.com/content/iml/2014/04/28/how-much-ram-do-i-need-to-store-that-matrix.html, a 
  # matrix of 40,000 x 40,000 takes up 11.92 Gb.
  errorCorr <- colnames(pred)
  if(ncol(pred) < 40000 && corrCutoff > 0){
    errorCorr <- GENN::GetGlobalErrorCorr(predictions = pred,
                                          inputData = inputData,
                                          stype = stype,
                                          corrCutoff = corrCutoff)
  }else{print("This model has too many pairs to generate a correlation matrix (or you set correlation cutoff to 0). Limit is 40,000. Skipping!")}
  

  # Compute meta features.
  metafeatures <- GENN::GetMetaFeatures(predictions = pred,
                                                             stype = stype,
                                                             inputData = inputData,
                                                             metaFeatureList = metaFeatureList,
                                                             k = k, eigStep = eigStep,
                                                             modelStats = myres.filt,
                                                             colIdInd = colIdInd,
                                                             colIdOut = colIdOut,
                                                             modelsToConsider = unlist(errorCorr))
  
  # Initialize model.
  stypeClass <- "categorical"
  if(continuous == TRUE){
    stypeClass <- "numeric"
  }
  modelInput <- GENN::FormatInput(predictionGraphs = projectedGraph, 
                                                       coregulationGraph = coreg,
                                                       inputData = inputData, 
                                                       stype.class = stypeClass,
                                                       stype = stype,
                                                       edgeTypeList = edgeTypeList,
                                                       metaFeatures = metafeatures,
                                                       modelProperties = myres.filt,
                                                       outcome = outcomeType,
                                                       covariates = covar,
                                                       independent.var.type = independentVarType,
                                                       errorCorrelationGroupReps = unlist(errorCorr))
  
  modelResults <- GENN::InitializeGraphLearningModel(modelInputs = modelInput, 
                                                                          iterations = maxIterations,
                                                                          learningRate = learningRate,
                                                                          optimizationType = optimizationType,
                                                                          initialMetaFeatureWeights = initialMetaFeatureWeights)
  
  # Return initialized mode,
  return(modelResults)
}

#' Creates a Spearman correlation matrix of the prediction errors for each
#' predictor on the training data.
#' @param predictions The predictions for each predictor and sample.
#' @param inputData An IntLIMData object.
#' @param stype The phenotype name.
#' @param corrCutoff All model pairs above this correlation cutoff will be clustered together.
#' @export
GetGlobalErrorCorr <- function(predictions, inputData, stype, corrCutoff){
  # Obtain true values.
  trueValues <- matrix(rep(inputData@sampleMetaData[,stype], ncol(predictions)), ncol = ncol(predictions))
  
  # Calculate errors and error correlation.
  if(!is.numeric(inputData@sampleMetaData[,stype])){
    catNames <- sort(unique(inputData@sampleMetaData[,stype]))
    trueValuesChar <- trueValues
    trueValues <- matrix(rep(0, length(trueValuesChar)), ncol = ncol(trueValuesChar),
                         nrow = nrow(trueValuesChar))
    trueValues[which(trueValuesChar == catNames[2])] <- 1
  }
  errors <- predictions - trueValues
  correlation <- stats::cor(errors, method = "spearman")
  hist(correlation)

  # Find groups of correlated predictors.
  whichCor <- which(correlation > corrCutoff, arr.ind = TRUE, useNames = FALSE)
  colnames(whichCor) <- c("V1", "V2")
  toInvestigate <- unique(whichCor[,"V2"])
  correlatedGroups <- list()
  while(length(toInvestigate) > 0){
    # Remove a predictor from the queue.
    predictor <- toInvestigate[1]
    
    # Create a group with all of the predictor's correlated neighbors.
    whichCorrelated <- whichCor[which(whichCor[,"V2"] == predictor), "V1"]
    newCorrelatedGroup <- c(whichCorrelated, predictor)
    correlatedGroupNames <- colnames(predictions)[unname(unique(newCorrelatedGroup))]
    correlatedGroups[[length(correlatedGroups)+1]] <- unlist(correlatedGroupNames)

    # Remove the group from the queue.
    toInvestigate <- setdiff(toInvestigate, newCorrelatedGroup)
  }
  
  # Compute individual performance of each predictor.
  individualPerformance <- ComputeIndividualPerformance(predictions = predictions,
                                                        trueVal = inputData@sampleMetaData[,stype])
  representativeModels <- lapply(correlatedGroups, function(grp){
    performanceInGroup <- individualPerformance[grp]
    bestPerformer <- names(performanceInGroup)[which.max(performanceInGroup)]
    return(bestPerformer)
  })
  if(length(representativeModels) == 1){
    stop(paste("Only one model is remaining with the correlation cutoff you have chosen.",
               "Please select a higher correlation cutoff to remove fewer models."))
  }
  return(representativeModels)
}
ncats/MultiOmicsGraphPrediction documentation built on Aug. 23, 2023, 9:19 a.m.