R/metafeaturefunctions.R

Defines functions ComputeGlobalErrorMetafeatureTest ComputeGlobalErrorMetafeature ComputeLocalErrorMetafeatureTest ComputeLocalErrorMetafeature CosineSimilarityToTrainingOnSingleDataType CosineSimilarityToTrainingOnSubspace ComputeCosineSimilarity CombineSubspacesTest CombineSubspaces FindOptimalSubspaceClustering ComputeAnalyteCoefMetafeature ComputeInteractionCoefMetafeature ComputePvalMetafeature ComputeReactionMetafeature ComputePathwayMetafeature GetPredictorIDs ComputePDFMetafeature GetTestMetaFeatures GetMetaFeatures

Documented in CombineSubspaces CombineSubspacesTest ComputeAnalyteCoefMetafeature ComputeCosineSimilarity ComputeGlobalErrorMetafeature ComputeGlobalErrorMetafeatureTest ComputeInteractionCoefMetafeature ComputeLocalErrorMetafeature ComputeLocalErrorMetafeatureTest ComputePathwayMetafeature ComputePDFMetafeature ComputePvalMetafeature ComputeReactionMetafeature CosineSimilarityToTrainingOnSingleDataType CosineSimilarityToTrainingOnSubspace FindOptimalSubspaceClustering GetMetaFeatures GetPredictorIDs GetTestMetaFeatures

#' Computes the metafeatures for each sample and model.
#' @param predictions Prediction data frame, where rows are samples, and
#' columns are predictors.
#' @param metaFeatureList A list of the valid metrics to include. Valid metrics are
#' "pdf", "localerr", "globalerr", "pathway", "reaction", "interactionpval", 
#' "interactioncoef", and "analytecoef". Additionally, use "equality" to add equal
#' weight to all predictors.
#' @param k The number of nearest neighbors to consider in localerr.
#' @param inputData The input data read in using the function IntLIM function
#' ReadData.
#' @param eigStep The number of eigenvectors to step by during the evaluation
#' in localerr.
#' Note that this must be less than the number of samples in localerr. Default = 10.
#' @param alphaMin The lowest value of alpha to investigate in localerr. Default = 0.
#' @param alphaMax The highest value of alpha to investigate in localerr. Default = 1.
#' @param alphaStep The value of alpha to step by during the evaluation in localerr.
#' Default = 0.1.
#' @param modelStats A data frame that includes the interaction p-values and
#' interaction coefficients for each pair (such as the one output by IntLIM's
#' ProcessResults function)
#' @param stype Phenotype or outcome to use in models.
#' @param colIdInd The ID of the column that has the analyte ID's for the
#' independent variable. If blank, then the existing ID's are used.
#' @param colIdOut The ID of the column that has the analyte ID's for the
#' outcome variable. If blank, then the existing ID's are used.
#' @param modelsToConsider A list of models for which to calculate meta-features.
#' @return A list of data frames (one for each sample) with predictor importance
#' measured according to the listed criteria (one column per metric, one row
#' per predictor).
#' @export
GetMetaFeatures <- function(predictions, inputData,
                                       metaFeatureList = c("pdf", "localerr", "globalerr",
                                                      "pathway", "reaction",
                                                      "interactionpval", "interactioncoef",
                                                      "analytecoef", "equality"),
                                       modelStats = "",
                                       k = k,
                                       eigStep = 10, 
                                       alphaMin = 0,
                                       alphaMax = 1, 
                                       alphaStep = 0.1,
                                    stype = "",
                                    colIdInd = "",
                                    colIdOut = "",
                            modelsToConsider){
  
  # Compute metrics.
  metaFeatures <- list()
  if("pdf" %in% metaFeatureList){
    print("Computing PDF metafeatures")
    metaFeatures$pdf <- ComputePDFMetafeature(predictions = predictions[,modelsToConsider  ])
  }
  if("localerr" %in% metaFeatureList){
    print("Computing Local Error metafeatures")
    trueVals <- inputData@sampleMetaData[,stype]
    names(trueVals) <- rownames(inputData@sampleMetaData)
    metaFeatures$localerr <- ComputeLocalErrorMetafeature(predictions = predictions[,modelsToConsider], 
                                                    true = trueVals,
                                                    k = k,
                                                    inputData = inputData, 
                                                    eigStep = eigStep, 
                                                    alphaMin = alphaMin,
                                                    alphaMax = alphaMax, 
                                                    alphaStep = alphaStep)
  }
  if("globalerr" %in% metaFeatureList){  
    print("Computing Global Error metafeatures")
    metaFeatures$globalerr <- ComputeGlobalErrorMetafeature(predictions = predictions[,modelsToConsider], 
                                                      true = inputData@sampleMetaData[,stype])
  }
  if("pathway" %in% metaFeatureList){
    print("Computing Pathway metafeatures")
    metaFeatures$pathway <- ComputePathwayMetafeature(predictions = predictions[,modelsToConsider], 
                                                inputData = inputData,
                                                colIdInd = colIdInd,
                                                colIdOut = colIdOut)
  }
  if("reaction" %in% metaFeatureList){
    print("Computing Reaction metafeatures")
    metaFeatures$reaction <- ComputeReactionMetafeature(predictions = predictions[,modelsToConsider], 
                                                  inputData = inputData,
                                                  colIdInd = colIdInd,
                                                  colIdOut = colIdOut)
  }
  if("interactionpval" %in% metaFeatureList){
    print("Computing Interaction Term p-Value metafeatures")
    metaFeatures$interactionpval <- ComputePvalMetafeature(predictions = predictions[,modelsToConsider], 
                                                     modelStats = modelStats)
  }
  if("interactioncoef" %in% metaFeatureList){
    print("Computing Interaction Coefficient metafeatures")
    metaFeatures$interactioncoef <- ComputeInteractionCoefMetafeature(predictions = predictions[,modelsToConsider], 
                                                                modelStats = modelStats)
  }
  if("analytecoef" %in% metaFeatureList){
    print("Computing Analyte Coefficient metafeatures")
    metaFeatures$analytecoef <- ComputeAnalyteCoefMetafeature(predictions = predictions[,modelsToConsider], 
                                                        modelStats = modelStats)
  }
  if("equality" %in% metaFeatureList){
    print("Adding an equality metafeature")
    metaFeatures$equality <- t(matrix(rep(1, nrow(predictions) * length(modelsToConsider)),
                                      ncol = nrow(predictions)))
    rownames(metaFeatures$equality) <- rownames(predictions)
    colnames(metaFeatures$equality) <- modelsToConsider
  }
  
  return(metaFeatures)
}

#' Computes the metafeatures for each sample and model in the testing data.
#' @param predictionsTrain Prediction data frame for training data, where rows are samples, and
#' columns are predictors.
#' @param predictionsTest Prediction data frame for testing data, where rows are samples, and
#' columns are predictors.
#' @param k The number of nearest neighbors to consider in localerr.
#' @param inputDataTrain The training data (in IntLimData format).
#' @param inputDataTest The testing data (in IntLimData format).
#' @param eigStep The number of eigenvectors to step by during the evaluation
#' in localerr.
#' Note that this must be less than the number of samples in localerr. Default = 10.
#' @param alphaMin The lowest value of alpha to investigate in localerr. Default = 0.
#' @param alphaMax The highest value of alpha to investigate in localerr. Default = 1.
#' @param alphaStep The value of alpha to step by during the evaluation in localerr.
#' Default = 0.1.
#' @param modelStats A data frame that includes the interaction p-values and
#' interaction coefficients for each pair (such as the one output by IntLIM's
#' ProcessResults function)
#' @param stype Phenotype or outcome to use in models.
#' @param colIdInd The ID of the column that has the analyte ID's for the
#' independent variable. If blank, then the existing ID's are used.
#' @param colIdOut The ID of the column that has the analyte ID's for the
#' outcome variable. If blank, then the existing ID's are used.
#' @param metaFeatureList A list of metafeatures to compute.
#' @return A list of data frames (one for each sample) with predictor importance
#' measured according to the listed criteria (one column per metric, one row
#' per predictor).
#' @export
GetTestMetaFeatures <- function(predictionsTrain, predictionsTest, inputDataTrain, inputDataTest,
                                metaFeatureList = c("pdf", "localerr", "globalerr",
                                                   "pathway", "reaction",
                                                   "interactionpval", "interactioncoef",
                                                   "analytecoef"),
                                    modelStats = "",
                                    k = k,
                                    eigStep = 10, 
                                    alphaMin = 0,
                                    alphaMax = 1, 
                                    alphaStep = 0.1,
                                    stype = "",
                                    colIdInd = "",
                                    colIdOut = ""){
  # Compute metrics.
  metaFeatures <- list()
  if("pdf" %in% metaFeatureList){
    print("Computing PDF metafeatures")
    metaFeatures$pdf <- ComputePDFMetafeature(predictions = predictionsTest)
  }
  if("localerr" %in% metaFeatureList){
    print("Computing Local Error metafeatures")
    trueVals <- inputDataTrain@sampleMetaData[,stype]
    names(trueVals) <- rownames(inputDataTrain@sampleMetaData)
    metaFeatures$localerr <- ComputeLocalErrorMetafeatureTest(predictions = predictionsTrain, 
                                                    true = trueVals,
                                                    k = k,
                                                    inputDataTrain = inputDataTrain, 
                                                    inputDataTest = inputDataTest,
                                                    eigStep = eigStep, 
                                                    alphaMin = alphaMin,
                                                    alphaMax = alphaMax, 
                                                    alphaStep = alphaStep)
  }
  if("globalerr" %in% metaFeatureList){  
    print("Computing Global Error metafeatures")
    metaFeatures$globalerr <- ComputeGlobalErrorMetafeatureTest(predictions = predictionsTrain, 
                                                      true = inputDataTrain@sampleMetaData[,stype])
    # Adjust dimensionality for test data.
    metaFeatures$globalerr <- t(matrix(rep(metaFeatures$globalerr[1,], nrow(predictionsTest),
                                         ncol = nrow(predictionsTest))))
    rownames(metaFeatures$globalerr) <- rownames(predictionsTest)
    colnames(metaFeatures$globalerr) <- colnames(predictionsTest)
  }
  if("pathway" %in% metaFeatureList){
    print("Computing Pathway metafeatures")
    metaFeatures$pathway <- ComputePathwayMetafeature(predictions = predictionsTest, 
                                                inputData = inputDataTest,
                                                colIdInd = colIdInd,
                                                colIdOut = colIdOut)
    # Adjust dimensionality for test data.
    metaFeatures$pathway <- t(matrix(rep(metaFeatures$pathway[1,], nrow(predictionsTest)),
                                                 ncol = nrow(predictionsTest)))
    rownames(metaFeatures$pathway) <- rownames(predictionsTest)
    colnames(metaFeatures$pathway) <- colnames(predictionsTest)
  }
  if("reaction" %in% metaFeatureList){
    print("Computing Reaction metafeatures")
    metaFeatures$reaction <- ComputeReactionMetafeature(predictions = predictionsTest, 
                                                  inputData = inputDataTest,
                                                  colIdInd = colIdInd,
                                                  colIdOut = colIdOut)
    # Adjust dimensionality for test data.
    metaFeatures$reaction <- t(matrix(rep(metaFeatures$reaction[1,], nrow(predictionsTest)),
                                           ncol = nrow(predictionsTest)))
    rownames(metaFeatures$reaction) <- rownames(predictionsTest)
    colnames(metaFeatures$reaction) <- colnames(predictionsTest)
  }
  if("interactionpval" %in% metaFeatureList){
    print("Computing Interaction Term p-Value metafeatures")
    metaFeatures$interactionpval <- ComputePvalMetafeature(predictions = predictionsTrain,
                                                     modelStats = modelStats)
    metaFeatures$interactionpval <- t(matrix(rep(metaFeatures$interactionpval[1,], nrow(predictionsTest)),
                                       ncol = nrow(predictionsTest)))
    rownames(metaFeatures$interactionpval) <- rownames(predictionsTest)
    colnames(metaFeatures$interactionpval) <- colnames(predictionsTest)
  }
  if("interactioncoef" %in% metaFeatureList){
    print("Computing Interaction Coefficient metafeatures")
    metaFeatures$interactioncoef <- ComputeInteractionCoefMetafeature(predictions = predictionsTrain,
                                                                modelStats = modelStats)
    # Adjust dimensionality for test data.
    metaFeatures$interactioncoef <- t(matrix(rep(metaFeatures$interactioncoef[1,], nrow(predictionsTest)),
                                          ncol = nrow(predictionsTest)))
    rownames(metaFeatures$interactioncoef) <- rownames(predictionsTest)
    colnames(metaFeatures$interactioncoef) <- colnames(predictionsTest)
  }
  if("analytecoef" %in% metaFeatureList){
    print("Computing Analyte Coefficient metafeatures")
    metaFeatures$analytecoef <- ComputeAnalyteCoefMetafeature(predictions = predictionsTrain, 
                                                        modelStats = modelStats)
    
    # Adjust dimensionality for test data.
    metaFeatures$analytecoef <- t(matrix(rep(metaFeatures$analytecoef[1,], nrow(predictionsTest)),
                                                 ncol = nrow(predictionsTest)))
    rownames(metaFeatures$analytecoef) <- rownames(predictionsTest)
    colnames(metaFeatures$analytecoef) <- colnames(predictionsTest)
  }
  if("equality" %in% metaFeatureList){
    print("Adding an equality metafeature")
    metaFeatures$equality <- t(matrix(rep(1, nrow(predictionsTest) * ncol(predictionsTest)),
                                      ncol = nrow(predictionsTest)))
    rownames(metaFeatures$equality) <- rownames(predictionsTest)
    colnames(metaFeatures$equality) <- colnames(predictionsTest)
  }
  
  return(metaFeatures)
}

#' Computes the importance as the density in the probability density function,
#' normalized over all densities.
#' @param predictions Prediction data frame, where rows are samples, and
#' columns are predictors.
#' @return A data frame with the importance metric for each sample and each
#' predictor.
#' @export
ComputePDFMetafeature <- function(predictions){
  
  # Compute the importance for each sample.
  predictions <- as.matrix(predictions)
  importanceAll <- lapply(1:nrow(predictions), function(samp){
    # Prediction
    pred <- predictions[samp,]

    # Compute importance based on density.
    importance <- rep(0, length(pred))
    d <- stats::density(pred)
    importance <- unlist(lapply(pred, function(pred){
      return(d$y[which.min(abs(d$x - pred))])
    }))
    
    # Get the density percentile over all pairs.
    percentileVals <- seq(1, 100, by = 1) / 100
    quantiles <- stats::quantile(importance, percentileVals)
    percentiles <- unlist(lapply(importance, function(c){
      cvec <- rep(c, length(quantiles))
      which_match <- which.min(abs(cvec - quantiles))
      return(percentileVals[which_match])
    }))
    importance <- percentiles
    
    # Return.
    cat(".")
    return(importance)
  })
  importance <- t(do.call(cbind, importanceAll))
  rownames(importance) <- rownames(predictions)
  colnames(importance) <- colnames(predictions)
  return(importance)
}

#' Function for obtaining database identifiers for each predictor. Used
#' in functional analysis.
#' @param preds Prediction data frame, where rows are samples, and
#' columns are predictors.
#' @param inputData Input data, which may include database ID mappings for
#' the analytes.
#' @param colIdInd The ID of the column that has the analyte ID's for the
#' independent variable. If blank, then the existing ID's are used.
#' @param colIdOut The ID of the column that has the analyte ID's for the
#' outcome variable. If blank, then the existing ID's are used.
#' @return A data frame with two columns. The indId column contains the ID's
#' for the independent analyte, and the outId column contains the ID's for
#' the outcome analyte.
GetPredictorIDs <- function(preds, inputData, colIdInd, colIdOut){
  # Get prediction names.
  indAnalytes <- unlist(lapply(colnames(preds), function(pair){
    return(strsplit(pair, "__")[[1]][1])
  }))
  outAnalytes <- unlist(lapply(colnames(preds), function(pair){
    return(strsplit(pair, "__")[[1]][2])
  }))
  
  # Map to source ID.
  indId <- indAnalytes
  outId <- outAnalytes
  if(length(inputData@analyteType1MetaData) > 0 && colIdInd != "" && indAnalytes[[1]] %in% rownames(inputData@analyteType1MetaData)){
    indId <- inputData@analyteType1MetaData[indAnalytes, colIdInd]
  }else if(length(inputData@analyteType2MetaData) > 0 && colIdInd != "" && indAnalytes[[1]] %in% rownames(inputData@analyteType2MetaData)){
    indId <- inputData@analyteType2MetaData[indAnalytes, colIdInd]
  }
  if(length(inputData@analyteType1MetaData) > 0 && colIdOut != "" && outAnalytes[[1]] %in% rownames(inputData@analyteType1MetaData)){
    outId <- inputData@analyteType1MetaData[outAnalytes, colIdOut]
  }else if(length(inputData@analyteType2MetaData) > 0 && colIdOut != "" && outAnalytes[[1]] %in% rownames(inputData@analyteType2MetaData)){
    outId <- inputData@analyteType2MetaData[outAnalytes, colIdOut]
  }
  
  # Return data frame.
  return(data.frame(indId = indId, outId = outId))
}

#' Compute the importance using pathway membership. Predictor pairs that share
#' at least one pathway have an importance of 1, and all others have an 
#' importance of 0.
#' @param predictions Prediction data frame, where rows are samples, and
#' columns are predictors.
#' @param inputData Input data, which may include database ID mappings for
#' the analytes.
#' @param colIdInd The ID of the column that has the analyte ID's for the
#' independent variable. If blank, then the existing ID's are used.
#' @param colIdOut The ID of the column that has the analyte ID's for the
#' outcome variable. If blank, then the existing ID's are used.
#' @return A data frame with the importance metric for each sample and each
#' predictor. The metric will be the same for all samples in this case.
ComputePathwayMetafeature <- function(predictions, inputData, colIdInd,
                                     colIdOut){
  
  # Obtain names of predictors.
  predNames <- GetPredictorIDs(predictions, inputData, colIdInd, colIdOut)
  predNames[,"indId"] <- as.character(predNames[,"indId"])
  predNames[,"outId"] <- as.character(predNames[,"outId"])
  allPathways <- RaMP::getPathwayFromAnalyte(unique(c(predNames[,"indId"], 
                                                      predNames[,"outId"])))
  
  # Find the pathways associated with each analyte.
  namesInd <- unique(predNames[,"indId"])
  pwayResultInd <- lapply(namesInd, function(analyte){
    return(allPathways$pathwayId[which(allPathways$inputId == analyte)])
  })
  names(pwayResultInd) <- namesInd
  namesOut <- unique(predNames[,"outId"])
  pwayResultOut <- lapply(namesOut, function(analyte){
    return(allPathways$pathwayId[which(allPathways$inputId == analyte)])
  })
  names(pwayResultOut) <- namesOut
  
  # Find which pairs share pathways.
  sharesPathway <- unlist(lapply(1:nrow(predNames), function(i){
    if(i %% (ceiling(nrow(predNames) / 10)) == 0){
      cat(".")
    }
    shares <- FALSE
    if(length(intersect(pwayResultInd[[predNames[i,"indId"]]], pwayResultOut[[predNames[i,"outId"]]])) > 0){
      shares <- TRUE
    }
    return(shares)
  }))
  importance <- rep(0, length(sharesPathway))
  importance[which(sharesPathway == TRUE)] <- 1
  
  # Make copies for each sample.
  importanceAll <- lapply(rownames(predictions), function(samp){return(importance)})
  importanceFinal <- t(do.call(cbind, importanceAll))
  rownames(importanceFinal) <- rownames(predictions)
  colnames(importanceFinal) <- colnames(predictions)
  return(importanceFinal)
}

#' Compute the importance using reaction membership. Predictor pairs that share
#' at least one reaction have an importance of 1, and all others have an 
#' importance of 0.
#' @param predictions Prediction data frame, where rows are samples, and
#' columns are predictors.
#' @param inputData Input data, which may include database ID mappings for
#' the analytes.
#' @param colIdInd The ID of the column that has the analyte ID's for the
#' independent variable. If blank, then the existing ID's are used.
#' @param colIdOut The ID of the column that has the analyte ID's for the
#' outcome variable. If blank, then the existing ID's are used.
#' @return A data frame with the importance metric for each sample and each
#' predictor. The metric will be the same for all samples in this case.
ComputeReactionMetafeature <- function(predictions, inputData, colIdInd,
                                      colIdOut){
  # Obtain names of predictors.
  predNames <- GetPredictorIDs(predictions, inputData, colIdInd, colIdOut)
  predNames[,"indId"] <- as.character(predNames[,"indId"])
  predNames[,"outId"] <- as.character(predNames[,"outId"])
  uniqueOutIds <- unique(predNames[,"outId"])
  rxnAll <- RaMP::rampFastCata(uniqueOutIds)$rxn_partner_ids
  names(rxnAll) <- uniqueOutIds
  
  # Find which pairs share reactions.
  sharesRxn <- unlist(lapply(1:nrow(predNames), function(i){
    if(i %% (ceiling(nrow(predNames) / 10)) == 0){
      cat(".")
    }
    shares <- FALSE
    tryCatch({
      rxnResult <- rxnAll[[predNames[i, "outId"]]]
      rxnResultAll <- unlist(lapply(rxnResult, function(res){
        return(strsplit(res, "; ")[[1]])
      }))
      if(predNames[i,"indId"] %in% rxnResultAll){
        shares <- TRUE
      }
    }, error = function(cond){})
    return(shares)
  }))
  importance <- rep(0, length(sharesRxn))
  importance[which(sharesRxn == TRUE)] <- 1
  
  # Make copies for each sample.
  importanceAll <- lapply(rownames(predictions), function(samp){return(importance)})
  importanceFinal <- t(do.call(cbind, importanceAll))
  rownames(importanceFinal) <- rownames(predictions)
  colnames(importanceFinal) <- colnames(predictions)
  return(importanceFinal)
}

#' Compute the importance using p-value of model interaction term. Specifically,
#' the p-value percentile is calculated, and the weight is taken as 1 - the percentile,
#' so that the lowest p-values are weighted more heavily.
#' @param predictions Prediction data frame, where rows are samples, and
#' columns are predictors.
#' @param modelStats A data frame that includes the interaction p-values and
#' interaction coefficients for each pair (such as the one output by IntLIM's
#' ProcessResults function)
#' @return A data frame with the importance metric for each sample and each
#' predictor. The metric will be the same for all samples in this case.
ComputePvalMetafeature <- function(predictions, modelStats){
  
  # Measure importance for each predictor.
  pvals <- modelStats$FDRadjPval
  names(pvals) <- paste(modelStats$Analyte1, modelStats$Analyte2, sep = "__")
  pvals <- pvals[colnames(predictions)]
  
  # Get the p-value percentile over all pairs.
  percentileVals <- seq(1, 100, by = 1) / 100
  quantiles <- stats::quantile(pvals, percentileVals)
  closestQuantile <- rep(-1, length(pvals))
  smallestDifference <- rep(1, length(pvals))
  for(i in 1:length(quantiles)){
    diffs <- abs(pvals - quantiles[i])
    which_better <- which(diffs < smallestDifference)
    smallestDifference[which_better] <- diffs[which_better]
    closestQuantile[which_better] <- percentileVals[i]
  }
  importance <- 1 - closestQuantile
  
  # Make copies for each sample.
  importanceAll <- lapply(rownames(predictions), function(samp){return(importance)})
  importanceFinal <- t(do.call(cbind, importanceAll))
  rownames(importanceFinal) <- rownames(predictions)
  colnames(importanceFinal) <- colnames(predictions)
  return(importanceFinal)
}

#' Compute the importance using absolute value of interaction term coefficient. Specifically,
#' the coefficient percentile is calculated and taken as the weight.
#' @param predictions Prediction data frame, where rows are samples, and
#' columns are predictors.
#' @param modelStats A data frame that includes the interaction p-values and
#' interaction coefficients for each pair (such as the one output by IntLIM's
#' ProcessResults function)
#' @return A data frame with the importance metric for each sample and each
#' predictor. The metric will be the same for all samples in this case.
ComputeInteractionCoefMetafeature <- function(predictions, modelStats){
  
  # Measure importance for each predictor.
  coefs <- modelStats$interaction_coeff
  names(coefs) <- paste(modelStats$Analyte1, modelStats$Analyte2, sep = "__")
  coefs <- coefs[colnames(predictions)]
  
  # Get the p-value percentile over all pairs.
  percentileVals <- seq(1, 100, by = 1) / 100
  quantiles <- stats::quantile(abs(coefs), percentileVals)
  closestQuantile <- rep(-1, length(coefs))
  smallestDifference <- rep(max(abs(coefs)), length(coefs))
  for(i in 1:length(quantiles)){
    diffs <- abs(abs(coefs) - quantiles[i])
    which_better <- which(diffs < smallestDifference)
    smallestDifference[which_better] <- diffs[which_better]
    closestQuantile[which_better] <- percentileVals[i]
  }
  importance <- closestQuantile
  
  # Make copies for each sample.
  importanceAll <- lapply(rownames(predictions), function(samp){return(importance)})
  importanceFinal <- t(do.call(cbind, importanceAll))
  rownames(importanceFinal) <- rownames(predictions)
  colnames(importanceFinal) <- colnames(predictions)
  return(importanceFinal)
}

#' Compute the importance using absolute value of analyte term coefficient. Specifically,
#' the coefficient percentile is calculated and taken as the weight.
#' @param predictions Prediction data frame, where rows are samples, and
#' columns are predictors.
#' @param modelStats A data frame that includes the interaction p-values,
#' interaction coefficients, and other coefficients for each pair (such as the one output by IntLIM's
#' ProcessResults function)
#' @return A data frame with the importance metric for each sample and each
#' predictor. The metric will be the same for all samples in this case.
ComputeAnalyteCoefMetafeature <- function(predictions, modelStats){
  
  # Measure importance for each predictor.
  coefs <- modelStats$type
  names(coefs) <- paste(modelStats$Analyte1, modelStats$Analyte2, sep = "__")
  coefs <- coefs[colnames(predictions)]
  
  # Get the p-value percentile over all pairs.
  percentileVals <- seq(1, 100, by = 1) / 100
  quantiles <- stats::quantile(abs(coefs), percentileVals)
  closestQuantile <- rep(-1, length(coefs))
  smallestDifference <- rep(max(abs(coefs)), length(coefs))
  for(i in 1:length(quantiles)){
    diffs <- abs(abs(coefs) - quantiles[i])
    which_better <- which(diffs < smallestDifference)
    smallestDifference[which_better] <- diffs[which_better]
    closestQuantile[which_better] <- percentileVals[i]
  }
  importance <- closestQuantile
  
  # Make copies for each sample.
  importanceAll <- lapply(rownames(predictions), function(samp){return(importance)})
  importanceFinal <- t(do.call(cbind, importanceAll))
  rownames(importanceFinal) <- rownames(predictions)
  colnames(importanceFinal) <- colnames(predictions)
  return(importanceFinal)
}

#' Finds the optimal subspace clustering (i.e. using Grassmann Manifold Technique
#' - see PMID 30329022) given two different modalities of data (e.g. gene and metabolite).
#' It optimizes the cophenetic correlation of the hierarchicial clustering
#' of the samples using a grid search.
#' @param type1Similarity A cosine similarity matrix for the first data type, 
#' found using ComputeCosineSimilarity.
#' @param type2Similarity A cosine similarity matrix for the second data type,
#' found using ComputeCosineSimilarity.
#' @param eigStep The number of eigenvectors to step by during the evaluation.
#' Note that this must be less than the number of samples. Default = 10.
#' @param alphaMin The lowest value of alpha to investigate. Default = 0.
#' @param alphaMax The highest value of alpha to investigate. Default = 1.
#' @param alphaStep The value of alpha to step by during the evalutation.
#' Default = 0.1.
#' @return A named list including the data projected onto the merged subspace,
#' the optimal number of eigenvectors, the optimal alpha value, the clustering
#' coefficient, and the dendrogram.
FindOptimalSubspaceClustering <- function(type1Similarity, type2Similarity,
                                          eigStep = 10, alphaMin = 0,
                                          alphaMax = 1, alphaStep = 0.1){
  # Check input parameters.
  if(eigStep > nrow(type1Similarity)){
    stop("Must set eigStep to a value lower than the number of samples!")
  }
  else if(alphaMin < 0 || alphaMin > 1 || alphaMax < 0 || alphaMax > 1 ){
    stop("Both alphaMin and alphaMax must be between 0 and 1!")
  }
  else if(alphaStep > alphaMax - alphaMin){
    stop(paste("alphaStep is too big! There must be at least one step between",
               "alphaMin and alphaMax"))
  }
  else{
    # Initialize.
    L_mod <- matrix()
    dendro <- -1
    optimal_eigens <- -1
    optimal_alpha <- -1
    coph_cor <- matrix()
    
    # Do grid search.
    for(eig in seq(from=1,to=dim(type1Similarity)[1],by=eigStep)){
      for(a in seq(from=alphaMin,to=alphaMax,by=alphaStep)){
        
        # Find clustering with the current grid values.
        L_mod_i <- CombineSubspaces(type1Similarity = type1Similarity, 
                                    type2Similarity = type2Similarity, 
                                    eigenCount = eig, alpha = a)
        
        # Compute cophenetic correlation.
        rownames(L_mod_i) <- rownames(type1Similarity)
        sim <- ComputeCosineSimilarity(L_mod_i)
        dist <- max(sim) - sim
        rownames(dist) <- rownames(sim)
        d <- stats::as.dist(dist)
        dend <- stats::hclust(d)
        coph_i <- stats::cophenetic(dend)
        coph_cor_i <- stats::cor(coph_i, d)
        
        # If cophenetic correlation is higher than previous, save.
        if(length(coph_cor) > 1 || is.na(coph_cor_i > coph_cor) || coph_cor_i > coph_cor){
          coph_cor <- coph_cor_i
          L_mod <- L_mod_i
          dendro <- dend
          optimal_eigens <- eig
          optimal_alpha <- a
        }
      }
    }
    return(list("L_mod"=L_mod, "eigenvector_ct"=optimal_eigens, "alpha"=optimal_alpha,
                "cluster_coeff"=coph_cor, "dendrogram"=dendro))
  }
}

#' Combine subspaces (i.e. using Grassmann Manifold Technique
#' - see PMID 30329022) given two different modalities of data (e.g. gene and metabolite)
#' and the alpha value and the desired number of eigenvectors.
#' @param type1Similarity A cosine similarity matrix for the first data type, 
#' found using ComputeCosineSimilarity.
#' @param type2Similarity A cosine similarity matrix for the second data type,
#' found using ComputeCosineSimilarity.
#' @param eigenCount The number of eigenvectors to use.
#' @param alpha The value of alpha to use.
#' @return A named list including the data projected onto the merged subspace,
#' the optimal number of eigenvectors, the optimal alpha value, the clustering
#' coefficient, and the dendrogram.
CombineSubspaces <- function(type1Similarity, type2Similarity, eigenCount, alpha){
  
  # Make graphs.
  D_dat1 <- diag(colSums(type1Similarity))
  D_dat2 <- diag(colSums(type2Similarity))
  
  # Get normalized Laplacian for each graph.
  L_dat1 <- as.matrix(D_dat1 - type1Similarity)
  L_dat2 <- as.matrix(D_dat2 - type2Similarity)
  
  # Normalize.
  D_neg_half_dat1 <- diag(diag(1 / sqrt(abs(D_dat1))))
  D_neg_half_dat2 <- diag(diag(1 / sqrt(abs(D_dat2))))
  L_dat1 <- D_neg_half_dat1 %*% L_dat1 %*% D_neg_half_dat1
  L_dat2 <- D_neg_half_dat2 %*% L_dat2 %*% D_neg_half_dat2
  
  # Get eigenvectors for each Laplacian.
  U_dat1 <- eigen(L_dat1)$vectors[,c(1:eigenCount)]
  U_dat2 <- eigen(L_dat2)$vectors[,c(1:eigenCount)]
  
  # Combine
  L_mod <- L_dat1 + L_dat2 - alpha * ((U_dat1 %*% t(U_dat1)) + (U_dat2 %*% t(U_dat2)))
  return(L_mod)
}

#' Combine subspaces (i.e. using Grassmann Manifold Technique
#' - see PMID 30329022) given two different modalities of data (e.g. gene and metabolite)
#' and the alpha value and the desired number of eigenvectors.
#' @param type1SimilarityTrain A cosine similarity matrix for the first data type, 
#' found using ComputeCosineSimilarity on the training data.
#' @param type2SimilarityTrain A cosine similarity matrix for the second data type,
#' found using ComputeCosineSimilarity on the training data.
#' @param type1SimilarityTest A cosine similarity matrix for the first data type, 
#' found using ComputeCosineSimilarity on the testing data.
#' @param type2SimilarityTest A cosine similarity matrix for the second data type,
#' found using ComputeCosineSimilarity on the testing data.
#' @param eigenCount The number of eigenvectors to use.
#' @param alpha The value of alpha to use.
#' @return A named list including the data projected onto the merged subspace,
#' the optimal number of eigenvectors, the optimal alpha value, the clustering
#' coefficient, and the dendrogram.
CombineSubspacesTest <- function(type1SimilarityTrain, type2SimilarityTrain, 
                                 type1SimilarityTest, type2SimilarityTest, 
                                 eigenCount, alpha){
  
  # Make graphs. We compute the diagonals on the training data only.
  D_dat1 <- diag(colSums(type1SimilarityTrain))
  D_dat2 <- diag(colSums(type2SimilarityTrain))
  L_dat1_train <- as.matrix(D_dat1 - type1SimilarityTrain)
  L_dat2_train <- as.matrix(D_dat2 - type2SimilarityTrain)
  D_neg_half_dat1 <- diag(diag(1 / sqrt(abs(D_dat1))))
  D_neg_half_dat2 <- diag(diag(1 / sqrt(abs(D_dat2))))
  L_dat1_train <- D_neg_half_dat1 %*% L_dat1_train %*% D_neg_half_dat1
  L_dat2_train <- D_neg_half_dat2 %*% L_dat2_train %*% D_neg_half_dat2
  
  # Get eigenvectors for each Laplacian on the training data.
  U_dat1_train <- eigen(L_dat1_train)$vectors[,c(1:eigenCount)]
  U_dat2_train <- eigen(L_dat2_train)$vectors[,c(1:eigenCount)]
  
  # Do not subtract from D_dat for testing data because none of the
  # data points are on the diagonal. Simply negate the values.
  L_dat1 <- as.matrix(0 - type1SimilarityTest)
  L_dat2 <- as.matrix(0 - type2SimilarityTest)
  
  # Normalize. Here, transposition is necessary to obtain the correct
  # dimensionality of the matrix.
  D_neg_half_dat1 <- diag(diag(1 / sqrt(abs(D_dat1))))
  D_neg_half_dat2 <- diag(diag(1 / sqrt(abs(D_dat2))))
  L_dat1 <- t(t(D_neg_half_dat1 %*% t(L_dat1)) %*% D_neg_half_dat1)
  L_dat2 <- t(t(D_neg_half_dat2 %*% t(L_dat2)) %*% D_neg_half_dat2)
  
  # Project testing data onto eigenvectors.
  U_dat1 <- t(L_dat1) %*% U_dat1_train
  U_dat2 <- t(L_dat2) %*% U_dat2_train
  
  # Combine
  L_mod <- L_dat1 + L_dat2 - alpha * (t(U_dat1 %*% t(U_dat1_train)) + t(U_dat2 %*% t(U_dat2_train)))
  return(L_mod)
}

#' Compute cosine similarity between samples on an adjacency matrix.
#' @param R The adjacency matrix.
#' @return A matrix of sample rows and sample columns, with the cosine
#' similarities listed as the values.
ComputeCosineSimilarity <- function(R){
  # Normalize matrix by the norm of its columns.
  euclidean_norm <- sqrt(rowSums(R^2))
  euclidean_norm_mat <- matrix(rep(euclidean_norm, ncol(R)), ncol=ncol(R))
  R_norm <- as.matrix(R / euclidean_norm_mat)

  # Compute matrix transpose to get cosine similarity.
  sim <- R_norm %*% t(R_norm)
  rownames(sim) <- rownames(R)
  colnames(sim) <- rownames(R)
  return(sim)
}

#' Compute cosine similarity between a testing sample and each training sample
#' in the combined subspace matrix.
#' @param opt The optimal model obtained using FindOptimalSubspaceClustering on
#' the training data.
#' @param inputDataTest The testing data.
#' @param inputDataTrain The training data.
#' @param samp The name of the testing sample against which to compute similarity.
#' @return A matrix of similarities, where the rows are the training data samples
#' and the columns are the testing data samples.
CosineSimilarityToTrainingOnSubspace <- function(opt, inputDataTest, inputDataTrain,
                                                 samp){
  
  # Compute similarity between the training samples.
  type1SimilarityTrain <- ComputeCosineSimilarity(t(inputDataTrain@analyteType1))
  type2SimilarityTrain <- ComputeCosineSimilarity(t(inputDataTrain@analyteType2))
  
  # Compute adjacency of the sample to each of the training samples using
  # the parameters of the optimal model.
  type1SimilarityTest <- CosineSimilarityToTrainingOnSingleDataType(trainingData = inputDataTrain@analyteType1,
                                                           testingData = inputDataTest@analyteType1,
                                                           samp = samp)
  type2SimilarityTest <- CosineSimilarityToTrainingOnSingleDataType(trainingData = inputDataTrain@analyteType2,
                                                         testingData = inputDataTest@analyteType2,
                                                         samp = samp)

  R <- CombineSubspacesTest(type1SimilarityTrain = type1SimilarityTrain,
                            type2SimilarityTrain = type2SimilarityTrain,
                            type1SimilarityTest = t(type1SimilarityTest), 
                        type2SimilarityTest = t(type2SimilarityTest), 
                        eigenCount = opt$eigenvector_ct, 
                        alpha = opt$alpha)
  colnames(R) <- samp
  rownames(R) <- rownames(inputDataTrain@sampleMetaData)
  
  # Compute cosine similarity on the new subspace.
  similarity <- CosineSimilarityToTrainingOnSingleDataType(trainingData = opt$L_mod,
                                                           testingData = R,
                                                           samp = samp)
  colnames(similarity) <- samp
  rownames(similarity) <- rownames(inputDataTrain@sampleMetaData)

  return(similarity)
}

#' Compute cosine similarity between a testing sample and each training sample
#' for a single data type.
#' @param testingData The testing data.
#' @param trainingData The training data.
#' @param samp The name of the testing sample against which to compute similarity.
#' @return A matrix of similarities, where the rows are the training data samples
#' and the columns are the testing data samples.
CosineSimilarityToTrainingOnSingleDataType <- function(trainingData, testingData,
                                                       samp){
  # Compute cosine similarity to each training sample.
  similarityList <- matrix(unlist(lapply(1:ncol(trainingData), function(c){
    dotProduct <- sum(trainingData[,c] * testingData[,samp])
    normTrain <- sqrt(sum(trainingData[,c]^2))
    normTest <- sqrt(sum(testingData[,samp]^2))
    return(dotProduct / (normTrain * normTest))
  })))

  # Return similarity as a matrix.
  return(similarityList)
}

#' Computes the importance as the median absolute error for local predictors
#' (i.e. the predictions for k nearest neighbors of each sample).
#' @param predictions Prediction data frame, where rows are samples, and
#' columns are predictors.
#' @param true Named vector of the true outcome values.
#' @param k The number of nearest neighbors to consider.
#' @param inputData The input data read in using the function IntLIM function
#' ReadData.
#' @param eigStep The number of eigenvectors to step by during the evaluation.
#' Note that this must be less than the number of samples. Default = 10.
#' @param alphaMin The lowest value of alpha to investigate. Default = 0.
#' @param alphaMax The highest value of alpha to investigate. Default = 1.
#' @param alphaStep The value of alpha to step by during the evalutation.
#' Default = 0.1.
#' @return A data frame with the importance metric for each sample and each
#' predictor.
#' @export
ComputeLocalErrorMetafeature <- function(predictions, true, k,
                                       inputData, eigStep = 10, alphaMin = 0,
                                       alphaMax = 1, alphaStep = 0.1){
  
  # Find the optimal projection for best cluster separability.
  type1sim <- ComputeCosineSimilarity(t(inputData@analyteType1))
  type2sim <- ComputeCosineSimilarity(t(inputData@analyteType2))
  opt <- FindOptimalSubspaceClustering(type1Similarity = type1sim, 
                                       type2Similarity = type2sim,
                                       eigStep = eigStep, alphaMin = alphaMin,
                                       alphaMax = alphaMax, alphaStep = alphaStep)
  
  # Compute the importance for each sample.
  importanceAll <- lapply(1:nrow(predictions), function(samp){
    # Prediction
    pred <- predictions[-samp,]
    
    # Find nearest neighbors.
    similarities <- ComputeCosineSimilarity(opt$L_mod)[samp,]
    sorted_sims <- names(similarities)[order(-similarities)]
    pred_sorted_sims <- sorted_sims[which(sorted_sims %in% rownames(pred))]
    knn <- pred_sorted_sims[1:k]
    pred_local <- pred[knn,]
    
    # Error
    trueVals <- matrix(rep(true[knn], ncol(pred_local)),
                       nrow = length(knn),
                       ncol = ncol(pred_local))
    if(!is.numeric(trueVals)){
      catNames <- sort(unique(trueVals))
      trueValuesChar <- trueVals
      trueVals <- matrix(rep(0, length(trueValuesChar)), ncol = ncol(trueValuesChar),
                           nrow = nrow(trueValuesChar))
      trueVals[which(trueValuesChar == catNames[2])] <- 1
    }

    err <- abs(trueVals - pred_local)
    medErr <- unlist(lapply(1:ncol(err), function(c){
      return(stats::median(err[,c]))
    }))
    importance <- 1 - medErr

    # Get the 1 - error percentile over all pairs.
    percentileVals <- seq(1, 100, by = 1) / 100
    quantiles <- stats::quantile(importance, percentileVals)
    percentiles <- unlist(lapply(importance, function(c){
      cvec <- rep(c, length(quantiles))
      which_match <- which.min(abs(cvec - quantiles))
      return(percentileVals[which_match])
    }))
    importance <- percentiles
    
    # Return.
    cat(".")
    return(importance)
  })
  importanceFinal <- t(do.call(cbind, importanceAll))
  rownames(importanceFinal) <- rownames(predictions)
  colnames(importanceFinal) <- colnames(predictions)
  return(importanceFinal)
}

#' Computes the importance as the median absolute error for local predictors
#' (i.e. the predictions for k nearest neighbors of each sample).
#' @param predictions Prediction data frame, where rows are samples, and
#' columns are predictors.
#' @param true Named vector of the true outcome values.
#' @param k The number of nearest neighbors to consider.
#' @param inputDataTrain The training data (in IntLimData format).
#' @param inputDataTest The testing data (in IntLimData format).
#' @param eigStep The number of eigenvectors to step by during the evaluation.
#' Note that this must be less than the number of samples. Default = 10.
#' @param alphaMin The lowest value of alpha to investigate. Default = 0.
#' @param alphaMax The highest value of alpha to investigate. Default = 1.
#' @param alphaStep The value of alpha to step by during the evalutation.
#' Default = 0.1.
#' @return A data frame with the importance metric for each sample and each
#' predictor.
#' @export
ComputeLocalErrorMetafeatureTest <- function(predictions, true, k,
                                        inputDataTrain, inputDataTest, 
                                        eigStep = 10, alphaMin = 0,
                                        alphaMax = 1, alphaStep = 0.1){
  
  # Find the optimal projection for best cluster separability on the training data.
  type1sim <- ComputeCosineSimilarity(t(inputDataTrain@analyteType1))
  type2sim <- ComputeCosineSimilarity(t(inputDataTrain@analyteType2))
  opt <- FindOptimalSubspaceClustering(type1Similarity = type1sim, 
                                       type2Similarity = type2sim,
                                       eigStep = eigStep, alphaMin = alphaMin,
                                       alphaMax = alphaMax, alphaStep = alphaStep)
  
  # Compute the importance for each sample in the testing data.
  importanceAll <- lapply(rownames(inputDataTest@sampleMetaData), function(samp){

    # Training data predictions
    pred <- predictions
    
    # Find nearest neighbors.
    similarities <- CosineSimilarityToTrainingOnSubspace(opt = opt, 
                                                         inputDataTrain = inputDataTrain, 
                                                         inputDataTest = inputDataTest, 
                                                         samp = samp)
    sorted_sims <- rownames(similarities)[order(-similarities)]
    pred_sorted_sims <- sorted_sims[which(sorted_sims %in% rownames(pred))]
    knn <- pred_sorted_sims[1:k]
    pred_local <- pred[knn,]
    
    # Error
    trueVals <- matrix(rep(true[knn], ncol(pred_local)),
                       nrow = length(knn),
                       ncol = ncol(pred_local))
    if(!is.numeric(trueVals)){
      catNames <- sort(unique(trueVals))
      trueValuesChar <- trueVals
      trueVals <- matrix(rep(0, length(trueValuesChar)), ncol = ncol(trueValuesChar),
                           nrow = nrow(trueValuesChar))
      trueVals[which(trueValuesChar == catNames[2])] <- 1
    }
    
    err <- abs(trueVals - pred_local)
    medErr <- unlist(lapply(1:ncol(err), function(c){
      return(stats::median(err[,c]))
    }))
    importance <- 1 - medErr
    
    # Get the 1 - error percentile over all pairs.
    percentileVals <- seq(1, 100, by = 1) / 100
    quantiles <- stats::quantile(importance, percentileVals)
    percentiles <- unlist(lapply(importance, function(c){
      cvec <- rep(c, length(quantiles))
      which_match <- which.min(abs(cvec - quantiles))
      return(percentileVals[which_match])
    }))
    importance <- percentiles
    
    # Return.
    cat(".")
    return(importance)
  })
  importanceFinal <- t(do.call(cbind, importanceAll))
  rownames(importanceFinal) <- rownames(inputDataTest@sampleMetaData)
  colnames(importanceFinal) <- colnames(predictions)
  return(importanceFinal)
}

#' Computes the importance as the median absolute error for all predictors
#' on all samples except the sample in question.
#' @param predictions Prediction data frame, where rows are samples, and
#' columns are predictors.
#' @param true Named vector of the true outcome values.
#' @return A data frame with the importance metric for each sample and each
#' predictor.
#' @export
ComputeGlobalErrorMetafeature <- function(predictions, true){
  
  # Compute the importance for each sample.
  importanceAll <- lapply(1:nrow(predictions), function(samp){
    # Prediction
    pred <- predictions[-samp,]
    
    # Error
    trueVals <- matrix(rep(true, length(pred)),
                       nrow = length(true),
                       ncol = ncol(pred))
    trueVals <- trueVals[-samp,]
    
    err <- abs(trueVals - pred)
    medErr <- unlist(lapply(1:ncol(err), function(c){
      return(stats::median(err[,c]))
    }))
    importance <- 1 - medErr
    
    # Get the 1 - error percentile over all pairs.
    percentileVals <- seq(1, 100, by = 1) / 100
    quantiles <- stats::quantile(importance, percentileVals)
    percentiles <- unlist(lapply(importance, function(c){
      cvec <- rep(c, length(quantiles))
      which_match <- which.min(abs(cvec - quantiles))
      return(percentileVals[which_match])
    }))
    importance <- percentiles
    
    # Return.
    cat(".")
    return(importance)
  })
  importanceFinal <- t(do.call(cbind, importanceAll))
  rownames(importanceFinal) <- rownames(predictions)
  colnames(importanceFinal) <- colnames(predictions)
  return(importanceFinal)
}

#' Computes the importance as the median absolute error for all predictors
#' samples in the training data.
#' @param predictions Prediction data frame, where rows are samples, and
#' columns are predictors.
#' @param true Named vector of the true outcome values.
#' @return A data frame with the importance metric for each sample and each
#' predictor. For this metric, the value will be the same for all samples.
#' @export
ComputeGlobalErrorMetafeatureTest <- function(predictions, true){
  
  # Compute the importance for each sample.
  importanceAll <- lapply(1:nrow(predictions), function(samp){
    # Prediction
    pred <- predictions
    
    # Error
    trueVals <- matrix(rep(true, length(pred)),
                       nrow = length(true),
                       ncol = ncol(pred))
    trueVals <- trueVals
    
    err <- abs(trueVals - pred)
    medErr <- unlist(lapply(1:ncol(err), function(c){
      return(stats::median(err[,c]))
    }))
    importance <- 1 - medErr
    
    # Get the 1 - error percentile over all pairs.
    percentileVals <- seq(1, 100, by = 1) / 100
    quantiles <- stats::quantile(importance, percentileVals)
    percentiles <- unlist(lapply(importance, function(c){
      cvec <- rep(c, length(quantiles))
      which_match <- which.min(abs(cvec - quantiles))
      return(percentileVals[which_match])
    }))
    importance <- percentiles
    
    # Return.
    cat(".")
    return(importance)
  })
  importanceFinal <- t(do.call(cbind, importanceAll))
  rownames(importanceFinal) <- rownames(predictions)
  colnames(importanceFinal) <- colnames(predictions)
  return(importanceFinal)
}
ncats/MultiOmicsGraphPrediction documentation built on Aug. 23, 2023, 9:19 a.m.