CMapBox:

Documented in benchmarkDrugResults

#' Function to benchmark drug repurposing schemes
#'
#' This function will benchmark various drug repurposing methods according to the drug rank lists sent to the function and the drugs one
#' has selected to use for benchmarking. Step 1 would be to use compDrugMethods to obtain a data frame with multiple drug rankings, one for
#' each drug repurposing method, for a given signature. Step 2 would be to use the searchwords input of getBenchmarkDrugs to find appropriate drugs
#' to use for benchmarking the signature used in step 1. The results of step 1 (frame returned from compDrugMethods) should be passed to  compResList
#' and the results of step 2 (frame returned from getBenchmarkDrugs) should be passed to drugTrialsList. Step 1 and 2 can be repeated in order to increase
#' the number of valid drugs used to benchmark. The below example uses a breast cancer, lung cancer, and aml signature to get drug rankings for multiple
#' drug repurposing techniques using compDrugMethods, then uses getBenchmarkDrugs to find breast cancer, lung cancer, and aml drugs for benchmarking,
#' and then runs this funciton to determine which drug repurposing technique performs best. Note that since few drugs were used, the results are invalid.
#' Please obtain the entire CMap drug perturbation signature to run valid benchmarking with the breast cancer, lungh cancer, and aml signatures provided
#' with this package.
#' @param compResList a list containing data frames formatted the same as the data frames returned from the compDrugMethods function.
#' Essentially, the column names are the drug repurposing techniques, rownames are the drug names, and the values are the final scores for the drugs.
#' @param drugTrialsList a list containing data frames that are each a subset of the rows from the cmapTrialInfo frame which can be loaded in with data("cmapTrialInfo").
#' The rows that are kept should correspond to the drugs that one would like to use to benchmark the drug rank frames in compResList. Use
#' getBenchmarkDrugs to search the cmapTrialInfo frame for drugs and obtain frames to be stored in this variable.
#' @param nperm number of permutations used to determine the p values. For each permutation, a randomly ordered drug ranking list is generated. The p values reported
#' are based on how often the drug rank list from the analysis outperforms these randomly created lists.
#' @return a list of 3 elements. The first element contains the final scores and p values for the methods
#' being benchmarked. The second element is a list of data frames. There is 1 data frame for each pair of compResList and drugTrialList
#' elements and the frame shows the position of the drugs in compResList for each drug repurposing technique. The third element is
#' also a list of data frames. It is the same as the second elements data frames except the values describe the number of unapproved drugs
#' that were reanked better than approved drugs for each technique. The final scores are generated by using the data in the 2nd and 3rd list
#' elements and the final score frame (list element 1) details how final scores are calculated.
#' @export
#' @examples
#' drugNames = c("doxorubicin", "cetuximab")
#' clinicalInfo = getDrugClinicalInfo(drugNames)
#'
#' breastDrugsCmap = getBenchmarkDrugs("breast ")
#' lungDrugsCmap = getBenchmarkDrugs("lung")
#' amlDrugsCmap = getBenchmarkDrugs("myeloid")
#' drugTrialsList = list(breastDrugsCmap, lungDrugsCmap, amlDrugsCmap)
#'

benchmarkDrugResults = function(compResList, drugTrialsList, nperm = 1000)
{
  #should add option to just use top 5 median approved drug positions. Dont want a bunch of bottom drugs to cloud top 5 being very good
  #function takes a list of compareResults frames, and a list of drugs relevant to each signature used to generate the compareResults frames
  #finalScoreFrameOrig = finalScoreFrame
  #compResListOrig = compResList
  #drugTrialsListOrig = drugTrialsList

  getBenchmarkScores = function(compResList, drugTrialsList)
  {
    if(length(compResList) != length(drugTrialsList))
      stop("The length of the objects compResList and drugNamesList should be equal")

    sigAppList = list()
    sigTermList = list()

    for(i in 1:length(compResList))
    {
      #data("cmapTrialInfo")
      notScoreCol = which(colnames(compResList[[i]]) == "cmap_name")
      if(length(notScoreCol) > 0){
        compResults = compResList[[i]][, 1:(notScoreCol - 1)]
      }else{
        compResults = compResList[[i]]
      }

      benchDrugFrame = drugTrialsList[[i]]
      cmapNames = tolower(rownames(compResults))
      repoNames = tolower(benchDrugFrame$drug_name)

      overlapDrugInds = c()
      for(j in 1:length(cmapNames))
        overlapDrugInds = c(overlapDrugInds, which(repoNames == cmapNames[j]))
      if(length(overlapDrugInds) == 0)
        stop(paste("None of the drug names in the", i, "element of drugNamesList are present in compResList"))
      repoCmapFr = benchDrugFrame[overlapDrugInds, ]

      #remove withdrawn trials and trials with accrual or funding issues
      rmStudyVec = c(unique(c(which(tolower(repoCmapFr$status) == "withdrawn"), which(grepl("accrual", tolower(repoCmapFr$DetailedStatus))), which(grepl("fund", tolower(repoCmapFr$DetailedStatus))))))
      if(length(rmStudyVec) > 0)
        repoCmapFr = repoCmapFr[-rmStudyVec, ]
      if(nrow(repoCmapFr) == 0)
      {
        warning(paste("Not using drugNamesList element", i, "in the benchmark as there were no valid clinical trials, i.e no approved trials and all the terminated trials appear to be due to funding and accrual issues"))
        next
      }

      #remove duplicated drugs, but make sure clinical trials are in agreeance, if they aren't remove drug as oppose to duplicate
      dupInds = which(duplicated(repoCmapFr$drug_name))
      repoCopy = repoCmapFr
      rmDupVec = c()
      for(j in 1:length(dupInds))
      {
        sameDrugRows = which(repoCmapFr$drug_name[dupInds[j]] == repoCmapFr$drug_name)
        if(sum(tolower(repoCmapFr$status[dupInds[j]]) == tolower(repoCmapFr$status[sameDrugRows])) == length(sameDrugRows)){
          rmDupVec = c(rmDupVec, sameDrugRows[2:length(sameDrugRows)])
        }else{
          rmDupVec = c(rmDupVec, sameDrugRows)
        }
      }
      if(length(rmDupVec) > 0)
        repoCmapFr = repoCmapFr[-rmDupVec, ]

      if(nrow(repoCmapFr) == 0)
      {
        warning(paste("Not using drugNamesList element", i, "in the benchmark as all the drugs had conflicting clinical trials, i.e drug was approved in one trial but terminated in another"))
        next
      }

      #use median to not skew results when one drug is out of place?
      #score results
      #rerank schemes using other signature and other drugs for the diff cancer
      #create final score based on all the sigs/drugs used

      #gaining points is bad
      #1. gain points farther approved drug is from the top
      #2. gain points for every position terminated drug is above approved drug (lose points when approved is above terminated? nah)
      #   remove drugs not on list, more logical and should keep score from 1 and 2 similar for methods that will get a good benchmark
      #3. use median of 1. and multiply by mean of 2.?
      appScoreList  = list()
      termScoreList = list()
      appScoreFrame = as.data.frame(NULL)
      termScoreFrame = as.data.frame(NULL)
      repoCmapFr$drug_name = tolower(repoCmapFr$drug_name)
      for(j in 1:ncol(compResults))
      {
        appScoreVec = c()
        termScoreVec = c()
        #order from most -ve to most +ve
        methScores = compResults[, j]
        names(methScores) = tolower(rownames(compResults))
        methScores = methScores[order(methScores)]
        for(k in 1:nrow(repoCmapFr))
        {
          drugName = repoCmapFr$drug_name[k]
          drugStat = repoCmapFr$status[k]
          if(drugStat == "Approved"){
            remDrugNames = tolower(repoCmapFr$drug_name[-k])
            methScoresSub = methScores[-which(names(methScores) %in% remDrugNames)]
            appDrugPos = which(names(methScoresSub) == drugName)
            appScoreVec = c(appScoreVec, appDrugPos)
          }else{
            keepDrugNames = c(repoCmapFr$drug_name[which(repoCmapFr$status == "Approved")], drugName)
            methScoresSub = methScores[which(names(methScores) %in% keepDrugNames)]
            termDrugPos = which(names(methScoresSub) == drugName)
            termScoreVec = c(termScoreVec, length(methScoresSub) - termDrugPos)
          }
        }
        appScoreList[[j]] = appScoreVec
        termScoreList[[j]] = termScoreVec
        appScoreFrame = rbind(appScoreFrame, appScoreVec)
        termScoreFrame = rbind(termScoreFrame, termScoreVec)
      }
      if(nrow(appScoreFrame) > 0)
      {
        colnames(appScoreFrame) = repoCmapFr$drug_name[which(repoCmapFr$status == "Approved")]
        rownames(appScoreFrame) = colnames(compResults)
        appScoreFrame = t(appScoreFrame)
      }
      if(nrow(termScoreFrame) > 0)
      {
        colnames(termScoreFrame) = repoCmapFr$drug_name[which(repoCmapFr$status == "Terminated")]
        rownames(termScoreFrame) = colnames(compResults)
        termScoreFrame = t(termScoreFrame)
      }

      #below are lists of lists
      #outer list element corresponds to results for specific set of drugs and signature
      #inner list element corresponds to results for a given drug repurposing scheme
      #sigAppList vector elements correspond to positin of approved drug in list
      #sigTermList vector elements correspond to number spots terminated drug is above approved drug when only looking at drugs that had trials
      #sigAppList[[i]] = appScoreList
      #sigTermList[[i]] = termScoreList

      #each list element is a frame with the results on a rankList generated using a given signature and its drugs
      #columns have each drug repurposing technique, rows have the drugs, values are the scores
      sigAppList[[i]] = appScoreFrame
      sigTermList[[i]] = termScoreFrame
    }
    #create 2 dataframes for each compResults, 1 for app and 1 for term drugs, rownames drugs, colnames method, fill in result
    listLength = length(sigAppList)
    if(length(sigTermList) > length(sigAppList))
      listLength = length(sigTermList)

    allAppFrame = as.data.frame(NULL)
    allTermFrame = as.data.frame(NULL)
    for(i in 1:listLength)
    {
      if(i <= length(sigAppList))
        allAppFrame = rbind(allAppFrame, sigAppList[[i]])
      if(i <= length(sigTermList))
        allTermFrame = rbind(allTermFrame, sigTermList[[i]])
    }

    appVec = c()
    termVec = c()
    scoreVec = c()
    for(i in 1:ncol(compResults))
    {
      appVec[i] = median(allAppFrame[, i])
      termVec[i] = 1 + mean(allTermFrame[, i])
      scoreVec[i] = appVec[i]*termVec[i]
    }
    finalScoreFrame = rbind(appVec, termVec, scoreVec)
    rownames(finalScoreFrame) = c("Median Approved Drug Position", "Mean Number of Unapproved Drugs Above Approved Drugs + 1", "Final Score (Row 1 Times Row 2)")
    colnames(finalScoreFrame) = gsub("Final Score", "", colnames(compResults))
    return(list(finalScoreFrame, sigAppList, sigTermList))
  }

  benchmarkResults = getBenchmarkScores(compResList, drugTrialsList)

  randScoreVec = c()
  for(i in 1:nperm)
  {
    compResListRand = compResList
    for(j in 1:length(compResList))
    {
      compResListRand[[j]] = as.data.frame(compResListRand[[j]][, 1])
      randOrd = sample(c(1:nrow(compResList[[j]])), nrow(compResList[[j]]))
      rownames(compResListRand[[j]]) = rownames(compResList[[j]])[randOrd]
    }
    benchmarkResultsRand = getBenchmarkScores(compResListRand, drugTrialsList)
    randScoreVec = c(randScoreVec, benchmarkResultsRand[[1]]["Final Score (Row 1 Times Row 2)",1])
  }

  finalScoreFrame = benchmarkResults[[1]]
  pVec = c()
  for(i in 1:ncol(finalScoreFrame))
  {
    score = finalScoreFrame[nrow(finalScoreFrame), i]
    pVal = length(which(randScoreVec <= score))/nperm
    if(pVal == 0)
      pVal = 1/nperm
    pVec = c(pVec, pVal)
  }
  finalScoreFrame = rbind(finalScoreFrame, pVec)
  rownames(finalScoreFrame)[nrow(finalScoreFrame)] = "p value"
  benchmarkResults[[1]] = finalScoreFrame

  names(benchmarkResults) = c("scoreFrame", "approvedDrugFrames", "unapprovedDrugFrames")
  #probably need to use combine.est
  return(benchmarkResults)
}