R/ensembles.R

Defines functions g2pmlRun stats getHits getPredictedGenes milkEnsemble evalEnsemble evalEnsembleOneShot evalEnsembles evalEnsemblesOneShot g2pmlpredict ensemblePredict ensemblePredictAllGenomev2 caretLearn ensembleLearn ensembleLearnTournament ensembleLearnKFold

Documented in ensembleLearn ensembleLearnKFold

#' Title
#'
#' @param methods
#' @param k
#' @param seed
#' @param nboot
#' @param tuneLength
#' @param panel
#' @param auto
#' @param maxTrials
#' @param controls
#' @param fs
#' @param fsThreshold
#' @param qmeasure
#' @param nsamps
#'
#' @return
#' @export
#'
#' @examples
ensembleLearnKFold = function(panel="Congenital_myopathy",
                              genes=NULL,
                              methods=c("rpart",
                                        "C5.0Tree",
                                        "svmRadial",
                                        "rf",
                                        "sparseLDA",
                                        "lda",
                                        "bayesglm",
                                        "kknn",
                                        "naive_bayes"),
                              k=5,
                              #The control genes to use
                              seed=12345,
                              #Number of simple models to create within the ensemble
                              nboot=200,
                              #This is useful to generate a null distribution of models to
                              #Number of different values to test for each ML algorithm
                              #hyperparameter
                              tuneLength=5,
                              #Number of repeats to be done by Caret
                              nsamps=5,
                              auto=T,
                              maxTrials=5,
                              controls = "allgenome",
                              fs="own",
                              fsThreshold=0.6,
                              qmeasure="kappa"){


  #Check which models are installed to try them
  methods = unlist(lapply(methods,
                          function(x){ tryCatch({ checkInstall(getModelInfo(x)$library); return(x)},
                                                error = function(e){ print(e); NULL },0)}))

  cat(paste0("Available methods to use are ",paste0(methods,collapse=", ")),"\n")
  finalEnsembles = NULL
  finalEnsembles$models = NULL
  expid = panel
  if(is.null(genes)){
    panelf = paste0(system.file("g2pml/gel/geMarch2017/", "", package = "G2PML"),
                    panel,".csv")

    stopifnot(file.exists(panelf))
    cat("Reading disease genes from",panelf,"\n")
    genes = read.csv(panelf,stringsAsFactors=F)
    genes = genes$GeneSymbol[genes$LevelOfConfidence == "HighEvidence"]
    genes = na.omit(G2PML::fromSymbol2Hugo(genes))
  }else
    genes = na.omit(G2PML::fromSymbol2Hugo(genes))

  if(controls == "allghosh"){
    ctrlpath=system.file("g2pml/controlgenes/allghosh/", "", package = "G2PML")
    #Getting control genes
    ctrlfile = paste0(ctrlpath,"/",panel,"_controls.tsv")
    cat("Reading controls from",ctrlfile,"\n")
    ctrlgenes = read.delim(ctrlfile,header=F,stringsAsFactors=F)$V1
    ctrlgenes = na.omit(G2PML::fromSymbol2Hugo(ctrlgenes))
  }else if(controls == "allgenome"){
    coding = getCodingGenome()
    ctrlgenes = coding[!(coding %in% genes)]
  }else
    stop(paste0("Control type not found:",controls))

  cat("We get",length(ctrlgenes),"control genes\n")


  #Configuration parameters
  if(length(genes) < 50){
    leaveoutpos = as.integer(length(genes)*0.3)
  }else{
    leaveoutpos = as.integer(length(genes)/k)
  }
  indexespos = matrix(nrow=k,ncol=length(genes)-leaveoutpos)
  evalindexespos = matrix(nrow=k,ncol=leaveoutpos)
  leaveoutneg = as.integer(length(ctrlgenes)/k)
  indexesneg = matrix(nrow=k,ncol=length(ctrlgenes)-leaveoutneg)
  evalindexesneg = matrix(nrow=k,ncol=leaveoutneg)

  if(typeof(fs) == "character"){
    if(fs == "own")
      pathfs = system.file("g2pml/fs/", "", package = "G2PML")
    else
      stop(paste0("Feature selection strategy not found:",fs))
    vars=NULL
    fsfile = paste0(pathfs,"/featureSelection",panel)
    if(!file.exists(paste0(fsfile,"_fsrfs_1.rds")) &
       !file.exists(paste0(fsfile,"_fsrfs_2.rds")) &
       !file.exists(paste0(fsfile,"_fsrfs_3.rds")) &
       !file.exists(paste0(fsfile,"_fsrfs_4.rds")) &
       !file.exists(paste0(fsfile,"_fsrfs_5.rds")))
      fsfile=NULL
  }else
    vars = getVarsFromFS(fs,r=fsThreshold)



  for(fold in 1:k){
    sequence = 1:length(genes)
    indexespos[fold,] = sample(x=sequence,length(genes) -  leaveoutpos)
    evalindexespos[fold,] = sequence[!(sequence %in% indexespos[fold,])]
    lexpid = paste0(expid,"_kfold","_",fold)

    sequence = 1:length(ctrlgenes)
    indexesneg[fold,] = sample(x=sequence,length(ctrlgenes) -  leaveoutneg)
    evalindexesneg[fold,] = sequence[!(sequence %in% indexesneg[fold,])]

    genespos = genes[indexespos[fold,]]
    genespos = G2PML::fromSymbol2Hugo(genespos)
    genesneg = ctrlgenes[indexesneg[fold,]]
    genesneg = G2PML::fromSymbol2Hugo(genesneg)
    genestogo = c(genespos,genesneg)
    condition = c(rep("Disease",length(genespos)),
                  rep("Nondisease",length(genesneg)))


    if(auto)
      finalEnsembles$models[[fold]] = ensembleLearnTournament(panel=panel,
                                                       methods=methods,
                                                       condition=condition,
                                                       genes=genestogo,
                                                       nboot=nboot,
                                                       nsamps=nsamps,
                                                       tuneLength=tuneLength,
                                                       fsfile=fsfile,
                                                       maxTrials=maxTrials,
                                                       qmeasure=qmeasure,
                                                       vars=vars,
                                                       evalctrl=ctrlgenes[evalindexesneg[fold,]],
                                                       evaldisease=genes[evalindexespos[fold,]])
    else
      finalEnsembles$models[[fold]] = ensembleLearn(expID=lexpid,
                                             panel=panel,
                                             condition=condition,
                                             useSeedGene=useSeedGene,
                                             useSmote=useSmote,
                                             method=method,
                                             genes=genestogo,
                                             nboot=nboot,
                                             nsamps=nsamps,
                                             tuneLength=tuneLength,
                                             fsfile=fsfile,
                                             vars=vars,
                                             evalctrl=ctrlgenes[evalindexesneg[fold,]],
                                             evaldisease=genes[evalindexespos[fold,]])
  }

  finalEnsembles = evalEnsembles(finalEnsembles)
  return(finalEnsembles)
}



ensembleLearnTournament  = function(genes,
                                    panel=NULL,
                                    #If condition is NULL then all genes are disease
                                    condition=NULL,
                                    #The ID for referring to the results
                                    expID = "BootstrapEnsemble",
                                    #The caret Method to use
                                    methods=c("rpart",
                                              "C5.0Tree",
                                              "svmRadial",
                                              "rf",
                                              "sparseLDA",
                                              "kknn"),
                                    #The control genes to use
                                    controls="allghosh",
                                    #If we do feature selection, these are the vars we use
                                    fsfile=NULL,
                                    vars=NULL,
                                    #We don't want these predictors to be used for learning
                                    filter=c("DSI","DPI","ESTcount","constitutiveexons"),
                                    seed=12345,
                                    #Number of simple models to create within the ensemble
                                    nboot=20,
                                    #Number of different values to test for each ML algorithm
                                    #hyperparameter
                                    tuneLength=5,
                                    #Number of repeats to be done by Caret
                                    nsamps=5,
                                    evalStep = 1,
                                    maxTrials=10,
                                    minKappa = 0.1,
                                    minPPV = 0.6,
                                    cutoff=0.9,
                                    alpha=0.7,
                                    debug=F,
                                    usereplacement=F,
                                    qmeasure="sens",
                                    useppv=F,
                                    evaldisease=NULL,
                                    evalctrl=NULL,
                                    silent=T,
                                    ...)
{
  cat("Calling ensembleLearn with\n")
  genes = na.omit(fromSymbol2Hugo(genes))
  cat("Genes(",length(genes),"):",paste0(genes[1:3],collapse=","),"...\n")
  cat("We'll use",nboot,"sub-models\n")
  cat("Proportion between disease and non disease\n")
  print(table(condition))
  cat("Method list:",paste0(methods,collapse=","),"\n")
  cat("expID:",expID,"\n")

  methods = unlist(lapply(methods,
                          function(x){ tryCatch({ checkInstall(getModelInfo(x)$library); return(x)},
                                                error = function(e){ print(e); NULL },0)}))

  cat(paste0("Available methods to use are ",paste0(methods,collapse=", ")),"\n")

  if(is.null(vars)){
    if(!is.null(fsfile)){
      vars = fsGetVars(file=fsfile)
      cat("We will use feature selection on vars\n")
      print(vars)
    }
  }

  modelEvals = NULL
  modelHits = NULL

  set.seed(seed)
  evaluation = NULL
  prefix = NULL
  ensemble = NULL

  alldata.in = fromGenes2MLData(genes=genes,
                                addcontrols=F,
                                vars=vars,
                                condition=condition,
                                filter=filter)

  if(debug){
    alldata.in = alldata.in[,c(1:10,ncol(alldata.in))]
  }

  set.seed(seed)
  evaluation = NULL
  prefix = NULL
  ensemble = NULL

  #Separate controls from disease
  controlsd = alldata.in[alldata.in$condition == "Nondisease",]
  data.in = alldata.in[alldata.in$condition == "Disease",]
  ncontrols = nrow(data.in)
  allresults = NULL
  fits = NULL

  isMinimumQuality = function(qmeasure,value){
    if(qmeasure == "costerror"){
      return(value < 0.9)
    }
    if(qmeasure == "kappa" | qmeasure == "mccc")
      return(value > 0.1)
    if(qmeasure == "bacc" | qmeasure == "sens")
      return(value > 0.3)
  }

  isOptimalValue = function(qmeasure,value){
    if(qmeasure == "costerror"){
      return(value == 0)
    }
    return(value == 1)
  }

  isProgressMade = function(qmeasure,value,lastValue){
    if(qmeasure == "costerror"){
      return(value <= lastValue)
    }
    return(value >= lastValue)
  }

  i = 1
  if(qmeasure == "costerror"){
    lastKappa = 1
    minKappa=0.9
  }else
    lastKappa = -1
  nulltrials = 0
  methodIndex = 1
  optInfo = list()
  nonOptimal=T
  while(i <= nboot & nulltrials < maxTrials & nonOptimal){
    cat("Bootstrap",i,"and null trials in a row", nulltrials,"\n")
    #cat("We will select controls sampling from the whole set\n")
    ctrlmask = sample(1:nrow(controlsd),nrow(data.in))
    if(usereplacement)
      localdata.in = rbind(data.in[sample(1:nrow(data.in),
                                          nrow(data.in),
                                          replace=T),],
                           controlsd[ctrlmask,])
    else{
      then = floor(nrow(data.in)*0.9)
      ctrlmask = sample(1:nrow(controlsd),then)
      localdata.in = rbind(data.in[sample(1:nrow(data.in),then),],
                           controlsd[ctrlmask,])

    }

    kappas = NULL
    localModels = NULL
    localEnsembles = NULL
    for(method in methods){
      #cat("Trying our luck now with with method",method,"\n")
      result = caretLearn(tuneLength=tuneLength,
                          nsamps=nsamps,
                          in.file=localdata.in,
                          model.with.all = T,
                          method=method)
      ctrlgenes = localdata.in$gene[localdata.in$condition == "Nondisease"]
      localModels[[method]] =  list(model=result$model$finalModel,
                                    attsused=result$attsused,
                                    method=method,
                                    controls=ctrlgenes)
      ensemble[[i]] = localModels[[method]]
      toreturn = result$results[,c("ROC","Sens","Spec","ROCSD","SensSD","SpecSD")]
      toreturn = as.vector(toreturn)
      #cat("Finished running method",method,"wihtin bootstrap",i,"\n")

      allresults = rbind(allresults,cbind(rep(i,nrow(toreturn)),toreturn))
      #cat("Now let's evaluate\n")

      #Do we get an improvement???
      model.indexes = as.numeric(names(sort(tapply(allresults$ROC,allresults[,1],
                                                   function(x){ max(x)}),
                                            decreasing=T)))
      testModel = list(genes=genes,
                       method=method,
                       panel=panel,
                       controls=controls,
                       controlgenes=controlsd$gene,
                       vars=vars,
                       condition=condition,
                       nboot=length(ensemble),
                       model=ensemble,
                       eval=allresults,
                       modelindexes=model.indexes,
                       modelprefix=prefix)

      preds = ensemblePredictAllGenomev2(ensemble=testModel,
                                         n=testModel$nboot,
                                         cutoff=cutoff,
                                         vars=vars)
      testModel$preds = preds
      testModel$evaldisease = evaldisease
      testModel$evalctrl = evalctrl
      testModelEvaluation = evalEnsemble(testModel,cutoff=cutoff,alpha=alpha)
      localEnsembles[[method]] = testModel

      #Should we keep evalating or not??
      #We access the kappa value
      if(useppv){
        kappas[[method]] = mean(testModelEvaluation$ppv)
        #cat("Using PPV\n")
        minThreshold = minPPV
      }else{
        #cat("Using as kappa,",qmeasure,"\n")
        #kappa = mean(testModelEvaluation$kappa)
        kappas[[method]] = mean(unlist(testModelEvaluation[qmeasure]))
        minThreshold = minKappa
      }
    }

    #print("These are the kappas per method")
    #print(kappas)
    winnerIndex = names(kappas)[which.max(unlist(kappas))]
    kappa = kappas[[winnerIndex]]
    winnerMethod = localModels[[winnerIndex]]

    #print(kappa)
    if(!is.nan(kappa)){
      if(isOptimalValue(qmeasure,kappa)){
        nonOptimal=F
        ensemble[[i]] = localModels[[winnerIndex]]

      }

      else{
        if(isMinimumQuality(qmeasure,kappa)){
          if(!silent)
            cat("The ensemble is good enough with the new model, should we keep it?\n")
          if(i >= evalStep){
            if(!silent)
              cat("We now test whether the model should we kept it?\n")
            if(isProgressMade(qmeasure,kappa,lastKappa)){
              if(!silent)
                cat("We improve kappa from",lastKappa,"to",kappa,"at iteration",i,
                    "using method",winnerIndex,"\n")
              lastKappa = kappa
              optInfo[[length(optInfo) + 1]] = list(kappa=kappa,method=winnerIndex,success=T)
              nulltrials = 0
              ensemble[[i]] = localModels[[winnerIndex]]
              i = i + 1

              singleeval = evalEnsemble(ensemble=localEnsembles[[winnerIndex]],
                                               cutoff=0.5)
              modelEvals = rbind(modelEvals,c(panel,i,singleeval["auc"],singleeval["ppv"],
                                              singleeval["kappa"],singleeval["mcc"],
                                              singleeval["bacc"],singleeval["sens"],
                                              singleeval["spec"],singleeval["costerror"]))


            }else{
              if(!silent)
                cat("We can't improve kappa from",lastKappa,"to",kappa,"at iteration",i,", doing backtrack\n")
              ensemble =  ensemble[1:(i-1)]
              optInfo[[length(optInfo) + 1]] = list(kappa=kappa,method=winnerIndex,success=F)
              nulltrials = nulltrials + 1
              allresults = allresults[allresults[,1] != i,]
              if(!silent)
                cat("Still",maxTrials - nulltrials,"left\n")
              if(!silent)
                cat("We'll try now with a new round of methods",paste0(methods,collapse=","),"\n")
            }
            if(useppv)
              cat("PPVs so far",paste0(unlist(lapply(optInfo,function(x){return(x["kappa"])})),collapse=","),"\n")
            else
              cat("Kappas so far",paste0(unlist(lapply(optInfo,function(x){return(x["kappa"])})),collapse=","),"\n")

          }else{ #We simply add the model and wait until we have enough of them
            if(!silent)
              cat("Yes, we keep it. Not enough models in the ensemble yet\n")
            optInfo[[length(optInfo) + 1]] = list(kappa=kappa,method=winnerIndex,success=T)
            ensemble[[i]] = localModels[[winnerIndex]]
            i = i + 1
            lastKappa = kappa
          }
        }else{ #At this point, the whole list of methods did badly
          if(!silent)
            cat("We have to drop the last models iteration results, no minimum quality achieved\n")
          ensemble[[i]] =  NULL
          allresults = allresults[allresults[,1] != i,]
          if(!silent)
            cat("We'll try now now with a new round of methods",paste0(methods,collapse=","),"\n")
        }
      }



    }else{
      if(!silent)
        cat("This iteration is NULL, kappa is not valid\n")
      nulltrials = nulltrials + 1
      allresults = allresults[allresults[,1] != i,]
      if(!silent)
        cat("Still",maxTrials - nulltrials,"left\n")
      if(!silent)
        cat("We'll try now now with a new round of methods",paste0(methods,collapse=","),"\n")
    }

  }
  cat("Leaving the loop and finishing up\n")
  genes = c(data.in$gene,controlsd$gene)

  finalModel = list(genes=genes,
                    panel=panel,
                    controls=controls,
                    controlgenes=controlsd$gene,
                    vars=vars,
                    condition=condition,
                    nboot=length(ensemble),
                    model=ensemble,
                    eval=allresults,
                    modelindexes=1:length(ensemble),
                    modelprefix=prefix,
                    optinfo=optInfo)

  preds = ensemblePredictAllGenomev2(ensemble=finalModel,
                                     n=finalModel$nboot,
                                     cutoff=cutoff,
                                     vars=vars)
  finalModel$preds = preds
  finalModel$cutoff = cutoff

  #finalModel$metadata = mllearn.genEnsembleMetaData(finalModel,panel)

  finalModel$method = "multimethod"
  finalModel$evaldisease = evaldisease
  finalModel$evalctrl = evalctrl
  finalModel$modelEvals = modelEvals
  return(finalModel)
}
#' Title
#'
#' @param genes
#' @param panel
#' @param condition
#' @param expID
#' @param method
#' @param fsfile
#' @param controls
#' @param filter
#' @param seed
#' @param nboot
#' @param tuneLength
#' @param nsamps
#' @param out.folder
#' @param debug
#' @param usereplacement
#' @param useSeedGene
#' @param useSmote
#' @param evalEach
#' @param experiment
#' @param ...
#'
#' @return
#' @export
#'
#' @examples
ensembleLearn  = function(genes,
                          panel=NULL,
                          #If condition is NULL then all genes are disease
                          condition=NULL,
                          #The ID for referring to the results
                          expID = "BootstrapEnsemble",
                          #The caret Method to use
                          method="J48",
                          #If we do feature selection, these are the vars we use
                          fsfile=NULL,
                          vars=NULL,
                          #The control genes to use
                          controls="ghosh",
                          #We don't want these predictors to be used for learning
                          filter=c("DSI","DPI","ESTcount","constitutiveexons"),
                          seed=12345,
                          #Number of simple models to create within the ensemble
                          nboot=200,
                          #Number of different values to test for each ML algorithm
                          #hyperparameter
                          tuneLength=5,
                          #Number of repeats to be done by Caret
                          nsamps=5,
                          evalctrl,
                          evaldisease,
                          debug=F,
                          usereplacement=T,
                          evalEach=seq(from=1,to=nboot,by=3),
                          experiment="DRN",
                          ...)
{
  cat("Calling ensembleLearn with\n")
  cat("Genes(",length(genes),"):",paste0(genes[1:3],collapse=","),"...\n")
  cat("We'll use",nboot,"sub-models\n")
  cat("Proportion between disease and non disease\n")
  print(table(condition))
  cat("Method:",method,"\n")
  cat("Controls:",controls,"\n")
  cat("expID:",expID,"\n")

  if(is.null(vars)){
    if(!is.null(fsfile)){
      vars = fsGetVars(file=fsfile)
      cat("We will use feature selection on vars\n")
      print(vars)
    }
  }


  set.seed(seed)
  evaluation = NULL
  prefix = NULL
  ensemble = NULL

  alldata.in = fromGenes2MLData(genes=genes,
                                addcontrols=F,
                                vars=vars,
                                condition=condition,
                                filter=filter,...)

  if(debug){
    alldata.in = alldata.in[,c(1:10,ncol(alldata.in))]
  }

  #Separate controls from disease
  controlsd = alldata.in[alldata.in$condition == "Nondisease",]
  data.in = alldata.in[alldata.in$condition == "Disease",]
  ncontrols = nrow(data.in)
  allresults = NULL
  fits = NULL

  modelEvals = NULL
  modelHits = NULL
  for(i in 1:nboot){
    cat("Starting with bootstrap",i,"\n")

    cat("We will select controls sampling from the whole set\n")
    ctrlmask = sample(1:nrow(controlsd),nrow(data.in))
    if(usereplacement)
      localdata.in = rbind(data.in[sample(1:nrow(data.in),nrow(data.in),replace=T),],
                           controlsd[ctrlmask,])
    else
      localdata.in = rbind(data.in,controlsd[ctrlmask,])
    result = caretLearn(tuneLength=tuneLength,
                        nsamps=nsamps,
                        in.file=localdata.in,
                        model.with.all = T,
                        method=method)

    ctrlgenes = localdata.in$gene[localdata.in$condition == "Nondisease"]
    ensemble[[i]] = list(model=result$model$finalModel,
                         attsused=result$attsused,
                         method=method,
                         controls=ctrlgenes)


    toreturn = as.vector(result$results)
    cat("Done with bootstrap",i,"\n")
    allresults = rbind(allresults,cbind(rep(i,nrow(toreturn)),toreturn))

    if(i %in% evalEach){

      model.indexes = as.numeric(names(sort(tapply(allresults$ROC,allresults[,1],
                                                   function(x){ max(x)}),
                                            decreasing=T)))
      #print("Temporal model indexes order")
      #print(model.indexes)
      #if(controls == "clustering"){
      genes = c(data.in$gene,controlsd$gene)

      tmpModel = list(genes=genes,
                      panel=panel,
                      method=method,
                      controls=controls,
                      controlgenes=controlsd$gene,
                      vars=vars,
                      condition=condition,
                      nboot=i,
                      model=ensemble,
                      eval=allresults,
                      modelindexes=model.indexes,
                      modelprefix=prefix)
      #if(genPredictions){
      preds = ensemblePredictAllGenomev2(ensemble=tmpModel,
                                         n=i,
                                         cutoff=0.5,
                                         vars=vars,
                                         method=method)

      tmpModel$preds = preds
      #We'll get now the genes leaved out
      tmpModel$evaldisease = evaldisease
      tmpModel$evalctrl = evalctrl

      singleeval = evalEnsembleOneShot(ensemble=tmpModel,cutoff=0.5)
      modelEvals = rbind(modelEvals,c(panel,i,singleeval["auc"],singleeval["ppv"],
                                      singleeval["kappa"],singleeval["mcc"],
                                      singleeval["bacc"],singleeval["sens"],
                                      singleeval["spec"],singleeval["costerror"]))
      modelHits = rbind(modelHits,singleeval$hits)
      #print(modelEvals)
      #print(modelHits)
    }

  }
  colnames(modelEvals) = c("panel","boot","auc","ppv","kappa","mcc",
                           "bacc","sens","spec","costerror")

  #write.table(modelEvals,paste0(out.model,"_evals.tsv"),col.names=T,row.names=F,sep="\t",quote=F)
  colnames(modelHits) = c("panel","method","q","models","predictedgenes","evalgenes","hits","enrichment")
  #write.table(modelHits,paste0(out.model,"_hits.tsv"),col.names=T,row.names=F,sep="\t",quote=F)
  colnames(allresults)[1] = "bootstrap"
  model.indexes = as.numeric(names(sort(tapply(allresults$ROC,allresults$bootstrap,
                                               function(x){ max(x)}),
                                        decreasing=T)))
  #print("Model indexes order")
  #print(model.indexes)
  #if(controls == "clustering"){
  genes = c(data.in$gene,controlsd$gene)
  finalModel = list(genes=genes,
                    panel=panel,
                    method=method,
                    controls=controls,
                    controlgenes=controlsd$gene,
                    vars=vars,
                    condition=condition,
                    nboot=nboot,
                    model=ensemble,
                    eval=allresults,
                    modelindexes=model.indexes,
                    modelprefix=prefix)
  #if(genPredictions){
  preds = ensemblePredictAllGenomev2(ensemble=finalModel,
                                     n=nboot,
                                     cutoff=0.5,
                                     vars=vars,
                                     method=method)

  finalModel$preds = preds
  finalModel$modelHits = modelHits
  finalModel$allresults = allresults
  finalModel$evalctrl = evalctrl
  finalModel$evaldisease = evaldisease



  return(finalModel)
}

caretLearn = function(in.file,
                      genes=NULL,
                      method="Rborist",
                      out.file=NULL,
                      tuneLength=10,
                      nsamps=10,
                      trainingProportion=0.8,
                      tuneGrid=NULL,
                      silent=T,
                      model.with.all=F)
{
  library(caret)

  if(is.null(genes)){
    if(typeof(in.file) == "character"){
      if(length(grep(".rds$",in.file)) > 0)
        data.in = readRDS(in.file)
      else
        data.in = read.delim(in.file,sep=",")
    }else
      data.in = in.file
  }else{
    data.in = fromGenes2MLData(genes)
  }

  data.in$gene = NULL
  condition = data.in$condition
  data.in$condition = NULL
  #numcolumns = which(colnames(data.in) != "condition")

  nzv = nearZeroVar(data.in, saveMetrics= TRUE)
  data.in = data.in[,!nzv$zeroVar]
  descrCor = cor(data.in) # builds correlation matrix
  highlyCorDescr = findCorrelation(descrCor, cutoff = 1,
                                   names = T, verbose = F)
  data.in = data.in[,!(colnames(data.in) %in% highlyCorDescr)]
  data.in$condition = condition

  mask = which(startsWith(colnames(data.in),"RankedMM"))
  mask = c(mask,which(startsWith(colnames(data.in),"ExprSpecific")))
  mask = c(mask,which(startsWith(colnames(data.in),"AdjSpecific")))



  #Do a repeated cross-validation evaluation with class probabilities and two class summaries
  ctrl = trainControl(method="repeatedcv",
                      repeats=nsamps,
                      classProbs=T,
                      summaryFunction=twoClassSummary)

  #model.with.all is going to be true if we call this function from a bootstrapping model
  #learning function
  if(model.with.all){

    if(!silent)
      cat("***********Calling Caret train method with",nrow(data.in),
          "samples and",ncol(data.in),"attributes (all data)\n")

    fit = train(condition ~.,
                data = data.in,
                method=method,
                trControl=ctrl,
                metric="ROC",
                tuneLength=tuneLength)


    plsClasses = predict(fit, newdata = data.in)
    cfm = confusionMatrix(data = plsClasses,
                          reference=as.factor(data.in$condition),
                          positive="Disease")

    data.in$condition = NULL
    if(!silent) cat("Finished, returning the model\n")
    return(list(model=fit,results=fit$results,cfm=cfm,attsused=colnames(data.in)))
  }


  if(!silent)
    cat("**************************************Calling Caret train method with",
        nrow(data.in),
        "samples and",ncol(data.in),"attributes\n")
  #We should be here when we call this function in the conventional way
  inTrain = createDataPartition(y=data.in$condition,p=trainingProportion,list=F)
  training = data.in[inTrain,]
  testing = data.in[-inTrain,]

  if(!is.null(tuneGrid))
    fit = train(condition ~ .,data=training,
                method=method,
                trControl=ctrl,
                tuneGrid=tuneGrid,
                metric="ROC")
  else
    fit = train(condition ~ .,data=training,
                method=method,
                trControl=ctrl,
                tuneLength=tuneLength,
                metric="ROC")

  #Evaluate with testing data
  plsClasses = predict(fit, newdata = testing)
  cfm = confusionMatrix(data = plsClasses,
                        reference=testing$condition,
                        positive="Disease")
  return(list(model=fit,results=fit$results,cfm=cfm))
}

ensemblePredictAllGenomev2 = function(ensemble,
                                      n = 200,
                                      cutoff=0.9,
                                      vars=NULL,
                                      method=NULL){
  #cat("Predicting All Genome v2\n")
  allgenes = getCodingGenome()
  return(ensemblePredict(genes=allgenes,
                         ensemble=ensemble$model,
                         vars=vars,
                         cutoff=cutoff,
                         model.indexes=ensemble$modelindexes[1:n],
                         method=method))
}



ensemblePredict = function(genes,
                           ensemble,
                           cutoff=0.9,
                           silent=T,
                           vars=NULL,
                           model.indexes=NULL,
                           method=NULL){

  alldata.in = fromGenes2MLData(genes=genes,addcontrols=F,vars=vars)
  genes = alldata.in$gene
  alldata.in$gene = NULL
  alldata.in$condition = NULL

  preds = NULL
  if(is.null(model.indexes))
    model.indexes = 1:n
  for(i in model.indexes){
    data.in = alldata.in
    model = ensemble[[i]]$model
    method = ensemble[[i]]$method

    attsused = ensemble[[i]]$attsused
    data.in = data.in[,attsused]
    if(!silent)
      cat("Predicting with simple model",i,"and method",method,"\n")



    if(nrow(data.in) == 0)
      return(NULL)

    if(method %in% c("xgbTree","xgbLinear","xgbDART")){

      numpred = predict(model,newdata=as.matrix(data.in),type="response")
      localpred = vector(mode="character",length=length(numpred))
      localpred[numpred > 0.5] = "Disease"
      localpred[numpred <= 0.5] = "Nondisease"
    }else if(method %in% c("C5.0Tree","rpart","C5.0","naive_bayes","rf","J48","knn")){
      localpred = as.character(predict(model,newdata=data.in,type="class"))
    }else if (method %in% c("kknn","svmRadial")){
      library(kernlab)
      localpred = as.character(predict(model,newdata=data.in))
    }else if (method %in% c("lda","sparseLDA")){
      localpred = as.character(predict(model,newdata=data.in)$class)
    }else{
      numpred = predict(model,newdata=data.in,type="response")
      localpred = vector(mode="character",length=length(numpred))
      localpred[numpred <= 0.5] = "Disease"
      localpred[numpred > 0.5] = "Nondisease"
    }


    preds$preds[[i]] = localpred
  }

  preds$genes = genes
  preds$predictions = g2pmlpredict(preds,ratio=cutoff)


  return(list(genes=preds$genes,
              unknowngenes=genes[!(genes %in% preds$genes)],
              prediction=preds$predictions$prediction,
              quality=preds$predictions$quality,
              rawpredictions=preds$preds))
}

g2pmlpredict = function(preds,ratio=0.5,useDontKnow=F){

  if(is.null(preds))
    return(preds)

  genes = preds$genes
  ngenes = length(preds$genes)
  disease = vector(mode="numeric",length=ngenes)
  disease[] = 0
  nondisease = disease

  preds = preds$preds
  npreds = length(preds)
  for(i in 1:npreds){
    mask = preds[[i]] == "Disease"
    disease[mask] = disease[mask] + 1
    nondisease[!mask] = nondisease[!mask] + 1
  }
  disease = disease/npreds
  nondisease = nondisease/npreds

  if(useDontKnow){
    outlabel = rep("DontKnow",ngenes)
    outlabel[disease >= ratio] = "Disease"
    outlabel[nondisease >= ratio] = "Nondisease"
  }else{
    outlabel = rep("Nondisease",ngenes)
    outlabel[disease >= ratio] = "Disease"
  }


  confidence = disease
  #confidence[nondisease >= ratio] = nondisease[nondisease >= ratio]
  names(confidence) = outlabel

  return(list(prediction=outlabel,quality=confidence,genes=genes))
}

evalEnsemblesOneShot = function(ensembles,
                                useppv=F,
                                cutoff=0.7,
                                qmeasure="mcc",
                                doGO=F){

  k = length(ensembles)
  globalallhits = NULL
  globalhitsperfold = NULL

  allpredictions = NULL
  alldiseasegenes = NULL


  results = NULL
  modelcount = 0
  used = 0
  evalgenes = NULL
  allkappas = NULL
  kappa = NULL

  for(i in 1:k){
    untilTree = ensembles[[i]]$nboot
    #for(untilTree in 1:ensembles[[1]]$nboot){
    #print(untilTree)

    cat("First we evaluate without predictions\n")
    evaluation = evalEnsembleOneShot(ensemble = ensembles[[i]],
                                     cutoff=cutoff,
                                     trees=untilTree)
    ensembles[[i]]$evaluation = evaluation
    if(useppv){
      kappa = c(kappa,ensembles[[i]]$evaluation$ppv)
    }else{
      kappa = c(kappa,unlist(ensembles[[i]]$evaluation[qmeasure]))
    }
  }
  for(q in c(0.5,0.6,0.7,0.8,0.9,0.99)){
    diseasegenes = NULL
    diseasegenesunr = NULL
    hitsperfold = NULL
    allhits = NULL
    allevaldisease = NULL

    for(i in 1:k){
      cat("Now we generate predictions\n")

      milked = milkEnsemble(ensemble = ensembles[[i]],
                            cutoff=q,
                            trees=untilTree,
                            remove=c(ensembles[[i]]$genes[ensembles[[i]]$condition == "Disease"]))
      milkedunr = milkEnsemble(ensemble = ensembles[[i]],
                               cutoff=q,
                               trees=untilTree,
                               remove=NULL)

      hits = getHits(panel=ensembles[[i]]$panel,
                     genes=milked,
                     brandnew=ensembles[[i]]$evaldisease)

      allevaldisease = c(allevaldisease,ensembles[[i]]$evaldisease)

      hitsperfold = rbind(hitsperfold,c(ensembles[[i]]$panel,
                                        ensembles[[i]]$method,
                                        i,q,untilTree,
                                        length(milked),
                                        hits$newgenes,
                                        hits$hits,
                                        hits$fold))
      diseasegenes = c(diseasegenes, milked)
      diseasegenesunr = c(diseasegenesunr,milkedunr)

    }
    allevaldisease = unique(allevaldisease)
    diseasegenes = table(diseasegenes)
    diseasegenesunr = table(diseasegenesunr)

    mylocalgenes = names(diseasegenesunr)[diseasegenesunr/k >= q]
    hits = getHits(panel=ensembles[[i]]$panel,
                   genes=mylocalgenes,
                   brandnew=allevaldisease)

    #Now the final "all ensemble" predictions
    hitsperfold = rbind(hitsperfold,c(ensembles[[i]]$panel,
                                      "multimethod",
                                      i+1,
                                      q,
                                      untilTree,
                                      length(mylocalgenes),
                                      hits$newgenes,
                                      hits$hits,
                                      hits$fold))

    #print(hitsperfold)

    #diseasegenes = as.numeric(diseasegenes/used)
    diseasegenes = sort(diseasegenes,decreasing=T)
    genes = names(diseasegenes)
    diseasegenes = as.vector(diseasegenes)
    diseasegenes = diseasegenes/k

    finaltable = as.data.frame(cbind(genes=genes,quality=diseasegenes),stringsAsFactors=F)
    colnames(finaltable) = c("gene","quality")
    allpredictions = rbind(allpredictions,cbind(rep(ensembles[[i]]$panel,nrow(finaltable)),finaltable))
    for(qkfold in c(0,0.3,0.5)){
      localfinaltable = finaltable
      localfinaltable = localfinaltable[as.numeric(localfinaltable[,2]) > qkfold,]

      #Using all
      hits = getHits(panel=ensembles[[i]]$panel,genes=localfinaltable[,1],
                     brandnew=ensembles[[i]]$evaldisease)

      #Now the hits
      hits = getHits(panel=ensembles[[i]]$panel,genes=localfinaltable[,1])
      if(!is.null(hits)){
        allhits = rbind(allhits,c(ensembles[[i]]$panel,
                                  ensembles[[i]]$method,
                                  q,
                                  qkfold,
                                  nrow(localfinaltable),
                                  hits$newgenes,
                                  hits$hits,
                                  hits$fold))
      }

      cat("Trying gProfilerR query for gene quality",q,"and fold quality",qkfold,
          "and",nrow(localfinaltable),"genes\n")
      if(nrow(localfinaltable) > 0 & nrow(localfinaltable) < 4000 & doGO){
        cat("Doing gProfilerR query for gene quality",q,"and fold quality",qkfold,
            ",yields",nrow(localfinaltable),"genes\n")
        res = gProfileR::gprofiler(query=localfinaltable[,1],
                                   src_filter=c("GO","KEGG","REAC","HP","OMIM"))
        #res=matrix(nrow=0,ncol=0)
        cat("We got",nrow(res),"enrichment terms\n")
        if(nrow(res) > 0){
          res$intersection = NULL
          write.table(res,paste0(out.path,"/",localsaveprefix,"_predsGO_",q,"_",qkfold,".tsv"),
                      row.names=F,quote=F,sep="\t")

        }

      }
    }

    if(!is.null(allhits)){
      globalallhits = rbind(globalallhits,allhits)
    }
    if(!is.null(hitsperfold)){
      #c(i,q,untilTree,length(milked),hits$newgenes,hits$hits,hits$fold)
      globalhitsperfold = rbind(globalhitsperfold,hitsperfold)
    }
  }
  if(!is.null(globalallhits)){
    colnames(globalallhits) = c("panel","method","ensembleq","kfoldq","predictions","newgenes","hits","enrichment")
  }

  colnames(globalhitsperfold) = c("panel","method","fold","q","trees","predictions","evalgenes","hits","enrichment")
  allkappas = c(k,mean(kappa),max(kappa),min(kappa))
  names(allkappas) = c("folds","meankappa","maxkappa","minkappa")
  mask = globalhitsperfold[,"fold"] == as.character(k + 1)
  iplusoneevals = globalhitsperfold[mask,]
  qfold = iplusoneevals[which.max(as.numeric(iplusoneevals[,"enrichment"])),"q"]

  q=tapply(as.numeric(globalhitsperfold[,"enrichment"]),globalhitsperfold[,"q"],mean)
  bestq = as.numeric(names(q)[which.max(q)][1])

  #Now the final predictions
  diseasegenes = NULL
  for(i in 1:k){
    cat("Now we generate predictions\n")
    milked = milkEnsemble(ensemble = ensembles[[i]],
                          cutoff=bestq,
                          trees=untilTree,
                          remove=c(ensembles[[i]]$genes[ensembles[[i]]$condition == "Disease"]))

    #hits = getHits(panel=ensembles[[i]]$panel,
    #               genes=milked,
    #               brandnew=ensembles[[i]]$evaldisease)

    diseasegenes = c(diseasegenes, milked)
  }

  if(length(diseasegenes) > 0){
    diseasegenes = table(diseasegenes)
    diseasegenes = sort(diseasegenes,decreasing=T)
    genes = names(diseasegenes)
    diseasegenes = as.vector(diseasegenes)
    diseasegenes = diseasegenes/k
  }

  names(diseasegenes) = genes
  finalpreds = genes[diseasegenes >= qfold]
  return(list(hits=globalallhits,
              hitsperfold=globalhitsperfold,
              allpredictions=diseasegenes,
              bestq=bestq,
              qfold=qfold,
              finalpreds=finalpreds,
              kappa=allkappas))
}

evalEnsembles = function(ensembles){
  qs = c(0.5,0.6,0.7,0.8,0.9,0.99)
  k = length(ensembles$models)
  diseasegenes = NULL
  minkappa = 1
  maxkappa = -1
  meankappa = 0
  for(i in 1:k){
    kappa = NULL
    untilTree = ensembles$models[[i]]$nboot
    cat("First we evaluate without predictions\n")
    for(q in qs){
      evaluation = evalEnsemble(ensemble = ensembles$models[[i]],
                                cutoff=q,
                                trees=untilTree)
      kappa = c(kappa,evaluation$kappa)
    }
    localmaxkappa = max(kappa)
    ensembles$models[[i]]$evaluation = evalEnsemble(ensemble = ensembles$models[[i]],
                                             cutoff=qs[which.max(kappa)],
                                             trees=untilTree)
    ensembles$models[[i]]$bestq = qs[which.max(kappa)]
    ensembles$models[[i]]$kappa = localmaxkappa
    milked = milkEnsemble(ensemble = ensembles$models[[i]],
                          cutoff=ensembles$models[[i]]$bestq,
                          trees=untilTree,
                          remove=c(ensembles$models[[i]]$genes[ensembles$models[[i]]$condition == "Disease"]))
    diseasegenes = c(diseasegenes, milked)
    minkappa = min(c(minkappa,kappa))
    maxkappa = max(c(maxkappa,kappa))

  }


  diseasegenes = table(diseasegenes)
  diseasegenes = sort(diseasegenes,decreasing=T)
  genes = names(diseasegenes)
  diseasegenes = as.vector(diseasegenes)
  diseasegenes = diseasegenes/k
  finaltable = as.data.frame(cbind(genes=genes,quality=diseasegenes),stringsAsFactors=F)
  colnames(finaltable) = c("gene","quality")

  names(diseasegenes) = genes
  finalpreds = genes[diseasegenes >= 0.5]
  ensembles$allpredictions=finaltable
  ensembles$minkappa=minkappa
  ensembles$maxkappa=maxkappa
  ensembles$finalpreds=finalpreds
  return(ensembles)
}

evalEnsembleOneShot = function(ensemble,
                               cutoff=0.9,
                               balance=50,
                               alpha=0.9,
                               trees=-1){

  if(length(trees) == 1){
    if(trees < 0 | trees > length(ensemble$model))
      trees = length(ensemble$model)
  }

  if(trees < length(ensemble$model)){
    allgenes = getCodingGenome()
    ensemble$preds = ensemblePredict(genes=allgenes,
                                     ensemble=ensemble$model,
                                     vars=ensemble$vars,
                                     cutoff=cutoff,
                                     model.indexes=ensemble$modelindexes[1:trees],
                                     method=ensemble$method)
  }

  #Let us eval the disease genes
  gindexes = which(ensemble$preds$genes %in% ensemble$evaldisease)
  allcgindexes = which(ensemble$preds$genes %in% ensemble$evalctrl)
  preds = ensemble$preds$rawpredictions

  alloutpreds = NULL
  allstats = NULL
  outpreds = NULL
  localstats = NULL
  done = 0
  for(b in 1:balance){
    cgindexes = sample(allcgindexes,length(gindexes))
    gtruth = c(rep("Disease",
                   length(gindexes)),
               rep("Nondisease",
                   length(cgindexes)))

    allindexestoeval = c(gindexes,cgindexes)
    numpreds = rep("Disease",length(allindexestoeval))
    numpreds[ensemble$preds$quality[allindexestoeval] < cutoff] = "Nondisease"
    itstats = stats(numpreds,gtruth,alpha)
    #if(sum(is.nan(unlist(itstats))) == 0){
    localstats = rbind(localstats,c(itstats$ppv,itstats$npv,
                                    itstats$kappa,itstats$mcc,
                                    itstats$bacc,itstats$f1,itstats$auc,
                                    itstats$sens,
                                    itstats$spec,itstats$costerror))
    # done = done + 1
    #}
  }
  hitsperfold=NULL
  for(q in c(0.5,0.8,0.9,0.99)){
    diseasegenes = NULL
    milked = milkEnsemble(ensemble = ensemble,
                          cutoff=q,
                          trees=trees,
                          remove=c(ensemble$genes[ensemble$condition == "Disease"]))

    hits = getHits(panel=ensemble$panel,
                   genes=milked,
                   brandnew=ensemble$evaldisease)

    hitsperfold = rbind(hitsperfold,c(ensemble$panel,
                                      ensemble$method,
                                      q,
                                      trees,
                                      length(milked),
                                      hits$newgenes,
                                      hits$hits,
                                      hits$fold))


  }

  localstats = na.omit(localstats)
  return(list(auc=mean(localstats[,7]),
              ppv=mean(localstats[,1]),
              kappa=mean(localstats[,3]),
              mcc=mean(localstats[,4]),
              bacc=mean(localstats[,5]),
              sens=mean(localstats[,8]),
              spec=mean(localstats[,9]),
              costerror=mean(localstats[,10]),hits=hitsperfold))
}

evalEnsemble = function(ensemble,
                        cutoff=0.9,
                        balance=50,
                        alpha=0.9,
                        trees=-1){

  if(length(trees) == 1){
    if(trees < 0 | trees > length(ensemble$model))
      trees = length(ensemble$model)
  }

  if(trees < length(ensemble$model)){
    allgenes = getCodingGenome()
    ensemble$preds = ensemblePredict(genes=allgenes,
                                     ensemble=ensemble$model,
                                     vars=ensemble$vars,
                                     cutoff=cutoff,
                                     model.indexes=ensemble$modelindexes[1:trees],
                                     method=ensemble$method)
  }

  #Let us eval the disease genes
  gindexes = which(ensemble$preds$genes %in% ensemble$evaldisease)
  allcgindexes = which(ensemble$preds$genes %in% ensemble$evalctrl)
  preds = ensemble$preds$rawpredictions

  alloutpreds = NULL
  allstats = NULL
  outpreds = NULL
  localstats = NULL
  done = 0
  for(b in 1:balance){
    cgindexes = sample(allcgindexes,length(gindexes))
    gtruth = c(rep("Disease",
                   length(gindexes)),
               rep("Nondisease",
                   length(cgindexes)))

    allindexestoeval = c(gindexes,cgindexes)
    numpreds = rep("Disease",length(allindexestoeval))
    numpreds[ensemble$preds$quality[allindexestoeval] < cutoff] = "Nondisease"
    itstats = stats(numpreds,gtruth,alpha)
    #if(sum(is.nan(unlist(itstats))) == 0){
    localstats = rbind(localstats,c(itstats$ppv,itstats$npv,
                                    itstats$kappa,itstats$mcc,
                                    itstats$bacc,itstats$f1,itstats$auc,
                                    itstats$sens,
                                    itstats$spec,itstats$costerror))
  }

  localstats = na.omit(localstats)
  return(list(auc=mean(localstats[,7]),
              ppv=mean(localstats[,1]),
              kappa=mean(localstats[,3]),
              mcc=mean(localstats[,4]),
              bacc=mean(localstats[,5]),
              sens=mean(localstats[,8]),
              spec=mean(localstats[,9]),
              costerror=mean(localstats[,10])))
}

milkEnsemble = function(ensemble,
                        cutoff=0.9,
                        remove=NULL,
                        trees=-1){

  if(!is.null(ensemble$model)){
    if(trees < 0 | trees > length(ensemble$model))
      trees = length(ensemble$model)

    if(trees < length(ensemble$model)){
      allgenes = getCodingGenome()
      ensemble$preds = ensemblePredict(genes=allgenes,
                                       ensemble=ensemble$model,
                                       vars=ensemble$vars,
                                       cutoff=cutoff,
                                       model.indexes=ensemble$modelindexes[1:trees],
                                       method=ensemble$method)
    }

  }
  #Let us eval the disease genes
  if(!is.null(remove)){
    genemask = !(ensemble$preds$genes %in% remove)
    ensemble$preds$genes = ensemble$preds$genes[genemask]
    ensemble$preds$quality = ensemble$preds$quality[genemask]
  }
  return(ensemble$preds$genes[ensemble$preds$quality >= cutoff])
}

getPredictedGenes = function(ens,q){
  return(ens$allpredictions$gene[ens$allpredictions$quality >= q])
}

getHits = function(panel,genes,
                   newgenespath=paste0(system.file("g2pml/gel/", "", package = "G2PML"),
                                       "geOctober2018/"),
                   oldgenespath=paste0(system.file("g2pml/gel/", "", package = "G2PML"),
                                       "geApril2018/"),
                   evidence = c("HighEvidence","ModerateEvidence"),
                   brandnew=NULL,
                   backgroundgenes=18000)
{
  #cat("Getting hits for panel",panel,"\n")
  ngf = paste0(newgenespath,"/",panel,".csv")
  ogf = paste0(oldgenespath,"/",panel,".csv")
  if(!file.exists(ngf) | !file.exists(ogf))
    return(NULL)

  if(is.null(brandnew)){
    newgenes = read.csv(ngf,stringsAsFactors=F)
    newgenes = newgenes$GeneSymbol[newgenes$LevelOfConfidence %in% evidence]
    oldgenes = read.csv(ogf,stringsAsFactors=F)
    oldgenes = oldgenes$GeneSymbol[oldgenes$LevelOfConfidence %in% "HighEvidence"]
    brandnew = setdiff(newgenes,oldgenes)
    brandnew = na.omit(fromSymbol2Hugo(brandnew))
    #print("new genes from newer panel are")
    #print(brandnew)
  }


  if(length(brandnew) > 0){
    bgrndprob = length(genes)/backgroundgenes
    localhits = sum(brandnew %in% genes)
    #cat("We got",localhits,"hits\n")
    fold = (localhits/length(brandnew))/bgrndprob
    return(list(hits=localhits,fold=fold,newgenes=length(brandnew),predictions=length(genes)))
  }#else
  #cat("No new genes to get hits from\n")
  return(NULL)
}


stats = function(preds,gtruth,alpha=0.5){
  library(pROC)
  tp = sum(preds == "Disease" & gtruth == "Disease")
  fp = sum(preds == "Disease" & gtruth == "Nondisease")
  tn = sum(preds == "Nondisease" & gtruth == "Nondisease")
  fn = sum(preds == "Nondisease" & gtruth == "Disease")
  pos = sum(gtruth == "Disease")
  posuseful = sum(gtruth == "Disease" & preds != "DontKnow")
  neg = sum(gtruth == "Nondisease")
  neguseful = sum(gtruth == "Nondisease" & preds != "DontKnow")

  dnratio = sum(preds == "DontKnow")/length(preds)
  diseaseratio = sum(preds == "Disease")/length(preds)
  ndiseaseratio = sum(preds == "Nondisease")/length(preds)
  derror = fp/(tp + fp)

  fprate = fp/(tp+fp)
  fnrate = fn/(tn+fn)
  costerror = (alpha*fprate + (1-alpha)*fnrate)/(neg + pos)
  if(is.nan(costerror))
    costerror = 0

  sensitivity = tp/(tp + fn)
  specificity = tn/(fp + tn)
  ppv = tp/(tp + fp)
  npv = tn/(tn + fn)
  if(is.nan(npv))
    npv = 0
  acc = (tp + tn)/sum(preds == "Disease" | preds == "Nondisease")
  bacc = (tp/pos + tn/neg)/2
  f1 = 2*tp/(2*tp + fp + fn)
  mcc = (tp*tn - fp*fn)/sqrt(as.numeric(tp+fp)*as.numeric(tp+fn)*as.numeric(tn+fp)*as.numeric(tn+fn))
  if(is.nan(mcc))
    mcc = -1
  #cat("mcc:",mcc,"\n")
  randomacc = ((tn+fp)*(tn+fn) + (fn+tp)*(fp+tp))/(tp+tn+fp+fn)^2
  kappa = (((tp+tn)/(tp+tn+fp+fn)) - randomacc) / (1 - randomacc)
  #if(length(unique(preds)) == 2)
  #  auc = auc(preds,gtruth,plotit=F)
  #else
  auc = 0.4
  return(list(auc=auc,sens=sensitivity,spec=specificity,
              ppv=ppv,npv=npv,acc=acc,bacc=bacc,f1=f1,
              mcc=mcc,kappa=kappa,
              diseaseratio=diseaseratio,
              ndiseaseratio=ndiseaseratio,
              dnratio=dnratio,
              costerror=costerror,
              derror=derror))
}

g2pmlRun = function(thresholds=seq(0.3,0.8,0.1),
                    genes,
                    seed=12345,
                    sizes = c(5,10,20,30),
                    kfs=5,
                    k=5,
                    controls="allgenome",
                    trnProp=0.8,
                    repeats=10,
                    ...){

  ensembles = NULL
  fs = featureSelection(genes=genes,
                        seed=seed,
                        sizes=sizes,
                        k=kfs,
                        controls=controls,
                        trnProp=0.8,
                        repeats=repeats)

  for(threshold in thresholds){
    key = as.character(threshold)
    if(length(getVarsFromFS(fsdata = fs,r = threshold)) > 20)
      ensembles[[key]] = ensembleLearnKFold(genes=genes,fs=fs,k=k,
                                            fsThreshold = threshold,...)
  }
  ensembles[["fs"]] = fs
  return(ensembles)


}
juanbot/G2PML documentation built on Aug. 1, 2020, 5:07 a.m.