## R/functions.R


## Prepare training and prediction set
#############
## Functions
#############

## Marking of training/prediction set
markTrainCGC <- function(df){

  geneInfo=sysSVM:::geneInfo
  cancerGenes=sysSVM:::cancerGenes

  ## Get information for cgc/cans and primary sites
  ## Fix gene info table from NCG
  geneInfo = geneInfo %>% select(entrez, cancer_type) %>% unique
  ## Get a cancer gene with all the associated primary sites and cancer sites
  cancerGenes = cancerGenes %>% select(entrez, primary_site, cancer_site) %>%
    group_by(entrez) %>% summarise(primary_site=paste(unique(primary_site), collapse=","),
                                   cancer_site=paste(unique(cancer_site), collapse=",")) %>%
    ungroup
  geneInfo = geneInfo %>% left_join(cancerGenes, by=c("entrez")) %>%
    rename(gene_type=cancer_type)

  ## Get false positive genes
  false_positive_genes <- sysSVM:::false_positive_genes
  false_positive_genes <- false_positive_genes %>% select(entrez, symbol)

  ## Get primary site information for the df
  df = df %>% left_join(geneInfo, by=c("entrez"="entrez"))

  ## Training set extraction
  cat("Marking genes for training...", "\n")
  #df = df %>% mutate(TP=ifelse((vogel=="Vog.Oncogene" | vogel=="Vog.TS") & (no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | CNVGain==1 | CNVLoss==1 | ExpT_NET==1), T, F))
  #if( df%>%grepl(tissue, primary_site)%>%nrow==0 ) stop("Tissue not present")


  df = df %>% mutate(TP=ifelse((gene_type=="cgc") & (no_TRUNC_muts!=0 |
                                                       no_NTDam_muts!=0 |
                                                       no_GOF_muts!=0 |
                                                       (CNVGain==1) |
                                                       (Copy_number==0) |
                                                       ((Copy_number==1) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)) |
                                                       BND!=0 |
                                                       INS!=0 |
                                                       INV!=0) & !(entrez%in%false_positive_genes$entrez), T, F))


  cat("Adding type annotation for training...", "\n")

  df = df %>% mutate(type=ifelse(TP==T, "C", NA))

  df = df %>% select(-TP)
  ## For cancer-specific training set
  df = df %>% select(-gene_type, -primary_site, -cancer_site)
  df$type = as.factor(df$type)

  ## Exclude genes without any alterations from the training & prediction sets
  ## (exclusion of the NCG false positives is available as an option; by default they are not excluded)
  df = df %>% subset((no_TRUNC_muts!=0 |
                        no_NTDam_muts!=0 |
                        no_GOF_muts!=0 |
                        (CNVGain==1) |
                        (Copy_number==0) |
                        ((Copy_number==1) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)) |
                        BND!=0 |
                        INS!=0 |
                        INV!=0) )

  return(df)
}
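
## Usage sketch (not run): a minimal example of how the marking step could be applied
## to the bundled OAC table, assuming it carries the alteration columns used above
## (no_TRUNC_muts, no_NTDam_muts, no_GOF_muts, CNVGain, Copy_number, BND, INS, INV).
if (FALSE) {
  require(dplyr)
  d <- data.frame(sysSVM:::oac_data)
  d <- markTrainCGC(df = d)
  table(d$type, useNA = "ifany")   # "C" = training observations, NA = prediction set
}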

## Scaling
getScaledTable <- function(df, type="cs", models=NULL, scaling.factors=NULL){

  ## is.nan for data frames
  is.nan.data.frame <- function(x){do.call(cbind, lapply(x, is.nan))}

  cat("Feature scaling: All factors are excluded from scaling...\n")

  factor_cols <- names(sapply(df, function(x) class(x))[sapply(df, function(x) class(x))=="factor"])

  df_f <- df[,names(df)%in%factor_cols]

  df <-df[,!(names(df)%in%factor_cols)]

  if(is.null(scaling.factors)){
    ## Get mean and sd for scaling new observations
    scaling.factors = sapply(df, function(cl) list(means=mean(cl,na.rm=TRUE), sds=sd(cl,na.rm=TRUE)))
    cat("Scaling...", "\n")
    df <- data.frame(lapply(df, function(x) (x-mean(x, na.rm = TRUE))/sd(x, na.rm = TRUE)))
    df <- cbind(df, df_f)
    #df <- data.frame(lapply(df, function(x) rescale(x, to=c(-1,1)))) ## only cs scaling for now
  }else{
    ## `scaling.factors` is expected to be the path to the .Rdata file written in
    ## training mode, which stores the per-feature means and sds (saved as `scaling.factors`)
    load(scaling.factors)
    cat("Scaling...", "\n")
    if (type=="cs"){
      df <- data.frame(lapply(seq_along(df), function(y,n,i) {
        (y[i]-unlist(scaling.factors["means",n[i]]))/unlist(scaling.factors["sds",n[i]])
      }, y=df, n=names(df)))
      df <- cbind(df, df_f)
    }
  }

  return(list(df_scaled=df, scaling.factors=scaling.factors))
}
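
## Minimal sketch of the scaling convention used above (per-feature z-scores), on toy
## data that is not part of the package; `sf` mirrors the structure of the stored
## `scaling.factors` (a list-matrix with rows "means"/"sds" and one column per feature).
if (FALSE) {
  toy <- data.frame(a = c(1, 2, 3, 4), b = c(10, 20, 30, 40))
  sf  <- sapply(toy, function(cl) list(means = mean(cl, na.rm = TRUE),
                                       sds   = sd(cl, na.rm = TRUE)))
  scale_new <- function(x, feat) (x - unlist(sf["means", feat])) / unlist(sf["sds", feat])
  scale_new(5, "a")   # z-score of a new observation for feature "a"
}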


## Create training and prediction sets
createDescribeTrainingCGC = function(output.dir=NULL, exclude.features=NULL, trainingMode=TRUE, models=NULL, scaling.factors=NULL){

  require(dplyr)
  require(tidyr)

  ## Get data
  d = sysSVM:::oac_data

  FEATURES = colnames(d)[!colnames(d)%in%exclude.features]

  d = d %>% select(one_of(c("sample", "entrez", FEATURES))) %>% data.frame()
  ## Binary features need to be converted to factors
  cat("Converting binary features to factors", "\n")

  ## Find which features have only two levels and convert them to factors
  to.be.factors = lapply(d, function(x){
    length(unique(x))==2
  })

  fcols=colnames(d)[unlist(to.be.factors)]
  fcols = fcols[!fcols%in%c("sample", "entrez")]

  cols <- which(colnames(d) %in% fcols)
  for(i in cols){
    d[,i] = factor(d[,i], levels = c(0,1))
  }


  ## Mark training observations before you scale
  if(trainingMode){
    cat("Creating training and prediction set", "\n")

    if (!file.exists(output.dir)){
      dir.create(file.path(output.dir))
    }else{
      stop("sysSVM ERROR: Output directory exists. Can't overwrite...")
    }

    d = markTrainCGC(df=d)

    rnames = paste0(d$sample, ".", d$entrez)
    d=d %>% select(-sample, -entrez) %>% data.frame()
    rownames(d) = rnames
    ## Save training and prediction set without scaling
    training_ns = d %>% subset(!is.na(type))
    save(training_ns, file=paste(output.dir,"/training_set_noScale.Rdata", sep=""))

    prediction_ns = d %>% subset(is.na(type)) %>% select(-type)
    save(prediction_ns, file=paste(output.dir,"/prediction_set_noScale.Rdata", sep=""))


    ## Scale dataset
    d = getScaledTable(df = d, scaling.factors = scaling.factors)
    scaling.factors = d[["scaling.factors"]]
    ## Save means and sds
    if(!is.null(scaling.factors)){
      save(scaling.factors, file=paste(output.dir,"/scaling.factors.Rdata", sep=""))
    }

    d = d[["df_scaled"]]
    training = d %>% subset(!is.na(type))
    save(training, file=paste(output.dir,"/training_set.Rdata", sep=""))

    prediction = d %>% subset(is.na(type)) %>% select(-type)
    save(prediction, file=paste(output.dir,"/prediction_set.Rdata", sep=""))

  }else{
    ## Scale dataset
    if(is.null(scaling.factors)){
      stop("sysSVM ERROR: Please provide scaling values or train sysSVM", "\n")
    }
    cat("Scaling data and preparing table for prediction...", "\n")

    if (!file.exists(output.dir)){
      dir.create(file.path(output.dir))
    }else{
      stop("sysSVM ERROR: Output directory exists. Can't overwrite...")
    }

    save(d, file=paste(output.dir,"/prediction_set_noScale.Rdata", sep=""))
    d = getScaledTable(df = d, scaling.factors = scaling.factors)

    d = d[["df_scaled"]]
    save(d, file=paste(output.dir,"/prediction_set.Rdata", sep=""))
  }

}
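
## Typical training-mode call (a sketch; argument values are examples and the output
## directory must not already exist). Writes the training/prediction sets, their
## unscaled versions and the scaling factors into `output.dir`.
if (FALSE) {
  createDescribeTrainingCGC(output.dir = "sysSVM_run",
                            exclude.features = NULL,
                            trainingMode = TRUE)
}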


## Run novelty detection
runNoveltyDetection = function(output.dir=NULL, cv=3, iters=100,
                               kernels=c("linear", "polynomial", "radial", "sigmoid"),
                               cores=2){

  require(doSNOW)
  require(parallel)

  ## Constants
  NU = seq(0.05, 0.9, 0.05)
  GAMMA = 2^seq(-7,4)
  DEGREE = c(3, 4, 9)

  LO=1 ## Leave-out: CV-1 folds are used for training and 1 fold for testing
  CV=cv

  ## Start with linear
  cat("Grid search for:", "\n")

  create.output.folder = function (output.dir, kern, mynu, mygamma, mydegree, no) {

    if(length(grep("CV",list.dirs(output.dir, recursive = F, full.names=F)))==0){
      dir.create(paste(output.dir, "/CV", sep=""))
    }
    timer    = format(Sys.time(), "%Y-%m-%d.%H_%M_%S")
    if(kern=="linear"){
      job_dir = paste(output.dir, "/CV/", kern,".",mynu, sep="" )
      if(!file.exists(job_dir)){
        dir.create(job_dir)
      }
    }else if (kern=="radial"){
      job_dir = paste(output.dir, "/CV/", kern,".", mynu, ".", mygamma, sep="")
      if(!file.exists(job_dir)){
        dir.create(job_dir)
      }
    }else if (kern=="polynomial"){
      job_dir = paste(output.dir, "/CV/", kern,".", mynu, ".", mygamma, ".", mydegree, sep="")
      if(!file.exists(job_dir)){
        dir.create(job_dir)
      }
    }else if (kern=="sigmoid"){
      job_dir = paste(output.dir, "/CV/", kern,".", mynu, ".", mygamma, sep="")
      if(!file.exists(job_dir)){
        dir.create(job_dir)
      }
    }
    iteration_dir = paste(job_dir, "/iteration",".", no, sep="")
    if(!file.exists(iteration_dir)){
      dir.create(iteration_dir)
    }
    return(iteration_dir)
  }
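
  ## Resulting directory layout (for reference):
  ##   <output.dir>/CV/<kernel>.<nu>[.<gamma>[.<degree>]]/iteration.<no>/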

  ## get the data from path (training)
  load(paste(output.dir, "/training_set.Rdata", sep = ""))
  pgenes <- training %>% tibble::rownames_to_column() %>% separate(rowname, into=c("sample", "entrez"), sep="\\.") %>% subset(type=="C") %>% select(entrez) %>% unique %>% .$entrez

  cv_stats_full = NULL
  for(k in kernels){
    cat(k, "\n")
    if(k=="linear"){
      total = length(NU)

      cl <- makeCluster(cores)
      registerDoSNOW(cl)
      # create progress bar
      pb <- txtProgressBar(max = total, style = 3)
      progress <- function(n) setTxtProgressBar(pb, n)
      opts <- list(progress = progress)
      result <- foreach(i = 1:total, .combine = rbind, .packages=c('caret', 'sampling', 'e1071', 'tidyr', 'dplyr'),
                        .options.snow = opts) %dopar%
                        {

                            Sys.sleep(0.1)
                            mynu=NU[i]
                            mygamma=0
                            mydegree=0

                            cv_stats_grid = NULL
                            for(y in 1:iters){
                              iteration_dir <- create.output.folder(output.dir, k, mynu, mygamma, mydegree, y)
                              print(iteration_dir)
                              traingenes = sample(pgenes,ceiling(((CV-LO)/CV)*length(pgenes)))
                              testgenes = pgenes[!(pgenes%in%traingenes)]

                              ## Extract trainset and testset
                              trainset <- training %>% tibble::rownames_to_column() %>% separate(rowname, into=c("sample", "entrez"), sep="\\.") %>% subset(entrez %in% traingenes)
                              testset <-training %>% tibble::rownames_to_column() %>% separate(rowname, into=c("sample", "entrez"), sep="\\.") %>% subset(entrez %in% testgenes)

                              ## Make Cancer_type, Sample and Entrez row names
                              trainset <- trainset %>% mutate(key=paste(sample, entrez, sep=".")) %>%
                                select(-sample, -entrez)
                              rnames <- trainset$key
                              trainset <- trainset %>% select(-key) %>% data.frame()
                              row.names(trainset) <- rnames

                              testset <- testset %>% mutate(key=paste(sample, entrez, sep=".")) %>%
                                select(-sample, -entrez)
                              rnames <- testset$key
                              testset <- testset %>% select(-key) %>% data.frame()
                              row.names(testset) <- rnames

                              ## Exclude columns that are NaN - this may be changed in later versions to make the column all 0s
                              trainset=Filter(function(x)!all(is.nan(x)), trainset)
                              testset=Filter(function(x)!all(is.nan(x)), testset)

                              ## Store the train and testset
                              save(trainset, file=paste(iteration_dir, "/trainset.Rdata", sep=""))
                              save(testset, file=paste(iteration_dir, "/testset.Rdata", sep=""))

                              ## -----------------------------------------------------
                              ##                  SVM
                              ## -----------------------------------------------------

                              svm.model <- svm(type ~ ., data = trainset, kernel=k, type="one-classification", scale=FALSE, nu=mynu)
                              save(svm.model, file=paste(iteration_dir, "/svmModel.Rdata", sep=""))

                              prediction <- predict(svm.model, testset%>%select(-type), decision.values = TRUE, probability = TRUE)
                              ## save prediction in a file
                              save(prediction, file=paste(iteration_dir, "/prediction.Rdata", sep=""))

                              ## Get the performance
                              noTrain = trainset %>% nrow
                              ## ---------------------- Test set -------------------------------------
                              ## Concatenate the predictions to the true values
                              testset$prediction = prediction
                              testset = testset %>% select(type, prediction)
                              testset = testset %>% mutate(p=ifelse(prediction==TRUE, "C", "N"))
                              testset$type = factor(as.character(testset$type), levels = c("C", "N"))
                              testset$p = factor(as.character(testset$p), levels = c("C", "N"))

                              ## Performance metrics
                              ## Confusion matrix
                              cv_stats = NULL
                              cM <- confusionMatrix(testset$p,testset$type, positive = c("C"))
                              cv_stats <- rbind(cv_stats, data.frame(kernel=k,
                                                                     nu=mynu, gamma=mygamma, degree=mydegree,
                                                                     iteration=y,
                                                                     type="accuracy", trainSize=noTrain,
                                                                     class="overall", set = "test",
                                                                     value=cM$overall[["Accuracy"]]))
                              cv_stats <- rbind(cv_stats, data.frame(kernel=k,
                                                                     nu=mynu, gamma=mygamma, degree=mydegree,
                                                                     iteration=y,
                                                                     type="Sensitivity", trainSize=noTrain,
                                                                     class="overall", set = "test",
                                                                     value=cM$byClass["Sensitivity"]))
                              cv_stats <- rbind(cv_stats, data.frame(kernel=k,
                                                                     nu=mynu, gamma=mygamma, degree=mydegree,
                                                                     iteration=y,
                                                                     type="Specificity", trainSize=noTrain,
                                                                     class="overall", set = "test",
                                                                     value=cM$byClass["Specificity"]))

                              ## ---------------------- Train set -------------------------------------
                              trainset$fitted = svm.model$fitted
                              trainset = trainset %>% select(type, fitted)
                              trainset = trainset %>% mutate(f=ifelse(fitted==TRUE, "C", "N"))
                              trainset$type = factor(as.character(trainset$type), levels = c("C", "N"))
                              trainset$f = factor(as.character(trainset$f), levels = c("C", "N"))

                              ## Confusion matrix
                              cM <- confusionMatrix(trainset$f,trainset$type, positive = c("C"))
                              cv_stats <- rbind(cv_stats, data.frame(kernel=k,
                                                                     nu=mynu, gamma=mygamma, degree=mydegree,
                                                                     iteration=y,
                                                                     type="accuracy", trainSize=noTrain,
                                                                     class="overall", set = "train",
                                                                     value=cM$overall[["Accuracy"]]))
                              cv_stats <- rbind(cv_stats, data.frame(kernel=k,
                                                                     nu=mynu, gamma=mygamma, degree=mydegree,
                                                                     iteration=y,
                                                                     type="Sensitivity", trainSize=noTrain,
                                                                     class="overall", set = "train",
                                                                     value=cM$byClass["Sensitivity"]))
                              cv_stats <- rbind(cv_stats, data.frame(kernel=k,
                                                                     nu=mynu, gamma=mygamma, degree=mydegree,
                                                                     iteration=y,
                                                                     type="Specificity", trainSize=noTrain,
                                                                     class="overall", set = "train",
                                                                     value=cM$byClass["Specificity"]))
                              cv_stats_grid = rbind(cv_stats_grid, cv_stats)
                            }
                          return(cv_stats_grid)
                        }
      close(pb)
      stopCluster(cl)
      cv_stats_full = rbind(cv_stats_full, result)
    }

    if(k=="polynomial"){
      param_grid=expand.grid(NU, GAMMA, DEGREE)
      total = nrow(param_grid)

      cl <- makeCluster(cores)
      registerDoSNOW(cl)
      # create progress bar
      pb <- txtProgressBar(max = total, style = 3)
      progress <- function(n) setTxtProgressBar(pb, n)
      opts <- list(progress = progress)
      result <- foreach(i = 1:total, .combine = rbind, .packages=c('caret', 'sampling', 'e1071', 'tidyr', 'dplyr'),
                        .options.snow = opts) %dopar%
                        {

                          Sys.sleep(0.1)
                          mynu=param_grid[i, 1]
                          mygamma=param_grid[i, 2]
                          mydegree=param_grid[i, 3]

                          cv_stats_grid = NULL
                          for(y in 1:iters){
                            iteration_dir <- create.output.folder(output.dir, k, mynu, mygamma, mydegree, y)

                            traingenes = sample(pgenes,ceiling(((CV-LO)/CV)*length(pgenes)))
                            testgenes = pgenes[!(pgenes%in%traingenes)]

                            ## Extract trainset and testset
                            trainset <- training %>% tibble::rownames_to_column() %>% separate(rowname, into=c("sample", "entrez"), sep="\\.") %>% subset(entrez %in% traingenes)
                            testset <-training %>% tibble::rownames_to_column() %>% separate(rowname, into=c("sample", "entrez"), sep="\\.") %>% subset(entrez %in% testgenes)

                            ## Make Cancer_type, Sample and Entrez row names
                            trainset <- trainset %>% mutate(key=paste(sample, entrez, sep=".")) %>%
                              select(-sample, -entrez)
                            rnames <- trainset$key
                            trainset <- trainset %>% select(-key) %>% data.frame()
                            row.names(trainset) <- rnames

                            testset <- testset %>% mutate(key=paste(sample, entrez, sep=".")) %>%
                              select(-sample, -entrez)
                            rnames <- testset$key
                            testset <- testset %>% select(-key) %>% data.frame()
                            row.names(testset) <- rnames

                            ## Exclude columns that are NaN - this may be changed in later versions to make the column all 0s
                            trainset=Filter(function(x)!all(is.nan(x)), trainset)
                            testset=Filter(function(x)!all(is.nan(x)), testset)

                            ## Store the train and testset
                            save(trainset, file=paste(iteration_dir, "/trainset.Rdata", sep=""))
                            save(testset, file=paste(iteration_dir, "/testset.Rdata", sep=""))

                            ## -----------------------------------------------------
                            ##                  SVM
                            ## -----------------------------------------------------

                            svm.model <- svm(type ~ ., data = trainset, kernel=k, type="one-classification", scale=FALSE, gamma = mygamma, nu=mynu, degree = mydegree)
                            save(svm.model, file=paste(iteration_dir, "/svmModel.Rdata", sep=""))

                            prediction <- predict(svm.model, testset%>%select(-type), decision.values = TRUE, probability = TRUE)
                            ## save prediction in a file
                            save(prediction, file=paste(iteration_dir, "/prediction.Rdata", sep=""))

                            ## Get the performance
                            noTrain = trainset %>% nrow
                            ## ---------------------- Test set -------------------------------------
                            ## Concatenate the predictions to the true values
                            testset$prediction = prediction
                            testset = testset %>% select(type, prediction)
                            testset = testset %>% mutate(p=ifelse(prediction==TRUE, "C", "N"))
                            testset$type = factor(as.character(testset$type), levels = c("C", "N"))
                            testset$p = factor(as.character(testset$p), levels = c("C", "N"))

                            ## Performance metrics
                            ## Confusion matrix
                            cv_stats = NULL
                            cM <- confusionMatrix(testset$p,testset$type, positive = c("C"))
                            cv_stats <- rbind(cv_stats, data.frame(kernel=k,
                                                                   nu=mynu, gamma=mygamma, degree=mydegree,
                                                                   iteration=y,
                                                                   type="accuracy", trainSize=noTrain,
                                                                   class="overall", set = "test",
                                                                   value=cM$overall[["Accuracy"]]))
                            cv_stats <- rbind(cv_stats, data.frame(kernel=k,
                                                                   nu=mynu, gamma=mygamma, degree=mydegree,
                                                                   iteration=y,
                                                                   type="Sensitivity", trainSize=noTrain,
                                                                   class="overall", set = "test",
                                                                   value=cM$byClass["Sensitivity"]))
                            cv_stats <- rbind(cv_stats, data.frame(kernel=k,
                                                                   nu=mynu, gamma=mygamma, degree=mydegree,
                                                                   iteration=y,
                                                                   type="Specificity", trainSize=noTrain,
                                                                   class="overall", set = "test",
                                                                   value=cM$byClass["Specificity"]))

                            ## ---------------------- Train set -------------------------------------
                            trainset$fitted = svm.model$fitted
                            trainset = trainset %>% select(type, fitted)
                            trainset = trainset %>% mutate(f=ifelse(fitted==TRUE, "C", "N"))
                            trainset$type = factor(as.character(trainset$type), levels = c("C", "N"))
                            trainset$f = factor(as.character(trainset$f), levels = c("C", "N"))

                            ## Confusion matrix
                            cM <- confusionMatrix(trainset$f,trainset$type, positive = c("C"))
                            cv_stats <- rbind(cv_stats, data.frame(kernel=k,
                                                                   nu=mynu, gamma=mygamma, degree=mydegree,
                                                                   iteration=y,
                                                                   type="accuracy", trainSize=noTrain,
                                                                   class="overall", set = "train",
                                                                   value=cM$overall[["Accuracy"]]))
                            cv_stats <- rbind(cv_stats, data.frame(kernel=k,
                                                                   nu=mynu, gamma=mygamma, degree=mydegree,
                                                                   iteration=y,
                                                                   type="Sensitivity", trainSize=noTrain,
                                                                   class="overall", set = "train",
                                                                   value=cM$byClass["Sensitivity"]))
                            cv_stats <- rbind(cv_stats, data.frame(kernel=k,
                                                                   nu=mynu, gamma=mygamma, degree=mydegree,
                                                                   iteration=y,
                                                                   type="Specificity", trainSize=noTrain,
                                                                   class="overall", set = "train",
                                                                   value=cM$byClass["Specificity"]))
                            cv_stats_grid = rbind(cv_stats_grid, cv_stats)
                          }
                          return(cv_stats_grid)
                        }
      close(pb)
      stopCluster(cl)
      cv_stats_full = rbind(cv_stats_full, result)
    }

    if(k=="radial" | k=="sigmoid"){
      param_grid=expand.grid(NU, GAMMA)
      total = nrow(param_grid)

      cl <- makeCluster(cores)
      registerDoSNOW(cl)
      # create progress bar
      pb <- txtProgressBar(max = total, style = 3)
      progress <- function(n) setTxtProgressBar(pb, n)
      opts <- list(progress = progress)
      result <- foreach(i = 1:total, .combine = rbind, .packages=c('caret', 'sampling', 'e1071', 'tidyr', 'dplyr'),
                        .options.snow = opts) %dopar%
                        {

                          Sys.sleep(0.1)
                          mynu=param_grid[i, 1]
                          mygamma=param_grid[i, 2]
                          mydegree=0
                          cv_stats_grid = NULL
                          for(y in 1:iters){
                            iteration_dir <- create.output.folder(output.dir, k, mynu, mygamma, mydegree, y)

                            traingenes = sample(pgenes,ceiling(((CV-LO)/CV)*length(pgenes)))
                            testgenes = pgenes[!(pgenes%in%traingenes)]

                            ## Extract trainset and testset
                            trainset <- training %>% tibble::rownames_to_column() %>% separate(rowname, into=c("sample", "entrez"), sep="\\.") %>% subset(entrez %in% traingenes)
                            testset <-training %>% tibble::rownames_to_column() %>% separate(rowname, into=c("sample", "entrez"), sep="\\.") %>% subset(entrez %in% testgenes)

                            ## Make Cancer_type, Sample and Entrez row names
                            trainset <- trainset %>% mutate(key=paste(sample, entrez, sep=".")) %>%
                              select(-sample, -entrez)
                            rnames <- trainset$key
                            trainset <- trainset %>% select(-key) %>% data.frame()
                            row.names(trainset) <- rnames

                            testset <- testset %>% mutate(key=paste(sample, entrez, sep=".")) %>%
                              select(-sample, -entrez)
                            rnames <- testset$key
                            testset <- testset %>% select(-key) %>% data.frame()
                            row.names(testset) <- rnames

                            ## Exclude columns that are NaN - this may be changed in later versions to make the column all 0s
                            trainset=Filter(function(x)!all(is.nan(x)), trainset)
                            testset=Filter(function(x)!all(is.nan(x)), testset)

                            ## Store the train and testset
                            save(trainset, file=paste(iteration_dir, "/trainset.Rdata", sep=""))
                            save(testset, file=paste(iteration_dir, "/testset.Rdata", sep=""))

                            ## -----------------------------------------------------
                            ##                  SVM
                            ## -----------------------------------------------------

                            svm.model <- svm(type ~ ., data = trainset, kernel=k, type="one-classification", scale=FALSE, gamma = mygamma, nu=mynu)
                            save(svm.model, file=paste(iteration_dir, "/svmModel.Rdata", sep=""))

                            prediction <- predict(svm.model, testset%>%select(-type), decision.values = TRUE, probability = TRUE)
                            ## save prediction in a file
                            save(prediction, file=paste(iteration_dir, "/prediction.Rdata", sep=""))

                            ## Get the performance
                            noTrain = trainset %>% nrow
                            ## ---------------------- Test set -------------------------------------
                            ## Concatenate the predictions to the true values
                            testset$prediction = prediction
                            testset = testset %>% select(type, prediction)
                            testset = testset %>% mutate(p=ifelse(prediction==TRUE, "C", "N"))
                            testset$type = factor(as.character(testset$type), levels = c("C", "N"))
                            testset$p = factor(as.character(testset$p), levels = c("C", "N"))

                            ## Performance metrics
                            ## Confusion matrix
                            cv_stats = NULL
                            cM <- confusionMatrix(testset$p,testset$type, positive = c("C"))
                            cv_stats <- rbind(cv_stats, data.frame(kernel=k,
                                                                   nu=mynu, gamma=mygamma, degree=mydegree,
                                                                   iteration=y,
                                                                   type="accuracy", trainSize=noTrain,
                                                                   class="overall", set = "test",
                                                                   value=cM$overall[["Accuracy"]]))
                            cv_stats <- rbind(cv_stats, data.frame(kernel=k,
                                                                   nu=mynu, gamma=mygamma, degree=mydegree,
                                                                   iteration=y,
                                                                   type="Sensitivity", trainSize=noTrain,
                                                                   class="overall", set = "test",
                                                                   value=cM$byClass["Sensitivity"]))
                            cv_stats <- rbind(cv_stats, data.frame(kernel=k,
                                                                   nu=mynu, gamma=mygamma, degree=mydegree,
                                                                   iteration=y,
                                                                   type="Specificity", trainSize=noTrain,
                                                                   class="overall", set = "test",
                                                                   value=cM$byClass["Specificity"]))

                            ## ---------------------- Train set -------------------------------------
                            trainset$fitted = svm.model$fitted
                            trainset = trainset %>% select(type, fitted)
                            trainset = trainset %>% mutate(f=ifelse(fitted==TRUE, "C", "N"))
                            trainset$type = factor(as.character(trainset$type), levels = c("C", "N"))
                            trainset$f = factor(as.character(trainset$f), levels = c("C", "N"))

                            ## Confusion matrix
                            cM <- confusionMatrix(trainset$f,trainset$type, positive = c("C"))
                            cv_stats <- rbind(cv_stats, data.frame(kernel=k,
                                                                   nu=mynu, gamma=mygamma, degree=mydegree,
                                                                   iteration=y,
                                                                   type="accuracy", trainSize=noTrain,
                                                                   class="overall", set = "train",
                                                                   value=cM$overall[["Accuracy"]]))
                            cv_stats <- rbind(cv_stats, data.frame(kernel=k,
                                                                   nu=mynu, gamma=mygamma, degree=mydegree,
                                                                   iteration=y,
                                                                   type="Sensitivity", trainSize=noTrain,
                                                                   class="overall", set = "train",
                                                                   value=cM$byClass["Sensitivity"]))
                            cv_stats <- rbind(cv_stats, data.frame(kernel=k,
                                                                   nu=mynu, gamma=mygamma, degree=mydegree,
                                                                   iteration=y,
                                                                   type="Specificity", trainSize=noTrain,
                                                                   class="overall", set = "train",
                                                                   value=cM$byClass["Specificity"]))
                            cv_stats_grid = rbind(cv_stats_grid, cv_stats)
                          }
                          return(cv_stats_grid)
                        }
      close(pb)
      stopCluster(cl)
      cv_stats_full = rbind(cv_stats_full, result)
    }

  }
  ## write metrics
  write.table(cv_stats_full, file=paste(output.dir,"/cv_stats.tsv",sep=""), quote=F, row.names = F, sep="\t")
}
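
## Sketch of the cross-validation grid search (argument values are examples). Expects
## <output.dir>/training_set.Rdata from createDescribeTrainingCGC; writes one folder per
## parameter combination and iteration under <output.dir>/CV/ and the collected metrics
## to <output.dir>/cv_stats.tsv.
if (FALSE) {
  runNoveltyDetection(output.dir = "sysSVM_run", cv = 3, iters = 10,
                      kernels = c("linear", "radial"), cores = 2)
}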


## Calculate the score of each gene across kernels
## Called by scoreGenes at each checkpoint (every `step` iterations) to combine the per-kernel predictions
getScore = function(path=NULL,
                    linear_preds=NULL, linearCVS=NULL, linearVar=NULL,
                    radial_preds=NULL, radialCVS=NULL, radialVar=NULL,
                    polynomial_preds=NULL, polynomialCVS=NULL, polynomialVar=NULL,
                    sigmoid_preds=NULL, sigmoidCVS=NULL, sigmoidVar=NULL){


  ## Get the number of predictions before intersection
  all_preds = NULL
  all_scores = NULL

  if(!is.null(linear_preds) & !is.null(linearCVS)){
    load(paste0(path,linear_preds))
    linear = preds
    rm(preds)
    ## Push it to the all_preds
    all_preds = rbind(all_preds, linear %>% subset(label) %>% mutate(kernel="linear", training="cgc") %>% data.frame())

    ## Kernel-specific score
    linear = linear %>% arrange(sample, dv) %>% group_by(sample) %>% mutate(rank=row_number(-dv)) %>% ungroup
    linear = linear %>% group_by(sample) %>% mutate(N=max(rank), Nlog10=log10(N)) %>% ungroup
    linear = linear %>% mutate(cvs=linearCVS, var=linearVar) %>% mutate(ratio=cvs/var)
    linear = linear %>% mutate(score=-log10(rank/N)*cvs)
    linear = linear %>% mutate(kernel="linear") %>% data.frame()
    all_scores = rbind(all_scores, linear)
  }

  if(!is.null(radial_preds) & !is.null(radialCVS)){
    load(paste0(path,radial_preds))
    radial = preds
    rm(preds)
    ## Push it to the all_preds
    all_preds = rbind(all_preds, radial %>% subset(label) %>% mutate(kernel="radial", training="cgc") %>% data.frame())

    ## Kernel-specific score
    radial = radial %>% arrange(sample, dv) %>% group_by(sample) %>% mutate(rank=row_number(-dv)) %>% ungroup
    radial = radial %>% group_by(sample) %>% mutate(N=max(rank), Nlog10=log10(N)) %>% ungroup
    radial = radial %>% mutate(cvs=radialCVS, var=radialVar) %>% mutate(ratio=cvs/var)
    radial = radial %>% mutate(score=-log10(rank/N)*cvs)
    radial = radial %>% mutate(kernel="radial") %>% data.frame()
    all_scores = rbind(all_scores, radial)
  }

  if(!is.null(polynomial_preds) & !is.null(polynomialCVS)){
    load(paste0(path,polynomial_preds))
    polynomial = preds
    rm(preds)
    ## Push it to the all_preds
    all_preds = rbind(all_preds, polynomial %>% subset(label) %>% mutate(kernel="polynomial", training="cgc") %>% data.frame())

    ## Kernel-specific score
    polynomial = polynomial %>% arrange(sample, dv) %>% group_by(sample) %>% mutate(rank=row_number(-dv)) %>% ungroup
    polynomial = polynomial %>% group_by(sample) %>% mutate(N=max(rank), Nlog10=log10(N)) %>% ungroup
    polynomial = polynomial %>% mutate(cvs=polynomialCVS, var=polynomialVar) %>% mutate(ratio=cvs/var)
    polynomial = polynomial %>% mutate(score=-log10(rank/N)*cvs)
    polynomial = polynomial %>% mutate(kernel="polynomial") %>% data.frame()
    all_scores = rbind(all_scores, polynomial)
  }

  if(!is.null(sigmoid_preds) & !is.null(sigmoidCVS)){
    load(paste0(path,sigmoid_preds))
    sigmoid = preds
    rm(preds)
    ## Push it to the all_preds
    all_preds = rbind(all_preds, sigmoid %>% subset(label) %>% mutate(kernel="sigmoid", training="cgc") %>% data.frame())

    ## Kernel-specific score
    sigmoid = sigmoid %>% arrange(sample, dv) %>% group_by(sample) %>% mutate(rank=row_number(-dv)) %>% ungroup
    sigmoid = sigmoid %>% group_by(sample) %>% mutate(N=max(rank), Nlog10=log10(N)) %>% ungroup
    sigmoid = sigmoid %>% mutate(cvs=sigmoidCVS, var=sigmoidVar) %>% mutate(ratio=cvs/var)
    sigmoid = sigmoid %>% mutate(score=-log10(rank/N)*cvs)
    sigmoid = sigmoid %>% mutate(kernel="sigmoid") %>% data.frame()
    all_scores = rbind(all_scores, sigmoid)
  }

  ## Final score for each gene across kernels
  scores = all_scores %>% group_by(sample, entrez, gene_type, N, Nlog10) %>%
    summarise(sum_of_scores=sum(score), kernels_used=length(unique(kernel))) %>%
    mutate(score=(1/(kernels_used*Nlog10))*sum_of_scores) %>% ungroup %>% data.frame()


  ## Add from how many kernels they have predicted as TRUE
  preds2kernels = all_preds %>% subset(label) %>% group_by(sample, entrez) %>% summarise(kernels_predicted=paste(unique(kernel), collapse=","), kernels_predicted_no=length(unique(kernel))) %>% ungroup
  scores = scores %>% left_join(preds2kernels)
  scores$kernels_predicted_no[is.na(scores$kernels_predicted_no)] = 0

  return(list(all_scores=all_scores, scores=scores))

}
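
## Toy numeric example (hypothetical values) of the scoring scheme above: per-kernel
## score = -log10(rank/N) * CVS, and the final score sums the per-kernel scores and
## normalises by the number of kernels and log10(N).
if (FALSE) {
  rank <- 5; N <- 1000                          # gene ranked 5th of 1000 in a sample
  kernel_cvs <- c(linear = 0.80, radial = 0.75) # cross-validation sensitivities (CVS)
  kernel_scores <- -log10(rank / N) * kernel_cvs
  sum(kernel_scores) / (length(kernel_cvs) * log10(N))  # combined score (~0.59 here)
}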

## Platt scaling
plattScaling = function(model, predictions){
  ## Implementation follows the pseudocode in:
  ## Platt (1999), "Probabilistic Outputs for Support Vector Machines and Comparisons to Regularized Likelihood Methods"
  ## Construct the training set for Platt's sigmoid

  decision_values = model$decision.values %>% as.vector()
  labels = model$fitted %>% as.vector()
  df = data.frame(dv=decision_values, lbs = labels)

  ## the prediction at the moment is a predict object coming from svm predict
  dv = attr(predictions, "decision.values") %>% as.vector()
  preds = data.frame(dv = dv)
  preds$label = as.vector(predictions)
  rownames(preds) = names(predictions)

  ## Input parameters
  out = df$dv
  target = df$lbs
  prior1 = df%>%subset(lbs==TRUE)%>%nrow
  prior0 = df%>%subset(lbs==FALSE)%>%nrow
  ## Output
  ## A, B = parameters of sigmoid

  A = 0
  B = log((prior0+1)/(prior1+1))
  hiTarget = (prior1+1)/(prior1+2)
  loTarget = 1/(prior0+2)
  lambda = 1e-3
  olderr = 1e300


  pp = rep((prior1+1)/(prior0+prior1+2), nrow(df))
  count = 0

  for (it in 1:100){
    a=0
    b=0
    c=0
    d=0
    e=0

    ## First compute the Hessian & gradient of error function
    ## with respect to A & B
    for (i in 1:length(pp)){
      if (target[i]==TRUE){
        t = hiTarget
      }else{
        t = loTarget
      }

      d1 = pp[i]-t
      d2 = pp[i]*(1-pp[i])
      a = a + out[i]*out[i]*d2
      b = b + d2
      c = c + out[i]*d2
      d = d + out[i]*d1
      e = e + d1
    }
    ## If gradient is really tiny, then stop
    if ((abs(d) < 1e-9) & (abs(e) < 1e-9)){break}

    oldA = A
    oldB = B
    err = 0

    ## Loop until goodness of fit increases
    while(TRUE){
      det = (a+lambda)*(b+lambda)-c*c
      if (det==0){ ## If the determinant of the Hessian is 0
        ## Increase stabilizer
        lambda = lambda*10
        next
      }
      A = oldA + ((b+lambda)*d-c*e)/det
      B = oldB + ((a+lambda)*e-c*d)/det

      ## Now compute the goodness of fit
      err = 0
      for (i in 1:length(pp)){
        p = 1/(1+exp(out[i]*A+B))
        pp[i] = p
        ## At this step, make sure log(0) returns -200
        if (p<=1.383897e-87){
          err = err - t*(-200)+(1-t)*log(1-p)
        }else if (p==1){
          err = err - t*log(p)+(1-t)*(-200)
        }else{
          err = err - t*log(p)+(1-t)*log(1-p)
        }
        if(err==-Inf) browser()
      }
      if (err < olderr*(1+1e-7)){
        lambda = lambda*0.1
        break
      }
      ## Error did not decrease: increase stabilizer by factor of 10
      ## and try again
      lambda = lambda* 10
      if (lambda >= 1e6){ ## something is broken. Give up
        break
      }
    }
    diff = err-olderr
    scale = 0.5*(err+olderr+1)
    if (diff > -1e-3*scale & diff < 1e-7*scale){
      count = count + 1
    }else{
      count = 0
    }
    olderr = err
    if (count==3){
      break
    }


  }

  applyPlatt = function(x){
    p = 1/(1+exp(x*A+B))
  }

  preds$prob = apply(preds, 1, function(x) applyPlatt(x[1]))
  return(preds)
}
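
## Hedged cross-check (not part of the pipeline): Platt scaling is essentially a
## logistic regression of the labels on the decision values, so glm() should recover
## parameters close to the Newton iteration above. The sign convention here is
## p = 1/(1+exp(A*dv + B)), i.e. A ~ -coef(fit)["dv"] and B ~ -coef(fit)["(Intercept)"];
## Platt additionally regularises the targets (hiTarget/loTarget), so the values
## match only approximately.
if (FALSE) {
  toy <- data.frame(dv  = c(-2.1, -1.3, -0.4, 0.2, 0.9, 1.7),
                    lbs = c(FALSE, FALSE, TRUE, FALSE, TRUE, TRUE))
  fit <- glm(lbs ~ dv, data = toy, family = binomial())
  coef(fit)
}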

## Reproduce OAC results
getSysCansOAC = function(output.dir=NULL, working.dir = NULL, gtex.tissue=NULL,
                         scores_fn="scores.Rdata",
                         exclude_expr=c("Not Expressed")){

  if(is.null(gtex.tissue)){
    stop("Please provide tissue")
  }

  ## Get the data we need for the definition of the sys cans
  ## Load geneInfo to get the symbols as well
  geneInfo=sysSVM:::geneInfo
  geneInfoNCG5 = geneInfo %>% select(entrez, symbol) %>% unique

  ## Load the CGC that we will consider drivers
  cat("Getting CGC annotation", "\n")
  cgcs = sysSVM:::cgcs
  ## refine to those that will be retained
  retained_CGC = cgcs %>% subset(RET.DIS=="RET")
  retained_CGC_trans = retained_CGC %>% subset(grepl("Trans", Drivers.to.be.retained))
  ## Get training and validation set
  load(paste0(output.dir, "/training_set_noScale.Rdata"))
  training_ns = training_ns%>%tibble::rownames_to_column()%>%
    separate(rowname, into=c("sample", "entrez"), sep="\\.") %>%
    mutate(entrez=as.numeric(entrez)) %>% data.frame()
  ## Refine the training set to include only the genes we decided to consider drivers
  cat("Refining CGC drivers", "\n")
  cgc_drivers = NULL
  for (i in 1:nrow(retained_CGC)){
    en = retained_CGC$entrez[i]
    gene_type= retained_CGC$GENETICS[i]
    symbol = retained_CGC$symbol[i]
    retained_drivers = retained_CGC$Drivers.to.be.retained[i]
    #message(paste(symbol, gene_type, sep = "\t"))
    if (gene_type=="O"){
      if(grepl("Trans", retained_drivers)){
        dd = training_ns %>% subset(entrez==en) %>% subset(CNVGain==1 | no_NTDam_muts!=0 | no_GOF_muts!=0 | BND==1 | INV==1 | INS==1)
      }else{
        dd = training_ns %>% subset(entrez==en) %>% subset(CNVGain==1 | no_NTDam_muts!=0 | no_GOF_muts!=0)
      }
      cgc_drivers = rbind(cgc_drivers, dd)
    }else if (gene_type=="TS"){
      if(grepl("Trans", retained_drivers)){
        dd = training_ns %>% subset(entrez==en) %>% subset(Copy_number==0 | no_TRUNC_muts!=0 | no_NTDam_muts!=0 | (Copy_number==1 & (no_TRUNC_muts>0 | no_NTDam_muts>0)) | BND==1 | INV==1 | INS==1)
      }else{
        dd = training_ns %>% subset(entrez==en) %>% subset(Copy_number==0 | no_TRUNC_muts!=0 | no_NTDam_muts!=0 | (Copy_number==1 & (no_TRUNC_muts>0 | no_NTDam_muts>0)))
      }
      cgc_drivers = rbind(cgc_drivers, dd)
    }else if (gene_type=="O/TS"){
      if(grepl("Trans", retained_drivers)){
        ## All drivers are considered
        dd = training_ns %>% subset(entrez==en)
      }else{
        ## Here we need to exclude all the genes with only SVs
        dd = training_ns %>% subset(entrez==en) %>% subset(!(no_TRUNC_muts==0 & no_NTDam_muts==0 & no_GOF_muts==0 & Copy_number!=0 & CNVGain==0 & (BND!=0 | INS!=0 | INV!=0)))
      }
      cgc_drivers = rbind(cgc_drivers, dd)
    }else{
      stop("Unknown gene type")
    }
  }

  ## Now add in the CGC drivers the CGCs identified from previous studies
  previous = sysSVM:::previous
  ## Get only those that were not considered already
  previous = previous %>% subset(to.be.considered & !already.considered)
  ## Get the entrez for these genes
  previous = previous %>% left_join(geneInfo%>%select(entrez, symbol), by=c("Gene.symbol"="symbol"))

  for(i in 1:nrow(previous)){
    en = previous$entrez[i]
    gene_type= previous$Type[i]
    symbol = previous$Gene.symbol[i]

    if(gene_type=="SVs"){
      dd = training_ns %>% subset(entrez==en) %>% subset(BND==1 | INV==1 | INS==1 | CNVLoss==1 | CNVGain==1)
    }else if(gene_type=="CNVs"){
      dd = training_ns %>% subset(entrez==en) %>% subset(CNVLoss==1 | CNVGain==1)
    }
    cgc_drivers = rbind(cgc_drivers, dd)
  }


  load(paste0(output.dir, "/prediction_set_noScale.Rdata"))
  prediction_ns = prediction_ns%>%tibble::rownames_to_column()%>%
    separate(rowname, into=c("sample", "entrez"), sep="\\.")%>%
    mutate(type="P")%>%mutate(entrez=as.numeric(entrez))%>%data.frame()
  cohort = rbind(cgc_drivers, prediction_ns) ## Instead of the whole training, we use only the subset of the refined CGC
  ## Get scores
  load(paste0(working.dir, "/",scores_fn))
  scores = scores[["scores"]]

  cohort = cohort %>% left_join(scores)
  cohort$gene_type[is.na(cohort$gene_type)] = "cgc"
  cohort = cohort %>% left_join(geneInfoNCG5)

  samples = unique(cohort$sample)
  sys_cans = NULL
  if (!is.null(gtex.tissue)){
    cat("Expression is considered for the selection of sys cans", "\n")
    ## Load expression data (GTEx)
    cat("Loading GTEx expression data", "\n")
    gtex=sysSVM:::gtex
    gtex = gtex %>% select(entrez, tissue, exp_level) %>% spread(tissue, exp_level)
    ## Checkpoint
    if(!(gtex.tissue%in%colnames(gtex))){
      stop("Wrong tissue name. Please provide a valid GTeX tissue name.")
    }
    ## Select tissue of the cancer type
    gtex = gtex[,c("entrez", gtex.tissue)]
    colnames(gtex) = c("entrez", "gtex_tissue_expression")
    ## bring in the cohort table the expression
    cohort = cohort %>% left_join(gtex)


    total = length(samples)
    pb <- txtProgressBar(min = 0, max = total, style = 3)
    for (i in 1:length(samples)){
      Sys.sleep(0.1)
      ## When you get the candidates here you need to take into account expression in normal tissues and in the cancer sample

      ## Get the ranks for the non-CGC genes; CGCs are added afterwards with patient.rank==NA
      ## Two ranks are computed: one over all genes and one excluding amplifications
      d = cohort %>% subset(sample==samples[i] & gene_type!="cgc" & !gtex_tissue_expression%in%exclude_expr & !is.na(gtex_tissue_expression)) %>%
        arrange(desc(score)) %>% mutate(patient.rank=row_number(-score))
      d_withoutAmp = d %>% subset(sample==samples[i] & gene_type!="cgc" & CNVGain==0 & !gtex_tissue_expression%in%exclude_expr & !is.na(gtex_tissue_expression)) %>%
        arrange(desc(score)) %>% mutate(patient.rank.no.amp=row_number(-score))
      ## Bring in the rank without amplification
      d = suppressMessages(d %>% left_join(d_withoutAmp))

      ## Add the CGCs
      d_cgcs = cohort %>% subset(sample==samples[i] & gene_type=="cgc" & !gtex_tissue_expression%in%exclude_expr & !is.na(gtex_tissue_expression))
      d = plyr::rbind.fill(d, d_cgcs)

      ## Add them to the big table
      sys_cans = rbind(sys_cans, d)
      setTxtProgressBar(pb, i)
    }
  }

  ## Add false positive annotation
  false_positive_genes <- sysSVM:::false_positive_genes
  false_positive_genes <- false_positive_genes %>% select(entrez) %>% .$entrez
  sys_cans = sys_cans %>% mutate(NCG_FP=ifelse(entrez%in%false_positive_genes, TRUE, FALSE))

  return(sys_cans)

}
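
## Sketch of a call (argument values are examples): combines the refined CGC drivers
## with the scored prediction set, filters by GTEx expression in the chosen tissue and
## adds per-patient ranks.
if (FALSE) {
  syscan <- getSysCansOAC(output.dir  = "sysSVM_run",
                          working.dir = "sysSVM_run/Results/100",
                          gtex.tissue = "Esophagus")
}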


## Score genes
scoreGenes = function(ncg.tissue.name=NULL,
                      output.dir=NULL, exclude.features=NULL,
                      iters=NULL,
                      step=NULL,
                      top.rank=10,
                      refine=TRUE,
                      gtex.tissue="Esophagus"){
  require(e1071)
  require(caret)
  cat("Summarising cross-validation", "\n")
  cv_stats_fn = paste0(output.dir, "/cv_stats.tsv")
  cv_stats = read.table(cv_stats_fn, header = T, sep = "\t")

  outDir = paste0(output.dir, "/Results/")
  dir.create(outDir) ## Write results in a separate directory

  geneInfo=sysSVM:::geneInfo
  cancerGenes=sysSVM:::cancerGenes

  ## Check point for NCG tissues
  ncg_tissues = cancerGenes %>% select(primary_site) %>% unique %>% .$primary_site
  if (!(ncg.tissue.name%in%ncg_tissues)){
    stop("Please provide a valid tissue for NCG5!")
  }

  ## Load training and prediction sets
  load(paste(output.dir, "/training_set.Rdata", sep=""))
  load(paste(output.dir, "/prediction_set.Rdata", sep=""))
  load(paste(output.dir, "/prediction_set_noScale.Rdata", sep=""))

  ## What features should be excluded
  s = c("sample", "entrez", "no_ALL_muts","no_TRUNC_muts", "no_NTDam_muts",
        "no_GOF_muts", "Copy_number", "CNVLoss", "CNVGain")
  if(!is.null(exclude.features)){
    s = s[!(s%in%exclude.features)]
  }

  prediction_ns = prediction_ns %>% tibble::rownames_to_column() %>%
    separate(rowname, into=c("sample", "entrez"), sep="\\.") %>%
    mutate(entrez=as.numeric(entrez)) %>%
    select(one_of(s))

  ## Fix gene info table from NCG
  geneInfo = geneInfo %>% select(entrez, cancer_type) %>% unique
  ## Get a cancer gene with all the associated primary sites and cancer sites
  cancerGenes = cancerGenes %>% select(entrez, primary_site, cancer_site) %>%
    group_by(entrez) %>% summarise(primary_site=paste(unique(primary_site), collapse=","),
                                   cancer_site=paste(unique(cancer_site), collapse=",")) %>%
    ungroup
  geneInfo = geneInfo %>% left_join(cancerGenes, by=c("entrez"))

  ## Training
  training = training%>%subset(type=="C")

  ## Filter out columns that are NaN - this should be changed to replace the column with all 0s
  training=Filter(function(x)!all(is.nan(x)), training)
  prediction=Filter(function(x)!all(is.nan(x)), prediction)

  ## Check the steps to summarise the results
  steps = seq(0,iters,step)
  steps = steps[2:length(steps)]
  if(steps[length(steps)]!=iters){steps[length(steps)]=iters}

  geneLists = list()
  recurrence_table = NULL

  for(st in steps){
    cat(paste0("Current step: Scoring at ", st, " iterations..."))
    outDir2 = paste0(output.dir, "/Results/", st, "/")
    dir.create(outDir2) ## Write results in a separate directory
    cv_stats_summary <- cv_stats %>% subset(!is.na(value) & type=="Sensitivity" & set=="test" & iteration<=st) %>%
      group_by(kernel, nu, gamma, degree) %>% summarize(iterations=n(),
                                                        min=min(value),
                                                        q1=quantile(value)[2],
                                                        median=median(value),
                                                        mean=mean(value),
                                                        q3=quantile(value)[4],
                                                        max=max(value),
                                                        var=stats::var(value)) %>% ungroup
    cv_stats_summary$analysis = paste(cv_stats_summary$kernel, cv_stats_summary$nu, cv_stats_summary$gamma, cv_stats_summary$degree, sep=".")

    write.table(cv_stats_summary, file=paste0(outDir2,"/cv_stats_summary.tsv"), quote=F, row.names=F, sep="\t")

    ## -------------------
    ## Get the best models
    ## -------------------
    best_models = NULL
    for (k in unique(cv_stats_summary$kernel)){
      d = cv_stats_summary %>% subset(kernel==k) %>% data.frame()
      bm = d %>% arrange(desc(mean)) %>% slice(1:5) %>% mutate(var_rank=row_number(var)) %>% subset(var_rank==1)
      best_models = rbind(best_models, bm)
    }
    write.table(best_models, file=paste0(outDir2,"/best_models.tsv"), quote=F, row.names=F, sep="\t")

    for (i in 1:nrow(best_models)){
      kern = as.character(best_models$kernel[i])
      analysis = best_models$analysis[i]
      mynu=best_models$nu[i]
      mygamma=best_models$gamma[i]
      mydegree=best_models$degree[i]

      if (kern=="linear"){
        svm.model <- svm(type ~ ., data = training, kernel=kern, type="one-classification", scale=FALSE, nu=mynu)
      }else if (kern=="radial"){
        svm.model <- svm(type ~ ., data = training, kernel=kern, type="one-classification", scale=FALSE, gamma=mygamma, nu=mynu)
      }else if (kern=="polynomial"){
        svm.model <- svm(type ~ ., data = training, kernel=kern, type="one-classification", scale=FALSE, gamma=mygamma, nu=mynu, degree=mydegree)
      }else if (kern=="sigmoid"){
        svm.model <- svm(type ~ ., data = training, kernel=kern, type="one-classification", scale=FALSE, gamma=mygamma, nu=mynu)
      }

      ## Assess training sensitivity
      trainset = training %>% select(type)
      trainset$fitted = svm.model$fitted
      trainset = trainset %>% select(type, fitted)
      trainset = trainset %>% mutate(f=ifelse(fitted==TRUE, "C", "N"))
      trainset$type = factor(as.character(trainset$type), levels = c("C", "N"))
      trainset$f = factor(as.character(trainset$f), levels = c("C", "N"))
      ## Confusion matrix
      cM <- confusionMatrix(trainset$f,trainset$type, positive = c("C"))
      training_sensitivity = cM$byClass["Sensitivity"]

      ## make prediction
      predictions = predict(svm.model, prediction, decision.values = TRUE)
      ## add probability from Platt and Binning
      predictions_platt = plattScaling(svm.model, predictions)
      colnames(predictions_platt) = c("dv", "label", "prob_platt")
      preds = predictions_platt %>% tibble::rownames_to_column()

      ## Take information from gene info
      preds = preds %>%
        separate(rowname, into=c("sample", "entrez"), sep="\\.") %>%
        mutate(entrez=as.numeric(entrez)) %>%
        left_join(geneInfo%>%rename(gene_type=cancer_type), by=c("entrez"))

      ## Also join the systems-level properties from the unscaled prediction set
      preds = preds %>% left_join(prediction_ns, by=c("sample", "entrez"))


      save(svm.model, file=paste(outDir2,"/model_",analysis, ".Rdata", sep=""))
      save(preds, file=paste(outDir2,"/predictions_",analysis, ".Rdata", sep=""))
    }

    ## Get the scores (based on the best models above)
    LINEAR_PREDS = best_models$analysis[best_models$kernel=="linear"]
    LINEAR_CVS = round(best_models$mean[best_models$kernel=="linear"], digits = 2)
    LINEAR_VAR = round(best_models$var[best_models$kernel=="linear"], digits = 2)

    POLYNOMIAL_PREDS = best_models$analysis[best_models$kernel=="polynomial"]
    POLYNOMIAL_CVS = round(best_models$mean[best_models$kernel=="polynomial"], digits = 2)
    POLYNOMIAL_VAR = round(best_models$var[best_models$kernel=="polynomial"], digits = 2)

    RADIAL_PREDS = best_models$analysis[best_models$kernel=="radial"]
    RADIAL_CVS = round(best_models$mean[best_models$kernel=="radial"], digits = 2)
    RADIAL_VAR = round(best_models$var[best_models$kernel=="radial"], digits = 2)

    SIGMOID_PREDS = best_models$analysis[best_models$kernel=="sigmoid"]
    SIGMOID_CVS = round(best_models$mean[best_models$kernel=="sigmoid"], digits = 2)
    SIGMOID_VAR = round(best_models$var[best_models$kernel=="sigmoid"], digits = 2)

    #save.image(file = paste0(output.dir, "/namespace.Rdata"))

    scores = getScore(path=outDir2,
                      linear_preds = paste0("/predictions_", LINEAR_PREDS, ".Rdata"), linearCVS = LINEAR_CVS, linearVar = LINEAR_VAR,
                      radial_preds = paste0("/predictions_", RADIAL_PREDS, ".Rdata"), radialCVS = RADIAL_CVS, radialVar = RADIAL_VAR,
                      polynomial_preds = paste0("/predictions_", POLYNOMIAL_PREDS, ".Rdata"), polynomialCVS = POLYNOMIAL_CVS, polynomialVar = POLYNOMIAL_VAR,
                      sigmoid_preds = paste0("/predictions_", SIGMOID_PREDS, ".Rdata"), sigmoidCVS = SIGMOID_CVS, sigmoidVar = SIGMOID_VAR)

    save(scores, file=paste0(outDir2, "/scores.Rdata"))

    if(refine){
      ## Add refined CGCs, exclude "Not expressed" genes and define patient ranks
      ## getSysCansOAC (defined above) returns a data frame with the per-patient ranks:
      syscan = getSysCansOAC(output.dir = output.dir, working.dir = outDir2, gtex.tissue = gtex.tissue)
      ## And then save it
      save(syscan, file=paste0(outDir2, "/syscan.Rdata"))
      cat("\n")
    }## proper function to calculate ranks will be added here (instead of getSysCansOAC)

    ## Check genes and decide whether to stop or not
    top_syscans = syscan %>% subset(patient.rank<=top.rank)
    geneLists[[as.character(st)]] = unique(top_syscans$entrez)
  }

  unique_predictions = NULL
  count = 1
  for(ul in unique(geneLists)){
    ## identical() matches the exact-list comparison intended here and avoids
    ## recycling issues when gene lists from different checkpoints differ in length
    l = lapply(geneLists, function(x) identical(x, ul))
    oc = sum(unlist(l))
    iterations_predicted = paste0(names(which(l ==TRUE)), collapse=",")
    d = data.frame(list = count, occurences = oc, iterations=iterations_predicted)
    unique_predictions = rbind(unique_predictions, d)
    count = count + 1
  }

  unique_predictions = unique_predictions %>% dplyr::arrange(desc(occurences)) %>% dplyr::mutate(steps=length(steps), frequency=occurences/steps)
  write.table(unique_predictions, file=paste0(output.dir, "/gene_prediction_frequency.tsv"), row.names = F, sep="\t", quote = F)
  ## Finally, export the most frequent gene list, taken from the largest number of
  ## iterations (the closest approximation of the true scores/sensitivities of the models)
  check_max = which(unique_predictions$frequency==max(unique_predictions$frequency))
  find_iter = as.numeric(unlist(strsplit(as.character(unique_predictions$iterations[check_max]), ",")))
  find_iter = max(find_iter)
  file.copy(paste0(output.dir, "/Results/", find_iter, "/syscan.Rdata"), paste0(output.dir, "/syscan.Rdata"))
  file.copy(paste0(output.dir, "/Results/", find_iter, "/scores.Rdata"), paste0(output.dir, "/scores.Rdata"))


}
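
## End-to-end sketch of the intended call order (argument values are examples;
## "esophagus"/"Esophagus" stand for whichever NCG5 primary_site and GTEx tissue apply).
if (FALSE) {
  createDescribeTrainingCGC(output.dir = "sysSVM_run")
  runNoveltyDetection(output.dir = "sysSVM_run", cv = 3, iters = 100)
  scoreGenes(ncg.tissue.name = "esophagus", output.dir = "sysSVM_run",
             iters = 100, step = 20, top.rank = 10,
             refine = TRUE, gtex.tissue = "Esophagus")
}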