raw_scripts/config.R

## You source this file as soon as you start working on this project to load all the
## relevant tables and functions

pkgsR  =  c('plyr',
            'tidyr',
            'caret',
            'ggplot2',
            'grid',
            'gridExtra',
            'tibble',
            'reshape2',
            'doMC',
            'xlsx',
            'readxl',
            'scales',
            'doParallel',
            'foreach',
            'RMySQL',
            'pROC',
            'e1071',
            'scales',
            'RColorBrewer',
            'sampling',
            'data.table',
            'dplyr')
for (pkgR in pkgsR){
    if (!pkgR %in% rownames(installed.packages())) {
        install.packages(pkgR, dependencies=TRUE, INSTALL_opts = c('--no-lock'), repos = "https://cran.ma.imperial.ac.uk/")
        library(pkgR, character.only = TRUE, quietly = TRUE)
    } else {
        library(pkgR, character.only = TRUE, quietly = TRUE)
    }
}

source("http://bioconductor.org/biocLite.R")

pkgs  =  c("RDRToolbox")
for (pkg in pkgs){
    if (!pkg %in% rownames(installed.packages())) {
        biocLite(pkg)
        library(pkg, character.only = TRUE, quietly = TRUE)
    } else {
        library(pkg, character.only = TRUE, quietly = TRUE)
    }
}

##update.packages(repos=biocinstallRepos(), ask=FALSE)

rm(pkg,pkgR,pkgs,pkgsR)
## Load dplyr always last
detach("package:dplyr", unload=TRUE)
library(dplyr)

theme_boss <- function(base_size = 12, base_family = "sans"){
  theme_bw(base_size = base_size, base_family = base_family) %+replace%
    theme(
      panel.grid.major = element_blank(),
      panel.grid.minor = element_blank(),
      panel.background = element_blank(),
      panel.border=element_blank()
    )
}

## Needed for some functions
registerDoMC(3)
cancer_types <- c("ACC", "BLCA", "BRCA", "CESC", "CHOL", "COAD", "ESCA", "GBM",
                  "HNSC", "KICH", "KIRC", "KIRP", "LAML", "LGG", "LIHC", "LUAD",
                  "LUSC", "OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM",
                  "STAD", "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM")

drivers_secrier = c("SMYD3", "RUNX1", "CTNNA3", "RBFOX1", "AGBL4", "CDKN2A",
                    "CDKN2B", "SAMD5", "CDK14", "KIF26B", "THADA", "SASH1",
                    "MECOM", "JUP", "IKZF3", "FHIT", "WWOX", "MACROD2", "IMMP2L",
                    "CCSER1", "PDE4D", "NAALADL2", "PARK2", "PARD3B", "PRKG1",
                    "TP53", "SMAD4", "ARID1A", "KCNQ3", "CCDC102B", "CYP7B1")


## Function to get KEGG pathways using REST API
getKEGG = function(r=NULL, fn="Desktop/kegg_test.txt", app=T){
    require(KEGGREST)
    path_ids = unique(unlist(keggLink("pathway", "hsa")))
    cat(paste0("Pathways found: ", length(path_ids)), "\n")

    ## I placed this here in case there is a limitation in the number of times you can query KEGG
    if(is.null(r)){
        r = 1:length(path_ids)
    }

    for(i in r){
        kegg_path = keggGet(path_ids[i]) ## This returns an object with name, description etc
        kegg_path = keggGet(path_ids[i]) ## This returns an object with name, description etc
        nm = kegg_path[[1]]$NAME
        cat(paste0(i, ") ", nm), "\n")
        if("GENE"%in%names(kegg_path[[1]])){
            genes = unlist(lapply(strsplit(kegg_path[[1]]$GENE[seq(2, length(kegg_path[[1]]$GENE), 2)], ";"), function(x) x[1]))
            cat(paste(nm, paste(genes, collapse="\t"), sep = "\t"), file = fn, append = app, "\n")
            #paths[[nm]] = genes
        }else{
            cat("skipped", "\n")
            next
        }
    }
}


getGeneCoordsHg19 = function(){
    require(TxDb.Hsapiens.UCSC.hg19.knownGene)
    txdb <- TxDb.Hsapiens.UCSC.hg19.knownGene
    ## Get gene coordinates
    g = genes(txdb)
    return(g)
}

## The wat I use this function is the following:
## I give the list I want and it returns a list with the number of common elements for all combinations of the elements of the list
## Prepare the gene sets for the intersection (I did the intersection only on the mutated genes initially)
# paths = gsep %>% select(pathway) %>% unique %>% .$pathway
# gene_sets = list()
# for(p in paths){
#     genes = gsep %>% subset(pathway==p) %>% .$symbol %>% unique
#     gene_sets[[p]] = genes
# }
getListIntersection = function(l, separator=".", exclude=NULL){

    ## excldue argument helps to remove elements from the lists during each comparison
    ## not implemented yet
    if(!is.null(exclude)){
        l = lapply(l, function(x) x=x[!x%in%exclude])
    }

    require(dplyr)
    require(tidyr)

    # Get the combinations of names of list elements
    nms <- combn( names(l) , 2 , FUN = paste0 , collapse = separator , simplify = FALSE )

    # Make the combinations of list elements
    print("Making combinations")
    ll <- combn( l , 2 , simplify = FALSE )
    print(length(ll))
    # Intersect the list elements
    print("starting intersections")
    out <- lapply( ll , function(x) length( intersect( x[[1]] , x[[2]] ) ) )
    out = setNames( out , nms )
    return(out)

}
## And then to take the minimum number of pathway from the above intersections for gene sets I do the following
## Prepare the table
# geneSets2length = lapply(gene_sets, function(x) length(x))
# geneSets2length = data.frame(pathway=names(geneSets2length), pathway.length=unlist(geneSets2length))
# rownames(geneSets2length) = NULL
# out = lapply(seq_along(out), function(y,n,i){
#     ldply(y[[i]]) %>% mutate(pathway=n[[i]])
# }, y=out, n=names(out))
# out =do.call(rbind, out)
# out = out %>% separate(pathway, into=c("pathway1", "pathway2"), sep="\\.")
# out = out %>% left_join(geneSets2length%>%rename(pathway1=pathway, pathway1.length=pathway.length)) %>% left_join(geneSets2length%>%rename(pathway2=pathway, pathway2.length=pathway.length))
# out = out %>% mutate(pathway1.perc=(V1/pathway1.length)*100, pathway2.perc=(V1/pathway2.length)*100)
## Add also samples and genes

## Old function to get containers
getContainers2 = function(gsea=gsea.noAmp.top10.plusCGC.pathways[["genes2paths"]], cutoff=80, sample.filter=0){ ## d here is a data frame with all the necessary columns (like length of pathways, percentage of overlap etc)
    require(dplyr)

    gsea = gsea %>% subset(!is.na(pathway) & Level1==F & Level2==F & samples.pathway>sample.filter)
    cat(paste0("Merging ", length(unique(gsea$pathway)), " pathways"), "\n")

    uniq_paths = unique(gsea$pathway)


    ## Now I need to construct again the new similarity table
    similarity_table = combn(uniq_paths, 2) %>% t() %>% data.frame()
    cat(paste0("Pairs: ", nrow(similarity_table)), "\n")
    colnames(similarity_table) = c("pathway1", "pathway2")
    similarity_table$overlap = apply(similarity_table, 1, function(x){
        length(intersect(gsea %>% subset(pathway==x[1]) %>% .$symbol %>% unique, gsea %>% subset(pathway==x[2]) %>% .$symbol %>% unique))
    })

    unique_paths = unique(c(similarity_table$pathway1, similarity_table$pathway2))
    additional_info = NULL
    for(p in unique_paths){
        s = gsea %>% subset(pathway==p) %>% .$sample %>% unique %>% length
        l = gsea %>% subset(pathway==p) %>% .$symbol %>% unique %>% length
        d = data.frame(pathway=p, samples=s, length=l)
        additional_info = rbind(additional_info, d)
    }

    ## bring them in and calculate overlap
    similarity_table = similarity_table %>% left_join(additional_info %>% rename(pathway1=pathway, length1=length, samples.pathway1=samples)) %>%
        left_join(additional_info %>% rename(pathway2=pathway, length2=length, samples.pathway2=samples)) %>%
        mutate(pathway1.perc=(overlap/length1)*100) %>%
        mutate(pathway2.perc=(overlap/length2)*100)

    ## Here, I need to start the iterative processes
    ## Reset collapsed pathways
    containers = list()
    for(p in unique_paths){ ## Meaning that there is still a combination of pathways/containers
        d = similarity_table %>% subset((pathway1==p | pathway2==p) & (pathway1.perc>=cutoff | pathway2.perc>=cutoff))
        if(nrow(d)>0){
            d = data.frame(pathway=unique(c(d$pathway1, d$pathway2))) %>% left_join(additional_info) %>% arrange(desc(samples))
            container = d$pathway[1]
            if(container %in% names(containers)){ ## Bring in new pathways to the container
                containers[[container]] = unique(c(containers[[container]], p))
            }else{
                containers[[container]] = d$pathway
            }

        }else{
            containers[[p]] = p ## Add those that do not have similarity with anything
        }

    }

    ## return the two lists
    return(list(containers=containers, similarity_table=similarity_table))

}

getContainers = function(gsea=gsea.withAmp.top10.plusCGC.pathways[["genes2paths"]], cutoff=c(60), weight=T, sample.filter=0){ ## d here is a data frame with all the necessary columns (like length of pathways, percentage of overlap etc)
    require(plyr)
    require(dplyr)
    require(tidyr)
    require(igraph)

    ## save file to export it
    export = gsea
    gsea = gsea %>% subset(!is.na(pathway) & Level1==F & Level2==F & samples.pathway>sample.filter)
    cat(paste0("Merging ", length(unique(gsea$pathway)), " pathways"), "\n")

    uniq_paths = unique(gsea$pathway)


    ## Now I need to construct again the new similarity table
    cat("Calculating similarity matrix...", "\n")
    similarity_table = combn(uniq_paths, 2) %>% t() %>% data.frame()
    cat(paste0("Pairs: ", nrow(similarity_table)), "\n")
    colnames(similarity_table) = c("pathway1", "pathway2")
    similarity_table$overlap = apply(similarity_table, 1, function(x){
        length(intersect(gsea %>% subset(pathway==x[1] & !is.na(pathway)) %>% .$symbol %>% unique, gsea %>% subset(pathway==x[2] & !is.na(pathway)) %>% .$symbol %>% unique))
    })

    unique_paths = unique(c(as.character(similarity_table$pathway1), as.character(similarity_table$pathway2)))
    additional_info = NULL
    for(p in unique_paths){
        s = gsea %>% subset(pathway==p) %>% .$sample %>% unique %>% length
        l = gsea %>% subset(pathway==p) %>% .$symbol %>% unique %>% length
        d = data.frame(pathway=p, samples=s, length=l)
        additional_info = rbind(additional_info, d)
    }

    ## bring them in and calculate overlap
    similarity_table = similarity_table %>% left_join(additional_info %>% rename(pathway1=pathway, length1=length, samples.pathway1=samples)) %>%
        left_join(additional_info %>% rename(pathway2=pathway, length2=length, samples.pathway2=samples)) %>%
        mutate(pathway1.perc=(overlap/length1)*100) %>%
        mutate(pathway2.perc=(overlap/length2)*100)

    res = list()
    for(co in cutoff){

        net = similarity_table %>% subset(pathway1.perc>=co | pathway2.perc>=co) %>% select(pathway1, pathway2)
        w = similarity_table %>% subset(pathway1.perc>=co | pathway2.perc>=co) %>% mutate(weight=ifelse(length1<length2, pathway1.perc, ifelse(length2<=length1, pathway2.perc, NA))) %>% .$weight
        cat(paste0(nrow(net), " pairs with similarity >= ", co), "\n")
        g = graph_from_data_frame(net, directed = F, vertices = uniq_paths)
        E(g)$weight = w

        ## Get clusters by resolving bridges
        cat("Getting pathway clusters...", "\n")
        if(w){
            cls = cluster_edge_betweenness(g, weights = E(g)$weight)
        }else{
            cls = cluster_edge_betweenness(g)
        }


        cat(paste0(length(cls), " containers found"), "\n")
        print(sizes(cls))

        ## give the name of the container to the pathway covering more samples
        containers = list()
        for(i in 1:length(cls)){
            cont = gsea %>% select(pathway, samples.pathway) %>% unique %>% subset(pathway%in%cls[[i]]) %>% arrange(desc(samples.pathway))
            containers[[cont$pathway[1]]] = cls[[i]]
        }

        l = lapply(seq_along(containers), function(y,n,i){
            ldply(y[[i]]) %>% mutate(container=n[[i]])
        }, y=containers, n=names(containers))
        l = do.call(rbind, l) %>% rename(pathway=V1)

        ## join it with export and return it
        print(nrow(export))
        exp = export %>% left_join(l)
        print(nrow(export))

        res[[as.character(co)]] = list(containers=containers, clusters=cls, network=g, genes2paths=exp)
    }

    res[["similarity_table"]] = similarity_table
    return(res)

}

getPatientCommonPathways = function(genes2paths=gsea.withAmp.top10.plusCGC.containers[["genes2paths"]],
                                     normal = T
){

    require(dplyr)
    require(tidyr)

    d = genes2paths

    ## Decide if you want normalisation or not
    normal = normal

    d = d %>% subset(!is.na(pathway))
    ## Get a graph where nodes are samples and connections are number of containers
    sample_combns = do.call(rbind, combn(unique(d$sample), 2, simplify = FALSE))

    compaths = function(s1, s2, normal=T){
        if(s1==s2){
            stop("Sample twice")
        }
        common.pathways = d %>% select(sample, pathway) %>% unique %>% subset(sample%in%c(s1, s2)) %>% count(pathway) %>% subset(n>1) %>% .$pathway
        union_pathways = d %>% select(sample, pathway) %>% unique %>% subset(sample%in%c(s1, s2)) %>% select(pathway) %>% unique %>% nrow
        if(normal){
            common.pathways=length(common.pathways)/union_pathways
        }else{
            common.pathways=length(common.pathways)
        }
        return(common.pathways)
    }

    sample_combns = sample_combns %>% data.frame() %>% rename(sample1=X1, sample2=X2)
    cat("Calculating common pathways among samples...")
    sample_combns$common.pathways = apply(sample_combns, 1, function(x) compaths(x[1], x[2]))

    ## And add self versus self
    for (s in unique(d$sample)){
        common.pathways = d %>% subset(sample==s) %>% select(pathway) %>% unique %>% .$pathway
        if(normal){
            sample_combns = rbind(sample_combns, data.frame(sample1=s, sample2=s, common.pathways=1))
        }else{
            sample_combns = rbind(sample_combns, data.frame(sample1=s, sample2=s, common.pathways=length(common.pathways)))
        }
    }

    return(sample_combns)
}


getPatientCommonProcesses = function(genes2paths=gsea.withAmp.top10.plusCGC.containers[["genes2paths"]],
                                     normal = T
                                     ){

    require(dplyr)
    require(tidyr)

    d = genes2paths

    ## Decide if you want normalisation or not
    normal = normal

    d = d %>% subset(!is.na(container))
    ## Get a graph where nodes are samples and connections are number of containers
    sample_combns = do.call(rbind, combn(unique(d$sample), 2, simplify = FALSE))

    comconts = function(s1, s2, normal=T){
        if(s1==s2){
            stop("Sample twice")
        }
        common.containers = d %>% select(sample, container) %>% unique %>% subset(sample%in%c(s1, s2)) %>% count(container) %>% subset(n>1) %>% .$container
        union_processes = d %>% select(sample, container) %>% unique %>% subset(sample%in%c(s1, s2)) %>% select(container) %>% unique %>% nrow
        if(normal){
            common.containers=length(common.containers)/union_processes
        }else{
            common.containers=length(common.containers)
        }
        return(common.containers)
    }

    sample_combns = sample_combns %>% data.frame() %>% rename(sample1=X1, sample2=X2)
    cat("Calculating common processes among samples...")
    sample_combns$common.containers = apply(sample_combns, 1, function(x) comconts(x[1], x[2]))

    ## And add self versus self
    for (s in unique(d$sample)){
        common.containers = d %>% subset(sample==s) %>% select(container) %>% unique %>% .$container
        if(normal){
            sample_combns = rbind(sample_combns, data.frame(sample1=s, sample2=s, common.containers=1))
        }else{
            sample_combns = rbind(sample_combns, data.frame(sample1=s, sample2=s, common.containers=length(common.containers)))
        }
    }

    return(sample_combns)
}

## check the overlap of the containers (genes)
checkContainers = function(container_list=containers[["containers"]],
                           genes2paths=gsea.noAmp.top10.plusCGC.pathways[["genes2paths"]]){

    require(dplyr)
    require(tidyr)

    combs = combn(names(container_list), 2) %>% t() %>% data.frame()
    colnames(combs) = c("container1", "container2")

    getSharedGenes = function(container1, container2){
        paths1 = container_list[[container1]]
        paths2 = container_list[[container2]]

        genes1 = genes2paths %>% subset(pathway%in%paths1) %>% .$symbol %>% unique
        genes2 = genes2paths %>% subset(pathway%in%paths2) %>% .$symbol %>% unique

        return(paste0( length(intersect(genes1, genes2)), ";", length(genes1), ";", length(genes2) ))
    }

    combs$inter = apply(combs, 1, function(x) getSharedGenes(x[1], x[2]))
    combs = combs %>% separate(inter, into=c("overlap", "len1", "len2"), sep=";") %>% mutate(perc1=(as.numeric(overlap)/as.numeric(len1))*100, perc2=(as.numeric(overlap)/as.numeric(len2))*100)
}


## This function gets 2 entrez and returns the distance of the genes in hg19
geneDistanceHg19 <- function(g=NULL, gene1=NULL, gene2=NULL){
    ## Convert genes in strings (metadata column is string)
    if(is.null(g) | is.null(gene1) | is.null(gene2)){
        stop("Gene missing")
    }
    gene1 = as.character(gene1)
    gene2 = as.character(gene2)
    d = distance(g[names(g)==gene1], g[names(g)==gene2], ignore.strand=TRUE)
    return(d)
}

pathwayEnrichmentRanks = function(geneInfo_fn="/Users/fc-8s-imac/athena/data/geneInfoNCG5.Rdata",
                             gene_sets_dir="/Users/fc-8s-imac/athena/data/geneSets/reactome_12_12_16/",
                             gene_sets_gene_start=4,
                             gene_sets_identifiers = "reactome",
                             sub_identifier="symbol",
                             path_length=F,
                             min_path_length=10,
                             max_path_length=500,
                             scores_fn="/Users/fc-8s-imac/athena/data/OAC/129_OAC/ML/OAC/syscans_4_with_gains_in_sys_cans.Rdata",
                             cohort_fn="/Users/fc-8s-imac/athena/data/OAC/129_OAC/Rdata/mainCohort.Rdata",
                             remove.amplifications=T,
                             remove.cgc=T,
                             null_distribution=F,
                             sample.by.sample=F){

    ## pathway parameter in the following function can be kegg/reactome/all
    ## sub parameter can be entrez/symbol
    getPathway = function(pathway="kegg", sub=sub_identifier, genes_start=gene_sets_gene_start,
                          path_dir=gene_sets_dir,
                          gi_fn=geneInfo_fn){

        make_pathway_list = function(pathMsigDbFile) {
            inputFile <- pathMsigDbFile
            con  <- file(inputFile, open = "r")
            c = 1
            pathway.list <- vector(mode="list",length=0)
            message("Loading gene sets....")
            while (length(oneLine <- readLines(con, n = 1, warn = FALSE)) > 0) {
                myVector <- do.call("rbind",strsplit(oneLine, "\t"))
                t = vector(mode="list",length=1)
                t[[1]] = myVector[genes_start:length(myVector)]
                names(t) = myVector[1]
                pathway.list = c(pathway.list,t)
                c = c+1
            }

            close(con)
            return(pathway.list)
        }

        fns = list.files(path_dir)
        pattern=paste0(pathway, ".*", sub)
        fn = fns[grepl(pattern, fns)]
        if(length(fn)>1 | length(fn)==0){
            stop("No or multiple files found. Please check the parameters provided")
        }

        message(paste0("Pathway file: ", fn))
        p = make_pathway_list(paste0(path_dir, fn))
        message(paste0("Number of pathways: ", length(p)))
        if(sub=="entrez"){
            genes_p = as.numeric(unique(unlist(p)))
        }else if(sub=="symbol"){
            genes_p = unique(unlist(p))
        }
        message(paste0("Number of genes: ", length(genes_p)))
        ## Get gene info to check overlap with 19014
        load(gi_fn)
        geneInfo = geneInfo %>% subset(duplicability!=-1) %>% select(symbol, entrez)
        if(sub=="entrez"){
            message(paste0("Number of genes (19,014): ", length(genes_p[genes_p%in%geneInfo$entrez])), "\n")
        }else if(sub=="symbol"){
            message(paste0("Number of genes (19,014): ", length(genes_p[genes_p%in%geneInfo$symbol])), "\n")
        }
        return(p)

    }

    ## x is a vector of genes
    ## y is a list of pathways
    ## identifier can be entrez/symbol
    updatePathway = function(x, y, identifier=sub_identifier, min_len=min_path_length, max_len=max_path_length, len=path_length,
                             exclude_pattern="CANCER|LEUKEMIA|MELANOMA|GLIOMA",
                             gi_fn=geneInfo_fn){

        ## This function removes all genes that are not in the gene list of interest
        ## from the pathways

        if (identifier=="entrez"){
            y = lapply(y, as.numeric)
        }else if (identifier=="symbol"){
            y = lapply(y, as.character)
        }

        ## Update for the list of genes that are in our gene set
        y = lapply(y, function(g) g[g%in%x])
        ## Remove empty sets
        y = y[unlist(lapply(y, function(x) length(x)>0))]

        if(len==TRUE){
            y = y[unlist(lapply(y, function(x) length(x)>min_len))]
            y = y[unlist(lapply(y, function(x) length(x)<max_len))]
        }
        y = y[names(y)[!grepl(exclude_pattern, names(y))]]

        message(paste0("Number of pathways: ", length(y)))
        if(identifier=="entrez"){
            genes_p = as.numeric(unique(unlist(y)))
        }else if(identifier=="symbol"){
            genes_p = unique(unlist(y))
        }
        message(paste0("Number of genes: ", length(genes_p)))
        ## Get gene info to check overlap with 19014
        load(gi_fn)
        geneInfo = geneInfo %>% subset(duplicability!=-1) %>% select(symbol, entrez)
        if(identifier=="entrez"){
            message(paste0("Number of genes (19,014): ", length(genes_p[genes_p%in%geneInfo$entrez])))
        }else if(identifier=="symbol"){
            message(paste0("Number of genes (19,014): ", length(genes_p[genes_p%in%geneInfo$symbol])))
        }


        return(list(gene_sets=y, genes_in_p=length(genes_p)))
    }

    ## x is the pathway
    ## g is the data frame with ranks
    run.singed.ks = function(x, g, func_fn="/Users/fc-8s-imac/athena/data/OAC/signed_ks.R"){
        source(func_fn)
        if(sub_identifier=="entrez"){
            s = g %>% subset(entrez%in%x) %>% select(rank) %>% .$rank
        }else if(sub_identifier=="symbol"){
            s = g %>% subset(symbol%in%x) %>% select(rank) %>% .$rank
        }

        k <- ks.test.2(s, (1:max(g$rank))[-s], maxCombSize=10^10)
        res = list(es=k$ES, p.value=k$p, no.genes=length(x), ranks=paste(s, collapse=","))
        return(res)
    }

    getNullDist = function(gene_sets, g, times=1000, ks_func_fn="/Users/fc-8s-imac/athena/data/OAC/signed_ks.R"){
        ## Get the null distribution
        source(ks_func_fn)
        l = unlist(lapply(gene_sets, function(x) length(x)))
        l = unique(l)
        ranks = c(1:max(g$rank))
        ex = replicate(times, sapply(l, function(x) {
            z = sample(ranks, x)
            t = ks.test.2(z, (1:max(g$rank))[-z], maxCombSize=10^10)
            ## For testing I return also the vector
            paste(t$ES, t$p, x, paste(z, collapse="/"), sep=",")
        }, simplify = F), simplify = F) %>% unlist()
        ex = data.frame(ex=ex) %>% separate(ex , into=c("es", "p.value", "len", "sample.ranks"), sep=",")
        return(ex)
    }

    load(scores_fn)
    load(geneInfo_fn)
    ## Make sure you update the symbols to work with gene sets with symbols
    syscan = syscan %>% select(-symbol) %>% left_join(geneInfo%>%select(entrez, symbol))
    ## Exclude "CGC discarded"
    syscan = syscan %>% subset(gene_type!="CGC_discarded")

    ## ------ Add mutational signature information to the scores ---------
    load(cohort_fn)
    mainCohort = mainCohort %>% rename(sample=Primary_tumour_solid_tissue)
    syscan = syscan %>% left_join(mainCohort%>%select(sample, Group)%>%unique)

    ## Filters
    if(remove.cgc){
        syscan = syscan %>% subset(gene_type!="cgc")
    }
    if(remove.amplifications){
        syscan = syscan %>% subset(CNVGain!=1)
    }

    ## get pathways
    gsea = NULL
    null_dists = NULL
    paths = gene_sets_identifiers

    if(sample.by.sample==F){
        message("Groups-collapsed mode")
        ## Get the genes for each group
        caDominant = syscan %>% subset(Group=="C>A/T dominant") %>% subset(gene_type!="CGC_discarded") %>%
            subset(type=="P") %>% select(entrez, symbol, score) %>% group_by(entrez, symbol) %>% summarise(score=mean(score)) %>% ungroup()
        addition = syscan %>% subset(Group=="C>A/T dominant") %>% subset(gene_type!="CGC_discarded") %>%
            subset(type=="C") %>% select(entrez, symbol) %>% unique %>% mutate(score=max(caDominant$score)+1)
        caDominant = rbind(caDominant, addition)
        caDominant = caDominant %>% arrange(desc(score)) %>% mutate(rank=row_number())
        if(sub_identifier=="entrez"){
            caDominant = caDominant %>% select(entrez, score, rank) %>% subset(!is.na(entrez))
        }else if(sub_identifier=="entrez"){
            caDominant = caDominant %>% select(symbol, score, rank) %>% subset(!is.na(symbol))
        }

        ddr = syscan %>% subset(Group=="DDR impaired") %>% subset(gene_type!="CGC_discarded") %>%
            subset(type=="P") %>% select(entrez, symbol, score) %>% group_by(entrez, symbol) %>% summarise(score=mean(score)) %>% ungroup()
        addition = syscan %>% subset(Group=="DDR impaired") %>% subset(gene_type!="CGC_discarded") %>%
            subset(type=="C") %>% select(entrez, symbol) %>% unique %>% mutate(score=max(ddr$score)+1)
        ddr = rbind(ddr, addition)
        ddr = ddr %>% arrange(desc(score)) %>% mutate(rank=row_number())
        if(sub_identifier=="entrez"){
            ddr = ddr %>% select(entrez, score, rank) %>% subset(!is.na(entrez))
        }else if(sub_identifier=="entrez"){
            ddr = ddr %>% select(symbol, score, rank) %>% subset(!is.na(symbol))
        }

        mutagenic = syscan %>% subset(Group=="Mutagenic") %>% subset(gene_type!="CGC_discarded") %>%
            subset(type=="P") %>% select(entrez, symbol, score) %>% group_by(entrez, symbol) %>% summarise(score=mean(score)) %>% ungroup()
        addition = syscan %>% subset(Group=="Mutagenic") %>% subset(gene_type!="CGC_discarded") %>%
            subset(type=="C") %>% select(entrez, symbol) %>% unique %>% mutate(score=max(mutagenic$score)+1)
        mutagenic = rbind(mutagenic, addition)
        mutagenic = mutagenic %>% arrange(desc(score)) %>% mutate(rank=row_number())
        if(sub_identifier=="entrez"){
            mutagenic = mutagenic %>% select(entrez, score, rank) %>% subset(!is.na(entrez))
        }else if(sub_identifier=="entrez"){
            mutagenic = mutagenic %>% select(symbol, score, rank) %>% subset(!is.na(symbol))
        }

        for(p in paths){
            ## C>A/T dominant
            gene_sets = getPathway(pathway = p)
            ## Just in case there are duplicated pathways
            dups = names(gene_sets)[duplicated(names(gene_sets))]
            for (d in dups){
                idx = which(names(gene_sets)==d)
                gene_sets[idx[2:length(idx)]] <- NULL
            }
            l = lapply(gene_sets, function(x) length(x))
            l = lapply(seq_along(l), function(y, n, i){
                ldply(y[[i]]) %>% mutate(pathway=n[[i]])
            }, y=l, n=names(l))
            l = do.call(rbind, l) %>% rename(pathway=V1)
            ## Fix names cause then we split on \\. and the folowing characters create problems
            # names(gene_sets) = gsub("\\.", "_", names(gene_sets))
            # names(gene_sets) = gsub("-", "_", names(gene_sets))
            # names(gene_sets) = gsub(" ", "_", names(gene_sets))
            # names(gene_sets) = gsub("\\'", "", names(gene_sets))
            # names(gene_sets) = gsub("\\/", "_", names(gene_sets))
            # names(gene_sets) = gsub("\\(", "", names(gene_sets))
            # names(gene_sets) = gsub("\\)", "", names(gene_sets))
            # names(gene_sets) = gsub(":", "_", names(gene_sets))
            # names(gene_sets) = gsub("&", "", names(gene_sets))
            # names(gene_sets) = gsub(",", "", names(gene_sets))

            if(sub_identifier=="entrez"){
                gene_sets_up = updatePathway(caDominant$entrez, gene_sets)
            }else if(sub_identifier=="symbol"){
                gene_sets_up = updatePathway(caDominant$symbol, gene_sets)
            }

            gene_sets_up = gene_sets_up[["gene_sets"]]

            if(null_distribution){
                ## Get the null distributions
                message("Getting null distribution")
                nullDist = getNullDist(gene_sets = gene_sets_up, g = caDominant)
                nullDist$group = "C>A/T dominant"
                null_dists = rbind(null_dists, nullDist)
            }


            res = lapply(gene_sets_up, function(x) run.singed.ks(x, caDominant))
            ## Make res data frame to work with
            res = lapply(seq_along(res), function(y, n, i){
                    ldply(y[[i]]) %>% mutate(pathway=n[[i]])
                }, y=res, n=names(res))
            res = do.call(rbind, res) %>% spread(.id, V1) %>% mutate(group="C>A/T dominant", gene_set=p)
            res$fdr = p.adjust(res$p.value, method="BH")
            res$bonf = p.adjust(res$p.value, method="bonferroni")
            res = res %>% left_join(l)
            gsea = rbind(gsea, res)

            ## DDR impaired
            if(sub_identifier=="entrez"){
                gene_sets_up = updatePathway(ddr$entrez, gene_sets)
            }else if(sub_identifier=="symbol"){
                gene_sets_up = updatePathway(ddr$symbol, gene_sets)
            }

            gene_sets_up = gene_sets_up[["gene_sets"]]
            ## Get the null distributions
            if(null_distribution){
                message("Getting null distribution")
                nullDist = getNullDist(gene_sets = gene_sets_up, g = ddr)
                nullDist$group = "DDR impaired"
                null_dists = rbind(null_dists, nullDist)
            }

            res = lapply(gene_sets_up, function(x) run.singed.ks(x, ddr))
            ## Make res data frame to work with
            res = lapply(seq_along(res), function(y, n, i){
                ldply(y[[i]]) %>% mutate(pathway=n[[i]])
            }, y=res, n=names(res))
            res = do.call(rbind, res) %>% spread(.id, V1) %>% mutate(group="DDR impaired", gene_set=p)
            res$fdr = p.adjust(res$p.value, method="BH")
            res$bonf = p.adjust(res$p.value, method="bonferroni")
            res = res %>% left_join(l)
            gsea = rbind(gsea, res)

            ## Mutagenic
            if(sub_identifier=="entrez"){
                gene_sets_up = updatePathway(mutagenic$entrez, gene_sets)
            }else if(sub_identifier=="symbol"){
                gene_sets_up = updatePathway(mutagenic$symbol, gene_sets)
            }

            gene_sets_up = gene_sets_up[["gene_sets"]]
            ## Get the null distributions
            if(null_distribution){
                message("Getting null distribution")
                nullDist = getNullDist(gene_sets = gene_sets_up, g = mutagenic)
                nullDist$group = "Mutagenic"
                null_dists = rbind(null_dists, nullDist)
            }

            res = lapply(gene_sets_up, function(x) run.singed.ks(x, mutagenic))
            res = lapply(seq_along(res), function(y, n, i){
                ldply(y[[i]]) %>% mutate(pathway=n[[i]])
            }, y=res, n=names(res))
            res = do.call(rbind, res) %>% spread(.id, V1) %>% mutate(group="Mutagenic", gene_set=p)
            res$fdr = p.adjust(res$p.value, method="BH")
            res$bonf = p.adjust(res$p.value, method="bonferroni")
            res = res %>% left_join(l)
            gsea = rbind(gsea, res)
        }

        return(list(gsea=gsea, null_dists=null_dists))
    }else if(sample.by.sample){
        ## Sample-by-sample pathway enrichment
        message("Sample-by-sample mode")
        for(p in paths){
            genes2samples = NULL
            genes_mapped = NULL
            gene_sets = getPathway(pathway = p)
            dups = names(gene_sets)[duplicated(names(gene_sets))]
            track_samples = NULL
            for (d in dups){
                idx = which(names(gene_sets)==d)
                gene_sets[idx[2:length(idx)]] <- NULL
            }
            l = lapply(gene_sets, function(x) length(x))
            l = lapply(seq_along(l), function(y, n, i){
                ldply(y[[i]]) %>% mutate(pathway=n[[i]])
            }, y=l, n=names(l))
            l = do.call(rbind, l) %>% rename(pathway_length=V1)
            for (s in unique(syscan$sample)){

                mg = syscan %>% subset(sample==s) %>% .$Group %>% unique

                g = syscan %>% subset(sample==s) %>% subset(gene_type!="CGC_discarded") %>%
                    select(entrez, symbol, score) %>% arrange(desc(score)) %>% mutate(rank=row_number())

                ## Check point for NAs in score column
                if(g%>%subset(is.na(score))%>%nrow>0){
                    stop("NAs found in sample scores")
                }
                if(sub_identifier=="entrez"){
                    gene_sets_up = updatePathway(g$entrez, gene_sets)
                }else if(sub_identifier=="symbol"){
                    gene_sets_up = updatePathway(g$symbol, gene_sets)
                }
                genes_in_p = gene_sets_up[["genes_in_p"]]
                gene_sets_up = gene_sets_up[["gene_sets"]]
                genes_mapped = c(genes_mapped, unique(unlist(gene_sets_up)))
                ## Keep a vector with genes in each sample
                genes2samples = c(genes2samples, genes_in_p)
                message(paste0(s, ": ", genes_in_p, " in gene sets"))

                if(genes_in_p==0){
                    next
                }

                ## Get the null distributions
                if(null_distribution){
                    message("Getting null distribution")
                    nullDist = getNullDist(gene_sets = gene_sets_up, g = g)
                    nullDist = nullDist %>% mutate(sample=s, group=mg, gene_set=p)
                    null_dists = rbind(null_dists, nullDist)
                }

                res = lapply(gene_sets_up, function(x) run.singed.ks(x, g))
                res = lapply(seq_along(res), function(y, n, i){
                    ldply(y[[i]]) %>% mutate(pathway=n[[i]])
                }, y=res, n=names(res))
                res = do.call(rbind, res) %>% spread(.id, V1) %>% mutate(sample=s, group=mg, gene_set=p)
                res$fdr = p.adjust(res$p.value, method="BH")
                res$bonf = p.adjust(res$p.value, method="bonferroni")
                res = res %>% left_join(l)
                gsea = rbind(gsea, res)

                ## Populate track samples table to check the results
                print(s)
                print(genes_in_p)
                print(unique(g$Group))
                print(p)
                ts  = data.frame(sample=s, number.of.genes=nrow(g), genes.in.pathways=genes_in_p, group=mg, geneset=p)
                track_samples = rbind(track_samples, ts)
                cat("\n")
            }
            message(paste0("Summary of genes in samples: ", p))
            print(summary(genes2samples))
            message(paste0("Unique genes mapped across all samples: ", length(unique(genes_mapped))))
        }
        return(list(gsea=gsea, null_dists=null_dists, track_samples=track_samples))
    }
}

## use this function to map samples to enriched pathways
mapPathways2Samples = function(gsea, efdr=0.01,
                               geneInfo_fn="~/Mountpoints/rosalind_lustre/mourikisa/data/geneInfoNCG5.Rdata",
                               gene_sets_dir="~/Mountpoints/rosalind_lustre/mourikisa/data/geneSets/reactome_12_12_16/",
                               gene_sets_gene_start=4,
                               gene_sets_identifiers = "reactome",
                               sub_identifier="symbol"){


    ## Get genes in pathways
    gene_sets = gsea[["paths"]]

    ## Get genes and pathways from the gsea
    enriched_paths = gsea[["gsea"]] %>% subset(fdr<efdr) %>% .$pathway %>% unique
    cat(paste0("Enriched pathways: ", length(enriched_paths)))

    ## Get only enriched pathways without
    gene_sets = gene_sets[names(gene_sets)%in%enriched_paths]

    ## Get gene sets ready for left joining
    l = lapply(seq_along(gene_sets), function(y, n, i){
        ldply(y[[i]]) %>% mutate(pathway=n[[i]])
    }, y=gene_sets, n=names(gene_sets))
    l = do.call(rbind, l) %>% rename(symbol=V1)

    ## Add pathways in genes
    genes = gsea[["genes"]] %>% left_join(l)

    paths2s2g = genes %>% subset(!is.na(pathway)) %>% group_by(pathway) %>% summarise(samples.pathway=length(unique(sample)), genes.pathway=length(unique(symbol))) %>% ungroup()

    paths2s2g = paths2s2g %>% left_join(gsea[["gsea"]]%>%select(pathway, Level1, Level2))
    ## Add also Level in reactome
    genes = genes %>% left_join(paths2s2g)

    ## return the enriched gsea list of objects
    return(list(gsea=gsea[["gsea"]], genes=gsea[["genes"]], genes2paths=genes, paths2s2g=paths2s2g))
}

## Use this function to find minimal set of processes explain each group
getREACTOMEsubgraph = function(hierarchy.file="~/athena/data/geneSets/reactome_12_12_16/pathway_hierarchy.tsv",
                               identifier.file="~/athena/data/geneSets/reactome_12_12_16/complete_list_of_pathways.tsv",
                               pathways.enriched="~/athena/enriched_paths2samples_CGC_plus_top10syscans_noGroups_fdr_0.01_pathwayFilters_corrected.tsv"){

    require(igraph)
    require(xlsx)
    require(gplots)
    require(RColorBrewer)
    require(RedeR)
    require(ComplexHeatmap)


    hier = read.table(hierarchy.file, sep = "\t", header = F)
    colnames(hier) = c("path1", "path2")
    ## Careful here cause some pathways have apostrophes in their names
    path_ids = read.table(identifier.file, sep = "\t", header = F, quote = "")
    colnames(path_ids) = c("path_id", "path_name", "species")
    path_ids = path_ids %>% subset(species=="Homo sapiens") %>% rename(pathway=path_name) %>% select(-species)

    ## Exclude leading nad trailing spaces from pathway names
    path_ids$pathway = gsub("^\\s+|\\s+$", "", path_ids$pathway)
    path_ids$path_id = gsub("^\\s+|\\s+$", "", path_ids$path_id)

    ## Add info to herarchy table and subset for the homo sapiens interactions
    hier = hier %>% subset(path1%in%path_ids$path_id | path2%in%path_ids$path_id) %>% left_join(path_ids%>%select(path_id, pathway)%>%rename(path1=path_id, pathway1=pathway)) %>%
        left_join(path_ids%>%select(path_id, pathway)%>%rename(path2=path_id, pathway2=pathway))

    ## Trials to exclude level 1&2 nodes
    level_1_nodes = c("Developmental Biology",
                      "Reproduction",
                      "Circadian Clock",
                      "Hemostasis",
                      "Neuronal System",
                      "Immune System",
                      "Signal Transduction",
                      "Disease",
                      "DNA Repair",
                      "Metabolism",
                      "Gene Expression",
                      "Chromatin organization",
                      "Transmembrane transport of small molecules",
                      "DNA replication",
                      "Metabolism of proteins",
                      "Muscle contraction",
                      "Cell Cycle",
                      "Organelle biogenesis and maintenance",
                      "Mitophagy",
                      "Vesicle-mediated transport",
                      "Programmed Cell Death",
                      "Cellular responses to stress",
                      "Extracellular matrix organization",
                      "Cell-Cell communication")

    level_2_nodes = hier %>% subset(pathway1%in%level_1_nodes | pathway2%in%level_1_nodes)
    level_2_nodes = unique(c(level_2_nodes$pathway1, level_2_nodes$pathway2))
    level_2_nodes = level_2_nodes[!level_2_nodes%in%level_1_nodes]


    ## Read in the group-specific enriched pathways
    gsep = read.table(pathways.enriched, sep="\t", header = T, quote="")

    ## fix name quotes etc, rgsea.counts comes from randomisation
    gsep$pathway = gsub("\"", "", gsep$pathway)
    gsep$pathway = gsub("\'", "", gsep$pathway)
    rgsea.counts$pathway = gsub("\'", "", rgsea.counts$pathway)
    rgsea.counts = rgsea.counts %>% mutate(inCGCplusTop10Enriched=ifelse(pathway%in%unique(gsep$pathway), TRUE, FALSE))


    paths2samples2genes = gsep %>% group_by(pathway) %>% summarise(samples=length(unique(sample)), genes=length(unique(symbol)))
    gsep = gsep %>% left_join(paths2samples2genes) %>% mutate(label=paste0(pathway, "(", samples, ";", genes, ")"), inGS=1)

    ## Refinement of pathways
    gsep = gsep %>% subset(!pathway%in%level_1_nodes & !pathway%in%level_2_nodes)
    gsep = gsep %>% subset(samples>11)

    ## Refine based on randomisation
    truly_enriched = rgsea.counts %>% subset(ns_perc>95 & inCGCplusTop10Enriched==TRUE) %>% .$pathway
    gsep = gsep %>% subset(pathway%in%truly_enriched)

    ## Extract subnetwork
    subnet = hier %>% subset(pathway1%in%unique(gsep$pathway) & pathway2%in%unique(gsep$pathway))
    #subnet = hier %>% subset(path1%in%gsep_ids | path2%in%gsep_ids)

    ## Bring names for path1 & path2
    subnet = subnet %>% left_join(gsep%>%select(pathway, label)%>%unique%>%rename(pathway1=pathway, label1=label)) %>%
        left_join(gsep%>%select(pathway, label)%>%unique%>%rename(pathway2=pathway, label2=label))


    ## this is for ehn we select interactions with both nodes to be pathways enriched
    lonely_nodes = gsep %>% subset(!label%in%subnet$label1 & !label%in%subnet$label2) %>% select(label) %>% unique

    ## Number of nodes
    length(unique(c(subnet$label1, subnet$label2)))
    length(unique(c(subnet$path1, subnet$path2)))

    ## Matrix for plotting
    toPlot = gsep %>% select(label, sample) %>% unique %>% table()

    # toPlot = gsep %>% select(label, sample) %>% subset(grepl("ER to Golgi Antero", label) | grepl("Signaling by Robo receptor", label) |
    #                                               grepl("RAF/MAP kinase cascade", label) | grepl("Golgi-to-ER retrograde transport", label) |
    #                                               grepl("Interleukin-3, 5 and GM", label) | grepl("Ion channel transport", label) |
    #                                               grepl("N-glycan antennae elongation in the medial/trans-Golgi", label) |
    #                                               grepl("PIP3 activates AKT signaling", label) | grepl("Rap1 signalling", label ) |
    #                                               grepl("Translation initiation complex formation", label)) %>% unique %>% table()

    ## Plot the network and color the group-specif enriched pathways
    ## http://kateto.net/network-visualization
    nodes = data.frame(pathway=unique(c(subnet$label1, subnet$label2, lonely_nodes$label)))%>%mutate(inGS=ifelse(pathway%in%gsep$label, 1, 0))
    #nodes = data.frame(pathway=unique(c(subnet$label1, subnet$label2)))%>%mutate(inGS=ifelse(pathway%in%gsep$label, 1, 0))
    net = graph_from_data_frame(d=subnet%>%select(label1, label2), vertices=nodes,directed=F)
    cls = groups(components(net))
    cls = lapply(seq_along(cls), function(y, n, i){
        ldply(y[[i]]) %>% mutate(cluster=n[[i]])
    }, y=cls, n=names(cls))
    cls = do.call(rbind, cls)

    cls = cls %>% subset(V1%in%rownames(toPlot))
    rnames = cls$V1
    cls = cls %>% select(-V1)
    rownames(cls) = rnames

    ## For each cluster get the unique number of samples and genes
    ## Then exclude those samples and cluster and redo the same
    samples_covered = NULL
    sequence_of_clusters = NULL
    cls_to_check = cls %>% tibble::rownames_to_column() %>% mutate(cluster = as.numeric(cluster))## starting point, this will be updated in the while loop
    gsep_to_check = gsep
    #max_samples = gsep %>% subset(!pathway%in%exclusion_list) %>% select(sample) %>% unique %>% nrow
    max_samples = gsep %>% select(sample) %>% unique %>% nrow

    while(length(samples_covered)<max_samples){
        ## select the cluster with the biggest coverage of samples
        biggest_cluster = NULL
        samples_so_far = NULL
        for(i in unique(cls_to_check$cluster)){
            paths = cls_to_check %>% subset(cluster==i) %>% .$rowname
            samples = gsep_to_check %>% subset(label%in%paths) %>% select(sample) %>% unique %>% .$sample
            if(length(samples)>length(samples_so_far)){
                biggest_cluster = i
                samples_so_far = samples
            }else{
                next
            }
        }

        sequence_of_clusters = c(sequence_of_clusters, biggest_cluster)
        samples_covered = c(samples_covered, samples_so_far)
        print(length(samples_covered))
        ## Now exclude that samples from the gsep
        gsep_to_check = gsep_to_check %>% subset(!(sample%in%samples_covered))
        ## Update cls_to_check by removing the bigest cluster
        cls_to_check = cls_to_check %>% subset(cluster!=biggest_cluster)
    }


    # Generate colors based on media type:
    colrs <- c("gray50", "tomato")
    V(net)$color = ifelse(V(net)$inGS==1, "tomato", "gray50")

    #plot(net, vertex.size=7, vertex.label.cex=0.2)
    tkid <- tkplot(g) #tkid is the id of the tkplot that will open
    ll <- tkplot.getcoords(tkid) # grab the coordinates from tkplot
    plot(g, layout=ll,vertex.size=5, vertex.label.cex=1)

    ## Cluster patients based on the group-specific pathways
    toPlot = toPlot[match(rownames(cls), rownames(toPlot)), ]
    ## Make it table cause somthing wierd is going on with the names if the class of object is table
    attributes(toPlot)$class <- "matrix"


    ## Get all possible intersections of the binary matrix (toPlot)
    toPlot_list = list()
    for(c in colnames(toPlot)){
        toPlot_list[[c]] = names(toPlot[,c][toPlot[,c]==1])
    }


    ## Create annotation of samples
    # sample_ann = mainCohort %>% select(sample, specimen_donor_treatment_type,
    #                                    tumour_grade, donor_relapse_interval,
    #                                    donor_age_at_diagnosis, donor_survival_time, Group)
    sample_ann = mainCohort %>% select(sample,Group)
    sample_ann[sample_ann==""] = NA
    sample_ann = sample_ann %>% subset(sample%in%colnames(toPlot))
    sample_ann = sample_ann[match(sample_ann$sample, colnames(toPlot)), ]


    # ## Create cluser table based on top 10 plus CGC
    # path2clust = gsep %>% select(label, process) %>% unique
    # rnames = path2clust$label
    # path2clust = data.frame(process=path2clust$process)
    # rownames(path2clust) = rnames

    pdf(file = "~/Desktop/all_129_enriched_top_10.pdf", width = 15, height = 15)
    heatmap.2(toPlot, trace="none", col=c("grey50", "red"),margins=c(10,10), cexRow=0.1, key = FALSE, Rowv = FALSE, dendrogram = "column",
              main="All 129 top 15 gene enrichment", RowSideColors=cls$cluster, ColSideColors = sample_ann$col)
    dev.off()


    pdf(file = "~/Desktop/test.pdf", width = 25, height = 15)
    #cls.cols = c(brewer.pal(8, "Accent"), brewer.pal(8, "Dark2"), brewer.pal(12, "Paired"), brewer.pal(12, "Set3"), brewer.pal(5, "Set1"))
    cls.cols = rainbow(length(unique(cls$cluster)))

    ha = HeatmapAnnotation(df = data.frame(group = sample_ann$Group),col = list(group = c("Mutagenic" =  "green", "DDR impaired" = "purple", "C>A/T dominant"="orange")))
    # ha = HeatmapAnnotation(df = sample_ann %>% select(-sample) %>% data.frame(), col = list(Group = c("Mutagenic" =  "green", "DDR impaired" = "purple", "C>A/T dominant"="orange"),
    #                                                                                         tumour_grade=c("3"="black", "2"="red", "1"="blue", "unknown"="grey50", "needs cpath clarification"="green")))
    Heatmap(toPlot,
            row_names_gp = gpar(fontsize=6),
            column_names_gp = gpar(fontsize=7),
            col=c("white", "red"),
            top_annotation = ha,
            #cluster_rows = FALSE,
            #cluster_columns = FALSE,
            #column_order = samples2containers$sample,
            #row_order = gsep_plus_containers %>% select(label, samples.container) %>% unique %>% arrange(desc(samples.container)) %>% .$label,
            column_title = 'GSEA on 120; No Groups considered; Top 10 genes plus CGC with amplifications; \n No level 1/2 reactome pathways; No pathways with less than 12 samples; Pathway containers') +
    Heatmap(cls$cluster, name="Pathway cluster",col = cls.cols, width = unit(5, "mm"), heatmap_legend_param = list(ncol=6))
    dev.off()



    ## Plot/cluster the network using RedeR package
    check = l[["container_overlap"]] %>% subset(V1>3)
    net = graph_from_data_frame(d=check%>%select(container1, container2),directed=F)
    E(net)$weight = check$V1
    plot(net, edge.width=check$V1, vertex.size=2)
    rdp=RedPort()
    calld(rdp)
    addGraph(rdp, net)
    relax(rdp)


    ## ----------------------------
    ## Plot the network as heatmap
    ## ----------------------------

    # # Re-generate dataframes for both nodes and edges, now containing
    # # calculated network attributes
    # node_list <- get.data.frame(net, what = "vertices")
    #
    # # Determine a community for each edge. If two nodes belong to the
    # # same community, label the edge with that community. If not,
    # # the edge community value is 'NA'
    # edge_list <- get.data.frame(net, what = "edges")
    # # Create a character vector containing every node name
    # all_nodes <- sort(node_list$name)
    #
    # # Adjust the 'to' and 'from' factor levels so they are equal
    # # to this complete list of node names
    # plot_data <- edge_list %>% mutate(
    #     to = factor(to, levels = all_nodes),
    #     from = factor(from, levels = all_nodes))
    #
    # # Create the adjacency matrix plot
    # ggplot(plot_data, aes(x = from, y = to, fill = group)) +
    #     geom_raster() +
    #     theme_bw() +
    #     # Because we need the x and y axis to display every node,
    #     # not just the nodes that have connections to each other,
    #     # make sure that ggplot does not drop unused factor levels
    #     scale_x_discrete(drop = FALSE) +
    #     scale_y_discrete(drop = FALSE) +
    #     theme(
    #         # Rotate the x-axis lables so they are legible
    #         axis.text.x = element_text(angle = 270, hjust = 0),
    #         # Force the plot into a square aspect ratio
    #         aspect.ratio = 1,
    #         # Hide the legend (optional)
    #         legend.position = "none")

    load("~/athena/239_OAC_enriched_pathways_plus_containers_80percent.Rdata")

    ## Decide if you want normalisation or not
    normal = T
    normal = F

    ## Exclude Tp53
    gsep_plus_containers = gsep_plus_containers %>% subset(symbol!="TP53")

    ## Get only CGCs
    gsep_plus_containers = gsep_plus_containers %>% subset(gene_type=="cgc")

    ## Exclude CGCs
    gsep_plus_containers = gsep_plus_containers %>% subset(gene_type!="cgc")

    ## Get a graph where nodes are samples and connections are number of containers
    sample_combns = do.call(rbind, combn(unique(gsep_plus_containers$sample), 2, simplify = FALSE))
    net = NULL
    for(i in 1:nrow(sample_combns)){
        cat(i, "\n")
        if(sample_combns[i,][1]==sample_combns[i,][2]){
            stop("Sample twice")
        }
        common.containers = gsep_plus_containers %>% select(sample, container) %>% unique %>% subset(sample%in%sample_combns[i,]) %>% count(container) %>% subset(n>1) %>% .$container
        union_processes = gsep_plus_containers %>% select(sample, container) %>% unique %>% subset(sample%in%sample_combns[i,]) %>% select(container) %>% unique %>% nrow
        if(normal){
            net = rbind(net, data.frame(sample1=sample_combns[i,][1], sample2=sample_combns[i,][2], common.containers=length(common.containers)/union_processes))
        }else{
            net = rbind(net, data.frame(sample1=sample_combns[i,][1], sample2=sample_combns[i,][2], common.containers=length(common.containers)))
        }
    }
    ## And add self versus self
    for (s in unique(gsep_plus_containers$sample)){
        common.containers = gsep_plus_containers %>% subset(sample==s) %>% select(container) %>% unique %>% .$container
        if(normal){
            net = rbind(net, data.frame(sample1=s, sample2=s, common.containers=1))
        }else{
            net = rbind(net, data.frame(sample1=s, sample2=s, common.containers=length(common.containers)))
        }
    }

    ## Get symmetric matrix
    g <- graph.data.frame(net%>% subset(common.containers>0), directed=FALSE)
    net2 = get.adjacency(g, attr="common.containers", sparse=FALSE)
    library(RColorBrewer)
    library(circlize)
    ## I select the breaks based on the distribution
    up = unname(quantile(c(net2), seq(0, 1, .05))[20])
    p =Heatmap(net2,colorRamp2(breaks = c(seq(0,up, up/10)),
                               colors = c("#053061","#2166ac","#4393c3","#92c5de","#d1e5f0","#f7f7f7","#fddbc7","#f4a582","#d6604d","#f03b20","#bd0026")),
               heatmap_legend_param = list(title="Shared processes/union", color_bar="continuous"),
               row_names_gp = gpar(fontsize = 5), column_names_gp = gpar(fontsize = 5))

    net2 = net2[unlist(row_dend(p)),unlist(column_dend(p))]

    p =Heatmap(net2,colorRamp2(breaks = c(seq(0,up, up/10)),
                               colors = c("#053061","#2166ac","#4393c3","#92c5de","#d1e5f0","#f7f7f7","#fddbc7","#f4a582","#d6604d","#f03b20","#bd0026")),
               heatmap_legend_param = list(title="Shared processes/union", color_bar="continuous"),
               row_names_gp = gpar(fontsize = 5), column_names_gp = gpar(fontsize = 5),
               cell_fun = function(j, i, x, y, w, h, col) {
                   if((i+j)>(nrow(net2))) {
                       grid.rect(x, y, w, h, gp = gpar(fill = "white", col="white"))
                   }
               }
               )


    ## Check groups
    ro = rownames(net2)[unlist(row_order(p))]
    high_sharing = ro[1:which(ro=="LP6005500-DNA_B03")]
    intermediate_sharing = ro[(which(ro=="LP6005500−DNA_B03")+1):which(ro=="LP6005334-DNA_G02")]
    no_sharing = ro[(which(ro=="LP6005500-DNA_B03")+1):length(ro)]

    t = gsep_plus_containers %>% select(sample, container) %>% unique %>% mutate(group=ifelse(sample%in%high_sharing, "high_sharing",
                                                                                              ifelse(sample%in%intermediate_sharing, "intermediate_sharing",
                                                                                                     ifelse(sample%in%no_sharing, "no_sharing",
                                                                                                            ifelse(sample%in%low_sharing, "low_sharing",NA))))) %>%
        group_by(group, container) %>% summarise(n=n()) %>% ungroup() %>% spread(group, n)


    ## Then do a proportion test on the above table
    cats = 2
    t = t_syscans %>% data.frame()
    total = data.frame(type=c("high_sharing", "low_sharing"), total=c(61, 59))
    stats=NULL
    for(i in 1:nrow(t)){
        cat(i, "\t")
        d = t[i,1:(cats+1)] %>% gather(type, value, -container)
        d[is.na(d)] = 0
        d = d %>% left_join(total)
        combs = combn(d$type, 2, simplify=F)
        for (y in 1:length(combs)){
            cat(y, "\t")
            dd = d %>% subset(type%in%combs[[y]])
            pt = prop.test(dd$value, dd$total)
            tb = data.frame(container=unique(dd$container), group1=dd$type[1], group2=dd$type[2],
                            group1_value=dd$value[1], group1_prop=pt$estimate[1], group2_value=dd$value[2],
                            group2_prop=pt$estimate[2], p.value=pt$p.value)
            rownames(tb) = NULL
            stats = rbind(stats, tb)
        }
        cat("\n")
    }

    ## create the table
    ## Get row and column order from Complex heatmap package
    ro = rownames(net2)[unlist(row_order(p))]
    co = rownames(net2)[rev(unlist(column_order(p)))]
    tb = net %>% subset(common.containers.number>0)
    ## reset the rownames
    rownames(tb) = 1:nrow(tb)
    orders = NULL
    for(i in 2:120){
        for(y in 1:(i-1)){
            orders <- c(orders,list(c(ro[i],co[y])))
        }
    }
    orders_final = lapply(orders, function(x){
                            r = rownames(tb %>% subset((sample1==x[1] & sample2==x[2]) | sample1==x[2] & sample2==x[1]))
                            })
    orders_final = unlist(orders_final)
    tb = tb[match(as.numeric(orders_final),rownames(tb)),]
    rownames(tb) = 1:nrow(tb)
    write.table(tb, file="Desktop/sample_shared_containers_network.tsv", sep="\t", quote=F)

    ## It seems that there are three groups (groups extracted by looking the heatmap)
    ## Extract the groups
    ## 1st group
    high_sharing = co[1:which(co=="LP6005500-DNA_H03")]
    limited_sharing = co[(which(co=="LP6005500-DNA_H03")+1):which(co=="LP6005935-DNA_C04")]
    intermediate_sharing = co[(which(co=="LP6005935-DNA_C04")+1):which(co=="SS6003119")]
    no_sharing = co[(which(co=="SS6003119")+1):length(co)]

    ## Number of processes per sample
    samples2info = rbind(gsep_plus_containers %>% subset(sample%in%high_sharing) %>% select(sample, container) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="high-sharing", info_type="containers"),
        gsep_plus_containers %>% subset(sample%in%intermediate_sharing) %>% select(sample, container) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="intermediate-sharing", info_type="containers"),
        gsep_plus_containers %>% subset(sample%in%limited_sharing) %>% select(sample, container) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="limited-sharing", info_type="containers"),
        gsep_plus_containers %>% subset(sample%in%no_sharing) %>% select(sample, container) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="no-sharing", info_type="containers"))

    samples2info = rbind(samples2info, gsep_plus_containers %>% subset(sample%in%high_sharing) %>% select(sample, pathway) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="high-sharing", info_type="pathways"),
                         gsep_plus_containers %>% subset(sample%in%intermediate_sharing) %>% select(sample, pathway) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="intermediate-sharing", info_type="pathways"),
                         gsep_plus_containers %>% subset(sample%in%limited_sharing) %>% select(sample, pathway) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="limited-sharing", info_type="pathways"),
                         gsep_plus_containers %>% subset(sample%in%no_sharing) %>% select(sample, pathway) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="no-sharing", info_type="pathways"))

    ## When you check the CGCs there are samples with 0s so we have to include them
    high_cgcs = gsea.top.10.plus.cgc[["genes"]] %>% subset(sample%in%high_sharing & gene_type=="cgc") %>% select(sample, symbol) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="high-sharing", info_type="CGC genes")
    inter_cgcs = gsea.top.10.plus.cgc[["genes"]] %>% subset(sample%in%intermediate_sharing & gene_type=="cgc") %>% select(sample, symbol) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="intermediate-sharing", info_type="CGC genes")
    limit_cgcs = gsea.top.10.plus.cgc[["genes"]] %>% subset(sample%in%limited_sharing & gene_type=="cgc") %>% select(sample, symbol) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="limited-sharing", info_type="CGC genes")
    no_cgcs = gsea.top.10.plus.cgc[["genes"]] %>% subset(sample%in%no_sharing & gene_type=="cgc") %>% select(sample, symbol) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="no-sharing", info_type="CGC genes")
    missing_samples = no_sharing[!no_sharing%in%no_cgcs$sample]
    no_cgcs = rbind(no_cgcs, data.frame(sample=missing_samples, value=rep(0, length(missing_samples)), sample_type=rep("no-sharing", length(missing_samples)), info_type=rep("CGC genes", length(missing_samples))))


    samples2info = rbind(samples2info,
                         high_cgcs,
                         inter_cgcs,
                         limit_cgcs,
                         no_cgcs
                         )

    ## Genes mapped in enriched pathways
    samples2info = rbind(samples2info, gsep_plus_containers %>% subset(sample%in%high_sharing) %>% select(sample, symbol) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="high-sharing", info_type="Genes in enriched pathways"),
                         gsep_plus_containers %>% subset(sample%in%intermediate_sharing) %>% select(sample, symbol) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="intermediate-sharing", info_type="Genes in enriched pathways"),
                         gsep_plus_containers %>% subset(sample%in%limited_sharing) %>% select(sample, symbol) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="limited-sharing", info_type="Genes in enriched pathways"),
                         gsep_plus_containers %>% subset(sample%in%no_sharing) %>% select(sample, symbol) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="no-sharing", info_type="Genes in enriched pathways"))

    ## CGCs mapped in enriched pathways
    samples2info = rbind(samples2info, gsep_plus_containers %>% subset(gene_type=="cgc") %>% subset(sample%in%high_sharing) %>% select(sample, symbol) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="high-sharing", info_type="CGCs in enriched pathways"),
                         gsep_plus_containers %>% subset(gene_type=="cgc") %>% subset(sample%in%intermediate_sharing) %>% select(sample, symbol) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="intermediate-sharing", info_type="CGCs in enriched pathways"),
                         gsep_plus_containers %>% subset(gene_type=="cgc") %>% subset(sample%in%limited_sharing) %>% select(sample, symbol) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="limited-sharing", info_type="CGCs in enriched pathways"))
    ## Add the no-sharing (there are 3 samples with not CGCs mutated)
    no_cgcs = gsep_plus_containers %>% subset(gene_type=="cgc") %>% subset(sample%in%no_sharing) %>% select(sample, symbol) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="no-sharing", info_type="CGCs in enriched pathways")
    missing_samples = no_sharing[!no_sharing%in%no_cgcs$sample]
    no_cgcs = rbind(no_cgcs, data.frame(sample=missing_samples, value=rep(0, length(missing_samples)), sample_type=rep("no-sharing", length(missing_samples)), info_type=rep("CGCs in enriched pathways", length(missing_samples))))
    samples2info = rbind(samples2info, no_cgcs)

    ## sys-candidates mapped in enriched pathways
    samples2info = rbind(samples2info, gsep_plus_containers %>% subset(gene_type!="cgc") %>% subset(sample%in%high_sharing) %>% select(sample, symbol) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="high-sharing", info_type="sys-candidates in enriched pathways"),
                         gsep_plus_containers %>% subset(gene_type!="cgc") %>% subset(sample%in%intermediate_sharing) %>% select(sample, symbol) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="intermediate-sharing", info_type="sys-candidates in enriched pathways"),
                         gsep_plus_containers %>% subset(gene_type!="cgc") %>% subset(sample%in%limited_sharing) %>% select(sample, symbol) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="limited-sharing", info_type="sys-candidates in enriched pathways"),
                         gsep_plus_containers %>% subset(gene_type!="cgc") %>% subset(sample%in%no_sharing) %>% select(sample, symbol) %>% unique %>% group_by(sample) %>% summarise(value=n()) %>% mutate(sample_type="no-sharing", info_type="sys-candidates in enriched pathways"))


    ## Fix the order of the facets
    samples2info$info_type = factor(samples2info$info_type, levels=c("containers","pathways","CGC genes", "CGCs in enriched pathways", "Genes in enriched pathways", "sys-candidates in enriched pathways"))
    ggplot(samples2info, aes(x=sample_type, y=value, fill=sample_type)) + geom_boxplot() + facet_wrap(~info_type, ncol = 2, scales = "free")


    ## Check if on average CGCs, sys-candidates map to more/less pathways in each group
    ## CGCs mapped in enriched pathways
    genes2info = rbind(gsep_plus_containers %>% subset(gene_type=="cgc") %>% subset(sample%in%high_sharing) %>% select(symbol, container) %>% unique %>% group_by(symbol) %>% summarise(value=n()) %>% mutate(sample_type="high-sharing", info_type="Enriched pathways per CGCs"),
                         gsep_plus_containers %>% subset(gene_type=="cgc") %>% subset(sample%in%intermediate_sharing) %>% select(symbol, container) %>% unique %>% group_by(symbol) %>% summarise(value=n()) %>% mutate(sample_type="intermediate-sharing", info_type="Enriched pathways per CGCs"),
                         gsep_plus_containers %>% subset(gene_type=="cgc") %>% subset(sample%in%limited_sharing) %>% select(symbol, container) %>% unique %>% group_by(symbol) %>% summarise(value=n()) %>% mutate(sample_type="limited-sharing", info_type="Enriched pathways per CGCs"),
                         gsep_plus_containers %>% subset(gene_type=="cgc") %>% subset(sample%in%no_sharing) %>% select(symbol, container) %>% unique %>% group_by(symbol) %>% summarise(value=n()) %>% mutate(sample_type="no-sharing", info_type="Enriched pathways per CGCs"))


    ## sys-candidates mapped in enriched pathways
    genes2info = rbind(genes2info, gsep_plus_containers %>% subset(gene_type!="cgc") %>% subset(sample%in%high_sharing) %>% select(symbol, container) %>% unique %>% group_by(symbol) %>% summarise(value=n()) %>% mutate(sample_type="high-sharing", info_type="Enriched pathways per sys-candidates"),
                         gsep_plus_containers %>% subset(gene_type!="cgc") %>% subset(sample%in%intermediate_sharing) %>% select(symbol, container) %>% unique %>% group_by(symbol) %>% summarise(value=n()) %>% mutate(sample_type="intermediate-sharing", info_type="Enriched pathways per sys-candidates"),
                         gsep_plus_containers %>% subset(gene_type!="cgc") %>% subset(sample%in%limited_sharing) %>% select(symbol, container) %>% unique %>% group_by(symbol) %>% summarise(value=n()) %>% mutate(sample_type="limited-sharing", info_type="Enriched pathways per sys-candidates"),
                         gsep_plus_containers %>% subset(gene_type!="cgc") %>% subset(sample%in%no_sharing) %>% select(symbol, container) %>% unique %>% group_by(symbol) %>% summarise(value=n()) %>% mutate(sample_type="no-sharing", info_type="Enriched pathways per sys-candidates"))
    ggplot(genes2info, aes(x=sample_type, y=value, fill=sample_type)) + geom_boxplot() + facet_wrap(~info_type, ncol = 1, scales = "free")


    ## Percentage with TP53 in each group
    gsep_plus_containers %>% subset(sample%in%high_sharing & symbol=="TP53") %>% select(sample) %>% unique %>% nrow
    gsep_plus_containers %>% subset(sample%in%intermediate_sharing & symbol=="TP53") %>% select(sample) %>% unique %>% nrow
    gsep_plus_containers %>% subset(sample%in%limited_sharing & symbol=="TP53") %>% select(sample) %>% unique %>% nrow
    gsep_plus_containers %>% subset(sample%in%no_sharing & symbol=="TP53") %>% select(sample) %>% unique %>% nrow

    ## Number of processes that genes are mapped
    gsep_plus_containers %>% subset(sample%in%high_sharing) %>% select(symbol, container) %>% unique %>% group_by(symbol) %>% summarise(n=n()) %>% .$n %>% summary()
    gsep_plus_containers %>% subset(sample%in%intermediate_sharing) %>% select(symbol, container) %>% unique %>% group_by(symbol) %>% summarise(n=n()) %>% nrow
    gsep_plus_containers %>% subset(sample%in%limited_sharing) %>% select(symbol, container) %>% unique %>% group_by(symbol) %>% summarise(n=n()) %>% nrow
    gsep_plus_containers %>% subset(sample%in%no_sharing) %>% select(symbol, container) %>% unique %>% group_by(symbol) %>% summarise(n=n()) %>% nrow

    high_sharing_processes = gsep_plus_containers %>% subset(sample%in%high_sharing) %>% select(sample, container) %>% unique %>% count(container) %>% arrange(desc(n)) %>% rename(high_sharing=n)
    intermediate_sharing_processes = gsep_plus_containers %>% subset(sample%in%intermediate_sharing) %>% select(sample, container) %>% unique %>% count(container) %>% arrange(desc(n)) %>% rename(intermediate_sharing=n)
    groups2processes = high_sharing_processes %>% left_join(intermediate_sharing_processes)
    limited_sharing_processes = gsep_plus_containers %>% subset(sample%in%limited_sharing) %>% select(sample, container) %>% unique %>% count(container) %>% arrange(desc(n)) %>% rename(limited_sharing=n)
    groups2processes = groups2processes %>% left_join(limited_sharing_processes)
    no_sharing_processes = gsep_plus_containers %>% subset(sample%in%no_sharing) %>% select(sample, container) %>% unique %>% count(container) %>% arrange(desc(n)) %>% rename(no_sharing=n)
    groups2processes = groups2processes %>% left_join(no_sharing_processes)
    write.table(groups2processes, file="Desktop/groups2processes.tsv", row.names = F, sep = "\t", quote = F)

    ## Get percentages
    groups2processes_perc = groups2processes %>% mutate(high_sharing=high_sharing/43, intermediate_sharing=intermediate_sharing/19, limited_sharing=limited_sharing/33, no_sharing=no_sharing/25)
    rnames = groups2processes_perc$container
    groups2processes_perc = groups2processes_perc[,c(2:5)] %>% data.frame()
    rownames(groups2processes_perc) = rnames
    heatmap.2(as.matrix(groups2processes_perc), col="greenred",cexCol = 1, trace = "none",margins = c(10, 35), keysize = 1, main = "Fraction of samples with each \n container alter")

    ## Network visualisation
    netwrk =graph_from_data_frame(net%>%subset(common.containers>0)%>%select(sample1, sample2),directed=F)
    E(netwrk)$weigth = net%>%subset(common.containers>0) %>% .$common.containers
    E(netwrk)$width = net%>%subset(common.containers>0) %>% .$common.containers
    plot(netwrk, vertex.size=2, edge.width=E(netwrk)$weight)


    ggnet2(netwrk, edge.size = toPlot$common.containers)
    E(netwrk)$weight = net$common.containers
    plot(netwrk, edge.width=net$common.containers, vertex.size=2)
    rdp=RedPort()
    calld(rdp)
    addGraph(rdp, netwrk)
    relax(rdp)

}

getVennOfPathways = function(){
    ## Interection of group-specific pathways and the enriched pathways of all the samples together

    ## Top 10 only
    top5 = read.table("Desktop/enriched_paths2samples_CGC_plus_top5syscans_noGroups_fdr_0.01_pathwayFilters_corrected.tsv", quote = "", sep="\t", header = T)
    t1 = top5 %>% select(pathway) %>% unique %>% mutate(inCGCplusTop5=1)
    top10 = read.table("Desktop/enriched_paths2samples_CGC_plus_top10syscans_noGroups_fdr_0.01_pathwayFilters_corrected.tsv", quote = "", sep="\t", header = T)
    t2 = top10 %>% select(pathway) %>% unique %>% mutate(inCGCplusTop10=1)
    top15 = read.table("Desktop/enriched_paths2samples_CGC_plus_top15syscans_noGroups_fdr_0.01_pathwayFilters_corrected.tsv", quote = "", sep="\t", header = T)
    t3 = top15 %>% select(pathway) %>% unique %>% mutate(inCGCplusTop15=1)
    top25 = read.table("Desktop/enriched_paths2samples_CGC_plus_top25syscans_noGroups_fdr_0.01_pathwayFilters_corrected.tsv", quote = "", sep="\t", header = T)
    t4 = top25 %>% select(pathway) %>% unique %>% mutate(inCGCplusTop25=1)
    top50 = read.table("Desktop/enriched_paths2samples_CGC_plus_top50syscans_noGroups_fdr_0.01_pathwayFilters_corrected.tsv", quote = "", sep="\t", header = T)
    t5 = top50 %>% select(pathway) %>% unique %>% mutate(inCGCplusTop50=1)

    t = t1 %>% full_join(t2) %>% full_join(t3) %>% full_join(t4) %>% full_join(t5)
    t = t %>% left_join(procs)

    write.xlsx(t, file="Desktop/Pathway_intersection_CGCPlusTopX.xlsx", row.names = F)


    top = read.table("Desktop/enriched_paths2samples_CGC_plus_top10syscans_noGroups_fdr_0.01_pathwayFilters_corrected.tsv", quote = "", sep="\t", header = T)
    t1 = top10 %>% select(pathway) %>% unique %>% mutate(inCGCplusTop10=1)
    top10 = read.table("Desktop/enriched_paths2samples_CGC_plus_top10syscans_noGroups_fdr_0.01_pathwayFilters_corrected.tsv", quote = "", sep="\t", header = T)
    t1 = top10 %>% select(pathway) %>% unique %>% mutate(inCGCplusTop10=1)


    onlytop10 = onlytop10 %>% left_join(paths2s2g2)

    paths_rec2 = data.frame(pathway=unique(onlytop10$pathway), inOnlyTop10=1)

    ## CGC plus top10
    plusCGC = read.table("Desktop/enriched_paths2samples_CGC_plus_top10syscans_noGroups_fdr_0.01.txt", quote = "", sep="\t", header = T) %>% select(-kernels_predicted, -kernels_predicted_no)
    paths2s2g3 = plusCGC %>% group_by(pathway) %>% summarise(samplesCGCplusTop10=length(unique(sample)), genesCGCplusTop10=length(unique(entrez))) %>% ungroup()
    paths2s2g3 = paths2s2g3 %>% subset(samplesCGCplusTop10>11)
    plusCGC = plusCGC %>% subset(pathway%in%paths2s2g3$pathway)
    plusCGC = plusCGC %>% left_join(paths2s2g3)

    paths_rec3 = data.frame(pathway=unique(plusCGC$pathway), inCGCplusTop10=1)

    ## Top 10 including CGC
    withCGC = read.table("Desktop/enriched_paths2samples_top10syscans_withCGC_noGroups_fdr_0.01.txt", quote = "", sep="\t", header = T) %>% select(-kernels_predicted, -kernels_predicted_no)
    paths2s2g4 = withCGC %>% group_by(pathway) %>% summarise(samplesTop10withCGC=length(unique(sample)), genesTop10withCGC=length(unique(entrez))) %>% ungroup()
    paths2s2g4 = paths2s2g4 %>% subset(samplesTop10withCGC>11)
    withCGC = withCGC %>% subset(pathway%in%paths2s2g4$pathway)
    withCGC = withCGC %>% left_join(paths2s2g4)

    paths_rec4 = data.frame(pathway=unique(withCGC$pathway), inTop10withCGC=1)


    ## Secrier 31 drivers
    # secrier31 = read.table("Desktop/enriched_paths2samples_noGroups_Secrier31Drivers_fdr_0.01.tsv", quote = "", sep="\t", header = T) %>% select(-kernels_predicted, -kernels_predicted_no)
    # paths2s2g5 = secrier31 %>% group_by(pathway) %>% summarise(samplesSecrier31=length(unique(sample)), genesSecrier31=length(unique(entrez))) %>% ungroup()
    # paths2s2g5 = paths2s2g5 %>% subset(samplesSecrier31>11)
    # secrier31 = secrier31 %>% subset(pathway%in%paths2s2g5$pathway)
    # secrier31 = secrier31 %>% left_join(paths2s2g5)
    #
    # paths_rec5 = data.frame(pathway=unique(secrier31$pathway), inSecrier31=1)

    ## All genes
    all_genes = read.table("Desktop/enriched_paths2samples_noGroups_all_genes_plus_CGC_withoutAmplifications_fdr_0.01.tsv", quote = "", sep="\t", header = T) %>% select(-kernels_predicted, -kernels_predicted_no)
    paths2s2g5 = all_genes %>% group_by(pathway) %>% summarise(samplesAllgenes=length(unique(sample)), genesAllgenes=length(unique(entrez))) %>% ungroup()
    paths2s2g5 = paths2s2g5 %>% subset(samplesAllgenes>11)
    all_genes = all_genes %>% subset(pathway%in%paths2s2g5$pathway)
    all_genes = all_genes %>% left_join(paths2s2g5)

    paths_rec5 = data.frame(pathway=unique(all_genes$pathway), inAllgenes31=1)

    paths_rec = paths_rec %>% full_join(paths_rec2) %>% full_join(paths_rec3) %>% full_join(paths_rec4) %>% full_join(paths_rec5)
    ## Get frist level 1 and level 2 pathways from config
    paths_rec = paths_rec %>% mutate(Level_1=ifelse(pathway%in%level_1_nodes, TRUE, FALSE), Level_2=ifelse(pathway%in%level_2_nodes, TRUE, FALSE))

    paths_rec = paths_rec %>% left_join(paths2s2g1) %>% left_join(paths2s2g2) %>% left_join(paths2s2g3) %>% left_join(paths2s2g4) %>% left_join(paths2s2g5)

    write.table(paths_rec, file="Desktop/paths_rec.tsv", sep="\t", row.names = F, quote=F)
    write.table(onlyCGC, file="Desktop/onlyCGC.tsv", row.names = F, quote = F, sep="\t")
    write.table(onlytop10, file="Desktop/onlytop10.tsv", row.names = F, quote = F, sep="\t")
    write.table(plusCGC, file="Desktop/plusCGC.tsv", row.names = F, quote = F, sep="\t")
    write.table(withCGC, file="Desktop/withCGC.tsv", row.names = F, quote = F, sep="\t")
    write.table(secrier31, file="Desktop/secrier31.tsv", row.names = F, quote = F, sep="\t")
    ## Combine all 3 in one table
    t = onlyCGC %>% full_join(plusCGC) %>% full_join(onlytop10) %>% full_join(withCGC) %>% left_join(paths_rec)



    library(gplots)

    venn(list(onlyCGC=unique(onlyCGC%>% .$pathway),
              onlytop10=unique(onlytop10%>% .$pathway),
              top10plusCGC=unique(plusCGC%>% .$pathway),
              top10withCGC=unique(withCGC%>% .$pathway),
              Allgenes=unique(all_genes%>% .$pathway)))

    venn(list(onlyCGC=unique(onlyCGC%>%subset(!pathway%in%c(level_1_nodes, level_2_nodes))%>% .$pathway),
              onlytop10=unique(onlytop10%>%subset(!pathway%in%c(level_1_nodes, level_2_nodes))%>% .$pathway),
              top10plusCGC=unique(plusCGC%>%subset(!pathway%in%c(level_1_nodes, level_2_nodes))%>% .$pathway),
              top10withCGC=unique(withCGC%>%subset(!pathway%in%c(level_1_nodes, level_2_nodes))%>% .$pathway),
              Allgenes=unique(all_genes%>%subset(!pathway%in%c(level_1_nodes, level_2_nodes))%>% .$pathway)))



    ## Fixing the table (pathway filters/pathway groups/ranks of CGCs to NA etc)
    d = read.table("Desktop/enriched_paths2samples_CGC_plus_top5syscans_noGroups_fdr_0.01.txt", quote = "", sep="\t", header = T) %>% select(-kernels_predicted, -kernels_predicted_no)
    d1 = d %>% subset(gene_type=="cgc") %>% select(-patient.rank) %>% mutate(patient.rank=NA)
    d2 = d %>% subset(gene_type!="cgc") %>% select(-patient.rank)
    genes2rank = syscan %>% select(sample, symbol, score) %>% arrange(sample, score) %>% group_by(sample) %>% mutate(patient.rank=row_number(-score)) %>% ungroup %>% select(-score) %>% data.frame()
    d2 = d2 %>% left_join(genes2rank)

    paths2s2g = d %>% group_by(pathway) %>% summarise(samples=length(unique(sample)), genes=length(unique(entrez))) %>% ungroup()
    paths2s2g = paths2s2g %>% subset(samples>11)
    paths2s2g = paths2s2g %>% mutate(Level_1=ifelse(pathway%in%level_1_nodes, TRUE, FALSE), Level_2=ifelse(pathway%in%level_2_nodes, TRUE, FALSE))
    paths2s2g = paths2s2g %>% subset(Level_1==FALSE & Level_2==FALSE)
    d = rbind(d1, d2) %>% subset(pathway%in%paths2s2g$pathway) %>% left_join(paths2s2g)

    ## Add pathway groups
    paths2func = read.xlsx("Desktop/Pathways2Processes_CGC_AND_TOP10_2203.xlsx", 1) %>% rename(pathway=Pathways.enriched.in.CGC_Plus_TopTen)
    top10 = top10 %>% left_join(paths2func)
    top5 = top5 %>% left_join(paths2func)
    top15 = top15 %>% left_join(paths2func)


    write.table(d, file="Desktop/enriched_paths2samples_CGC_plus_top10syscans_noGroups_fdr_0.01_pathwayFilters_corrected.tsv", row.names = F, sep="\t", quote = F)

    t1 = top5 %>% select(pathway) %>% unique %>% mutate(in_Top5=1)
    t2 = top10 %>% select(pathway) %>% unique %>% mutate(in_Top10=1)
    t3 = top15 %>% select(pathway) %>% unique %>% mutate(in_Top15=1)
    t = t1 %>% full_join(t2) %>% full_join(t3)
    write.xlsx(t, file="Desktop/Pathway_intersection_CGCPlusTopX.xlsx", row.names = F)




}

## For the pathways that are the same among the three groups
## Map the genes altered onto the pathways
annotatePathways = function(paths=read.delim(pipe("pbpaste"), header=F), group="c2a", save.dir="~/Desktop/results", length.cutoff=500){
    ## pathway visualisation
    require(ReactomePA)

    ## Get the pathways
    if(group=="mutagenic"){
        oneList = read.delim("Desktop/oneList_mutagenic.txt", header = T, sep = "\t") %>% subset(!is.na(pathway))
        genes = oneList %>% dplyr::select(entrez, score) %>% group_by(entrez) %>% summarise(score=mean(score)) %>% ungroup()
        g = genes$score
        names(g) = genes$entrez
    }else if(group=="c2a"){
        oneList = read.delim("Desktop/oneList_c2adom.txt", header = T, sep = "\t") %>% subset(!is.na(pathway))
        genes = oneList %>% dplyr::select(entrez, score) %>% group_by(entrez) %>% summarise(score=mean(score)) %>% ungroup()
        g = genes$score
        names(g) = genes$entrez
    }else if(group=="ddr"){
        oneList = read.delim("Desktop/oneList_ddr.txt", header = T, sep = "\t") %>% subset(!is.na(pathway))
    }

    ## Create the output directory
    dir.create(save.dir, showWarnings = FALSE)

    for (p in paths$V1){
        pdf(paste0(save.dir, "/", gsub("\\/", "_", p), "_", group, ".pdf"))
        genes = oneList %>% subset(pathway==p) %>% dplyr::select(entrez, score) %>% group_by(entrez) %>% summarise(score=mean(score)) %>% ungroup()
        g = genes$score
        names(g) = genes$entrez
        viewPathway(p, readable=TRUE, main=paste0(p, "(", length(g), ")"), foldChange = g, vertex.label.font = 0.5, vertex.label.cex=0.4)
        dev.off()
    }
}

## I just dumped code here that I used to analyse pathways/mappings to samples etc
pathwayMetaAnalysis = function(){

    ## Check union of samples and genes in the enriched pathways
    setwd("~/Desktop/")

    oneList = read.delim("oneList_ddr.txt", header = T, sep = "\t")
    top10 = read.delim("top10_ddr.txt", header = T, sep = "\t") %>% subset(!is.na(pathway))

    ## Read in the pathways of the cluster - usually imported with copy from excel
    paths = read.delim(pipe("pbpaste"), header=T)
    paths
    ## Union of samples
    s1 = oneList %>% subset(pathway%in%paths$V1) %>% select(sample) %>% unique %>% .$sample
    s2 = top10 %>% subset(pathway%in%paths$V1) %>% select(sample) %>% unique %>% .$sample
    length(unique(c(s1, s2)))
    ## union of genes
    g1 = oneList %>% subset(pathway%in%paths$V1) %>% select(entrez) %>% unique %>% .$entrez
    oneList %>% subset(pathway%in%paths$V1) %>% select(symbol) %>% unique %>% nrow

    g2 = top10 %>% subset(pathway%in%paths$V1) %>% select(entrez) %>% unique %>% .$entrez
    top10 %>% subset(pathway%in%paths$V1) %>% select(symbol) %>% unique %>% nrow
    length(unique(c(g1, g2)))



    ## Get a venn-like graph for the pathway clusters
    d = read.delim("Desktop/pathway_clustering_c2a.txt", sep = "\t", header = T) %>% subset(pathway!="")
    oneList = read.delim("Desktop/oneList_c2adom.txt", header = T, sep = "\t")
    top10 = read.delim("Desktop/top10_c2adom.txt", header = T, sep = "\t") %>% subset(!is.na(pathway))

    ## Get a sample-based list with all the pathways
    s1 = oneList %>% subset(pathway%in%d$pathway) %>% dplyr::select(sample, pathway) %>% left_join(d%>%dplyr::select(pathway, Group.name))
    s2 = top10 %>% subset(pathway%in%d$pathway) %>% dplyr::select(sample, pathway)  %>% left_join(d%>%dplyr::select(pathway, Group.name))

    ## Before we collapse and count the combinations
    # clusters = c("MB", "GE", "ML", "CA")
    # samples = sort(unique(s$sample))
    # samples_covered = NULL
    # for (cl in clusters){
    #     cat(cl, "\n")
    #     add = s %>% subset(Group.name==cl) %>% select(sample) %>% unique %>% .$sample
    #     samples_covered = c(samples_covered, add)
    #     print(length(unique(samples_covered)))
    #     if(length(unique(samples_covered))==37){
    #         stop("reached maximum samples")
    #     }
    #
    # }

    s = rbind(s1, s2) %>% group_by(sample) %>% summarise(paths=paste(sort(unique(Group.name)), collapse=","))
    s$group.no = apply(s, 1, function(x) length(unlist(strsplit(x[2], split = ","))))
    s %>% count(paths) %>% data.frame()

    ## Create a list of all possible combinations of pathways
    i=1
    max_combs = 5
    l = list()
    while (i <= max_combs){
        cat(i, "\n")
        l[[i]] = split(combn(unique(d$Group.name), i), rep(1:ncol(combn(unique(d$Group.name), i)), each = nrow(combn(unique(d$Group.name), i))))
        i = i + 1
    }

    ## search for patterns and count how many time all patterns are found
    ## This is basically the intersection
    p = list()
    for (i in 1:length(l)){
        for(y in 1:length(l[[i]])){
            p[[paste(l[[i]][[y]], collapse = "&")]] = apply(s, 1, function(x) colSums(data.frame(sapply(l[[i]][[y]], grepl, x[2], simplify = T)))==length(l[[i]][[y]]))
        }
    }
    ## Keep only those with at least one TRUE
    p = p[unlist(lapply(p, function(x) length(x[x==TRUE])>=1))]
    p = lapply(p, function(x) length(x[x==TRUE]))
    p = lapply(seq_along(p), function(y,n,i){
        ldply(y[[i]]) %>% mutate(groups=n[[i]])
    }, y=p, n=names(p))
    p = do.call(rbind, p)

    ## lets do the union
    pu = list()
    for (i in 1:length(l)){
        for(y in 1:length(l[[i]])){
            pu[[paste(l[[i]][[y]], collapse = "&")]] = apply(s, 1, function(x) colSums(data.frame(sapply(l[[i]][[y]][l[[i]][[y]]!=""], grepl, x[2], simplify = T)))>=1)
        }
    }
    ## Keep only those with at least one TRUE
    pu = pu[unlist(lapply(pu, function(x) length(x[x==TRUE])>=1))]
    pu = lapply(pu, function(x) length(x[x==TRUE]))
    pu = lapply(seq_along(pu), function(y,n,i){
        ldply(y[[i]]) %>% mutate(groups=n[[i]])
    }, y=pu, n=names(pu))
    pu = do.call(rbind, pu)

    colnames(p) = c("intersection", "groups")
    colnames(pu) = c("union", "groups")
    pa = p %>% full_join(pu)
    pa$groups.no = apply(pa, 1, function(x) length(unlist(strsplit(x[2], split = "&"))))
    write.table(pa%>%dplyr::select(intersection, union, groups, groups.no), file="Desktop/pathway_cluster_combinations_c2adom.tsv", quote = F, row.names = F, sep = "\t")
}


## Check in which pathways and samples KAT2A, KAT2B and related genes are
analyzeKATs = function(geneInfo_fn="/Users/fc-8s-imac/athena/data/geneInfoNCG5.Rdata",
                       gene_sets_dir="/Users/fc-8s-imac/athena/data/geneSets/reactome_12_12_16/",
                       gene_sets_gene_start=4,
                       gene_sets_identifiers = "reactome",
                       sub_identifier="symbol",
                       scores_fn="/Users/fc-8s-imac/athena/data/OAC/129_OAC/ML/OAC/syscans_4_with_gains_in_sys_cans.Rdata",
                       cohort_fn="/Users/fc-8s-imac/athena/data/OAC/129_OAC/Rdata/mainCohort.Rdata",
                       remove.amplifications=T,
                       remove.cgc=T){

    getPathway = function(pathway="kegg", sub=sub_identifier, genes_start=gene_sets_gene_start,
                          path_dir=gene_sets_dir,
                          gi_fn=geneInfo_fn){

        make_pathway_list = function(pathMsigDbFile) {
            inputFile <- pathMsigDbFile
            con  <- file(inputFile, open = "r")
            c = 1
            pathway.list <- vector(mode="list",length=0)
            message("Loading gene sets....")
            while (length(oneLine <- readLines(con, n = 1, warn = FALSE)) > 0) {
                myVector <- do.call("rbind",strsplit(oneLine, "\t"))
                t = vector(mode="list",length=1)
                t[[1]] = myVector[genes_start:length(myVector)]
                names(t) = myVector[1]
                pathway.list = c(pathway.list,t)
                c = c+1
            }

            close(con)
            return(pathway.list)
        }

        fns = list.files(path_dir)
        pattern=paste0(pathway, ".*", sub)
        fn = fns[grepl(pattern, fns)]
        if(length(fn)>1 | length(fn)==0){
            stop("No or multiple files found. Please check the parameters provided")
        }

        message(paste0("Pathway file: ", fn))
        p = make_pathway_list(paste0(path_dir, fn))
        message(paste0("Number of pathways: ", length(p)))
        if(sub=="entrez"){
            genes_p = as.numeric(unique(unlist(p)))
        }else if(sub=="symbol"){
            genes_p = unique(unlist(p))
        }
        message(paste0("Number of genes: ", length(genes_p)))
        ## Get gene info to check overlap with 19014
        load(gi_fn)
        geneInfo = geneInfo %>% subset(duplicability!=-1) %>% select(symbol, entrez)
        if(sub=="entrez"){
            message(paste0("Number of genes (19,014): ", length(genes_p[genes_p%in%geneInfo$entrez])), "\n")
        }else if(sub=="symbol"){
            message(paste0("Number of genes (19,014): ", length(genes_p[genes_p%in%geneInfo$symbol])), "\n")
        }
        return(p)

    }

    load(scores_fn)
    load(geneInfo_fn)
    ## Make sure you update the symbols to work with gene sets with symbols
    syscan = syscan %>% select(-symbol) %>% left_join(geneInfo%>%select(entrez, symbol))
    ## Exclude "CGC discarded"
    syscan = syscan %>% subset(gene_type!="CGC_discarded")

    ## ------ Add mutational signature information to the scores ---------
    load(cohort_fn)
    mainCohort = mainCohort %>% rename(sample=Primary_tumour_solid_tissue)
    syscan = syscan %>% left_join(mainCohort%>%select(sample, Group)%>%unique)

    ## Filters
    if(remove.cgc){
        syscan = syscan %>% subset(gene_type!="cgc")
    }
    if(remove.amplifications){
        syscan = syscan %>% subset(CNVGain!=1)
    }

    ## Get the genes for each group
    caDominant = syscan %>% subset(Group=="C>A/T dominant") %>% subset(gene_type!="CGC_discarded") %>%
        subset(type=="P") %>% select(entrez, symbol, score) %>% group_by(entrez, symbol) %>% summarise(score=mean(score)) %>% ungroup()
    addition = syscan %>% subset(Group=="C>A/T dominant") %>% subset(gene_type!="CGC_discarded") %>%
        subset(type=="C") %>% select(entrez, symbol) %>% unique %>% mutate(score=max(caDominant$score)+1)
    caDominant = rbind(caDominant, addition)
    caDominant = caDominant %>% arrange(desc(score)) %>% mutate(rank=row_number())
    if(sub_identifier=="entrez"){
        caDominant = caDominant %>% select(entrez, score, rank) %>% subset(!is.na(entrez))
    }else if(sub_identifier=="entrez"){
        caDominant = caDominant %>% select(symbol, score, rank) %>% subset(!is.na(symbol))
    }

    ddr = syscan %>% subset(Group=="DDR impaired") %>% subset(gene_type!="CGC_discarded") %>%
        subset(type=="P") %>% select(entrez, symbol, score) %>% group_by(entrez, symbol) %>% summarise(score=mean(score)) %>% ungroup()
    addition = syscan %>% subset(Group=="DDR impaired") %>% subset(gene_type!="CGC_discarded") %>%
        subset(type=="C") %>% select(entrez, symbol) %>% unique %>% mutate(score=max(ddr$score)+1)
    ddr = rbind(ddr, addition)
    ddr = ddr %>% arrange(desc(score)) %>% mutate(rank=row_number())
    if(sub_identifier=="entrez"){
        ddr = ddr %>% select(entrez, score, rank) %>% subset(!is.na(entrez))
    }else if(sub_identifier=="entrez"){
        ddr = ddr %>% select(symbol, score, rank) %>% subset(!is.na(symbol))
    }

    mutagenic = syscan %>% subset(Group=="Mutagenic") %>% subset(gene_type!="CGC_discarded") %>%
        subset(type=="P") %>% select(entrez, symbol, score) %>% group_by(entrez, symbol) %>% summarise(score=mean(score)) %>% ungroup()
    addition = syscan %>% subset(Group=="Mutagenic") %>% subset(gene_type!="CGC_discarded") %>%
        subset(type=="C") %>% select(entrez, symbol) %>% unique %>% mutate(score=max(mutagenic$score)+1)
    mutagenic = rbind(mutagenic, addition)
    mutagenic = mutagenic %>% arrange(desc(score)) %>% mutate(rank=row_number())
    if(sub_identifier=="entrez"){
        mutagenic = mutagenic %>% select(entrez, score, rank) %>% subset(!is.na(entrez))
    }else if(sub_identifier=="entrez"){
        mutagenic = mutagenic %>% select(symbol, score, rank) %>% subset(!is.na(symbol))
    }

    gene_sets = getPathway(pathway = gene_sets_identifiers)
    genes = c("KAT2A", "KAT2B")

    mut_enriched_pathways = read.xlsx("Desktop/mutagenic_enriched_paths.xlsx", 1)
    ca_enriched_pathways = read.xlsx("Desktop/c2a2t_dominant_enriched_paths.xlsx", 1)
    ddr_enriched_pathways = read.xlsx("Desktop/ddr_impaired_enriched_paths.xlsx", 1)

    res = NULL
    for (g in genes){
        cat(g, "\n")
        gs = gene_sets[unlist(lapply(gene_sets, function(x) g%in%x))]
        gs_unlisted = unique(unlist(gs))
        ## Make the list dataframe
        gs = lapply(seq_along(gs), function(y, n, i){
            ldply(y[[i]]) %>% mutate(pathway=n[[i]])
        }, y=gs, n=names(gs))


        gs = do.call(rbind, gs) %>% rename(symbol=V1) %>% mutate(gene.tested=g)

        pats = syscan %>% select(sample) %>% unique %>% .$sample

        for (p in pats){
            cat(p, "\n")
            ## get patient's group
            pgroup = syscan %>% subset(sample==p) %>% select(Group) %>% unique %>% .$Group
            ## get patient's genes
            ## Check which genes are in the combined list because depending on
            ## whether CGCs/amplifications are included the genes will change
            if (pgroup=="Mutagenic" & !is.na(pgroup)){
                gp = syscan %>% subset(sample==p & entrez%in%mutagenic$entrez)
                gp = gp %>% left_join(mutagenic %>% select(-score) %>% rename(rank.in.group=rank))
                enriched_pathways = mut_enriched_pathways
            }else if(pgroup=="C>A/T dominant" & !is.na(pgroup)){
                gp = syscan %>% subset(sample==p & entrez%in%caDominant$entrez)
                gp = gp %>% left_join(caDominant %>% rename(rank.in.group=rank))
                enriched_pathways = ca_enriched_pathways
            }else if(pgroup=="DDR impaired" & !is.na(pgroup)){
                gp = syscan %>% subset(sample==p & entrez%in%ddr$entrez)
                gp = gp %>% left_join(ddr %>% rename(rank.in.group=rank))
                enriched_pathways = ddr_enriched_pathways
            }else if(is.na(pgroup)){
                next
            }

            ## Get ranks in the patient
            gp = gp %>% arrange(desc(score)) %>% mutate(rank=row_number())
            if(sub_identifier=="entrez"){
                gp = gp %>% subset(entrez %in% gs_unlisted) %>% select(sample, entrez, symbol, rank, rank.in.group) %>% rename(rank.in.patient=rank)
            }else if(sub_identifier=="symbol"){
                gp = gp %>% subset(symbol %in% gs_unlisted) %>% select(sample, entrez, symbol, rank, rank.in.group) %>% rename(rank.in.patient=rank)
            }

            ## Add group information
            gp = gp %>% mutate(group=pgroup)
            ## Now bring in information for the pathways
            gp = gp %>% left_join(gs)

            ## Check if the pathway is enriched
            gp = gp %>% mutate(enriched=ifelse(pathway%in%enriched_pathways$pathway, TRUE, FALSE))
            res = rbind(res, gp)
        }
    }
    return(res)

}
## I run this in athena
## Function to create the submission file for randomSamplingGSEA function
createSub = function(genes=804, iters=1000000, chunks=1000, range=NULL, ncore=1, save_dir="~/athena/data/OAC/randomise_GSEA_cmds.txt",
                     res_dir = "/mnt/lustre/users/k1469280/mourikisa/data/OAC/129_OAC/randomise_GSEA"){
    iters_per_fn = iters/chunks
    if(!is.null(range)){ ## Use range in case you want to rename the files cause you want to add more iterations
        if(length(range)==length(1:chunks)){
            chunks=range
        }else{
            stop("Vector length mismatch")
        }
    }else{
        chunks = 1:chunks
    }
    cmds = NULL
    for(i in chunks){
        sub <- paste0("qsub -N job", i," -pe threaded ",ncore," -q FCgroup.q@node042,FCgroup.q@node043 -l h_vmem=12G ./submitRandomiseGSEA.sh ", genes, " ", iters_per_fn, " ", res_dir, " ", i)
        cmds = rbind(cmds, sub)
    }

    write.table(cmds, sep="\t", quote=F, row.names=F, col.names=F, file=save_dir)
}


parseRandomisation = function(res_dir="/mnt/lustre/users/k1469280/mourikisa/data/OAC/129_OAC/randomise_GSEA/",
                              paths = "/home/mourikisa/enriched_paths2samples_CGC_plus_top10syscans_noGroups_fdr_0.01_pathwayFilters_corrected.tsv",
                              save_dir="/home/mourikisa"){
  fns = list.files(res_dir, full.names = TRUE)

  ## Get 239
  d = read.table(paths, header = T, sep="\t")
  d = d %>% select(pathway) %>% unique %>% .$pathway

  rgsea = NULL
  for(fn in fns){
    cat(fn, "\n")
    load(fn)
    res = res[["gsea"]]
    #res = res %>% subset(pathway%in%d)
    rgsea = rbind(rgsea, res)
    gc()
  }

  # pdf(paste0(save_dir, "/frd_dist.pdf"))
  # hist(rgsea$fdr, breaks=100, main = paste0(res%>%.$pathway%>%unique, " \nrandom sampling GSEA FDR"), xlab = "FDR")
  # legend(x="topright",legend=cbind(paste0(names(summary(rgsea$fdr)), ":", summary(rgsea$fdr))))
  # dev.off()

  save(rgsea, file=paste0(save_dir,"/rgsea.Rdata"))

}


## This function was written to a seprate file so it can be run in parallel in athena
## I kept it here for backup
randomSamplingGSEA = function(n=NULL, ## This is the number of genes I random sample
                              times=NULL,
                              geneInfo_fn="~/data/geneInfoNCG5.Rdata",
                              gene_sets_dir="~/data/geneSets/reactome_12_12_16/",
                              gene_sets_gene_start=4,
                              gene_sets_identifiers = "reactome",
                              sub_identifier="symbol",
                              scores_fn="~/data/OAC/129_OAC/ML/OAC/syscans_4_with_gains_in_sys_cans.Rdata",
                              cohort_fn="~/data/OAC/129_OAC/Rdata/mainCohort.Rdata",
                              remove.amplifications=T,
                              remove.cgc=T){

    ## Library cleaning
    detach("package:dplyr", unload = T)
    require(plyr)
    require(tidyr)
    require(dplyr)

    if(is.null(n)){
        stop("No number of genes")
    }

    ## pathway parameter in the following function can be kegg/reactome/all
    ## sub parameter can be entrez/symbol
    getPathway = function(pathway="kegg", sub=sub_identifier, genes_start=gene_sets_gene_start,
                          path_dir=gene_sets_dir,
                          gi_fn=geneInfo_fn){

        make_pathway_list = function(pathMsigDbFile) {
            inputFile <- pathMsigDbFile
            con  <- file(inputFile, open = "r")
            c = 1
            pathway.list <- vector(mode="list",length=0)
            message("Loading gene sets....")
            while (length(oneLine <- readLines(con, n = 1, warn = FALSE)) > 0) {
                myVector <- do.call("rbind",strsplit(oneLine, "\t"))
                t = vector(mode="list",length=1)
                t[[1]] = myVector[genes_start:length(myVector)]
                names(t) = myVector[1]
                pathway.list = c(pathway.list,t)
                c = c+1
            }

            close(con)
            return(pathway.list)
        }

        fns = list.files(path_dir)
        pattern=paste0(pathway, ".*", sub)
        fn = fns[grepl(pattern, fns)]
        if(length(fn)>1 | length(fn)==0){
            stop("No or multiple files found. Please check the parameters provided")
        }

        message(paste0("Pathway file: ", fn))
        p = make_pathway_list(paste0(path_dir, fn))
        message(paste0("Number of pathways: ", length(p)))
        if(sub=="entrez"){
            genes_p = as.numeric(unique(unlist(p)))
        }else if(sub=="symbol"){
            genes_p = unique(unlist(p))
        }
        message(paste0("Number of genes: ", length(genes_p)))
        ## Get gene info to check overlap with 19014
        load(gi_fn)
        geneInfo = geneInfo %>% subset(duplicability!=-1) %>% select(symbol, entrez)
        if(sub=="entrez"){
            message(paste0("Number of genes (19,014): ", length(genes_p[genes_p%in%geneInfo$entrez])), "\n")
        }else if(sub=="symbol"){
            message(paste0("Number of genes (19,014): ", length(genes_p[genes_p%in%geneInfo$symbol])), "\n")
        }
        return(p)

    }

    ## Get the genes to sample from
    load(scores_fn)
    load(geneInfo_fn)
    ## Make sure you update the symbols to work with gene sets with symbols
    syscan = syscan %>% select(-symbol) %>% left_join(geneInfo%>%select(entrez, symbol))
    ## Exclude "CGC discarded"
    syscan = syscan %>% subset(gene_type!="CGC_discarded")

    ## ------ Add mutational signature information to the scores ---------
    load(cohort_fn)
    mainCohort = mainCohort %>% rename(sample=Primary_tumour_solid_tissue)
    syscan = syscan %>% left_join(mainCohort%>%select(sample, Group)%>%unique)

    ## Filters
    if(remove.amplifications){
        syscan = syscan %>% subset(CNVGain!=1)
    }
    if(remove.cgc){
        cgcs = syscan %>% subset(!is.na(Group)) %>% subset(gene_type=="cgc") %>% select(symbol) %>% unique %>% .$symbol
        syscan = syscan %>% subset(gene_type!="cgc")
    }
    ## Exclude samples that do not correspond to
    syscan = syscan %>% subset(!is.na(Group))

    ## get genes for sampling
    genes_to_sample = syscan %>% select(symbol) %>% unique %>% .$symbol
    ## get gene sets
    gene_sets = getPathway(pathway = gene_sets_identifiers)

    ## x is the gene set and y your gene list
    run.hypergeometric = function(x, y, universe=19014){
        P = phyper(length(intersect(x,y)), length(x), universe-length(x), length(y), lower.tail = F)
        p = data.frame(p.value=P, gene.set.length=length(x), genes.in.gene.set=length(intersect(x,y)))
        return(p)
    }

    ex = replicate(times, sample(genes_to_sample, n))
    ex = split(ex, rep(1:ncol(ex), each = nrow(ex)))
    ## Add CGCs in the random samples
    ex = lapply(ex, function(x) x=c(x, cgcs))

    rgsea = lapply(ex, function(x){

                l = lapply(gene_sets, function(y) run.hypergeometric(y, x))
                l = lapply(seq_along(l), function(z, n, i){
                ldply(z[[i]]) %>% mutate(pathway=n[[i]])
                }, z=l, n=names(l))
                l = do.call(rbind, l) %>% rename(value=V1, type=.id)
                l = l %>% unique %>% spread(type, value)

                l$fdr = p.adjust(l$p.value, method="BH")
                l$number.of.genes.tested = length(x)
                return(l)
    })

    ## Make the list of gene set enrichments a table
    rgsea = lapply(seq_along(rgsea), function(y,n,i){
        y[[i]] %>% mutate(iter=n[[i]])
    }, y=rgsea, n=names(rgsea))

    rgsea = do.call(rbind, rgsea)

    return(list(gsea=rgsea, gene_samples=ex, length_cgcs=length(cgcs), length_sys_cans=length(genes_to_sample)))
}

## Use this function to determine enriched pathways given a gene list for OAC subgroups or the total cohort
getTopGenes = function(geneInfo_fn="~/Mountpoints/rosalind_lustre/mourikisa/data/geneInfoNCG5.Rdata",
                       scores_fn="~/Mountpoints/rosalind_lustre/mourikisa/data/OAC/Combined_ICGC_cohort_129_66_71/sysSVM/OAC/syscan_noNotExpressed.Rdata",
                       given_tb=NULL, ## If this is not NULL you do GSEA on all the genes in this table
                       remove.amplifications=F,
                       remove.cgc=T, ## redundant with CGC.plus
                       CGC.plus = F, ## Set it to False to do GSEA only on syscans
                       genes.per.patient=10 ## set it to NULL to do GSEA only on the CGCs
                    ){

    load(geneInfo_fn)
    if(is.null(given_tb)){
        load(scores_fn)
    }else if(!is.null(given_tb)){ ## I do that to give directly prediction or training set
        syscan = given_tb
        ## These tables are the raw input for SVM and they have no symbol
        syscan = syscan %>% tibble::rownames_to_column() %>% separate(rowname, into=c("cancer_type", "sample", "entrez"), sep="\\.")
        syscan = syscan %>% mutate(entrez=as.numeric(entrez)) %>% left_join(geneInfo%>%select(entrez, symbol))
    }

    ## Filters
    if(remove.cgc){
        syscan = syscan %>% subset(gene_type!="cgc")
    }
    if(remove.amplifications){
        syscan = syscan %>% subset(CNVGain!=1)
    }


    genes=NULL
    for(s in unique(syscan$sample)){
        sd = syscan %>% subset(sample==s)
        if(is.null(given_tb)){
            if(CGC.plus & !is.null(genes.per.patient)){
                sd1 = sd %>% subset(gene_type=="cgc")
                sd2 = sd %>% subset(gene_type!="cgc") %>% arrange(desc(score)) %>% slice(1:genes.per.patient)
                sd = rbind(sd1, sd2)
            }else if(CGC.plus==F & !is.null(genes.per.patient)){
                sd = sd %>% subset(gene_type!="cgc") %>% arrange(desc(score)) %>% slice(1:genes.per.patient)
            }else if(is.null(genes.per.patient)){
                sd = sd %>% subset(gene_type=="cgc")
            }
        }else if(!is.null(given_tb)){
            sd = sd
        }
        genes = rbind(genes, sd)
    }

    ## Store this to return also the genes
    genes_to_return = genes

    if(is.null(given_tb)){
        ## Get distribution
        print("CGCs")
        print(genes %>% subset(gene_type=="cgc") %>% group_by(sample) %>% summarise(genes=length(unique(symbol))) %>% ungroup() %>%
                  full_join(syscan%>%select(sample)%>%unique) %>% mutate(genes=ifelse(is.na(genes), 0, genes)) %>% .$genes %>% summary())
        print(genes %>% subset(gene_type=="cgc") %>% select(symbol) %>% unique %>% nrow)
        print("Non CGCs")
        print(genes %>% subset(gene_type!="cgc") %>% group_by(sample) %>% summarise(genes=length(unique(symbol))) %>% ungroup() %>%
                  full_join(syscan%>%subset(gene_type!="cgc")%>%select(sample)%>%unique) %>% mutate(genes=ifelse(is.na(genes), 0, genes)) %>% .$genes %>% summary())
        print(genes %>% subset(gene_type!="cgc") %>% select(symbol) %>% unique %>% nrow)
        print("Samples")
        print(genes %>% select(sample) %>% unique %>% nrow)
    }

    genes = genes$symbol
    ## Make the vector unique because we may have the same gene in two samples
    cat(paste0("Genes extracted: ", length(genes)), "\n")
    genes = unique(genes)
    cat(paste0("Unique genes: ", length(genes)), "\n")

    return(genes_to_return)
}

## Use this function to determine enriched pathways given a gene list for OAC subgroups or the total cohort
pathwayEnrichmentTopGenes = function(geneInfo_fn="~/Mountpoints/rosalind_lustre/mourikisa/data/geneInfoNCG5.Rdata",
                                     gene_sets_dir="~/Mountpoints/rosalind_lustre/mourikisa/data/geneSets/reactome_12_12_16/",
                                     gene_sets_gene_start=4,
                                     gene_sets_identifiers = "reactome",
                                     sub_identifier="symbol",
                                     scores_fn="~/Mountpoints/rosalind_lustre/mourikisa/data/OAC/Combined_ICGC_cohort_129_66_71/sysSVM/OAC/syscan_noNotExpressed.Rdata",
                                     given_tb=NULL, ## If this is not NULL you do GSEA on all the genes in this table
                                     remove.amplifications=T,
                                     remove.cgc=F, ## redundant with CGC.plus
                                     CGC.plus = T, ## Set it to False to do GSEA only on syscans
                                     genes.per.patient=10, ## set it to NULL to do GSEA only on the CGCs
                                     by.groups=F,
                                     ranks=F,
                                     path_length=T,
                                     min_path_length=10,
                                     max_path_length=500,
                                     pathlvls_excl=T,
                                     identifier.file="~/Mountpoints/rosalind_lustre/mourikisa/data/geneSets/reactome_12_12_16/complete_list_of_pathways.tsv",
                                     hierarchy.file="~/Mountpoints/rosalind_lustre/mourikisa/data/geneSets/reactome_12_12_16/pathway_hierarchy.tsv"){

    ## pathway parameter in the following function can be kegg/reactome/all
    ## sub parameter can be entrez/symbol
    getPathway = function(pathway="kegg", sub=sub_identifier, genes_start=gene_sets_gene_start,
                          path_dir=gene_sets_dir,
                          len=path_length,
                          min_len=min_path_length,
                          max_len=max_path_length,
                          gi_fn=geneInfo_fn){

        make_pathway_list = function(pathMsigDbFile) {
            inputFile <- pathMsigDbFile
            con  <- file(inputFile, open = "r")
            c = 1
            pathway.list <- vector(mode="list",length=0)
            message("Loading gene sets....")
            while (length(oneLine <- readLines(con, n = 1, warn = FALSE)) > 0) {
                myVector <- do.call("rbind",strsplit(oneLine, "\t"))
                t = vector(mode="list",length=1)
                t[[1]] = myVector[genes_start:length(myVector)]
                names(t) = myVector[1]
                pathway.list = c(pathway.list,t)
                c = c+1
            }

            close(con)
            return(pathway.list)
        }

        fns = list.files(path_dir)
        pattern=paste0(pathway, ".*", sub)
        fn = fns[grepl(pattern, fns)]
        if(length(fn)>1 | length(fn)==0){
            stop("No or multiple files found. Please check the parameters provided")
        }

        message(paste0("Pathway file: ", fn))
        p = make_pathway_list(paste0(path_dir, fn))


        if(len==TRUE & !is.null(min_len) & !is.null(max_len)){
            p = p[unlist(lapply(p, function(x) length(x)>=min_len))]
            p = p[unlist(lapply(p, function(x) length(x)<=max_len))]
        }

        if(pathlvls_excl){
            hier = read.table(hierarchy.file, sep = "\t", header = F)
            hier = read.table(hierarchy.file, sep = "\t", header = F)
            colnames(hier) = c("path1", "path2")
            ## Careful here cause some pathways have apostrophes in their names
            path_ids = read.table(identifier.file, sep = "\t", header = F, quote = "")
            colnames(path_ids) = c("path_id", "path_name", "species")
            path_ids = path_ids %>% subset(species=="Homo sapiens") %>% rename(pathway=path_name) %>% select(-species)

            ## Exclude leading nad trailing spaces from pathway names
            path_ids$pathway = gsub("^\\s+|\\s+$", "", path_ids$pathway)
            path_ids$path_id = gsub("^\\s+|\\s+$", "", path_ids$path_id)

            ## Add info to herarchy table and subset for the homo sapiens interactions
            hier = hier %>% subset(path1%in%path_ids$path_id | path2%in%path_ids$path_id) %>% left_join(path_ids%>%select(path_id, pathway)%>%rename(path1=path_id, pathway1=pathway)) %>%
                left_join(path_ids%>%select(path_id, pathway)%>%rename(path2=path_id, pathway2=pathway))

            ## Trials to exclude level 1&2 nodes
            level_1_nodes = c("Developmental Biology",
                              "Reproduction",
                              "Circadian Clock",
                              "Hemostasis",
                              "Neuronal System",
                              "Immune System",
                              "Signal Transduction",
                              "Disease",
                              "DNA Repair",
                              "Metabolism",
                              "Gene Expression",
                              "Chromatin organization",
                              "Transmembrane transport of small molecules",
                              "DNA replication",
                              "Metabolism of proteins",
                              "Muscle contraction",
                              "Cell Cycle",
                              "Organelle biogenesis and maintenance",
                              "Mitophagy",
                              "Vesicle-mediated transport",
                              "Programmed Cell Death",
                              "Cellular responses to stress",
                              "Extracellular matrix organization",
                              "Cell-Cell communication")

            level_2_nodes = hier %>% subset(pathway1%in%level_1_nodes | pathway2%in%level_1_nodes)
            level_2_nodes = unique(c(level_2_nodes$pathway1, level_2_nodes$pathway2))
            level_2_nodes = level_2_nodes[!level_2_nodes%in%level_1_nodes]
            p = p[!names(p)%in%level_1_nodes]
            p = p[!names(p)%in%level_2_nodes]

        }

        message(paste0("Number of pathways: ", length(p)))
        if(sub=="entrez"){
            genes_p = as.numeric(unique(unlist(p)))
        }else if(sub=="symbol"){
            genes_p = unique(unlist(p))
        }
        message(paste0("Number of genes: ", length(genes_p)))
        ## Get gene info to check overlap with 19014
        load(gi_fn)
        geneInfo = geneInfo %>% subset(duplicability!=-1) %>% select(symbol, entrez)
        if(sub=="entrez"){
            message(paste0("Number of genes (19,014): ", length(genes_p[genes_p%in%geneInfo$entrez])), "\n")
        }else if(sub=="symbol"){
            message(paste0("Number of genes (19,014): ", length(genes_p[genes_p%in%geneInfo$symbol])), "\n")
        }
        return(p)

    }

    ## x is a vector of genes
    ## y is a list of pathways
    ## identifier can be entrez/symbol
    updatePathway = function(x, y, identifier=sub_identifier, min_len=min_path_length, max_len=max_path_length, len=path_length,
                             gi_fn=geneInfo_fn){

        ## This function removes all genes that are not in the gene list of interest
        ## from the pathways

        if (identifier=="entrez"){
            y = lapply(y, as.numeric)
        }else if (identifier=="symbol"){
            y = lapply(y, as.character)
        }

        ## Update for the list of genes that are in our gene set
        y = lapply(y, function(g) g[g%in%x])
        ## Remove empty sets
        y = y[unlist(lapply(y, function(x) length(x)>0))]

        if(len==TRUE){
            y = y[unlist(lapply(y, function(x) length(x)>min_len))]
            y = y[unlist(lapply(y, function(x) length(x)<max_len))]
        }

        message(paste0("Number of pathways: ", length(y)))
        if(identifier=="entrez"){
            genes_p = as.numeric(unique(unlist(y)))
        }else if(identifier=="symbol"){
            genes_p = unique(unlist(y))
        }
        message(paste0("Number of genes: ", length(genes_p)))
        ## Get gene info to check overlap with 19014
        load(gi_fn)
        geneInfo = geneInfo %>% subset(duplicability!=-1) %>% select(symbol, entrez)
        if(identifier=="entrez"){
            message(paste0("Number of genes (19,014): ", length(genes_p[genes_p%in%geneInfo$entrez])))
        }else if(identifier=="symbol"){
            message(paste0("Number of genes (19,014): ", length(genes_p[genes_p%in%geneInfo$symbol])))
        }


        return(list(gene_sets=y, genes_in_p=length(genes_p)))
    }

    load(geneInfo_fn)
    if(is.null(given_tb)){
        load(scores_fn)
    }else if(!is.null(given_tb)){ ## I do that to give directly prediction or training set
        syscan = given_tb
        ## These tables are the raw input for SVM and they have no symbol
        syscan = syscan %>% tibble::rownames_to_column() %>% separate(rowname, into=c("cancer_type", "sample", "entrez"), sep="\\.")
        syscan = syscan %>% mutate(entrez=as.numeric(entrez)) %>% left_join(geneInfo%>%select(entrez, symbol))
    }

    ## Filters
    if(remove.cgc){
        syscan = syscan %>% subset(gene_type!="cgc")
    }
    if(remove.amplifications){
        syscan = syscan %>% subset(CNVGain!=1)
    }

    ## Exclude samples that do not correspond to
    #syscan = syscan %>% subset(!is.na(Group))

    gene_sets = getPathway(pathway = gene_sets_identifiers)
    genesets2length = lapply(gene_sets, function(x) length(x))
    genesets2length = data.frame(pathway=names(genesets2length), pathway.length=unlist(genesets2length))

    ## Checkpoint here whether the analysis should consider OAC subgroups or not
    if(ranks==F){
        if(by.groups){
            group_enrichment = NULL
            for(g in unique(syscan$Group[!is.na(syscan$Group)])){
                cat(g, "\n")
                gd = syscan %>% subset(Group==g)

                group_genes = NULL
                ## Get the top genes in each sample
                for(s in unique(gd$sample)){
                    sd = gd %>% subset(sample==s)
                    if(CGC.plus){
                        sd1 = sd %>% subset(gene_type=="cgc")
                        sd2 = sd %>% subset(gene_type!="cgc") %>% arrange(desc(score)) %>% slice(1:genes.per.patient)
                        sd = rbind(sd1, sd2)
                    }else{
                        sd = sd %>% arrange(desc(score)) %>% slice(1:genes.per.patient)
                    }
                    group_genes = c(group_genes, sd$symbol)
                }

                ## Make the vector unique because we may have the same gene in two samples
                group_genes = unique(group_genes)

                ## x is the gene set and y your gene list
                run.hypergeometric = function(x, y, universe=19014){
                    P = phyper(length(intersect(x,y)), length(x), universe-length(x), length(y), lower.tail = F)
                    p = data.frame(p.value=P, gene.set.length=length(x), genes.in.gene.set=length(intersect(x,y)))
                    return(p)
                }

                l = lapply(gene_sets, function(x) run.hypergeometric(x, group_genes))
                l = lapply(seq_along(l), function(y, n, i){
                    ldply(y[[i]]) %>% mutate(pathway=n[[i]])
                }, y=l, n=names(l))
                l = do.call(rbind, l) %>% rename(value=V1, type=.id)
                l = l %>% unique %>% spread(type, value)

                l$fdr = p.adjust(l$p.value, method="BH")
                l$number.of.genes.tested = length(group_genes)
                l$group = g
                group_enrichment = rbind(group_enrichment, l)
                return(group_enrichment)
            }
        }else{ ## Do GSEA without considering subgroups
            genes=NULL
            for(s in unique(syscan$sample)){
                sd = syscan %>% subset(sample==s)
                if(is.null(given_tb)){
                    if(CGC.plus & !is.null(genes.per.patient)){
                        sd1 = sd %>% subset(gene_type=="cgc")
                        sd2 = sd %>% subset(gene_type!="cgc") %>% arrange(desc(score)) %>% slice(1:genes.per.patient)
                        sd = rbind(sd1, sd2)
                    }else if(CGC.plus==F & !is.null(genes.per.patient)){
                        sd = sd %>% subset(gene_type!="cgc") %>% arrange(desc(score)) %>% slice(1:genes.per.patient)
                    }else if(is.null(genes.per.patient)){
                        sd = sd %>% subset(gene_type=="cgc")
                    }
                }else if(!is.null(given_tb)){
                    sd = sd
                }
                genes = rbind(genes, sd)
            }

            ## Store this to return also the genes
            genes_to_return = genes

            if(is.null(given_tb)){
                ## Get distribution
                print("CGCs")
                print(genes %>% subset(gene_type=="cgc") %>% group_by(sample) %>% summarise(genes=length(unique(symbol))) %>% ungroup() %>%
                          full_join(syscan%>%select(sample)%>%unique) %>% mutate(genes=ifelse(is.na(genes), 0, genes)) %>% .$genes %>% summary())
                print(genes %>% subset(gene_type=="cgc") %>% select(symbol) %>% unique %>% nrow)
                print(genes %>% subset(gene_type=="cgc" & symbol%in%unique(unlist(gene_sets))) %>% select(symbol) %>% unique %>% nrow)
                print("Non CGCs")
                print(genes %>% subset(gene_type!="cgc") %>% group_by(sample) %>% summarise(genes=length(unique(symbol))) %>% ungroup() %>%
                          full_join(syscan%>%subset(gene_type!="cgc")%>%select(sample)%>%unique) %>% mutate(genes=ifelse(is.na(genes), 0, genes)) %>% .$genes %>% summary())
                print(genes %>% subset(gene_type!="cgc") %>% select(symbol) %>% unique %>% nrow)
                print(genes %>% subset(gene_type!="cgc" & symbol%in%unique(unlist(gene_sets))) %>% select(symbol) %>% unique %>% nrow)
                print("Samples")
                print(genes %>% select(sample) %>% unique %>% nrow)
            }

            genes = genes$symbol
            ## Make the vector unique because we may have the same gene in two samples
            cat(paste0("Genes extracted: ", length(genes)), "\n")
            genes = unique(genes)
            cat(paste0("Unique genes: ", length(genes)), "\n")

            ## Get how many of those are mapped to pathways
            genesInGeneSets = sum(genes%in%unique(unlist(gene_sets)))
            cat(paste0("Unique genes in gene sets: ", genesInGeneSets), "\n")

            ## x is the gene set and y your gene list
            run.hypergeometric = function(x, y, universe=19014){
                P = phyper(length(intersect(x,y)), length(x), universe-length(x), length(y), lower.tail = F)
                p = data.frame(p.value=P, gene.set.length=length(x), genes.in.gene.set=length(intersect(x,y)))
                return(p)
            }

            l = lapply(gene_sets, function(x) run.hypergeometric(x, genes))
            l = lapply(seq_along(l), function(y, n, i){
                ldply(y[[i]]) %>% mutate(pathway=n[[i]])
            }, y=l, n=names(l))
            l = do.call(rbind, l) %>% rename(value=V1, type=.id)
            l = l %>% unique %>% spread(type, value)

            l$fdr = p.adjust(l$p.value, method="BH")
            l$number.of.genes.tested = length(genes)

            ## Add here annotation for the pathway level in reactome
            hier = read.table(hierarchy.file, sep = "\t", header = F)
            hier = read.table(hierarchy.file, sep = "\t", header = F)
            colnames(hier) = c("path1", "path2")
            ## Careful here cause some pathways have apostrophes in their names
            path_ids = read.table(identifier.file, sep = "\t", header = F, quote = "")
            colnames(path_ids) = c("path_id", "path_name", "species")
            path_ids = path_ids %>% subset(species=="Homo sapiens") %>% rename(pathway=path_name) %>% select(-species)

            ## Exclude leading nad trailing spaces from pathway names
            path_ids$pathway = gsub("^\\s+|\\s+$", "", path_ids$pathway)
            path_ids$path_id = gsub("^\\s+|\\s+$", "", path_ids$path_id)

            ## Add info to herarchy table and subset for the homo sapiens interactions
            hier = hier %>% subset(path1%in%path_ids$path_id | path2%in%path_ids$path_id) %>% left_join(path_ids%>%select(path_id, pathway)%>%rename(path1=path_id, pathway1=pathway)) %>%
            left_join(path_ids%>%select(path_id, pathway)%>%rename(path2=path_id, pathway2=pathway))

            ## Trials to exclude level 1&2 nodes
            level_1_nodes = c("Developmental Biology",
                                  "Reproduction",
                                  "Circadian Clock",
                                  "Hemostasis",
                                  "Neuronal System",
                                  "Immune System",
                                  "Signal Transduction",
                                  "Disease",
                                  "DNA Repair",
                                  "Metabolism",
                                  "Gene Expression",
                                  "Chromatin organization",
                                  "Transmembrane transport of small molecules",
                                  "DNA replication",
                                  "Metabolism of proteins",
                                  "Muscle contraction",
                                  "Cell Cycle",
                                  "Organelle biogenesis and maintenance",
                                  "Mitophagy",
                                  "Vesicle-mediated transport",
                                  "Programmed Cell Death",
                                  "Cellular responses to stress",
                                  "Extracellular matrix organization",
                                  "Cell-Cell communication")

            level_2_nodes = hier %>% subset(pathway1%in%level_1_nodes | pathway2%in%level_1_nodes)
            level_2_nodes = unique(c(level_2_nodes$pathway1, level_2_nodes$pathway2))
            level_2_nodes = level_2_nodes[!level_2_nodes%in%level_1_nodes]
            l = l %>% mutate(Level1=ifelse(pathway%in%level_1_nodes, TRUE, FALSE),
                            Level2=ifelse(pathway%in%level_2_nodes, TRUE, FALSE))

            return(list(gsea=l, genes=genes_to_return, paths=gene_sets))
        }
    }else if(ranks==T){

        ## x is the pathway
        ## g is the data frame with ranks
        run.singed.ks = function(x, g, func_fn="/Users/fc-8s-imac/athena/data/OAC/signed_ks.R"){
            source(func_fn)
            if(sub_identifier=="entrez"){
                s = g %>% subset(entrez%in%x) %>% select(rank) %>% .$rank
            }else if(sub_identifier=="symbol"){
                s = g %>% subset(symbol%in%x) %>% select(rank) %>% .$rank
            }

            k <- ks.test.2(s, (1:max(g$rank))[-s], maxCombSize=10^10)
            res = list(es=k$ES, p.value=k$p, no.genes=length(x), ranks=paste(s, collapse=","))
            return(res)
        }

        cat("Rank mode", "\n")
        genes=NULL
        for(s in unique(syscan$sample)){
            sd = syscan %>% subset(sample==s)
            sd = sd %>% arrange(desc(score)) %>% slice(1:genes.per.patient)
            genes = rbind(genes, sd)
        }

        ## Create one list for all the genes
        genes = genes %>% select(symbol, score) %>% group_by(symbol) %>% summarise(score=mean(score)) %>% ungroup()
        genes = genes %>% arrange(desc(score)) %>% mutate(rank=row_number())

        ## Just in case there are duplicated pathways
        dups = names(gene_sets)[duplicated(names(gene_sets))]
        for (d in dups){
            idx = which(names(gene_sets)==d)
            gene_sets[idx[2:length(idx)]] <- NULL
        }

        l = lapply(gene_sets, function(x) length(x))
        l = lapply(seq_along(l), function(y, n, i){
            ldply(y[[i]]) %>% mutate(pathway=n[[i]])
        }, y=l, n=names(l))
        l = do.call(rbind, l) %>% rename(pathway_length=V1)

        if(sub_identifier=="entrez"){
            gene_sets_up = updatePathway(genes$entrez, gene_sets)
        }else if(sub_identifier=="symbol"){
            gene_sets_up = updatePathway(genes$symbol, gene_sets)
        }

        gene_sets_up = gene_sets_up[["gene_sets"]]

        res = lapply(gene_sets_up, function(x) run.singed.ks(x, genes))
        ## Make res data frame to work with
        res = lapply(seq_along(res), function(y, n, i){
            ldply(y[[i]]) %>% mutate(pathway=n[[i]])
        }, y=res, n=names(res))
        res = do.call(rbind, res) %>% spread(.id, V1)
        res$fdr = p.adjust(res$p.value, method="BH")
        res$bonf = p.adjust(res$p.value, method="bonferroni")
        res = res %>% left_join(l)
        return(res)
    }
}

## function to take the distributions of genes mapped etc after the sample-by-sample GSEA
checkSampleBySample = function(gsea, fdr.cutoff=0.1){

    gsea = gsea[["gsea"]]
    ## Number of genes mapped to the pathways
    all_mappings = NULL
    for (g in unique(gsea$group)){
        if(is.na(g)){
            next
        }
        cat(g, "\n")
        group_mappings = NULL
        d = gsea %>% subset(group==g & fdr<fdr.cutoff & es>0)
        for (s in unique(d$sample)){
            cat(s, "\n")
            dd = d %>% subset(sample==s)
            u_genes = dd %>% mutate(ranks=strsplit(ranks, ",")) %>% unnest()
            u_genes = unique(dd$ranks)
            x = data.frame(sample=s, enriched_pathways=nrow(dd), genes_in_pathways=length(u_genes))
            group_mappings = rbind(group_mappings, x)
        }
        group_mappings = group_mappings %>% mutate(group=g)
        all_mappings = rbind(all_mappings, group_mappings)
        rm(group_mappings)
    }


}

comparePathways = function(gsea, r=2, c=2){

    ## Venn diagrams
    par(mfrow=c(r,c))
    gs = c("kegg", "reactome", "all")
    for(g in gs){
        d = gsea %>% subset(gene_set==g)
        d1 = d%>%subset(fdr<0.01 & es>0 & group=="C>A/T dominant")%>%.$pathway
        d2 = d%>%subset(fdr<0.01 & es>0 & group=="DDR impaired")%>%.$pathway
        d3 = d%>%subset(fdr<0.01 & es>0 & group=="Mutagenic")%>%.$pathway
        if(g=="kegg"){
            venn(list("C>A/T dominant_kegg"=d1, "DDR impaired_kegg"=d2, "Mutagenic_kegg"=d3))
        }else if(g=="reactome"){
            venn(list("C>A/T dominant_reactome"=d1, "DDR impaired_reactome"=d2, "Mutagenic_reactome"=d3))
        }else if(g=="all"){
            venn(list("C>A/T dominant_all"=d1, "DDR impaired_all"=d2, "Mutagenic_all"=d3))
        }

    }
    par(mfrow=c(1,1))
}

pathways2hallmarks = function(gsea,
                              gene_sets_dir="/Volumes/mourikisa/data/geneSets/reactome_12_12_16/",
                              gene_sets_gene_start=4,
                              geneInfo_fn="/Volumes/mourikisa/data/geneInfoNCG5.Rdata",
                              hallmarks_fn="/Volumes/mourikisa/data/geneSets/Supplementary Table S6 - hallmark curation keywords.xlsx"){
    require(xlsx)
    getPathway = function(pathway="reactome", sub="symbol", genes_start=gene_sets_gene_start,
                          path_dir=gene_sets_dir,
                          gi_fn=geneInfo_fn){

        make_pathway_list = function(pathMsigDbFile) {
            inputFile <- pathMsigDbFile
            con  <- file(inputFile, open = "r")
            c = 1
            pathway.list <- vector(mode="list",length=0)
            message("Loading gene sets....")
            while (length(oneLine <- readLines(con, n = 1, warn = FALSE)) > 0) {
                myVector <- do.call("rbind",strsplit(oneLine, "\t"))
                t = vector(mode="list",length=1)
                t[[1]] = myVector[genes_start:length(myVector)]
                names(t) = myVector[1]
                pathway.list = c(pathway.list,t)
                c = c+1
            }

            close(con)
            return(pathway.list)
        }

        fns = list.files(path_dir)
        pattern=paste0(pathway, ".*", sub)
        fn = fns[grepl(pattern, fns)]
        if(length(fn)>1 | length(fn)==0){
            stop("No or multiple files found. Please check the parameters provided")
        }

        message(paste0("Pathway file: ", fn))
        p = make_pathway_list(paste0(path_dir, fn))
        message(paste0("Number of pathways: ", length(p)))
        if(sub=="entrez"){
            genes_p = as.numeric(unique(unlist(p)))
        }else if(sub=="symbol"){
            genes_p = unique(unlist(p))
        }
        message(paste0("Number of genes: ", length(genes_p)))
        ## Get gene info to check overlap with 19014
        load(gi_fn)
        geneInfo = geneInfo %>% subset(duplicability!=-1) %>% select(symbol, entrez)
        if(sub=="entrez"){
            message(paste0("Number of genes (19,014): ", length(genes_p[genes_p%in%geneInfo$entrez])), "\n")
        }else if(sub=="symbol"){
            message(paste0("Number of genes (19,014): ", length(genes_p[genes_p%in%geneInfo$symbol])), "\n")
        }
        return(p)

    }

    ## Add pathway hallmarks annotation
    halls = read.xlsx(hallmarks_fn, 1) %>%
        rename(keyword=Keyword.searched.in.pathway.name) %>%
        rename(keyword.gene=Keywork.searched.in.the.names.of.the.composing.genes) %>%
        mutate(keyword=gsub("^\\s+|\\s+$", "", keyword)) %>%
        mutate(keyword.gene=gsub("^\\s+|\\s+$", "", keyword.gene))

    searchPathName = halls %>% select(Hallmark, keyword) %>%
        mutate(keyword=strsplit(keyword, ",")) %>%
        unnest() %>% mutate(keyword=gsub("^\\s+|\\s+$", "", keyword))

    searchGenes = halls %>% select(Hallmark, keyword.gene) %>%
        mutate(keyword.gene=strsplit(keyword.gene, ",")) %>%
        unnest() %>% mutate(keyword.gene=gsub("^\\s+|\\s+$", "", keyword.gene))
    searchGenes = searchGenes %>% subset(!is.na(keyword.gene))

    pl = getPathway()

    hm_path_names = NULL
    message("Searching path names")
    for(hm in unique(searchPathName$Hallmark)){
      toMatch = searchPathName %>% subset(Hallmark==hm) %>% .$keyword
      matches = unique (grep(paste(toMatch,collapse="|"),
                              unique(names(pl)), value=TRUE))
      hms = data.frame(pathway=matches) %>% mutate(Hallmark=hm, evidence="name")
      hm_path_names = rbind(hm_path_names, hms)
    }
    message("Searching genes in each path")
    pl_df = ldply(pl, data.frame)
    colnames(pl_df) = c("pathway", "gene")
    hm_path_genes = pl_df %>% left_join(searchGenes, by=c("gene"="keyword.gene")) %>%
      subset(!is.na(Hallmark)) %>% select(pathway, Hallmark) %>% unique %>%
      mutate(evidence="gene")

    p2h = rbind(hm_path_names, hm_path_genes)
    p2h = p2h %>% group_by(pathway) %>% summarise(Hallmark=paste(unique(Hallmark), collapse=","),
                                                  evidence=paste(unique(evidence), collapse=","))

    gsea = gsea %>% left_join(p2h)
    annotation = gsea %>% select(sample, group) %>% unique
    annotation$group[is.na(annotation$group)] = "No group"

    kat_pathways = unique(names(pl[unlist(lapply(pl, function(x) "KAT2A"%in%x))]), names(pl[unlist(lapply(pl, function(x) "KAT2B"%in%x))]))
    kat_pathways = data.frame(pathway=kat_pathways, KAT_pathway=TRUE)
    gsea = gsea %>% left_join(kat_pathways)

    hm = "KAT2A/B related pathways"


    message(hm)
    toPlot = gsea %>% subset(p.value<0.05) %>% subset(KAT_pathway==TRUE) %>% mutate(p.value=ifelse(p.value==0, 1e-10, p.value)) %>% mutate(path_ratio=as.numeric(no.genes)/as.numeric(pathway_length))
    #toPlot = gsea %>% subset(p.value<0.05) %>% subset(grepl(hm, Hallmark)) %>% mutate(p.value=ifelse(p.value==0, 1e-10, p.value))
    toPlot = toPlot %>% select(pathway, sample, group, p.value, fdr) %>% mutate(value= -log10(as.numeric(p.value)))
    toPlot = toPlot %>% select(pathway, sample, path_ratio) %>% spread(pathway, path_ratio)
    toPlot = toPlot %>% left_join(annotation)
    rownames(toPlot) = toPlot$sample
    toPlot = toPlot[2:length(toPlot)]
    toPlot[is.na(toPlot)] = 0



        h1 = Heatmap(toPlot%>%select(-group),
                     column_title = hm,
                     #cluster_rows = FALSE,
                     #cluster_columns = FALSE,
                     row_names_gp = gpar(fontsize = 6),
                     column_names_gp = gpar(fontsize = 8),
                     heatmap_legend_param = list(title = "-log10(p.value)",
                                                 title_gp = gpar(fontsize = 12, fontface = "bold"),
                                                 color_bar = "continuous",
                                                 grid_height = unit(6, "mm"),
                                                 grid_width = unit(6, "mm"),
                                                 labels_gp = gpar(fontsize = 12)))

        h2 = Heatmap(toPlot[,"group"], name = "group",
                     heatmap_legend_param = list(title = "group",
                                                 title_gp = gpar(fontsize = 12, fontface = "bold"),
                                                 color_bar = "discrete",
                                                 grid_height = unit(6, "mm"),
                                                 grid_width = unit(6, "mm"),
                                                 labels_gp = gpar(fontsize = 12)),
                     width = unit(6, "mm"))


        draw(h1 + h2)

}
## Check sample coverage for genes in the enriched pathways (combined list for each group)
enrichedPathways2Samples = function(gsea, fdr.cutoff=0.01,
                                    path="reactome",
                                    sub_identifier="symbol",
                                    gene_sets_gene_start=4,
                                    gene_sets_dir="/Users/fc-8s-imac/athena/data/geneSets/reactome_12_12_16/",
                                    geneInfo_fn="/Users/fc-8s-imac/athena/data/geneInfoNCG5.Rdata",
                                    scores_fn="/Users/fc-8s-imac/athena/data/OAC/129_OAC/ML/OAC/syscans_4_with_gains_in_sys_cans.Rdata",
                                    cohort_fn="/Users/fc-8s-imac/athena/data/OAC/129_OAC/Rdata/mainCohort.Rdata",
                                    remove.amplifications=T,
                                    remove.cgc=T,
                                    path_length=F,
                                    es.positive=T,
                                    min_path_length=10,
                                    max_path_length=500){

    ## get patient data
    load(scores_fn)
    load(geneInfo_fn)
    ## Make sure you update the symbols to work with gene sets with symbols
    syscan = syscan %>% select(-symbol) %>% left_join(geneInfo%>%select(entrez, symbol))
    ## Exclude "CGC discarded"
    syscan = syscan %>% subset(gene_type!="CGC_discarded")

    ## ------ Add mutational signature information to the scores ---------
    load(cohort_fn)
    mainCohort = mainCohort %>% rename(sample=Primary_tumour_solid_tissue)
    syscan = syscan %>% left_join(mainCohort%>%select(sample, Group)%>%unique)
    ## Filters
    if(remove.cgc){
        syscan = syscan %>% subset(gene_type!="cgc")
    }
    if(remove.amplifications){
        syscan = syscan %>% subset(CNVGain!=1)
    }

    gsea = gsea[["gsea"]]

    getPathway = function(pathway=path, sub=sub_identifier, genes_start=gene_sets_gene_start,
                          path_dir=gene_sets_dir,
                          gi_fn=geneInfo_fn){

        make_pathway_list = function(pathMsigDbFile) {
            inputFile <- pathMsigDbFile
            con  <- file(inputFile, open = "r")
            c = 1
            pathway.list <- vector(mode="list",length=0)
            message("Loading gene sets....")
            while (length(oneLine <- readLines(con, n = 1, warn = FALSE)) > 0) {
                myVector <- do.call("rbind",strsplit(oneLine, "\t"))
                t = vector(mode="list",length=1)
                t[[1]] = myVector[genes_start:length(myVector)]
                names(t) = myVector[1]
                pathway.list = c(pathway.list,t)
                c = c+1
            }

            close(con)
            return(pathway.list)
        }

        fns = list.files(path_dir)
        pattern=paste0(pathway, ".*", sub)
        fn = fns[grepl(pattern, fns)]
        if(length(fn)>1 | length(fn)==0){
            stop("No or multiple files found. Please check the parameters provided")
        }

        message(paste0("Pathway file: ", fn))
        p = make_pathway_list(paste0(path_dir, fn))
        message(paste0("Number of pathways: ", length(p)))
        if(sub=="entrez"){
            genes_p = as.numeric(unique(unlist(p)))
        }else if(sub=="symbol"){
            genes_p = unique(unlist(p))
        }
        message(paste0("Number of genes: ", length(genes_p)))
        ## Get gene info to check overlap with 19014
        load(gi_fn)
        geneInfo = geneInfo %>% subset(duplicability!=-1) %>% select(symbol, entrez)
        if(sub=="entrez"){
            message(paste0("Number of genes (19,014): ", length(genes_p[genes_p%in%geneInfo$entrez])), "\n")
        }else if(sub=="symbol"){
            message(paste0("Number of genes (19,014): ", length(genes_p[genes_p%in%geneInfo$symbol])), "\n")
        }
        return(p)

    }

    ## x is a vector of genes
    ## y is a list of pathways
    ## identifier can be entrez/symbol
    updatePathway = function(x, y, identifier=sub_identifier, min_len=min_path_length, max_len=max_path_length, len=path_length,
                             exclude_pattern="CANCER|LEUKEMIA|MELANOMA|GLIOMA",
                             gi_fn=geneInfo_fn){

        ## This function removes all genes that are not in the gene list of interest
        ## from the pathways

        if (identifier=="entrez"){
            y = lapply(y, as.numeric)
        }else if (identifier=="symbol"){
            y = lapply(y, as.character)
        }

        ## Update for the list of genes that are in our gene set
        y = lapply(y, function(g) g[g%in%x])
        ## Remove empty sets
        y = y[unlist(lapply(y, function(x) length(x)>0))]

        if(len==TRUE){
            y = y[unlist(lapply(y, function(x) length(x)>min_len))]
            y = y[unlist(lapply(y, function(x) length(x)<max_len))]
        }
        y = y[names(y)[!grepl(exclude_pattern, names(y))]]

        message(paste0("Number of pathways: ", length(y)))
        if(identifier=="entrez"){
            genes_p = as.numeric(unique(unlist(y)))
        }else if(identifier=="symbol"){
            genes_p = unique(unlist(y))
        }
        message(paste0("Number of genes: ", length(genes_p)))
        ## Get gene info to check overlap with 19014
        load(gi_fn)
        geneInfo = geneInfo %>% subset(duplicability!=-1) %>% select(symbol, entrez)
        if(identifier=="entrez"){
            message(paste0("Number of genes (19,014): ", length(genes_p[genes_p%in%geneInfo$entrez])))
        }else if(identifier=="symbol"){
            message(paste0("Number of genes (19,014): ", length(genes_p[genes_p%in%geneInfo$symbol])))
        }


        return(list(gene_sets=y, genes_in_p=length(genes_p)))
    }

    reactome = getPathway()

    ## Select enriched pathways based on the fdr
    genes_in_enriched_pathways = NULL
    if(es.positive==F){
        for(g in unique(gsea$group)){
            genes = syscan %>% subset(Group==g) %>% subset(gene_type!="CGC_discarded") %>%
                subset(type=="P") %>% select(entrez, symbol, score) %>% group_by(entrez, symbol) %>% summarise(score=mean(score)) %>% ungroup()
            addition = syscan %>% subset(Group==g) %>% subset(gene_type!="CGC_discarded") %>%
                subset(type=="C") %>% select(entrez, symbol) %>% unique %>% mutate(score=max(genes$score)+1)
            genes = rbind(genes, addition)
            d = gsea %>% subset(group==g & fdr<fdr.cutoff)
            gs = updatePathway(genes$symbol, reactome)
            gs = gs[["gene_sets"]]
            gs = gs[names(gs)[names(gs)%in%d$pathway]]

            ## Unlist all pathways and take unique genes
            gs = unique(unlist(gs))

            dd = syscan %>% subset(symbol%in% gs & Group==g) %>% count(sample) %>% mutate(group=g, genes_in_enriched_pathways=length(gs))
            genes_in_enriched_pathways = rbind(genes_in_enriched_pathways, dd)
        }
    }else{
        for(g in unique(gsea$group)){
            genes = syscan %>% subset(Group==g) %>% subset(gene_type!="CGC_discarded") %>%
                subset(type=="P") %>% select(entrez, symbol, score) %>% group_by(entrez, symbol) %>% summarise(score=mean(score)) %>% ungroup()
            addition = syscan %>% subset(Group==g) %>% subset(gene_type!="CGC_discarded") %>%
                subset(type=="C") %>% select(entrez, symbol) %>% unique %>% mutate(score=max(genes$score)+1)
            genes = rbind(genes, addition)
            d = gsea %>% subset(group==g & fdr<fdr.cutoff & es>0)
            gs = updatePathway(genes$symbol, reactome)
            gs = gs[["gene_sets"]]
            gs = gs[names(gs)[names(gs)%in%d$pathway]]

            ## Unlist all pathways and take unique genes
            gs = unique(unlist(gs))
            cat(g, "\n")
            cat(paste0("Enriched pathwas: ", nrow(d)), "\n")
            cat(paste0("Number of genes in enriched pathways: ", length(gs)), "\n")
            dd = syscan %>% subset(symbol%in% gs & Group==g) %>% count(sample) %>% mutate(group=g, genes_in_enriched_pathways=length(gs))
            genes_in_enriched_pathways = rbind(genes_in_enriched_pathways, dd)
        }
    }


    p = ggplot(genes_in_enriched_pathways, aes(x=group, y=n)) + geom_boxplot() +
        scale_y_continuous(limits = c(0, max(genes_in_enriched_pathways$n)), breaks = seq(0, max(genes_in_enriched_pathways$n), 20)) +
        ggtitle("Genes in enriched pathways per sample \n (one list per group approach)") +
        ylab("Genes (#)") + xlab("") +
        theme_boss() +
        theme(
            axis.line.x = element_line(colour = "black"),
            axis.line.y = element_line(colour = "black")
        )
    pdf("/Users/fc-8s-imac/athena/data/OAC/129_OAC/Tables_and_Plots/genes_in_enriched_pathways.pdf")
    print(p)
    dev.off()

}


checkRunning = function(dir="/Volumes/mourikisa/novel_driver_prediction/pancancer_model_training/automation/median_mode_imputation/"){
    cancer_types <- c("OAC", "BRCA", "COAD", "ESCA", "GBM",
                      "LAML", "LGG", "PAAD", "PCPG")
    for (ct in cancer_types){
        message(ct)
        res_dir = paste0(dir, "/", ct, "/Results/")
        kernels = c("linear", "polynomial", "radial", "sigmoid")
        for (k in kernels){
            fns = list.files(res_dir)
            fns = fns[grepl(k, fns)]
            message(paste0(k, ": ", length(fns)))
        }
    }
    cat("\n\n")
}

check_imputation = function(imputation_dir="/Volumes/mourikisa/data/"){
    require(mice)
    ## When I parallelize the imputation I store multiple objects objects
    ## I need to load one by one and gather the imputed datasets
    pars = 10 ## this is the number of objects run in parallel
    imp_datasets = 10 ## how many imputed datasets were produced by each process
    ## The imputed datasets have no entrez, you need to pull entrez from the geneProperties file
    ## Load geneProperties
    load("/Volumes/mourikisa/data/geneProperties_final.Rdata")
    entrez = geneProperties %>% select(Entrez)
    imputations = NULL ## gather all the imputed datasets and the initial one in a data frame
    count = 0
    for (i in 1:pars){
        cat(i, "\n")
        load(paste0(imputation_dir, "/geneProperties_final_miceImputed_object_", i, ".Rdata")) ## geneProperties_miceImputed
        if(i==1){ ## First time
            for (y in 0:imp_datasets){
                imputations = rbind(imputations, cbind(entrez, complete(geneProperties_miceImputed, y)) %>% data.frame() %>% mutate(iteration=count))
                count = count + 1
            }

        }else{
            for (y in 1:imp_datasets){
                imputations = rbind(imputations, cbind(entrez, complete(geneProperties_miceImputed, y)) %>% data.frame() %>% mutate(iteration=count))
                count = count + 1
            }
        }
    }
    rm(geneProperties_miceImputed)



    ## ---------------------------
    ## Check imputed features here
    ## ---------------------------
    ## Plot the distributions of each feature to make sure that the usage of the mean of the imputed values makes sense
    ## test properties to see what
    test = geneProperties%>%select(Entrez, age) %>% gather(feature, value, -Entrez) %>% subset(is.na(value)) %>% mutate(missing=T)
    for (i in 1:100){
        cat(i, "\n")
        d = imputations %>% subset(iteration==i) %>% select(Entrez, age) %>% gather(feature, value, -Entrez) %>% subset(Entrez%in%test$Entrez[test$missing==T]) %>% mutate(missing=F)
        test = rbind(test, d)
    }
    test = test %>% subset(!is.na(value)) %>% left_join(geneProperties%>%select(Entrez, cancer_type)) %>% count(cancer_type, value) %>% mutate(total=test%>%subset(!is.na(value))%>%nrow) %>% mutate(frac=n/total)
    ggplot(test, aes(x=value, y=frac, fill=cancer_type)) + geom_bar(stat="identity", position="dodge")


    ## I need to take the mean of the imputed values
    ## To do this I can utilize gather and spread functions
    ## I need to do that feature-by-feature because the type of variables are different
    ## The idea is i gather on each feature get the mean of the imputated values, then I spread and store all the data frames in a list
    ## Ten I merge all the data frames in the list to make a single data frame
    df_list = list()
    for (f in colnames(imputations)[2:(length(imputations)-1)]){
        cat(f, "\t")
        cat(class(imputations[,f]), "\n")
        ## I gather all the imputations without number 0 cause this is the missing data!
        if(class(imputations[,f])=="factor"){
            d = imputations %>% subset(iteration!=0) %>% select_("Entrez", f) %>% gather(feature, value, -Entrez)
            ## here we will take the majority
            d = d %>% group_by(Entrez, feature) %>% group_by(Entrez, feature) %>% summarise(value=names(which.max(table(value)))) %>% ungroup
        }else if(class(imputations[,f])%in%c("numeric", "integer")){
            d = imputations %>% subset(iteration!=0) %>% select_("Entrez", f) %>% gather(feature, value, -Entrez)
            ## here we will take the mean
            if (f=="betweenness"){ ## I dont round in betweenness
                d = d %>% group_by(Entrez, feature) %>% summarise(value=round(mean(value), digits=3)) %>% ungroup
            }else{
                d = d %>% group_by(Entrez, feature) %>% summarise(value=round(mean(value))) %>% ungroup
            }
        }
        d = d %>% spread(feature, value)
        df_list[[f]] = d
    }
    ## and now merge the tables into one
    merged = NULL
    for (i in 1:length(df_list)){
        cat(i, "\n")
        if(i==1){
            merged = df_list[[i]]
        }else{
            merged = merged %>% left_join(df_list[[i]])
        }
    }
    merged = merged %>% data.frame()
    ## sort it similarly to geneProperties
    merged = merged[order(match(merged[,"Entrez"],geneProperties[,"Entrez"])),]
    rownames(merged) <- seq(length=nrow(merged))
    save(geneProperties_miceImputed, file="/Volumes/mourikisa/data/geneProperties_final_miceImputed.Rdata")

    ## Load also gene info to get  cgc and rst annotation
    load("/Volumes/mourikisa/data/geneInfoNCG5.Rdata")
    geneInfo = geneInfo %>% subset(duplicability!=-1) %>% select(entrez, symbol, cancer_type, cancer_dom, cancer_rec)

    ## And the actual geneProperties cause we need the Entrez
    load("/Volumes/mourikisa/data/geneProperties_final.Rdata")
    geneProperties = geneProperties %>% left_join(geneInfo, by=c("Entrez"="entrez"))
    merged = merged %>% left_join(geneInfo, by=c("Entrez"="entrez"))

    ## We check the stats once for the geneProperties with the missing values and once for the imputed data
    overall_stats = NULL

    ## -----------------------------------
    ## For the raw data with mising values
    ## -----------------------------------
    ## Checking Length
    test = geneProperties %>% subset(!is.na(Length.fullrefseq) & cancer_type!="can") %>% select(Length.fullrefseq, cancer_type)
    cg = test %>% subset(cancer_type=="cgc") %>% .$Length.fullrefseq %>% median
    rst = test %>% subset(cancer_type=="rst") %>% .$Length.fullrefseq %>% median
    wp = wilcox.test(test%>%subset(cancer_type=="cgc")%>%.$Length.fullrefseq, test%>%subset(cancer_type=="rst")%>%.$Length.fullrefseq)$p.value
    tb = data.frame(data="Raw", description="Gene length (CGC vs Rest)", cg=cg, rst=rst, test="Wilcoxon", p_value=wp, Note="cg and rst are the median numbers of RefSeq length (aa) for CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Check Duplicability
    test = geneProperties %>% subset(!is.na(duplicated) & cancer_type!="can" & (cancer_rec==1 | cancer_type=="rst")) %>% select(duplicated, cancer_type) %>% mutate(cancer_type=ifelse(cancer_type=="cgc", "rec", cancer_type))
    perc.cg = 100*(test %>% subset(duplicated==0 & cancer_type=="rec") %>% nrow)/(test %>% subset(cancer_type=="rec") %>% nrow)
    perc.rst = 100*(test %>% subset(duplicated==0 & cancer_type=="rst") %>% nrow)/(test %>% subset(cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"duplicated"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Raw", description="Gene duplicability (CGC-recessive vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of singleton genes in CGC-recessive and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Checking WGD
    test = geneProperties %>% subset(!is.na(WGD) & cancer_type!="can") %>% select(WGD, cancer_type)
    perc.cg = 100*(test %>% subset(WGD==1 & cancer_type=="cgc") %>% nrow)/(test %>% subset(!is.na(WGD) & cancer_type=="cgc") %>% nrow)
    perc.rst = 100*(test %>% subset(WGD==1 & cancer_type=="rst") %>% nrow)/(test %>% subset(!is.na(WGD) & cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"WGD"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Raw", description="Ohnolog (CGC vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of genes underwent WGD in CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Number of domains between CGC and rest
    test = geneProperties %>% subset(!is.na(alldomains) & cancer_type!="can") %>% select(alldomains, cancer_type)
    cg = test %>% subset(cancer_type=="cgc") %>% .$alldomains %>% median
    rst = test %>% subset(cancer_type=="rst") %>% .$alldomains %>% median
    wp = wilcox.test(test%>%subset(cancer_type=="cgc")%>%.$alldomains, test%>%subset(cancer_type=="rst")%>%.$alldomains)$p.value
    tb = data.frame(data="Raw", description="Number of protein domains (CGC vs Rest)", cg=cg, rst=rst, test="Wilcoxon", p_value=wp, Note="cg and rst are the median numbers of alldomains for CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Check hubs
    test = geneProperties %>% subset(!is.na(hub) & cancer_type!="can") %>% select(hub, cancer_type)
    perc.cg = 100*(test %>% subset(hub==1 & cancer_type=="cgc") %>% nrow)/(test %>% subset(cancer_type=="cgc") %>% nrow)
    perc.rst = 100*(test %>% subset(hub==1 & cancer_type=="rst") %>% nrow)/(test %>% subset(cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"hub"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Raw", description="Hub (CGC vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of hubs in CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Check Central
    test = geneProperties %>% subset(!is.na(central) & cancer_type!="can") %>% select(central, cancer_type)
    perc.cg = 100*(test %>% subset(central==1 & cancer_type=="cgc") %>% nrow)/(test %>% subset(cancer_type=="cgc") %>% nrow)
    perc.rst = 100*(test %>% subset(central==1 & cancer_type=="rst") %>% nrow)/(test %>% subset(cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"central"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Raw", description="Central (CGC vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of central proteins in CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Recessive cancer genes are of ancient evolutionary origin
    test = geneProperties %>% subset(!is.na(age) & cancer_type!="can" & (cancer_rec==1 | cancer_type=="rst")) %>% select(age, cancer_type) %>% mutate(cancer_type=ifelse(cancer_type=="cgc", "rec", cancer_type))
    perc.cg = 100*(test %>% subset(age=="old" & cancer_type=="rec") %>% nrow)/(test %>% subset(cancer_type=="rec") %>% nrow)
    perc.rst = 100*(test %>% subset(age=="old" & cancer_type=="rst") %>% nrow)/(test %>% subset(cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"age"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Raw", description="Age (CGC-recessive vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of old genes in CGC-recessive and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Dominant cancer genes versus age
    test = geneProperties %>% subset(!is.na(age) & cancer_type!="can" & (cancer_dom==1 | cancer_type=="rst")) %>% select(age, cancer_type) %>% mutate(cancer_type=ifelse(cancer_type=="cgc", "dom", cancer_type))
    perc.cg = 100*(test %>% subset(age=="old" & cancer_type=="dom") %>% nrow)/(test %>% subset(cancer_type=="dom") %>% nrow)
    perc.rst = 100*(test %>% subset(age=="old" & cancer_type=="rst") %>% nrow)/(test %>% subset(cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"age"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Raw", description="Age (CGC-dominant vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of old genes in CGC-dominant and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## All cancer genes versus age
    test = geneProperties %>% subset(!is.na(age)) %>% select(age, cancer_type) %>% mutate(cancer_type=ifelse(cancer_type=="cgc" | cancer_type=="can", "cancer_gene", cancer_type))
    perc.cg = 100*(test %>% subset(age=="old" & cancer_type=="cancer_gene") %>% nrow)/(test %>% subset(cancer_type=="cancer_gene") %>% nrow)
    perc.rst = 100*(test %>% subset(age=="old" & cancer_type=="rst") %>% nrow)/(test %>% subset(cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"age"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Raw", description="Age (Cancer genes vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of old genes in Cancer genes and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Expression beadth
    test = geneProperties %>% subset(!is.na(tot.tissues) & cancer_type!="can") %>% select(tot.tissues, cancer_type)
    cg = test %>% subset(cancer_type=="cgc") %>% .$tot.tissues %>% median
    rst = test %>% subset(cancer_type=="rst") %>% .$tot.tissues %>% median
    wp = wilcox.test(test%>%subset(cancer_type=="cgc")%>%.$tot.tissues, test%>%subset(cancer_type=="rst")%>%.$tot.tissues)$p.value
    tb = data.frame(data="Raw", description="Number of tissues expressed (CGC vs Rest)", cg=cg, rst=rst, test="Wilcoxon", p_value=wp, Note="cg and rst are the median numbers of tissues in which the genes are expressed for CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Checking HiC
    test = geneProperties %>% subset(!is.na(hic) & cancer_type!="can") %>% select(hic, cancer_type)
    cg = test %>% subset(cancer_type=="cgc") %>% .$hic %>% median
    rst = test %>% subset(cancer_type=="rst") %>% .$hic %>% median
    wp = wilcox.test(test%>%subset(cancer_type=="cgc")%>%.$hic, test%>%subset(cancer_type=="rst")%>%.$hic)$p.value
    tb = data.frame(data="Raw", description="Chromatin state (CGC vs Rest)", cg=cg, rst=rst, test="Wilcoxon", p_value=wp, Note="cg and rst are the median numbers of HiC for CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Checking miRNA
    test = geneProperties %>% subset(!is.na(mirna) & cancer_type!="can") %>% select(mirna, cancer_type)
    cg = test %>% subset(cancer_type=="cgc") %>% .$mirna %>% median
    rst = test %>% subset(cancer_type=="rst") %>% .$mirna %>% median
    wp = wilcox.test(test%>%subset(cancer_type=="cgc")%>%.$mirna, test%>%subset(cancer_type=="rst")%>%.$mirna)$p.value
    tb = data.frame(data="Raw", description="miRNA target (CGC vs Rest)", cg=cg, rst=rst, test="Wilcoxon", p_value=wp, Note="cg and rst are the median numbers of miRNA for CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)


    ## ------------------------------------------------------
    ## For the median/mode imputed data without mising values
    ## ------------------------------------------------------
    ## Checking Length
    test = geneProperties_mmImputed %>% subset(!is.na(Length.fullrefseq) & cancer_type!="can") %>% select(Length.fullrefseq, cancer_type)
    cg = test %>% subset(cancer_type=="cgc") %>% .$Length.fullrefseq %>% median
    rst = test %>% subset(cancer_type=="rst") %>% .$Length.fullrefseq %>% median
    wp = wilcox.test(test%>%subset(cancer_type=="cgc")%>%.$Length.fullrefseq, test%>%subset(cancer_type=="rst")%>%.$Length.fullrefseq)$p.value
    tb = data.frame(data="Imputed_MM", description="Gene length (CGC vs Rest)", cg=cg, rst=rst, test="Wilcoxon", p_value=wp, Note="cg and rst are the median numbers of RefSeq length (aa) for CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Check Duplicability
    test = geneProperties_mmImputed %>% subset(!is.na(duplicated) & cancer_type!="can" & (cancer_rec==1 | cancer_type=="rst")) %>% select(duplicated, cancer_type) %>% mutate(cancer_type=ifelse(cancer_type=="cgc", "rec", cancer_type))
    perc.cg = 100*(test %>% subset(duplicated==0 & cancer_type=="rec") %>% nrow)/(test %>% subset(cancer_type=="rec") %>% nrow)
    perc.rst = 100*(test %>% subset(duplicated==0 & cancer_type=="rst") %>% nrow)/(test %>% subset(cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"duplicated"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Imputed_MM", description="Gene duplicability (CGC-recessive vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of singleton genes in CGC-recessive and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Checking WGD
    test = geneProperties_mmImputed %>% subset(!is.na(WGD) & cancer_type!="can") %>% select(WGD, cancer_type)
    perc.cg = 100*(test %>% subset(WGD==1 & cancer_type=="cgc") %>% nrow)/(test %>% subset(!is.na(WGD) & cancer_type=="cgc") %>% nrow)
    perc.rst = 100*(test %>% subset(WGD==1 & cancer_type=="rst") %>% nrow)/(test %>% subset(!is.na(WGD) & cancer_type=="rst") %>% nrow)
     cp = chisq.test(table(test[,"WGD"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Imputed_MM", description="Ohnolog (CGC vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of genes underwent WGD in CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Number of domains between CGC and rest
    test = geneProperties_mmImputed %>% subset(!is.na(alldomains) & cancer_type!="can") %>% select(alldomains, cancer_type)
    cg = test %>% subset(cancer_type=="cgc") %>% .$alldomains %>% median
    rst = test %>% subset(cancer_type=="rst") %>% .$alldomains %>% median
    wp = wilcox.test(test%>%subset(cancer_type=="cgc")%>%.$alldomains, test%>%subset(cancer_type=="rst")%>%.$alldomains)$p.value
    tb = data.frame(data="Imputed_MM", description="Number of protein domains (CGC vs Rest)", cg=cg, rst=rst, test="Wilcoxon", p_value=wp, Note="cg and rst are the median numbers of alldomains for CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Check hubs
    test = geneProperties_mmImputed %>% subset(!is.na(hub) & cancer_type!="can") %>% select(hub, cancer_type)
    perc.cg = 100*(test %>% subset(hub==1 & cancer_type=="cgc") %>% nrow)/(test %>% subset(cancer_type=="cgc") %>% nrow)
    perc.rst = 100*(test %>% subset(hub==1 & cancer_type=="rst") %>% nrow)/(test %>% subset(cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"hub"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Imputed_MM", description="Hub (CGC vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of hubs in CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Check Central
    test = geneProperties_mmImputed %>% subset(!is.na(central) & cancer_type!="can") %>% select(central, cancer_type)
    perc.cg = 100*(test %>% subset(central==1 & cancer_type=="cgc") %>% nrow)/(test %>% subset(cancer_type=="cgc") %>% nrow)
    perc.rst = 100*(test %>% subset(central==1 & cancer_type=="rst") %>% nrow)/(test %>% subset(cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"central"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Imputed_MM", description="Central (CGC vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of central proteins in CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Recessive cancer genes are of ancient evolutionary origin
    test = geneProperties_mmImputed %>% subset(!is.na(age) & cancer_type!="can" & (cancer_rec==1 | cancer_type=="rst")) %>% select(age, cancer_type) %>% mutate(cancer_type=ifelse(cancer_type=="cgc", "rec", cancer_type))
    perc.cg = 100*(test %>% subset(age=="old" & cancer_type=="rec") %>% nrow)/(test %>% subset(cancer_type=="rec") %>% nrow)
    perc.rst = 100*(test %>% subset(age=="old" & cancer_type=="rst") %>% nrow)/(test %>% subset(cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"age"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Imputed_MM", description="Age (CGC-recessive vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of old genes in CGC-recessive and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Dominant cancer genes versus age
    test = geneProperties_mmImputed %>% subset(!is.na(age) & cancer_type!="can" & (cancer_dom==1 | cancer_type=="rst")) %>% select(age, cancer_type) %>% mutate(cancer_type=ifelse(cancer_type=="cgc", "dom", cancer_type))
    perc.cg = 100*(test %>% subset(age=="old" & cancer_type=="dom") %>% nrow)/(test %>% subset(cancer_type=="dom") %>% nrow)
    perc.rst = 100*(test %>% subset(age=="old" & cancer_type=="rst") %>% nrow)/(test %>% subset(cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"age"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Imputed_MM", description="Age (CGC-dominant vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of old genes in CGC-dominant and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## All cancer genes versus age
    test = geneProperties_mmImputed %>% subset(!is.na(age)) %>% select(age, cancer_type) %>% mutate(cancer_type=ifelse(cancer_type=="cgc" | cancer_type=="can", "cancer_gene", cancer_type))
    perc.cg = 100*(test %>% subset(age=="old" & cancer_type=="cancer_gene") %>% nrow)/(test %>% subset(cancer_type=="cancer_gene") %>% nrow)
    perc.rst = 100*(test %>% subset(age=="old" & cancer_type=="rst") %>% nrow)/(test %>% subset(cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"age"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Imputed_MM", description="Age (Cancer genes vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of old genes in Cancer genes and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Expression beadth
    test = geneProperties_mmImputed %>% subset(!is.na(tot.tissues) & cancer_type!="can") %>% select(tot.tissues, cancer_type)
    cg = test %>% subset(cancer_type=="cgc") %>% .$tot.tissues %>% median
    rst = test %>% subset(cancer_type=="rst") %>% .$tot.tissues %>% median
    wp = wilcox.test(test%>%subset(cancer_type=="cgc")%>%.$tot.tissues, test%>%subset(cancer_type=="rst")%>%.$tot.tissues)$p.value
    tb = data.frame(data="Imputed_MM", description="Number of tissues expressed (CGC vs Rest)", cg=cg, rst=rst, test="Wilcoxon", p_value=wp, Note="cg and rst are the median numbers of tissues in which the genes are expressed for CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Checking HiC
    test = geneProperties_mmImputed %>% subset(!is.na(hic) & cancer_type!="can") %>% select(hic, cancer_type)
    cg = test %>% subset(cancer_type=="cgc") %>% .$hic %>% median
    rst = test %>% subset(cancer_type=="rst") %>% .$hic %>% median
    wp = wilcox.test(test%>%subset(cancer_type=="cgc")%>%.$hic, test%>%subset(cancer_type=="rst")%>%.$hic)$p.value
    tb = data.frame(data="Imputed_MM", description="Chromatin state (CGC vs Rest)", cg=cg, rst=rst, test="Wilcoxon", p_value=wp, Note="cg and rst are the median numbers of HiC for CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Checking miRNA
    test = geneProperties_mmImputed %>% subset(!is.na(mirna) & cancer_type!="can") %>% select(mirna, cancer_type)
    cg = test %>% subset(cancer_type=="cgc") %>% .$mirna %>% median
    rst = test %>% subset(cancer_type=="rst") %>% .$mirna %>% median
    wp = wilcox.test(test%>%subset(cancer_type=="cgc")%>%.$mirna, test%>%subset(cancer_type=="rst")%>%.$mirna)$p.value
    tb = data.frame(data="Imputed_MM", description="miRNA target (CGC vs Rest)", cg=cg, rst=rst, test="Wilcoxon", p_value=wp, Note="cg and rst are the median numbers of miRNA for CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)


    ## ------------------------------------------------
    ## For the mice-imputed data without mising values
    ## ------------------------------------------------
    ## Checking Length
    test = geneProperties_miceImputed %>% subset(!is.na(Length.fullrefseq) & cancer_type!="can") %>% select(Length.fullrefseq, cancer_type)
    cg = test %>% subset(cancer_type=="cgc") %>% .$Length.fullrefseq %>% median
    rst = test %>% subset(cancer_type=="rst") %>% .$Length.fullrefseq %>% median
    wp = wilcox.test(test%>%subset(cancer_type=="cgc")%>%.$Length.fullrefseq, test%>%subset(cancer_type=="rst")%>%.$Length.fullrefseq)$p.value
    tb = data.frame(data="Imputed_MICE", description="Gene length (CGC vs Rest)", cg=cg, rst=rst, test="Wilcoxon", p_value=wp, Note="cg and rst are the median numbers of RefSeq length (aa) for CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Check Duplicability
    test = geneProperties_miceImputed %>% subset(!is.na(duplicated) & cancer_type!="can" & (cancer_rec==1 | cancer_type=="rst")) %>% select(duplicated, cancer_type) %>% mutate(cancer_type=ifelse(cancer_type=="cgc", "rec", cancer_type))
    perc.cg = 100*(test %>% subset(duplicated==0 & cancer_type=="rec") %>% nrow)/(test %>% subset(cancer_type=="rec") %>% nrow)
    perc.rst = 100*(test %>% subset(duplicated==0 & cancer_type=="rst") %>% nrow)/(test %>% subset(cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"duplicated"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Imputed_MICE", description="Gene duplicability (CGC-recessive vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of singleton genes in CGC-recessive and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Checking WGD
    test = geneProperties_miceImputed %>% subset(!is.na(WGD) & cancer_type!="can") %>% select(WGD, cancer_type)
    perc.cg = 100*(test %>% subset(WGD==1 & cancer_type=="cgc") %>% nrow)/(test %>% subset(!is.na(WGD) & cancer_type=="cgc") %>% nrow)
    perc.rst = 100*(test %>% subset(WGD==1 & cancer_type=="rst") %>% nrow)/(test %>% subset(!is.na(WGD) & cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"WGD"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Imputed_MICE", description="Ohnolog (CGC vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of genes underwent WGD in CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Number of domains between CGC and rest
    test = geneProperties_miceImputed %>% subset(!is.na(alldomains) & cancer_type!="can") %>% select(alldomains, cancer_type)
    cg = test %>% subset(cancer_type=="cgc") %>% .$alldomains %>% median
    rst = test %>% subset(cancer_type=="rst") %>% .$alldomains %>% median
    wp = wilcox.test(test%>%subset(cancer_type=="cgc")%>%.$alldomains, test%>%subset(cancer_type=="rst")%>%.$alldomains)$p.value
    tb = data.frame(data="Imputed_MICE", description="Number of protein domains (CGC vs Rest)", cg=cg, rst=rst, test="Wilcoxon", p_value=wp, Note="cg and rst are the median numbers of alldomains for CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Check hubs
    test = geneProperties_miceImputed %>% subset(!is.na(hub) & cancer_type!="can") %>% select(hub, cancer_type)
    perc.cg = 100*(test %>% subset(hub==1 & cancer_type=="cgc") %>% nrow)/(test %>% subset(cancer_type=="cgc") %>% nrow)
    perc.rst = 100*(test %>% subset(hub==1 & cancer_type=="rst") %>% nrow)/(test %>% subset(cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"hub"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Imputed_MICE", description="Hub (CGC vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of hubs in CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Check Central
    test = geneProperties_miceImputed %>% subset(!is.na(central) & cancer_type!="can") %>% select(central, cancer_type)
    perc.cg = 100*(test %>% subset(central==1 & cancer_type=="cgc") %>% nrow)/(test %>% subset(cancer_type=="cgc") %>% nrow)
    perc.rst = 100*(test %>% subset(central==1 & cancer_type=="rst") %>% nrow)/(test %>% subset(cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"central"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Imputed_MICE", description="Central (CGC vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of central proteins in CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Recessive cancer genes are of ancient evolutionary origin
    test = geneProperties_miceImputed %>% subset(!is.na(age) & cancer_type!="can" & (cancer_rec==1 | cancer_type=="rst")) %>% select(age, cancer_type) %>% mutate(cancer_type=ifelse(cancer_type=="cgc", "rec", cancer_type))
    perc.cg = 100*(test %>% subset(age=="old" & cancer_type=="rec") %>% nrow)/(test %>% subset(cancer_type=="rec") %>% nrow)
    perc.rst = 100*(test %>% subset(age=="old" & cancer_type=="rst") %>% nrow)/(test %>% subset(cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"age"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Imputed_MICE", description="Age (CGC-recessive vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of old genes in CGC-recessive and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Dominant cancer genes versus age
    test = geneProperties_miceImputed %>% subset(!is.na(age) & cancer_type!="can" & (cancer_dom==1 | cancer_type=="rst")) %>% select(age, cancer_type) %>% mutate(cancer_type=ifelse(cancer_type=="cgc", "dom", cancer_type))
    perc.cg = 100*(test %>% subset(age=="old" & cancer_type=="dom") %>% nrow)/(test %>% subset(cancer_type=="dom") %>% nrow)
    perc.rst = 100*(test %>% subset(age=="old" & cancer_type=="rst") %>% nrow)/(test %>% subset(cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"age"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Imputed_MICE", description="Age (CGC-dominant vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of old genes in CGC-dominant and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## All cancer genes versus age
    test = geneProperties_miceImputed %>% subset(!is.na(age)) %>% select(age, cancer_type) %>% mutate(cancer_type=ifelse(cancer_type=="cgc" | cancer_type=="can", "cancer_gene", cancer_type))
    perc.cg = 100*(test %>% subset(age=="old" & cancer_type=="cancer_gene") %>% nrow)/(test %>% subset(cancer_type=="cancer_gene") %>% nrow)
    perc.rst = 100*(test %>% subset(age=="old" & cancer_type=="rst") %>% nrow)/(test %>% subset(cancer_type=="rst") %>% nrow)
    cp = chisq.test(table(test[,"age"], test[,"cancer_type"]))$p.value
    tb = data.frame(data="Imputed_MICE", description="Age (Cancer genes vs Rest)", cg=perc.cg, rst=perc.rst, test="chi-square test", p_value=cp, Note="cg and rst are percentages of old genes in Cancer genes and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Expression beadth
    test = geneProperties_miceImputed %>% subset(!is.na(tot.tissues) & cancer_type!="can") %>% select(tot.tissues, cancer_type)
    cg = test %>% subset(cancer_type=="cgc") %>% .$tot.tissues %>% median
    rst = test %>% subset(cancer_type=="rst") %>% .$tot.tissues %>% median
    wp = wilcox.test(test%>%subset(cancer_type=="cgc")%>%.$tot.tissues, test%>%subset(cancer_type=="rst")%>%.$tot.tissues)$p.value
    tb = data.frame(data="Imputed_MICE", description="Number of tissues expressed (CGC vs Rest)", cg=cg, rst=rst, test="Wilcoxon", p_value=wp, Note="cg and rst are the median numbers of tissues in which the genes are expressed for CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Checking HiC
    test = geneProperties_miceImputed %>% subset(!is.na(hic) & cancer_type!="can") %>% select(hic, cancer_type)
    cg = test %>% subset(cancer_type=="cgc") %>% .$hic %>% median
    rst = test %>% subset(cancer_type=="rst") %>% .$hic %>% median
    wp = wilcox.test(test%>%subset(cancer_type=="cgc")%>%.$hic, test%>%subset(cancer_type=="rst")%>%.$hic)$p.value
    tb = data.frame(data="Imputed_MICE", description="Chromatin state (CGC vs Rest)", cg=cg, rst=rst, test="Wilcoxon", p_value=wp, Note="cg and rst are the median numbers of HiC for CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    ## Checking miRNA
    test = geneProperties_miceImputed %>% subset(!is.na(mirna) & cancer_type!="can") %>% select(mirna, cancer_type)
    cg = test %>% subset(cancer_type=="cgc") %>% .$mirna %>% median
    rst = test %>% subset(cancer_type=="rst") %>% .$mirna %>% median
    wp = wilcox.test(test%>%subset(cancer_type=="cgc")%>%.$mirna, test%>%subset(cancer_type=="rst")%>%.$mirna)$p.value
    tb = data.frame(data="Imputed_MICE", description="miRNA target (CGC vs Rest)", cg=cg, rst=rst, test="Wilcoxon", p_value=wp, Note="cg and rst are the median numbers of miRNA for CGC and Rest respectively")
    overall_stats = rbind(overall_stats, tb)

    overall_stats = overall_stats %>% mutate(significant = ifelse(p_value<0.05, 'Significant', "Not Significant"))


}

## Impute in the following function was the geneproperties data for the 19014
## At the moment I run it manually
imputeData = function(){

  load("/Volumes/mourikisa/data/geneProperties_final.Rdata")

  ## Make factors the categorical variables
  geneProperties$duplicated = factor(geneProperties$duplicated)
  geneProperties$WGD = factor(geneProperties$WGD)
  geneProperties$hub = factor(geneProperties$hub)
  geneProperties$central = factor(geneProperties$central)
  geneProperties$age = factor(geneProperties$age)
  geneProperties$origin = factor(geneProperties$origin)
  geneProperties$exp.breadth = factor(geneProperties$exp.breadth)

  ## Explore missing values and patterns
  md.pattern(geneProperties)
  library(VIM)
  aggr(geneProperties[,c(3:5,7,10,14:15)], col=c('navyblue','yellow'),
       numbers=TRUE, sortVars=TRUE,
       labels=names(geneProperties[,c(3:5,7,10,14:15)]), cex.axis=.7,
       gap=3, ylab=c("Missing data","Pattern"))

  ## Define a mode function to fill in missing values with mode
  Mode <- function(x) {
    ux <- unique(x)
    ux[which.max(tabulate(match(x, ux)))]
  }

  ## Median & Mode imputation
  geneProperties_mmImputed = geneProperties

  ## Bring geneInfo in cause we need CGC-specific median and Rest-specific median
  ## Load also gene info to get  cgc and rst annotation
  load("/Volumes/mourikisa/data/geneInfoNCG5.Rdata")
  geneInfo = geneInfo %>% subset(duplicability!=-1) %>% select(entrez, symbol, cancer_type, cancer_dom, cancer_rec)
  geneProperties_mmImputed = geneProperties_mmImputed %>% left_join(geneInfo, by=c("Entrez"="entrez"))

  ## Median/Mode imputation
  ## Fill WGD
  geneProperties_mmImputed$WGD[is.na(geneProperties_mmImputed$WGD) & geneProperties_mmImputed$cancer_type=="cgc"] = Mode(geneProperties_mmImputed$WGD[geneProperties_mmImputed$cancer_type=="cgc"])
  geneProperties_mmImputed$WGD[is.na(geneProperties_mmImputed$WGD) & geneProperties_mmImputed$cancer_type=="can"] = Mode(geneProperties_mmImputed$WGD[geneProperties_mmImputed$cancer_type=="rst"])
  geneProperties_mmImputed$WGD[is.na(geneProperties_mmImputed$WGD) & geneProperties_mmImputed$cancer_type=="rst"] = Mode(geneProperties_mmImputed$WGD[geneProperties_mmImputed$cancer_type=="rst"])

  ## Fill alldomains
  geneProperties_mmImputed$alldomains[is.na(geneProperties_mmImputed$alldomains) & geneProperties_mmImputed$cancer_type=="cgc"] = median(geneProperties_mmImputed$alldomains[geneProperties_mmImputed$cancer_type=="cgc"], na.rm = T)
  geneProperties_mmImputed$alldomains[is.na(geneProperties_mmImputed$alldomains) & geneProperties_mmImputed$cancer_type=="can"] = median(geneProperties_mmImputed$alldomains[geneProperties_mmImputed$cancer_type=="rst"], na.rm = T)
  geneProperties_mmImputed$alldomains[is.na(geneProperties_mmImputed$alldomains) & geneProperties_mmImputed$cancer_type=="rst"] = median(geneProperties_mmImputed$alldomains[geneProperties_mmImputed$cancer_type=="rst"], na.rm = T)

  ## Fill degree
  geneProperties_mmImputed$degree[is.na(geneProperties_mmImputed$degree) & geneProperties_mmImputed$cancer_type=="cgc"] = median(geneProperties_mmImputed$degree[geneProperties_mmImputed$cancer_type=="cgc"], na.rm = T)
  geneProperties_mmImputed$degree[is.na(geneProperties_mmImputed$degree) & geneProperties_mmImputed$cancer_type=="can"] = median(geneProperties_mmImputed$degree[geneProperties_mmImputed$cancer_type=="rst"], na.rm = T)
  geneProperties_mmImputed$degree[is.na(geneProperties_mmImputed$degree) & geneProperties_mmImputed$cancer_type=="rst"] = median(geneProperties_mmImputed$degree[geneProperties_mmImputed$cancer_type=="rst"], na.rm = T)

  ## Fill betweenness
  geneProperties_mmImputed$betweenness[is.na(geneProperties_mmImputed$betweenness) & geneProperties_mmImputed$cancer_type=="cgc"] = median(geneProperties_mmImputed$betweenness[geneProperties_mmImputed$cancer_type=="cgc"], na.rm = T)
  geneProperties_mmImputed$betweenness[is.na(geneProperties_mmImputed$betweenness) & geneProperties_mmImputed$cancer_type=="can"] = median(geneProperties_mmImputed$betweenness[geneProperties_mmImputed$cancer_type=="rst"], na.rm = T)
  geneProperties_mmImputed$betweenness[is.na(geneProperties_mmImputed$betweenness) & geneProperties_mmImputed$cancer_type=="rst"] = median(geneProperties_mmImputed$betweenness[geneProperties_mmImputed$cancer_type=="rst"], na.rm = T)

  ## Fill hub
  geneProperties_mmImputed$hub[is.na(geneProperties_mmImputed$hub) & geneProperties_mmImputed$cancer_type=="cgc"] = Mode(geneProperties_mmImputed$hub[geneProperties_mmImputed$cancer_type=="cgc"])
  geneProperties_mmImputed$hub[is.na(geneProperties_mmImputed$hub) & geneProperties_mmImputed$cancer_type=="can"] = Mode(geneProperties_mmImputed$hub[geneProperties_mmImputed$cancer_type=="rst"])
  geneProperties_mmImputed$hub[is.na(geneProperties_mmImputed$hub) & geneProperties_mmImputed$cancer_type=="rst"] = Mode(geneProperties_mmImputed$hub[geneProperties_mmImputed$cancer_type=="rst"])

  ## Fill central
  geneProperties_mmImputed$central[is.na(geneProperties_mmImputed$central) & geneProperties_mmImputed$cancer_type=="cgc"] = Mode(geneProperties_mmImputed$central[geneProperties_mmImputed$cancer_type=="cgc"])
  geneProperties_mmImputed$central[is.na(geneProperties_mmImputed$central) & geneProperties_mmImputed$cancer_type=="can"] = Mode(geneProperties_mmImputed$central[geneProperties_mmImputed$cancer_type=="rst"])
  geneProperties_mmImputed$central[is.na(geneProperties_mmImputed$central) & geneProperties_mmImputed$cancer_type=="rst"] = Mode(geneProperties_mmImputed$central[geneProperties_mmImputed$cancer_type=="rst"])

  ## Fill age
  geneProperties_mmImputed$age[is.na(geneProperties_mmImputed$age) & geneProperties_mmImputed$cancer_type=="cgc"] = Mode(geneProperties_mmImputed$age[geneProperties_mmImputed$cancer_type=="cgc"])
  geneProperties_mmImputed$age[is.na(geneProperties_mmImputed$age) & geneProperties_mmImputed$cancer_type=="can"] = Mode(geneProperties_mmImputed$age[geneProperties_mmImputed$cancer_type=="rst"])
  geneProperties_mmImputed$age[is.na(geneProperties_mmImputed$age) & geneProperties_mmImputed$cancer_type=="rst"] = Mode(geneProperties_mmImputed$age[geneProperties_mmImputed$cancer_type=="rst"])

  ## Fill origin
  geneProperties_mmImputed$origin[is.na(geneProperties_mmImputed$origin) & geneProperties_mmImputed$cancer_type=="cgc"] = Mode(geneProperties_mmImputed$origin[geneProperties_mmImputed$cancer_type=="cgc"])
  geneProperties_mmImputed$origin[is.na(geneProperties_mmImputed$origin) & geneProperties_mmImputed$cancer_type=="can"] = Mode(geneProperties_mmImputed$origin[geneProperties_mmImputed$cancer_type=="rst"])
  geneProperties_mmImputed$origin[is.na(geneProperties_mmImputed$origin) & geneProperties_mmImputed$cancer_type=="rst"] = Mode(geneProperties_mmImputed$origin[geneProperties_mmImputed$cancer_type=="rst"])

  ## Fill exp.breadth
  geneProperties_mmImputed$exp.breadth[is.na(geneProperties_mmImputed$exp.breadth) & geneProperties_mmImputed$cancer_type=="cgc"] = Mode(geneProperties_mmImputed$exp.breadth[geneProperties_mmImputed$cancer_type=="cgc"])
  geneProperties_mmImputed$exp.breadth[is.na(geneProperties_mmImputed$exp.breadth) & geneProperties_mmImputed$cancer_type=="can"] = Mode(geneProperties_mmImputed$exp.breadth[geneProperties_mmImputed$cancer_type=="rst"])
  geneProperties_mmImputed$exp.breadth[is.na(geneProperties_mmImputed$exp.breadth) & geneProperties_mmImputed$cancer_type=="rst"] = Mode(geneProperties_mmImputed$exp.breadth[geneProperties_mmImputed$cancer_type=="rst"])

  ## Fill tot.tissue
  geneProperties_mmImputed$tot.tissues[is.na(geneProperties_mmImputed$tot.tissues) & geneProperties_mmImputed$cancer_type=="cgc"] = median(geneProperties_mmImputed$tot.tissues[geneProperties_mmImputed$cancer_type=="cgc"], na.rm = T)
  geneProperties_mmImputed$tot.tissues[is.na(geneProperties_mmImputed$tot.tissues) & geneProperties_mmImputed$cancer_type=="can"] = median(geneProperties_mmImputed$tot.tissues[geneProperties_mmImputed$cancer_type=="rst"], na.rm = T)
  geneProperties_mmImputed$tot.tissues[is.na(geneProperties_mmImputed$tot.tissues) & geneProperties_mmImputed$cancer_type=="rst"] = median(geneProperties_mmImputed$tot.tissues[geneProperties_mmImputed$cancer_type=="rst"], na.rm = T)

  ## Fill hic
  geneProperties_mmImputed$hic[is.na(geneProperties_mmImputed$hic) & geneProperties_mmImputed$cancer_type=="cgc"] = median(geneProperties_mmImputed$hic[geneProperties_mmImputed$cancer_type=="cgc"], na.rm = T)
  geneProperties_mmImputed$hic[is.na(geneProperties_mmImputed$hic) & geneProperties_mmImputed$cancer_type=="can"] = median(geneProperties_mmImputed$hic[geneProperties_mmImputed$cancer_type=="rst"], na.rm = T)
  geneProperties_mmImputed$hic[is.na(geneProperties_mmImputed$hic) & geneProperties_mmImputed$cancer_type=="rst"] = median(geneProperties_mmImputed$hic[geneProperties_mmImputed$cancer_type=="rst"], na.rm = T)

  ## Fill mirna
  geneProperties_mmImputed$mirna[is.na(geneProperties_mmImputed$mirna) & geneProperties_mmImputed$cancer_type=="cgc"] = median(geneProperties_mmImputed$mirna[geneProperties_mmImputed$cancer_type=="cgc"], na.rm = T)
  geneProperties_mmImputed$mirna[is.na(geneProperties_mmImputed$mirna) & geneProperties_mmImputed$cancer_type=="can"] = median(geneProperties_mmImputed$mirna[geneProperties_mmImputed$cancer_type=="rst"], na.rm = T)
  geneProperties_mmImputed$mirna[is.na(geneProperties_mmImputed$mirna) & geneProperties_mmImputed$cancer_type=="rst"] = median(geneProperties_mmImputed$mirna[geneProperties_mmImputed$cancer_type=="rst"], na.rm = T)


  ## I have problems with member of complex cause there is only 1 observation
  save(geneProperties_mmImputed, file="/Volumes/mourikisa/data/geneProperties_final_mmImputed.Rdata")


  ## PMM imputation using mice - here we predict everything
  ## You need to conver characters to factors in order to be imputed - they are already
  ## You can check the manuscript her - https://stat.ethz.ch/education/semesters/ss2012/ams/paper/mice.pdf
  ## I chose 10 because they say that 10-20 is usually sufficient
  geneProperties$duplicated = factor(geneProperties$duplicated)
  geneProperties$WGD = factor(geneProperties$WGD)
  geneProperties$hub = factor(geneProperties$hub)
  geneProperties$central = factor(geneProperties$central)
  geneProperties$age = factor(geneProperties$age)
  geneProperties$origin = factor(geneProperties$origin)
  geneProperties$exp.breadth = factor(geneProperties$exp.breadth)
  ## I parallelized it here. Check the impute.R in automation file, I run the following commnd
  geneProperties_miceImputed <- mice(geneProperties[2:15], m=10, maxit = 20)
  summary(geneProperties_miceImputed)
  save(geneProperties_miceImputed, file="/Users/thmourikis/Desktop/geneProperties_final_miceImputed_object.Rdata")

  ## Then we replace the NAs with the mean of the imputed values
  ## For this I need to utilize spread and gather functions



  ## Continuous variables
  densityplot(geneProperties_miceImputed)
  ## Categorical variables
  categs = geneProperties %>% Filter(f = is.factor) %>% names
  categs = categs[!(categs%in%c("Genic", "memberofcomplex"))]
  mv = complete(geneProperties_miceImputed, 0)
  imputed_data = NULL
  for (i in 1:10){
    cat(i, "\n")
    d = complete(geneProperties_miceImputed, i)
    for (cat in categs){
      initial = mv %>% select_(cat) %>% count_(cat)
      dd = d %>% select_(cat) %>% count_(cat) %>% left_join(initial %>% rename(initial=n)) %>%
        mutate(change=abs(n-initial))
      dd = dd %>% mutate(imputation=i, feature=cat)
      colnames(dd)[1] = "levels"
      imputed_data = rbind(dd, imputed_data)
    }
  }

  ggplot(imputed_data, aes(x=levels, y=change, group=feature, fill=feature)) +
    geom_bar(stat="identity", position = "dodge") + facet_wrap(~imputation, ncol = 2) +
    theme(
      axis.text.x=element_text(angle = 90)
    )

}


getAllDriversTCGA = function(data_dir = "~/athena/novel_driver_prediction/pancancer_model_training/automation/median_mode_imputation/",
                             ct=NULL){

    ## Get gene type
    ## Get the gene names from NCG
    load("~/athena/data/geneInfoNCG5.Rdata")
    load("~/athena/data/cancerGenesNCG5.Rdata")
    ## Fix gene info table from NCG
    geneInfo = geneInfo %>% select(entrez, symbol, cancer_type, gene_categories) %>% unique %>% rename(gene_type=cancer_type)
    ## Get a cancer gene with all the associated primary sites and cancer sites
    cancerGenes = cancerGenes %>% select(entrez, primary_site, cancer_site) %>%
        group_by(entrez) %>% summarise(primary_site=paste(unique(primary_site), collapse=","),
                                       cancer_site=paste(unique(cancer_site), collapse=",")) %>%
        ungroup
    geneInfo = geneInfo %>% left_join(cancerGenes, by=c("entrez"))

    if(is.null(ct)){
        cancer_types <- c("ACC", "BLCA", "BRCA", "CESC", "CHOL", "COAD", "ESCA", "GBM",
                          "HNSC", "KICH", "KIRC", "KIRP", "LAML", "LGG", "LIHC", "LUAD",
                          "LUSC", "OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM",
                          "STAD", "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM")
    }else{
        cancer_types=ct
    }



    ## get the total driver cohort

    total_drivers = NULL
    for (ct in cancer_types){
        message(ct)
        load(paste0(data_dir,ct, "/training_set_noScale.Rdata"))
        training_ns = training_ns %>% tibble::rownames_to_column() %>% separate(rowname, into=c("cancer_type", "sample", "entrez"), sep="\\.")
        load(paste0(data_dir,ct, "/validation_set_noScale.Rdata"))
        validation_ns = validation_ns %>% tibble::rownames_to_column() %>% separate(rowname, into=c("cancer_type", "sample", "entrez"), sep="\\.")
        cohort = rbind(training_ns%>%select(-type)%>%mutate(type="training"), validation_ns%>%mutate(type="prediction"))
        total_drivers = rbind(total_drivers, cohort %>% data.frame())
    }
    total_drivers$entrez = as.numeric(total_drivers$entrez)
    return(total_drivers)

}
## This function takes as input the data frame from getAllDriversTCGA and gives samples with drivers per gene
inferDriversPerGene = function(d = NULL, cgc_fn="/Volumes/mourikisa/novel_driver_prediction/518_CGC_annotation_24_10_16F.xlsx"){

    if (is.null(d)){
        stop("Please provide input data")
    }
    ## Store the CGC drivers after refinement of their driver alterations
    cgc_drivers = NULL
    if(!is.null(cgc_fn)){
        ## Load the CGC that we will consider drivers
        message("Getting CGC annotation...")
        wb = loadWorkbook(cgc_fn)
        cgcs = readWorksheet(wb, sheet=1)
        ## refine to those that will be retained
        retained_CGC = cgcs %>% subset(RET.DIS=="RET")

        ## Refine the training set to include only the ones we decided to coonsider drivers
        message("Refining CGC drivers...")
        for (i in 1:nrow(retained_CGC)){

            en = retained_CGC$entrez[i]
            gene_type= retained_CGC$GENETICS[i]
            symbol = retained_CGC$symbol[i]
            message(paste(symbol, gene_type, sep = "\t"))
            if (gene_type=="O"){
                dd = d %>% subset(entrez==en) %>% subset(CNVGain==1 | no_NTDam_muts!=0 | no_GOF_muts!=0)
                cgc_drivers = rbind(cgc_drivers, dd)
            }else if (gene_type=="TS"){
                dd = d %>% subset(entrez==en) %>% subset(Copy_number==0 | no_TRUNC_muts!=0 | no_NTDam_muts!=0 | (Copy_number==1 & (no_TRUNC_muts>0 | no_NTDam_muts>0)))
                cgc_drivers = rbind(cgc_drivers, dd)
            }else if (gene_type=="O/TS"){
                dd = d %>% subset(entrez==en)
                cgc_drivers = rbind(cgc_drivers, dd)
            }else{
                stop("Unknown gene type")
            }
        }
    }

    ## And now after the refinement of cgc
    ## rbind it with type=="prediction" (prediction set without the CGCs) and count samples per gene
    if(!is.null(cgc_drivers)){
        message("CGC refined alterations mode...")
        allDrivers = rbind(cgc_drivers, d%>%subset(type=="prediction"))
        ## Count per gene
        allDrivers = allDrivers %>% select(sample, entrez) %>% unique %>% group_by(entrez) %>% summarise(samples_with_drivers=n()) %>% ungroup()
    }else if (is.null(cgc_drivers)){
        message("CGC non-refined alterations mode...")
        allDrivers = d %>% select(sample, entrez) %>% unique %>% group_by(entrez) %>% summarise(samples_with_drivers=n()) %>% ungroup()
    }


    return(allDrivers)

}


## Recalculate score
reScore = function(path){

    load(paste0(path, "/scores.Rdata"))
    ## Get the distance from the decision boundary in each kernel
    all_preds = scores[["all_scores"]]
    all_preds = all_preds %>% select(cancer_type, sample, entrez, dv, label, prob_platt, gene_type, kernel, training)

    ## Get the best models
    bm = read.table(paste0(path, "/best_models.tsv"), header = T, sep = "\t", stringsAsFactors = F)
    if (!("kernel"%in%colnames(bm))){
        bm = bm %>% separate(analysis, into=c("kernel", "params"), sep="\\.", extra="merge", remove = F) %>%
            select(-params) %>% select(kernel, mean, var) %>% mutate(ratio=mean/var)
    }else{
        bm = bm %>% select(kernel, mean, var) %>% mutate(ratio=mean/var)
    }

    ## Get sample-specific rank per kernel
    all_preds = all_preds %>% arrange(sample, dv) %>% group_by(sample, kernel) %>% mutate(rank=row_number(-dv)) %>% ungroup
    all_preds = all_preds %>% group_by(sample) %>% mutate(N=max(rank), Nlog10=log10(N)) %>% ungroup

    ## bring the kernel information into the scores
    all_preds = all_preds %>% left_join(bm) %>% rename(cvs=mean)

    ## calculate new score
    #all_preds = all_preds %>% mutate(score=-log10(rank/N)*cvs)
    all_preds = all_preds %>% mutate(score=-log10(rank/N)*ratio)

    ## Sum up the score of each gene across kernels
    scores = all_preds %>% group_by(cancer_type, sample, entrez, gene_type, N, Nlog10) %>%
        summarise(sum_of_scores=sum(score), kernels_used=length(unique(kernel))) %>%
        #mutate(score=(1/(kernels_used*Nlog10))*sum_of_scores) %>% ungroup %>% data.frame()
        ungroup %>% group_by(sample) %>% mutate(max_score=max(sum_of_scores)) %>% mutate(score=sum_of_scores/max_score) %>%
        ungroup %>% data.frame()

    ## Add from how many kernels they have predicted as TRUE
    preds2kernels = all_preds %>% subset(label) %>% group_by(cancer_type, sample, entrez) %>% summarise(kernels_predicted=paste(unique(kernel), collapse=","), kernels_predicted_no=length(unique(kernel))) %>% ungroup
    scores = scores %>% left_join(preds2kernels)
    scores$kernels_predicted_no[is.na(scores$kernels_predicted_no)] = 0

    return(list(all_scores=all_preds, scores=scores))

}

saveReport = function (path, cohort=NULL, title=NULL, out_fn="sypcad_report.pdf", tissue=NULL) {

    if(is.null(tissue)){
        message("Tissue argument is null!")
    }

    pdf(file=paste0( path, "/", out_fn,sep=""),width = unit(12, "inches"), height = unit(14, "inches"))

    pushViewport(viewport(layout=grid.layout(10, 5, widths=c(0.05, 1, 0.05, 0.05, 1), heights=c(.05,.01,.15,.01, 0.1, 0.28,0.01,0.1, 0.28,.05))))

    grid.text( title , vp = viewport(layout.pos.row = 1, layout.pos.col = 3), gp=gpar(fontface='bold'), just="center")

    grid.text("A. General information", vp = viewport(layout.pos.row = 2, layout.pos.col = 1), gp=gpar(fontface='bold'), just="left")

    ## Get the table witht hte general info for the cancer type
    if(!is.null(cohort)){
        ## Make a table for the genes and print the table in the PDF
        ## This stays the same always
        training_g = cohort %>% subset(type=="C") %>% select(entrez) %>% unique %>% .$entrez
        training_s = cohort %>% subset(type=="C") %>% select(sample) %>% unique %>% .$sample
        training_t = cohort %>% subset(type=="C")
        validation_g = cohort %>% subset(type=="P") %>% select(entrez) %>% unique %>% .$entrez
        validation_s = cohort %>% subset(type=="P") %>% select(sample) %>% unique %>% .$sample
        validation_t = cohort %>% subset(type=="P")
        samples_drivers = unique(cohort$sample)
        samples_unexplained = setdiff(validation_s, training_s)
        d1 = data.frame(info=c("Training(g/s/t)", "Validation(g/s/t)","Samples (drivers)", "Samples (unexplained)"),
                       values=c(paste(length(training_g), length(training_s), nrow(training_t), sep="/"), paste(length(validation_g), length(validation_s), nrow(validation_t), sep="/"),
                                length(samples_drivers), length(samples_unexplained)))
        g1 = tableGrob(d1, rows = NULL, cols=NULL, vp=viewport(layout.pos.row=3,layout.pos.col=1:2))
        grid.draw(g1)

        cgc = cohort %>% subset(sys_can & gene_type=="cgc") %>% select(entrez) %>% unique %>% .$entrez
        ncg_cans = cohort %>% subset(sys_can & gene_type=="can") %>% select(entrez) %>% unique %>% .$entrez
        fps = cohort %>% subset(sys_can & NCG_FP) %>% select(entrez) %>% unique %>% .$entrez
        rst = cohort %>% subset(sys_can & gene_type=="rst") %>% select(entrez) %>% unique %>% .$entrez
        ## Load the Cancer genes file to distinguish between cancer specific and not
        load("/Volumes/mourikisa/data/geneInfoNCG5.Rdata")
        load("/Volumes/mourikisa/data/cancerGenesNCG5.Rdata")
        ## Fix gene info table from NCG
        geneInfo = geneInfo %>% select(entrez, cancer_type) %>% unique
        ## Get a cancer gene with all the associated primary sites and cancer sites
        cancerGenes = cancerGenes %>% select(entrez, primary_site, cancer_site) %>%
            group_by(entrez) %>% summarise(primary_site=paste(unique(primary_site), collapse=","),
                                           cancer_site=paste(unique(cancer_site), collapse=",")) %>%
            ungroup
        geneInfo = geneInfo %>% left_join(cancerGenes, by=c("entrez"))

        ## Check if tissue is in NCG tissue
        ncg_sites = geneInfo %>% subset(!is.na(primary_site)) %>% select(primary_site) %>% mutate(primary_site=strsplit(primary_site, ",")) %>% unnest(primary_site) %>% unique() %>% .$primary_site
        if (!(tissue%in%ncg_sites)){
            stop("The tissue provided is not part of NCG5")
        }

        ## get the cancer specific ones
        cs_cgc = geneInfo %>% subset(entrez%in%cgc & grepl(tissue, primary_site)) %>% select(entrez) %>% unique %>% .$entrez
        cs_ncg_cans = geneInfo %>% subset(entrez%in%ncg_cans & grepl(tissue, primary_site)) %>% select(entrez) %>% unique %>% .$entrez
        cs_fps = geneInfo %>% subset(entrez%in%fps & grepl(tissue, primary_site)) %>% select(entrez) %>% unique %>% .$entrez

        d2 = data.frame(info=c("CGCs", "Cancer-specific CGCs", "Sys candidates (cans;FP)", "Cancer-specific (cans;FP)", "Sys candidates (rst)"),
                       values=c(length(cgc), length(cs_cgc), paste0(length(ncg_cans), ";", length(fps)), paste0(length(cs_ncg_cans), ";", length(cs_fps)), length(rst)))
        g2 = tableGrob(d2, rows = NULL, cols=NULL, vp=viewport(layout.pos.row=3,layout.pos.col=4:5))
        grid.draw(g2)



    }

    grid.text("B. Score distribution (overall)", vp = viewport(layout.pos.row = 4, layout.pos.col = 1), gp=gpar(fontface='bold'), just="left")

    ## Get the score distributions
    if(!is.null(cohort)){
        if(grepl("OAC", title)){
            toPlot = rbind(cohort %>% subset(type=="P")%>% mutate(type="All scores"),
                           cohort %>% subset(type=="P" & sys_can)%>% mutate(type="Systems candidates"))
            toPlot = toPlot %>% mutate(cl=ifelse(symbol=="KAT2A", "KAT2A", ifelse(symbol=="KAT2B", "KAT2B", "Rest")))
            tb = toPlot %>% mutate(type=ifelse(type=="All scores", "All", "Sys cans")) %>%
                group_by(type) %>%
                summarise(min=format(min(score), scientific = T, digits = 2), median=format(median(score), scientific = T, digits = 2),
                          mean=format(mean(score), scientific = T, digits = 2), max=format(max(score), scientific = T, digits = 2))
            p = ggplot(toPlot, aes(x=type, y=score)) + geom_boxplot(outlier.colour = NA) + geom_point(aes(color=cl, alpha=.5), position = position_jitter(width = 0.2)) +
                scale_color_manual(values=c("red", "green", "black"), name="Genes", breaks=c("KAT2A","KAT2B")) +
                theme_boss() +
                theme(
                    axis.line.x = element_line(color="black"),
                    axis.line.y = element_line(color="black"),
                    axis.text.x = element_text(size=12),
                    axis.text.y = element_text(size=12),
                    axis.title.x = element_text(size=14)
                ) +
                xlab("")
            g = tableGrob(tb, rows = NULL, vp=viewport(layout.pos.row=5,layout.pos.col=1:2))
            grid.draw(g)
            print(p, vp=viewport(layout.pos.row=6,layout.pos.col=1:2))
        }else{
            toPlot = rbind(cohort %>% subset(type=="P")%>% mutate(type="All scores"),
                          cohort %>% subset(type=="P" & sys_can)%>% mutate(type="Systems candidates"))
            tb = toPlot %>% mutate(type=ifelse(type=="All scores", "All", "Sys cans")) %>%
                group_by(type) %>%
                summarise(min=format(min(score), scientific = T, digits = 2), median=format(median(score), scientific = T, digits = 2),
                          mean=format(mean(score), scientific = T, digits = 2), max=format(max(score), scientific = T, digits = 2))
            p = ggplot(toPlot, aes(x=type, y=score)) + geom_boxplot(outlier.colour = NA) + #geom_point(position = position_jitter(width = 0.2)) +
                theme_boss() +
                theme(
                    axis.line.x = element_line(color="black"),
                    axis.line.y = element_line(color="black"),
                    axis.text.x = element_text(size=12),
                    axis.text.y = element_text(size=12),
                    axis.title.x = element_text(size=14)
                ) +
                xlab("")
            g = tableGrob(tb, rows = NULL, vp=viewport(layout.pos.row=5,layout.pos.col=1:2))
            grid.draw(g)
            print(p, vp=viewport(layout.pos.row=6,layout.pos.col=1:2))
        }
    }

    grid.text("C. Mean score within samples (only sys cans)", vp = viewport(layout.pos.row = 4, layout.pos.col = 4), gp=gpar(fontface='bold'), just="left")

    ## Get the mean score of each sample
    if(!is.null(cohort)){

        toPlot = cohort %>% subset(type=="P" & sys_can) %>% group_by(sample) %>% summarise(mean_score=mean(score))
        toPlot$type = paste0("samples(",nrow(toPlot), ")")
        samples2sys_cans = cohort %>% subset(type=="P" & sys_can) %>% group_by(sample) %>% summarise(n=n())
        toPlot = toPlot %>% left_join(samples2sys_cans)
        tb = cbind(cohort %>% subset(type=="P" & sys_can) %>% group_by(sample) %>% summarise(mean_score=mean(score)) %>% .$mean_score %>% summary(),
                    cohort %>% subset(type=="P" & sys_can) %>% group_by(sample) %>% summarise(n=n()) %>% .$n %>% summary()) %>% data.frame()
        colnames(tb) = c("Mean score", "Sys cans")
        tb = tb %>% t()
        tb = tb[,c(1,3,4,6)]
        tb = tb %>% data.frame() %>% tibble::rownames_to_column()
        colnames(tb) = c("type", "min", "median", "mean", "max")
        tb = tb %>% mutate(min=format(min, scientific = T, digits = 2), median=format(median, scientific = T, digits = 2),
                           mean=format(mean, scientific = T, digits = 2), max=format(max, scientific = T, digits = 2))

        p = ggplot(toPlot, aes(x=type, y=mean_score)) + geom_boxplot(outlier.colour = NA) + geom_point(aes(colour = factor(n)), position = position_jitter(width = 0.2)) +
            scale_color_manual(values = c("red", "orange", "green", "blue"), name="Number of\nsys cans", breaks=c(1,2,3,4)) +
            theme_boss() +
            theme(
                axis.line.x = element_line(color="black"),
                axis.line.y = element_line(color="black"),
                axis.text.x = element_text(size=12),
                axis.text.y = element_text(size=12),
                axis.title.x = element_text(size=14),
                legend.title=element_blank()
            ) + xlab("") + ylab("Mean sys candidate score within each sample")
            scale_colour_discrete(name="Sys\ncandidates")
        g = tableGrob(tb, rows = NULL, vp=viewport(layout.pos.row=5,layout.pos.col=4:5))
        grid.draw(g)
        print(p, vp=viewport(layout.pos.row=6,layout.pos.col=4:5))
    }

    ncg_cans = cohort %>% subset(sys_can & gene_type=="can") %>% select(entrez) %>% unique %>% .$entrez
    fps = cohort %>% subset(sys_can & NCG_FP) %>% .$entrez
    rst = cohort %>% subset(sys_can & gene_type=="rst") %>% select(entrez) %>% unique %>% .$entrez
    grid.text(paste0("D. Recurrence of sys candidates (",length(ncg_cans)+length(rst) ,") across samples"), vp = viewport(layout.pos.row = 7, layout.pos.col = 1), gp=gpar(fontface='bold'), just="left")

    ## Get in how many samples the sys cans are mutated
    if(!is.null(cohort)){
        samples =  cohort %>% subset(sys_can & type=="P") %>% select(sample) %>% unique %>% nrow
        toPlot = cohort %>% subset(type=="P" & sys_can) %>% group_by(symbol) %>% summarise(n=n())
        p = ggplot(toPlot, aes(x=n)) + geom_histogram(binwidth=1) + theme_boss() +
            theme(
                axis.line.x = element_line(color="black"),
                axis.line.y = element_line(color="black"),
                axis.text.x = element_text(size=12),
                axis.text.y = element_text(size=12),
                axis.title.x = element_text(size=14)
            ) + xlab("Number of samples") +
            scale_x_continuous(limits = c(0, samples))
        tb = toPlot %>%  .$n %>% summary() %>% cbind() %>% data.frame()
        colnames(tb) = "samples"
        tb = tb %>% t() %>% data.frame()
        tb = tb[,c(1,3,4,6)]
        colnames(tb) = c("min", "median", "mean", "max")
        g = tableGrob(tb, rows = NULL, vp=viewport(layout.pos.row=8,layout.pos.col=1:2))
        grid.draw(g)
        print(p, vp=viewport(layout.pos.row=9,layout.pos.col=1:2))
    }

    ## Get genes that are recurrent and plot mean score versus variance
    all_genes = cohort %>% subset(type=="P") %>% group_by(entrez, symbol) %>% summarise(n=n(), m=mean(score), r=max(score)-min(score)) %>% ungroup
    rec_genes = all_genes %>% subset(n>1)
    rec_genes = rec_genes %>% mutate(cl=ifelse(entrez%in%c(ncg_cans, rst), TRUE, FALSE))
    grid.text(paste0("E. Gene score across samples (", nrow(rec_genes),"/",nrow(all_genes), " genes)") , vp = viewport(layout.pos.row = 7, layout.pos.col = 4), gp=gpar(fontface='bold'), just="left")

    if(!is.null(cohort)){
        qs = quantile(rec_genes$r, seq(0,1,.05))[c(16,20)]
        tb = rbind(rec_genes %>%  .$r %>% summary() %>% cbind() %>% data.frame(),
                   qs %>% data.frame())
        tb[,1] = format(tb[,1], scientific = T, digits = 2)
        colnames(tb) = "score range"
        tb = tb %>% t() %>% data.frame()
        tb = tb[,c(1,3,4,7, 8, 6)]
        tb = tb %>% data.frame() %>% tibble::rownames_to_column()
        colnames(tb) = c("type", "min", "median", "mean", "75%", "95%", "max")

        p = ggplot(rec_genes, aes(x=r, y=m)) + geom_point(aes(color=cl)) +
#             geom_text(data=subset(rec_genes, r > 10),
#                       aes(x=r,y=m+0.5,label=symbol), size=3) +
            theme_boss() +
            theme(
                axis.line.x = element_line(color="black"),
                axis.line.y = element_line(color="black"),
                axis.text.x = element_text(size=12),
                axis.text.y = element_text(size=12),
                axis.title.x = element_text(size=14)
            ) + xlab("Score range") + ylab("Mean gene score across samples") +
            scale_color_manual(values = c("black", "red"), name="Sys candidates")

        g = tableGrob(tb, rows = NULL, vp=viewport(layout.pos.row=8,layout.pos.col=4:5))
        grid.draw(g)
        print(p, vp=viewport(layout.pos.row=9,layout.pos.col=4:5))
    }


    dev.off()
    message(paste0( "Results saved at: ", path, "/", out_fn,sep=""))
}

## this function adds information from gtex, and takes into account gene
## expression when calculating the rank of each prediction within each sample
getSysCans = function(path=NULL, cancer_type=NULL, tissue=NULL, scores_fn="scores.Rdata",
                      fp_dir="~/Mountpoints/rosalind_lustre/mourikisa/data/NCG_false_positives.txt",
                      cgc_fn="~/Mountpoints/rosalind_lustre/mourikisa/data/518_CGC_annotation_2211.xlsx",
                      previous.studies="~/Mountpoints/rosalind_lustre/mourikisa/data/CGCs_to_be_considered_OAC.tsv",
                      exclude_expr=c("Not Expressed")){

    require(readxl)

    if(is.null(path) | is.null(cancer_type) | is.null(tissue)){
        stop("Missing arguments")
    }

    ## Get the data we need for the definition of the sys cans
    ## Load geneInfo to get the symbols as well
    load("~/Mountpoints/rosalind_lustre/mourikisa/data/geneInfoNCG5.Rdata")
    geneInfoNCG5 = geneInfo %>% select(entrez, symbol) %>% unique

    ## Load the CGC that we will consider drivers
    message("Getting CGC annotation...")
    #wb = loadWorkbook(cgc_fn) ## changed function
    #cgcs = readWorksheet(wb, sheet=1) ## changed funstion
    cgcs = read_excel(cgc_fn, 1)
    ## refine to those that will be retained
    retained_CGC = cgcs %>% subset(RET.DIS=="RET")
    retained_CGC_trans = retained_CGC %>% subset(grepl("Trans", Drivers.to.be.retained))
    ## Get training and validation set
    load(paste0(path, "/training_set_noScale.Rdata"))
    training_ns = training_ns%>%tibble::rownames_to_column()%>%
        separate(rowname, into=c("cancer_type", "sample", "entrez"), sep="\\.") %>%
        mutate(entrez=as.numeric(entrez)) %>% data.frame()
    ## Refine the training set to include only the ones we decided to coonsider drivers
    message("Refining CGC drivers...")
    cgc_drivers = NULL
    for (i in 1:nrow(retained_CGC)){
        en = retained_CGC$entrez[i]
        gene_type= retained_CGC$GENETICS[i]
        symbol = retained_CGC$symbol[i]
        retained_drivers = retained_CGC$Drivers.to.be.retained[i]
        #message(paste(symbol, gene_type, sep = "\t"))
        if (gene_type=="O"){
            if(grepl("Trans", retained_drivers)){
                dd = training_ns %>% subset(entrez==en) %>% subset(CNVGain==1 | no_NTDam_muts!=0 | no_GOF_muts!=0 | BND==1 | INV==1 | INS==1)
            }else{
                dd = training_ns %>% subset(entrez==en) %>% subset(CNVGain==1 | no_NTDam_muts!=0 | no_GOF_muts!=0)
            }
            cgc_drivers = rbind(cgc_drivers, dd)
        }else if (gene_type=="TS"){
            if(grepl("Trans", retained_drivers)){
                dd = training_ns %>% subset(entrez==en) %>% subset(Copy_number==0 | no_TRUNC_muts!=0 | no_NTDam_muts!=0 | (Copy_number==1 & (no_TRUNC_muts>0 | no_NTDam_muts>0)) | BND==1 | INV==1 | INS==1)
            }else{
                dd = training_ns %>% subset(entrez==en) %>% subset(Copy_number==0 | no_TRUNC_muts!=0 | no_NTDam_muts!=0 | (Copy_number==1 & (no_TRUNC_muts>0 | no_NTDam_muts>0)))
            }
            cgc_drivers = rbind(cgc_drivers, dd)
        }else if (gene_type=="O/TS"){
            if(grepl("Trans", retained_drivers)){
                ## All drivers are considered
                dd = training_ns %>% subset(entrez==en)
            }else{
                ## Here we need to exlude all the genes with only SVs
                dd = training_ns %>% subset(entrez==en) %>% subset(!(no_TRUNC_muts==0 & no_NTDam_muts==0 & no_GOF_muts==0 & Copy_number!=0 & CNVGain==0 & (BND!=0 | INS!=0 | INV!=0)))
            }
            cgc_drivers = rbind(cgc_drivers, dd)
        }else{
            stop("Unknown gene type")
        }
    }

    ## Now add in the CGC drivers the CGCs identified from previous studies
    previous = read.delim(previous.studies, header = T, sep="\t")
    ## Get only those that were not considered already
    previous = previous %>% subset(to.be.considered & !already.considered)
    ## Get the entrez for these genes
    previous = previous %>% left_join(geneInfo%>%select(entrez, symbol), by=c("Gene.symbol"="symbol"))

    for(i in 1:nrow(previous)){
        en = previous$entrez[i]
        gene_type= previous$Type[i]
        symbol = previous$Gene.symbol[i]

        if(gene_type=="SVs"){
            dd = training_ns %>% subset(entrez==en) %>% subset(BND==1 | INV==1 | INS==1 | CNVLoss==1 | CNVGain==1)
        }else if(gene_type=="CNVs"){
            dd = training_ns %>% subset(entrez==en) %>% subset(CNVLoss==1 | CNVGain==1)
        }
        cgc_drivers = rbind(cgc_drivers, dd)
    }


    load(paste0(path, "/validation_set_noScale.Rdata"))
    validation_ns = validation_ns%>%tibble::rownames_to_column()%>%
        separate(rowname, into=c("cancer_type", "sample", "entrez"), sep="\\.")%>%
        mutate(type="P")%>%mutate(entrez=as.numeric(entrez))%>%data.frame()
    cohort = rbind(cgc_drivers, validation_ns) ## Instead of the whole training, we use only the subset of the refined CGC
    ## Get scores
    load(paste0(path, "/",scores_fn))
    scores = scores[["scores"]]

    cohort = cohort %>% left_join(scores)
    cohort$gene_type[is.na(cohort$gene_type)] = "cgc"
    cohort = cohort %>% left_join(geneInfoNCG5)

    ## Make NAs in CGC score to be the highest score in each patient + 1 so we can sort by score in the output table
    sample2maxscore = cohort %>% group_by(sample) %>% summarise(max_score=max(score, na.rm = T)+1)
    cohort$score[is.na(cohort$score) & cohort$gene_type=="cgc"] = sample2maxscore$max_score[match(cohort$sample, sample2maxscore$sample)][which(is.na(cohort$score))]


    samples = unique(cohort$sample)
    sys_cans = NULL
    if (!is.null(tissue)){
        message("Expression is considered for the selection of sys cans...")
        ## Load expression data (GTeX)
        message("Loading GTeX expression data")
        load("~/Mountpoints/rosalind_lustre/mourikisa/data/expressionNCG5_gtex.Rdata") ## name of data frame is gtex
        gtex = gtex %>% select(entrez, tissue, exp_level) %>% spread(tissue, exp_level)
        ## Checkpoint
        if(!(tissue%in%colnames(gtex))){
            stop("Wrong tissue supplied")
        }
        ## Select tissue of the cancer type
        gtex = gtex[,c("entrez", tissue)]
        colnames(gtex) = c("entrez", "gtex_tissue_expression")
        ## bring in the cohort table the expression
        message("Getting GTEx expression")
        cohort = cohort %>% left_join(gtex)


        total = length(samples)
        pb <- txtProgressBar(min = 0, max = total, style = 3)
        for (i in 1:length(samples)){
            Sys.sleep(0.1)
            ## When you get the candidates here you need to take into account expression in normal tissues and in the cancer sample

            ## Here I get the ranks for the non-CGC genes. The CGCs will be added as patient.rank==NA
            ## I also add 2 ranks, one with all the genes and one without amplifications
            d = cohort %>% subset(sample==samples[i] & gene_type!="cgc" & !gtex_tissue_expression%in%exclude_expr & !is.na(gtex_tissue_expression)) %>%
                arrange(desc(score)) %>% mutate(patient.rank=row_number(-score))
            d_withoutAmp = d %>% subset(sample==samples[i] & gene_type!="cgc" & CNVGain==0 & !gtex_tissue_expression%in%exclude_expr & !is.na(gtex_tissue_expression)) %>%
                arrange(desc(score)) %>% mutate(patient.rank.no.amp=row_number(-score))
            ## Bring in the rank without amplification
            d = d %>% left_join(d_withoutAmp)

            ## Add the CGCs
            d_cgcs = cohort %>% subset(sample==samples[i] & gene_type=="cgc" & !gtex_tissue_expression%in%exclude_expr & !is.na(gtex_tissue_expression))
            d = rbind.fill(d, d_cgcs)

            ## Add all those that are not expressed (I will exclude those, I won't use them anyways) - code left here for testing
            #d_ne = cohort %>% subset(sample==samples[i] & (gtex_tissue_expression=="Not Expressed" | is.na(gtex_tissue_expression)))
            #d = rbind.fill(d, d_ne)

            ## Add them to the big table
            sys_cans = rbind(sys_cans, d)
            setTxtProgressBar(pb, i)
        }
    }

    ## Add false positive annotation
    false_positive_genes <- read.table(fp_dir, header = T, sep = "\t")
    false_positive_genes <- false_positive_genes %>% select(entrez) %>% .$entrez
    sys_cans = sys_cans %>% mutate(NCG_FP=ifelse(entrez%in%false_positive_genes, TRUE, FALSE))

    ## In the final output we need everything (also the CGC that were excluded) so create an export table -- No we don't, I disabled this. If needed in the future maybe turn it back on
    #export = rbind(training_ns, validation_ns)
    #export = export %>% left_join(cohort)
    ## all the rest CGC are now coming in and their gene type will be NA
    #export$gene_type[is.na(export$gene_type)] = "CGC_discarded"


    message(paste0("Score file used: ", scores_fn))

    return(sys_cans)

}


## This function takes in the prediction scores and maps the first 100
## genes to pathways
scores2pathways = function(path, geneset="/Volumes/mourikisa/novel_driver_prediction/pancancer_model_training/geneSets/curated_gene_sets/c2.all.v5.1.entrez.gmt.txt"){

    load(paste0(path, "/scores.Rdata"))
    d = scores[["scores"]] %>% arrange(desc(score)) %>% slice(1:100)

    pathways = make_pathway_list(geneset)
    message(paste0(length(pathways), " pathways loaded..."))
    message(paste0(unlist(pathways) %>% unique() %>% length(), " genes found in those pathways..."))
    len = sapply(pathways, length)
    message("Excluding pathways with \"CANCER\" in their name...")
    pathways = pathways[!grepl("CANCER",names(pathways))]

    genes2map = d %>% select(entrez) %>% unique
    genes2map$path = sapply(as.list(as.character(unique(d$entrez))), function(x,y){
        sel=sapply(y, function(z,w) w%in%z, w = x);
        return( paste(names(y)[sel],collapse='.')) },
        y = pathways)

}

## This function was created to add features in the existing tables of MySQL database
## It pulls the tables from the MySQL database and adds features
## For the moment run it manually in Athena
addFeatures = function(cancer_type="OAC"){

    source("/home/mourikisa/novel_driver_prediction/pancancer_model_training/automation/config.R")

#     ## First feature to be added is the "all somatic SNVs and indels for every gene in the coresponding sample"
#     ## For this I need to load the mutation datasets from the Score_correction directory
#     load("/home/mourikisa/novel_driver_prediction/Score_correction/mutation_datasets.Rdata")
#     nsi2samples2genes = total_muts_nsi %>% subset(!is.na(entrez_19014)) %>% group_by(Cancer_type, Sample, entrez_19014, symbol_19014) %>% summarise(nsi_muts=n()) %>% ungroup %>% data.frame()
#     si2samples2genes = total_muts_si %>% subset(!is.na(entrez_19014)) %>% group_by(Cancer_type, Sample, entrez_19014, symbol_19014) %>% summarise(si_muts=n()) %>% ungroup %>% data.frame()
#     muts2samples2genes = nsi2samples2genes %>% full_join(si2samples2genes)
#     muts2samples2genes$nsi_muts[is.na(muts2samples2genes$nsi_muts)] = 0
#     muts2samples2genes$si_muts[is.na(muts2samples2genes$si_muts)] = 0
#     muts2samples2genes = muts2samples2genes %>% mutate(no_ALL_muts=nsi_muts+si_muts)
#     muts2samples2genes = muts2samples2genes %>% rename(Entrez=entrez_19014) %>% select(-symbol_19014, -nsi_muts, -si_muts)
#     muts2samples2genes$Entrez = as.numeric(muts2samples2genes$Entrez)
#     ## OAC is not in there (Add OAC)
#     load("/Volumes/mourikisa/data/OAC/Rdata/dataset_mutations_damaging.Rdata")
#     ## Watch out for this definition - depending on the version of ANNOVAR the categories may change name
#     ns = c("nonsynonymous","stopgain","frameshift deletion","splicing","frameshift insertion","nonframeshift deletion","nonframeshift insertion","nonframeshift substitution","stoploss","frameshift substitution")
#     oac_nsi2gene2sample = df_mut %>% subset(ExonicFunc.refGene%in%ns)  %>% subset(!is.na(entrez_19014)) %>% group_by(entrez_19014, symbol_19014, sample) %>% summarise(nsi_muts=n()) %>% ungroup %>%
#         mutate(Cancer_type="OAC") %>% rename(Sample=sample, Entrez=entrez_19014) %>% select(Cancer_type, Sample, Entrez, nsi_muts)
#     oac_si2gene2sample = df_mut %>% subset(!(ExonicFunc.refGene%in%ns))  %>% subset(!is.na(entrez_19014)) %>% group_by(entrez_19014, symbol_19014, sample) %>% summarise(si_muts=n()) %>% ungroup %>%
#         mutate(Cancer_type="OAC") %>% rename(Sample=sample, Entrez=entrez_19014) %>% select(Cancer_type, Sample, Entrez, si_muts)
#     oac_muts2samples2genes = oac_nsi2gene2sample %>% full_join(oac_si2gene2sample)
#     oac_muts2samples2genes$nsi_muts[is.na(oac_muts2samples2genes$nsi_muts)] = 0
#     oac_muts2samples2genes$si_muts[is.na(oac_muts2samples2genes$si_muts)] = 0
#     oac_muts2samples2genes = oac_muts2samples2genes %>% mutate(no_ALL_muts=nsi_muts+si_muts)
#     oac_muts2samples2genes = oac_muts2samples2genes %>% select(-nsi_muts, -si_muts)
#
#     d = getTable(tb=cancer_type)
#     d = d %>% left_join(muts2samples2genes)
#     d$no_ALL_muts[is.na(d$no_ALL_muts)] = 0
#     ## Reorder columns
#     d = d[,c(1:3, length(d), 4:(length(d)-1))]


#     ## Add histone compartment covariate in the features
#     gene_cov = read.table("/Volumes/mourikisa/novel_driver_prediction/Score_correction/gene_covariates.txt", header = T) %>% rename(symbol=gene) %>% select(symbol, reptime, hic)
#     ## Get their entrez through geneInfo table in NCG
#     dev_mode()
#     library(ncglib)
#     geneInfo = get_geneinfo(version="NCG5")
#     dev_mode()
#     geneInfo = geneInfo %>% select(entrez, symbol, cancer_type) %>% unique %>% rename(gene_type=cancer_type)
#     gene_cov = gene_cov %>% left_join(geneInfo)
#     ## exclude those for which we dont have entrez
#     gene_cov = gene_cov %>% subset(!is.na(entrez))
#     gene_cov = gene_cov %>% rename(Entrez=entrez) %>% select(Entrez, reptime, hic)
#     write.table(gene_cov, file="/Volumes/mourikisa/novel_driver_prediction/Score_correction/gene_covariates_geneInfo_crossed.tsv", quote = F, row.names = F, sep=)


    gene_cov = read.table(file="/home/mourikisa/novel_driver_prediction/Score_correction/gene_covariates_19014_crossed.tsv", header = T)
    ## Delete the columns if already exist
    d = d %>% select(-hic, -reptime)
    d = d %>% left_join(gene_cov)
    ## For those I have no information for hic, I set it to 0 for LAML and OAC
    ## or the rest I will set it to the median of the distribution - look at the Score_correction directory

    d$hic[is.na(d$hic)] = 25 ## Set it to the median of the overall distribution
    d$reptime[is.na(d$reptime)] = 394 ## Set it to the median of the overall distribution

    ## Correct the mistake with primates
    load("/mnt/lustre/users/k1469280/mourikisa/data/geneProperties.Rdata")
    geneProperties = geneProperties %>% select(Entrez, origin) %>% unique %>% subset(origin=="Primates") %>%
      mutate(primates=1) %>% select(-origin)
    d = d %>% left_join(geneProperties)
    d$primates[is.na(d$primates)] = 0
    ## Make primates factor
    d$primates = as.factor(d$primates)

    ## Reorder columns
    d = d[,c(1:45, length(d), 46:(length(d)-1))]


    ## Add microRNAs
    ## load ncglib
#     con = get_ncg_connection()
#     con %>% tbl_df()
#     mirna = con %>% tbl("NCG51_mirna_interactions") %>% collect(n=Inf)
    load("/mnt/lustre/users/k1469280/mourikisa/data/miRNA_NCG51.Rdata")
    ## Select only the human miRNAs
    mirna = mirna %>% separate(mir, into=c("sp", "crap"), sep="-", extra="merge", remove = F) %>% select(-crap) %>% subset(sp=="hsa")
    mirna = mirna %>% group_by(entrez) %>% summarise(mirna=length(unique(mir))) %>% rename(Entrez=entrez) %>% data.frame()
    ## For testing purposes (to see if there is actually a difference)
    #load("/Volumes/mourikisa/data/geneInfoNCG5.Rdata")
    #mirna = mirna %>% left_join(geneInfo%>%select(entrez, symbol, cancer_type)%>%rename(gene_type=cancer_type), by=c("Entrez"="entrez"))
    d = d %>% left_join(mirna)
    d$mirna[is.na(d$mirna)] = 0


    write.table(d, file="/home/mourikisa/novel_driver_prediction/Rdata/total_table_TCGA_19014_prepared_LAML.tsv", quote = F, sep = "\t", row.names = F, col.names = F)

#     ## then in the MySQLdatabase I run the following
#     rename table LAML to LAML_test;
#     create table PAAD like PAAD_test;
#     alter table PAAD add no_ALL_muts INT(11) after Entrez;
#     alter table PAAD add primates tinyint(1) after mammals;
#     alter table PAAD add reptime INT(11);
#     alter table PAAD add hic INT(11);

#     load data local infile '/home/mourikisa/novel_driver_prediction/Rdata/total_table_TCGA_19014_prepared_PAAD.tsv' into table PAAD;

    ## ------------------------------------------------------------------------
    ## Update WGD using the raw data - the results are always saved in the geneProperties_final.Rdata
    ## WGD data (raw)
    wgd = read.xlsx2("/Users/thmourikis/Desktop/WGD_data.xlsx", 1, startRow = 2)
    ## get geneSymbols
    load("/Users/thmourikis/Desktop/19014_GeneSymbols.Rdata")
    noWGD_19014 = geneSymbols %>% subset(symbol%in%wgd$Gene[wgd$Ohnolog..O.!="O"]) %>%
        select(Entrez,Symbol) %>% unique %>% mutate(WGD=0)
    WGD_19014 = geneSymbols %>% subset(symbol%in%wgd$Gene[wgd$Ohnolog..O.=="O"]) %>%
        select(Entrez,Symbol) %>% unique %>% mutate(WGD=1)
    WGD_data_19014 = rbind(WGD_19014, noWGD_19014) ## It may be the case that there are aliases as Ohnologs and non Ohnologs
    WGD_data_19014 = WGD_data_19014 %>%  arrange(desc(WGD)) %>% subset(!duplicated(Entrez))


    ## Update the Duplicability (I am adding the highest of the two - genic or Genomic because this is what Giovanni found significant)
    ## Get the alldups table with all duplicability information
    alldups = read.table('/Users/fc-8s-imac/ncg-data-update/duplicability_CGC_analysis/output/duplicability_withdomains.csv', header=T) %>%
        subset(mainEntrez==T) %>%
        rename(genegroup=variable)

    ## Get also the geneProperties_complete
    load("/Volumes/mourikisa/data/geneProperties_complete.Rdata")
    ## Remember to take unique in the joining because alldups is redundant for dataset column
    geneProperties = geneProperties %>% rename(Genic_old=Genic) %>% left_join(alldups%>%select(Entrez, Duplicability, Genic, Genomic)%>%unique)
    ## Get duplicability in 60% because this where Giovanni saw difference
    geneProperties = geneProperties %>% mutate(duplicated=ifelse(Duplicability>=60, 1, 0))
    save(geneProperties, file="/Volumes/mourikisa/data/geneProperties_complete.Rdata")

    ## Create geneProperties final
    ## Fic domains cause Giovanni set it to 0
    geneProperties = geneProperties %>% mutate(alldomains=ifelse(inCDD==0, NA, alldomains))
    geneProperties = geneProperties %>% rename(WGD=WGD_new)
    geneProperties = geneProperties %>% select(Entrez, Length.fullrefseq, duplicated, WGD, alldomains,
                              degree, betweenness, hub, central, age, origin, exp.breadth,
                              tot.tissues, hic, mirna)
    save(geneProperties, file="/Volumes/mourikisa/data/geneProperties_final.Rdata")


}


## Prepare tables for each cancer type and put them in the MySQL database
# for (ct in cancer_types){
#     cat(ct, "\n")
#     df = prepareTableML(df=total_table, cancer_type=ct)
#     write.table(df, file=paste("/home/mourikisa/novel_driver_prediction/Rdata/total_table_TCGA_19014_prepared_", ct, ".tsv", sep=""),
#                 row.names = F, col.names = F, sep = "\t", quote = F)
# }
## Load all data from 129_OAC/Rdata/total_table_19014.Rdata
getMLinput <- function(df, geneProperties_dir="/mnt/lustre/users/k1469280/mourikisa/data/geneProperties_final_mmImputed.Rdata"){

    df = df %>% mutate(Cancer_type="OAC") %>% rename(Sample=sample, Entrez=entrez_19014, Copy_number=Total_CN, CNV_type=CNV_type_corrected) %>% select(-symbol_19014, -ploidy, -CNV_entries, -in_19014)

    ## Replace numbers with names here
    ## We assume every gene with no mutation data that it's not mutated
    message("Fixing mutations...")
    df[,c("no_ALL_muts", "no_NSI_muts", "no_TRUNC_muts",
          "no_NTDam_muts",
          "no_GOF_muts")][is.na(df[,c("no_ALL_muts", "no_NSI_muts", "no_TRUNC_muts",
                                      "no_NTDam_muts",
                                      "no_GOF_muts")])] <- 0

    message("Fixing CNVs...")
    ## Copy number (where copy number is NA, put copy number equal to 2)
    ## I integrated copy number data by selecting segment mean >|0.3| therefore I took only gains and losses
    ## But at the same time the unique number of genes in the CNV data is quite high, therefore whatever is left with NA is probably 2
    df$Copy_number[is.na(df$Copy_number)] <- 2

    message("Fixing SVs...")
    df$BND[is.na(df$BND)] = 0
    df$INS[is.na(df$INS)] = 0
    df$INV[is.na(df$INV)] = 0

    ## Join with gene Properties
    message("Joining table with systems-level properties...")
    load(geneProperties_dir)
    geneProperties = geneProperties_mmImputed
    geneProperties = geneProperties %>% select(-symbol, -cancer_type, -cancer_dom, -cancer_rec)

    df <- df %>% left_join(geneProperties, by=c("Entrez"))

    ## Convert categorical features to multiple factors
    message("Performing cleaning of categorical variables...")
    ## CNV type
    df <- df %>%
        mutate(CNVGain=ifelse(is.na(CNV_type), 0, ifelse(CNV_type=="Gain",1, 0)),
               CNVLoss=ifelse(Copy_number==0 | Copy_number==1, 1, 0)) %>%
        select(-CNV_type)

    ## age
    df <- df %>%
        mutate(old=ifelse(is.na(age), NA, ifelse(age=="old",1, 0)),
               young=ifelse(is.na(age), NA, ifelse(age=="young",1, 0))) %>%
        select(-age)

    ## origin
    df <- df %>%
        mutate(luca=ifelse(is.na(origin), NA, ifelse(origin=="LUCA",1, 0)),
               eukaryotes=ifelse(is.na(origin), NA, ifelse(origin=="Eukaryotes",1, 0)),
               metazoans=ifelse(is.na(origin), NA, ifelse(origin=="Metazoans",1, 0)),
               vertebrates=ifelse(is.na(origin), NA, ifelse(origin=="Vertebrates",1, 0)),
               opisthokonts=ifelse(is.na(origin), NA, ifelse(origin=="Opisthokonts",1, 0)),
               mammals=ifelse(is.na(origin), NA, ifelse(origin=="Mammals",1, 0)),
               primates=ifelse(is.na(origin), NA, ifelse(origin=="Primates", 1, 0))) %>%
        select(-origin)

    ## exp.breadth.class
    df <- df %>%
        mutate(selective=ifelse(is.na(exp.breadth), NA, ifelse(exp.breadth=="Selective",1, 0)),
               always.expressed=ifelse(is.na(exp.breadth), NA, ifelse(exp.breadth=="AlwaysExpressed",1, 0)),
               middle=ifelse(is.na(exp.breadth), NA, ifelse(exp.breadth=="Middle",1, 0)),
               one.tissue=ifelse(is.na(exp.breadth), NA, ifelse(exp.breadth=="OneTissue",1, 0)),
               never.expressed=ifelse(is.na(exp.breadth), NA, ifelse(exp.breadth=="Neverexpressed",1, 0))) %>%
        select(-exp.breadth)



    ## Before you add (change NAs in these columns to 0)
    #df[,c("High", "Low", "Medium", "NotExpressed")][is.na(df[,c("High", "Low", "Medium", "NotExpressed")])] <- 0
    #df <- df %>% ungroup %>% mutate(tot.tissues=High+Low+Medium)
    df <- data.frame(df)

    message("Converting features to factors...")
    fcols <- c("duplicated",
               "WGD", "hub", "central", "CNVGain", "CNVLoss",
               "ExpT_ME", "ExpT_HE", "ExpT_LE", "ExpT_NE",
               "ExpT_NET", "old", "young", "luca", "eukaryotes",
               "metazoans", "vertebrates", "opisthokonts",
               "mammals", "primates", "selective", "always.expressed",
               "middle", "one.tissue", "never.expressed")
    cols <- which(colnames(df) %in% fcols)
    for(i in cols){
        df[,i] = factor(df[,i], levels = c(0,1))
    }

    ## Reorder columns
    df = df[,c(12, 1:2, 3:11, 13:length(df))]

    return(df)
}
## I just run in Athena createAllinput() after sourcing config.R
createAllInput <- function(save_dir="/home/mourikisa/novel_driver_prediction/pancancer_model_training/automation/median_mode_imputation/input_data"){
    cancer_types <- c("ACC", "BLCA", "CESC", "CHOL",
                      "HNSC", "KICH", "KIRC", "KIRP", "LIHC", "LUAD",
                      "LUSC", "OV", "PRAD", "READ", "SARC", "SKCM",
                      "STAD", "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM")
    message("Loading pancancer table....")
    load("/home/mourikisa/novel_driver_prediction/Rdata/total_table_TCGA_19014.Rdata")
    for (cancer_type in cancer_types){
        message(cancer_type)
        d = getMLinput(ct=cancer_type, df=total_table)
        write.table(d, file=paste0(save_dir, "/", cancer_type, "_ML.tsv"), row.names = F, quote = F, sep = "\t")
    }
}

doFeatureSelection = function(training, columnRange=1:length(training)){
    ## Based on http://machinelearningmastery.com/feature-selection-with-the-caret-r-package/
    ## 1st way
    # The Caret R package provides the findCorrelation which will analyze a correlation matrix of your data’s attributes report on attributes that can be removed.
    # A correlation matrix is created from these attributes and highly correlated attributes are identified, in this case the age attribute is remove as it correlates highly with the pregnant attribute.
    # Generally, you want to remove attributes with an absolute correlation of 0.75 or higher.

    # ensure the results are repeatable
    set.seed(7)
    # load the library
    library(mlbench)
    library(caret)

    # calculate correlation matrix
    num_training = sapply(training%>%select(-type)%>%data.frame(), function(x) as.numeric(x))
    num_training = num_training[,apply(num_training, 2, var, na.rm=TRUE) != 0]
    correlationMatrix <- cor(num_training)
    # summarize the correlation matrix
    print(correlationMatrix)
    # find attributes that are highly corrected (ideally >0.75)
    highlyCorrelated <- findCorrelation(correlationMatrix, cutoff=0.75, names = T)
    # print indexes of highly correlated attributes
    print(highlyCorrelated)


}


## Functions to get the mutations from TCGA data
applyMutationFilters = function(df=NULL){
    if(is.null(df)){
        return(NULL)
    }else{
        ## Apply the same filters as for nonsynonymous
        df <- subset(df, MeanMutFreq >= 0.10 | is.na(MeanMutFreq) | is.nan(MeanMutFreq))
        df <- subset(df, !(Center == "hgsc.bcm.edu" & MeanMutFreq == 1) | is.na(MeanMutFreq) | is.nan(MeanMutFreq))
        df <- subset(df, !(t_alt_count == 2 & t_ref_count == 4) | is.na(t_alt_count) | is.nan(t_alt_count) | is.na(t_ref_count) | is.nan(t_ref_count))
        return(df)
    }
}
## Watch out! Some of the terms below appear in the ExonicFunc.refGene
## and some in the Func.refGene of Annovar
## Make sure you include all the mutations
exonicFunc_nonsilent = c("frameshift deletion","frameshift insertion",
                         "frameshift substitution","stopgain","stoploss",
                         "nonsynonymous SNV","splicing")
exonicFunc_silent = c("nonframeshift deletion","nonframeshift insertion",
                      "nonframeshift substitution", "synonymous SNV")
## We also need to take intronic mutations
non_coding = c("intronic","UTR3", "UTR5")
nonsilentSNVs = c("stopgain","stoploss",
                  "nonsynonymous SNV","splicing")

getSilentMutations = function(dataDir="/Volumes/mourikisa/data/TCGA/01_03_2015/",
                        resDir="/Volumes/mourikisa/novel_driver_prediction/Score_correction/",
                        cancer_types=NULL){

    total_muts = NULL
    for (cancer_type in cancer_types){
        cat(cancer_type, "\n")
        tumour_path = paste0(dataDir, cancer_type, "/Tumor/Somatic_Mutations/", cancer_type,
                             "_concatenated_deduplicated_MutExpCNV_1SamplePerPatient_MMF_Annovar_dbNSFP_gAnn_SampleFilters_Silent.Rdata")
        load(tumour_path)
        maf = do.call(rbind, somatic_mutations)
        total_muts = rbind(total_muts, maf)
    }
    total_muts = unrowname(total_muts)
    return(total_muts)
}

getNonSilentMutations = function(dataDir="~/athena/data/TCGA/01_03_2015/",
                                  cancer_types=NULL){

    total_muts = NULL
    for (cancer_type in cancer_types){
        cat(cancer_type, "\n")
        tumour_path = paste0(dataDir, cancer_type, "/Tumor/Somatic_Mutations/", cancer_type,
                             "_concatenated_deduplicated_MutExpCNV_1SamplePerPatient_MMF_Annovar_dbNSFP_gAnn_SampleFilters_NonSilent_MutationFilters_OncodriveClust.Rdata")
        load(tumour_path)
        maf = do.call(rbind, somatic_mutations)
        total_muts = rbind(total_muts, maf)
    }
    total_muts = unrowname(total_muts)
    return(total_muts)
}

# ## Load mutation datasets in Score_correction directory
# par(mfrow = c(2, 2))
# with(test,scatter.smooth(x=driver_rate, y=muts_rate, lpars=list(col = "red", lwd = 3, lty = 3),
#                          col=ifelse(cancer_type=="cgc","red", ifelse(cancer_type=="can", "blue",ifelse(cancer_type=="rst","green", "black"))), pch=16,
#                          main="All genes"))
# legend(x="topleft",c("cgc", "can", "rst"),pch=16,col=c("red","blue","green"))
# with(test, text(x=driver_rate, y=muts_rate-0.03, labels=ifelse(driver_rate>0.3 | muts_rate>0.4, symbol, ""), cex = 0.8))
# text(x=0.85, y=0, paste0("cor: ", format(cor.test(test$muts_rate, test$driver_rate)$estimate, digits = 3)))
#
# with(test%>%subset(cancer_type=="cgc"),scatter.smooth(x=driver_rate, y=muts_rate, lpars=list(col = "red", lwd = 3, lty = 3),
#                                                       col=ifelse(cancer_type=="cgc","red", "black"), pch=16,
#                                                       main="CGC", xlim=c(0,0.1), ylim=c(0,0.4)))
# with(test%>%subset(cancer_type=="cgc"), text(x=driver_rate, y=muts_rate-0.03, labels=ifelse(driver_rate>0.05, symbol, ""), cex = 0.8))
# text(x=0.09, y=0, paste0("cor: ", format(cor.test(test$muts_rate[test$cancer_type=="cgc"], test$driver_rate[test$cancer_type=="cgc"])$estimate, digits = 3)))
#
# with(test%>%subset(cancer_type=="can"),scatter.smooth(x=driver_rate, y=muts_rate, lpars=list(col = "red", lwd = 3, lty = 3),
#                                                       col=ifelse(cancer_type=="can","blue", "black"), pch=16,
#                                                       main="Candidates", xlim=c(0,0.1), ylim=c(0,0.4)))
# with(test%>%subset(cancer_type=="can"), text(x=driver_rate, y=muts_rate-0.03, labels=ifelse(driver_rate>0.08 | muts_rate>0.2, symbol, ""), cex = 0.8))
# text(x=0.09, y=0, paste0("cor: ", format(cor.test(test$muts_rate[test$cancer_type=="can"], test$driver_rate[test$cancer_type=="can"])$estimate, digits = 3)))
#
# with(test%>%subset(cancer_type=="rst"),scatter.smooth(x=driver_rate, y=muts_rate, lpars=list(col = "red", lwd = 3, lty = 3),
#                                                       col=ifelse(cancer_type=="rst","green", "black"), pch=16,
#                                                       main="Rest", xlim=c(0,0.1), ylim=c(0,0.4)))
# with(test%>%subset(cancer_type=="rst"), text(x=driver_rate, y=muts_rate-0.03, labels=ifelse(driver_rate>0.07 | muts_rate>0.2, symbol, ""), cex = 0.8))
# text(x=0.09, y=0, paste0("cor: ", format(cor.test(test$muts_rate[test$cancer_type=="rst"], test$driver_rate[test$cancer_type=="rst"])$estimate, digits = 3)))

# ## Plot the synonymous VS the driver rate
# par(mfrow = c(2, 2))
# with(test,scatter.smooth(x=driver_rate, y=syn_muts_rate, lpars=list(col = "red", lwd = 3, lty = 3),
#                          col=ifelse(cancer_type=="cgc","red", ifelse(cancer_type=="can", "blue",ifelse(cancer_type=="rst","green", "black"))), pch=16,
#                          main="All genes"))
# legend(x="topleft",c("cgc", "can", "rst"),pch=16,col=c("red","blue","green"))
# with(test, text(x=driver_rate, y=syn_muts_rate-0.01, labels=ifelse(driver_rate>0.2 | syn_muts_rate>0.10, symbol, ""), cex = 0.8))
# text(x=0.85, y=0.25, paste0("cor: ", format(cor.test(test$syn_muts_rate, test$driver_rate)$estimate, digits = 3)))
#
# with(test%>%subset(cancer_type=="cgc"),scatter.smooth(x=driver_rate, y=syn_muts_rate, lpars=list(col = "red", lwd = 3, lty = 3),
#                                                       col=ifelse(cancer_type=="cgc","red", "black"), pch=16,
#                                                       main="CGC", xlim=c(0,0.2), ylim=c(0,0.15)))
# with(test%>%subset(cancer_type=="cgc"), text(x=driver_rate, y=syn_muts_rate-0.005, labels=ifelse(driver_rate>0.1 | syn_muts_rate>0.02, symbol, ""), cex = 0.8))
# text(x=0.175, y=0.15, paste0("cor: ", format(cor.test(test$syn_muts_rate[test$cancer_type=="cgc"], test$driver_rate[test$cancer_type=="cgc"])$estimate, digits = 3)))
#
#
# with(test%>%subset(cancer_type=="can"),scatter.smooth(x=driver_rate, y=syn_muts_rate, lpars=list(col = "red", lwd = 3, lty = 3),
#                                                       col=ifelse(cancer_type=="can","blue", "black"), pch=16,
#                                                       main="Candidates", xlim=c(0,0.2), ylim=c(0,0.15)))
# with(test%>%subset(cancer_type=="can"), text(x=driver_rate, y=syn_muts_rate-0.005, labels=ifelse(driver_rate>0.075 | syn_muts_rate>0.075, symbol, ""), cex = 0.8))
# text(x=0.175, y=0.15, paste0("cor: ", format(cor.test(test$syn_muts_rate[test$cancer_type=="can"], test$driver_rate[test$cancer_type=="can"])$estimate, digits = 3)))
#
#
# with(test%>%subset(cancer_type=="rst"),scatter.smooth(x=driver_rate, y=syn_muts_rate, lpars=list(col = "red", lwd = 3, lty = 3),
#                                                       col=ifelse(cancer_type=="rst","green", "black"), pch=16,
#                                                       main="Candidates", xlim=c(0,0.2), ylim=c(0,0.15)))
# with(test%>%subset(cancer_type=="rst"), text(x=driver_rate, y=syn_muts_rate-0.005, labels=ifelse(driver_rate>0.08 | syn_muts_rate>0.075, symbol, ""), cex = 0.8))
# text(x=0.175, y=0.15, paste0("cor: ", format(cor.test(test$syn_muts_rate[test$cancer_type=="rst"], test$driver_rate[test$cancer_type=="rst"])$estimate, digits = 3)))

## Get the covariates
# cov = read.table("/Volumes/mourikisa/novel_driver_prediction/Score_correction/gene_covariates.txt", header = T)
# pdf("/Volumes/mourikisa/novel_driver_prediction/Score_correction/replication_time_distribution.pdf")
# print(hist(cov$reptime, breaks = 100, main = "Replication time (1134 NAs excluded)", xlab = "Reptime"))
# dev.off()
# pdf("/Volumes/mourikisa/novel_driver_prediction/Score_correction/hic_distribution.pdf")
# print(hist(cov$hic, breaks = 100, main = "Chromatin compartment (414 NAs excluded)", xlab = "HiC"))
# dev.off()

# ## Check the values of covariates for CGC
# cov = cov %>% left_join(geneInfo, by=c("gene"="symbol")) ## There are about 10 duplicated symbols - not much of a problem at this stage
# par(mfrow = c(2, 2))
# with(cov, hist(reptime, breaks=100, main="All genes"))
# abline(v=100, lty=2, col="red")
# abline(v=1000, lty=2, col="red")
# with(cov,legend(x="topright",legend=cbind(paste0(names(summary(reptime)), ":", summary(reptime)))))
#
# with(cov%>%subset(cancer_type=="cgc"), hist(reptime, breaks=100, main="CGC", col = "red", xlim = c(0,1700)))
# abline(v=100, lty=2, col="red")
# abline(v=1000, lty=2, col="red")
# with(cov%>%subset(cancer_type=="cgc"),legend(x="topright",legend=cbind(paste0(names(summary(reptime)), ":", summary(reptime)))))
#
# with(cov%>%subset(cancer_type=="can"), hist(reptime, breaks=100, main="Candidates", col="blue", xlim = c(0,1700)))
# abline(v=100, lty=2, col="red")
# abline(v=1000, lty=2, col="red")
# with(cov%>%subset(cancer_type=="can"),legend(x="topright",legend=cbind(paste0(names(summary(reptime)), ":", summary(reptime)))))
#
# with(cov%>%subset(cancer_type=="rst"), hist(reptime, breaks=100, main="Rest", col="green", xlim = c(0,1700)))
# abline(v=100, lty=2, col="red")
# abline(v=1000, lty=2, col="red")
# with(cov%>%subset(cancer_type=="rst"),legend(x="topright",legend=cbind(paste0(names(summary(reptime)), ":", summary(reptime)))))
#
# par(mfrow = c(2, 2))
# with(cov, hist(hic, breaks=100, main="All genes"))
# abline(v=-50, lty=2, col="red")
# abline(v=50, lty=2, col="red")
# with(cov,legend(x="topleft",legend=cbind(paste0(names(summary(hic)), ":", summary(hic)))))
#
# with(cov%>%subset(cancer_type=="cgc"), hist(hic, breaks=100, main="CGC", col="red", xlim = c(-100,100)))
# abline(v=-50, lty=2, col="red")
# abline(v=50, lty=2, col="red")
# with(cov%>%subset(cancer_type=="cgc"),legend(x="topleft",legend=cbind(paste0(names(summary(hic)), ":", summary(hic)))))
#
# with(cov%>%subset(cancer_type=="can"), hist(hic, breaks=100, main="Candidates", col="blue", xlim = c(-100,100)))
# abline(v=-50, lty=2, col="red")
# abline(v=50, lty=2, col="red")
# with(cov%>%subset(cancer_type=="can"),legend(x="topleft",legend=cbind(paste0(names(summary(hic)), ":", summary(hic)))))
#
# with(cov%>%subset(cancer_type=="rst"), hist(hic, breaks=100, main="Rest", col="green", xlim = c(-100,100)))
# abline(v=-50, lty=2, col="red")
# abline(v=50, lty=2, col="red")
# with(cov%>%subset(cancer_type=="rst"),legend(x="topleft",legend=cbind(paste0(names(summary(hic)), ":", summary(hic)))))

## Wilcoxon on the distributions of cgc and rest
# gene_cov = read.table("/Volumes/mourikisa/novel_driver_prediction/Score_correction/gene_covariates.txt", header = T) %>% rename(symbol=gene) %>% select(symbol, reptime, hic)
# ## Get their entrez through geneInfo table in NCG
# dev_mode()
# library(ncglib)
# geneInfo = get_geneinfo(version="NCG5")
# dev_mode()
# geneInfo = geneInfo %>% select(entrez, symbol, cancer_type) %>% unique %>% rename(gene_type=cancer_type)
# gene_cov = gene_cov %>% left_join(geneInfo)
# ## exclude those for which we dont have entrez
# gene_cov = gene_cov %>% subset(!is.na(entrez))
# gene_cov = gene_cov %>% rename(Entrez=entrez) %>% select(Entrez, reptime, hic)
# wilcox.test(gene_cov%>%subset(gene_type=="cgc")%>%.$hic, gene_cov%>%subset(gene_type=="rst")%>%.$hic)
# wilcox.test(gene_cov%>%subset(gene_type=="cgc")%>%.$reptime, gene_cov%>%subset(gene_type=="rst")%>%.$reptime)

getNCGannotation = function(){
#     dev_mode()
#     library(ncglib)
#     ncg_connection = get_ncg_connection()
#     geneInfo = get_geneinfo("NCG5")
#     expValidation = ncg_connection %>% tbl("NCG5_experimental_validation") %>% select(entrez, methods) %>% data.frame()
#     cancerGenes = get_cancer_genes(version="NCG5")
#     dupbythresholds = get_dupbythresholds()
#     expression = ncg_connection %>% tbl("NCG5_expression_GTEx2015") %>% collect(n=Inf)
#     dev_mode()

    load("/mnt/lustre/users/k1469280/mourikisa/data/geneInfoNCG5.Rdata")
    load("/mnt/lustre/users/k1469280/mourikisa/data/cancerGenesNCG5.Rdata")
    load("/mnt/lustre/users/k1469280/mourikisa/data/expValidationNCG5.Rdata")
    load("/mnt/lustre/users/k1469280/mourikisa/data/dupByThresholdsNCG5.Rdata")
    load("/mnt/lustre/users/k1469280/mourikisa/data/expressionNCG5_gtex.Rdata")

    ## Fix gene info table from NCG
    geneInfo = geneInfo %>% select(symbol, entrez, description, uniprot, cancer, cancer_type, cancer_dom, cancer_rec, duplicability, origin, degree, complex) %>% unique
    ## Get a cancer gene with all the associated primary sites and cancer sites
    cancerGenes = cancerGenes %>% select(entrez, cancer_site) %>%
        group_by(entrez) %>% summarise(cancer_site=paste(unique(cancer_site), collapse=",")) %>%
        ungroup

    geneInfo = geneInfo %>% left_join(cancerGenes) %>% rename(cancer_info=cancer, gene_type=cancer_type)

    genicDup = dupbythresholds %>% subset(threshold=="60" | threshold=="20") %>% data.frame() %>% spread(threshold, ndups)
    colnames(genicDup) = c("entrez", "genic20", "genic60")
    geneInfo = geneInfo %>% left_join(genicDup)

    return(list(geneInfo=geneInfo, gtex=gtex))

}

## Variance and Dimensionality reduction
getBestmodel = function( test){
    ## without young plus All_muts plus HIC
    ## Change to the directory
    RES_DIR_LOCAL = "/Volumes/mourikisa/novel_driver_prediction/OAC_without_young_plusALLmuts_plusHIC/cgc_518_training"
    cv_stat_summary = read.table(paste0(RES_DIR_LOCAL, "/cv_stat_summary.tsv"), header=T)
    setwd(paste0(RES_DIR_LOCAL, "/excels"))
    best_models = NULL

    linear = read.xlsx(file="old/grid_predictions_linear.xlsx", 1, startRow = 7)
    linear = linear %>% select(-min, -q1, -median, -mean, -q3, -max)
    linear = linear %>% left_join(cv_stat_summary %>% subset(type=="Sensitivity" & set=="test") %>% select(analysis, iterations, min, q1, median, mean, q3, max, var)) %>% data.frame()
    linear = linear%>%rename(Predicted_genes=novel_drivers_genes_platt_rst_0.95)
    ## Print the best model
    best_models = rbind(best_models,
                        linear %>% arrange(desc(mean)) %>%
                            slice(1:5) %>% mutate(var_rank=row_number(var)) %>%
                            subset(var_rank==1) %>% select(analysis, nu, gamma, mean, var, Predicted_genes)
                        )
    p1= ggplot(linear, aes(x=var, y=mean, color=Predicted_genes)) +
        geom_point() + geom_point(data=linear%>%subset(analysis=="linear.0.2"), size=4) +
        geom_point(data=linear%>%subset(analysis=="linear.0.1"), size=5) +
        ggtitle("Linear models") +
        scale_color_gradient(low="blue", high="red") +
        xlab("Sensitivity variance") + ylab("Mean sensitivity")

    radial = read.xlsx(file="old/grid_predictions_radial.xlsx", 1, startRow = 7)
    radial = radial %>% select(-min, -q1, -median, -mean, -q3, -max)
    radial = radial %>% left_join(cv_stat_summary %>% subset(type=="Sensitivity" & set=="test") %>% select(analysis, iterations, min, q1, median, mean, q3, max, var)) %>% data.frame()
    radial = radial%>%rename(Predicted_genes=novel_drivers_genes_platt_rst_0.95)
    ## Print the best model
    best_models = rbind(best_models,
                        radial %>% arrange(desc(mean)) %>%
                            slice(1:5) %>% mutate(var_rank=row_number(var)) %>%
                            subset(var_rank==1) %>% select(analysis, nu, gamma, mean, var, Predicted_genes)
                        )
    p2= ggplot(radial, aes(x=var, y=mean, color=Predicted_genes)) + geom_point() + geom_point(data=radial%>%subset(analysis=="radial.0.2.0.015625"), size=4) +
        geom_point(data=radial%>%subset(analysis=="radial.0.1.0.0078125"), size=5) +
        ggtitle("Radial models") +
        scale_color_gradient(low="blue", high="red") +
        xlab("Sensitivity variance") + ylab("Mean sensitivity")

    polynomial = read.xlsx(file="old/grid_predictions_polynomial.xlsx", 1, startRow = 7)
    polynomial = polynomial %>% select(-min, -q1, -median, -mean, -q3, -max)
    polynomial = polynomial %>% left_join(cv_stat_summary %>% subset(type=="Sensitivity" & set=="test") %>% select(analysis, iterations, min, q1, median, mean, q3, max, var)) %>% data.frame()
    polynomial=polynomial%>%rename(Predicted_genes=novel_drivers_genes_platt_rst_0.95)
    ## Print the best model
    best_models = rbind(best_models,
                        polynomial %>% arrange(desc(mean)) %>%
                            slice(1:5) %>% mutate(var_rank=row_number(var)) %>%
                            subset(var_rank==1) %>% select(analysis, nu, gamma, mean, var, Predicted_genes)
                        )
    p3= ggplot(polynomial, aes(x=var, y=mean, color=Predicted_genes)) + geom_point() +
        geom_point(data=polynomial%>%subset(analysis=="polynomial.0.05.0.125.3"), size=5) +
        ggtitle("Polynomial models") +
        scale_color_gradient(low="blue", high="red") +
        xlab("Sensitivity variance") + ylab("Mean sensitivity")

    sigmoid = read.xlsx(file="old/grid_predictions_sigmoid.xlsx", 1, startRow = 7)
    sigmoid = sigmoid %>% select(-min, -q1, -median, -mean, -q3, -max)
    sigmoid = sigmoid %>% left_join(cv_stat_summary %>% subset(type=="Sensitivity" & set=="test") %>% select(analysis, iterations, min, q1, median, mean, q3, max, var)) %>% data.frame()
    sigmoid=sigmoid%>%rename(Predicted_genes=novel_drivers_genes_platt_rst_0.95)
    ## Print the best model
    best_models = rbind(best_models,
                        sigmoid %>% arrange(desc(mean)) %>%
                            slice(1:5) %>% mutate(var_rank=row_number(var)) %>%
                            subset(var_rank==1) %>% select(analysis, nu, gamma, mean, var, Predicted_genes)
                        )
    p4= ggplot(sigmoid, aes(x=var, y=mean, color=Predicted_genes)) + geom_point() +
        geom_point(data=sigmoid%>%subset(analysis=="sigmoid.0.05.0.0625"), size=4) +
        geom_point(data=sigmoid%>%subset(analysis=="sigmoid.0.05.0.5"), size=5) +
        ggtitle("Sigmoid models") +
        scale_color_gradient(low="blue", high="red") +
        xlab("Sensitivity variance") + ylab("Mean sensitivity")

    cat("Models picked...")
    print(best_models)

    pdf(file="/Volumes/mourikisa/novel_driver_prediction/OAC_without_young_plusALLmuts_plusHIC/cgc_518_training/1489_79_OAC_sensitivity_VS_variance.pdf", height = 20, width = 20)
    grid.arrange(p1,p2,p3,p4,ncol=2)
    dev.off()

    write.xlsx(linear%>%data.frame(), file="grid_predictions_linear.xlsx", row.names = F)
    write.xlsx(radial%>%data.frame(), file="grid_predictions_radial.xlsx", row.names = F)
    write.xlsx(polynomial%>%data.frame(), file="grid_predictions_polynomial.xlsx", row.names = F)
    write.xlsx(sigmoid%>%data.frame(), file="grid_predictions_sigmoid.xlsx", row.names = F)

    ## Get the scores (based on the best models above)
    without_young_plusALL_plusHIC_score = getScore(cancer_type="OAC", path="/Volumes/mourikisa/novel_driver_prediction/OAC_without_young_plusALLmuts_plusHIC/cgc_518_training/gridPrediction/",
                                                   linear_preds = "predictions_linear.0.1.Rdata", linearCVS = 0.61, linearVar = 0.02,
                                                   radial_preds = "predictions_radial.0.1.0.0078125.Rdata", radialCVS = 0.72, radialVar = 0.03,
                                                   polynomial_preds = "predictions_polynomial.0.05.0.125.3.Rdata", polynomialCVS = 0.54, polynomialVar = 0.02,
                                                   sigmoid_preds = "predictions_sigmoid.0.05.0.5.Rdata", sigmoidCVS = 0.93, sigmoidVar = 0.01)

    ## Get geneInfo
    annotation = getNCGannotation()
    TISSUE = "Esophagus"

    ## Get features and scores (for dimensionality reduction)
    load("/Volumes/mourikisa/novel_driver_prediction/OAC_without_young_plusALLmuts_plusHIC/cgc_518_training/training_set.Rdata")
    load("/Volumes/mourikisa/novel_driver_prediction/OAC_without_young_plusALLmuts_plusHIC/cgc_518_training/validation_set.Rdata")

    cohort = rbind(training%>%tibble::rownames_to_column()%>%separate(rowname, into=c("cancer_type", "sample", "entrez"), sep="\\.") %>% data.frame(),
                   validation%>%tibble::rownames_to_column()%>%separate(rowname, into=c("cancer_type", "sample", "entrez"), sep="\\.")%>%mutate(type="P")%>%data.frame())
    cohort = cohort %>% mutate(entrez=as.numeric(entrez)) %>% left_join(without_young_plusALL_plusHIC_score[["scores"]])
    cohort$gene_type[is.na(cohort$gene_type)] = "cgc"
    cohort = cohort %>% left_join(annotation[["gtex"]]%>%subset(tissue==TISSUE)%>%select(entrez, exp_level)%>%rename(tissue_exp_level=exp_level))

    test = cohort
    test[,4:37] <- sapply(test[,4:37], as.numeric)
    test = test %>% select(-one_of(nearZeroVar(test[,4:37], name=T)))

    ## PCA
    test.pca <- prcomp(test[1:79,c(8:18,20:31)])
    print(test.pca)
    library(ggbiplot)
    g = ggbiplot(test.pca, obs.scale = 1, var.scale = 1,
                 groups = test[1:79,"type"], ellipse = TRUE,
                 circle = FALSE)
    g <- g + scale_color_discrete(name = '')
    g <- g + theme(legend.direction = 'horizontal',
                   legend.position = 'top')
    print(g)

    ## tSNE
    # load the tsne package
    library(tsne)

    # initialize counter to 0
    x <- 0
    epc <- function(x) {
        x <<- x + 1
        filename <- paste("plot", x, "jpg", sep=".")
        cat("> Plotting TSNE to ", filename, " ")

        # plot to d:\\plot.x.jpg file of 2400x1800 dimension
        jpeg(filename, width=2400, height=1800)

        plot(x, t='n', main="T-SNE")
        text(x, labels=t$label)
        dev.off()
    }

    tsne_data <- tsne(test[,c(8:18,24:31)], max_iter=1000, epoch=100)
    test = cbind(test, tsne_data %>% data.frame())
    test = test %>% left_join(geneInfo)


    ggplot(test%>%subset(is.na(score) | score>28), aes(x=X1, y=X2, shape=type, color=score)) + geom_point(size=2) +scale_shape_manual(values = c(3, 15)) + scale_color_gradient(low = "blue", high = "red")

    p1 = ggplot(test, aes(X1, X2)) + geom_point(aes(color=score, shape=type)) +
        geom_density2d(data=test%>%subset(type=="C"),contour = T) + ## density based on training
        #geom_density2d(data=test%>%subset(type=="P"), contour=T, aes(fill = score)) +
        scale_color_gradient(low = "blue", high = "red") +
        geom_point(data=test%>%subset(type=="C"), size=2, color="black")

    p2 = ggplot(rbind(test%>%subset(type=="C"), test%>%subset(type=="P")%>%arrange(desc(score))%>%slice(1:100)), aes(X1, X2)) + geom_point(aes(color=score, shape=type)) +
        geom_density2d(data=test%>%subset(type=="C"),contour = T) + ## density based on training
        #geom_density2d(data=test%>%subset(type=="P"), contour=T, aes(fill = score)) +
        scale_color_gradient(low = "blue", high = "red") +
        geom_point(data=test%>%subset(type=="C"), size=2, color="black")

    grid.arrange(p1,p2,nrow=2)

    ## Isomap
    library(RDRToolbox)
    ## Only systems-level properties
    #iso_dim2 = Isomap(data=as.matrix(test[,c(8:18,24:31)]), dims=2)
    ## All features
    iso_dim2 = Isomap(data=as.matrix(test[,c(4:31)]), dims=2)

    iso_dim2 = iso_dim2 %>% data.frame()
    colnames(iso_dim2) = c("X1", "X2")

    test = cbind(test, iso_dim2)
    doms_recs = rbind(annotation[["geneInfo"]] %>% subset(cancer_dom==1) %>% select(entrez) %>% mutate(cancer_gene_type="Dominant"),
    annotation[["geneInfo"]] %>% subset(cancer_rec==1) %>% select(entrez) %>% mutate(cancer_gene_type="Recessive"))
    doms_recs = doms_recs %>% arrange(entrez, desc(cancer_gene_type)) %>% subset(!duplicated(entrez)) ## There are 4 genes with both Dominant and recessive annotation
    test = test %>% left_join(doms_recs)
    test = test %>% left_join(annotation[["geneInfo"]]%>%select(entrez, symbol)%>%unique)

    ## Plot the score gradient
    ggplot(test, aes(X1, X2)) + geom_point(aes(shape=type, color=score, text=paste("symbol/sample:", symbol, "/", sample))) +
        scale_color_gradient(low="blue", high="red") +
        geom_density2d(data=test%>%subset(type=="C"),contour = T) + ## density based on training
        geom_point(data=test%>%subset(type=="C"), aes(X1, X2, shape=type, text=paste("symbol/sample:", symbol, "/", sample))) +
        theme_boss() + ggtitle("Isopmap using all features")

    ggplotly()


    ## Make the plot with training and 44 only
    ## Add 44
    genes_44 = read.xlsx(file="/Volumes/mourikisa/novel_driver_prediction/OAC/cgc_518_training/shared_genes_annotation_0.95_intersection.xlsx", 1) %>%
        select(entrez) %>% unique %>% .$entrez
    test = test %>% mutate(in_44=ifelse(entrez%in%genes_44, TRUE, FALSE))
    test = test %>%
        mutate(color=ifelse(gene_type=="cgc" & (cancer_gene_type=="Recessive" | is.na(cancer_gene_type)), "Recessive",
                            ifelse(gene_type=="cgc" & (cancer_gene_type=="Dominant" & !is.na(cancer_gene_type)), "Dominant", NA)))
    test$color[is.na(test$color) & test$in_44] = "Prediction"
    test$color[is.na(test$color)] = "Other"
    p = ggplot(test, aes(X1, X2)) + geom_jitter(data=test%>%subset(color=="Other"), aes(X1, X2, color=color)) +
        geom_jitter(data=test%>%subset(color%in%c("Dominant", "Recessive")), aes(X1, X2, color=color)) +
        geom_jitter(data=test%>%subset(color=="Prediction"), aes(X1, X2, color=color)) +
        scale_colour_manual(values = c("green", "grey", "red", "black")) +
        geom_density2d(data=test%>%subset(type=="C"),contour = T) + ## density based on training
        geom_text(data=test%>%subset(symbol%in%c("RING1", "NCOR1", "SIN3A", "KAT2A", "KAT2B", "ITCH")), aes(X1, X2, label = symbol)) +
        theme_boss()
    pdf("/Volumes/FCShared/Thanos/CRUK/44_OAC_Isomap.pdf", width = 15, height = 10, useDingbats = F)
    print(p)
    dev.off()

    ## Get the minimum score for at least 3 predictions
    max_score = max(test$score[!is.na(test$score)])
    min_score_atleast3 = 10000000
    sys_cans = NULL
    for (s in unique(test$sample)){
        cat(s, "\n")
        d = test %>% subset(sample==s) %>% mutate(score=ifelse(is.na(score), max_score+1, score)) %>% arrange(sample, desc(score)) %>% slice(1:3)
        min_score = min(d$score[!is.na(d$score)])
        if (min_score < min_score_atleast3){
            min_score_atleast3 = min_score
        }
        sys_cans = rbind(sys_cans, d)
    }

    ## At least 3
    test_atleast3 = test %>%
        mutate(color=ifelse(gene_type=="cgc" & (cancer_gene_type=="Recessive" | is.na(cancer_gene_type)), "Recessive",
                            ifelse(gene_type=="cgc" & (cancer_gene_type=="Dominant" & !is.na(cancer_gene_type)), "Dominant", NA)))
    test_atleast3$color[is.na(test_atleast3$color) & test_atleast3$score>=min_score_atleast3] = "Prediction"
    test_atleast3$color[is.na(test_atleast3$color)] = "Other"
    ## Exactly 3
    test_exactly3 = test %>%
        mutate(color=ifelse(gene_type=="cgc" & (cancer_gene_type=="Recessive" | is.na(cancer_gene_type)), "Recessive",
                            ifelse(gene_type=="cgc" & (cancer_gene_type=="Dominant" & !is.na(cancer_gene_type)), "Dominant", NA)))
    sys_cans = sys_cans %>% select(cancer_type, sample, entrez) %>% mutate(check="Prediction")
    test_exactly3 = test_exactly3 %>% left_join(sys_cans)
    test_exactly3$color[is.na(test_exactly3$color) & test_exactly3$check=="Prediction"] = "Prediction"
    test_exactly3$color[is.na(test_exactly3$color)] = "Other"

    p1 = ggplot(test_atleast3, aes(X1, X2)) + geom_point(aes(shape=type, color=color)) +
        geom_point(data=test_atleast3%>%subset(type=="C"), aes(X1, X2, color=color, text=paste("symbol/sample:", symbol, "/", sample))) +
        scale_colour_manual(values = c("green", "grey", "red", "black")) +
        geom_density2d(data=test_atleast3%>%subset(type=="C"),contour = T) + ## density based on training
        theme_boss() +
        ggtitle(paste0("All genes - at least 3 drivers \n No genes:", test_atleast3%>%subset(score>=min_score_atleast3 | is.na(score))%>%select(entrez)%>%unique%>%nrow,
                       "\n No redundant genes:", test_atleast3%>%subset(score>=min_score_atleast3 | is.na(score))%>%nrow))

    p2 = ggplot(test_exactly3, aes(X1, X2)) + geom_point(aes(shape=type, color=color)) +
        geom_point(data=test_exactly3%>%subset(type=="C"), aes(X1, X2, color=color)) +
        scale_colour_manual(values = c("green", "grey", "red", "black")) +
        geom_density2d(data=test_exactly3%>%subset(type=="C"),contour = T) + ## density based on training
        theme_boss() +
        ggtitle(paste0("All genes - exactly 3 drivers \n No genes:", test_exactly3%>%subset(color=="Prediction" | is.na(score))%>%select(entrez)%>%unique%>%nrow,
                       "\n No redundant genes:",  test_exactly3%>%subset(color=="Prediction" | is.na(score))%>%nrow))

    grid.arrange(p1,p2, nrow=1)


    ## ExpT_NE is numeric so 1 factor is 2 now
    test_no_NE = test%>%subset(!(tissue_exp_level=="Not Expressed" & ExpT_NE==2) | is.na(tissue_exp_level))
    min_score_atleast3 = 10000000
    sys_cans_no_NE = NULL
    for (s in unique(test_no_NE$sample)){
        cat(s, "\n")
        d = test_no_NE %>% subset(sample==s) %>% mutate(score=ifelse(is.na(score), max_score+1, score)) %>% arrange(sample, desc(score)) %>% slice(1:3)
        min_score = min(d$score[!is.na(d$score)])
        if (min_score < min_score_atleast3){
            min_score_atleast3 = min_score
        }
        sys_cans_no_NE = rbind(sys_cans_no_NE, d)
    }
    ## At least 3
    test_no_NE_atleast3 = test_no_NE %>%
        mutate(color=ifelse(gene_type=="cgc" & (cancer_gene_type=="Recessive" | is.na(cancer_gene_type)), "Recessive",
                            ifelse(gene_type=="cgc" & (cancer_gene_type=="Dominant" & !is.na(cancer_gene_type)), "Dominant", NA)))
    test_no_NE_atleast3$color[is.na(test_no_NE_atleast3$color) & test_no_NE_atleast3$score>=min_score_atleast3] = "Prediction"
    test_no_NE_atleast3$color[is.na(test_no_NE_atleast3$color)] = "Other"
    ## Exactly 3
    test_no_NE_exactly3 = test_no_NE %>%
        mutate(color=ifelse(gene_type=="cgc" & (cancer_gene_type=="Recessive" | is.na(cancer_gene_type)), "Recessive",
                            ifelse(gene_type=="cgc" & (cancer_gene_type=="Dominant" & !is.na(cancer_gene_type)), "Dominant", NA)))
    sys_cans_no_NE = sys_cans_no_NE %>% select(cancer_type, sample, entrez) %>% mutate(check="Prediction")
    test_no_NE_exactly3 = test_no_NE_exactly3 %>% left_join(sys_cans_no_NE)
    test_no_NE_exactly3$color[is.na(test_no_NE_exactly3$color) & test_no_NE_exactly3$check=="Prediction"] = "Prediction"
    test_no_NE_exactly3$color[is.na(test_no_NE_exactly3$color)] = "Other"


    p3 = ggplot(test_no_NE_atleast3, aes(X1, X2)) + geom_point(aes(shape=type, color=color)) +
        geom_point(data=test_no_NE_atleast3%>%subset(type=="C"), aes(X1, X2, color=color)) +
        scale_colour_manual(values = c("green", "grey", "red", "black")) +
        geom_density2d(data=test_no_NE_atleast3%>%subset(type=="C"),contour = T) + ## density based on training
        theme_boss() +
        ggtitle(paste0("No not expressed genes - at least 3 drivers \n No genes:", test_no_NE_atleast3%>%subset(score>=min_score_atleast3 | is.na(score))%>%select(entrez)%>%unique%>%nrow,
                       "\n No redundant genes:", test_no_NE_atleast3%>%subset(score>=min_score_atleast3 | is.na(score))%>%nrow))

    p4 = ggplot(test_no_NE_exactly3, aes(X1, X2)) + geom_point(aes(shape=type, color=color)) +
        geom_point(data=test_no_NE_exactly3%>%subset(type=="C"), aes(X1, X2, color=color)) +
        scale_colour_manual(values = c("green", "grey", "red", "black")) +
        geom_density2d(data=test_no_NE_exactly3%>%subset(type=="C"),contour = T) + ## density based on training
        theme_boss() +
        ggtitle(paste0("No not expressed genes - exactly 3 drivers \n No genes:", test_no_NE_exactly3%>%subset(color=="Prediction" | is.na(score))%>%select(entrez)%>%unique%>%nrow,
                       "\n No redundant genes:",  test_no_NE_exactly3%>%subset(color=="Prediction" | is.na(score))%>%nrow))

    grid.arrange(p1,p2,p3,p4, nrow=2)

    ## Check from which samples
    test_atleast3 %>% subset(color!="Other") %>% group_by(sample) %>% summarise(all=n()) %>% left_join(test_no_NE %>% subset(color!="Other") %>% group_by(sample) %>% summarise(no_NE=n()))

    ## Get features and scores (for exporting - no scaling in the features)
    load("/Volumes/mourikisa/novel_driver_prediction/OAC_without_young_plusALLmuts_plusHIC/cgc_518_training/training_set_noScale.Rdata")
    load("/Volumes/mourikisa/novel_driver_prediction/OAC_without_young_plusALLmuts_plusHIC/cgc_518_training/validation_set_noScale.Rdata")
    cohort = rbind(training_ns%>%tibble::rownames_to_column()%>%separate(rowname, into=c("cancer_type", "sample", "entrez"), sep="\\.") %>% data.frame(),
                   validation_ns%>%tibble::rownames_to_column()%>%separate(rowname, into=c("cancer_type", "sample", "entrez"), sep="\\.")%>%mutate(type="P")%>%data.frame())
    cohort = cohort %>% mutate(entrez=as.numeric(entrez)) %>% left_join(without_young_plusALL_plusHIC_score[["scores"]])
    cohort$gene_type[is.na(cohort$gene_type)] = "cgc"

    ## Molecular properties
    mol_properties = c("no_ALL_muts", "no_TRUNC_muts", "no_NTDam_muts", "no_GOF_muts", "Copy_number", "CNVLoss", "ExpT_ME", "ExpT_HE", "ExpT_LE", "ExpT_NE", "ExpT_NET")

    ## Annotate and export the table
    export = cohort %>% select(cancer_type, sample, entrez, type, gene_type, score, kernels) %>% left_join(annotation[["geneInfo"]])
    export = export %>% left_join(cohort%>%select(cancer_type, sample, entrez, one_of(mol_properties))) %>% left_join(cohort%>%select(-one_of(mol_properties), -type, -gene_type, -score, -score_scaled, -kernels, -kernels_no))
    ## Get 44
    genes_44 = read.xlsx(file="/Volumes/mourikisa/novel_driver_prediction/OAC/cgc_518_training/shared_genes_annotation_0.95_intersection.xlsx", 1) %>%
        select(entrez) %>% unique %>% .$entrez
    export = export %>% mutate(in_44=ifelse(entrez%in%genes_44, TRUE, FALSE))
    export = export %>% left_join(annotation[["gtex"]]%>%subset(tissue==TISSUE)%>%select(entrez, exp_level)%>%rename(tissue_exp_level=exp_level))

    ## Add in_Cambridge annotation column
    reb_genes = read.xlsx("/Volumes/mourikisa/novel_driver_prediction/OAC/31_Cambridge_genes_annotations.xlsx", 1)
    reb_genes = reb_genes %>% select(entrez) %>% unique %>% .$entrez
    export = export %>% mutate(in_Cambridge_31=ifelse(entrez%in%reb_genes, TRUE, FALSE))

    ## Add false positive annotation
    fp_dir="/Volumes/mourikisa/data/NCG_false_positives.txt"
    false_positive_genes <- read.table(fp_dir, header = T, sep = "\t")
    false_positive_genes <- false_positive_genes %>% select(entrez) %>% .$entrez
    export = export %>% mutate(in_false_positives=ifelse(entrez%in%false_positive_genes, TRUE, FALSE))

    ## Add mutation frequency to the table
    ## Load OAC dataset
    load("/Volumes/mourikisa/data/OAC/Rdata/dataset_mutations_damaging.Rdata")
    oac_MutFreq = df_mut %>% subset(!is.na(entrez_19014)) %>% subset(damaging | truncating | gof) %>%
        select(entrez_19014, sample, ReadCount, VariantAlleleCount, ReadCountControl) %>% rename(entrez=entrez_19014) %>%
        mutate(MutFreq=round(VariantAlleleCount/ReadCount, digits=2)) %>% group_by(entrez, sample) %>% summarise(MutFreq=paste(MutFreq, collapse=","))
    ## Add them to export
    export = export %>% left_join(oac_MutFreq)

    ## Get at least 3,...,8 and add columns to export
    ## Get the minimum score for at least 3 predictions
    max_score = max(test$score[!is.na(test$score)])
    min_scores = NULL
    for (i in 3:8){
        cat(i, "\n")
        min_score_atleast = 10000000
        for (s in unique(test$sample)){
            cat(s, "\n")
            d = test %>% subset(sample==s) %>% mutate(score=ifelse(is.na(score), max_score+1, score)) %>% arrange(sample, desc(score)) %>% slice(1:i)
            min_score = min(d$score[!is.na(d$score)])
            if (min_score < min_score_atleast){
                min_score_atleast = min_score
            }
        }
        min_scores = rbind(min_scores, c(i, min_score_atleast))
    }
    min_scores = data.frame(min_scores)
    colnames(min_scores) = c("at_least", "min_score")
    ## Add the columns for at least predictions
    export = export %>% mutate(at_least_3=ifelse(score >= min_scores$min_score[min_scores$at_least==3] & !is.na(score), TRUE, FALSE))
    export = export %>% mutate(at_least_4=ifelse(score >= min_scores$min_score[min_scores$at_least==4] & !is.na(score), TRUE, FALSE))
    export = export %>% mutate(at_least_5=ifelse(score >= min_scores$min_score[min_scores$at_least==5] & !is.na(score), TRUE, FALSE))
    export = export %>% mutate(at_least_6=ifelse(score >= min_scores$min_score[min_scores$at_least==6] & !is.na(score), TRUE, FALSE))
    export = export %>% mutate(at_least_7=ifelse(score >= min_scores$min_score[min_scores$at_least==7] & !is.na(score), TRUE, FALSE))
    export = export %>% mutate(at_least_8=ifelse(score >= min_scores$min_score[min_scores$at_least==8] & !is.na(score), TRUE, FALSE))

    ## Add the coordinates as well
    export = export %>% left_join(test%>%select(entrez, sample, X1, X2, cancer_gene_type))

    #export = export %>% left_join(test%>%select(cancer_type, sample, entrez, X1, X2))
    write.xlsx(export%>%data.frame(), file="/Volumes/FCShared/Thanos/CRUK/1489_79_OAC_scores_plus_annotation_0208.xlsx", row.names = F)

    ## export the minimum values
    write.xlsx(min_scores%>%data.frame(), file="/Volumes/FCShared/Thanos/CRUK/1489_79_OAC_minimum_scores.xlsx")


}

## Calculate the score of each gene
getScore = function(cancer_type=NULL,
                    training="cgc",
                    path=NULL,
                    linear_preds=NULL, linearCVS=NULL, linearVar=NULL,
                    radial_preds=NULL, radialCVS=NULL, radialVar=NULL,
                    polynomial_preds=NULL, polynomialCVS=NULL, polynomialVar=NULL,
                    sigmoid_preds=NULL, sigmoidCVS=NULL, sigmoidVar=NULL){

    if (is.null(cancer_type)| is.null(path)){
        cat("Parameters missing...", "\n")
        return(NULL)
    }

    cat(paste0("Cancer_type: ", cancer_type), "\n")
    cat(paste0("Training: ", training), "\n")
    ## Get the number of predictions before intersection
    all_preds = NULL
    all_scores = NULL

    if(!is.null(linear_preds) & !is.null(linearCVS)){
        load(paste0(path,linear_preds))
        linear = preds
        rm(preds)
        ## Push it to the all_preds
        all_preds = rbind(all_preds, linear %>% subset(label) %>% mutate(kernel="linear", training=training) %>% data.frame())

        ## Kernel-specific score
        linear = linear %>% arrange(sample, dv) %>% group_by(sample) %>% mutate(rank=row_number(-dv)) %>% ungroup
        linear = linear %>% group_by(sample) %>% mutate(N=max(rank), Nlog10=log10(N)) %>% ungroup
        linear = linear %>% mutate(cvs=linearCVS, var=linearVar) %>% mutate(ratio=cvs/var)
        linear = linear %>% mutate(score=-log10(rank/N)*cvs)
        linear = linear %>% mutate(kernel="linear") %>% data.frame()
        ## Old score implementation
        #linear = linear %>% mutate(score=(-10*log10(dv_rank/nrow(linear)))*(linearCVS/linearVar))
        #linear = linear %>% mutate(score=(-10*log10(dv_rank/nrow(linear)))*linearCVS)
        #min_score = min(linear$score)
        #max_score = max(linear$score)
        #linear = linear %>% mutate(score_scaled = (score-min_score)/(max_score-min_score), kernel="linear", training=training)
        all_scores = rbind(all_scores, linear)
    }

    if(!is.null(radial_preds) & !is.null(radialCVS)){
        load(paste0(path,radial_preds))
        radial = preds
        rm(preds)
        ## Push it to the all_preds
        all_preds = rbind(all_preds, radial %>% subset(label) %>% mutate(kernel="radial", training=training) %>% data.frame())

        ## Kernel-specific score
        radial = radial %>% arrange(sample, dv) %>% group_by(sample) %>% mutate(rank=row_number(-dv)) %>% ungroup
        radial = radial %>% group_by(sample) %>% mutate(N=max(rank), Nlog10=log10(N)) %>% ungroup
        radial = radial %>% mutate(cvs=radialCVS, var=radialVar) %>% mutate(ratio=cvs/var)
        radial = radial %>% mutate(score=-log10(rank/N)*cvs)
        radial = radial %>% mutate(kernel="radial") %>% data.frame()
        ## Old score implementation
        #radial = radial %>% mutate(score=(-10*log10(dv_rank/nrow(radial)))*(radialCVS/radialVar))
        #radial = radial %>% mutate(score=(-10*log10(dv_rank/nrow(radial)))*radialCVS)
        #min_score = min(radial$score)
        #max_score = max(radial$score)
        #radial = radial %>% mutate(score_scaled = (score-min_score)/(max_score-min_score), kernel="radial", training=training)
        all_scores = rbind(all_scores, radial)
    }

    if(!is.null(polynomial_preds) & !is.null(polynomialCVS)){
        load(paste0(path,polynomial_preds))
        polynomial = preds
        rm(preds)
        ## Push it to the all_preds
        all_preds = rbind(all_preds, polynomial %>% subset(label) %>% mutate(kernel="polynomial", training=training) %>% data.frame())

        ## Kernel-specific score
        polynomial = polynomial %>% arrange(sample, dv) %>% group_by(sample) %>% mutate(rank=row_number(-dv)) %>% ungroup
        polynomial = polynomial %>% group_by(sample) %>% mutate(N=max(rank), Nlog10=log10(N)) %>% ungroup
        polynomial = polynomial %>% mutate(cvs=polynomialCVS, var=polynomialVar) %>% mutate(ratio=cvs/var)
        polynomial = polynomial %>% mutate(score=-log10(rank/N)*cvs)
        polynomial = polynomial %>% mutate(kernel="polynomial") %>% data.frame()
        #polynomial = polynomial %>% mutate(score=(-10*log10(dv_rank/nrow(polynomial)))*(polynomialCVS/polynomialVar))
        #polynomial = polynomial %>% mutate(score=(-10*log10(dv_rank/nrow(polynomial)))*polynomialCVS)
        #min_score = min(polynomial$score)
        #max_score = max(polynomial$score)
        #polynomial = polynomial %>% mutate(score_scaled = (score-min_score)/(max_score-min_score), kernel="polynomial", training=training)
        all_scores = rbind(all_scores, polynomial)
    }
    if(!is.null(sigmoid_preds) & !is.null(sigmoidCVS)){
        load(paste0(path,sigmoid_preds))
        sigmoid = preds
        rm(preds)
        ## Push it to the all_preds
        all_preds = rbind(all_preds, sigmoid %>% subset(label) %>% mutate(kernel="sigmoid", training=training) %>% data.frame())

        ## Kernel-specific score
        sigmoid = sigmoid %>% arrange(sample, dv) %>% group_by(sample) %>% mutate(rank=row_number(-dv)) %>% ungroup
        sigmoid = sigmoid %>% group_by(sample) %>% mutate(N=max(rank), Nlog10=log10(N)) %>% ungroup
        sigmoid = sigmoid %>% mutate(cvs=sigmoidCVS, var=sigmoidVar) %>% mutate(ratio=cvs/var)
        sigmoid = sigmoid %>% mutate(score=-log10(rank/N)*cvs)
        sigmoid = sigmoid %>% mutate(kernel="sigmoid") %>% data.frame()
        #sigmoid = sigmoid %>% mutate(score=(-10*log10(dv_rank/nrow(sigmoid)))*(sigmoidCVS/sigmoidVar))
        #sigmoid = sigmoid %>% mutate(score=(-10*log10(dv_rank/nrow(sigmoid)))*sigmoidCVS)
        #min_score = min(sigmoid$score)
        #max_score = max(sigmoid$score)
        #sigmoid = sigmoid %>% mutate(score_scaled = (score-min_score)/(max_score-min_score), kernel="sigmoid", training=training)
        all_scores = rbind(all_scores, sigmoid)
    }

    ## Final score for each gene across kernels
    scores = all_scores %>% group_by(cancer_type, sample, entrez, gene_type, N, Nlog10) %>%
        summarise(sum_of_scores=sum(score), kernels_used=length(unique(kernel))) %>%
        mutate(score=(1/(kernels_used*Nlog10))*sum_of_scores) %>% ungroup %>% data.frame()



    ## Add from how many kernels they have predicted as TRUE
    preds2kernels = all_preds %>% subset(label) %>% group_by(cancer_type, sample, entrez) %>% summarise(kernels_predicted=paste(unique(kernel), collapse=","), kernels_predicted_no=length(unique(kernel))) %>% ungroup
    scores = scores %>% left_join(preds2kernels)
    scores$kernels_predicted_no[is.na(scores$kernels_predicted_no)] = 0

    return(list(all_scores=all_scores, scores=scores))

}

## A function to get the value of objective function in libsvm
## Usage from http://stats.stackexchange.com/questions/25387/problem-with-e1071-libsvm
# a <- svm(x, y, scale = FALSE, type = 'C-classification', kernel = 'linear', cost = 50000)
# w <- t(a$coefs) %*% a$SV;
# b <- -a$rho;
# obj_func_str1 <- get_obj_func_info(w, b, 50000, x, y)
# obj_func_str2 <- get_obj_func_info(w, b - 5, 50000, x, y)
get_obj_func_info <- function(w, b, c_par, x, y) {
    xi <- rep(0, nrow(x))

    for (i in 1:nrow(x)) {
        xi[i] <- 1 - as.numeric(as.character(y[i]))*(sum(w*x[i,]) + b)
        if (xi[i] < 0) xi[i] <- 0
    }

    return(list(obj_func_value = 0.5*sqrt(sum(w * w)) + c_par*sum(xi),
                sum_xi = sum(xi), xi = xi))
}

mapply_pb = function(FUN, X, Y,  ...){
  env <- environment()
  pb_Total <- length(X)
  counter <- 0
  pb <- txtProgressBar(min = 0, max = pb_Total, style = 3)

  # wrapper around FUN
  wrapper <- function(...){
    curVal <- get("counter", envir = env)
    assign("counter", curVal +1 ,envir=env)
    setTxtProgressBar(get("pb", envir=env), curVal +1)
    FUN(...)
  }
  res <- mapply(wrapper, X, Y, ...)
  close(pb)
  res
}


make_pathway_list = function(pathMsigDbFile) {
  inputFile <- pathMsigDbFile
  con  <- file(inputFile, open = "r")
  c = 1
  pathway.list <- vector(mode="list",length=0)
  print("Loading gene sets....")
  while (length(oneLine <- readLines(con, n = 1, warn = FALSE)) > 0) {
    myVector <- do.call("rbind",strsplit(oneLine, "\t"))
    t = vector(mode="list",length=1)
    t[[1]] = myVector[3:length(myVector)]
    names(t) = myVector[1]
    pathway.list = c(pathway.list,t)
    c = c+1
  }

  close(con)
  return(pathway.list)
}

base_breaks_x = function(x, br, la){
  d <- data.frame(y=-Inf, yend=-Inf, x=0, xend=100)
  list(geom_segment(data=d, aes(x=x, y=y, xend=xend, yend=yend), inherit.aes=FALSE), scale_x_reverse(breaks=br, labels=la))
}

base_breaks_y = function(m, br, la ){
  d <- data.frame(x=Inf, xend=Inf, y=0, yend=m)
  list(geom_segment(data=d, aes(x=x, y=y, xend=xend, yend=yend), inherit.aes=FALSE), scale_y_continuous(breaks=br, labels=la))
}

map_to_geneset <- function ( x , tissue,
                             geneset="geneSets/c2.cp.kegg.v5.1.entrez.gmt.txt",
                             fout="KEGG_intersection_best_models.xlsx",
                             fpdf="KEGG_intersection_best_models.pdf",
                             install_NCG=T,
                             NCG_version="NCG5",
                             NCG_path=NULL,
                             uninformative_paths=c('CANCER','LEUKEMIA')) {


  kegg = make_pathway_list(geneset)
  len = sapply(kegg, length)

  sel.genes = unique(x[,c('entrez','gene_type')])
  sel.genes$path = sapply(as.list(as.character(unique(x$entrez))), function(x,y){
    sel=sapply(y, function(z,w) w%in%z, w = x);
    return( paste(names(y)[sel],collapse='.')) },
    y = kegg)

  dev_mode()
  if(install_NCG & !is.null(NCG_path)) install(NCG_path)
  library(ncglib)
  geneInfo=as.data.frame(get_geneinfo(version = NCG_version), stringsAsFactors=F)
  cancer_genes = as.data.frame(get_cancer_genes(version = NCG_version))
  dev_mode()

  sel.genes$symbol = geneInfo[match(sel.genes$entrez, geneInfo$entrez), "symbol"]

  ## refine a data frame to return for downstream analysis - part 1
  exp1 = sel.genes %>% select(entrez, symbol, gene_type, path) %>%
      mutate(type="prediction", primary_site=NA)  %>% data.frame()

  y = sapply(as.list(as.character(unique(cancer_genes$entrez))), function(x,y){
      sel=sapply(y, function(z,w) w%in%z, w = x);
      return( paste(names(y)[sel],collapse='.')) },
      y = kegg)

  # cancer genes
  can.genes = data.frame(entrez=unique(cancer_genes$entrez), path = y)
  can.genes$cancer_type = geneInfo[match(can.genes$entrez, geneInfo$entrez),'cancer_type']

  allGenes = geneInfo %>% subset(duplicability!=-1 | cancer_type=="can" | cancer_type=="cgc") %>% select(entrez, symbol)
  y = sapply(as.list(as.character(unique(allGenes$entrez))), function(x,y){
    sel=sapply(y, function(z,w) w%in%z, w = x);
    return( paste(names(y)[sel],collapse='.')) },
    y = kegg)

  ## Get all the 19014 genes
  allGenes = data.frame(entrez=unique(allGenes$entrez), path = y)
  allGenes$cancer_type = geneInfo[match(allGenes$entrez, geneInfo$entrez),'cancer_type']


  ## refine a data frame to return for downstream analysis - part 2

#   exp2 = can.genes %>% rename(gene_type=cancer_type) %>%
#       left_join(cancer_genes%>%select(entrez, primary_site), by=c("entrez")) %>%
#       group_by(entrez, path, gene_type) %>% summarise(primary_site=paste0(unique(primary_site), collapse=",")) %>%
#       left_join(geneInfo%>%select(entrez,symbol), by=c("entrez")) %>%
#       mutate(type="NCG5") %>%
#       select(entrez, symbol, gene_type, path, type, primary_site)

  exp2 = allGenes %>% rename(gene_type=cancer_type) %>%
      left_join(cancer_genes%>%select(entrez, primary_site), by=c("entrez")) %>%
      group_by(entrez, path, gene_type) %>% summarise(primary_site=paste0(unique(primary_site), collapse=",")) %>%
      left_join(geneInfo%>%select(entrez,symbol), by=c("entrez")) %>%
      mutate(type=ifelse(gene_type=="rst", "rst", "NCG5")) %>%
      select(entrez, symbol, gene_type, path, type, primary_site) %>% data.frame()

  exp = rbind(exp1,exp2)

  unique_path = unique(unlist(strsplit(can.genes$path, "\\.")))
  m = matrix(0, nr = nrow(can.genes), nc = length(unique_path), dimnames = list(can.genes$entrez, unique_path))
  for(i in unique_path) m[grep(i, can.genes$path), i ] =1
  df = data.frame(value=sort(apply(m, 2, sum)))
  df$path=rownames(df)
  df = unrowname(df)
  df = df[,c("path","value")]
  df$path_size = len[df$path]
  df$perc = with(df, (value/path_size)*100 )
  df$category = "NCG_cancer_genes_1571"
  res = df

  unique_path = unique(unlist(strsplit(subset(can.genes, cancer_type=='cgc')$path, "\\.")))
  m = matrix(0, nr = nrow(subset(can.genes, cancer_type=='cgc')), nc = length(unique_path), dimnames = list(subset(can.genes, cancer_type=='cgc')$entrez, unique_path))
  for(i in unique_path) m[grep(i, subset(can.genes, cancer_type=='cgc')$path), i ] =1
  df = data.frame(value=sort(apply(m, 2, sum)))
  df$path=rownames(df)
  df = unrowname(df)
  df = df[,c("path","value")]
  df$path_size = len[df$path]
  df$perc = with(df, (value/path_size)*100 )
  df$category = "NCG_cancer_genes_518"
  res = rbind(res,df)

  can.genes = data.frame(entrez=unique(subset(cancer_genes, primary_site==tissue)$entrez),
                         path = sapply(as.list(as.character(unique(subset(cancer_genes, primary_site==tissue)$entrez))), function(x,y){
    sel=sapply(y, function(z,w) w%in%z, w = x);
    return( paste(names(y)[sel],collapse='.')) },
    y = kegg))

  can.genes$cancer_type = geneInfo[match(can.genes$entrez, geneInfo$entrez),'cancer_type']

  unique_path = unique(unlist(strsplit(can.genes$path, "\\.")))
  m = matrix(0, nr = nrow(can.genes), nc = length(unique_path), dimnames = list(can.genes$entrez, unique_path))
  for(i in unique_path) m[grep(i, can.genes$path), i ] =1
  df = data.frame(value=sort(apply(m, 2, sum)))
  df$path=rownames(df)
  df = unrowname(df)
  df = df[,c("path","value")]
  df$path_size = len[df$path]
  df$perc = with(df, (value/path_size)*100 )
  df$category = paste0("NCG_cancer_genes_",nrow(can.genes),"_",tissue)
  res = rbind(res, df)


#   df$perc = 100*(df$value/nrow(can.genes))
#   df$label = paste0(df$path," ( ", df$value," / ", len[df$path], " )" )
#   df$path = factor(df$path, levels = names(sort(apply(m, 2, sum))))
#   df$label = factor(df$label, levels = unique(df$label[match(levels(df$path), df$path)]))

#   pdf(file=pdfname,h=pdf_height,w=pdf_width)
#
#   print(ggplot(df, aes(x=label,y=perc))+geom_bar(stat='identity')+coord_flip()+theme_boss()+ylab("")+xlab("")+
#           ggtitle(paste0(nrow(can.genes)," cancer genes of ", tissue)))


  # all novel prediction

  unique_path = unique(unlist(strsplit(sel.genes$path, "\\.")))
  m = matrix(0, nr = nrow(sel.genes), nc = length(unique_path), dimnames = list(sel.genes$entrez, unique_path))
  for(i in unique_path) m[grep(i, sel.genes$path), i ] =1

  df = data.frame(value=sort(apply(m, 2, sum)))
  df$path=rownames(df)
  df = unrowname(df)
  df = df[,c("path","value")]
  df$path_size = len[df$path]
  df$perc = with(df, (value/path_size)*100 )
  df$category = paste0("Novel_prediction_ALL_",nrow(sel.genes))
  res = rbind(res, df)

  # print(ggplot(df, aes(x=label,y=perc))+geom_bar(stat='identity')+coord_flip()+theme_boss()+ylab("")+xlab("")+ggtitle(paste0(nrow(sel.genes)," novel genes (all)")))

  # only novel rst

  unique_path = unique(unlist(strsplit(subset(sel.genes, gene_type=="rst")$path, "\\.")))
  m = matrix(0, nr = nrow(subset(sel.genes, gene_type=="rst")), nc = length(unique_path), dimnames = list(subset(sel.genes, gene_type=="rst")$entrez, unique_path))
  for(i in unique_path) m[grep(i, subset(sel.genes, gene_type=="rst")$path), i ] =1

  df = data.frame(value=sort(apply(m, 2, sum)))
  df$path=rownames(df)
  df = unrowname(df)
  df = df[,c("path","value")]
  df$path_size = len[df$path]
  df$perc = with(df, (value/path_size)*100 )
  df$category = paste0("Novel_prediction_RST_",nrow(subset(sel.genes, gene_type=="rst")))
  res = rbind(res, df)

  # print(ggplot(df, aes(x=label,y=perc))+geom_bar(stat='identity')+coord_flip()+theme_boss()+ylab("")+xlab("")+ggtitle(paste0(nrow(subset(sel.genes, gene_type=="rst"))," novel genes (rst)")))


  # dev.off()
  write.xlsx(subset(res, category=="NCG_cancer_genes_1571"), "NCG_cancer_genes_1571", file=fout, row.names = F, showNA = F)
  for (i in unique(res$category)[!unique(res$category)%in%"NCG_cancer_genes_1571"]){
    write.xlsx(subset(res, category==i), i, file=fout, row.names = F, showNA = F, append=T)
  }

  ## Rank-based measure of association (Spearman's rho)
  l = list()
  u = unique(res$category)
  for(i in 1:length(u)){
      l[[i]] = subset(res, category==u[i])$path
      l[[i]] = l[[i]][ with(subset(res, category==u[i]), order(value, perc, decreasing = T) ) ]
  }

  get_hard_core = function(x,y,xtitle,ytitle){
         # check if x is longer than y

      a = as.numeric(factor(x, levels=x))
      b = as.numeric(factor(y, levels=y))

      names(a) = x
      names(b) = y

      dfxy = data.frame(a=a, b=b[names(a)])
      pattern = paste(unique(uninformative_paths), collapse="|")
      dfxy = dfxy %>% tibble::rownames_to_column() %>% mutate(informative=ifelse(grepl(pattern, rowname), FALSE, TRUE)) %>% data.frame()
      dfxy$informative = factor(dfxy$informative, levels = c(TRUE, FALSE))
      #scatter.smooth(a, b[names(a)]) # geom_smooth
      options(scipen=2)
      pv = cor.test( dfxy$a, dfxy$b, m="spearman")$p.value
      rho = cor.test( dfxy$a, dfxy$b, m="spearman")$estimate

      ## Keep colours steady irrespectively from which level of the factor you find first
      myColors = c("red", "black")
      names(myColors) = levels(dfxy$informative)
      p = ggplot(dfxy, aes(x=a, y=b)) + geom_point(aes(colour = informative), size=rel(1.5)) + geom_smooth() +
          #geom_text(aes(label=ifelse(a>10 & b<10 & informative==TRUE,as.character(rowname),'')),hjust=0,vjust=0,size=rel(3)) +
          geom_text(aes(label=ifelse(informative==TRUE,as.character(rowname),'')),hjust=0,vjust=0,size=rel(3)) +
          #geom_text(aes(label=as.character(rowname)),hjust=0,vjust=0,size=rel(2.5)) +
          scale_colour_manual(values=myColors, name="Informative\npathways") +
          theme_boss() +
          theme(
              axis.line = element_line(colour = "black")
          ) +
          xlab(paste0(xtitle, " KEGG pathway ranks")) +
          ylab(paste0(ytitle, " KEGG pathway ranks")) +
          ggtitle(paste0("p-value=", pv, "; rho=", rho))


#       +
#           base_breaks_y(ymax, c(0,ymed,ymax), as.character(c(0,lmed,lmax)))+
#           base_breaks_x(gg$x, c(0,35,80,100), c("0","35","80","100"))
      return(p)
  }

  names(l) = u
  p1 = get_hard_core(l[[1]],l[[2]], names(l)[1], names(l)[2])
  p2 = get_hard_core(l[[1]],l[[3]], names(l)[1], names(l)[3])
  p3 = get_hard_core(l[[3]],l[[4]], names(l)[3], names(l)[4])
  p4 = get_hard_core(l[[3]],l[[5]], names(l)[3], names(l)[5])
  pdf(file=paste(fpdf, sep=""), h= 20, w=24)
  grid.arrange(p1, p2, p3, p4, ncol=2)
  dev.off()

  ## Attention if a gene is a cgc but not in the tissue examined (e.g blood)
  ## it exists twice in the table returned, once for the predictions and
  ## once for the NCG5 genes (in the second case we give primary site...etc)
  return(exp)
}


# rad = get_stat_novel_predictions("/Volumes/ceredam/novel_driver_prediction/ESCA/gridPrediction/",
#                                  "predictions_radial",
#                                  "Results/SVM_radial.novel_by_models.pdf")

get_stat_novel_predictions = function(path, kern="linear", gene_type, res_dir, plot="barplot"){

    specify_decimal <- function(x, k) format(round(x, k), nsmall=k)
    if (!(gene_type%in%c("all", "rst", "kcg"))){
        gene_type = "all"
        cat("----> get_stat_novel_predictions: unknown gene_type, it was set to \"all\"", "n")
    }

    if (kern=="linear"){pattern="predictions_linear"}
    if (kern=="radial"){pattern="predictions_radial"}
    if (kern=="polynomial"){pattern="predictions_polynomial"}
    if (kern=="sigmoid"){pattern="predictions_sigmoid"}

    if (gene_type=="rst"){genes="rst"}
    if (gene_type=="all"){genes="all"}
    if (gene_type=="kcg"){genes="NCG5"}

    ff = list.files(path, pattern, full.names = T)
    mm= list.files(path , pattern, full.names = F)
    ff = ff[-grep("xlsx", ff)]
    mm = mm[-grep("xlsx", mm)]
    mm = gsub(".Rdata","", mm)
    mm = gsub("predictions_","", mm)


    novel = list()
    for(i in 1:length(ff)){
      print(i)
      load(ff[i])
      if (gene_type=="rst"){
        rst = subset(as.data.frame(preds), gene_type=="rst" & label & prob_platt>.95)
        novel[[i]] = unique(rst$entrez)
      }else if (gene_type=="all"){
        all = subset(as.data.frame(preds), label & prob_platt>.95)
        novel[[i]] = unique(all$entrez)
      }else if (gene_type=="kcg"){
        kcg = subset(as.data.frame(preds), (gene_type=="cgc" | gene_type=="can") & label & prob_platt>.95)
        novel[[i]] = unique(kcg$entrez)
      }
    }

    names(novel)=mm


    if (plot=="barplot"){

        ## No predictions - clean novel list from 0s
        ix = which(lapply(novel, function(x) length(x))==0)
        cat(paste0(length(ix), " models had no predictions and excluded..."), "\n")
        novel[ix] = NULL

        print(length(unique(unlist(novel))))
        ## Filter mm also for models with 0 predictions
        mm = mm[!(mm%in%names(ix))]

        novel = mapply(function(x,y){ cbind('entrez'=x, 'model'=y) }, novel, as.list(mm), SIMPLIFY = T)
        df = do.call(rbind,novel)
        df = as.data.frame(df)
        df$previous = NA
        for(i in 2:length(mm) ){
            df$previous[ which(df$model == mm[i] ) ] = df$entrez[ which(df$model == mm[i] ) ]%in%df$entrez[ which(df$model == mm[i-1] ) ]
        }
        sdf = ddply(df, .(model), summarise, tot = length(entrez), previous = sum(previous))
        sdf$previous[1] = 0
        sdf$private = sdf$tot - sdf$previous
        sdf$p.private = (sdf$private/sdf$tot )*100
        sdf$p.previous = (sdf$previous/sdf$tot )*100
        sdf$label=paste0(sdf$tot,"\n(",sdf$private,")")
        mdf = melt(sdf, id.vars = "model", measure.vars = c("private","previous"))
        if (kern=="linear"){
            pdf(file=paste(res_dir,"/",gene_type, "_shared_genes_between_models_", kern, ".pdf", sep=""), h=6, w=10)
        }
        if (kern=="radial"){
            pdf(file=paste(res_dir,"/",gene_type, "_shared_genes_between_models_", kern, ".pdf", sep=""), h=6, w=40)
        }
        if (kern=="polynomial"){
            pdf(file=paste(res_dir,"/",gene_type, "_shared_genes_between_models_", kern, ".pdf", sep=""), h=20, w=150)
        }
        if (kern=="sigmoid"){
            pdf(file=paste(res_dir,"/",gene_type, "_shared_genes_between_models_", kern, ".pdf", sep=""), h=6, w=40)
        }
        p = ggplot(mdf, aes(x=model,y=value))+
            geom_bar(stat="identity", col="black", aes( fill=variable))+theme_boss()+scale_fill_manual(values=c("green","grey"))+theme(axis.text.x = element_text(angle=90, size=rel(1.5)))+
            geom_text(data=sdf, aes(x=model,y=tot, label=label),position = position_dodge(width=1), vjust=-0.15, size=rel(4))+ylab("Unique genes (n)")+xlab("")+
            ggtitle( paste0("Novel driver genes (",genes,",Platt) per model\nUnique genes across models = ", length(unique(df$entrez))) )+
            scale_y_continuous(limits=c(0, max(sdf$tot+500)), breaks=c(0,round(max(sdf$tot),0)/2,max(sdf$tot)), labels=as.character(c(0,round(max(sdf$tot)/2,0),max(sdf$tot))))
        print(p)
        dev.off()
        write.table(df, file=paste(res_dir,"/", gene_type, "_shared_genes_between_models_", kern, ".tsv",sep=""), quote=F, row.names = F, sep = "\t")
        write.table(sdf, file=paste(res_dir,"/", gene_type, "_shared_genes_between_models_per_gene_", kern, ".tsv",sep=""), quote=F, row.names = F, sep = "\t")
    }else if (plot=="heatmap"){

#        ## Get all the combinations of the models
#         mcombs = unique(data.frame(do.call(rbind, combn(mm, 2, simplify = FALSE))))
#
#         m1 = as.list(mcombs[,1])
#         m2 = as.list(mcombs[,2])
#
#         x = paste0(gsub("radial.","",mcombs[,1]),"_vs_",gsub("radial.","", mcombs[,2]) )
#
#         names(m1) = x
#         names(m2) = x
#
#         n1 = lapply(m1, function(x,y) y[[x]], y=novel)
#         n2 = lapply(m2, function(x,y) y[[x]], y=novel)
#
#         gimme_money_bad_guy = function(x,y){
#           length(setdiff(y,x)) / length(y)
#         }
#
#         res = mapply_pb( gimme_money_bad_guy, n1, n2, SIMPLIFY = T, USE.NAMES=F)
#         res = unlist(res)
#         y  = as.data.frame(do.call("rbind", strsplit(x, "\\_vs_"))); colnames(y)=c("previous","selected")
#         y$value = res
#         y$value[ which(!is.finite(y$value))] = NA
#         myPalette <- colorRampPalette(c("white",(brewer.pal(7, "YlOrRd"))), space="Lab")
#         r = c(0,max(y$value, na.rm=T))
#         ggplot(df, aes(x=previous, y=selected, fill=value)) +          geom_tile() + coord_equal() +
#           # geom_text(aes(label = as.character(100-as.numeric(df$m1INTERm2Perc2)))) +
#           scale_fill_gradientn(colours = myPalette(100), name='Percentage', limits = r, breaks =r, labels=as.character(r))
#                                ,na.value ="grey80") +
#           #theme_boss() +
#           theme(
#             legend.position = "bottom",
#             panel.grid.major = element_blank(),
#             panel.grid.minor = element_blank(),
#             panel.background = element_blank(),
#             axis.title.x = element_text(colour=NA),
#             axis.title.y = element_blank(),
#             axis.line = element_blank(),
#             axis.text.x=element_text(face="bold", angle=90, size=10),
#             axis.text.y=element_text(face="bold", size=10)
#           )
#
#
        # novel = NULL
#         for(i in 1:nrow(mcombs)){
#             cat(c(i, " ", unname(unlist(mcombs[i,]))), "\n")
#             ## Load the predictions
#             d1 = ff[grep(paste0(mcombs[i,1], ".Rdata"), ff)]
#             d2 = ff[grep(paste0(mcombs[i,2], ".Rdata"), ff)]
#             load(d1)
#             d1 = preds
#             load(d2)
#             d2 = preds
#
#             ## Rst predictions
#             rst1 = d1 %>% subset(gene_type=="rst" & label & prob_platt>.95) %>% select(entrez) %>% unique %>% .$entrez
#             rst2 = d2 %>% subset(gene_type=="rst" & label & prob_platt>.95) %>% select(entrez) %>% unique %>% .$entrez
#             m1INTERm2 = length(intersect(rst1, rst2))
#             m1SETDIFFm2 = length(setdiff(rst1, rst2))
#             m2SETDIFFm1 = length(setdiff(rst2, rst1))
#             novel = rbind(novel, data.frame(m1=mcombs[i,1], m2=mcombs[i,2],
#                                             m1Genes=length(rst1), m2Genes=length(rst2),
#                                             gene_type="rst", m1INTERm2=m1INTERm2,
#                                             m1SETDIFFm2=m1SETDIFFm2,
#                                             m2SETDIFFm1=m2SETDIFFm1))
#
#
#             ## All (cancer genes and rst predictions)
#             all1 = d1 %>% subset(label & prob_platt>.95) %>% select(entrez) %>% unique %>% .$entrez
#             all2 = d2 %>% subset(label & prob_platt>.95) %>% select(entrez) %>% unique %>% .$entrez
#             m1INTERm2 = length(intersect(all1, all2))
#             m1SETDIFFm2 = length(setdiff(all1, all2))
#             m2SETDIFFm1 = length(setdiff(all2, all1))
#             novel = rbind(novel, data.frame(m1=mcombs[i,1], m2=mcombs[i,2],
#                                             m1Genes=length(all1), m2Genes=length(all2),
#                                             gene_type="all", m1INTERm2=m1INTERm2,
#                                             m1SETDIFFm2=m1SETDIFFm2,
#                                             m2SETDIFFm1=m2SETDIFFm1))
#
#             ## Predictions in NCG5
#             kcg1 = d1 %>% subset((gene_type=="cgc" | gene_type=="can") & label & prob_platt>.95) %>% select(entrez) %>% unique %>% .$entrez
#             kcg2 = d2 %>% subset((gene_type=="cgc" | gene_type=="can") & label & prob_platt>.95) %>% select(entrez) %>% unique %>% .$entrez
#             m1INTERm2 = length(intersect(kcg1, kcg2))
#             m1SETDIFFm2 = length(setdiff(kcg1, kcg2))
#             m2SETDIFFm1 = length(setdiff(kcg2, kcg1))
#             novel = rbind(novel, data.frame(m1=mcombs[i,1], m2=mcombs[i,2],
#                                             m1Genes=length(kcg1), m2Genes=length(kcg2),
#                                             gene_type="NCG5", m1INTERm2=m1INTERm2,
#                                             m1SETDIFFm2=m1SETDIFFm2,
#                                             m2SETDIFFm1=m2SETDIFFm1))
#         }

        ## Get parameters in the df
        #novel %>% separate(m1, into=c())

#         novel %>% mutate(m1INTERm2Perc2) %>%
#             mutate(printm1=paste0(m1," (", m1Genes, ")"), printm2=paste0(m2," (", m2Genes, ")"),
#                    m1INTERm2Perc2=as.numeric(specify_decimal((m1INTERm2/m2Genes)*100, 2)),
#                    m1INTERm2Perc1=as.numeric(specify_decimal((m1INTERm2/m1Genes)*100, 2)))
#
#
#         for (t in unique(novel$gene_type)){
#             if (kern=="linear"){
#                 pdf(file=paste(res_dir,"/",t, "_shared_genes_between_models_", kern, ".pdf", sep=""), h=6, w=10)
#             }
#             if (kern=="radial"){
#                 pdf(file=paste(res_dir,"/",t, "_shared_genes_between_models_", kern, ".pdf", sep=""), h=6, w=40)
#             }
#
#             df = novel %>% subset(gene_type==t)
#             myPalette <- colorRampPalette(c("white",(brewer.pal(7, "YlOrRd"))), space="Lab")
#             r = c(0,100)
#             ggplot(df, aes(x=as.factor(printm1), y=as.factor(printm2))) +
#                 geom_tile(aes(fill = 100-as.numeric(m1INTERm2Perc2))) + coord_equal() +
#                 geom_text(aes(label = as.character(100-as.numeric(df$m1INTERm2Perc2)))) +
#                 scale_fill_gradientn(colours = myPalette(100), name='Percentage', limits = r, breaks =r, labels=as.character(r),na.value ="white") +
#                 #theme_boss() +
#                 theme(
#                     legend.position = "bottom",
#                     panel.grid.major = element_blank(),
#                     panel.grid.minor = element_blank(),
#                     panel.background = element_blank(),
#                     axis.title.x = element_text(colour=NA),
#                     axis.title.y = element_blank(),
#                     axis.line = element_blank(),
#                     axis.text.x=element_text(face="bold", angle=90, size=10),
#                     axis.text.y=element_text(face="bold", size=10)
#                 )
#
#         }

    }





}

stats_genes = function(cancer_type){

  load("/mnt/lustre/users/k1469280/mourikisa/data/geneInfoNCG5.Rdata")
  load("/mnt/lustre/users/k1469280/mourikisa/data/dupByThresholds.Rdata")

  geneInfo = as.data.frame(geneInfo, stringsAsFactors=F)
  dupByThresholds = as.data.frame(dupByThresholds, stringsAsFactors=F)

  d = getTable(tb=cancer_type)

  lof = rbind(lof_trunc = subset(d,no_TRUNC_muts>0, select =c('Cancer_type', 'Sample','Entrez')) %>%mutate(type="trunc"),
              lof_nondam = subset(d, no_NTDam_muts>0, select =c('Cancer_type', 'Sample','Entrez')) %>%mutate(type="ntdam"),
              lof_homdel = subset(d,(CNVLoss==1 & Copy_number==0), select =c('Cancer_type', 'Sample','Entrez')) %>% mutate(type="homdel"),
              lof_hetdelTruncORNTdam = subset(d, (CNVLoss==1 & Copy_number==1 & (no_TRUNC_muts>0 | no_NTDam_muts>0)),select =c('Cancer_type', 'Sample','Entrez')) %>%
              mutate(type="hetdelTruncORNTdam"))

  lof.sample = ddply(lof, .(Cancer_type,Sample), summarise, n_genes=length(unique(Entrez)))

  #lof.genes  = ddply(lof, .(Cancer_type,Entrez), summarise, n_samples=length(unique(Sample)))
  #lof.genes = cbind(lof.genes, geneInfo[ match(lof.genes$Entrez, geneInfo$entrez), c('symbol', 'cancer_type', 'complex')]) # , 'duplicability'
  #colnames(lof.genes)[5] = "gene_category"
#   colnames(lof.genes)[7] = "genic_dup_60"

  lof.genes = lof %>% group_by(Cancer_type, Entrez) %>%
    summarise(n_samples=length(unique(Sample)),
              driver_alteration_types=paste(unique(type), collapse=","))
  lof.genes = lof.genes %>%
    left_join(geneInfo%>%select(entrez, symbol, cancer_type, complex), by=c("Entrez"="entrez")) %>%
    rename(gene_category=cancer_type)
  lof.genes$genic_dup_60 =  with( subset(dupByThresholds,threshold=="60"), ndups[ match(lof.genes$Entrez, entrez ) ] )
  lof.genes$genic_dup_20 =  with( subset(dupByThresholds,threshold=="20"), ndups[ match(lof.genes$Entrez, entrez ) ] )

  ix = which(is.na(lof.genes$genic_dup_20)); if(length(ix)>0) lof.genes$genic_dup_20[ix] = 0
  ix = which(is.na(lof.genes$genic_dup_60)); if(length(ix)>0) lof.genes$genic_dup_60[ix] = 0

  list(LOF_samples = lof.sample, LOF_genes = lof.genes)
}


## Create training and summary statistics
createDescribeTraining = function(cancer_type, tissue, res_dir, gains=T){
    if (!file.exists(res_dir)){
      dir.create(file.path(res_dir))
    }
    ## Get data from the database
    d = getTable(tb=cancer_type)
    ## Mark training observations before you scale
    d = markTrainNovelty(df=d, tissue=tissue, gains = gains)
    ## Save training and validation set without scaling (Feature refinement as well)
    training_ns = d %>% subset(!is.na(type))
    # training_ns = cleanFeatures(df=training_ns, ct=cancer_type)
    training_ns = cleanFeatures(df=training_ns, paste0(res_dir,"/config.txt"))
    training_ns = training_ns %>% select(-no_NSI_muts, -private_domains, -commondomains, -High, -Low, -Medium, -NotExpressed)
    if(length(grep("training_set_noScale.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(training_ns, file=paste(res_dir,"/training_set_noScale.Rdata", sep=""))
    }
    validation_ns = d %>% subset(is.na(type)) %>% select(-type)
#     validation_ns = cleanFeatures(df=validation_ns, ct=cancer_type)
    validation_ns = cleanFeatures(df=validation_ns, paste0(res_dir,"/config.txt"))
    validation_ns = validation_ns %>% select(-no_NSI_muts, -private_domains, -commondomains, -High, -Low, -Medium, -NotExpressed)
    if(length(grep("validation_set_noScale.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(validation_ns, file=paste(res_dir,"/validation_set_noScale.Rdata", sep=""))
    }

    ## Scale dataset
    d = getScaledTable(df = d)
    ## Clean and prepare it for the SVM
    # d = cleanFeatures(df=d, ct=cancer_type)
    d = cleanFeatures(df=d, paste0(res_dir,"/config.txt"))
    ## Feature refinement
    d = d %>% select(-no_NSI_muts, -private_domains, -commondomains, -High, -Low, -Medium, -NotExpressed)
    ## Separate training and prediction
    training = d %>% subset(!is.na(type))
    ## if training is not there save it
    if(length(grep("training_set.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(training, file=paste(res_dir,"/training_set.Rdata", sep=""))
    }
    validation = d %>% subset(is.na(type)) %>% select(-type)
    ## Save validation set
    if(length(grep("validation_set.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(validation, file=paste(res_dir,"/validation_set.Rdata", sep=""))
    }

    ## Get Correlation matrix
    varTypes = sapply(training, function(x) class(x)) %>% data.frame() %>% tibble::rownames_to_column()
    colnames(varTypes) = c("varType", "type")
    fvars = varTypes %>% subset(type=="factor") %>% .$varType
    t = training
    for (v in fvars){
        t[,v] = as.numeric(t[,v])
    }
    corMatrix = round(cor(t),2)
    upper = corMatrix
    upper[upper.tri(corMatrix)] = ""
    upper = as.data.frame(upper)
    write.xlsx(upper,file=paste0(res_dir, "/corMatrix.xlsx"), showNA = FALSE)

    ## Produce statistics for training
    stats = rbind(ddply(training_ns,.(type),numcolwise(quantile,na.rm = TRUE)) %>% mutate(statistic=c("min", "q1", "median", "q3", "max")),
          ddply(training_ns,.(type),numcolwise(mean,na.rm = TRUE)) %>% mutate(statistic="mean"))
    write.table(stats, file = paste(res_dir, "/training_summary_contFeatures_noScale.tsv", sep=""), row.names = F, quote = FALSE, sep="\t")
    stats = stats %>% gather(feature, value, -type, -statistic) %>% spread(statistic, value)
#     pdf(file=paste(res_dir, "/training_numerical_summary_noScale.pdf" , sep=""), h= 16, w=20)
#     p = ggplot(stats, aes(x=factor(feature), ymin = min, lower = q1, middle = median, upper = q3, ymax = max)) +geom_boxplot(stat = "identity") + scale_y_log10()
#     print(p)
#     dev.off()

    stats = rbind(ddply(training,.(type),numcolwise(quantile,na.rm = TRUE)) %>% mutate(statistic=c("min", "q1", "median", "q3", "max")),
                  ddply(training_ns,.(type),numcolwise(mean,na.rm = TRUE)) %>% mutate(statistic="mean"))
    write.table(stats, file = paste(res_dir, "/training_summary_contFeatures_Scale.tsv", sep=""), row.names = F, quote = FALSE, sep="\t")
    stats = stats %>% gather(feature, value, -type, -statistic) %>% spread(statistic, value)
#     pdf(file=paste(res_dir, "/training_numerical_summary_Scale.pdf" , sep=""), h= 16, w=20)
#     p = ggplot(stats, aes(x=factor(feature), ymin = min, lower = q1, middle = median, upper = q3, ymax = max)) + geom_boxplot(stat = "identity") + scale_y_log10()
#     print(p)
#     dev.off()
    ## This will be the same to bot scaled and unscaled training because categorical stay as they are independently of the scaling
    if(gains){
    stats = rbind(training_ns %>% count(type, Genic) %>% rename(categories=Genic, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="Genic"),
          training %>% count(type, memberofcomplex) %>% rename(categories=memberofcomplex, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="memberofcomplex"),
          training %>% count(type, WGD) %>% rename(categories=WGD, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="WGD"),
          training %>% count(type, hub) %>% rename(categories=hub, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="hub"),
          training %>% count(type, central) %>% rename(categories=central, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="central"),
          training %>% count(type, CNVGain) %>% rename(categories=CNVGain, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="CNVGain"),
          training %>% count(type, CNVLoss) %>% rename(categories=CNVLoss, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="CNVLoss"),
          training %>% count(type, ExpT_ME) %>% rename(categories=ExpT_ME, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_ME"),
          training %>% count(type, ExpT_HE) %>% rename(categories=ExpT_HE, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_HE"),
          training %>% count(type, ExpT_LE) %>% rename(categories=ExpT_LE, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_LE"),
          training %>% count(type, ExpT_NE) %>% rename(categories=ExpT_NE, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_NE"),
          training %>% count(type, ExpT_NET) %>% rename(categories=ExpT_NET, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_NET"),
          training %>% count(type, old) %>% rename(categories=old, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="old"),
          training %>% count(type, young) %>% rename(categories=young, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="young"),
          training %>% count(type, luca) %>% rename(categories=luca, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="luca"),
          training %>% count(type, eukaryotes) %>% rename(categories=eukaryotes, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="eukaryotes"),
          training %>% count(type, metazoans) %>% rename(categories=metazoans, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="metazoans"),
          training %>% count(type, vertebrates) %>% rename(categories=vertebrates, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="vertebrates"),
          training %>% count(type, opisthokonts) %>% rename(categories=opisthokonts, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="opisthokonts"),
          training %>% count(type, mammals) %>% rename(categories=mammals, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="mammals"),
          training %>% count(type, selective) %>% rename(categories=selective, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="selective"),
          training %>% count(type, always_expressed) %>% rename(categories=always_expressed, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="always_expressed"),
          training %>% count(type, middle) %>% rename(categories=middle, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="middle"),
          training %>% count(type, one_tissue) %>% rename(categories=one_tissue, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="one_tissue"),
          training %>% count(type, never_expressed) %>% rename(categories=never_expressed, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="never_expressed")
    )
    }else{
      stats = rbind(training_ns %>% count(type, Genic) %>% rename(categories=Genic, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="Genic"),
                    training %>% count(type, memberofcomplex) %>% rename(categories=memberofcomplex, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="memberofcomplex"),
                    training %>% count(type, WGD) %>% rename(categories=WGD, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="WGD"),
                    training %>% count(type, hub) %>% rename(categories=hub, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="hub"),
                    training %>% count(type, central) %>% rename(categories=central, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="central"),
                    training %>% count(type, CNVLoss) %>% rename(categories=CNVLoss, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="CNVLoss"),
                    training %>% count(type, ExpT_ME) %>% rename(categories=ExpT_ME, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_ME"),
                    training %>% count(type, ExpT_HE) %>% rename(categories=ExpT_HE, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_HE"),
                    training %>% count(type, ExpT_LE) %>% rename(categories=ExpT_LE, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_LE"),
                    training %>% count(type, ExpT_NE) %>% rename(categories=ExpT_NE, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_NE"),
                    training %>% count(type, ExpT_NET) %>% rename(categories=ExpT_NET, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_NET"),
                    training %>% count(type, old) %>% rename(categories=old, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="old"),
                    training %>% count(type, young) %>% rename(categories=young, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="young"),
                    training %>% count(type, luca) %>% rename(categories=luca, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="luca"),
                    training %>% count(type, eukaryotes) %>% rename(categories=eukaryotes, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="eukaryotes"),
                    training %>% count(type, metazoans) %>% rename(categories=metazoans, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="metazoans"),
                    training %>% count(type, vertebrates) %>% rename(categories=vertebrates, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="vertebrates"),
                    training %>% count(type, opisthokonts) %>% rename(categories=opisthokonts, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="opisthokonts"),
                    training %>% count(type, mammals) %>% rename(categories=mammals, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="mammals"),
                    training %>% count(type, selective) %>% rename(categories=selective, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="selective"),
                    training %>% count(type, always_expressed) %>% rename(categories=always_expressed, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="always_expressed"),
                    training %>% count(type, middle) %>% rename(categories=middle, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="middle"),
                    training %>% count(type, one_tissue) %>% rename(categories=one_tissue, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="one_tissue"),
                    training %>% count(type, never_expressed) %>% rename(categories=never_expressed, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="never_expressed")
      )

    }


    write.table(stats, file = paste(res_dir, "/training_summary_catFeatures.tsv", sep=""), row.names = F, quote = FALSE, sep="\t")
#     pdf(file=paste(res_dir, "/training_categorical_summary.pdf" , sep=""), h= 16, w=20)
#     p = ggplot(stats, aes(x=factor(feature), y=perc*100, fill=factor(categories))) + geom_bar(stat="identity") +
#         theme_boss() + theme(
#             axis.text.x = element_text(angle=90, size=12),
#             axis.text.y = element_text(size=12),
#             axis.title.x = element_blank(),
#             axis.title.y = element_text(size = 12),
#             legend.title = element_blank()
#         ) + scale_fill_brewer(palette="Set1") +
#         ylab("Number of occurences (%)")
#     print(p)
#     dev.off()

}

createDescribeTrainingVogel = function(cancer_type, res_dir, gains=T){

    if (!file.exists(res_dir)){
        dir.create(file.path(res_dir))
    }
    ## Get data from the database
    d = getTable(tb=cancer_type)
    ## Mark training observations before you scale
    d = markTrainVogel(df=d, gains = gains)
    ## Save training and validation set without scaling (Feature refinement as well)
    training_ns = d %>% subset(!is.na(type))
    # training_ns = cleanFeatures(df=training_ns, ct=cancer_type)
    training_ns = cleanFeatures(df=training_ns, paste0(res_dir,"/config.txt"))
    training_ns = training_ns %>% select(-no_NSI_muts, -private_domains, -commondomains, -High, -Low, -Medium, -NotExpressed)
    if(length(grep("training_set_noScale.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(training_ns, file=paste(res_dir,"/training_set_noScale.Rdata", sep=""))
    }
    validation_ns = d %>% subset(is.na(type)) %>% select(-type)
    #     validation_ns = cleanFeatures(df=validation_ns, ct=cancer_type)
    validation_ns = cleanFeatures(df=validation_ns, paste0(res_dir,"/config.txt"))
    validation_ns = validation_ns %>% select(-no_NSI_muts, -private_domains, -commondomains, -High, -Low, -Medium, -NotExpressed)
    if(length(grep("validation_set_noScale.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(validation_ns, file=paste(res_dir,"/validation_set_noScale.Rdata", sep=""))
    }

    ## Scale dataset
    d = getScaledTable(df = d)
    ## Clean and prepare it for the SVM
    # d = cleanFeatures(df=d, ct=cancer_type)
    d = cleanFeatures(df=d, paste0(res_dir,"/config.txt"))
    ## Feature refinement
    d = d %>% select(-no_NSI_muts, -private_domains, -commondomains, -High, -Low, -Medium, -NotExpressed)
    ## Separate training and prediction
    training = d %>% subset(!is.na(type))
    ## if training is not there save it
    if(length(grep("training_set.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(training, file=paste(res_dir,"/training_set.Rdata", sep=""))
    }
    validation = d %>% subset(is.na(type)) %>% select(-type)
    ## Save validation set
    if(length(grep("validation_set.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(validation, file=paste(res_dir,"/validation_set.Rdata", sep=""))
    }

    ## Get Correlation matrix
    varTypes = sapply(training, function(x) class(x)) %>% data.frame() %>% tibble::rownames_to_column()
    colnames(varTypes) = c("varType", "type")
    fvars = varTypes %>% subset(type=="factor") %>% .$varType
    t = training
    for (v in fvars){
        t[,v] = as.numeric(t[,v])
    }
    corMatrix = round(cor(t),2)
    upper = corMatrix
    upper[upper.tri(corMatrix)] = ""
    upper = as.data.frame(upper)
    write.xlsx(upper,file=paste0(res_dir, "/corMatrix.xlsx"), showNA = FALSE)

    ## Produce statistics for training
    stats = rbind(ddply(training_ns,.(type),numcolwise(quantile,na.rm = TRUE)) %>% mutate(statistic=c("min", "q1", "median", "q3", "max")),
                  ddply(training_ns,.(type),numcolwise(mean,na.rm = TRUE)) %>% mutate(statistic="mean"))
    write.table(stats, file = paste(res_dir, "/training_summary_contFeatures_noScale.tsv", sep=""), row.names = F, quote = FALSE, sep="\t")
    stats = stats %>% gather(feature, value, -type, -statistic) %>% spread(statistic, value)
    #     pdf(file=paste(res_dir, "/training_numerical_summary_noScale.pdf" , sep=""), h= 16, w=20)
    #     p = ggplot(stats, aes(x=factor(feature), ymin = min, lower = q1, middle = median, upper = q3, ymax = max)) +geom_boxplot(stat = "identity") + scale_y_log10()
    #     print(p)
    #     dev.off()

    stats = rbind(ddply(training,.(type),numcolwise(quantile,na.rm = TRUE)) %>% mutate(statistic=c("min", "q1", "median", "q3", "max")),
                  ddply(training_ns,.(type),numcolwise(mean,na.rm = TRUE)) %>% mutate(statistic="mean"))
    write.table(stats, file = paste(res_dir, "/training_summary_contFeatures_Scale.tsv", sep=""), row.names = F, quote = FALSE, sep="\t")
    stats = stats %>% gather(feature, value, -type, -statistic) %>% spread(statistic, value)
    #     pdf(file=paste(res_dir, "/training_numerical_summary_Scale.pdf" , sep=""), h= 16, w=20)
    #     p = ggplot(stats, aes(x=factor(feature), ymin = min, lower = q1, middle = median, upper = q3, ymax = max)) + geom_boxplot(stat = "identity") + scale_y_log10()
    #     print(p)
    #     dev.off()
    ## This will be the same to bot scaled and unscaled training because categorical stay as they are independently of the scaling
    if(gains){
        stats = rbind(training_ns %>% count(type, Genic) %>% rename(categories=Genic, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="Genic"),
                      training %>% count(type, memberofcomplex) %>% rename(categories=memberofcomplex, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="memberofcomplex"),
                      training %>% count(type, WGD) %>% rename(categories=WGD, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="WGD"),
                      training %>% count(type, hub) %>% rename(categories=hub, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="hub"),
                      training %>% count(type, central) %>% rename(categories=central, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="central"),
                      training %>% count(type, CNVGain) %>% rename(categories=CNVGain, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="CNVGain"),
                      training %>% count(type, CNVLoss) %>% rename(categories=CNVLoss, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="CNVLoss"),
                      training %>% count(type, ExpT_ME) %>% rename(categories=ExpT_ME, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_ME"),
                      training %>% count(type, ExpT_HE) %>% rename(categories=ExpT_HE, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_HE"),
                      training %>% count(type, ExpT_LE) %>% rename(categories=ExpT_LE, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_LE"),
                      training %>% count(type, ExpT_NE) %>% rename(categories=ExpT_NE, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_NE"),
                      training %>% count(type, ExpT_NET) %>% rename(categories=ExpT_NET, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_NET"),
                      training %>% count(type, old) %>% rename(categories=old, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="old"),
                      training %>% count(type, young) %>% rename(categories=young, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="young"),
                      training %>% count(type, luca) %>% rename(categories=luca, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="luca"),
                      training %>% count(type, eukaryotes) %>% rename(categories=eukaryotes, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="eukaryotes"),
                      training %>% count(type, metazoans) %>% rename(categories=metazoans, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="metazoans"),
                      training %>% count(type, vertebrates) %>% rename(categories=vertebrates, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="vertebrates"),
                      training %>% count(type, opisthokonts) %>% rename(categories=opisthokonts, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="opisthokonts"),
                      training %>% count(type, mammals) %>% rename(categories=mammals, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="mammals"),
                      training %>% count(type, selective) %>% rename(categories=selective, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="selective"),
                      training %>% count(type, always_expressed) %>% rename(categories=always_expressed, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="always_expressed"),
                      training %>% count(type, middle) %>% rename(categories=middle, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="middle"),
                      training %>% count(type, one_tissue) %>% rename(categories=one_tissue, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="one_tissue"),
                      training %>% count(type, never_expressed) %>% rename(categories=never_expressed, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="never_expressed")
        )
    }else{
        stats = rbind(training_ns %>% count(type, Genic) %>% rename(categories=Genic, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="Genic"),
                      training %>% count(type, memberofcomplex) %>% rename(categories=memberofcomplex, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="memberofcomplex"),
                      training %>% count(type, WGD) %>% rename(categories=WGD, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="WGD"),
                      training %>% count(type, hub) %>% rename(categories=hub, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="hub"),
                      training %>% count(type, central) %>% rename(categories=central, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="central"),
                      training %>% count(type, CNVLoss) %>% rename(categories=CNVLoss, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="CNVLoss"),
                      training %>% count(type, ExpT_ME) %>% rename(categories=ExpT_ME, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_ME"),
                      training %>% count(type, ExpT_HE) %>% rename(categories=ExpT_HE, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_HE"),
                      training %>% count(type, ExpT_LE) %>% rename(categories=ExpT_LE, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_LE"),
                      training %>% count(type, ExpT_NE) %>% rename(categories=ExpT_NE, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_NE"),
                      training %>% count(type, ExpT_NET) %>% rename(categories=ExpT_NET, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_NET"),
                      training %>% count(type, old) %>% rename(categories=old, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="old"),
                      training %>% count(type, young) %>% rename(categories=young, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="young"),
                      training %>% count(type, luca) %>% rename(categories=luca, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="luca"),
                      training %>% count(type, eukaryotes) %>% rename(categories=eukaryotes, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="eukaryotes"),
                      training %>% count(type, metazoans) %>% rename(categories=metazoans, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="metazoans"),
                      training %>% count(type, vertebrates) %>% rename(categories=vertebrates, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="vertebrates"),
                      training %>% count(type, opisthokonts) %>% rename(categories=opisthokonts, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="opisthokonts"),
                      training %>% count(type, mammals) %>% rename(categories=mammals, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="mammals"),
                      training %>% count(type, selective) %>% rename(categories=selective, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="selective"),
                      training %>% count(type, always_expressed) %>% rename(categories=always_expressed, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="always_expressed"),
                      training %>% count(type, middle) %>% rename(categories=middle, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="middle"),
                      training %>% count(type, one_tissue) %>% rename(categories=one_tissue, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="one_tissue"),
                      training %>% count(type, never_expressed) %>% rename(categories=never_expressed, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="never_expressed")
        )

    }


    write.table(stats, file = paste(res_dir, "/training_summary_catFeatures.tsv", sep=""), row.names = F, quote = FALSE, sep="\t")

}


createDescribeTrainingCGC = function(cancer_type, res_dir, gains=T, mysql=F, input_fn=NULL, means_sds=NULL){

    if (!file.exists(res_dir)){
        dir.create(file.path(res_dir))
    }
    ## Get data from the database
    if(mysql){
        d = getTable(tb=cancer_type)
    }else if(mysql==FALSE & !is.null(input_fn)){
        d = read.table(paste(input_fn), sep="\t", header = T)
        ## We need to make it factors
        cat("Converting features to factors...", "\n")
        fcols <- c("duplicated",
                   "WGD", "hub", "central", "CNVGain", "CNVLoss",
                   "ExpT_ME", "ExpT_HE", "ExpT_LE", "ExpT_NE",
                   "ExpT_NET", "old", "young", "luca", "eukaryotes",
                   "metazoans", "vertebrates", "opisthokonts",
                   "mammals", "primates", "selective", "always.expressed",
                   "middle", "one.tissue", "never.expressed")
        cols <- which(colnames(d) %in% fcols)
        for(i in cols){
            d[,i] = factor(d[,i], levels = c(0,1))
        }
    }else{
        stop("Input file argument is not present")
    }

    ## Mark training observations before you scale
    d = markTrainCGC(df=d, gains = gains)
    cat("Getting negative observations...", "\n")
    negatives_ns = d[["n"]]
    d = d[["tp"]]
    ## Save training and validation set without scaling (Feature refinement as well)
    training_ns = d %>% subset(!is.na(type))
    # training_ns = cleanFeatures(df=training_ns, ct=cancer_type)
    training_ns = cleanFeatures(df=training_ns, paste0(res_dir,"/config.txt"))
    #training_ns = training_ns %>% select(-no_NSI_muts, -private_domains, -commondomains, -High, -Low, -Medium, -NotExpressed)
    if(length(grep("training_set_noScale.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(training_ns, file=paste(res_dir,"/training_set_noScale.Rdata", sep=""))
    }
    validation_ns = d %>% subset(is.na(type)) %>% select(-type)
    #     validation_ns = cleanFeatures(df=validation_ns, ct=cancer_type)
    validation_ns = cleanFeatures(df=validation_ns, paste0(res_dir,"/config.txt"))
    #validation_ns = validation_ns %>% select(-no_NSI_muts, -private_domains, -commondomains, -High, -Low, -Medium, -NotExpressed)
    if(length(grep("validation_set_noScale.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(validation_ns, file=paste(res_dir,"/validation_set_noScale.Rdata", sep=""))
    }
    negatives_ns = cleanFeatures(df=negatives_ns, paste0(res_dir,"/config.txt"))
    #negatives_ns = negatives_ns %>% select(-no_NSI_muts, -private_domains, -commondomains, -High, -Low, -Medium, -NotExpressed)
    if(length(grep("negative_set_noScale.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(negatives_ns, file=paste(res_dir,"/negative_set_noScale.Rdata", sep=""))
    }


    ## Scale dataset
    ##load(means_sds)
    d = getScaledTable(df = d, means_sds = means_sds)
    mean_sd = d[["mean_sd"]]
    ## Save means and sds
    if(length(grep("mean_sd_of_training_validation_features.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(mean_sd, file=paste(res_dir,"/mean_sd_of_training_validation_features.Rdata", sep=""))
    }
    d = d[["df_scaled"]]
    ## Clean and prepare it for the SVM
    # d = cleanFeatures(df=d, ct=cancer_type)
    d = cleanFeatures(df=d, paste0(res_dir,"/config.txt"))
    ## Feature refinement
    #d = d %>% select(-no_NSI_muts, -private_domains, -commondomains, -High, -Low, -Medium, -NotExpressed)
    ## Separate training and prediction
    training = d %>% subset(!is.na(type))
    ## if training is not there save it
    if(length(grep("training_set.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(training, file=paste(res_dir,"/training_set.Rdata", sep=""))
    }
    validation = d %>% subset(is.na(type)) %>% select(-type)
    ## Save validation set
    if(length(grep("validation_set.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(validation, file=paste(res_dir,"/validation_set.Rdata", sep=""))
    }

    ## Scale the negatives using the mean and sd of the TP set
    negatives = negatives_ns
    for(i in colnames(negatives)){
        if(i %in% colnames(mean_sd)){
            scale_mean = mean_sd["means",grep(i, colnames(mean_sd))]
            scale_sd = mean_sd["sds",grep(i, colnames(mean_sd))]
            negatives[,i] = (negatives[,i] - as.numeric(scale_mean))/as.numeric(scale_sd)
        }
    }
    ## Save negative set
    if(length(grep("negative_set.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(negatives, file=paste(res_dir,"/negative_set.Rdata", sep=""))
    }

    ## Get Correlation matrix
    varTypes = sapply(training, function(x) class(x)) %>% data.frame() %>% tibble::rownames_to_column()
    colnames(varTypes) = c("varType", "type")
    fvars = varTypes %>% subset(type=="factor") %>% .$varType
    t = training
    for (v in fvars){
        t[,v] = as.numeric(t[,v])
    }
    corMatrix = round(cor(t),2)
    upper = corMatrix
    upper[upper.tri(corMatrix)] = ""
    upper = as.data.frame(upper)
    write.xlsx(upper,file=paste0(res_dir, "/corMatrix.xlsx"), showNA = FALSE)

    ## Produce statistics for training
    stats = rbind(ddply(training_ns,.(type),numcolwise(quantile,na.rm = TRUE)) %>% mutate(statistic=c("min", "q1", "median", "q3", "max")),
                  ddply(training_ns,.(type),numcolwise(mean,na.rm = TRUE)) %>% mutate(statistic="mean"))
    write.table(stats, file = paste(res_dir, "/training_summary_contFeatures_noScale.tsv", sep=""), row.names = F, quote = FALSE, sep="\t")
    stats = stats %>% gather(feature, value, -type, -statistic) %>% spread(statistic, value)
    #     pdf(file=paste(res_dir, "/training_numerical_summary_noScale.pdf" , sep=""), h= 16, w=20)
    #     p = ggplot(stats, aes(x=factor(feature), ymin = min, lower = q1, middle = median, upper = q3, ymax = max)) +geom_boxplot(stat = "identity") + scale_y_log10()
    #     print(p)
    #     dev.off()

    stats = rbind(ddply(training,.(type),numcolwise(quantile,na.rm = TRUE)) %>% mutate(statistic=c("min", "q1", "median", "q3", "max")),
                  ddply(training_ns,.(type),numcolwise(mean,na.rm = TRUE)) %>% mutate(statistic="mean"))
    write.table(stats, file = paste(res_dir, "/training_summary_contFeatures_Scale.tsv", sep=""), row.names = F, quote = FALSE, sep="\t")

    ## get counts for NAs (if any)
    nas = sapply(training_ns, function(x) (sum(is.na(x))/length(x))*100) %>% data.frame()
    colnames(nas) = "percentage_NAs"
    write.xlsx(nas, file = paste(res_dir, "/NAs_percentages.xlsx", sep=""), row.names = T, sheetName = "Training_ns")
    nas = sapply(training, function(x) (sum(is.na(x))/length(x))*100) %>% data.frame()
    colnames(nas) = "percentage_NAs"
    write.xlsx(nas, file = paste(res_dir, "/NAs_percentages.xlsx", sep=""), row.names = T, append = T, sheetName = "Training")
    nas = sapply(validation_ns, function(x) (sum(is.na(x))/length(x))*100) %>% data.frame()
    colnames(nas) = "percentage_NAs"
    write.xlsx(nas, file = paste(res_dir, "/NAs_percentages.xlsx", sep=""), row.names = T, append = T, sheetName = "Validation_ns")
    nas = sapply(validation, function(x) (sum(is.na(x))/length(x))*100) %>% data.frame()
    colnames(nas) = "percentage_NAs"
    write.xlsx(nas, file = paste(res_dir, "/NAs_percentages.xlsx", sep=""), row.names = T, append = T, sheetName = "Validation")
}

createDescribeTrainingTwoClasses = function(cancer_type, tissue, res_dir, gains=T){
    if (!file.exists(res_dir)){
        dir.create(file.path(res_dir))
    }
    ## Get data from the database
    d = getTable(tb=cancer_type)
    ## Mark training observations before you scale
    d = markTrainTwoClasses(df=d, tissue=tissue, gains = gains)
    ## Save training and validation set without scaling (Feature refinement as well)
    training_ns = d %>% subset(!is.na(type))
    # training_ns = cleanFeatures(df=training_ns, ct=cancer_type)
    training_ns = cleanFeatures(df=training_ns, paste0(res_dir,"/config.txt"))
    training_ns = training_ns %>% select(-no_NSI_muts, -private_domains, -commondomains, -High, -Low, -Medium, -NotExpressed)
    if(length(grep("training_set_noScale.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(training_ns, file=paste(res_dir,"/training_set_noScale.Rdata", sep=""))
    }
    validation_ns = d %>% subset(is.na(type)) %>% select(-type)
    #     validation_ns = cleanFeatures(df=validation_ns, ct=cancer_type)
    validation_ns = cleanFeatures(df=validation_ns, paste0(res_dir,"/config.txt"))
    validation_ns = validation_ns %>% select(-no_NSI_muts, -private_domains, -commondomains, -High, -Low, -Medium, -NotExpressed)
    if(length(grep("validation_set_noScale.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(validation_ns, file=paste(res_dir,"/validation_set_noScale.Rdata", sep=""))
    }

    ## Scale dataset
    d = getScaledTable(df = d)
    ## Clean and prepare it for the SVM
    # d = cleanFeatures(df=d, ct=cancer_type)
    d = cleanFeatures(df=d, paste0(res_dir,"/config.txt"))
    ## Feature refinement
    d = d %>% select(-no_NSI_muts, -private_domains, -commondomains, -High, -Low, -Medium, -NotExpressed)
    ## Separate training and prediction
    training = d %>% subset(!is.na(type))
    ## if training is not there save it
    if(length(grep("training_set.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(training, file=paste(res_dir,"/training_set.Rdata", sep=""))
    }
    validation = d %>% subset(is.na(type)) %>% select(-type)
    ## Save validation set
    if(length(grep("validation_set.Rdata",list.files(res_dir, recursive = F, full.names=F)))==0){
        save(validation, file=paste(res_dir,"/validation_set.Rdata", sep=""))
    }

    ## Produce statistics for training
    number_of_categories = training_ns %>% select(type) %>% unique %>% nrow
    stats = rbind(ddply(training_ns,.(type),numcolwise(quantile,na.rm = TRUE)) %>% mutate(statistic=rep(c("min", "q1", "median", "q3", "max"), number_of_categories)),
                  ddply(training_ns,.(type),numcolwise(mean,na.rm = TRUE)) %>% mutate(statistic=rep(c("mean"), number_of_categories)))
    write.table(stats, file = paste(res_dir, "/training_summary_contFeatures_noScale.tsv", sep=""), row.names = F, quote = FALSE, sep="\t")
    stats = stats %>% gather(feature, value, -type, -statistic) %>% spread(statistic, value)
    #     pdf(file=paste(res_dir, "/training_numerical_summary_noScale.pdf" , sep=""), h= 16, w=20)
    #     p = ggplot(stats, aes(x=factor(feature), ymin = min, lower = q1, middle = median, upper = q3, ymax = max)) +geom_boxplot(stat = "identity") + scale_y_log10()
    #     print(p)
    #     dev.off()

    stats = rbind(ddply(training,.(type),numcolwise(quantile,na.rm = TRUE)) %>% mutate(statistic=rep(c("min", "q1", "median", "q3", "max"), number_of_categories)),
                  ddply(training_ns,.(type),numcolwise(mean,na.rm = TRUE)) %>% mutate(statistic=rep(c("mean"), number_of_categories)))
    write.table(stats, file = paste(res_dir, "/training_summary_contFeatures_Scale.tsv", sep=""), row.names = F, quote = FALSE, sep="\t")
    stats = stats %>% gather(feature, value, -type, -statistic) %>% spread(statistic, value)
    #     pdf(file=paste(res_dir, "/training_numerical_summary_Scale.pdf" , sep=""), h= 16, w=20)
    #     p = ggplot(stats, aes(x=factor(feature), ymin = min, lower = q1, middle = median, upper = q3, ymax = max)) + geom_boxplot(stat = "identity") + scale_y_log10()
    #     print(p)
    #     dev.off()
    ## This will be the same to bot scaled and unscaled training because categorical stay as they are independently of the scaling
    if(gains){
        stats = rbind(training_ns %>% count(type, Genic) %>% rename(categories=Genic, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="Genic"),
                      training %>% count(type, memberofcomplex) %>% rename(categories=memberofcomplex, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="memberofcomplex"),
                      training %>% count(type, WGD) %>% rename(categories=WGD, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="WGD"),
                      training %>% count(type, hub) %>% rename(categories=hub, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="hub"),
                      training %>% count(type, central) %>% rename(categories=central, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="central"),
                      training %>% count(type, CNVGain) %>% rename(categories=CNVGain, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="CNVGain"),
                      training %>% count(type, CNVLoss) %>% rename(categories=CNVLoss, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="CNVLoss"),
                      training %>% count(type, ExpT_ME) %>% rename(categories=ExpT_ME, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_ME"),
                      training %>% count(type, ExpT_HE) %>% rename(categories=ExpT_HE, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_HE"),
                      training %>% count(type, ExpT_LE) %>% rename(categories=ExpT_LE, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_LE"),
                      training %>% count(type, ExpT_NE) %>% rename(categories=ExpT_NE, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_NE"),
                      training %>% count(type, ExpT_NET) %>% rename(categories=ExpT_NET, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_NET"),
                      training %>% count(type, old) %>% rename(categories=old, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="old"),
                      training %>% count(type, young) %>% rename(categories=young, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="young"),
                      training %>% count(type, luca) %>% rename(categories=luca, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="luca"),
                      training %>% count(type, eukaryotes) %>% rename(categories=eukaryotes, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="eukaryotes"),
                      training %>% count(type, metazoans) %>% rename(categories=metazoans, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="metazoans"),
                      training %>% count(type, vertebrates) %>% rename(categories=vertebrates, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="vertebrates"),
                      training %>% count(type, opisthokonts) %>% rename(categories=opisthokonts, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="opisthokonts"),
                      training %>% count(type, mammals) %>% rename(categories=mammals, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="mammals"),
                      training %>% count(type, selective) %>% rename(categories=selective, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="selective"),
                      training %>% count(type, always_expressed) %>% rename(categories=always_expressed, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="always_expressed"),
                      training %>% count(type, middle) %>% rename(categories=middle, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="middle"),
                      training %>% count(type, one_tissue) %>% rename(categories=one_tissue, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="one_tissue"),
                      training %>% count(type, never_expressed) %>% rename(categories=never_expressed, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="never_expressed")
        )
    }else{
        stats = rbind(training_ns %>% count(type, Genic) %>% rename(categories=Genic, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="Genic"),
                      training %>% count(type, memberofcomplex) %>% rename(categories=memberofcomplex, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="memberofcomplex"),
                      training %>% count(type, WGD) %>% rename(categories=WGD, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="WGD"),
                      training %>% count(type, hub) %>% rename(categories=hub, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="hub"),
                      training %>% count(type, central) %>% rename(categories=central, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="central"),
                      training %>% count(type, CNVLoss) %>% rename(categories=CNVLoss, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="CNVLoss"),
                      training %>% count(type, ExpT_ME) %>% rename(categories=ExpT_ME, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_ME"),
                      training %>% count(type, ExpT_HE) %>% rename(categories=ExpT_HE, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_HE"),
                      training %>% count(type, ExpT_LE) %>% rename(categories=ExpT_LE, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_LE"),
                      training %>% count(type, ExpT_NE) %>% rename(categories=ExpT_NE, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_NE"),
                      training %>% count(type, ExpT_NET) %>% rename(categories=ExpT_NET, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="ExpT_NET"),
                      training %>% count(type, old) %>% rename(categories=old, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="old"),
                      training %>% count(type, young) %>% rename(categories=young, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="young"),
                      training %>% count(type, luca) %>% rename(categories=luca, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="luca"),
                      training %>% count(type, eukaryotes) %>% rename(categories=eukaryotes, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="eukaryotes"),
                      training %>% count(type, metazoans) %>% rename(categories=metazoans, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="metazoans"),
                      training %>% count(type, vertebrates) %>% rename(categories=vertebrates, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="vertebrates"),
                      training %>% count(type, opisthokonts) %>% rename(categories=opisthokonts, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="opisthokonts"),
                      training %>% count(type, mammals) %>% rename(categories=mammals, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="mammals"),
                      training %>% count(type, selective) %>% rename(categories=selective, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="selective"),
                      training %>% count(type, always_expressed) %>% rename(categories=always_expressed, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="always_expressed"),
                      training %>% count(type, middle) %>% rename(categories=middle, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="middle"),
                      training %>% count(type, one_tissue) %>% rename(categories=one_tissue, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="one_tissue"),
                      training %>% count(type, never_expressed) %>% rename(categories=never_expressed, occurrences=n) %>% mutate(perc=occurrences/sum(occurrences)) %>% mutate(feature="never_expressed")
        )

    }


    write.table(stats, file = paste(res_dir, "/training_summary_catFeatures.tsv", sep=""), row.names = F, quote = FALSE, sep="\t")
    #     pdf(file=paste(res_dir, "/training_categorical_summary.pdf" , sep=""), h= 16, w=20)
    #     p = ggplot(stats, aes(x=factor(feature), y=perc*100, fill=factor(categories))) + geom_bar(stat="identity") +
    #         theme_boss() + theme(
    #             axis.text.x = element_text(angle=90, size=12),
    #             axis.text.y = element_text(size=12),
    #             axis.title.x = element_blank(),
    #             axis.title.y = element_text(size = 12),
    #             legend.title = element_blank()
    #         ) + scale_fill_brewer(palette="Set1") +
    #         ylab("Number of occurences (%)")
    #     print(p)
    #     dev.off()

}

## Take the unscaled training and validation, combine them and infer mean and sd for each feature
## Then scale new incoming samples and predict using the best model from the corresponding cancer type
## Probably subset for what you consider driver first
## Example usage:
# scalePredictNewSamples(cancer_type_path = "/Volumes/mourikisa/novel_driver_prediction/ESCA/vogel_114_training",
#                        newSamples = oac, kern=c("linear", "radial", "polynomial","sigmoid"), mynu = c(0.15, 0.15, 0.05, 0.15), mygamma = c(0,0.0078125,0.015625,0.03125),
#                        mydegree = c(0,0,3,0), gains = F, fp_dir="/Volumes/mourikisa/data/NCG_false_positives.txt",
#                        res_dir = "/Volumes/mourikisa/novel_driver_prediction/OAC/prediction_using_best_ESCA/vogel_114_training", install_NCG = F)
scalePredictNewSamples = function(cancer_type_path, newSamples, kern=c("linear", "radial", "polynomial", "sigmoid"), mynu=c(0,0,0,0),
                                  mygamma=c(0,0,0,0), mydegree=c(0,0,0,0), gains=T, fp_dir="/mnt/lustre/users/k1469280/mourikisa/data/NCG_false_positives.txt", res_dir,
                                  install_NCG=F){

    models = data.frame(kern=kern, mynu=mynu, mygamma=mygamma, mydegree=mydegree)

    dev_mode()
    if(install_NCG) install(NCG_path)
    library(ncglib)
    geneInfo=get_geneinfo(version="NCG5")
    cancerGenes = get_cancer_genes(version="NCG5")
    dev_mode()

    ## Fix gene info table from NCG
    geneInfo = geneInfo %>% select(entrez, cancer_type) %>% unique
    ## Get a cancer gene with all the associated primary sites and cancer sites
    cancerGenes = cancerGenes %>% select(entrez, primary_site, cancer_site) %>%
        group_by(entrez) %>% summarise(primary_site=paste(unique(primary_site), collapse=","),
                                       cancer_site=paste(unique(cancer_site), collapse=",")) %>%
        ungroup
    geneInfo = geneInfo %>% left_join(cancerGenes, by=c("entrez"))

    ## Get false positive genes
    false_positive_genes <- read.table(fp_dir, header = T, sep = "\t")
    false_positive_genes <- false_positive_genes %>% select(entrez, symbol)
    ## For now lets include only driver alterations - based on our definition
    if(gains){
        newSamples = newSamples %>% subset(no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | (CNVGain==1 & Copy_number>=4) |
                                                             (CNVLoss==1 & Copy_number==0) | ((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)) & !(Entrez%in%false_positive_genes$entrez))
    }else{
        newSamples = newSamples %>% subset(no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 |
                                                             (CNVLoss==1 & Copy_number==0) | ((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)) & !(Entrez%in%false_positive_genes$entrez))
    }

    ## training_ns
    load(paste0(cancer_type_path, "/training_set_noScale.Rdata"))
    ## validation_ns
    load(paste0(cancer_type_path, "/validation_set_noScale.Rdata"))
    ns = rbind(training_ns%>%select(-type), validation_ns)
    ## training
    load(paste0(cancer_type_path, "/training_set.Rdata"))
    ## validation
    load(paste0(cancer_type_path, "/validation_set.Rdata"))

    ## Get mean and sd of the corresponding cancer type
    stats = as.data.frame(lapply(ns, mean)) %>% gather(feature, mean) %>% subset(!is.na(mean)) %>% left_join(
        as.data.frame(lapply(ns, sd)) %>% gather(feature, sd) %>% subset(!is.na(sd)), by=c("feature")
    )

    ## Scale using the mean and sd of the corresponding cancer type
    x = newSamples%>%select(one_of(as.character(stats$feature)))
    x = as.list(x)
    y = as.list(stats$mean)
    names(y) = stats$feature
    z = as.list(stats$sd)
    names(z) = stats$feature

    ## Add factor columns etc and reorder the columns using the order of the unscaled data frame
    newSamples_scaled = cbind(data.frame(mapply( function(x,y,z) (x-y)/z, x, y, z, SIMPLIFY = F )), newSamples[,!(names(newSamples) %in% stats$feature)])
    newSamples_scaled = newSamples_scaled%>%select(one_of(names(newSamples)))

    ## Prepare new samples for prediction
    newSamples_scaled = cleanFeatures(df=newSamples_scaled, paste0(cancer_type_path,"/config.txt"))
    ## Feature refinement
    newSamples_scaled = newSamples_scaled %>% select(-no_NSI_muts, -private_domains, -commondomains, -High, -Low, -Medium, -NotExpressed)

    ## What features should be excluded
    config = read.table(paste0(cancer_type_path,"/config.txt"), header = F)
    config = config[10:nrow(config), 1]
    s = c("cancer_type", "sample", "entrez", "no_TRUNC_muts", "no_NTDam_muts",
          "no_GOF_muts", "Copy_number", "CNVLoss", "CNVGain")
    s = s[!(s%in%config)]

    newSamples = newSamples %>% rename(cancer_type=Cancer_type,
                                       sample=Sample, entrez=Entrez) %>%
        select(one_of(s))

    prediction_table = NULL
    training_size = nrow(training)
    training_genes = training%>%tibble::rownames_to_column()%>%separate(rowname, into=c("Cancer_type", "sample", "entrez"), sep="\\.")%>%select(entrez)%>%unique%>%.$entrez
    training_samples = training%>%tibble::rownames_to_column()%>%separate(rowname, into=c("Cancer_type", "sample", "entrez"), sep="\\.")%>%select(sample)%>%unique%>%.$sample
    prediction_size = nrow(newSamples_scaled)
    prediction_genes = newSamples_scaled%>%tibble::rownames_to_column()%>%separate(rowname, into=c("Cancer_type", "sample", "entrez"), sep="\\.")%>%select(entrez)%>%unique%>%.$entrez
    prediction_samples = newSamples_scaled%>%tibble::rownames_to_column()%>%separate(rowname, into=c("Cancer_type", "sample", "entrez"), sep="\\.")%>%select(sample)%>%unique%>%.$sample

    for (i in 1:nrow(models)){
        ## Get the model
        if (models$kern[i]=="linear"){
            svm.model <- svm(type ~ ., data = training, kernel=models$kern[i], type="one-classification", scale=FALSE, nu=models$mynu[i])
        }else if (models$kern[i]=="radial"){
            svm.model <- svm(type ~ ., data = training, kernel=models$kern[i], type="one-classification", scale=FALSE, gamma=models$mygamma[i], nu=models$mynu[i])
        }else if (models$kern[i]=="polynomial"){
            svm.model <- svm(type ~ ., data = training, kernel=models$kern[i], type="one-classification", scale=FALSE, gamma=models$mygamma[i], nu=models$mynu[i], degree=models$mydegree[i])
        }else if (models$kern[i]=="sigmoid"){
            svm.model <- svm(type ~ ., data = training, kernel=models$kern[i], type="one-classification", scale=FALSE, gamma=models$mygamma[i], nu=models$mynu[i])
        }

        ## Asses training sensitivity
        trainset = training %>% select(type)
        trainset$fitted = svm.model$fitted
        trainset = trainset %>% select(type, fitted)
        trainset = trainset %>% mutate(f=ifelse(fitted==TRUE, "C", "N"))
        trainset$type = factor(as.character(trainset$type), levels = c("C", "N"))
        trainset$f = factor(as.character(trainset$f), levels = c("C", "N"))
        ## Confusion matrix
        cM <- confusionMatrix(trainset$f,trainset$type, positive = c("C"))
        training_sensitivity = cM$byClass["Sensitivity"]

        ## Predict on the new samples using the model
        predictions = predict(svm.model, newSamples_scaled, decision.values = TRUE)

        ## Add probability from Platt
        predictions_platt = plattScaling(svm.model, predictions)
        colnames(predictions_platt) = c("dv", "label", "prob_platt")
        preds = predictions_platt %>% tibble::rownames_to_column()

        ## Take information from gene info
        preds = preds %>%
            separate(rowname, into=c("cancer_type", "sample", "entrez"), sep="\\.") %>%
            mutate(entrez=as.numeric(entrez)) %>%
            left_join(geneInfo%>%rename(gene_type=cancer_type), by=c("entrez"))

        ## Join also with systems level properties from the unscaled data frame
        preds = preds %>% left_join(newSamples, by=c("cancer_type", "sample", "entrez"))

        ## Save preds
        if (models$kern[i]=="linear"){
            analysis = paste(models$kern[i], ".", models$mynu[i], sep="")
            save(svm.model, file=paste(res_dir,"/svm.model_",analysis, ".Rdata", sep=""))
            save(preds, file=paste(res_dir,"/predictions_",analysis, ".Rdata", sep=""))
        }else if (models$kern[i]=="radial"){
            analysis = paste(models$kern[i], ".", models$mynu[i], ".", models$mygamma[i], sep="")
            save(svm.model, file=paste(res_dir,"/svm.model_",analysis, ".Rdata", sep=""))
            save(preds, file=paste(res_dir, "/predictions_",analysis, ".Rdata", sep=""))
        }else if (models$kern[i]=="polynomial"){
            analysis = paste(models$kern[i], ".", models$mynu[i], ".", models$mygamma[i], ".", models$mydegree[i], sep="")
            save(svm.model, file=paste(res_dir,"/svm.model_",analysis, ".Rdata", sep=""))
            save(preds, file=paste(res_dir, "/predictions_",analysis, ".Rdata", sep=""))
        }else if (models$kern[i]=="sigmoid"){
            analysis = paste(models$kern[i], ".", models$mynu[i], ".", models$mygamma[i], sep="")
            save(svm.model, file=paste(res_dir,"/svm.model_",analysis, ".Rdata", sep=""))
            save(preds, file=paste(res_dir, "/predictions_",analysis, ".Rdata", sep=""))
        }

        ## Get some stats for the predictions
        df = data.frame(analysis=analysis, nu=models$mynu[i],
                        gamma=models$mygamma[i], degree=models$mydegree[i],
                        novel_drivers_entries=preds%>%subset(label==TRUE)%>%nrow,
                        novel_drivers_genes=preds%>%subset(label==TRUE)%>%select(entrez)%>%unique%>%nrow,
                        novel_drivers_genes_inNCG5 = preds%>%subset(label==TRUE & (gene_type=="cgc" | gene_type=="can"))%>%select(entrez)%>%unique%>%nrow,
                        novel_drivers_genes_rst = preds%>%subset(label==TRUE & gene_type=="rst")%>%select(entrez)%>%unique%>%nrow,
                        novel_drivers_entries_platt=preds%>%subset(label==TRUE & prob_platt >0.95)%>%nrow,
                        novel_drivers_genes_platt=preds%>%subset(label==TRUE & prob_platt >0.95)%>%select(entrez)%>%unique%>%nrow,
                        novel_drivers_in_samples_mutated = preds%>%subset(label==TRUE & prob_platt >0.95)%>%select(sample)%>%unique%>%nrow,
                        novel_drivers_genes_platt_inNCG5=preds%>%subset(label==TRUE & prob_platt >0.95 & (gene_type=="cgc" | gene_type=="can"))%>%select(entrez)%>%unique%>%nrow,
                        novel_drivers_genes_platt_rst=preds%>%subset(label==TRUE & prob_platt >0.95 & gene_type=="rst")%>%select(entrez)%>%unique%>%nrow,
                        novel_drivers_genes_platt_rst_in_samples_mutated=preds%>%subset(label==TRUE & prob_platt >0.95 & gene_type=="rst")%>%select(sample)%>%unique%>%nrow
        )

        ## Read the cv_stats to get test sensitivity
        cv_stats = read.table(file=paste0(cancer_type_path, "/cv_stat_summary.tsv"), header = T, sep="\t")
        cv_stats = cv_stats %>% subset(kernel==models$kern[i] & type=="Sensitivity" & set=="test") %>% select(analysis, min, q1, median, mean, q3, max)
        df = df %>% left_join(cv_stats, by=c("analysis"))
        prediction_table = rbind(prediction_table, df)
    }


    ## Excel configuration
    require(xlsx)
    wb<-createWorkbook(type="xlsx")

    TEXT_STYLE           <- CellStyle(wb) + Font(wb,  heightInPoints=14)
    TABLE_COLNAMES_STYLE <- CellStyle(wb) + Font(wb, isBold=TRUE) +
        Alignment(rotation=45, horizontal="ALIGN_CENTER") +
        Border(color="black", position=c("TOP", "LEFT","BOTTOM", "RIGHT"), pen=rep("BORDER_THIN",4))


    write_text_in_cell = function(sheet, rowIndex, text, textStyle){
        rows <-createRow( sheet, rowIndex=rowIndex )
        sheetText <-createCell( rows, colIndex=1 )
        setCellValue( sheetText[[1,1]], text )
        setCellStyle( sheetText[[1,1]], textStyle )
    }

    sheet <- createSheet(wb, sheetName = "predictions")

    text = with( prediction_table, paste0("TRAINING SET. Number_of_genes = ", length(training_genes), " ; Samples = ", length(training_samples), " ; Total_entries = ", training_size))
    write_text_in_cell(sheet, 1, text, TEXT_STYLE)

    text = with( prediction_table, paste0("TEST SET.     Number_of_genes = ", length(prediction_genes), " ; Samples = ", length(prediction_samples), " ; Total_entries = ", prediction_size))
    write_text_in_cell(sheet, 2, text, TEXT_STYLE)

    to_remove = c("training_genes","training_samples",'training_set_size','prediction_genes','prediction_samples','prediction_set_size')

    rownames(prediction_table) = NULL

    addDataFrame(prediction_table[, !colnames(prediction_table)%in%to_remove ], sheet, startRow=3, startColumn=1, colnamesStyle = TABLE_COLNAMES_STYLE, row.names = F)

    saveWorkbook(wb,file=paste(res_dir,"/predictions.xlsx", sep=""))

}

## Get the table for each cancer type from the MySQL database
getTable = function(tb="LAML",user="db_user",pass="db_pass",db="tcga_31cts_prepared",...){

    cat(paste("Retrieving data from ",db," MySQL database...",sep=""), "\n")
    ## Get in which node you are
    node = Sys.info()[["nodename"]]
    temp2 <- gregexpr("[0-9]+", node)
    node = as.numeric(unique(unlist(regmatches(node, temp2))))
    if (node==42 | node==43){
        mydb = dbConnect(MySQL(), user=user,
                         password=pass,
                         dbname=db,
                         host='ibnode041')
    }else if (node==41){
        mydb = dbConnect(MySQL(), user='root',
                         dbname=db)
    }

    #tables = dbListTables(mydb)
    rs = dbSendQuery(mydb, paste("select * from ", tb, sep=""))
    data = fetch(rs, n=-1)

    ## Fix factors and NAs in vogel
    ## Keep in mind that factor levels may be different from the pancancer data if all levels are not represented in this cancer type
    fcols <- c("Genic", "memberofcomplex",
               "WGD", "hub", "central", "CNVGain", "CNVLoss",
               "ExpT_ME", "ExpT_HE", "ExpT_LE", "ExpT_NE",
               "ExpT_NET", "old", "young", "luca", "eukaryotes",
               "metazoans", "vertebrates", "opisthokonts",
               "mammals", "primates","selective", "always_expressed",
               "middle", "one_tissue", "never_expressed")
    cols <- which(colnames(data) %in% fcols)
    for(i in cols){
        ## Check unique values to see the levels
        ## If there is only one level explicilty define at least 2
        if (length(unique(data[,i]))<2){
            data[,i] = factor(data[,i], levels = c("0","1"))
        }else{
            data[,i] = factor(data[,i])
        }

    }

    return(data)
}

## Mark the training set because after scaling you cannot filter (e.g no_TRUNC_muts!=0)
markTrainNovelty <- function(df, tissue=NULL, fp_dir="/mnt/lustre/users/k1469280/mourikisa/data/NCG_false_positives.txt", gains=T,...){

    if (is.null(tissue)){
        stop("Please provide tissue to get training set")
    }

  ## To take trainig set based on the candidate genes for each cancer type
  ## Load geneInfo.Rdata and cancerGenes.Rdata from /mnt/lustre/users/k1469280/mourikisa/data in athena
  load("/mnt/lustre/users/k1469280/mourikisa/data/geneInfoNCG5.Rdata")
  load("/mnt/lustre/users/k1469280/mourikisa/data/cancerGenesNCG5.Rdata")

  ## Get information for cgc/cans and primary sites
  ## Fix gene info table from NCG
  geneInfo = geneInfo %>% select(entrez, cancer_type) %>% unique
  ## Get a cancer gene with all the associated primary sites and cancer sites
  cancerGenes = cancerGenes %>% select(entrez, primary_site, cancer_site) %>%
    group_by(entrez) %>% summarise(primary_site=paste(unique(primary_site), collapse=","),
                                   cancer_site=paste(unique(cancer_site), collapse=",")) %>%
    ungroup
  geneInfo = geneInfo %>% left_join(cancerGenes, by=c("entrez")) %>%
      rename(gene_type=cancer_type)

  ## Get false positive genes
  false_positive_genes <- read.table(fp_dir, header = T, sep = "\t")
  false_positive_genes <- false_positive_genes %>% select(entrez, symbol)

  ## Get primary site information for the df
  df = df %>% left_join(geneInfo, by=c("Entrez"="entrez"))

  ## Training set extraction
  cat("Marking genes for training...", "\n")
  #df = df %>% mutate(TP=ifelse((vogel=="Vog.Oncogene" | vogel=="Vog.TS") & (no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | CNVGain==1 | CNVLoss==1 | ExpT_NET==1), T, F))
  #if( df%>%grepl(tissue, primary_site)%>%nrow==0 ) stop("Tissue not present")

  if(gains){
    df = df %>% mutate(TP=ifelse((grepl(tissue, primary_site)) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | (CNVGain==1 & Copy_number>=4) |
                                    (CNVLoss==1 & Copy_number==0) | ((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0))) & !(Entrez%in%false_positive_genes$entrez), T, F))
  } else {
    df = df %>% mutate(TP=ifelse((grepl(tissue, primary_site)) &
                                   (no_TRUNC_muts!=0 |
                                    no_NTDam_muts!=0 |
                                    no_GOF_muts!=0  |
                                   (CNVLoss==1 & Copy_number==0) |
                                  ((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0))) & !(Entrez%in%false_positive_genes$entrez), T, F))

  }
  #df = df %>% mutate(TN=ifelse((Entrez%in%false_positive_genes$entrez) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | CNVGain==1 | CNVLoss==1 | ExpT_NET==1), T, F))
  #df = df %>% mutate(TN2=ifelse((vogel=="Vog.Oncogene" | vogel=="Vog.TS") & (no_TRUNC_muts==0 & no_NTDam_muts==0 & no_GOF_muts==0 & CNVGain==0 & CNVLoss==0 & ExpT_NET==0), T, F))
  cat("Adding type annotation for training...", "\n")
  #df = df %>% mutate(type=ifelse(TP==T & vogel=="Vog.Oncogene", "O", ifelse(TP==T & vogel=="Vog.TS", "T",
  #                                                                          ifelse(TN==T, "N", NA))))

  ## Some of the cancer specific genes are also in the true negatives list (in NCG we only show a warning)
  #df = df %>% mutate(type=ifelse((TP==T & TN==F), "C", ifelse((TN==T & TP==F) | (TN==T & TP==T), "N", NA)))
  df = df %>% mutate(type=ifelse(TP==T, "C", NA))

  df = df %>% select(-TP)
  ## For cancer-specific training set
  df = df %>% select(-gene_type, -primary_site, -cancer_site)
  df$type = as.factor(df$type)
  ## Exclude genes without any alterations from training & prediction set
  if(gains){
    df = df %>% subset(no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | (CNVGain==1 & Copy_number>=4) |
                         (CNVLoss==1 & Copy_number==0) | ((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)) & !(Entrez%in%false_positive_genes$entrez))
  }else{
    df = df %>% subset(no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 |
                       (CNVLoss==1 & Copy_number==0) | ((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)) & !(Entrez%in%false_positive_genes$entrez))
  }
  return(df)
}

markTrainVogel <- function(df, fp_dir="/mnt/lustre/users/k1469280/mourikisa/data/NCG_false_positives.txt", gains=T,...){

    ## To take trainig set based on the candidate genes for each cancer type
    ## Load geneInfo.Rdata and cancerGenes.Rdata from /mnt/lustre/users/k1469280/mourikisa/data in athena
    load("/mnt/lustre/users/k1469280/mourikisa/data/geneInfoNCG5.Rdata")
    load("/mnt/lustre/users/k1469280/mourikisa/data/cancerGenesNCG5.Rdata")

    ## Get information for cgc/cans and primary sites
    ## Fix gene info table from NCG
    geneInfo = geneInfo %>% select(entrez, cancer_type) %>% unique
    ## Get a cancer gene with all the associated primary sites and cancer sites
    cancerGenes = cancerGenes %>% select(entrez, primary_site, cancer_site) %>%
        group_by(entrez) %>% summarise(primary_site=paste(unique(primary_site), collapse=","),
                                       cancer_site=paste(unique(cancer_site), collapse=",")) %>%
        ungroup
    geneInfo = geneInfo %>% left_join(cancerGenes, by=c("entrez")) %>%
        rename(gene_type=cancer_type)

    ## Get false positive genes
    false_positive_genes <- read.table(fp_dir, header = T, sep = "\t")
    false_positive_genes <- false_positive_genes %>% select(entrez, symbol)

    ## Get primary site information for the df
    df = df %>% left_join(geneInfo, by=c("Entrez"="entrez"))

    ## Training set extraction
    cat("Marking genes for training...", "\n")
    #df = df %>% mutate(TP=ifelse((vogel=="Vog.Oncogene" | vogel=="Vog.TS") & (no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | CNVGain==1 | CNVLoss==1 | ExpT_NET==1), T, F))
    #if( df%>%grepl(tissue, primary_site)%>%nrow==0 ) stop("Tissue not present")

    if(gains){
        df = df %>% mutate(TP=ifelse((vogel=="Vog.Oncogene" | vogel=="Vog.TS") & (no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | (CNVGain==1 & Copy_number>=4) |
                                                                          (CNVLoss==1 & Copy_number==0) | ((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0))) & !(Entrez%in%false_positive_genes$entrez), T, F))
    } else {
        df = df %>% mutate(TP=ifelse((vogel=="Vog.Oncogene" | vogel=="Vog.TS") &
                                         (no_TRUNC_muts!=0 |
                                              no_NTDam_muts!=0 |
                                              no_GOF_muts!=0  |
                                              (CNVLoss==1 & Copy_number==0) |
                                              ((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0))) & !(Entrez%in%false_positive_genes$entrez), T, F))

    }
    #df = df %>% mutate(TN=ifelse((Entrez%in%false_positive_genes$entrez) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | CNVGain==1 | CNVLoss==1 | ExpT_NET==1), T, F))
    #df = df %>% mutate(TN2=ifelse((vogel=="Vog.Oncogene" | vogel=="Vog.TS") & (no_TRUNC_muts==0 & no_NTDam_muts==0 & no_GOF_muts==0 & CNVGain==0 & CNVLoss==0 & ExpT_NET==0), T, F))
    cat("Adding type annotation for training...", "\n")
    #df = df %>% mutate(type=ifelse(TP==T & vogel=="Vog.Oncogene", "O", ifelse(TP==T & vogel=="Vog.TS", "T",
    #                                                                          ifelse(TN==T, "N", NA))))

    ## Some of the cancer specific genes are also in the true negatives list (in NCG we only show a warning)
    #df = df %>% mutate(type=ifelse((TP==T & TN==F), "C", ifelse((TN==T & TP==F) | (TN==T & TP==T), "N", NA)))
    df = df %>% mutate(type=ifelse(TP==T, "C", NA))

    df = df %>% select(-TP)
    ## For cancer-specific training set
    df = df %>% select(-gene_type, -primary_site, -cancer_site)
    df$type = as.factor(df$type)
    ## Exclude genes without any alterations from training & prediction set
    if(gains){
        df = df %>% subset(no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | (CNVGain==1 & Copy_number>=4) |
                               (CNVLoss==1 & Copy_number==0) | ((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)) & !(Entrez%in%false_positive_genes$entrez))
    }else{
        df = df %>% subset(no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 |
                               (CNVLoss==1 & Copy_number==0) | ((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)) & !(Entrez%in%false_positive_genes$entrez))
    }
    return(df)
}

markTrainCGC <- function(df, fp_dir="/mnt/lustre/users/k1469280/mourikisa/data/NCG_false_positives.txt",
                         geneInfo_fn="/mnt/lustre/users/k1469280/mourikisa/data/geneInfoNCG5.Rdata",
                         cancerGenes_fn="/mnt/lustre/users/k1469280/mourikisa/data/cancerGenesNCG5.Rdata",
                         gains=T, ...){

    ## To take trainig set based on the candidate genes for each cancer type
    ## Load geneInfo.Rdata and cancerGenes.Rdata from /mnt/lustre/users/k1469280/mourikisa/data in athena
    load(geneInfo_fn)
    load(cancerGenes_fn)

    ## Get information for cgc/cans and primary sites
    ## Fix gene info table from NCG
    geneInfo = geneInfo %>% select(entrez, cancer_type) %>% unique
    ## Get a cancer gene with all the associated primary sites and cancer sites
    cancerGenes = cancerGenes %>% select(entrez, primary_site, cancer_site) %>%
        group_by(entrez) %>% summarise(primary_site=paste(unique(primary_site), collapse=","),
                                       cancer_site=paste(unique(cancer_site), collapse=",")) %>%
        ungroup
    geneInfo = geneInfo %>% left_join(cancerGenes, by=c("entrez")) %>%
        rename(gene_type=cancer_type)

    ## Get false positive genes
    false_positive_genes <- read.table(fp_dir, header = T, sep = "\t")
    false_positive_genes <- false_positive_genes %>% select(entrez, symbol)

    ## Get primary site information for the df
    df = df %>% left_join(geneInfo, by=c("Entrez"="entrez"))

    ## Training set extraction
    cat("Marking genes for training...", "\n")
    #df = df %>% mutate(TP=ifelse((vogel=="Vog.Oncogene" | vogel=="Vog.TS") & (no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | CNVGain==1 | CNVLoss==1 | ExpT_NET==1), T, F))
    #if( df%>%grepl(tissue, primary_site)%>%nrow==0 ) stop("Tissue not present")

    if(gains){
        df = df %>% mutate(TP=ifelse((gene_type=="cgc") & (no_TRUNC_muts!=0 |
                                                               no_NTDam_muts!=0 |
                                                               no_GOF_muts!=0 |
                                                               (CNVGain==1) |
                                                               (Copy_number==0) |
                                                               ((Copy_number==1) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)) |
                                                               BND!=0 |
                                                               INS!=0 |
                                                               INV!=0) & !(Entrez%in%false_positive_genes$entrez), T, F))
    } else {
        df = df %>% mutate(TP=ifelse((gene_type=="cgc") &
                                         (no_TRUNC_muts!=0 |
                                              no_NTDam_muts!=0 |
                                              no_GOF_muts!=0 |
                                              (Copy_number==0) |
                                              ((Copy_number==1) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)) |
                                              BND!=0 |
                                              INS!=0 |
                                              INV!=0) & !(Entrez%in%false_positive_genes$entrez), T, F))

    }
    #df = df %>% mutate(TN=ifelse((Entrez%in%false_positive_genes$entrez) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | CNVGain==1 | CNVLoss==1 | ExpT_NET==1), T, F))
    #df = df %>% mutate(TN2=ifelse((vogel=="Vog.Oncogene" | vogel=="Vog.TS") & (no_TRUNC_muts==0 & no_NTDam_muts==0 & no_GOF_muts==0 & CNVGain==0 & CNVLoss==0 & ExpT_NET==0), T, F))
    cat("Adding type annotation for training...", "\n")
    #df = df %>% mutate(type=ifelse(TP==T & vogel=="Vog.Oncogene", "O", ifelse(TP==T & vogel=="Vog.TS", "T",
    #                                                                          ifelse(TN==T, "N", NA))))

    ## Some of the cancer specific genes are also in the true negatives list (in NCG we only show a warning)
    #df = df %>% mutate(type=ifelse((TP==T & TN==F), "C", ifelse((TN==T & TP==F) | (TN==T & TP==T), "N", NA)))
    df = df %>% mutate(type=ifelse(TP==T, "C", NA))

    df = df %>% select(-TP)
    ## For cancer-specific training set
    df = df %>% select(-gene_type, -primary_site, -cancer_site)
    df$type = as.factor(df$type)

    ## Get negatives
    negatives = df %>% subset((no_TRUNC_muts==0 &
                                   no_NTDam_muts==0 &
                                   no_GOF_muts==0 &
                                   CNVGain==0 &
                            CNVLoss==0) | Entrez%in%false_positive_genes$entrez)

    ## Exclude genes without any alterations from training & prediction set  (I now added the exclusion of the NCG FPs as an option by default they are not excluded)
    if(gains){
        # df = df %>% subset((no_TRUNC_muts!=0 |
        #                        no_NTDam_muts!=0 |
        #                        no_GOF_muts!=0 |
        #                        (CNVGain==1) |
        #                        (Copy_number==0) |
        #                        ((Copy_number==1) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)) |
        #                        BND!=0 |
        #                        INS!=0 |
        #                        INV!=0) & !(Entrez%in%false_positive_genes$entrez))
        df = df %>% subset((no_TRUNC_muts!=0 |
                                no_NTDam_muts!=0 |
                                no_GOF_muts!=0 |
                                (CNVGain==1) |
                                (Copy_number==0) |
                                ((Copy_number==1) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)) |
                                BND!=0 |
                                INS!=0 |
                                INV!=0) )
    }else{
        # df = df %>% subset((no_TRUNC_muts!=0 |
        #                        no_NTDam_muts!=0 |
        #                        no_GOF_muts!=0 |
        #                        (Copy_number==0) |
        #                        ((Copy_number==1) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)) |
        #                        BND!=0 |
        #                        INS!=0 |
        #                        INV!=0) & !(Entrez%in%false_positive_genes$entrez))
        df = df %>% subset((no_TRUNC_muts!=0 |
                                no_NTDam_muts!=0 |
                                no_GOF_muts!=0 |
                                (Copy_number==0) |
                                ((Copy_number==1) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)) |
                                BND!=0 |
                                INS!=0 |
                                INV!=0) )
    }
    return(list(tp=df, n=negatives))
}

markTrainTwoClasses <- function(df, tissue=NULL, fp_dir="/mnt/lustre/users/k1469280/mourikisa/data/NCG_false_positives.txt", gains=T,...){

    if (is.null(tissue)){
        stop("Please provide tissue to get training set")
    }

    ## To take trainig set based on the candidate genes for each cancer type
    ## Load geneInfo.Rdata and cancerGenes.Rdata from /mnt/lustre/users/k1469280/mourikisa/data in athena
    load("/mnt/lustre/users/k1469280/mourikisa/data/geneInfoNCG5.Rdata")
    load("/mnt/lustre/users/k1469280/mourikisa/data/cancerGenesNCG5.Rdata")

    ## Get information for cgc/cans and primary sites
    ## Fix gene info table from NCG
    geneInfo = geneInfo %>% select(entrez, cancer_type) %>% unique
    ## Get a cancer gene with all the associated primary sites and cancer sites
    cancerGenes = cancerGenes %>% select(entrez, primary_site, cancer_site) %>%
        group_by(entrez) %>% summarise(primary_site=paste(unique(primary_site), collapse=","),
                                       cancer_site=paste(unique(cancer_site), collapse=",")) %>%
        ungroup
    geneInfo = geneInfo %>% left_join(cancerGenes, by=c("entrez"))

    ## Get false positive genes
    false_positive_genes <- read.table(fp_dir, header = T, sep = "\t")
    false_positive_genes <- false_positive_genes %>% select(entrez, symbol)

    ## Get primary site information for the df
    df = df %>% left_join(geneInfo, by=c("Entrez"="entrez"))

    ## Training set extraction
    cat("Marking genes for training...", "\n")
    #df = df %>% mutate(TP=ifelse((vogel=="Vog.Oncogene" | vogel=="Vog.TS") & (no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | CNVGain==1 | CNVLoss==1 | ExpT_NET==1), T, F))
    #if( df%>%grepl(tissue, primary_site)%>%nrow==0 ) stop("Tissue not present")

    if(gains){
        df = df %>% mutate(TP=ifelse((grepl(tissue, primary_site)) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | (CNVGain==1 & Copy_number>=4) |
                                                                          (CNVLoss==1 & Copy_number==0) | ((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0))) & !(Entrez%in%false_positive_genes$entrez), T, F))
    } else {
        df = df %>% mutate(TP=ifelse((grepl(tissue, primary_site)) &
                                         (no_TRUNC_muts!=0 |
                                              no_NTDam_muts!=0 |
                                              no_GOF_muts!=0  |
                                              (CNVLoss==1 & Copy_number==0) |
                                              ((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0))) & !(Entrez%in%false_positive_genes$entrez), T, F))

    }
    #df = df %>% mutate(TN=ifelse((Entrez%in%false_positive_genes$entrez) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | CNVGain==1 | CNVLoss==1 | ExpT_NET==1), T, F))
    #df = df %>% mutate(TN2=ifelse((vogel=="Vog.Oncogene" | vogel=="Vog.TS") & (no_TRUNC_muts==0 & no_NTDam_muts==0 & no_GOF_muts==0 & CNVGain==0 & CNVLoss==0 & ExpT_NET==0), T, F))
    cat("Adding type annotation for training...", "\n")
    #df = df %>% mutate(type=ifelse(TP==T & vogel=="Vog.Oncogene", "O", ifelse(TP==T & vogel=="Vog.TS", "T",
    #                                                                          ifelse(TN==T, "N", NA))))

    ## Some of the cancer specific genes are also in the true negatives list (in NCG we only show a warning)
    #df = df %>% mutate(type=ifelse((TP==T & TN==F), "C", ifelse((TN==T & TP==F) | (TN==T & TP==T), "N", NA)))
    df = df %>% mutate(type=ifelse(TP==T, "C", NA))

    df = df %>% select(-TP)
    ## For cancer-specific training set
    df = df %>% select(-cancer_type, -primary_site, -cancer_site)
    df$type = as.factor(df$type)
    ## Exclude genes without any alterations from training & prediction set
    if(gains){
        df = df %>% subset(no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | (CNVGain==1 & Copy_number>=4) |
                               (CNVLoss==1 & Copy_number==0) | ((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)) & !(Entrez%in%false_positive_genes$entrez))
    }else{
        df = df %>% subset(no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 |
                               (CNVLoss==1 & Copy_number==0) | ((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)) & !(Entrez%in%false_positive_genes$entrez))
    }
    return(df)
}

markTrainCclass <- function(df, fp_dir="/mnt/lustre/users/k1469280/mourikisa/data/NCG_false_positives.txt", ...){

    ## To take trainig set based on the candidate genes for each cancer type
    ## Load geneInfo.Rdata and cancerGenes.Rdata from /mnt/lustre/users/k1469280/mourikisa/data in athena
    load("/mnt/lustre/users/k1469280/mourikisa/data/geneInfoNCG5_2.Rdata")
    load("/mnt/lustre/users/k1469280/mourikisa/data/cancerGenesNCG5_2.Rdata")

    ## Get information for cgc/cans and primary sites
    ## Fix gene info table from NCG
    geneInfo = geneInfo %>% select(entrez, cancer_type) %>% unique
    ## Get a cancer gene with all the associated primary sites and cancer sites
    cancerGenes = cancerGenes %>% select(entrez, primary_site, cancer_site) %>%
        group_by(entrez) %>% summarise(primary_site=paste(unique(primary_site), collapse=","),
                                       cancer_site=paste(unique(cancer_site), collapse=",")) %>%
        ungroup
    geneInfo = geneInfo %>% left_join(cancerGenes, by=c("entrez"))

    ## Get false positive genes
    false_positive_genes <- read.table(fp_dir, header = T, sep = "\t")
    false_positive_genes <- false_positive_genes %>% select(entrez, symbol)

    ## Get primary site information for the df
    df = df %>% left_join(geneInfo, by=c("Entrez"="entrez"))

    ## Training set extraction
    cat("Marking genes for training...", "\n")
    #df = df %>% mutate(TP=ifelse((vogel=="Vog.Oncogene" | vogel=="Vog.TS") & (no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | CNVGain==1 | CNVLoss==1 | ExpT_NET==1), T, F))
    df = df %>% mutate(TP=ifelse((grepl("blood", primary_site)) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | (CNVGain==1 & Copy_number>=4) |
                                                                       (CNVLoss==1 & Copy_number==0) | ((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0))), T, F))
    #df = df %>% mutate(TN1=ifelse(Entrez%in%false_positive_genes$entrez & , T, F))
    df = df %>% mutate(TN=ifelse(grepl("blood", primary_site) & TP==F, T, F))
    cat("Adding type annotation for training...", "\n")
    #df = df %>% mutate(type=ifelse(TP==T & vogel=="Vog.Oncogene", "O", ifelse(TP==T & vogel=="Vog.TS", "T",
    #                                                                          ifelse(TN==T, "N", NA))))

    ## Some of the cancer specific genes are also in the true negatives list (in NCG we only show a warning)
    df = df %>% mutate(type=ifelse((TP==T & TN==F), "C", ifelse(TN==T & TP==F, "N", NA)))

    df = df %>% select(-TP, -TN)
    ## For cancer-specific training set
    df = df %>% select(-cancer_type, -primary_site, -cancer_site)
    df$type = as.factor(df$type)
    ## Exclude genes without any alterations from training & prediction set
    df = df %>% subset((no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | (CNVGain==1 & Copy_number>=4) |
                           (CNVLoss==1 & Copy_number==0) | ((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0))) | !is.na(type))
    return(df)
}

## Scaling
getScaledTable <- function(df, excludeColumn=c( "Cancer_type", "Sample", "Entrez","type", "vogel"), type="cs",means_sds=NULL, ...){ #

    ## There is no is.nan for data frames!
    is.nan.data.frame <- function(x){do.call(cbind, lapply(x, is.nan))}

    cat("--> getScaledTable: All factors are excluded from scaling...\n")

    factor_cols <- names(sapply(df, function(x) class(x))[sapply(df, function(x) class(x))=="factor"])

    df_f <- df[,names(df)%in%c(excludeColumn,factor_cols)]

    df <-df[,!(names(df)%in%c(excludeColumn,factor_cols))]

    if(is.null(means_sds)){
        ## Get mean and sd for scaling new observations
        mean_sd = sapply(df, function(cl) list(means=mean(cl,na.rm=TRUE), sds=sd(cl,na.rm=TRUE)))

        cat("Scaling...", "\n")
        if (type=="cs"){
            df <- data.frame(lapply(df, function(x) (x-mean(x, na.rm = TRUE))/sd(x, na.rm = TRUE)))
            df <- cbind(df, df_f)
        }else if(type=="range"){
            df <- data.frame(lapply(df, function(x) rescale(x, to=c(-1,1))))
            df <- cbind(df, df_f)
        }
    }else{
        mean_sd = NULL
        cat("Scaling...", "\n")
        if (type=="cs"){
            df <- data.frame(lapply(seq_along(df), function(y,n,i) {
                (y[i]-unlist(means_sds["means",n[i]]))/unlist(means_sds["sds",n[i]])
            }, y=df, n=names(df)))
            df <- cbind(df, df_f)
        }
    }

    return(list(df_scaled=df, mean_sd=mean_sd))
}

write.config = function(x=NULL, res_dir, fname="config.txt"){
  if (!file.exists(res_dir)){
    dir.create(file.path(res_dir))
  }
  y = c('Cancer_type', 'Sample', 'Entrez', 'no_NSI_muts','no_NTDamFunction_muts', 'no_NTDamCons_muts', 'no_NTDamSC_muts','TPM', 'vogel', 'inCDD')
  if (length(x)>0) y = c(y,x)
  write.table(y, file=paste0(res_dir,'/',fname), sep="\t", row.names = F, col.names = F, quote=F)
}

## Clean features
cleanFeatures = function(df, config=NULL){
  # cat("Cleaning features for SVM model...\n")
  if(!is.null(config)){
    if(file.exists(config)){
      to_remove = read.delim(config, sep="\t", header=F)[,1]
      if(length(to_remove)>0){
         df = df %>% mutate(rowname=paste(Cancer_type, Sample, Entrez, sep="."))
        # df = df %>% mutate(rowname=paste(Sample, Entrez, sep="."))

        df = df[,which(!colnames(df)%in%to_remove)]

      }
    }
  }else{
    cat("--> cleanFeatures: it seems you don't have a cleaning config file...i'm going with defalut cleaning\n")
    df = df %>% mutate(rowname=paste(Cancer_type, Sample, Entrez, sep=".")) %>%
      select(-Cancer_type, -Sample, -Entrez, -no_NSI_muts, -no_NTDamFunction_muts, -no_NTDamCons_muts, -no_NTDamSC_muts,
             -TPM, -vogel, -inCDD)
  }

    rnames= df$rowname
    df = df %>% select(-rowname)
    df = data.frame(df)
    row.names(df) = rnames
    return(df)
}

## Check training distributions
checkTraining = function(df, saveDir){
    factor_cols <- names(sapply(df, function(x) class(x))[sapply(df, function(x) class(x))=="factor"])
    pdf(file=paste(saveDir,"/training_feature_dists.pdf",sep=""),height = unit(12, "inches"), width = unit(20, "inches"), useDingbats = F)
    total_stats = NULL
    for (c in colnames(df)[!(colnames(df)%in%c(factor_cols, "type"))]){
        print(c)
        a = df[,c(c,"type")]
        stats = data.frame(
        mi = summary(a[,1])[[1]],
        fQ = summary(a[,1])[[2]],
        md = summary(a[,1])[[3]],
        mn = summary(a[,1])[[4]],
        tQ = summary(a[,1])[[5]],
        mx = summary(a[,1])[[6]],
        row.names = c
        )
        total_stats = rbind(total_stats, stats)
        p = ggplot(a, aes_string(x=colnames(a)[2], y=colnames(a)[1],fill = colnames(a)[2])) +
                geom_boxplot() +
                theme(
                    axis.title.x = element_blank(),
                    axis.text.x = element_text(size = 12, color="black"),
                    axis.text.y = element_text(size = 12)
                ) +
                ylab(paste(colnames(a)[1],"(n)",sep="")) +
                ggtitle(colnames(a)[1])
        print(p)
    }
    write.table(total_stats, file=paste(saveDir,"/training_feature_dists.tsv",sep=""), quote = F, row.names = T, sep = "\t")
    for (c in colnames(df)[colnames(df)%in%c(factor_cols, "type")]){
        a = df[,c(c,"type")]
        a = table(a) %>% data.frame()
        p = ggplot(a, aes_string(x=colnames(a)[2], y="Freq",fill = colnames(a)[1])) +
            geom_bar(stat = "identity", position = "dodge") +
            theme(
                axis.title.x = element_blank(),
                axis.text.x = element_text(size = 12, color="black"),
                axis.text.y = element_text(size = 12)
            ) +
            ylab(paste(colnames(a)[1],"(n)",sep="")) +
            ggtitle(colnames(a)[1])
        print(p)
    }
    dev.off()
}

## Get a table with the commands to run in athena
## Starting range nu: seq(0.05, 0.9, 0.05)
## Starting range gamma: 2^seq(-7,4)
runCommands <- function(ct, kern, mynu, mygamma, mydegree, save_dir, fname="jobs.txt", ncore=1, res_dir = NULL, source_dir=NULL){
    if(is.null(res_dir)) res_dir = paste0(save_dir,'/',ct)
    jobName = paste0(kern, "_", ct)
    sub <- paste0("qsub -N ", jobName," -pe smp ",ncore," -l h_vmem=12G /mnt/lustre/users/k1469280/mourikisa/data/OAC/submitNoveltyDetection.sh")
    params <- expand.grid(mynu, mygamma, mydegree)
    params$p <- paste(params$Var1, params$Var2, params$Var3, sep=" ")
    df <- NULL
    for (p in unique(params$p)){
      cmd <- paste(sub, ct, kern, p, res_dir, source_dir, sep=" ")
      df <- rbind(df, cmd)
    }
    write.table(df, sep="\t", quote=F, row.names=F, col.names=F, file=paste0(save_dir,"/",fname))
}

## Implementation of Nelder-Mead optimization to find parameters giving best cross-validation
nmOptimizeLinear = function(training, startingPoints=3){
  x = training %>% subset(type=="C") %>% dplyr::select(-type) %>% data.matrix()
  y = rep(1, nrow(x))

  ## one-class SVM
  ocSVM <- function(x, y, xtest, ytest, mynu) {
    svm.model <- svm(x,y,kernel="linear", type="one-classification", scale=FALSE, nu=mynu)
    predict(svm.model, xtest, decision.values=T)
  }
  cv <- cv.setup(x, y, score=accuracy, num_folds = 3, num_iter = 3)
  ## Choose 100 random starting points of nu
  starting_nu = runif(startingPoints, min=0, max=0.9)
  optima = list()
  pb <- txtProgressBar(min = 0, max = startingPoints, style = 3)
  for (i in 1:startingPoints){
    Sys.sleep(0.1)
    res <- cv.nelder_mead(cv, ocSVM, mynu = starting_nu[i], num_evals = 10, maximize=TRUE)
    optima[paste(i,"_optimum", sep="")] = res$optimum
    optima[paste(i,"_starting_nu", sep="")] = starting_nu[i]
    optima[paste(i,"_solution_nu", sep="")] = res$solution
    setTxtProgressBar(pb, i)
  }
  close(pb)
  return(optima)
}

nmOptimizeRadial = function(training, startingPoints=3){
  x = training %>% subset(type=="C") %>% dplyr::select(-type) %>% data.matrix()
  y = rep(1, nrow(x))

  ## one-class SVM
  ocSVM <- function(x, y, xtest, ytest, mynu, mygamma) {
    svm.model <- svm(x,y,kernel="radial", type="one-classification", scale=FALSE, nu=mynu, gamma=mygamma)
    predict(svm.model, xtest, decision.values=T)
  }
  cv <- cv.setup(x, y, score=accuracy, num_folds = 3, num_iter = 3)
  ## Choose random starting points of nu
  starting_nu = runif(startingPoints, min=0, max=0.9)
  starting_gamma = 2^seq(-6,3)
  stps = expand.grid(starting_nu, starting_gamma)
  colnames(stps) = c("nu", "gamma")
  optima = list()
  pb <- txtProgressBar(min = 0, max = nrow(stps), style = 3)
  for (i in 1:nrow(stps)){
    Sys.sleep(0.1)
    res <- cv.nelder_mead(cv, ocSVM, mynu = stps$nu[i], mygamma=stps$gamma[i], num_evals = 10, maximize=TRUE)
    optima[paste(i,"_optimum", sep="")] = res$optimum
    optima[paste(i,"_starting_nu", sep="")] = stps$nu[i]
    optima[paste(i,"_starting_gamma", sep="")] = stps$gamma[i]
    optima[paste(i,"_solution_nu", sep="")] = res$solution$mynu
    optima[paste(i,"_solution_gamma", sep="")] = res$solution$mygamma
    setTxtProgressBar(pb, i)
  }
  close(pb)
  return(optima)
}

## After the new implementation in Novelty detection for each model I create the
## stats, so this function simply concatenates everything
gatherMetrics = function(res_dir){
  total_cv_stats = NULL
  modelDirs = list.dirs(res_dir, recursive=F)
  for (md in modelDirs){
    if (length(grep("cv_stats.tsv", list.files(md, recursive=F)))!=0){
      df = read.table(paste(md, "/cv_stats.tsv", sep=""), header = T, sep="\t")
      if (!("degree" %in% colnames(df))){
          df = df %>% mutate(degree=0) %>%
              select(analysis,kernel,nu,gamma,degree,iteration,type,trainSize,class,set,value)
      }
      total_cv_stats = rbind(total_cv_stats, df)
    }else{
      cat(paste(md, " has no cv_stats.tsv", sep=""), "\n")
    }
  }

  return(total_cv_stats)
}


## Given a directory gather results of cross-validation
getCVStats <- function(res_dir, svmmode="C-classification", classes=2){

    cat("Creating the folder list...", "\n")
    cv_dirs <- list.dirs(res_dir, recursive = F)
    all_fns <- NULL
    ## Make the list of the directories to be gathered
    for (cv_dir in cv_dirs){
        setwd(cv_dir)
        iter_dirs <- list.dirs(cv_dir, recursive = F)
        for (iter_dir in iter_dirs){
            cat(iter_dir, "\n")
            setwd(iter_dir)
            ## Check if all 4 files are there (for cases where the cross-validation still running)
            fns <- c("trainset.Rdata", "testset.Rdata", "svmModel.Rdata", "prediction.Rdata")
            if (sum(file.exists(fns), na.rm=TRUE)==4){
                ## Load the trainset, testset, model & prediciton
                load("testset.Rdata")
                load("prediction.Rdata")

                ## Check point for prediction hault
                if(length(prediction)!=nrow(testset)){
                    cat(paste(iter_dir," skipped! Not equal number of predictions and test set observations!", sep=""), "\n")
                    next
                }else{
                    all_fns <- c(all_fns, iter_dir)
                }

            }else{
                cat(paste(iter_dir, " skipped! Not all the files are present or the structur eof directory is wrong!", sep=""), "\n")
                next
            }
        }
    }

    cat("initiating data mining from cross-validation results...", "\n")
    ## Run it in parallel
    total_cv_stats <- foreach(i=all_fns, .combine=rbind) %dopar% {
        cv_analysis <- tail(unlist(strsplit(i, split = "/")), n=2)[1]
        setwd(i)
        cat(i, "\n")
        iter <- as.numeric(tail(unlist(strsplit(i, split = "\\.")), n=1))
        cv_stats <- NULL
        load("trainset.Rdata")
        load("testset.Rdata")
        load("svmModel.Rdata")
        load("prediction.Rdata")

        noTrain = trainset %>% nrow
        ## Concatenate the predictions to the true values
        testset$prediction = prediction
        testset = testset %>% select(type, prediction)
        ## For C-classification/one-classification
        if (svmmode=="one-classification"){
            testset = testset %>% mutate(p=ifelse(prediction==TRUE, "C", "N"))
        }else if (svmmode=="C-classification"){
            testset$p = testset$prediction
        }

        ## Performance metrics

        ## ---------------------- Test set -------------------------------------
        ## Confusion matrix
        if (classes==3){
            cM <- confusionMatrix(testset$p,testset$type, positive = c("T", "O"))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="accuracy", trainSize=noTrain,
                                                   class="overall", set="test",
                                                   value=cM$overall[["Accuracy"]]))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="Sensitivity", trainSize=noTrain,
                                                   class="N", set="test",
                                                   value=cM$byClass["Class: N","Sensitivity"]))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="Sensitivity", trainSize=noTrain,
                                                   class="T", set="test",
                                                   value=cM$byClass["Class: T","Sensitivity"]))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="Sensitivity", trainSize=noTrain,
                                                   class="O", set="test",
                                                   value=cM$byClass["Class: O","Sensitivity"]))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="Specificity", trainSize=noTrain,
                                                   class="N", set="test",
                                                   value=cM$byClass["Class: N","Specificity"]))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="Specificity", trainSize=noTrain,
                                                   class="T", set="test",
                                                   value=cM$byClass["Class: T","Specificity"]))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="Specificity", trainSize=noTrain,
                                                   class="O", set="test",
                                                   value=cM$byClass["Class: O","Specificity"]))
        }else if (classes==2){
            cM <- confusionMatrix(testset$p,testset$type, positive = c("C"))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="accuracy", trainSize=noTrain,
                                                   class="overall", set = "test",
                                                   value=cM$overall[["Accuracy"]]))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="Sensitivity", trainSize=noTrain,
                                                   class="overall", set = "test",
                                                   value=cM$byClass["Sensitivity"]))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="Specificity", trainSize=noTrain,
                                                   class="overall", set = "test",
                                                   value=cM$byClass["Specificity"]))
        }

        ## Area under the curve
        #auc <- multiclass.roc(as.numeric(pred$type), as.numeric(pred$prediction), levels=c(1,2,3))$auc[1]


        ## ---------------------- Train set -------------------------------------
        ## Confusion matrix
        trainset$fitted = svm.model$fitted
        trainset = trainset %>% select(type, fitted)
        ## For C-classification/one-classification
        if (svmmode=="one-classification"){
            trainset = trainset %>% mutate(f=ifelse(fitted==TRUE, "C", "N"))
        }else if (svmmode=="C-classification"){
            trainset$f = trainset$fitted
        }

        ## Confusion matrix
        if (classes==3){
            cM <- confusionMatrix(trainset$f,trainset$type, positive = c("T", "O"))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="accuracy", trainSize=noTrain,
                                                   class="overall", set="train",
                                                   value=cM$overall[["Accuracy"]]))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="Sensitivity", trainSize=noTrain,
                                                   class="N", set="train",
                                                   value=cM$byClass["Class: N","Sensitivity"]))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="Sensitivity", trainSize=noTrain,
                                                   class="T", set="train",
                                                   value=cM$byClass["Class: T","Sensitivity"]))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="Sensitivity", trainSize=noTrain,
                                                   class="O", set="train",
                                                   value=cM$byClass["Class: O","Sensitivity"]))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="Specificity", trainSize=noTrain,
                                                   class="N", set="train",
                                                   value=cM$byClass["Class: N","Specificity"]))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="Specificity", trainSize=noTrain,
                                                   class="T", set="train",
                                                   value=cM$byClass["Class: T","Specificity"]))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="Specificity", trainSize=noTrain,
                                                   class="O", set="train",
                                                   value=cM$byClass["Class: O","Specificity"]))
        }else if (classes==2){
            cM <- confusionMatrix(trainset$f,trainset$type, positive = c("C"))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="accuracy", trainSize=noTrain,
                                                   class="overall", set = "train",
                                                   value=cM$overall[["Accuracy"]]))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="Sensitivity", trainSize=noTrain,
                                                   class="overall", set = "train",
                                                   value=cM$byClass["Sensitivity"]))
            cv_stats <- rbind(cv_stats, data.frame(analysis=cv_analysis, iteration=iter,
                                                   type="Specificity", trainSize=noTrain,
                                                   class="overall", set = "train",
                                                   value=cM$byClass["Specificity"]))
        }
        return(cv_stats)
    }
}

plotCVStats = function(total_cv_stats, saveDir){

    #iterations per analsyis
    an2iter <- total_cv_stats %>% select(analysis, iteration) %>% unique %>% group_by(analysis) %>% summarise(n=n()) %>% ungroup %>% rename(iterations=n)

    cv_stats_summary <- total_cv_stats %>% subset(!is.na(value)) %>% group_by(analysis, kernel, nu, gamma, type, set) %>% summarize(min=min(value),
                                                                                q1=quantile(value)[2],
                                                                                median=median(value),
                                                                                mean=mean(value),
                                                                                q3=quantile(value)[4],
                                                                                max=max(value)) %>% ungroup

    cv_stats_summary <- cv_stats_summary %>% left_join(an2iter, by="analysis")

    ## Plot overall sensitivity and specificity across different parameters
    #overall <- cv_stats_summary %>% subset(Cancer_type=="overall")
    overall <- cv_stats_summary %>% subset(type=="Sensitivity" | type=="Specificity")
    overall <- overall %>% select(analysis, kernel, nu, gamma, type, set, median) %>% spread(type, median)

    ## Plot test stats
    for (k in unique(overall$kernel)){

        df = overall %>% subset(kernel==k & set=="test")
        an2sy = df %>% select(analysis) %>% unique
        an2sy$symbol <- 1:nrow(an2sy)
        df = df %>% left_join(an2sy, by=c("analysis"))
        pdf(file=paste(saveDir,"/",k,"_CV_Test_ROCspace.pdf",sep=""),height = unit(12, "inches"), width = unit(20, "inches"), useDingbats = F)
        p <- ggplot(df,aes(x=1-Specificity,y=Sensitivity,group=analysis,shape=analysis)) +
        geom_text(aes(label=symbol))+
        #geom_point(size=5) +
        #geom_jitter(position = position_jitter(), size=5) +
        scale_x_continuous(limit=c(0,1)) +
        scale_y_continuous(limit=c(0,1)) +
        geom_abline(intercept = 0, slope =1, lty="dashed", color="red") +
        ggtitle(paste("ROC space (",k,")", sep = "")) +
        xlab("FPR (1-Specificity)") +
        ylab("TPR (Sensitivity)") +
        theme(
            axis.title.x=element_text(size=14),
            axis.title.y=element_text(size=14),
            axis.text.x=element_text(colour="black", size=12),
            axis.text.y=element_text(colour = "black", size=12),
            axis.line = element_line(colour = "black"),
            plot.title=element_text(size = 16),
            legend.text=element_text(size=16),
            legend.key.size = unit(3, "mm"),
            panel.grid.major = element_blank(),
            panel.grid.minor = element_blank(),
            panel.border = element_blank(),
            panel.background = element_blank()
        ) +
        scale_shape_manual(values = df$symbol)
        print(p)
        dev.off()
        write.table(df, file=paste(saveDir,"/",k,"_CV_Test_ROCspace.tsv",sep=""), quote = F, row.names=F, sep="\t")
    }

    ## Plot train stats
    for (k in unique(overall$kernel)){
      df = overall %>% subset(kernel==k & set=="train")
      an2sy = df %>% select(analysis) %>% unique
      an2sy$symbol <- 1:nrow(an2sy)
      df = df %>% left_join(an2sy, by=c("analysis"))
      pdf(file=paste(saveDir,"/",k,"_CV_Train_Sensitivity.pdf",sep=""),height = unit(12, "inches"), width = unit(20, "inches"), useDingbats = F)
      p = ggplot(df, aes(x=factor(analysis), y=Sensitivity)) +
        geom_text(aes(label=symbol)) +
        theme(
          axis.title.x = element_blank(),
          axis.text.x = element_text(angle=90, size=12),
          axis.text.y = element_text(size=12)
        ) + ggtitle(paste("Training sensitivity (",k,")", sep = ""))
      print(p)
      dev.off()
      write.table(df, file=paste(saveDir,"/",k,"_CV_Train_Sensitivity.tsv",sep=""), quote = F, row.names=F, sep="\t")
    }
}

################################################
# Platt scaling given a prediction
################################################
## For the moment the sigmoid is trained on the training values of the iteration selected
## and assess the predictions made in the validation set using this model
plattScaling = function(model, predictions){
  ## Impementation was taken from pseudocode in:
  ## Probabilistic Outputs for Support Vector Machines and Comparisons to Regularized Likelihood Methods, Platt, 1999
  ## Construct the training set for Platt's sigmoid

  decision_values = model$decision.values %>% as.vector()
  labels = model$fitted %>% as.vector()
  df = data.frame(dv=decision_values, lbs = labels)

  ## the prediction at the moment is a predict object coming from svm predict
  dv = attr(predictions, "decision.values") %>% as.vector()
  preds = data.frame(dv = dv)
  preds$label = as.vector(predictions)
  rownames(preds) = names(predictions)

  ## Input parameters
  out = df$dv
  target = df$lbs
  prior1 = df%>%subset(lbs==TRUE)%>%nrow
  prior0 = df%>%subset(lbs==FALSE)%>%nrow
  ## Output
  ## A, B = parameters of sigmoid

  A = 0
  B = log((prior0+1)/(prior1+1))
  hiTarget = (prior1+1)/(prior1+2)
  loTarget = 1/(prior0+2)
  lambda = 1e-3
  olderr = 1e300


  pp = rep((prior1+1)/(prior0+prior1+2), nrow(df))
  count = 0

  for (it in 1:100){
    a=0
    b=0
    c=0
    d=0
    e=0

    ## First compute the Hessian & gradient of error function
    ## with respect to A & B
    for (i in 1:length(pp)){
      if (target[i]==TRUE){
        t = hiTarget
      }else{
        t = loTarget
      }

      d1 = pp[i]-t
      d2 = pp[i]*(1-pp[i])
      a = a + out[i]*out[i]*d2
      b = b + d2
      c = c + out[i]*d2
      d = d + out[i]*d1
      e = e + d1
    }
    ## If gradient is really tiny, then stop
    if ((abs(d) < 1e-9) & (abs(e) < 1e-9)){break}

    oldA = A
    oldB = B
    err = 0

    ## Loop until goodness of fit increases
    while(TRUE){
      det = (a+lambda)*(b+lambda)-c*c
      if (det==0){ ## If determinant if Hessian is 0
        ## Increase stabilizer
        lambda = lambda*10
        next
      }
      A = oldA + ((b+lambda)*d-c*e)/det
      B = oldB + ((a+lambda)*e-c*d)/det

      ## Now compute the goodness of fit
      err = 0
      for (i in 1:length(pp)){
        p = 1/(1+exp(out[i]*A+B))
        pp[i] = p
        ## At this step, make sure log(0) returns -200
        if (p<=1.383897e-87){
          err = err - t*(-200)+(1-t)*log(1-p)
        }else if (p==1){
          err = err - t*log(p)+(1-t)*(-200)
        }else{
          err = err - t*log(p)+(1-t)*log(1-p)
        }
        if(err==-Inf) browser()
      }
      if (err < olderr*(1+1e-7)){
        lambda = lambda*0.1
        break
      }
      ## Error did not decrease: increase stabilizer by factor of 10
      ## and try again
      lambda = lambda* 10
      if (lambda >= 1e6){ ## something is broken. Give up
        break
      }
    }
    diff = err-olderr
    scale = 0.5*(err+olderr+1)
    if (diff > -1e-3*scale & diff < 1e-7*scale){
      count = count + 1
    }else{
      count = 0
    }
    olderr = err
    if (count==3){
      break
    }


  }

  applyPlatt = function(x){
    p = 1/(1+exp(x*A+B))
  }

  preds$prob = apply(preds, 1, function(x) applyPlatt(x[1]))
  return(preds)
}

##########
# Binning
##########
binning = function(model, predictions){

  ## Make a dataframe with decision values and labes from the training object
  df = data.frame(label=model$fitted, dv=as.vector(model$decision.values))
  ## Number of bins
  bins = 10
  dv = sort(df$dv, decreasing=FALSE)
  ## Find the bounds of each bin in the training vector
  ## Carefull this function returns the elements of the vector encoded as ranges
  dv_encoded = cut(dv, breaks=bins)
  bin_bounds = unique(cut(dv, breaks=bins))
  probs = NULL
  for (i in 1:length(bin_bounds)){
    bin = bin_bounds[i]
    lowerLimit = as.numeric(gsub("\\(", "", unlist(strsplit(as.character(bin), ","))[1]))
    upperLimit = as.numeric(gsub("\\]", "", unlist(strsplit(as.character(bin), ","))[2]))

    labelMatrix = df %>% subset(dv >= lowerLimit & dv < upperLimit) %>% count(label)
    if (length(unique(labelMatrix$label))==2){
      prob = labelMatrix$n[labelMatrix$label==TRUE]/sum(labelMatrix$n)
    }else if (length(unique(labelMatrix$label))==1){
      if (unique(labelMatrix$label)==FALSE){
        prob = 0
      }else if (unique(labelMatrix$label)==TRUE){
        prob = 1
      }
    }
    probs = c(probs, prob)
  }
  probsDF = data.frame(bin=bin_bounds, prob=probs)
  probsDF = probsDF %>% separate(bin, into = c("lowLimit", "upLimit"), sep=",", remove = FALSE) %>%
    mutate(lowLimit=as.numeric(gsub("\\(", "", lowLimit)),
           upLimit=as.numeric(gsub("\\]", "", upLimit)))

  ## Get the prediction decision values
  preds = attr(predictions, "decision.values")[,1] %>% data.frame()
  preds$label = as.vector(predictions)
  colnames(preds) = c("dv", "label")

  getBinProb = function(x){
    prob = subset(probsDF, x >= lowLimit & x < upLimit)$prob[1]
    ## if prob = NA it means that the value is outside of the bins
    ## Assign it to the closest bin
    ## Get the maximum and the minimum
    maximum = max(probsDF$upLimit)
    minimum = min(probsDF$lowLimit)
    if (is.na(prob)){
      if (x>maximum){
        prob = 1
      }else if (x<minimum){
        prob = 0
      }
    }
    return(prob)
  }
  preds$prob = sapply(preds$dv, function(x) getBinProb(x))
  return(preds)
}

## After inspection of the ROC space, we decide the best model based on the CV
## I give the parameters to the function, train the model on the whole training
## training set and predicts on the validation set
## the function also implements probability estimation using Platt's algorithm
## and binning
## For Platt training set at the moment I use the decision values of the training
## You need to open the tunnel to NCG5 brfore running the function
bestModelPredict  =function(kern, mynu, mygamma, trainingPath, validationPath, validation_nsPath){

  dev_mode()
  install("/Users/fc-8s-imac/ncg-data-update/ncglib/")
  library(ncglib)
  geneInfo=get_geneinfo()
  cancerGenes = get_cancer_genes()
  dev_mode()

  ## Load training and validation sets
  load(trainingPath)
  load(validationPath)
  load(validation_nsPath)
  ## Fix gene info table from NCG
  geneInfo = geneInfo %>% select(entrez, cancer_type) %>% unique
  ## Get a cancer gene with all the associated primary sites and cancer sites
  cancerGenes = cancerGenes %>% select(entrez, primary_site, cancer_site) %>%
    group_by(entrez) %>% summarise(primary_site=paste(unique(primary_site), collapse=","),
                                   cancer_site=paste(unique(cancer_site), collapse=",")) %>%
    ungroup
  geneInfo = geneInfo %>% left_join(cancerGenes, by=c("entrez"))

  if (kern=="linear"){
    mygamma = NULL
  }

  ## Training also contains "N" class
  training = training%>%subset(type=="C")

  ## Train on the whole set
  if (kern=="linear"){
    svm.model <- svm(type ~ ., data = training, kernel=kern, type="one-classification", scale=FALSE, nu=mynu)
  }else if (kern=="radial"){
    svm.model <- svm(type ~ ., data = training, kernel=kern, type="one-classification", scale=FALSE, gamma = mygamma, nu=mynu)
  }

  ## make prediction
  predictions = predict(svm.model, validation, decision.values = TRUE)
  ## add probability from Platt and Binning
  predictions_platt = plattScaling(svm.model, predictions)
  colnames(predictions_platt) = c("dv", "label", "prob_platt")
  #predictions_binning = binning(svm.model, predictions)
  #colnames(predictions_binning) = c("dv", "label", "prob_binning")
  preds = predictions_platt %>% tibble::rownames_to_column() #%>%
  #left_join(predictions_binning%>%add_rownames(), by=c("rowname","dv", "label"))

  ## Take information from gene info
  preds = preds %>%
    separate(rowname, into=c("cancer_type", "sample", "entrez"), sep="\\.") %>%
    mutate(entrez=as.numeric(entrez)) %>%
    left_join(geneInfo%>%rename(gene_type=cancer_type), by=c("entrez"))

  ## Join also with systems level properties from validaton set
  validation_ns = validation_ns %>% tibble::rownames_to_column() %>%
      separate(rowname, into=c("cancer_type", "sample", "entrez"), sep="\\.") %>%
      mutate(entrez=as.numeric(entrez)) %>%
      select(cancer_type, sample, entrez, no_TRUNC_muts, no_NTDam_muts,
             no_GOF_muts, Copy_number, CNVLoss, CNVGain)
  preds = preds %>% left_join(validation_ns, by=c("cancer_type", "sample", "entrez"))

  return(preds)
}

## Exactly the same as bestModelPredict but gives predictions for the whole grid
getGridPred = function(kern, mynu, mygamma, mydegree, dataPath, res_dir, tissue, gains=T, gof=T, install_NCG=TRUE, NCG_path="/Users/fc-8s-imac/ncg-data-update/ncglib/",fname_gene_prop = "Rdata/geneProperties.Rdata"){
    if (!file.exists(res_dir)){
        dir.create(file.path(res_dir))
    }

    ## What features should be excluded
    config = read.table(paste(dataPath, "/config.txt", sep=""), header = F)
    config = config[10:nrow(config), 1]
    s = c("cancer_type", "sample", "entrez", "no_ALL_muts","no_TRUNC_muts", "no_NTDam_muts",
          "no_GOF_muts", "Copy_number", "CNVLoss", "CNVGain")
    s = s[!(s%in%config)]


    dev_mode()
    if(install_NCG) install(NCG_path)
    library(ncglib)
    geneInfo=get_geneinfo(version="NCG5")
    cancerGenes = get_cancer_genes(version="NCG5")
    dev_mode()

    ## Load training and validation sets
    load(paste(dataPath, "/training_set.Rdata", sep=""))
    load(paste(dataPath, "/validation_set.Rdata", sep=""))
    load(paste(dataPath, "/validation_set_noScale.Rdata", sep=""))


    validation_ns = validation_ns %>% tibble::rownames_to_column() %>%
        separate(rowname, into=c("cancer_type", "sample", "entrez"), sep="\\.") %>%
        mutate(entrez=as.numeric(entrez)) %>%
        select(one_of(s))


    ## Fix gene info table from NCG
    geneInfo = geneInfo %>% select(entrez, cancer_type) %>% unique
    ## Get a cancer gene with all the associated primary sites and cancer sites
    cancerGenes = cancerGenes %>% select(entrez, primary_site, cancer_site) %>%
        group_by(entrez) %>% summarise(primary_site=paste(unique(primary_site), collapse=","),
                                       cancer_site=paste(unique(cancer_site), collapse=",")) %>%
        ungroup
    geneInfo = geneInfo %>% left_join(cancerGenes, by=c("entrez"))

    if (kern=="linear"){
        mygamma = 0
    }

    ## Training also contains "N" class? (just in case to be sure between the trials)
    training = training%>%subset(type=="C")

    ## Train for all the grid
    params <- expand.grid(mynu, mygamma, mydegree)
    colnames(params) = c("mynu", "mygamma", "mydegree")

    grid_predictions_table = NULL
    grid_predictions = list()
    novel_drivers_genes_platt_all = NULL
    novel_drivers_genes_platt_inNCG5_all = NULL
    novel_drivers_genes_platt_rst_all = NULL
    new_samples_covered_all = NULL

    for (i in 1:nrow(params)){
        ## Train on the whole set
        cat(i, "\n")
        if (kern=="linear"){
            svm.model <- svm(type ~ ., data = training, kernel=kern, type="one-classification", scale=FALSE, nu=params$mynu[i])
        }else if (kern=="radial"){
            svm.model <- svm(type ~ ., data = training, kernel=kern, type="one-classification", scale=FALSE, gamma=params$mygamma[i], nu=params$mynu[i])
        }else if (kern=="polynomial"){
            svm.model <- svm(type ~ ., data = training, kernel=kern, type="one-classification", scale=FALSE, gamma = params$mygamma[i], nu=params$mynu[i], degree = params$mydegree[i])
        }else if (kern=="sigmoid"){
            svm.model <- svm(type ~ ., data = training, kernel=kern, type="one-classification", scale=FALSE, gamma=params$mygamma[i], nu=params$mynu[i])
        }

        ## Asses training sensitivity
        trainset = training %>% select(type)
        trainset$fitted = svm.model$fitted
        trainset = trainset %>% select(type, fitted)
        trainset = trainset %>% mutate(f=ifelse(fitted==TRUE, "C", "N"))
        trainset$type = factor(as.character(trainset$type), levels = c("C", "N"))
        trainset$f = factor(as.character(trainset$f), levels = c("C", "N"))
        ## Confusion matrix
        cM <- confusionMatrix(trainset$f,trainset$type, positive = c("C"))
        training_sensitivity = cM$byClass["Sensitivity"]

        ## make prediction
        predictions = predict(svm.model, validation, decision.values = TRUE)
        ## add probability from Platt and Binning
        predictions_platt = plattScaling(svm.model, predictions)
        colnames(predictions_platt) = c("dv", "label", "prob_platt")
        #predictions_binning = binning(svm.model, predictions)
        #colnames(predictions_binning) = c("dv", "label", "prob_binning")
        preds = predictions_platt %>% tibble::rownames_to_column() #%>%
        #left_join(predictions_binning%>%add_rownames(), by=c("rowname","dv", "label"))

        ## Take information from gene info
        preds = preds %>%
            separate(rowname, into=c("cancer_type", "sample", "entrez"), sep="\\.") %>%
            mutate(entrez=as.numeric(entrez)) %>%
            left_join(geneInfo%>%rename(gene_type=cancer_type), by=c("entrez"))

        ## Join also with systems level properties from validaton set
        preds = preds %>% left_join(validation_ns, by=c("cancer_type", "sample", "entrez"))

        ## Save preds
        if (kern=="linear"){
            analysis = paste(kern, ".", params$mynu[i], sep="")
            save(preds, file=paste(res_dir,"/predictions_",analysis, ".Rdata", sep=""))
        }else if (kern=="radial"){
            analysis = paste(kern, ".", params$mynu[i], ".", params$mygamma[i], sep="")
            save(preds, file=paste(res_dir, "/predictions_",analysis, ".Rdata", sep=""))
        }else if (kern=="polynomial"){
            analysis = paste(kern, ".", params$mynu[i], ".", params$mygamma[i], ".", params$mydegree[i], sep="")
            save(preds, file=paste(res_dir, "/predictions_",analysis, ".Rdata", sep=""))
        }else if (kern=="sigmoid"){
            analysis = paste(kern, ".", params$mynu[i], ".", params$mygamma[i], sep="")
            save(preds, file=paste(res_dir, "/predictions_",analysis, ".Rdata", sep=""))
        }


        ## Save the predicted entrez to make the venn diagram
        grid_predictions[[analysis]] = preds%>%subset(label==TRUE & prob_platt >0.95)%>%select(entrez)%>%unique%>%.$entrez

        training_samples = training%>%tibble::rownames_to_column()%>%separate(rowname, into=c("Cancer_type", "sample", "entrez"), sep="\\.")%>%select(sample)%>%unique%>%.$sample
        prediction_samples = preds%>%subset(label==TRUE & prob_platt >0.95)%>%select(sample)%>%unique%>%.$sample

        ## Cutoffs
        cutoffs = c(0.95, 0.85, 0.75)
        new_samples = preds%>%subset(label==TRUE & prob_platt >0.95)%>%select(sample)%>%subset(!(sample%in%training_samples))%>%unique%>%.$sample
        new_samples_no_0.95 = length(new_samples)
        new_samples = preds%>%subset(label==TRUE & prob_platt >0.85)%>%select(sample)%>%subset(!(sample%in%training_samples))%>%unique%>%.$sample
        new_samples_no_0.85 = length(new_samples)
        new_samples = preds%>%subset(label==TRUE & prob_platt >0.75)%>%select(sample)%>%subset(!(sample%in%training_samples))%>%unique%>%.$sample
        new_samples_no_0.75 = length(new_samples)

        ## Store in the vectors to get the unique numbers
#         novel_drivers_genes_platt_all = c(novel_drivers_genes_platt_all, preds%>%subset(label==TRUE & prob_platt >0.95)%>%select(entrez)%>%unique%>%.$entrez)
#         novel_drivers_genes_platt_inNCG5_all = c(novel_drivers_genes_platt_inNCG5_all, preds%>%subset(label==TRUE & prob_platt >0.95 & (gene_type=="cgc" | gene_type=="can"))%>%
#                                                      select(entrez)%>%unique%>%.$entrez)
#         novel_drivers_genes_platt_rst_all = c(novel_drivers_genes_platt_rst_all, preds%>%subset(label==TRUE & prob_platt >0.95 & gene_type=="rst")%>%select(entrez)%>%unique%>%.$entrez)
#         new_samples_covered_all = c(new_samples_covered_all, new_samples)



        df = data.frame(analysis=analysis, nu=params$mynu[i],
                        gamma=params$mygamma[i], degree=params$mydegree[i],
                        training_set_size = nrow(training),
                        training_genes = training%>%tibble::rownames_to_column()%>%separate(rowname, into=c("Cancer_type", "sample", "entrez"), sep="\\.")%>%select(entrez)%>%unique%>%nrow,
                        training_samples = training%>%tibble::rownames_to_column()%>%separate(rowname, into=c("Cancer_type", "sample", "entrez"), sep="\\.")%>%select(sample)%>%unique%>%nrow,
                        training_sensitivity = training_sensitivity,
                        prediction_set_size=nrow(preds),
                        prediction_genes=preds%>%select(entrez)%>%unique%>%nrow,
                        prediction_samples=preds%>%select(sample)%>%unique%>%nrow,
                        novel_drivers_entries=preds%>%subset(label==TRUE)%>%nrow,
                        novel_drivers_genes=preds%>%subset(label==TRUE)%>%select(entrez)%>%unique%>%nrow,
                        novel_drivers_genes_inNCG5 = preds%>%subset(label==TRUE & (gene_type=="cgc" | gene_type=="can"))%>%select(entrez)%>%unique%>%nrow,
                        novel_drivers_genes_rst = preds%>%subset(label==TRUE & gene_type=="rst")%>%select(entrez)%>%unique%>%nrow,
                        novel_drivers_entries_platt_0.95=preds%>%subset(label==TRUE & prob_platt >0.95)%>%nrow,
                        novel_drivers_genes_platt_0.95=preds%>%subset(label==TRUE & prob_platt >0.95)%>%select(entrez)%>%unique%>%nrow,
                        novel_drivers_in_samples_mutated_0.95 = preds%>%subset(label==TRUE & prob_platt >0.95)%>%select(sample)%>%unique%>%nrow,
                        novel_drivers_genes_platt_inNCG5_0.95 =preds%>%subset(label==TRUE & prob_platt >0.95 & (gene_type=="cgc" | gene_type=="can"))%>%select(entrez)%>%unique%>%nrow,
                        novel_drivers_genes_platt_rst_0.95=preds%>%subset(label==TRUE & prob_platt >0.95 & gene_type=="rst")%>%select(entrez)%>%unique%>%nrow,
                        novel_drivers_genes_platt_rst_in_samples_mutated_0.95=preds%>%subset(label==TRUE & prob_platt >0.95 & gene_type=="rst")%>%select(sample)%>%unique%>%nrow,
                        new_samples_covered_0.95 = new_samples_no_0.95,
                        novel_drivers_entries_platt_0.85=preds%>%subset(label==TRUE & prob_platt >0.85)%>%nrow,
                        novel_drivers_genes_platt_0.85=preds%>%subset(label==TRUE & prob_platt >0.85)%>%select(entrez)%>%unique%>%nrow,
                        novel_drivers_in_samples_mutated_0.85 = preds%>%subset(label==TRUE & prob_platt >0.85)%>%select(sample)%>%unique%>%nrow,
                        novel_drivers_genes_platt_inNCG5_0.85 =preds%>%subset(label==TRUE & prob_platt >0.85 & (gene_type=="cgc" | gene_type=="can"))%>%select(entrez)%>%unique%>%nrow,
                        novel_drivers_genes_platt_rst_0.85=preds%>%subset(label==TRUE & prob_platt >0.85 & gene_type=="rst")%>%select(entrez)%>%unique%>%nrow,
                        novel_drivers_genes_platt_rst_in_samples_mutated_0.85=preds%>%subset(label==TRUE & prob_platt >0.85 & gene_type=="rst")%>%select(sample)%>%unique%>%nrow,
                        new_samples_covered_0.85 = new_samples_no_0.85,
                        novel_drivers_entries_platt_0.75=preds%>%subset(label==TRUE & prob_platt >0.75)%>%nrow,
                        novel_drivers_genes_platt_0.75=preds%>%subset(label==TRUE & prob_platt >0.75)%>%select(entrez)%>%unique%>%nrow,
                        novel_drivers_in_samples_mutated_0.75 = preds%>%subset(label==TRUE & prob_platt >0.75)%>%select(sample)%>%unique%>%nrow,
                        novel_drivers_genes_platt_inNCG5_0.75 =preds%>%subset(label==TRUE & prob_platt >0.75 & (gene_type=="cgc" | gene_type=="can"))%>%select(entrez)%>%unique%>%nrow,
                        novel_drivers_genes_platt_rst_0.75=preds%>%subset(label==TRUE & prob_platt >0.75 & gene_type=="rst")%>%select(entrez)%>%unique%>%nrow,
                        novel_drivers_genes_platt_rst_in_samples_mutated_0.75=preds%>%subset(label==TRUE & prob_platt >0.75 & gene_type=="rst")%>%select(sample)%>%unique%>%nrow,
                        new_samples_covered_0.75 = new_samples_no_0.75
                        )
        grid_predictions_table = rbind(grid_predictions_table, df)

        ## make the plots and save them for each iteration
        for (cutoff in cutoffs){
            cat(cutoff, "\n")
            positive_genes_platt_rst=preds%>%subset(label==TRUE & prob_platt >cutoff & gene_type=="rst")%>%select(entrez)%>%unique%>%nrow
            if (positive_genes_platt_rst==0){
                next
            }else{
                p = plotSysproperties(preds, cutoff, fname_gene_prop, tissue=tissue, install_NCG=install_NCG )
                mylegend<-g_legend(p[[1]])
                for(i in 1:length(p)) p[[i]] = p[[i]] + theme(legend.position="none")
                pdf(file=paste(res_dir, "/predictions_sysproperties_",analysis,"_",cutoff, ".pdf" , sep=""), h= 20, w=24)
                grid.arrange(p[[1]],p[[3]],p[[6]],p[[7]],p[[8]],
                             p[[13]],p[[14]],p[[15]],p[[16]],p[[17]],p[[18]],p[[19]],p[[20]],
                             mylegend, ncol=3)
                dev.off()
                p = plotMolProperties(preds, gains=gains, gof=gof)
                mylegend<-g_legend(p[[1]])
                for(i in 1:length(p)) p[[i]] = p[[i]] + theme(legend.position="none")
                pdf(file=paste(res_dir, "/predictions_molproperties_", analysis,"_",cutoff, ".pdf", sep=""), h= 20, w=24)
                if(gains & gof) {
                  grid.arrange(p[[1]],p[[2]],p[[3]],p[[4]],p[[5]], p[[6]], mylegend, ncol=3)
                }else{
                  grid.arrange(p[[1]],p[[2]],p[[3]],p[[4]],p[[5]], mylegend, ncol=3)
                }
                dev.off()
            }
        }
    }

    ## Read the cv_stats to get test sensitivity
    cv_stats = read.table(file=paste0(dataPath, "/cv_stat_summary.tsv"), header = T, sep="\t")
    cv_stats = cv_stats %>% subset(kernel==kern & type=="Sensitivity" & set=="test") %>% select(analysis, iterations, min, q1, median, mean, q3, max, var)
    grid_predictions_table = grid_predictions_table %>% left_join(cv_stats, by=c("analysis"))


    require(xlsx)
    wb<-createWorkbook(type="xlsx")

    TEXT_STYLE           <- CellStyle(wb) + Font(wb,  heightInPoints=14)
    TABLE_COLNAMES_STYLE <- CellStyle(wb) + Font(wb, isBold=TRUE) +
                              Alignment(rotation=45, horizontal="ALIGN_CENTER") +
                              Border(color="black", position=c("TOP", "LEFT","BOTTOM", "RIGHT"), pen=rep("BORDER_THIN",4))


    write_text_in_cell = function(sheet, rowIndex, text, textStyle){
      rows <-createRow( sheet, rowIndex=rowIndex )
      sheetText <-createCell( rows, colIndex=1 )
      setCellValue( sheetText[[1,1]], text )
      setCellStyle( sheetText[[1,1]], textStyle )
    }

    sheet <- createSheet(wb, sheetName = "predictions")

    text = with( grid_predictions_table, paste0("TRAINING SET. Number_of_genes = ", training_genes[1], " ; Samples = ", training_samples[1], " ; Total_entries = ", training_set_size[1]))
    write_text_in_cell(sheet, 1, text, TEXT_STYLE)

    text = with( grid_predictions_table, paste0("TEST SET.     Number_of_genes = ", prediction_genes[1], " ; Samples = ", prediction_samples[1], " ; Total_entries = ", prediction_set_size[1]))
    write_text_in_cell(sheet, 2, text, TEXT_STYLE)

#     ## Write unique numbers
#     text =  paste0("Unique novel drivers across models: ", length(unique(novel_drivers_genes_platt_all)))
#     write_text_in_cell(sheet, 3, text, TEXT_STYLE)
#     text =  paste0("Unique novel drivers across models (inNCG5): ", length(unique(novel_drivers_genes_platt_inNCG5_all)))
#     write_text_in_cell(sheet, 4, text, TEXT_STYLE)
#     text =  paste0("Unique novel drivers across models (rst): ", length(unique(novel_drivers_genes_platt_rst_all)))
#     write_text_in_cell(sheet, 5, text, TEXT_STYLE)
#     text =  paste0("Unique new samples covered across models: ", length(unique(new_samples_covered_all)))
#     write_text_in_cell(sheet, 6, text, TEXT_STYLE)

    to_remove = c("training_genes","training_samples",'training_set_size','prediction_genes','prediction_samples','prediction_set_size')

    rownames(grid_predictions_table) = NULL

    addDataFrame(grid_predictions_table[, !colnames(grid_predictions_table)%in%to_remove ], sheet, startRow=7, startColumn=1, colnamesStyle = TABLE_COLNAMES_STYLE, row.names = F)

    saveWorkbook(wb,file=paste(res_dir,"/grid_predictions_", kern, ".xlsx", sep=""))

    # write.xlsx(grid_predictions_table, file=paste(res_dir,"/grid_predictions_", kern, ".xlsx", sep=""))

#     ## Print unique numbers
#     cat(paste0("Unique novel drivers across models: ", length(unique(novel_drivers_genes_platt_all))), "\n")
#     cat(paste0("Unique novel drivers across models (inNCG5): ", length(unique(novel_drivers_genes_platt_inNCG5_all))), "\n")
#     cat(paste0("Unique novel drivers across models (rst): ", length(unique(novel_drivers_genes_platt_rst_all))), "\n")
#     cat(paste0("Unique new samples covered across models: ", length(unique(new_samples_covered_all))), "\n")


    ## Make a barplot for the recurrence of genes across models
#     t = table(table(unlist(grid_predictions)))
#     m = melt(t)
#     m[,1] = factor(m[,1])
#     save(grid_predictions, file=paste(res_dir, "/predictions_recurrence_",kern, ".Rdata" , sep=""))
#     pdf(file=paste(res_dir, "/predictions_recurrence_",kern, ".pdf" , sep=""), h= 20, w=24)
#     p = ggplot(m, aes_string(x=colnames(m)[1], y=colnames(m)[2])) +
#         geom_bar(stat="identity",col="black")+
#         geom_text(aes_string(label=colnames(m)[2]), position=position_dodge(width=0.9), vjust=-0.25, size=7)+
#         theme_boss()+theme(axis.line=element_line(colour="black"),
#                            axis.text.x=element_text(size = 12),
#                            axis.text.y=element_text(size = 12)) +
#         ggtitle("Recurrence of genes across models") +
#         xlab("") + ylab("Genes (n)")
#     print(p)
#     dev.off()

}

## Matteo's code for plotting systems-level properties
get_barplot = function(t, main=""){
    t = as.matrix(t)
    s = apply(t,1,sum)
    p = c()
    for(i in 1:ncol(t)) p[i] = fisher.test( matrix(c(t[,i],s),nc=2))$p.value
    names(p) = colnames(t)
    t = t/apply(t,1,sum)
    m = melt(t)
    m$p = p[match(m[,2], names(p))]
    m[,2] = factor(m[,2])
    p = unique(ddply(m, colnames(m)[2], mutate, value = value[which.max(value)], p=round(p[which.max(value)],3), pr = p[which.max(value)]))
    fdr = p.adjust(unique(p[,c(2,5)])$pr,'BH')
    names(fdr) = unique(p[,2])
    p$fdr = fdr[p[,2]]
    p$star = ""
    p$star[p$fdr<=0.1] = "*"
    p$print = paste0(p$p,p$star)
    p[,2] = factor(p[,2])
    r = ggplot(m, aes_string(fill=colnames(m)[1], x=colnames(m)[2], y=colnames(m)[3] )) +
        geom_bar(stat="identity", position=position_dodge(), col="black")+
        geom_text( data=p, aes_string(x = colnames(p)[2], y = colnames(p)[3], label=colnames(p)[8]), nudge_y = 0.1, size=4)+
        theme_boss()+theme(axis.line=element_line(colour="black"),
                           axis.text.x=element_text(size = 12),
                           axis.text.y=element_text(size = 12)) +
        scale_fill_manual(values=c("white","grey"))+ggtitle(main) +
        xlab("")
    r
}
get_boxplot = function(t, main="", log10=F){
    p = wilcox.test(t$value[t[,1]=="pred"], t$value[t[,1]=="rst"] )$p.value
    p <- format(p, scientific = TRUE)
    if (p < 0.05){
        p = paste(p, "*", sep="")
    }
    r = NULL
    if(log10==F){
        r = ggplot(t, aes_string(fill=colnames(t)[1], y=colnames(t)[3], x=colnames(t)[2] )) +geom_boxplot(col="black")+
            theme_boss() + theme(axis.line=element_line(colour="black"),
                                 axis.text.x=element_text(size = 12),
                                 axis.text.y=element_text(size = 12)) +
            scale_fill_manual(values=c("white","grey"))+ggtitle(paste0(main," p=",p))+
            xlab("")
    }else if (log10==T){
        r = ggplot(t, aes_string(fill=colnames(t)[1], y=colnames(t)[3], x=colnames(t)[2] )) +geom_boxplot(col="black")+
            theme_boss() + theme(axis.line=element_line(colour="black"),
                                 axis.text.x=element_text(size = 12),
                                 axis.text.y=element_text(size = 12)) +
            scale_fill_manual(values=c("white","grey"))+ggtitle(paste0(main," p=",p))+scale_y_log10()+
            xlab("")
    }
    r
}
g_legend<-function(a.gplot){
    tmp <- ggplot_gtable(ggplot_build(a.gplot))
    leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
    legend <- tmp$grobs[[leg]]
    return(legend)
}

plotSysproperties = function(predictions, cutoff, tissue, gene_class="rst"){

    ## Get positive predictions
    if(gene_class=="all"){
        predictions = predictions %>% subset(label==TRUE & prob_platt>cutoff) %>% select(entrez, label) %>% unique
        print(nrow(predictions))
    }else{
        predictions = predictions %>% subset(label==TRUE & prob_platt>cutoff & gene_type==gene_class) %>% select(entrez, label) %>% unique
        print(nrow(predictions))
    }
    ## Load the file with gene properties
    load("/mnt/lustre/users/k1469280/mourikisa/data/geneProperties_final_mmImputed.Rdata")
    geneProperties=geneProperties_mmImputed
#     ## Get cancer genes to compare
#     dev_mode()
#     if(install_NCG) install(NCG_path)
#     library(ncglib)
#     geneInfo=get_geneinfo(version = "NCG5")
#     cancerGenes = get_cancer_genes(version = "NCG5")
#     ncg_connection = get_ncg_connection()
#     ppi   = ncg_connection %>% tbl("NCG51_interactions") %>% collect(n=Inf)
#     cancer_genes = as.data.frame(subset(get_cancer_genes(), primary_site==tissue))
#     ppi = subset(ppi, entrez%in%cancer_genes$entrez)
#     primary_inter = unique(ppi$entrez2)
#     dev_mode()

    ## Load NCG data from files instead of pulling them directly from the database (to run it in the cluster)
    load("/mnt/lustre/users/k1469280/mourikisa/data/geneInfoNCG5.Rdata")
    load("/mnt/lustre/users/k1469280/mourikisa/data/cancerGenesNCG5.Rdata")
    cancer_genes = as.data.frame(subset(cancerGenes, primary_site==tissue))
    load("/mnt/lustre/users/k1469280/mourikisa/data/ppiNCG51.Rdata")

    ## Fix primary interactors
    ppi = as.data.frame(ppi)
    ppi = ppi %>% group_by(entrez2) %>% summarise(primary_inter=length(unique(entrez))) %>%
        rename(entrez=entrez2)

    ## Fix gene info table from NCG
    geneInfo = geneInfo %>% select(entrez, cancer_type) %>% unique
    ## Get a cancer gene with all the associated primary sites and cancer sites
    cancerGenes = cancerGenes %>% select(entrez, primary_site, cancer_site) %>%
        group_by(entrez) %>% summarise(primary_site=paste(unique(primary_site), collapse=","),
                                       cancer_site=paste(unique(cancer_site), collapse=",")) %>%
        ungroup
    geneInfo = geneInfo %>% left_join(cancerGenes, by=c("entrez"))
    geneProperties = geneProperties %>% rename(entrez=Entrez) %>% left_join(geneInfo, by=c("entrez"))

    ## Bring into the gene properties the predictions
    geneProperties = geneProperties %>% left_join(predictions, by=c("entrez"))

    if(gene_class=="all"){

      geneProperties = geneProperties %>% mutate(type=ifelse(label==TRUE & !is.na(label), "pred", "rst"))

    }else{
      geneProperties = geneProperties %>% subset(cancer_type==gene_class) %>% mutate(type=ifelse(label==TRUE & !is.na(label), "pred", "rst"))
    }

    mypredictions = geneProperties %>% subset(type=="pred") %>% nrow

    myrest = geneProperties %>% subset(type=="rst") %>% nrow

    #geneProperties = geneProperties %>% mutate(type=ifelse(grepl("blood", primary_site), "blood", "rst"))
    ## Bring into the gene properties the primary interactors
    geneProperties = geneProperties %>% left_join(ppi, by=c("entrez"))
    geneProperties$primary_inter[is.na(geneProperties$primary_inter)] = 0


    ## Convert properties to factors
    geneProperties$Genic = factor(geneProperties$Genic, levels=c("0","1"))
    geneProperties$WGD = factor(geneProperties$WGD, levels=c("0","1"))
    geneProperties$hub = factor(geneProperties$hub, levels=c("0","1"))
    geneProperties$central = factor(geneProperties$central, levels=c("0","1"))
    geneProperties$age = factor(geneProperties$age, levels=c("young","old"))
    geneProperties$origin = factor(geneProperties$origin, levels=c("LUCA","Eukaryotes","Opisthokonts","Metazoans","Vertebrates","Mammals","Primates"))
    geneProperties$exp.breadth = factor(geneProperties$exp.breadth, levels=c("Neverexpressed", "OneTissue",  "Selective","Middle", "AlwaysExpressed"  ))
    geneProperties$cancer_type = factor(geneProperties$cancer_type, levels=c("cgc",'can','rst'))
    geneProperties$type = factor(geneProperties$type, levels=c('pred','rst'))
    #geneProperties$type = factor(geneProperties$type, levels=c('blood','rst'))

    ## make the plots

    gimme_table <- function (t) {
      maximum = which(as.numeric(colnames(t))>9)
      if(length(maximum)>1){
        t = cbind(t[,1:(maximum[1]-1) ], "≥10"=apply(t[,maximum],1, sum))
      }
      t
    }

    p = list()
    p[[1]] = get_barplot(with(geneProperties, table( type, Genic) ), paste("Genic", "(", mypredictions, "vs", myrest, ")",sep="") )
    p[[2]] = get_barplot(with(geneProperties, table( type, inCDD) ), "in CDD" )
    p[[3]] = get_barplot( gimme_table(with(geneProperties, table( type, alldomains) )) , "all domains")
    p[[4]] = get_barplot( gimme_table(with(geneProperties, table( type, private_domains) )) , "private domains")
    p[[5]] = get_barplot( gimme_table(with(geneProperties, table( type, commondomains) )) , "common domains")
    p[[6]] = get_barplot(with(geneProperties, table( type, age) ), "age" )
    p[[7]] = get_barplot(with(geneProperties, table( type, origin) ), "origin" )
    p[[8]] = get_barplot(with(geneProperties, table( type, memberofcomplex) ), "Member of complex" )
    p[[9]] = get_barplot( gimme_table(with(geneProperties, table( type, High) )), "High" )
    p[[10]] = get_barplot( gimme_table(with(geneProperties, table( type, Low) )), "Low" )
    p[[11]] = get_barplot( gimme_table(with(geneProperties, table( type, Medium) )), "Medium" )
    p[[12]] = get_barplot( gimme_table(with(geneProperties, table( type, NotExpressed) )), "NotExpressed" )
    p[[13]] = get_barplot(with(geneProperties, table( type, exp.breadth) ), "exp.breadth" )
    p[[14]] = get_boxplot(melt(geneProperties, id.var="type", measure.vars = 'Length.fullrefseq' ),log10=T )
    p[[15]] = get_barplot(with(geneProperties, table( type, WGD) ), "WGD" )
    p[[16]] = get_boxplot(melt(geneProperties, id.var="type", measure.vars = 'degree' ), log10=T )
    p[[17]] = get_boxplot(melt(geneProperties, id.var="type", measure.vars = 'betweenness'),log10=T)
    p[[18]] = get_barplot(with(geneProperties, table( type, hub) ), "Hub" )
    p[[19]] = get_barplot(with(geneProperties, table( type, central) ), "Central" )
    p[[20]] = get_barplot( gimme_table( with(geneProperties, table( type, primary_inter) )) , paste0("Primary interactors of cancer genes of ", tissue))
    return(p)
}

## Plot properties
# pdf(file="predictions_properties.pdf", h= 16, w=20)
# grid.arrange(p[[1]],p[[3]],p[[6]],p[[7]],p[[8]],
#               p[[13]],p[[14]],p[[15]],p[[16]],p[[17]],p[[18]],p[[19]],p[[20]], ncol=3)
# dev.off()
# write.xlsx(geneProperties,"predictions_properties.xlsx", row.names=F, showNA=F)
## Assess the predictions (back-characterise them in terms of their features etc...)
## I am plotting also the features of the training set (no scaling)
# do.call("grid.arrange", c(plots, ncol=3))
plotMolProperties = function(predictions, gains = T, gof = T, gene_type="rst"){

    ## get numbers for false and true
    falses = predictions %>% subset(label==FALSE) %>% nrow
    trues = predictions %>% subset(label==TRUE) %>% nrow

    pl = function(m, s, main=""){
        m = data.frame(m)

        m$print = round((m$value)*100,digits = 1)
        m[,2] = factor(m[,2])
        p = ggplot(m, aes_string(fill=colnames(m)[1], x=colnames(m)[2], y=colnames(m)[3] )) +
            geom_bar(stat="identity", position=position_dodge(), col="black") +
            geom_text( data=m, aes_string(x = colnames(m)[2], y = colnames(m)[3],
                                          ymax=colnames(m)[3], label=colnames(m)[4]),
                       position = position_dodge(width=1), vjust=-0.7, size=7)+
            theme_boss()+theme(axis.line=element_line(colour="black"),
                               axis.text.x=element_text(size=12),
                               axis.text.y=element_text(size=12))+
            scale_fill_manual(values=c("grey","white"))+ggtitle(main)+
            scale_y_continuous(limits=c(0,1)) +
            xlab("")
    }

    if (gene_type=="rst"){predictions = predictions %>% subset(gene_type=="rst")}

    p = list()
    m = subset(predictions, no_TRUNC_muts!=0 & !((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)))
    if(nrow(m)>0) {
       m = m %>% group_by(label, no_TRUNC_muts) %>% summarise(value=n()) %>% ungroup %>% mutate(value=ifelse(label==FALSE, value/falses, value/nrow(predictions)))
    } else{
      m = data.frame(label=c(FALSE, TRUE), no_TRUNC_muts=c(1,1), value=c(0,0))
    }
    p = c(p, list(pl(m, main=paste0("Truncating mutations \n", predictions%>%select(entrez)%>%unique%>%nrow,
                                    " genes in ", predictions%>%select(sample)%>%unique%>%nrow, " samples = ", nrow(predictions), " entries" ))))

    m = subset(predictions, no_NTDam_muts!=0 & !((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)))
    if(nrow(m)>0) {
      m = m %>% group_by(label, no_NTDam_muts) %>% summarise(value=n()) %>% ungroup %>% mutate(value=ifelse(label==FALSE, value/falses, value/nrow(predictions)))
    } else{
      m = data.frame(label=c(FALSE, TRUE), no_NTDam_muts=c(1,1), value=c(0,0))
    }
    p = c(p, list(pl(m, main="Non-Truncating damaging mutations") ))

    if (gof){
        m = subset(predictions, no_GOF_muts!=0 & !((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)))
        if(nrow(m)>0) {
        m = m %>% group_by(label, no_GOF_muts) %>% summarise(value=n()) %>% ungroup %>% mutate(value=ifelse(label==FALSE, value/falses, value/nrow(predictions)))
        } else{
        m = data.frame(label=c(FALSE, TRUE), no_GOF_muts=c(1,1), value=c(0,0))
        }
        p = c(p, list(pl(m, main="Gain-of-function mutations") ))
    }

    m = subset(predictions, CNVLoss==1 & !((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)))
    if(nrow(m)>0) {
      m = m %>% group_by(label, CNVLoss) %>% summarise(value=n()) %>% ungroup %>% mutate(value=ifelse(label==FALSE, value/falses, value/nrow(predictions)))
    } else{
      m = data.frame(label=c(FALSE, TRUE), CNVLoss=c(1,1), value=c(0,0))
    }
    p = c(p, list(pl(m, main="CNV Losses") ))

    if(gains){
      m = subset(predictions, CNVGain==1 & !((CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0)))
      if(nrow(m)>0) {
        m = m %>% group_by(label, CNVGain) %>% summarise(value=n()) %>% ungroup %>% mutate(value=ifelse(label==FALSE, value/falses, value/nrow(predictions)))
      } else{
        m = data.frame(label=c(FALSE, TRUE), CNVGain=c(1,1), value=c(0,0))
      }
      p = c(p, list(pl(m, main="CNV Gains") ))
    }

    m = subset(predictions, (CNVLoss==1 & Copy_number<2) & (no_TRUNC_muts!=0 | no_NTDam_muts!=0) )
    if(nrow(m)>0) {
      m = m %>% group_by(label, CNVLoss) %>% summarise(value=n()) %>% ungroup %>% mutate(value=ifelse(label==FALSE, value/falses, value/nrow(predictions)))
    } else{
      m = data.frame(label=c(FALSE, TRUE), CNVLoss=c(1,1), value=c(0,0))
    }
    p = c(p, list(pl(m, main="CNV Losses & (TRUNC OR NTDam muts)")))

    return(p)
}
# pdf(file="predictions_molecular_properties.pdf", h= 16, w=20)
# grid.arrange(p[[1]],p[[2]],p[[3]],p[[4]],p[[5]], ncol=3)
# dev.off()


################################################
# Feature Ranking with SVM-RFE
################################################
svmrfeFeatureRanking = function(training, mynu, mygamma, mydegree, kern){
  x = training%>%select(-type)%>%data.matrix()
  y = training%>%.$type
  n = ncol(x)

  survivingFeaturesIndexes = seq(1:n)
  featureRankedList = vector(length=n)
  rankedFeatureIndex = n

  while(length(survivingFeaturesIndexes)>0){
    #train the support vector machine
    if(kern=="linear"){
        svmModel = svm(x[, survivingFeaturesIndexes], y, nu = mynu, cachesize=500,  scale=F, type="one-classification", kernel=kern )
    }

    if (kern=="linear"){
        svm.model <- svm(x[, survivingFeaturesIndexes], y, kernel=kern, type="one-classification", scale=FALSE, nu=mynu)
    }else if (kern=="radial"){
        svm.model <- svm(x[, survivingFeaturesIndexes], y, kernel=kern, type="one-classification", scale=FALSE, gamma = mygamma, nu=mynu)
    }else if (kern=="polynomial"){
        ## probably changing degree
        svm.model <- svm(x[, survivingFeaturesIndexes], y, kernel=kern, type="one-classification", scale=FALSE, gamma = mygamma, nu=mynu, degree = mydegree)
    }else if (kern=="sigmoid"){
        svm.model <- svm(x[, survivingFeaturesIndexes], y, kernel=kern, type="one-classification", scale=FALSE, gamma = mygamma, nu=mynu)
    }

    #compute the weight vector
    w = t(svmModel$coefs)%*%svmModel$SV

    #compute ranking criteria
    rankingCriteria = w * w

    #rank the features
    ranking = sort(rankingCriteria, index.return = TRUE)$ix

    #update feature ranked list
    featureRankedList[rankedFeatureIndex] = survivingFeaturesIndexes[ranking[1]]
    rankedFeatureIndex = rankedFeatureIndex - 1

    #eliminate the feature with smallest ranking criterion
    survivingFeaturesIndexes = survivingFeaturesIndexes[-ranking[1]]

  }

  return (featureRankedList)
}

##################################################
# Feature Ranking with Average Multiclass SVM-RFE
##################################################
svmrfeFeatureRankingForMulticlass = function(x,y){
  n = ncol(x)

  survivingFeaturesIndexes = seq(1:n)
  featureRankedList = vector(length=n)
  rankedFeatureIndex = n

  while(length(survivingFeaturesIndexes)>0){
    #train the support vector machine
    svmModel = svm(x[, survivingFeaturesIndexes], y, cost = 10, cachesize=500,  scale=F, type="C-classification", kernel="linear" )

    #compute the weight vector
    multiclassWeights = svm.weights(svmModel)

    #compute ranking criteria
    multiclassWeights = multiclassWeights * multiclassWeights
    rankingCriteria = 0
    for(i in 1:ncol(multiclassWeights))rankingCriteria[i] = mean(multiclassWeights[,i])

    #rank the features
    (ranking = sort(rankingCriteria, index.return = TRUE)$ix)

    #update feature ranked list
    (featureRankedList[rankedFeatureIndex] = survivingFeaturesIndexes[ranking[1]])
    rankedFeatureIndex = rankedFeatureIndex - 1

    #eliminate the feature with smallest ranking criterion
    (survivingFeaturesIndexes = survivingFeaturesIndexes[-ranking[1]])
    cat(length(survivingFeaturesIndexes),"\n")
  }

}

################################################
# This function gives the weights of the hyperplane
################################################
svm.weights<-function(model){
  w=0
  if(model$nclasses==2){
    w=t(model$coefs)%*%model$SV
  }else{    #when we deal with OVO svm classification
    ## compute start-index
    start <- c(1, cumsum(model$nSV)+1)
    start <- start[-length(start)]

    calcw <- function (i,j) {
      ## ranges for class i and j:
      ri <- start[i] : (start[i] + model$nSV[i] - 1)
      rj <- start[j] : (start[j] + model$nSV[j] - 1)

      ## coefs for (i,j):
      coef1 <- model$coefs[ri, j-1]
      coef2 <- model$coefs[rj, i]
      ## return w values:
      w=t(coef1)%*%model$SV[ri,]+t(coef2)%*%model$SV[rj,]
      return(w)
    }

    W=NULL
    for (i in 1 : (model$nclasses - 1)){
      for (j in (i + 1) : model$nclasses){
        wi=calcw(i,j)
        W=rbind(W,wi)
      }
    }
    w=W
  }
  return(w)
}

#################################################
# Feature weights for a range of hyperparameters
#################################################
getFeatureWeights = function(trainingPath, kern, nuRange){
  load(trainingPath)
  training = training %>% subset(type=="C")
  df = NULL
  count = 1
  for (mynu in nuRange){
    frl = svmrfeFeatureRanking(training, mynu, kern)
    features = names(training)[frl]
    dff = data.frame(features=features)
    dff$rank = 1:nrow(dff)
    colnames(dff) = c("features", paste("rank_", mynu, sep=""))
    if (count==1){
      df = dff
    }else{
      df = df %>% left_join(dff, by=c("features"))
    }
    count = count+1
  }
  return(df)
}

## I usually plot weights by doing:
# df %>% gather(model, rank, -features) %>%
#   ggplot(aes(x=model, y=factor(features))) +
#   geom_tile(aes(fill=rank)) + coord_equal()+
#   scale_fill_gradient(low = "red",high = "yellow") +
#   theme(
#     axis.text.x=element_text(angle=90, colour = "black"),axis.text.y=element_text(colour = "black"), plot.title = element_text(vjust=2)
#   ) +
#   xlab("") + ylab("") + ggtitle("Feature ranking for \n linear nu=0.5 (refined features)")


## Prepare tables for each cancer type and put them in the MySQL database
# for (ct in cancer_types){
#     cat(ct, "\n")
#     df = prepareTableML(df=total_table, cancer_type=ct)
#     write.table(df, file=paste("/home/mourikisa/novel_driver_prediction/Rdata/total_table_TCGA_19014_prepared_", ct, ".tsv", sep=""),
#                 row.names = F, col.names = F, sep = "\t", quote = F)
# }
prepareTableML <- function(df, cancer_type, geneProperties_dir="Rdata/geneProperties.Rdata",...){
    df = df %>% subset(Cancer_type==ct)
    ## Fix TPM, where there is NA put -1 cause in the database is numeric
    df$TPM[is.na(df$TPM)] = -1

    cat("Fixing copy number variations according to new cut-offs...", "\n")
    df <- df %>% mutate(CNV_type=ifelse(Copy_number>=4, "Gain", ifelse(Copy_number<2, "Loss", NA)))

    ## Replace numbers with names here
    df[,c("no_NSI_muts", "no_TRUNC_muts",
          "no_NTDam_muts", "no_NTDamFunction_muts",
          "no_NTDamCons_muts", "no_NTDamSC_muts",
          "no_GOF_muts")][is.na(df[,c("no_NSI_muts", "no_TRUNC_muts",
                                      "no_NTDam_muts", "no_NTDamFunction_muts",
                                      "no_NTDamCons_muts", "no_NTDamSC_muts",
                                      "no_GOF_muts")])] <- 0

    ## Join with gene Properties
    cat("Joining table with systems-level properties...", "\n")
    load(geneProperties_dir)
    df <- df %>% left_join(geneProperties, by=c("Entrez"))

    ## Convert categorical features to multiple factors
    cat("Performing cleaning of categorical variables...", "\n")
    ## CNV type
    df <- df %>%
        mutate(CNVGain=ifelse(is.na(CNV_type), 0, ifelse(CNV_type=="Gain",1, 0)),
               CNVLoss=ifelse(is.na(CNV_type), 0, ifelse(CNV_type=="Loss",1, 0))) %>%
        select(-CNV_type)

    ## Copy number (where copy number is NA, put copy number equal to 2)
    df$Copy_number[which(is.na(df$Copy_number))] <- 2

    ## Convert the length to bp instead of aa
    df <- df %>% mutate(Length.fullrefseq=Length.fullrefseq*3)

    ## expr_class
    df <- df %>%
        mutate(ExpT_ME=ifelse(is.na(expr_class), 0, ifelse(expr_class=="ME",1, 0)),
               ExpT_HE=ifelse(is.na(expr_class), 0, ifelse(expr_class=="HE",1, 0)),
               ExpT_LE=ifelse(is.na(expr_class), 0, ifelse(expr_class=="LE",1, 0)),
               ExpT_NE=ifelse(is.na(expr_class), 0, ifelse(expr_class=="notExpr",1, 0)),
               ExpT_NET=ifelse(is.na(expr_class), 0, ifelse(expr_class=="notExpr(T)",1, 0))) %>%
        select(-expr_class)

    ## age
    df <- df %>%
        mutate(old=ifelse(is.na(age), 0, ifelse(age=="old",1, 0)),
               young=ifelse(is.na(age), 0, ifelse(age=="young",1, 0))) %>%
        select(-age)

    ## origin
    df <- df %>%
        mutate(luca=ifelse(is.na(origin), 0, ifelse(origin=="LUCA",1, 0)),
               eukaryotes=ifelse(is.na(origin), 0, ifelse(origin=="Eukaryotes",1, 0)),
               metazoans=ifelse(is.na(origin), 0, ifelse(origin=="Metazoans",1, 0)),
               vertebrates=ifelse(is.na(origin), 0, ifelse(origin=="Vertebrates",1, 0)),
               opisthokonts=ifelse(is.na(origin), 0, ifelse(origin=="Opisthokonts",1, 0)),
               mammals=ifelse(is.na(origin), 0, ifelse(origin=="Mammals",1, 0)),
               primates=ifelse(is.na(origin), 0, ifelse(origin=="Primates", 1, 0))) %>% ## Initially omitted from the models - added here for reference
        select(-origin)

    ## exp.breadth.class
    df <- df %>%
        mutate(selective=ifelse(is.na(exp.breadth), 0, ifelse(exp.breadth=="Selective",1, 0)),
               always.expressed=ifelse(is.na(exp.breadth), 0, ifelse(exp.breadth=="AlwaysExpressed",1, 0)),
               middle=ifelse(is.na(exp.breadth), 0, ifelse(exp.breadth=="Middle",1, 0)),
               one.tissue=ifelse(is.na(exp.breadth), 0, ifelse(exp.breadth=="OneTissue",1, 0)),
               never.expressed=ifelse(is.na(exp.breadth), 0, ifelse(exp.breadth=="Neverexpressed",1, 0))) %>%
        select(-exp.breadth)

    ## Degree, Betweenness, Hub, Central have NAs
    df <- df %>% mutate(degree=ifelse(is.na(degree), 0, degree), betweenness=ifelse(is.na(betweenness), 0, betweenness),
                        hub=ifelse(is.na(hub), 0, hub), central=ifelse(is.na(central), 0, central))

    ## Before you add (chnage NAs in these columns to 0)
    df[,c("High", "Low", "Medium", "NotExpressed")][is.na(df[,c("High", "Low", "Medium", "NotExpressed")])] <- 0
    df <- df %>% ungroup %>% mutate(tot.tissues=High+Low+Medium)
    df <- data.frame(df)

    cat("Converting features to factors...", "\n")
    fcols <- c("Genic", "memberofcomplex",
               "WGD", "hub", "central", "CNVGain", "CNVLoss",
               "ExpT_ME", "ExpT_HE", "ExpT_LE", "ExpT_NE",
               "ExpT_NET", "old", "young", "luca", "eukaryotes",
               "metazoans", "vertebrates", "opisthokonts",
               "mammals", "selective", "always.expressed",
               "middle", "one.tissue", "never.expressed")
    cols <- which(colnames(df) %in% fcols)
    for(i in cols){
        df[,i] = factor(df[,i], levels = c(0,1))
    }

#     df <- df %>% mutate(key=paste(Cancer_type, Sample, Entrez, sep=".")) %>%
#         select(-Cancer_type, -Sample, -Entrez)
#     rnames <- df$key
#     df <- df %>% select(-key)
#     df <- data.frame(df)
#     row.names(df) <- rnames

    return(df)
}

## Function to extract training set (Vog genes and NCG FP)
## Note that dplyr::filter is used - subset is very slow
## You need to run this function before the training because after scaling you cannot filter based on mutations etc
## Pay extra attention when you subset for training (no_GOF_muts is NAs or 0s)
get_VogNCGFP_training <- function(df, fp_dir="/mnt/lustre/users/k1469280/mourikisa/data/NCG_false_positives.txt", ...){

    ## Make sure you load the unscaled table because only there you can subset
    ## based on 0 mutations...etc, the scaled one has no 0s
    ## Load the table and get the rownames, then subset the scaled ttable based on the rownames
    #load("")

    ## Get false positive genes
    false_positive_genes <- read.table(fp_dir, header = T, sep = "\t")
    false_positive_genes <- false_positive_genes %>% select(entrez, symbol)

    ## Training set extraction
    cat("Extracting training dataset...", "\n")
    training <- df %>% dplyr::filter((vogel=="Vog.Oncogene" | vogel=="Vog.TS") & (no_TRUNC_muts!=0 | no_NTDam_muts!=0 | no_GOF_muts!=0 | CNVGain==1 | CNVLoss==1 | ExpT_NET==1))
    training <- training %>% mutate(type=ifelse(vogel=="Vog.Oncogene", "O", "T"))
    cat("Adding NCG False Positives...", "\n")
    trainingFP <- df %>% dplyr::filter(Entrez %in% false_positive_genes$entrez)
    #trainingFP <- trainingFP %>% mutate(rowname=paste(Cancer_type, Sample, Entrez, sep=".")) %>% select(-Cancer_type, -Sample, -Entrez)
    trainingFP <- trainingFP %>% mutate(type="N")

    ## Final training set
    training <- rbind(training, trainingFP)

    ## General housekeeping of the training set
    training$type <- as.factor(training$type)

    ## Exclude here any features that you dont want to include in the
    ## training/prediction
    #rnames <- training$rowname
    #training <- training %>% select(-TPM, -inCDD, -vogel, -rowname)
    #training = data.frame(training)
    #row.names(training) <- rnames
    #training = training %>% select(rowname,type)
    return(training)
}

## Load the total table first to avoid re-loading everytime
## Usually I load total_table_muts_cnv
mutProfile = function(df, entrez, symbol){

    mut_profile=list()
    lof = df %>% subset(Entrez==entrez & (!is.na(no_TRUNC_muts) | !is.na(no_NTDam_muts) | (CNV_type=="Loss" & Copy_number==0))) %>% select(Cancer_type, Sample) %>%
        unique %>% group_by(Cancer_type) %>% summarise(n=n()) %>% ungroup %>% data.frame()

    lof2samples = df %>% select(Cancer_type, Sample) %>% unique %>% group_by(Cancer_type) %>% summarise(total_pats=n()) %>%
        ungroup %>% left_join(lof, by=c("Cancer_type"))

    ##Make the NAs, 0s
    lof2samples$n[is.na(lof2samples$n)] <- 0
    lof2samples = lof2samples %>% mutate(label=paste(Cancer_type, "(", total_pats, ")", sep=""), freq=(n/total_pats)*100)

    ## sort based on the freq
    lof2samples$label = factor(as.character(lof2samples$label), level = lof2samples$label[order(lof2samples$freq,decreasing = F)],ordered = T)

    #pdf(file="/Users/fc-8s-imac/Desktop/smarca4_LOF_TCGA.pdf",height = unit(8.27, "inches"), width = unit(11.69, "inches"), useDingbats = F)
    p = ggplot(lof2samples%>%arrange(freq), aes(x=label, y=freq)) + geom_bar(stat="identity") + theme_boss() +
        theme(axis.text.x=element_text(angle=90),
              axis.title.y=element_text(size = 12)) +
        ylab(paste("% samples with LOF mutation in ", symbol, sep="")) + xlab("") +
        ggtitle(paste("Percentage of samples with LOF mutations in ",symbol, " across \n 31 cancer types", sep=""))

    mut_profile[["plot"]] = p
    mut_profile[["table"]] = lof2samples
    #dev.off()
    #write.table(lof2samples, file="/Users/fc-8s-imac/Desktop/smarca4_LOF_TCGA.tsv", quote=F, row.names = F, sep="\t")

    return(mut_profile)
}

excludeTraining <- function(training="", df){
    load(training)
    df <- df[!(row.names(df) %in% row.names(df)),]
    return(df)
}

get_TCGA_cancer_types <- function(){
    cancer_types <- c("ACC", "BLCA", "BRCA", "CESC", "CHOL", "COAD", "ESCA", "GBM",
                      "HNSC", "KICH", "KIRC", "KIRP", "LAML", "LGG", "LIHC", "LUAD",
                      "LUSC", "OV", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM",
                      "STAD", "TGCT", "THCA", "THYM", "UCEC", "UCS", "UVM")
    return(cancer_types)
}

get_TCGA_cancer_types_exprWithNormal <- function(){
    ctWithNormals <- c("BLCA", "BRCA", "CESC", "CHOL", "COAD", "ESCA",
                       "HNSC", "KICH", "KIRC", "KIRP", "LIHC", "LUAD",
                       "LUSC", "PAAD", "PCPG", "PRAD", "READ", "SARC", "SKCM",
                       "STAD", "THCA", "THYM", "UCEC")
    return(ctWithNormals)
}

################################################################
# check sensitivity for final predictions
# when CGCs altered in samples are considered True positives
# I will do it when consider different cuttofs in proabilities
# 95%, 96%, 97%, 98%, 99%, 100%
################################################################
checkPredictions = function(model, validation_set, geneInfo, plattTraining){

    ## Fix gene info table from NCG
    geneInfo = geneInfo %>% select(entrez, cancer_type) %>% unique
    colnames(geneInfo) = c("entrez", "geneType")
    ## Make predictions and get the associated probability
    predictions = predict(svm.model, validation_set%>%select(-type), decision.values = TRUE)
    preds_platt = plattScaling(plattTraining, predictions)
    colnames(preds_platt) = c("dv", "label", "prob_platt")
    preds_binning = binning(svm.model, predictions)
    colnames(preds_binning) = c("dv", "label", "prob_binning")
    preds = preds_platt %>% tibble::rownames_to_column() %>%
        left_join(preds_binning%>%tibble::rownames_to_column(), by=c("rowname","dv", "label"))
    preds = preds %>%
        separate(rowname, into=c("cancer_type", "sample", "entrez"), sep="\\.") %>%
        mutate(entrez=as.numeric(entrez)) %>%
        left_join(geneInfo, by=c("entrez"))

    cgcs_obs = preds %>% subset(geneType=="cgc") %>% nrow
    cgcs_unq = preds %>% subset(geneType=="cgc") %>% select(entrez) %>%
        unique %>% nrow
    samples = preds %>% select(sample) %>% unique %>% nrow
    cat("Samples: ", samples, "\n")
    cat("CGC genes: ", cgcs_obs, "\n")
    cat("CGC unique genes: ", cgcs_unq, "\n")

    platt_cuttoff = 0.95
    platt_sens_list_obs = NULL
    while (platt_cuttoff<=1){
        ## Sensitivity overall (observations)
        cat("Cut-off: ", platt_cuttoff, "\n")
        cat(paste("CGC observations: ", cgcs_obs, sep=""), "\n")
        genes = preds %>% subset(prob_platt > platt_cuttoff & geneType=="cgc") %>% nrow
        cat(paste("Observations with Platt's p > ", platt_cuttoff,": ", genes, sep=""), "\n")
        sens = genes/cgcs_obs
        platt_sens_list_obs = c(platt_sens_list_obs, sens)
        cat("Sensitivity (observations): ", sens, "\n")

        ## Sensitivity unique genes
        cat(paste("CGC unique genes: ", cgcs_unq, sep=""), "\n")
        genes = preds %>% subset(prob_platt > platt_cuttoff) %>%
            subset(geneType=="cgc") %>% select(entrez) %>% unique %>% nrow
        cat(paste("Unique genes with Platt's p > ", platt_cuttoff,": ", genes, sep=""), "\n")
        sens = genes/cgcs_unq
        cat("Sensitivity (unique CGC genes): ", sens, "\n")

        cat("\n")
        platt_cuttoff = platt_cuttoff + 0.01
    }
    return(platt_sens_list_obs)

}
thmourikis/sysSVM documentation built on May 30, 2019, 4:05 a.m.