R/casereport.R

Defines functions wgs.report

Documented in wgs.report

#' @name wgs.report
#' @title wgs.report
#'
#' @description
#'
#' Generate a WGS case report
#' 
#' @param opt list with the input parameters for the wgs casereport. For more information on parameters run "Rscript wgs.report --help" in the command line.
#' @export
wgs.report = function(opt){
    message("Loading Libraries -- Please wait...")
    suppressMessages(expr = {
        suppressPackageStartupMessages(expr = {
            library(forcats)
            library(stringr)
            library(gGnome)
            library(gTrack)
            library(gUtils)
            library(tidyr)
            library(tidyverse)
            library(ggforce)
            library(ggridges)
            library(ggrepel)
            library(ggExtra)
            library(httr)
            library(jsonlite)
            library(knitr)
            library(rmarkdown)
            library(ComplexHeatmap)
            library(deconstructSigs)
            library(DT)
            library(VariantAnnotation)
            #library(immunedeconv)
            library(readr)
            library(data.table)
            library(casereport)
            library(skitools)
            data.table::setDTthreads(1)
        })
    })

    if (inherits(opt, 'data.frame')){
        opt = as.list(opt)
    }

    ##################
    ## Initialize a report config
    ##################
    if (is.null(opt$outdir)){
        opt$outdir = './'
    }

    ##################
    ##Setting a bunch of default values
    ##################
    NA_default_params = c('cbs_cov_rds', 'cbs_nseg_rds', 'het_pileups_wgs',
                          "deconstruct_sigs", "deconstruct_variants",
                          "sigs_cohort", "tpm", "hrd_results",
                          "ot_results", "snv_vcf", "indel_vcf", "snpeff_snv_bcf",
                          "snpeff_indel_bcf", "drivers", "cohort_metadata", "germline_coding"
                          )
    for (param in NA_default_params){
        opt = set_param(opt, param, NA_character_)
    }
    default_opt = list(chrom_sizes = system.file('extdata', 'hg19.broad.chrom.sizes', package = "casereport"),
                       libdir = find.package('casereport'),
                       gencode = '~/DB/GENCODE/gencode.v19.annotation.gtf', # FIXME: mskilab path
                       genes = '~/DB/GENCODE/gencode.v19.genes.gtf', # FIXME: mskilab path
                       tpm_cohort = '~/data/PCAWG/final_release/tpm.cohort.tsv.gz', # FIXME: mskilab path
                       knit_only = FALSE,
                       amp_thresh = 2,
                       del_thresh = 0.5,
                       server = "https://mskilab.com/gGraph/",
                       tumor_type = "",
                       ref = 'hg19',
                       snpeff_config = '~/modules/SnpEff/snpEff.config', # FIXME: this is refering to an mskilab path
                       pmkb_interpretations = system.file('extdata', 'pmkb-interpretations-06-11-2021.csv', package = 'casereport'),
                       deconv = 'epic',
                       overwrite = FALSE,
                       verbose = TRUE,
                       quantile_thresh = 0.05,
                       include_surface = TRUE)
    missing_params = setdiff(names(default_opt), names(opt))
    if (length(missing_params) > 0){
        for (param in missing_params){
            opt[param] = default_opt[[param]]
        }
    }
    ## make sure $genes and $gencode are correct given genome version
    # FIXME: there are a bunch of mskilab paths here
    if (opt$ref == "hg38") {
        message("Using reference: ", opt$ref)
        opt$gencode = "~/DB/GENCODE/hg38/v38/gencode.v38.annotation.gtf"
        opt$genes = "~/DB/GENCODE/hg38/v38/gencode.v38.genes.gtf"
        Sys.setenv(DEFAULT_GENOME = "BSgenome.Hsapiens.UCSC.hg38::Hsapiens")
        opt$cytoband = system.file("extdata", "hg38.cytoband.txt", package = "casereport")
        opt$gencode_gtrack = NA # the gencode gtrack will be produced from the gencode gtf
        opt$chrom_sizes = system.file('extdata', 'hg38.chrom.sizes', package = "casereport")
    } else if (opt$ref == "hg19") {
        message("Using reference: ", opt$ref)
        opt$gencode = "~/DB/GENCODE/gencode.v19.annotation.gtf"
        opt$genes = "~/DB/GENCODE/gencode.v19.genes.gtf"
        Sys.setenv(DEFAULT_GENOME = "BSgenome.Hsapiens.UCSC.hg19::Hsapiens")
        opt$cytoband = system.file("extdata",  "hg19.cytoband.txt", package = "casereport")
        opt$gencode_gtrack = system.file("extdata", "gt.ge.hg19.rds", package = "casereport") ## this is provided
        opt$chrom_sizes = system.file('extdata', 'hg19.broad.chrom.sizes', package = "casereport")
    } else {
        stop("Invalid entry for $ref provided: ", opt$ref)
    }

    saveRDS(opt, paste0(opt$outdir, '/cmd.args.rds'))

    ## copy report.config from opt
    report.config = opt

    ## extra params for knitting
    report.config$set_title = opt$pair

    ## normalize paths for knitting
    report.config$jabba_rds = normalizePath(report.config$jabba_rds)
    report.config$outdir = normalizePath(report.config$outdir)

    ## add gTrack file names to report config
    report.config$coverage_gtrack = paste0(report.config$outdir, "/coverage.gtrack.rds")
    report.config$allele_gtrack = paste0(report.config$outdir, "/agtrack.rds")
    report.config$gencode_gtrack = opt$gencode_gtrack
    if (is.na(report.config$gencode_gtrack)){
        # this gtrack will be produced by create_genes_gtrack
        report.config$gencode_gtrack = paste0(report.config$outdir, "/", "gencode.gtrack.rds")
    }

    if (check_file(opt$drivers))
        report.config$drivers = opt$drivers
    else
        report.config$drivers = NA_character_

    ## add CGC genes file
    report.config$cgc = cgc.fname = system.file("extdata", "cgc.tsv", package = "casereport")

    ## add oncogenes, etc. to report config
    report.config$onc = system.file("extdata", "onc.rds", package = "casereport")
    report.config$tsg = system.file("extdata", "tsg.rds", package = "casereport")
    report.config$surface = system.file("extdata", "surface.rds", package = "casereport")

    ## purity/ploidy plots
    report.config$cn_plot = paste0(report.config$outdir, "/", "cn.pp.png")
    report.config$allele_plot = paste0(report.config$outdir, "/", "allele.scatter.png")

    ## SCNA
    report.config$gene_cn = paste0(report.config$outdir, "/", "genes_cn.rds")
    report.config$driver_scna = paste0(report.config$outdir, '/driver.genes.cnv.txt')
    report.config$surface_scna = paste0(report.config$outdir, '/surface.genes.cnv.txt')
    report.config$scna_gtracks = paste0(report.config$outdir, "/", "cn.gallery.txt")
    report.config$surface_scna_gtracks = paste0(report.config$outdir, "/", "surface.cn.gallery.txt")

    ## SNVS
    report.config$driver_mutations = paste0(report.config$outdir, "/", "driver.mutations.txt")

    ## Germline SNVS/INDELS
    report.config$driver_germline_mutations = paste0(report.config$outdir, "/", "driver.germline.mutations.txt")
    report.config$germline_coding = paste0(report.config$outdir, "/annotated.germline.coding.rds")

    ## SV
    report.config$sv_gtracks = paste0(report.config$outdir, "/", "sv.gallery.txt")

    ## whole genome vis
    report.config$wgs_gtrack_plot = file.path(report.config$outdir, "wgs.gtrack.png")
    report.config$wgs_circos_plot = file.path(report.config$outdir, "wgs.circos.png")

    ## fusions
    report.config$driver_fusions = file.path(report.config$outdir, "fusions.driver.txt")
    report.config$other_fusions = file.path(report.config$outdir, "fusions.other.txt")

    ## RNA expression analyses
    report.config$tpm_quantiles = paste0(report.config$outdir, "/", "tpm.quantiles.txt")
    report.config$rna_change = paste0(report.config$outdir, "/", "rna.change.txt")
    report.config$surface_rna_change = paste0(report.config$outdir, "/", "surface.rna.change.txt")
    report.config$rna_change_all = paste0(report.config$outdir, "/", "rna.change.all.txt")
    report.config$expression_histograms = paste0(report.config$outdir, "/", "expr.histograms.txt")
    report.config$surface_expression_histograms = paste0(report.config$outdir, "/", "surface.expr.histograms.txt")
    report.config$expression_gtracks = paste0(report.config$outdir, "/", "expr.gallery.txt")
    report.config$waterfall_plot = paste0(report.config$outdir, "/", "waterfall.png")
    report.config$rna_change_with_cn = paste0(report.config$outdir, "/", "driver.genes.expr.txt")
    report.config$surface_rna_change_with_cn = paste0(report.config$outdir, "/", "surface.genes.expr.txt")
    report.config$surface_expression_gtracks = paste0(report.config$outdir, "/", "surface.expr.gallery.txt")
    report.config$surface_waterfall_plot = paste0(report.config$outdir, "/", "surface.waterfall.png")

    ## HRD
    report.config$ot = paste0(report.config$outdir, "/ot.rds")
    report.config$ot_log = paste0(report.config$outdir, "/onenesstwoness.log.dat.png")
    report.config$ot_prob = paste0(report.config$outdir, "/onenesstwoness.prop.dat.png")
    report.config$oneness = paste0(report.config$outdir, "/Oneness.png")
    report.config$twoness = paste0(report.config$outdir, "/Twoness.png")

    ## deconstructSigs
    report.config$sig_composition = paste0(report.config$outdir, "/deconstruct_sigs.png")
    report.config$sig_histogram = paste0(report.config$outdir, "/sig.composition.png")

    ## Deconvolution
    report.config$deconv = paste0(report.config$outdir, "/", "deconv_results.txt")

    ## summary
    report.config$summary_stats = paste0(report.config$outdir, "/summary.rds")
    report.config$oncotable = paste0(report.config$outdir, "/oncotable.rds")
    report.config$summaryTable = paste0(report.config$outdir, "/summaryTable.txt")
    report.config$summarySurfaceTable = paste0(report.config$outdir, "/summarySurfaceTable.txt")

    saveRDS(report.config, paste0(report.config$outdir, "/", "report.config.rds"))

    ##################
    ## Start producing some analyses
    ##################

    if (!opt$knit_only) {
        message("Grabbing purity and ploidy")
        jabba = readRDS(opt$jabba_rds)

        if (is.null(report.config$purity)) {
            report.config$purity = jabba$purity
            saveRDS(report.config, paste0(opt$outdir, "/", "report.config.rds"))
        }

        if (is.null(report.config$ploidy)) {
            report.config$ploidy = jabba$ploidy
            saveRDS(report.config, paste0(opt$outdir, "/", "report.config.rds"))
        }
        
        
        message("Preparing gTracks")
        message("Preparing coverage gTrack")
        if (check_file(report.config$coverage_gtrack, overwrite = opt$overwrite, verbose = opt$verbose)){
            cvgt = readRDS(report.config$coverage_gtrack)
        } else {
            cov.file = opt$cbs_cov_rds
            cvgt = covcbs(cov.file, purity = jabba$purity, ploidy = jabba$ploidy, rebin = 5e3,
                          ylab = "CN", y.cap = FALSE, xaxis.chronly = TRUE)
            saveRDS(cvgt, report.config$coverage_gtrack)
        }

        if (check_file(report.config$allele_gtrack, overwrite = opt$overwrite)) {
            agt = readRDS(report.config$allele_gtrack)
        } else {
            message("Checking for hets")
            if (file.good(opt$het_pileups_wgs)) {
                agt = grab.agtrack(opt$het_pileups_wgs,
                                   purity = jabba$purity,
                                   ploidy = jabba$ploidy)
                saveRDS(agt, report.config$allele_gtrack)
            } else {
                message("no het pileups provided, skipping")
                agt = NULL            
            }
        }

        
       if (check_file(report.config$gencode_gtrack, opt$overwrite, opt$verbose)) {
           message("Genes gTrack exists")
       } else {
           tsg = readRDS(report.config$tsg)
           onc = readRDS(report.config$onc)
           # TODO: replace the gene list with the specified driver table
           genes.gt = create_genes_gtrack(genes = c(tsg, onc),
                                          cached.dir = report.config$outdir,
                                          gencode.fname = opt$gencode)
           if (!file.good(report.config$gencode_gtrack)){
               # this is in case that the path specified in report.config is different than the default path generated by track.gencode
               saveRDS(genes.gt, report.config$gencode_gtrack)
           }
       }


        ####################
        ## Events
        ####################
        ## if (file.good(report.config$complex)) {
        ##     gg = readRDS(report.config$complex)
        ## } else if (file.good(opt$complex)) {
        ##     file.copy(opt$complex, report.config$complex)
        ##     gg = gGnome::refresh(readRDS(opt$complex))
        ## } else {
        ##     gg = gGnome::events(gG(jabba = jabba))
        ##     saveRDS(gg, report.config$complex)
        ## }
        ## gg = read

        ## gg$set(purity = jabba$purity)
        ## gg$set(ploidy = jabba$ploidy)
        

        ###################
        ## purity/ploidy QC plots
        ###################
        if (!check_file(report.config$cn_plot, opt$overwrite, verbose = opt$verbose)) {
            message("generating total CN purity/ploidy plots")
            pt = pp_plot(jabba_rds = opt$jabba_rds,
                         cov.fname = opt$cbs_cov_rds,
                         hets.fname = opt$het_pileups_wgs,
                         allele = FALSE,
                         field = "ratio",
                         plot.min = -2,
                         plot.max = 2,
                         bins = 100,
                         verbose = TRUE)
            message("Saving results to: ", normalizePath(report.config$cn_plot))
            ppng(print(pt), filename = report.config$cn_plot, height = 500, width = 500)
        } 

        if (check_file(report.config$allele_plot, opt$overwrite, verbose = opt$verbose)) {
            message("Allele scatter plot already exists")
        } else {
            if (file.good(opt$het_pileups_wgs)) {
                pt = pp_plot(jabba_rds = opt$jabba_rds,
                             cov.fname = opt$cbs_cov_rds,
                             hets.fname = opt$het_pileups_wgs,
                             allele = TRUE,
                             field = "count",
                             plot.min = -2,
                             plot.max = 2,
                             scatter = TRUE,
                             bins = 100,
                             verbose = TRUE)
                ppng(print(pt), filename = report.config$allele_plot, height = 500, width = 500)
            } else {
                message("No hets supplied so no allele CN plot is generated.")
            }
        }


        ## message("Checking for cohort metadata")
        ## if (file.good(opt$cohort_metadata)) {
        ##     message("Metadata found! Reading...")
        ##     meta = data.table::fread(opt$cohort_metadata)
        ## } else {
        ##     message("TPM cohort metadata not supplied.")
        ##     meta = data.table() ## empty data table to avoid existence errors
        ## }
        
        message("Checking for RNA expression input")
        if (check_file(report.config$tpm_quantiles, opt$overwrite, opt$verbose)) {
            message("RNA quantiles analysis already exists")
        } else {
            if (file.good(opt$tpm_cohort) & file.good(opt$tpm)) {

                if (opt$include_surface) {
                    surface = readRDS(report.config$surface)
                } else {
                    surface = c()
                }

                melted.expr = compute_rna_quantiles(tpm = opt$tpm,
                                                    tpm.cohort = opt$tpm_cohort,
                                                    onc = readRDS(report.config$onc),
                                                    tsg = readRDS(report.config$tsg),
                                                    surface = surface,
                                                    id = opt$pair,
                                                    gencode.gtrack = report.config$gencode_gtrack,
                                                    quantile.thresh = opt$quantile_thresh,
                                                    verbose = TRUE)
                cohort=fread(opt$tpm_cohort)
                W=data.table(gene=cohort$"gene",Avg=rowMeans(log10(cohort[,!("gene")]+1)),SD=rowSds(log10(as.matrix(cohort[,!("gene")]+1))))
                melted.expr$zscore=(log10(melted.expr$value+1)-W$Avg)/W$SD
            } else {
                message("RNA input not supplied.")
                melted.expr = data.table(gene = as.character(),
                                         pair = as.character(),
                                         value = as.numeric(),
                                         qt = as.numeric(),
                                         role = as.character(),
                                         direction = as.character(),
                         zscore = as.numeric())

            }
            fwrite(melted.expr, report.config$tpm_quantiles)
        }
        if (check_file(report.config$rna_change, opt$overwrite, opt$verbose) &
            check_file(report.config$rna_change_all, opt$overwrite, opt$verbose)) {
            message("Over/underexpressed driver genes already identified")
        } else {
            melted.expr = data.table::fread(report.config$tpm_quantiles, header = TRUE)
            if (nrow(melted.expr)) {
                rna.change.dt = melted.expr[pair == opt$pair,][(role %like% "ONC" & qt > (1 - opt$quantile_thresh)) |
                                                               (role %like% "SURF" & qt > (1 - opt$quantile_thresh)) |
                                                               (role %like% "TSG" & qt < opt$quantile_thresh)]
                rna.change.all.dt = melted.expr[pair == opt$pair,]
            } else {
                rna.change.dt = melted.expr
                rna.change.all.dt = melted.expr
            }
            fwrite(rna.change.dt[role != 'SURF'], report.config$rna_change)
            fwrite(rna.change.dt[role %like% 'SURF'], report.config$surface_rna_change)
            fwrite(rna.change.all.dt, report.config$rna_change_all)
        }

        message('Calling CNVs for oncogenes and tumor suppressor genes')
        if (check_file(report.config$gene_cn, overwrite = opt$overwrite, verbose = opt$verbose)){
            genes_cn_annotated = readRDS(report.config$gene_cn)
        } else {
            kag_rds = gsub("jabba.simple.rds", "karyograph.rds", opt$jabba_rds)
            nseg = NULL
            if (!is.na(opt$cbs_nseg_rds)){
                if (file.exists(opt$cbs_nseg_rds) & file.size(opt$cbs_nseg_rds) > 0){
                    if (verbose)
                        message('Reading normal segmentation copy number from: ', opt$cbs_nseg_rds)
                    nseg = readRDS(opt$cbs_nseg_rds)
                }
            } else {
                if (file.exists(kag_rds) & file.size(kag_rds) > 0){
                    message('Loading JaBbA karyograph from ', kag_rds)
                    kag = readRDS(kag_rds)
                    if ('ncn' %in% names(mcols(kag$segstats))){
                        nseg = kag$segstats[,c('ncn')]
                    } else {
                        message('JaBbA karyograph does not contain the "ncn" field so CN = 2 will be assumed for the normal copy number of all seqnames.')
                    }
                } else {
                ## # TODO: perhaps we should allow to directly provide a nseg input
                message('No cbs_nseg_rds was provided and the JaBbA karyograph was not found at the expected location (', kag_rds, ') so we will use CN = 2 for the normal copy number of all chromosomes.')
                }
            }

            gg = gG(jabba = opt$jabba_rds)
            pl = gg$meta$ploidy
            pl = ifelse(is.null(pl), 2, pl)
            genes_cn = get_gene_copy_numbers(gg,
                                             gene_ranges = opt$genes,
                                             nseg = nseg,
                                             ploidy = pl,
                                             simplify_seqnames = (opt$ref == "hg19"),
                                             complex.fname = report.config$complex)

            genes_cn_annotated = get_gene_ampdel_annotations(genes_cn,
                                                             amp.thresh = opt$amp_thresh,
                                                             del.thresh = pmax(opt$del_thresh, 1))

            if (file.good(report.config$tpm_quantiles)) {

                message ("Merging SCNAs with RNA expression")

                melted.expr = data.table::fread(report.config$tpm_quantiles, header = TRUE)

                if (nrow(melted.expr)) {
                    genes_cn_annotated = merge.data.table(genes_cn_annotated,
                                                          melted.expr[pair == opt$pair,
                                                                      .(gene_name = gene,
                                                                        expr.quantile = qt,
                                                                        expr.value = value)],
                                                          by = "gene_name",
                                                          all.x = TRUE)

                    genes_cn_annotated[expr.quantile < opt$quantile_thresh, expr := "under"]
                    genes_cn_annotated[expr.quantile > (1 - opt$quantile_thresh), expr := "over"]
                } else {
                    genes_cn_annotated[, ":="(expr.quantile = NA, expr.value = NA, expr = NA)]
                }
            } else {
                genes_cn_annotated[, ":="(expr.quantile = NA, expr.value = NA, expr = NA)]
            }

            ## save annotated genes
            saveRDS(genes_cn_annotated, report.config$gene_cn)
        }

        if (check_file(report.config$driver_scna,  overwrite = opt$overwrite)) {
            message("Driver gene SCNAs already identified")
        } else {

            ## read table of SCNAs
            if (file.good(report.config$gene_cn)) {
                genes_cn_annotated = readRDS(report.config$gene_cn)
            } else {
                stop("SCNA analysis failed.")
            }
                
            if (genes_cn_annotated[, .N] > 0){

                ## read oncogenes and TSGs from config
                onc = readRDS(report.config$onc)
                tsg = readRDS(report.config$tsg)

                if (!is.null(opt$include_surface) && opt$include_surface) {
                    surface = readRDS(report.config$surface)

                    driver.genes_cn = genes_cn_annotated[(cnv == "amp" & gene_name %in% onc) |
                                                         ((cnv %in% c("hetdel", "homdel") | loh == "loh" ) & gene_name %in% tsg) |
                                                         (cnv == "amp" & gene_name %in% surface)]
                    driver.genes_cn[gene_name %in% surface, surface := TRUE]
                } else {
                    driver.genes_cn = genes_cn_annotated[(cnv == "amp" & gene_name %in% onc) |
                                                         ((cnv %in% c("hetdel", "homdel") | loh == "loh" ) & gene_name %in% tsg)]
                }
                
                # add whether gene is TSG or ONCO
                driver.genes_cn[gene_name %in% onc, annot := "ONC"]
                driver.genes_cn[gene_name %in% tsg, annot := "TSG"]

                fields = c("gene_name", "annot", "surface",
                           "cnv", "loh","expr", "min_cn",
                           "max_cn", "min_normalized_cn", "max_normalized_cn",
                           "expr.value", "expr.quantile",
                           "seqnames", "start", "end", "width", "ev.id", "ev.type")
                if ('cn.low' %in% names(genes_cn_annotated) && 'cn.high' %in% names(genes_cn_annotated)){
                    # include allelic CN in the table
                    fields = c("gene_name", "annot", "surface",
                           "cnv", "loh", "expr", "min_cn", 'cn.high', 'cn.low',
                           "max_cn", "min_normalized_cn", "max_normalized_cn",
                           "expr.value", "expr.quantile",
                           "seqnames", "start", "end", "width", "ev.id", "ev.type")
                }

                cn.fields = intersect(fields, names(driver.genes_cn))
                
                if(length(unique(driver.genes_cn$gene_name))<nrow(driver.genes_cn)){
                    toCheck = names(table(driver.genes_cn$gene_name)[table(driver.genes_cn$gene_name)>1])
                    for(i in 1:length(toCheck)){
                        geneLines = driver.genes_cn[gene_name==toCheck[i],]
                        geneLines = geneLines[,length:= end-start] 
                        geneLines = geneLines[order(length, decreasing=TRUE),]
                        driver.genes_cn = driver.genes_cn[gene_name!=toCheck[i] | (start==geneLines$start[1] & end==geneLines$end[1] & seqnames==geneLines$seqnames[1]),]
                        if(length(unique(geneLines$seqnames)) < nrow(geneLines)){
                            message(paste0("For gene ",toCheck[i]," multiple lines found not from the same chromosome. Retaining line with longest GRange. Check input if needed."))
                            }else{
                                message(paste0("For gene ",toCheck[i]," multiple lines found. Retaining line with longest GRange. Check input if needed."))
                                }
                        }
                    }
                
                if (!is.null(opt$include_surface) && opt$include_surface) {
                    fwrite(driver.genes_cn[is.na(surface) | !is.na(annot), ..cn.fields], report.config$driver_scna)
                    fwrite(driver.genes_cn[surface == TRUE & is.na(annot), ..cn.fields], report.config$surface_scna)
                } else {
                    fwrite(driver.genes_cn[, ..cn.fields], report.config$driver_scna)
                }
            }
        }

        if (check_file(report.config$rna_change_with_cn, opt$overwrite, opt$verbose)) {
            message("Driver gene expression changes already identified")
        } else {

            if (file.good(report.config$rna_change_all)) {
                rna.change.dt = fread(report.config$rna_change_all, header = TRUE)
            } else {
                stop("RNA expression analysis failed")
            }
            
            if (file.good(report.config$gene_cn)) {
                genes_cn_annotated = readRDS(report.config$gene_cn)
            } else {
                stop("SCNA analysis failed")
            }

            fields = c("gene_name", "annot", "min_cn",
                       "max_cn", "min_normalized_cn", "max_normalized_cn",
                       "seqnames", "start", "end", "width", "ev.id", "ev.type")
            cn.fields = intersect(fields, colnames(genes_cn_annotated))

            driver.genes.expr.dt = merge.data.table(rna.change.dt[, .(gene, expr = direction,
                                                                      expr.value = value,
                                                                      expr.quantile = qt, role, zscore)],
                                                    genes_cn_annotated[, ..cn.fields],
                                                    by.x = "gene",
                                                    by.y = "gene_name",
                                                    all.x = TRUE)
            fwrite(driver.genes.expr.dt[role != 'SURF' & ((role %like% 'ONC' & expr.quantile>0.95) | (role %like% 'TSG' & expr.quantile<0.05))], report.config$rna_change_with_cn)
            fwrite(driver.genes.expr.dt[role %like% 'SURF' & (expr.quantile>0.95 | expr.quantile<0.05)], report.config$surface_rna_change_with_cn)
        }


        if (check_file(report.config$wgs_gtrack_plot, opt$overwrite, opt$verbose)) {
            message("Whole-genome gTrack plot exists")
        } else {
            message("Generating whole-genome gTrack plots")

            gt = wgs_gtrack(report.config$jabba_rds,
                            report.config$coverage_gtrack,
                            report.config$allele_gtrack)
            sl = intersect(names(seqlengths(gt@data[[1]])),
                           names(seqlengths(gt@data[[2]])))
            if (length(gt@data) > 2) {
                sl = intersect(sl, names(seqlengths(gt@data[[3]])))
            }
            plot.chrs = grep("(^(chr)*[0-9XY]+$)", sl, value = TRUE)
            
        if (length(plot.chrs) == 0){
                stop('None of the sequences in your genome graph matches the default set of sequences.')
            }

            if (opt$ref == "hg19") {

                std.chrs = c(as.character(1:22), "X", "Y")
                std.chrs = std.chrs[which(std.chrs %in% plot.chrs)]
            } else {

                std.chrs = paste0("chr", c(as.character(1:22), "X", "Y"))
                std.chrs = std.chrs[which(std.chrs %in% plot.chrs)]

            }

            ## temporary fix??
            ## why is there an error here?
            std.chrs = gsub("chr", "", std.chrs)
                

            ppng(plot(gt, std.chrs),
                 filename = report.config$wgs_gtrack_plot,
                 height = 1000,
                 width = 5000)
            
        } 

        if (!check_file(report.config$wgs_circos_plot, opt$overwrite, opt$verbose)) {
            message("Generating whole-genome circos plot")

            tryCatch( expr = {
                ppng(wgs.circos(jabba_rds = report.config$jabba_rds,
                                cov_fn = report.config$cbs_cov_rds,
                                transform = TRUE,
                                field = "ratio",
                                link.h.ratio = 0.1,
                                cex.points = 0.1,
                                cytoband.path = opt$cytoband,
                                chr.sub = (opt$ref == "hg19")),
                     filename = report.config$wgs_circos_plot,
                     height = 850,
                     width = 1000)
            }, error = function(e) {
                if (file.exists(report.config$wgs_circos_plot)) {
                    file.remove(report.config$wgs_circos_plot)
                }
                message(e)
                stop("An error was encountered during circos plot creation")
                return(NA)
            })
            
        } else {
            message("Whole genome circos plot already exists")
        }

        ## ##################
        ## Driver gene mutations
        ## ##################
        if (check_file(report.config$driver_mutations, opt$overwrite, opt$verbose)) {
            message("Driver mutation analysis already exists.")
        } else {

            ## ################
            ## read cgc gene list
            ## ################

            ## check if SNPEFF needs to be run for SNVs
            if (file.good(opt$snpeff_snv_bcf)) {
                message("snpEff snv supplied. Filtering for oncogenes/TSGs.")

                ## grab annotations
                snv.dt = filter.snpeff(vcf = opt$snpeff_snv_bcf,
                                       gngt.fname = report.config$gencode_gtrack,
                                       onc = report.config$onc,
                                       tsg = report.config$tsg,
                                       drivers.fname = report.config$drivers,
                                       ref.name = opt$ref,
                                       type = "snv",
                                       verbose = TRUE)

            } else {
            
                if (file.good(opt$snv_vcf)) {

                    message("Running SNV SnpEff")
                    snpeff.libdir = normalizePath(system.file("extdata", "SnpEff_module", package = "casereport"))
                    snpeff.ref = opt$ref
                    snpeff.vcf = normalizePath(opt$snv_vcf)
                    snpeff.outdir = normalizePath(file.path(opt$outdir, "snpeff", "snv"))
                    snpeff.config = normalizePath(opt$snpeff_config)

                    snpeff.cm = paste("sh", file.path(snpeff.libdir, "run.sh"),
                                      snpeff.libdir,
                                      snpeff.ref,
                                      snpeff.vcf,
                                      snpeff.outdir,
                                      snpeff.config)

                    system(snpeff.cm)

                    report.config$snpeff_snv_bcf = file.path(snpeff.outdir, "annotated.bcf")
                    saveRDS(report.config, "report.config.rds")

                    ## grab annotations
                    snv.dt = filter.snpeff(vcf = report.config$snpeff_snv_bcf,
                                           gngt.fname = report.config$gencode_gtrack,
                                           onc = report.config$onc,
                                           tsg = report.config$tsg,
                                           drivers.fname = report.config$drivers,
                                           ref.name = opt$ref,
                                           type = "snv",
                                           verbose = TRUE)
                    
                } else {
                    message("SNV vcf does not exist or is empty")
                    snv.dt = NULL
                }

            }

            ## check if SNPEFF needs to be run for indels
            if (file.good(report.config$snpeff_indel_bcf)) {
                message("snpEff snv supplied. Filtering for oncogenes/TSGs.")

                ## grab annotations
                indel.dt = filter.snpeff(vcf = report.config$snpeff_indel_bcf,
                                       gngt.fname = report.config$gencode_gtrack,
                                       onc = report.config$onc,
                                       tsg = report.config$tsg,
                                       drivers.fname = report.config$drivers,
                                       ref.name = opt$ref,
                                       type = "indel",
                                       verbose = TRUE)
            } else {

            
                if (file.good(opt$indel_vcf)) {

                    message("Running SNV SnpEff")
                    snpeff.libdir = normalizePath(system.file("extdata", "SnpEff_module", package = "casereport"))
                    snpeff.ref = opt$ref
                    snpeff.vcf = normalizePath(opt$indel_vcf)
                    snpeff.outdir = normalizePath(file.path(opt$outdir, "snpeff", "indel"))
                    snpeff.config = normalizePath(opt$snpeff_config)

                    snpeff.cm = paste("sh", file.path(snpeff.libdir, "run.sh"),
                                      snpeff.libdir,
                                      snpeff.ref,
                                      snpeff.vcf,
                                      snpeff.outdir,
                                      snpeff.config)

                    system(snpeff.cm)

                    report.config$snpeff_indel_bcf = file.path(snpeff.outdir, "annotated.bcf")
                    saveRDS(opt, "cmd.args.rds")

                    ## grab annotations
                    indel.dt = filter.snpeff(vcf = report.config$snpeff_indel_bcf,
                                           gngt.fname = report.config$gencode_gtrack,
                                           onc = report.config$onc,
                                           tsg = report.config$tsg,
                                           drivers.fname = report.config$drivers,
                                           ref.name = opt$ref,
                                           type = "indel",
                                           verbose = TRUE)
                    
                } else {
                    message("Indel vcf does not exist or is empty")
                    indel.dt = NULL
                }
            }

            ## browser()
            if (!is.null(snv.dt) & !is.null(indel.dt)) {
                driver.mutations.dt = rbind(snv.dt, indel.dt, use.names =  TRUE, fill = TRUE) %>% unique
            } else if (!is.null(snv.dt)) {
                driver.mutations.dt = snv.dt
            } else {
                driver.mutations.dt = data.table()
            }

            if (driver.mutations.dt[,.N] > 0) {
                mutation.tier.driver.name = 'PMKB' # we can later switch to a different annotator or cycle through multiple annotators if we wish to
                mutation.tier.annotator = mutation.tier.annotators[['PMKB']]
                driver.mutations.dt = mutation.tier.annotator(driver.mutations.dt)

                ## add ONC and TSG annotations
                ## notice that if the mutation.tier.parser added "gene.type" annotations then this step could override some of these
                cgc = fread(report.config$cgc)
                tsg = readRDS(report.config$tsg)## cgc[get('Is Tumor Suppressor Gene') == 'Yes', get('Hugo Symbol')]
                onc = readRDS(report.config$onc)## cgc[get('Is Oncogene') == 'Yes', get('Hugo Symbol')]
                
                ## parsing drivers
                ## this can be a text file that contains
                ## either a 1-d vector of gene names
                ## in this case, the gene.type later on is basename(opt$drivers)
                ## or it can be a 2-column table
                ## 1st column is gene name,
                ## and 2nd col is annotation for gene.type
                ## header can be commented out via "#"
                txt.ptrn = "(.tsv|.txt|.csv|.tab)(.xz|.bz2|.gz){0,}$"
                no_ext <- function (x, compression = FALSE) 
                {
                    if (compression) 
                        x <- sub("[.](gz|bz2|xz)$", "", x)
                    sub("([^.]+)\\.[[:alnum:]]+$", "\\1", x)
                }
                if (check_file(opt$drivers)) {
                    if (grepl(".rds$", opt$drivers, ignore.case = TRUE)) {
                        drivers = readRDS(opt$drivers)
                        if (!(is.character(drivers) ||
                              inherits(drivers, c("matrix", "data.frame"))))
                            drivers = NULL
                            
                    } else if (grepl(txt.ptrn, opt$drivers)) {
                        drivers = fread(opt$driver)
                    }

                    if (!is.null(drivers)) {

                        if (NCOL(drivers) == 1) {
                            ## annotating drivers by file name
                            drivers = as.data.frame(drivers)
                            drivers[[2]] = no_ext(basename(opt$drivers))  
                        } 
                        ## presuming the formatting above
                        drivers = drivers[,c(1,2), drop = F]
                        colnames(drivers) = c("gene", "driver.type")
                        driver.mutations.dt$ord345987234 = 1:NROW(driver.mutations.dt)
                        driver.mutations.dt = merge(driver.mutations.dt,
                                                    drivers,
                                                    by = "gene",
                                                    all.x = TRUE)
                        driver.mutations.dt = driver.mutations.dt[order(ord345987234)]
                        driver.mutations.dt$ord345987234 = NULL
                    }

                }


                driver.mutations.dt[gene %in% tsg, gene.type := 'TSG']
                driver.mutations.dt[gene %in% onc, gene.type := 'ONC']
                                            # some genes are annotated in CGC as both
                driver.mutations.dt[gene %in% onc & gene %in% tsg, gene.type := 'ONC|TSG']
                cols = c("gene", "gene.type", "driver.type", "Tier", "seqnames", "pos", "impact", "REF", "ALT", "variant.p", "vartype", "annotation")
                driver.mutations.dt = driver.mutations.dt[,cols[cols %in% colnames(driver.mutations.dt)],
                                    with = FALSE]
            }

            

            fwrite(driver.mutations.dt, report.config$driver_mutations)
        }

        ## ##################
        ## Driver gene germline mutations
        ## ##################
        if (check_file(report.config$driver_germline_mutations, opt$overwrite, opt$verbose)) {
            message("Driver germline mutation analysis already exists.")
        } else {
            germline_dt = process_germline_muts(report.config$germline_coding,
                                                driver_germline_mutations = report.config$driver_germline_mutations,
                                                name = report.config$pair,
                                                germline_snpeff_snv_bcf = report.config$germline_snpeff_snv_bcf)
            message('Done proccessing germline mutations')
        }

        ## ##################
        ## Fusions
        ## ##################
        if (check_file(report.config$driver_fusions, opt$overwrite, opt$verbose) &
            check_file(report.config$other_fusions, opt$overwrite, opt$verbose)) {
            message("Fusions analysis already exists!")
        } else {
            
            ## ## if opt$fusions not available, run it
            ## if (check_file(report.config$fusions, overwrite = FALSE, verbose = opt$verbose)) {
            ##     fu = readRDS(opt$fusions)
            ## } else {
                
            ##     warning('Fusions were not supplied! Computing fusions can be time/memory intensive')

            ##     if (!exists("gff")){
                    
            ##         gff = skidb::read_gencode(fn = opt$gencode)
            ##     }
                
            ##     fu = fusions(gg, gff)
            ##     saveRDS(fu, paste0(opt$outdir, "/fusions.rds"))
            ##     opt$fusions = paste0(opt$outdir, "/fusions.rds")
            ##     saveRDS(opt, "cmd.args.rds")

            ## }
                
            message("Preparing fusion genes report")
            fusions.slickr.dt = fusion.wrapper(fusions.fname = opt$fusions,
                                               complex.fname = report.config$complex,
                                               cvgt.fname = report.config$coverage_gtrack,
                                               agt.fname = report.config$allele_gtrack,
                                               cgc.fname = report.config$cgc,
                                               gngt.fname = report.config$gencode_gtrack,
                                               pad = 0.5,
                                               height = 900,
                                               width = 1000,
                                               server = opt$server,
                                               pair = opt$pair,
                                               outdir = opt$outdir)

            ## make sure data table is not empty
            if (nrow(fusions.slickr.dt) == 0) {

                ## write empty data table, rmd file will deal with this.
                fwrite(fusions.slickr.dt, report.config$driver_fusions)
                fwrite(fusions.slickr.dt, report.config$other_fusions)
                
            } else {

                ## save data table for drivers and non-drivers separately
                fwrite(fusions.slickr.dt[driver == TRUE,], report.config$driver_fusions)
                fwrite(fusions.slickr.dt[driver == FALSE,], report.config$other_fusions)

            }
        }
        
        
        ## ##################
        ## SV gallery code
        ## ##################
        if (check_file(report.config$sv_gtracks, opt$overwrite, opt$verbose)) {
            message("SV gallery files already exist")
        } else {
            message("Preparing SV gallery")

            sv.slickr.dt = gallery.wrapper(complex.fname = report.config$complex,
                                           background.fname = system.file("extdata", "sv.burden.txt", package = "casereport"),
                                           cvgt.fname = report.config$coverage_gtrack,
                                           gngt.fname = report.config$gencode_gtrack,
                                           agt.fname = report.config$allele_gtrack,
                                           server = opt$server,
                                           pair = opt$pair,
                                           pad = 0.5,
                                           height = 900, ## png image height
                                           width = 1000, ## png image width
                                           outdir = opt$outdir)
            
            fwrite(sv.slickr.dt, report.config$sv_gtracks)
        } 

        ## #################
        ## CN gallery
        ## #################
        if (!check_file(report.config$scna_gtracks, overwrite = opt$overwrite)) {
            message("preparing CN gallery")
            ## grab ploidy
            pl = readRDS(opt$jabba_rds)$ploidy
            cn.gallery.dir = paste0(opt$outdir, '/cn_gallery')
            dir.create(cn.gallery.dir)
            cn.slickr.dt = cn.plot(drivers.fname = report.config$driver_scna,
                                   report.config$complex,
                                   cvgt.fname = report.config$coverage_gtrack,
                                   gngt.fname = report.config$gencode_gtrack,
                                   agt.fname = report.config$allele_gtrack,
                                   server = opt$server,
                                   pair = opt$pair,
                                   gg.rds=report.config$jabba_rds,
                                   amp.thresh = opt$amp_thresh,
                                   ploidy = pl,
                                   pad = 0.5,
                                   height = 900,
                                   width = 1000,
                                   overwrite = opt$overwrite,
                                   outdir = cn.gallery.dir)
            fwrite(cn.slickr.dt, report.config$scna_gtracks)
        } else {
            message("CN gallery files already exist")
        }
        
        if (!check_file(report.config$surface_scna_gtracks, overwrite = opt$overwrite)) {
            message("preparing Surface CN gallery")
            ## grab ploidy
            pl = readRDS(opt$jabba_rds)$ploidy
            surface.cn.gallery.dir = paste0(opt$outdir, '/surface_cn_gallery')
            dir.create(surface.cn.gallery.dir)
            surface.cn.slickr.dt = cn.plot(drivers.fname = report.config$surface_scna,
                                   complex.fname = report.config$complex,
                                   cvgt.fname = report.config$coverage_gtrack,
                                   gngt.fname = report.config$gencode_gtrack,
                                   agt.fname = report.config$allele_gtrack,
                                   server = opt$server,
                                   pair = opt$pair,
                                   gg.rds=report.config$jabba_rds,       
                                   amp.thresh = opt$amp_thresh,
                                   ploidy = pl,
                                   pad = 0.5,
                                   height = 900,
                                   width = 1000,
                                   overwrite = opt$overwrite,
                                   outdir = surface.cn.gallery.dir)
            fwrite(surface.cn.slickr.dt, report.config$surface_scna_gtracks)
        } else {
            message("CN gallery files already exist")
        }

        ## histograms of over and under-expressed genes
        if (check_file(report.config$expression_histograms, opt$overwrite, opt$verbose)) {
            message("expression histograms already exist")
        } else {
            expr.histograms.dt = plot_expression_histograms(report.config$rna_change,
                                                            report.config$tpm_quantiles,
                                                            pair = opt$pair,
                                                            outdir = opt$outdir,
                                                            res = 150)
            fwrite(expr.histograms.dt, report.config$expression_histograms)
        }
        if (check_file(report.config$surface_expression_histograms, opt$overwrite, opt$verbose)) {
            message("expression histograms already exist for surface genes")
        } else {
            surface.expr.histograms.dt = plot_expression_histograms(report.config$surface_rna_change,
                                                            report.config$tpm_quantiles,
                                                            pair = opt$pair,
                                                            outdir = opt$outdir,
                                                            res = 150)
            fwrite(surface.expr.histograms.dt, report.config$surface_expression_histograms)
        }

        ## gTrack of over and under-expressed genes
        if (check_file(report.config$expression_gtracks, overwrite = opt$overwrite, verbose = opt$verbose)) {
            message("gTracks of over/underexpressed genes already exist")
        } else {
            message("preparing expression gallery")
            expr.gallery.dir = paste0(opt$outdir, '/expr_gallery')
            dir.create(expr.gallery.dir)
            expr.slickr.dt = cn.plot(drivers.fname = report.config$rna_change,
                                     complex.fname = report.config$complex,
                                     cvgt.fname = report.config$coverage_gtrack,
                                     gngt.fname = report.config$gencode_gtrack,
                                     agt.fname = report.config$allele_gtrack,
                                     server = opt$server,
                                     pair = opt$pair,
                                     gg.rds = report.config$jabba_rds,
                                     amp.thresh = opt$amp_thresh,
                                     ploidy = report.config$ploidy,
                                     pad = 0.5,
                                     height = 1600,
                                     width = 1000,
                                     overwrite = opt$overwrite,
                                     outdir = expr.gallery.dir)
            fwrite(expr.slickr.dt, report.config$expression_gtracks)
        }
        ## gTrack of over and under-expressed surface genes
        if (check_file(report.config$surface_expression_gtracks, overwrite = opt$overwrite, verbose = opt$verbose)) {
            message("gTracks of over/underexpressed surface genes already exist")
        } else {
            message("preparing expression gallery for surface genes")
            expr.gallery.dir = paste0(opt$outdir, '/expr_gallery')
            dir.create(expr.gallery.dir)
            surface.expr.slickr.dt = cn.plot(drivers.fname = report.config$surface_rna_change,
                                     complex.fname = report.config$complex,
                                     cvgt.fname = report.config$coverage_gtrack,
                                     gngt.fname = report.config$gencode_gtrack,
                                     agt.fname = report.config$allele_gtrack,
                                     server = opt$server,
                                     pair = opt$pair,
                                     gg.rds = report.config$jabba_rds,
                                     amp.thresh = opt$amp_thresh,
                                     ploidy = report.config$ploidy,
                                     pad = 0.5,
                                     height = 1600,
                                     width = 1000,
                                     overwrite = opt$overwrite,
                                     outdir = expr.gallery.dir)
            fwrite(surface.expr.slickr.dt, report.config$surface_expression_gtracks)
        }

        ## expression waterfall plot
        if (check_file(report.config$waterfall_plot, overwrite = opt$overwrite, verbose = opt$verbose)) {
            message("Waterfall plot already exists")
        } else {
            if (file.good(opt$tpm) & file.good(opt$tpm_cohort)) {
                message("Generating waterfall plot")
                pt = rna.waterfall.plot(melted.expr.fn = report.config$tpm_quantiles,
                                        rna.change.fn = report.config$rna_change,
                                        pair = opt$pair)
                ppng(print(pt), filename = report.config$waterfall_plot,
                     width = 1600, height = 1200, res = 150)
            } else {
                message("Skipping waterfall plot - RNA expression not supplied")
            }
        }
        ## surface genes expression waterfall plot
        if (check_file(report.config$surface_waterfall_plot, overwrite = opt$overwrite, verbose = opt$verbose)) {
            message("Waterfall plot for surface genes already exists")
        } else {
            if (file.good(opt$tpm) & file.good(opt$tpm_cohort)) {
                message("Generating waterfall plot for surface genes")
                pt = rna.waterfall.plot(melted.expr.fn = report.config$tpm_quantiles,
                                        rna.change.fn = report.config$surface_rna_change,
                                        pair = opt$pair)
                ppng(print(pt), filename = report.config$surface_waterfall_plot,
                     width = 1600, height = 1200, res = 150)
            } else {
                message("Skipping waterfall plot for surface genes - RNA expression not supplied")
            }
        }

        ## ##################
        ## deconstructSigs composition plot
        ## ##################
        if (check_file(report.config$sig_composition, opt$overwrite, opt$verbose)) {
            message("Signatures composition plot already exists")
        } else {
            if (file.good(opt$deconstruct_sigs)) {
                message("Creating signatures composition plot from deconstructSigs input")
                sig = readRDS(opt$deconstruct_sigs)
                png(filename = report.config$sig_composition, width = 1000, height = 1000)
                deconstructSigs::plotSignatures(sig)
                dev.off()
            } else {
                message("deconstructSigs not supplied. skipping signatures composition")
            }
        }

        ######################
        ## deconstructSigs histogram
        ######################
        if (check_file(report.config$sig_histogram, overwrite = opt$overwrite, verbose = opt$verbose)) {
            message("Signatures histogram plot already exists")
        } else {

            if (file.good(opt$deconstruct_variants)) {

                ## identify correct background type
                sig.fn = system.file("extdata", "all.signatures.txt", package = "casereport")
                background.type = "Cell cohort"
                if (file.good(opt$sigs_cohort)){
                    sig.fn = opt$sigs_cohort
                    background.type = "supplied" ## what is the background dist, used for plot title
                } else {
                    if (!is.null(opt$tumor_type) & !is.na(opt$tumor_type)) {

                        ## tumor type specific signature
                        tumor.sig.fn = system.file("extdata", paste0(opt$tumor_type, ".signatures.txt"), package = "casereport")

                        ## check if file exists for specific tumor type
                        if (file.exists(tumor.sig.fn)) {
                            message("Using supplied background signature burden for tumor type: ", opt$tumor_type)
                            sig.fn = tumor.sig.fn
                            background.type = opt$tumor_type
                        } else {
                            message("Using background signature burden for whole Cell cohort")
                        }
                    } else {
                        message("Using background signature burden for whole Cell cohort")
                    }
                }
                sigMet=fread(system.file("extdata", "sig.metadata.txt", package = "casereport"), sep="\t")
                sigbar = deconstructsigs_histogram(sigs.fn = opt$deconstruct_variants,
                                                   sigs.cohort.fn = sig.fn,
                                                   id = opt$pair,
                                                   cohort.type = background.type,
                                                   sigMet=sigMet,
                                                   outdir=opt$outdir)
                ppng(print(sigbar),
                     filename = report.config$sig_histogram,
                     height = 800, width = 800, res = 150)

                presentSigs=fread(file.path(opt$outdir,"Sig.csv"))
                presentSigs$pair=NULL
                thisMet=sigMet[sigMet$Signature %in% presentSigs$Signature,]
                metTable=merge(presentSigs,thisMet,by="Signature")
                metTable=metTable[,c("Signature","Mutational.process","sig_count","perc")]
                colnames(metTable)=c("Signature","Mutational.process","sig_count","quantile")
                metTable=metTable[order(-metTable$quantile), ]
                fwrite(metTable, file.path(opt$outdir,"signatureMetadata.csv"))
                
            } else {
                message("deconstructSigs output not supplied.")
            }
        }
                

        ## ##################
        ## HRDetect results
        ## 
        ## ##################
        # FIXME: should probably check for the existence of the plots and not just the RDS file
        if (!file.good(paste0(opt$outdir, "/hrdetect.rds")) | opt$overwrite){
            if (file.good(opt$hrd_results)){
                hrd.res = readRDS(opt$hrd_results)
                ## melt the input data matrix
                hrd.dat = as.data.table(hrd.res$data_matrix) %>% data.table::melt()
                hrd.dat[, pair := opt$pair]
                ## melt the output results
                hrd.out = as.data.table(hrd.res$hrdetect_output) %>% data.table::melt()
                hrd.out[, pair := opt$pair]

                hrd = merge.data.table(hrd.out, hrd.dat[, .(variable, data = value)], by = "variable")

                ## original training data
                hrd_cohort = fread(system.file("extdata", "hrdetect.og.txt", package = "casereport"))
                hrd_cohort = data.table::melt(hrd_cohort, id.vars = c("pair", "is.hrd"))
                ## if (opt$pair %in% hrd_cohort$pair){
                hrd_cohort = rbindlist(list(hrd_cohort[pair!=opt$pair], hrd.dat), fill = TRUE)
                ## }

                hrd_cohort[, range(value), by = .(variable, is.hrd)]

                hrd.yes = hrd.out[variable=="Probability", value>0.7]

                ## these are the dimensions that need to be logged
                ldat = hrd_cohort[variable != "del.mh.prop"]
                ldat[, variable := factor(variable, levels = setdiff(levels(variable), "del.mh.prop"))]
                ## plot the distribution of raw count
                hrd.plot = ggplot(ldat, aes(x = value, y = variable)) +
                    geom_density_ridges(aes(point_colour = is.hrd,
                                            fill = is.hrd),
                                        bandwidth = 0.1,
                                        alpha = 0.5,
                                        scale = 0.9,
                                        rel_min_height = 0.01,
                                        color = NA,
                                        jittered_points = TRUE,
                                        position = position_points_jitter(width = 0.01, height = 0),
                                        point_shape = '|',
                                        point_size = 3,
                                        point_alpha = 0.3) +
                    scale_discrete_manual("point_colour", values = binary.cols) +
                    scale_fill_manual(values = binary.cols) +
                    geom_segment(data = ldat[pair==opt$pair],
                                 aes(x = value, xend = value,
                                     y = as.numeric(variable), yend = as.numeric(variable) + 0.9),
                                 color = ifelse(hrd.yes, "#D7261E", "#1f78b4"),
                                 alpha = 1, lwd = 3) +
                    scale_x_continuous(trans = "log1p",
                                       breaks = c(0, 1, 10, 100, 1000, 10000, 100000),
                                       labels = c(0, 1, 10,
                                                  expression(10^2),
                                                  expression(10^3),
                                                  expression(10^4),
                                                  expression(10^5)),
                                       limits = c(0, 100000),
                                       expand = c(0, 0)) +
                    labs(title = paste0("HRDetect features compared to Davies et al. 2017"), x = "Burden") +
                    theme_minimal() +
                    theme(legend.position = "none",
                          title = element_text(size = 20, family = "sans"),
                          axis.title = element_text(size = 20, family = "sans"),
                          axis.text.x = element_text(size = 15, family = "sans"),
                          axis.text.y = element_text(size = 20, family = "sans"))

                ## these are the dimensions that need to be logged
                rdat = hrd_cohort[variable == "del.mh.prop"]
                rdat[, variable := factor(variable, levels = "del.mh.prop")]
                ## plot the distribution of raw count
                hrd.plot.2 = ggplot(rdat, aes(x = value, y = variable)) +
                    geom_density_ridges(aes(point_colour = is.hrd,
                                            fill = is.hrd),
                                        bandwidth = 0.1,
                                        alpha = 0.5,
                                        scale = 0.9,
                                        rel_min_height = 0.01,
                                        color = NA,
                                        jittered_points = TRUE,
                                        position = position_points_jitter(width = 0.01, height = 0),
                                        point_shape = '|',
                                        point_size = 3,
                                        point_alpha = 0.3) +
                    scale_discrete_manual("point_colour", values = binary.cols, labels = NULL) +
                    scale_fill_manual(values = binary.cols,
                                      labels = c("Non HR deficient", "HR deficient")) +
                    ## scale_y_discrete(labels = hrdetect.dims) +
                    labs(x = "Proportion") +
                    guides(point_colour = FALSE) +
                    geom_segment(data = rdat[pair==opt$pair],
                                 aes(x = value, xend = value,
                                     y = as.numeric(variable), yend = as.numeric(variable) + 3),
                                 color = ifelse(hrd.yes, "#D7261E", "#1f78b4"),
                                 alpha = 1, lwd = 3) +
                    theme_minimal() +
                    theme(legend.position = "bottom",
                          legend.text = element_text(size=20),
                          title = element_text(size = 20, family = "sans"),
                          axis.title = element_text(size = 20, family = "sans"),
                          axis.text.x = element_text(size = 15, family = "sans"),
                          axis.text.y = element_text(size = 20, family = "sans"))

                ## finally draw the output probability in a waterfall plot
                ## draw the plots
                png(paste0(opt$outdir, "/hrdetect.log.dat.png"), width = 800, height = 800)
                print(hrd.plot)
                dev.off()

                png(paste0(opt$outdir, "/hrdetect.prop.dat.png"), width = 800, height = 200)
                print(hrd.plot.2)
                dev.off()

                saveRDS(hrd.out, paste0(opt$outdir, "/hrdetect.rds"))
            }
        } else {
            message("hrdetect output already exists")
        }

        ## ##################
        ## Oneness / Twoness
        ## 
        ## ##################
        all.ot = check_file(report.config$ot_log, opt$overwrite) &
            check_file(report.config$ot_prob, opt$overwrite) &
            check_file(report.config$oneness, opt$overwrite) &
            check_file(report.config$twoness, opt$overwrite)
        if (!all.ot){
            if (file.good(opt$ot_results)) {
                message('Processing oneness_twoness')
                ot.res = readRDS(opt$ot_results)
                ot = merge(list(ot.res$expl_variables),
                           list(ot.res$ot_scores))
                ot$`.` = NULL
                ot$"NA." = NULL
                ## ot = data.table::melt(ot)
                ot$pair = opt$pair
                tx = paste0("ot[,c(", paste(!names(ot) %in% c("qrdup", "qrdel", "tib"), collapse = ","), "),drop=F]")
                ot = eval(parse(text = tx)) ## this is just for robustness
                ## ## melt the input data matrix
                ## ot.dat = as.data.table(ot.res$data_matrix) %>% data.table::melt()
                ## ot.dat[, pair := opt$pair]
                ## ## melt the output results
                ## ot.out = as.data.table(ot.res$expl_variables)
                ## ot.out$`.` = NULL
                ## ot.out = data.table::melt(ot.out)
                ## ot.out[, pair := opt$pair]
                ## ot.out = ot.out[!variable %in% c("qrdup", "qrdel", "tib")]
                saveRDS(ot, paste0(opt$outdir, "/ot.rds"))

                ## ot = merge(ot.out, ot.dat[, .(variable, data = value)], by = "variable")

                ## original training data
                ot_cohort = readRDS(system.file("extdata", "ot_scores_cohort.rds", package = "casereport"))

                ## if (opt$pair %in% hrd_cohort$pair){
                ot_cohort = rbind(ot_cohort[!pair %in% opt$pair], ot, fill = T)
                ## }

                ## ot_cohort[, range(value), by = .(variable, is.hrd)]

                ot_cohort[, ot.status := ifelse(BRCA1 > 0.5, "Oneness",
                                         ifelse(BRCA2 > 0.5, "Twoness",
                                         ifelse(SUM12 > 0.9, "OT+", "OT-")))]

                ot.stat = ot_cohort[ot_cohort$pair %in% opt$pair]$ot.status

                ot.stat = c(
                    "Oneness" = "#C93312",
                    "Twoness" = "#3B9AB2",
                    "OT+" = "#0B775E",
                    ## "OT-" = "#899DA4"
                    "OT-" = "#899DA480"
                )

                ot_dat = data.table::melt(ot_cohort, id.vars = c("pair", "ot.status",
                                                                 "fmut_bi"))

                

                ## these are the dimensions that need to be logged
                ldat = ot_dat[!variable %in% c("del.mh.prop", "BRCA1", "BRCA2", "SUM12", "OTHER")]
                ldat[, variable := factor(variable, levels = setdiff(levels(variable), "del.mh.prop"))]
                ## plot the distribution of raw count
                ot.plot = ggplot(ldat, aes(x = value, y = variable)) +
                    geom_density_ridges(aes(point_colour = ot.status,
                                            fill = ot.status),
                                        bandwidth = 0.1,
                                        alpha = 0.5,
                                        scale = 0.9,
                                        rel_min_height = 0.01,
                                        color = NA,
                                        jittered_points = TRUE,
                                        position = position_points_jitter(width = 0.01, height = 0),
                                        point_shape = '|',
                                        point_size = 3,
                                        point_alpha = 0.3) +
                    scale_discrete_manual("point_colour", values = ot.stat) +
                    scale_fill_manual(values = ot.stat) +
                    geom_segment(data = ldat[pair==opt$pair],
                                 aes(x = value, xend = value,
                                     y = as.numeric(variable), yend = as.numeric(variable) + 0.9, colour = ot.status),
                                 alpha = 1, lwd = 3) +
                    scale_colour_manual(values = ot.stat) + 
                    scale_x_continuous(trans = "log1p",
                                       breaks = c(0, 1, 10, 100, 1000, 10000, 100000),
                                       labels = c(0, 1, 10,
                                                  expression(10^2),
                                                  expression(10^3),
                                                  expression(10^4),
                                                  expression(10^5)),
                                       limits = c(0, 100000),
                                       expand = c(0, 0)) +
                    labs(title = paste0("HRDetect features compared to Davies et al. 2017"), x = "Burden") +
                    theme_minimal() +
                    theme(legend.position = "none",
                          title = element_text(size = 20, family = "sans"),
                          axis.title = element_text(size = 20, family = "sans"),
                          axis.text.x = element_text(size = 15, family = "sans"),
                          axis.text.y = element_text(size = 20, family = "sans"))

                ## these are the dimensions that need to be logged
                rdat = ot_dat[variable %in% "del.mh.prop"]
                rdat[, variable := factor(variable, levels = "del.mh.prop")]
                ## plot the distribution of raw count
                ot.plot.2 = ggplot(rdat, aes(x = value, y = variable)) +
                    geom_density_ridges(aes(point_colour = ot.status,
                                            fill = ot.status),
                                        bandwidth = 0.1,
                                        alpha = 0.5,
                                        scale = 0.9,
                                        rel_min_height = 0.01,
                                        color = NA,
                                        jittered_points = TRUE,
                                        position = position_points_jitter(width = 0.01, height = 0),
                                        point_shape = '|',
                                        point_size = 3,
                                        point_alpha = 0.3) +
                    scale_discrete_manual("point_colour", values = ot.stat, labels = NULL) +
                    scale_fill_manual(values = ot.stat,
                                      labels = c("Non HR deficient", "HR deficient")) +
                    ## scale_y_discrete(labels = hrdetect.dims) +
                    labs(x = "Proportion") +
                    guides(point_colour = FALSE) +
                    geom_segment(data = rdat[pair==opt$pair],
                                 aes(x = value, xend = value,
                                     y = as.numeric(variable), yend = as.numeric(variable) + 3, colour = ot.status),
                                 alpha = 1, lwd = 3) +
                    scale_colour_manual(values = ot.stat) + 
                    theme_minimal() +
                    theme(legend.position = "bottom",
                          legend.text = element_text(size=20),
                          title = element_text(size = 20, family = "sans"),
                          axis.title = element_text(size = 20, family = "sans"),
                          axis.text.x = element_text(size = 15, family = "sans"),
                          axis.text.y = element_text(size = 20, family = "sans"))

                ## finally draw the output probability in a waterfall plot

                prob_dat1 = ot_dat[variable %in% c("BRCA1")]
                prob_dat1$marked_pair = prob_dat1$pair %in% opt$pair
                prob_dat1[, fpair := reorder(pair, value)]


                prob.cols = binary.cols
                prob.cols["TRUE"] = ifelse(prob_dat1[pair == opt$pair]$value > 0.5,
                       binary.cols["TRUE"], "black")
                
                prob.brca1 = ggplot(prob_dat1, aes(x = fpair, y = value)) +
                    geom_bar(aes(color = marked_pair), stat = "identity") +
                    scale_colour_manual(values = prob.cols) +
                    ylab("Oneness") +
                    theme_minimal() +
                    theme(legend.position = "none",
                          legend.text = element_text(size=20),
                          title = element_text(size = 20, family = "sans"),
                          axis.title = element_text(size = 20, family = "sans"),
                          axis.text.x = element_blank(),
                          axis.title.x = element_blank(),
                          axis.text.y = element_text(size = 20, family = "sans"))

                
                prob_dat2 = ot_dat[variable %in% c("BRCA2")]
                prob_dat2$marked_pair = prob_dat2$pair %in% opt$pair
                prob_dat2[, fpair := reorder(pair, value)]


                prob.cols = binary.cols
                prob.cols["TRUE"] = ifelse(prob_dat2[pair == opt$pair]$value > 0.5,
                       binary.cols["TRUE"], "black")
                
                prob.brca2 = ggplot(prob_dat2, aes(x = fpair, y = value)) +
                    geom_bar(aes(color = marked_pair), stat = "identity") +
                    scale_colour_manual(values = prob.cols) +
                    ylab("Twoness") + 
                    theme_minimal() +
                    theme(legend.position = "none",
                          legend.text = element_text(size=20),
                          title = element_text(size = 20, family = "sans"),
                          axis.title = element_text(size = 20, family = "sans"),
                          axis.text.x = element_blank(),
                          axis.title.x = element_blank(),
                          axis.text.y = element_text(size = 20, family = "sans"))

                
                
                ## draw the plots
                png(report.config$ot_log, width = 800, height = 800)
                print(ot.plot)
                dev.off()

                png(report.config$ot_prob, width = 800, height = 200)
                print(ot.plot.2)
                dev.off()

                png(report.config$oneness, width = 800, height = 200)
                print(prob.brca1)
                dev.off()

                png(report.config$twoness, width = 800, height = 200)
                print(prob.brca2)
                dev.off()
            } else {
                message("Oneness/twoness data not provided")
            }
        } else {
            message("Oneness/twoness analysis already exists.")
        }

        ## ##################
        ## Enhancer/gene proximity
        ## ##################
        proximity.gallery.fname = paste0(opt$outdir, "/", "proximity.gallery.txt")
        if (check_file(proximity.gallery.fname, overwrite = opt$overwrite, verbose = opt$verbose)) {
            message("Proximity gallery already exists")
        } else {
            if (!file.good(report.config$rna_change)) {
                message("No RNA analysis, skipping proximity")
                proximity.gallery.dt = data.table(gene = as.character(),
                                                  plot.fname = as.character())
            } else if (!file.good(opt$proximity)) {
                message("No proximity.rds supplied - skipping!")
                proximity.gallery.dt = data.table(gene = as.character(),
                                                  plot.fname = as.character())
            } else {
        
                prox = readRDS(opt$proximity)

                if (length(prox)) {
                    pdt = prox$dt
                    pdt = pdt[reldist<0.25 & refdist>5e6]
                } else {
                    pdt = data.table(gene_name = c())
                }
                cool.exp = fread(report.config$rna_change) %>% setkey("gene")

                if (any(cool.exp[direction=="over", gene] %in% pdt$gene_name)) {
                    
                    ## filter by overexpressed genes
                    pdt = merge.data.table(pdt, cool.exp[direction=="over"], by.x = "gene_name", by.y = "gene", all.x = TRUE)
                    pgs = pdt[(direction=="over"), unique(gene_name)]
                    gngt = readRDS(report.config$gencode_gtrack)
                    gngt$name = "Genes"
                    gngt$height=15
                    gngt$cex.label=1.2
                    enh = readRDS(opt$enhancer)

                    ## read some gTracks
                    cvgt = readRDS(report.config$coverage_gtrack)

                    ## read gGraph
                    gg = readRDS(report.config$complex)

                    proximity.fns = lapply(setNames(nm = pgs),
                                           function(g) {
                                               this.png = paste0(opt$outdir, "/", g, ".proximity.png")
                                               w = prox[pdt[gene_name==g, walk.id]]
                                               this.enh = copy(enh)
                                               this.enh$col = binary.cols[as.character(seq_along(enh) %in% w$dt$qid)]
                                               gt.enh = gTrack(this.enh, name = "enhancer")
                                               cvgt$xaxis.cex.label=2.0
                                               cvgt$xaxis.cex.tick=2.0
                                               pgt = c(cvgt, gg$gtrack(height = 30),
                                                       w$gtrack(name = "shortest walk", height=15), gngt, gt.enh)
                                               png(this.png, height = 1200, width = 1800, pointsize = 20)
                                               plot(pgt, w$footprint + 1e6, legend.params = list(plot = FALSE), y0 = 0, mai=c(0,0,0,0))
                                               dev.off()
                                               return(this.png)
                                           }) %>% unlist
                    proximity.gallery.dt = data.table(gene = names(proximity.fns),
                                                      plot.fname = proximity.fns)

                } else {
                    proximity.gallery.dt = data.table(gene = as.character(),
                                                      plot.fname = as.character())
                }
            }
            fwrite(proximity.gallery.dt, proximity.gallery.fname)
        }


        ## #################
        ## summarize
        ## #################
        if (check_file(report.config$summary_stats, opt$overwrite, opt$verbose)) {
            message("Summary already exists, skipping")
        } else {
            message("Computing CN/mutation summary")
            summary.list = create.summary(jabba_rds = opt$jabba_rds,
                                          snv_vcf = opt$snv_vcf,
                                          indel_vcf = opt$indel_vcf,
                                          verbose = TRUE,
                                          amp.thresh = opt$amp_thresh,
                                          del.thresh = opt$del_thresh)
            saveRDS(summary.list, report.config$summary_stats)
        }

        
        ## #################
        ## deconv
        ## #################
        #if (check_file(report.config$deconv, opt$overwrite, opt$verbose)) {
        #  message("Deconvolution data already exists, skipping")
        #} else {
          #if (file.good(opt$tpm)){
          #  message("Running Deconvolution algorithm")
          #  tpm_raw = as.character(opt$tpm)
          #  tpm_read <- data.table::fread(tpm_raw, header = TRUE)
            # make sure gene name columns is deduped and take the top count per gene
          #  names(tpm_read)[1:2] = c('gene', 'count')
          #  tpm_read[, count_deduped := max(count), by = gene]
          #  tpm_read = tpm_read[!duplicated(gene), .(gene, count_deduped)]
          #  tpm_read_new <- tpm_read[,-1]
          #  tpm_read_new_name <- as.matrix(tpm_read[,1])
          #  rownames(tpm_read_new) <- tpm_read_new_name[,1]
          #  deconv_results = immunedeconv::deconvolute(tpm_read_new, opt$deconv)
          #  data.table::fwrite(deconv_results, file.path(opt$outdir,"deconv_results.txt"), sep = '\t', quote = F, row.names = F)
          #}
        #}
        
        
        ## ################
        ## create oncotable
        ## ################

        if (check_file(report.config$oncotable, opt$overwrite, opt$verbose)) {
            message("Oncotable already exists. Skipping!")
        } else {
            message("Preparing oncotable...")
            oncotable.input = data.table(pair = opt$pair,
                                         jabba_rds = opt$jabba_rds,
                                         fusions = opt$fusions,
                                         complex = report.config$complex,
                                         scna = report.config$gene_cn,
                                         annotated_snv_bcf = opt$snpeff_snv_bcf,
                                         annotated_indel_bcf = opt$snpeff_indel_bcf,
                                         rna = report.config$rna_change_all,
                                         proximity = opt$proximity,
                                         deconstruct_variants = opt$deconstruct_variants,
                                         hrd_results = opt$hrd_results,
                                         key = "pair")
            oncotable = casereport::oncotable(oncotable.input,
                                  gencode = opt$genes,
                                  verbose = TRUE)
            saveRDS(oncotable, report.config$oncotable)
        } 

        ## ################
        ## create summaryTable
        ## ################


       if (check_file(report.config$summaryTable, opt$overwrite, opt$verbose)) {
            message("Summary Table already exists. Skipping!")
        } else {
            message("Generating summary table")
        wol=makeSummaryTables(report.config$driver_scna,report.config$surface_scna, report.config$driver_fusions, report.config$rna_change_with_cn,
                              report.config$surface_rna_change_with_cn, report.config$driver_mutations, report.config$oncotable,report.config$gene_cn,
                              report.config$onc,report.config$tsg,report.config$surface)
        driverTable=wol[[1]]
        surfaceTable=wol[[2]]
        driverTable$tier=as.character(driverTable$tier)
        driverTable[is.na(driverTable$tier),]$tier="Undefined"
        surfaceTable$tier=as.character(surfaceTable$tier)
        surfaceTable[is.na(surfaceTable$tier),]$tier="Undefined"
        #wol=wol[!is.na(wol$type),]
        #wol=wol[wol$type!="",]
              
        fwrite(driverTable,report.config$summaryTable)
        fwrite(surfaceTable,report.config$summarySurfaceTable)
        }


    }


    message("Start knitting")
    print(normalizePath(paste0(opt$outdir, "/", opt$pair,".wgs.report.html")))
    rmarkdown::render(
        input = normalizePath(system.file('extdata', 'case_report_module/wgs.report.rmd', package = 'casereport')),
        output_format = "html_document",
        output_file = normalizePath(paste0(opt$outdir, "/", opt$pair,".wgs.report.html")),
        knit_root_dir = normalizePath(opt$outdir),
        ## params = report.config,
        params = list(set_title = paste0(opt$pair),
                      pair = opt$pair,
                      jabba_rds = normalizePath(opt$jabba_rds),
                      outdir = normalizePath(opt$outdir),
                      tumor_type = opt$tumor_type,
                      server = opt$server),
        quiet = FALSE)

    message("WGS case report completed.")
    
    if(!is.na(opt$pairs_out) & !(opt$pairs_out == "/dev/null")){
            outRDS = readRDS(normalizePath(opt$pairs_out))
            if(!("casereport_html" %in% colnames(outRDS))){
                outRDS$casereport_html = NA
            }
            if(!(opt$pair %in% outRDS$pair)){
               outRDS = rbind(outRDS,data.table(pair=opt$pair,casereport_html=normalizePath(paste0(opt$outdir, "/", opt$pair,".wgs.report.html"))), fill=TRUE)
            } else{
                outRDS[pair == opt$pair]$casereport_html = normalizePath(paste0(opt$outdir, "/", opt$pair,".wgs.report.html"))
                }
           saveRDS(outRDS,normalizePath(opt$pairs_out))
           
            message("Added html path to selected pairs table.")
     }
    
    if(!is.na(opt$html_vizdir) & !(opt$html_vizdir == "/dev/null")){
            move = paste("cp", normalizePath(paste0(opt$outdir, "/", opt$pair,".wgs.report.html")), normalizePath(opt$html_vizdir))
            system(move)
            message("Copied html output to selected directory.")
        }
    
}
mskilab/casereport documentation built on Oct. 19, 2022, 3 a.m.