access_data_analysis: ACCESS data analysis

# library(data.table)
# library(stringr)
# library(tidyr)
# library(dplyr)

#' @export
SV_incorporation = function(
  master.ref,results.dir,
  dmp.dir = '/juno/work/access/production/resources/cbioportal/current/mskimpact',
  access.genes = '/work/access/production/resources/msk-access/current/regions_of_interest/current/MSK-ACCESS-v1_0-target_genes.list',
  criteria = 'stringent'
){
  # # test input section -----------------------------------------------------------
  # master.ref = fread('/juno/work/bergerm1/bergerlab/zhengy1/access_data_analysis/data/example_master_file.csv')
  # results.dir = paste0('/juno/work/bergerm1/MSK-ACCESS/ACCESS-Projects/test_access/access_data_analysis/output_042020/')
  # dmp.dir = '/ifs/work/bergerm1/zhengy1/dmp-2020/mskimpact/'
  # # criteria <- 'permissive'
  # criteria <- 'stringent'
  #
  # DMP fusion calls --------------------------------------------------------
  DMP.fusion <- fread(paste0(dmp.dir,'/data_sv.txt')) %>%
    transmute(DMP_SAMPLE_ID = Sample_ID,EventType = Class,Gene1 = Site1_Hugo_Symbol,Gene2 = Site2_Hugo_Symbol,
              Chr1 = Site1_Chromosome,Chr2 = Site2_Chromosome,Pos1 = Site1_Position,Pos2 = Site2_Position,PairedReadCount = Tumor_Paired_End_Read_Count,
              SplitReadCount = Tumor_Split_Read_Count,TumorReadCount = Tumor_Read_Count,EventInfo = Event_Info) %>% data.table()

  # execution ---------------------------------------------------------------

  dir.create(paste0(results.dir,'/results_',criteria,'_combined'))
  gene.list <- c('ALK','BRAF','ERBB2','EGFR','FGFR1','FGFR2','FGFR3','KIT','MET','NTRK1','NTRK2','NTRK3','PDGFRA','RET','ROS1')
  access.genes.info <- fread(access.genes,header=F)
  access.gene.list <- access.genes.info$V1
  # x <- unique(master.ref$cmo_patient_id)[1]
  lapply(unique(master.ref$cmo_patient_id),function(x){
    # get sample sheet --------------------------------------------------------
    sample.sheet <- fread(paste0(results.dir,'/',x,'/',x,'_sample_sheet.tsv'))

    # get plasma SV calls -----------------------------------------------------
    total.sv <- do.call(rbind,lapply(sample.sheet[Sample_Type == 'duplex']$Sample_Barcode,function(y){
      SV.filename <- master.ref[cmo_sample_id_plasma == y]$sv_path
      if(!file.exists(SV.filename)){stop(paste0('SV file: ',SV.filename,' ----- does not exist'))}
      tmp.SV <- fread(SV.filename) %>% filter(Significance == 'KeyGene')
      return(tmp.SV)
    })) %>%
      transmute(
        TumorId = paste0(TumorId,'___total'),
        SV_Type,Gene1,Gene2,Chr1,Chr2,Pos1,Pos2,PairedReadCount = PairEndReadSupport,
        SplitReadCount = SplitReadSupport,TumorReadCount,Fusion)

    # get DMP SV calls --------------------------------------------------------
    DMP.sv <- do.call(rbind,lapply(sample.sheet[Sample_Type == 'DMP_Tumor']$Sample_Barcode,function(y){
      DMP.fusion[DMP_SAMPLE_ID == y]
    }))
    if(!is.null(DMP.sv)){
      DMP.sv <- DMP.sv %>%
        dplyr::transmute(
          TumorId = paste0(DMP_SAMPLE_ID,'___DMP_Tumor'),
          SV_Type = case_when(
            EventType == 'INVERSION' ~ 'INV',
            EventType == 'DELETION' ~ 'DEL',
            EventType == 'INSERTION' ~ 'INS',
            EventType == 'DUPLICATION' ~ 'DUP',
            EventType == 'TRANSLOCATION' ~ 'BND',
            TRUE ~ 'UNKNOWN'),
          Gene1,Gene2,Chr1,Chr2,Pos1,Pos2,PairedReadCount,
          SplitReadCount,TumorReadCount = NA,Fusion = EventInfo)
    }else{
      # dummy df if there is no DMP fusion found
      DMP.sv <- data.frame(matrix(nrow = 0,ncol = ncol(DMP.fusion)))
      colnames(DMP.sv) <- colnames(total.sv)
    }
    print('done with reading in')
    print(colnames(total.sv))
    print(colnames(DMP.sv))

    # event desc. reconciliating possible DMP vs manta ------------------------
    rbind(total.sv,DMP.sv) %>%
      # consolidate information for dcasting data frame
      unite(sample_info,PairedReadCount,SplitReadCount,TumorReadCount,sep = '-') %>% unite(gene_pair,Gene1,Gene2,sep = '__') %>% unite(chr_pair,Chr1,Chr2,sep = '__') %>%
      # selecting all unique events with their description (manta outputs will be the same, within DMP will be the same)
      unite(event_info,gene_pair,chr_pair,SV_Type,Pos1,Pos2,sep = '---') %>% select(event_info,Fusion) %>% unique() %>% data.table() -> event.desc
    event.desc <- event.desc[,.(HGVSp_Short = paste0(unique(Fusion),collapse = ' | ')),.(event_info)]

    # make into a row ---------------------------------------------------------
    rbind(total.sv,DMP.sv) %>% select(-Fusion) %>%
      # consolidate information for dcasting data frame
      unite(sample_info,PairedReadCount,SplitReadCount,TumorReadCount,sep = '-') %>% unite(gene_pair,Gene1,Gene2,sep = '__') %>% unite(chr_pair,Chr1,Chr2,sep = '__') %>%
      unite(event_info,gene_pair,chr_pair,SV_Type,Pos1,Pos2,sep = '---') %>% spread(TumorId,sample_info) %>%
      merge(event.desc,by = 'event_info',all.x = T) %>%
      # parse out information
      separate(event_info,into = c('gene_pair','chr_pair','SV_Type','Pos1','Pos2'),sep = '---') %>%
      # process information for row binding into SNV table
      setnames(c('gene_pair','chr_pair','SV_Type','Pos1','Pos2'),c('Hugo_Symbol','Chromosome','Variant_Classification','Start_Position','End_Position')) %>%
      mutate(Reference_Allele = '',Tumor_Seq_Allele2 = '',ExAC_AF = '',Hotspot = ifelse(grepl(paste0(gene.list,collapse = '|'),Hugo_Symbol),'Significance gene',''),
             DMP = '',duplex_support_num = '',call_confidence = '') %>%
      select(Hugo_Symbol,Chromosome,Start_Position,End_Position,Variant_Classification,HGVSp_Short,Reference_Allele,Tumor_Seq_Allele2,ExAC_AF,Hotspot,DMP,duplex_support_num,call_confidence,everything()) %>%
      data.table() -> SV.table

    # adding some annotation columns specific to snv table --------------------
    apply(sample.sheet[Sample_Type != 'plasma_simplex'],1,function(y){
      current.colname <- paste0(
        y[which(colnames(sample.sheet) == 'Sample_Barcode')],
        '___',
        case_when(y[which(colnames(sample.sheet) == 'Sample_Type')] == 'duplex' ~'total',
                  y[which(colnames(sample.sheet) == 'Sample_Type')] == 'DMP_Tumor' ~'DMP_Tumor',
                  y[which(colnames(sample.sheet) == 'Sample_Type')] == 'DMP_Normal' ~'DMP_Normal',
                  y[which(colnames(sample.sheet) == 'Sample_Type')] == 'unfilterednormal' ~'unfilterednormal'))

      # If any of the sample does not already have a column
      if(!current.colname %in% colnames(SV.table)){
        SV.table[,eval(current.colname) := NA]
      }

      # only for plasma sample -- '.called' column
      if(y[which(colnames(sample.sheet) == 'Sample_Type')] == 'duplex'){
        SV.table[,paste0(y[which(colnames(sample.sheet) == 'Sample_Barcode')],'___duplex.called') := case_when(
          is.na(get(paste0(y[which(colnames(sample.sheet) == 'Sample_Barcode')],'___total'))) ~ 'Not Called',
          is.na(all(!str_split(Hugo_Symbol,'__') %in% access.gene.list)) ~ 'Not Covered',
          T ~ 'Called'
        )]
      }

      # edit DMP signout column
      if(y[which(colnames(sample.sheet) == 'Sample_Type')] == 'DMP_Tumor'){
        SV.table[!is.na(get(current.colname)),DMP := 'Signed out']
      }
    })
    # adding CH filler column
    SV.table$CH = ''

    # append and write --------------------------------------------------------
    tmp.snv.table.path = paste0(results.dir,'/results_',criteria,'/',x,'_SNV_table.csv')
    if(!file.exists(tmp.snv.table.path)){
      stop(paste0(tmp.snv.table.path,' does not exist. Check if filter_calls was run correcly'))
    }
    SNV.table <- fread(tmp.snv.table.path)

    snv.sv.table.directory <- paste0(results.dir,'/results_',criteria,'_combined/',x,'_table.csv')
    
    if (nrow(SNV.table) == 0) {
      write.csv(SV.table,snv.sv.table.directory,quote = F,row.names = F)
    } else {
      write.csv(rbind(SNV.table,SV.table),snv.sv.table.directory,quote = F,row.names = F)
    }
  })

}

# Executable -----------------------------------------------------------------------------------------------------------
suppressPackageStartupMessages({
  library(data.table)
  library(tidyr)
  library(stringr)
  library(dplyr)
  library(argparse)
})

if (!interactive()) {

  parser=ArgumentParser()
  parser$add_argument('-m', '--masterref', type='character', help='File path to master reference file')
  parser$add_argument('-o', '--resultsdir', type='character', help='Output directory')
  parser$add_argument('-dmp', '--dmpdir', type='character', default = '/juno/work/access/production/resources/cbioportal/current/msk_solid_heme',
                      help='Directory of clinical DMP IMPACT repository [default]')
  parser$add_argument('-genes', '--genelist', type='character', default = '/work/access/production/resources/msk-access/current/regions_of_interest/current/MSK-ACCESS-v1_0-target_genes.list',
                      help='File path to genes covered by ACCESS [default]')
  parser$add_argument('-c', '--criteria', type='character', default = 'stringent',
                      help='Calling criteria [default]')
  args=parser$parse_args()

  master.ref = args$masterref
  results.dir = args$resultsdir
  dmpdir = args$dmpdir
  access.genes=args$genelist
  criteria = args$criteria

  cat(paste0(paste0(c(paste0(rep('-',15),collapse = ''),'Arguments input: ',master.ref,results.dir,dmpdir,access.genes,criteria,
                      paste0(rep('-',15),collapse = '')),collapse = "\n"),'\n'))

  if(!criteria %in% c('stringent','permissive')){
    stop('Criteria argument should be either stringent or permissive')
  }

  suppressWarnings(SV_incorporation(fread(master.ref),results.dir,dmpdir,access.genes,criteria))

}