facets-suite: Functions to run and parse output from FACETS

Documented in run_facets

#' Run FACETS
#'
#' Runs FACETS on an input count file, with specified parameter settings.
#'
#' @param read_counts Read counts object, generated by `snp-pileup`.
#' @param cval Segmentation parameter, higher values for higher sensitivity.
#' @param dipLogR Manual dipLogR value, if left empty `facets` finds the most likely sample baseline.
#' @param ndepth Minimum depth in normal to retain SNP, see `facets` help.
#' @param snp_nbhd Minimum basepair distance for SNPs, see `facets` help.
#' @param min_nhet Minimum number of heterozygous SNPs on segment required for clustering, see `facets` help.
#' @param genome Genome build.
#' @param seed Seed value for random number generation, set to enable full reproducibility.
#'
#' @return A list object containing the following items. See \href{www.github.com/mskcc/facets}{FACETS documentation} for more details:
#' \itemize{
#'       \item{\code{snps}:} {SNPs used for copy-number segmentation, \code{het==1} for heterozygous loci.}
#'       \item{\code{segs}:} {Inferred copy-number segmentation.}
#'       \item{\code{purity}:} {Inferred sample purity, i.e. fraction of tumor cells of the total cellular population.}
#'       \item{\code{ploidy}:} {Inferred sample ploidy.}
#'       \item{\code{dipLogR}:} {Inferred dipLogR, the sample-specific baseline corresponding to the diploid state.}
#'       \item{\code{alBalLogR}:} {Alternative dipLogR value(s) at which a balanced solution was found.}
#'       \item{\code{flags}:} {Warning flags from the naïve segmentation algorithm.}
#'       \item{\code{em_flags}:} {Warning flags from the expectation-maximization segmentation algorithm.}
#'       \item{\code{loglik}:} {Log-likelihood value of the fitted model.}
#' }
#'
#' @examples
#' \dontrun{
#' library(pctGCdata)
#' run_facets(test_read_counts, cval = 500, genome = 'hg38')
#' }
#' 
#' @import facets
#' @import pctGCdata

#' @export run_facets
run_facets = function(read_counts,
                      cval = 100,
                      dipLogR = NULL,
                      ndepth = 35,
                      snp_nbhd = 250,
                      min_nhet = 15,
                      genome = c('hg18', 'hg19', 'hg38', 'mm9', 'mm10'),
                      seed = 100,
                      facets_lib_path = '') {
    
    if ( facets_lib_path != '' ) {
        library(facets, lib.loc = facets_lib_path)
    } else {
        library(facets) 
    }
    print(paste0('loaded facets version: ', packageVersion('facets')))
    
    # Check input 
    missing_cols = setdiff(c('Chromosome', 'Position', 'NOR.DP', 'TUM.DP', 'NOR.RD', 'TUM.RD'), names(read_counts)) 
    if (length(missing_cols) > 0) {
        stop(paste0('Input missing column(s)', paste(missing_cols, collapse = ', '), '.'), call. = FALSE)
    }
    
    set.seed(seed)
    genome = match.arg(genome)

    # Run FACETS algorithm
    dat = facets::preProcSample(read_counts, ndepth = ndepth, het.thresh = 0.25, snp.nbhd = snp_nbhd, cval = 25,
                                gbuild = genome, hetscale = TRUE, unmatched = FALSE, ndepthmax = 1000)
    out = facets::procSample(dat, cval = cval, min.nhet = min_nhet, dipLogR = dipLogR)
    fit = facets::emcncf(out)
    
    # Fix bad NAs
    fit$cncf = cbind(fit$cncf, cf = out$out$cf, tcn = out$out$tcn, lcn = out$out$lcn)
    fit$cncf$lcn[fit$cncf$tcn == 1] = 0
    fit$cncf$lcn.em[fit$cncf$tcn.em == 1] = 0
    
    # Generate output
    list(
        snps = out$jointseg,
        segs = fit$cncf,
        purity = as.numeric(fit$purity),
        ploidy = as.numeric(fit$ploidy),
        dipLogR = out$dipLogR,
        alBalLogR = out$alBalLogR,
        flags = out$flags,
        em_flags = fit$emflags,
        loglik = fit$loglik
    )
}

## load facets output from .Rds file or legacy run of .Rdata file
#' @export load_facets_output
load_facets_output <- function(rds_rdata_file) {
    if (grepl('rds$', rds_rdata_file, ignore.case = T)) {
        readRDS(rds_rdata_file)
    }
    else {
        load(rds_rdata_file)
        list(
            snps = out$jointseg,
            segs = fit$cncf,
            purity = as.numeric(fit$purity),
            ploidy = as.numeric(fit$ploidy),
            dipLogR = out$dipLogR,
            alBalLogR = out$alBalLogR,
            flags = out$flags,
            em_flags = fit$emflags,
            loglik = fit$loglik
        )
    }
}

#' @export create_legacy_output
create_legacy_output <- function(facets_output, directory, sample_id, counts_file, sel_run_type, run_details) {
    
    sample_id = ifelse(sel_run_type == '', sample_id, paste0(sample_id, '_', sel_run_type))
    output_prefix = paste0(directory, '/', sample_id)
    
    ### create cncf.txt
    facets_output$segs %>%
        mutate(ID=sample_id,
               loc.start = start,
               loc.end = end) %>%
        select(ID, chrom, loc.start, loc.end, seg, num.mark, nhet, cnlr.median, mafR, 
               segclust, cnlr.median.clust, mafR.clust, cf, tcn, lcn, 
               cf.em, tcn.em, lcn.em) %>%
        write.table(file=paste0(output_prefix, '.cncf.txt'), quote=F, row.names=F, sep='\t')
    
    ### create .Rdata
    out =
        list(
            jointseg = facets_output$snps,
            out = facets_output$segs %>% 
                select(chrom, seg, num.mark, nhet, cnlr.median, mafR, 
                       segclust, cnlr.median.clust, mafR.clust, cf, tcn, lcn),
            nX=23,
            chromlevels = c(1:22, "X"),
            dipLogR = facets_output$dipLogR,
            alBalLogR = facets_output$alBalLogR,
            IGV = NULL
        )
    fit = 
        list(
            loglik = facets_output$loglik,
            purity = facets_output$purity,
            ploidy = facets_output$ploidy,
            dipLogR = facets_output$dipLogR,
            seglen = -1,
            cncf = facets_output$segs,
            emflags = facets_output$em_flags
        )
    save(out, fit, file=paste0(output_prefix, ".Rdata"), compress=T)
    
    ### create .CNCF.png
    system(paste0('mv ', output_prefix, '.png ', output_prefix, '.CNCF.png'))
    
    #### create .out file 
    purity_cval = cval = NA
    if(nrow(run_details) == 2) {
        cval        = (run_details %>% filter(run_type == 'hisens'))$cval[1]
        purity_cval = (run_details %>% filter(run_type == 'purity'))$cval[1]
    } else {
        cval   = run_details$cval[1]
        purity = run_details$purity[1]
    }
    
    run = run_details %>% filter(run_type == sel_run_type) %>% head(n=1)

    out_txt = "# INPUT PARAMETERS GIVEN\n"
    out_txt = paste0(out_txt, '# TAG = ', sample_id, '\n')
    out_txt = paste0(out_txt, '# Facets version = ', as.character(packageVersion('facets')), '\n')
    out_txt = paste0(out_txt, '# Input = ', basename(as.character(run$input_file)),'\n')
    out_txt = paste0(out_txt, '# Seed = ', run$seed, '\n')
    out_txt = paste0(out_txt, '# snp.nbhd = ', run$snp_nbhd, ' \n')
    out_txt = paste0(out_txt, '# ndepth = ', run$ndepth,' \n')
    out_txt = paste0(out_txt, '# purity_cval = ', purity_cval,'\n')
    out_txt = paste0(out_txt, '# unmatched = FALSE \n')
    out_txt = paste0(out_txt, '# cval = ', cval,'\n')
    out_txt = paste0(out_txt, '# min.nhet = ', run$min_nhet,'\n')
    out_txt = paste0(out_txt, '# genome = hg19\n')
    out_txt = paste0(out_txt, '# tumor_id = \n')
    
    out_txt = paste0(out_txt, '\n# LOADED MODULE INFO\n')
    out_txt = paste0(out_txt, '# Facets version = ', as.character(packageVersion('facets')) , '\n\n')
    out_txt = paste0(out_txt, '\n# FACETS OUTPUT\n')
    out_txt = paste0(out_txt, '# Purity = ', run$purity, '\n')
    out_txt = paste0(out_txt, '# Ploidy = ', run$ploidy,' \n')
    out_txt = paste0(out_txt, '# dipLogR = ', run$dipLogR,'\n')
    out_txt = paste0(out_txt, '# dipt = -1\n')
    out_txt = paste0(out_txt, '# loglik = \n')
    out_txt = paste0(out_txt, '# output flags\n')
    out_txt = paste0(out_txt, '# ', run$flags,'\n\n')   
  
    cat(out_txt, file=paste0(directory, '/', sample_id, '.out'))
}