maftools: Summarize, Analyze and Visualize MAF Files

Documented in prepAscat

#'Prepare input files for ASCAT
#' @description Function takes the output from \code{\link{gtMarkers}} and generates `logR` and `BAF` files required for ASCAT analysis.
#' @details The function will filter SNPs with low coverage (default <15), estimate BAF, logR, and generates the input files for ASCAT. Alternatively, logR file can be segmented with \code{\link{segmentLogR}}
#' @param t_counts read counts from tumor generated by \code{\link{gtMarkers}}
#' @param n_counts read counts from normal generated by \code{\link{gtMarkers}}
#' @param sample_name Sample name. Used as a basename for output files. Default `NA`, parses from `t_counts` file.
#' @param min_depth Min read depth required to consider a marker. Default 15
#' @param normalize If TRUE, normalizes for library size. Default TRUE
#' @seealso \code{\link{gtMarkers}} \code{\link{prepAscat_t}} \code{\link{segmentLogR}}
#' @references Van Loo P, Nordgard SH, Lingjærde OC, et al. Allele-specific copy number analysis of tumors. Proc Natl Acad Sci U S A. 2010;107(39):16910-16915. doi:10.1073/pnas.1009843107
#' @export
prepAscat = function(t_counts = NULL, n_counts = NULL, sample_name = NA, min_depth = 15, normalize = TRUE){

  if(any(is.null(t_counts) | is.null(n_counts))) stop("Missing tumor or normal read counts!")

  if(is.na(sample_name)){
    sample_name = gsub(pattern = "\\.tsv$", replacement = "", x = basename(path = t_counts))
  }

  counts = c(t_counts, n_counts)

  #library sizes
  tot_map_reads = lapply(counts, function(x){
    x = data.table::fread(input = x,nrows = 1)
    as.numeric(x$V2)
  })
  names(tot_map_reads) = c("tumor", "normal")
  map_ratio = tot_map_reads$tumor/tot_map_reads$normal
  message("Library sizes:")
  message("Tumor:  ", tot_map_reads$tumor)
  message("Normal: ", tot_map_reads$normal)
  message("Library size difference: ", round(map_ratio, digits = 3))
  message("------")

  counts = lapply(counts, function(x){
    message("Counts file: ", x)
    x = data.table::fread(input = x)
    x[,loci := gsub(pattern = "^chr", replacement = "", x = loci)]
    message("Markers: ", nrow(x))
    if(nrow(x[duplicated(loci)]) > 0){
      message("Removed ", nrow(x[duplicated(loci)]), " duplicated loci")
      x = x[!duplicated(loci)]
    }
    x[, tot_depth := apply(x[,.(A, T, G, C)], 1, sum)]
    x = x[tot_depth > min_depth]
    message("Markers > ", min_depth, ": ", nrow(x))
    message("------")
    x[, baf := apply(x[,.(A, T, G, C)], 1, function(r) {r = sort(r); r[4]/sum(r[4], r[3])})]
    #x[, baf := ifelse(test = baf < 0.5, yes = baf, no = 1 - baf)]
    #x$baf = ifelse(x$baf <0.5,x$baf,1-x$baf)
    x$baf = ifelse(runif(length(x$baf))<0.5,x$baf,1-x$baf)
    x = data.frame(data.table::tstrsplit(x = x$loci, split = ":"), x$baf, x$tot_depth, row.names = x$loci)
    colnames(x) = c("chr", "pos", "BAF", "depth")
    x$pos = as.numeric(as.character(x$pos))
    x
  })
  names(counts) = c("tumor", "normal")
  com_loci = intersect(rownames(counts[[1]]), rownames(counts[[2]]))
  counts = lapply(counts, function(x) x[com_loci,, drop = FALSE])
  message("Final number SNPs: ", nrow(counts[[1]]))

  #normalize for sequencing depth (might not be the best way) and estimate logR
  # t_depth = sum(counts[[1]][,"depth"])
  # n_depth = sum(counts[[2]][,"depth"])
  # counts[[2]][,"depth"] <- counts[[2]][,"depth"] * (n_depth/t_depth)
  if(normalize){
    counts[[2]][,"depth"] <- counts[[2]][,"depth"] * map_ratio
  }


  counts[[2]][,"logR"] <- 0
  counts[[1]][,"logR"] <- round(log2(counts[[1]][,"depth"]/counts[[2]][,"depth"]), digits = 3)
  #counts[[1]][,"logR"] <- counts[[1]][,"logR"] - median(counts[[1]][,"logR"], na.rm = TRUE)

  counts = lapply(counts, function(x){
    x[,"BAF"] = ifelse(is.na(x[,"BAF"]) | is.nan(x[,"BAF"]), yes = NA, x[,"BAF"])
    x[,"logR"] = ifelse(is.na(x[,"logR"]) | is.nan(x[,"logR"]), yes = NA, x[,"logR"])
    x[,"logR"] = ifelse(is.infinite(x[,"logR"]), yes = NA, x[,"logR"])
    x
  })

  data.table::fwrite(x = counts[[1]][,c("chr", "pos", "BAF")], file = paste0(sample_name, ".tumour.BAF.txt"), sep = "\t", row.names = TRUE)
  data.table::fwrite(x = counts[[1]][,c("chr", "pos", "logR")], file = paste0(sample_name, ".tumour.logR.txt"), sep = "\t", row.names = TRUE)
  data.table::fwrite(x = counts[[2]][,c("chr", "pos", "BAF")], file = paste0(sample_name, ".normal.BAF.txt"), sep = "\t", row.names = TRUE)
  data.table::fwrite(x = counts[[2]][,c("chr", "pos", "logR")], file = paste0(sample_name, ".normal.logR.txt"), sep = "\t", row.names = TRUE)

  message("Generated following files:")
  message(paste0(sample_name, ".tumour.BAF.txt"))
  message(paste0(sample_name, ".tumour.logR.txt"))
  message(paste0(sample_name, ".normal.BAF.txt"))
  message(paste0(sample_name, ".normal.logR.txt"))
  message("------")

  # ascat_obj = NULL
  # if("ASCAT" %in% rownames(installed.packages())){
  #   message("Running ASCAT::ascat.loadData()")
  #   ascat_obj = ASCAT::ascat.loadData(Tumor_LogR_file = paste0(sample_name, ".tumour.logR.txt"),
  #                                     Tumor_BAF_file = paste0(sample_name, ".tumour.BAF.txt"),
  #                                     Germline_LogR_file = paste0(sample_name, ".normal.logR.txt"),
  #                                     Germline_BAF_file = paste0(sample_name, ".normal.BAF.txt"),
  #                                     chrs = c(1:22, "X", "Y"), sexchromosomes = c("X", "Y"))
  #   message("Running ASCAT::ascat.plotRawData()")
  #   ASCAT::ascat.plotRawData(ASCATobj = ascat_obj, img.prefix = sample_name)
  #   message("Returned ASCAT object!")
  # }
}