R/prepAscat.R

Defines functions prep_ascat

Documented in prep_ascat

#'Prepare input files for ASCAT
#' @param t_counts read counts from tumor generated by `get_counts`
#' @param n_counts read counts from normal generated by `get_counts`
#' @param sample_name Sample name. Used as a basename for output files. Default NA, parses from t_counts file.
#' @param min_depth Min read depth required to consider a marker. Default 30
#' @param normalize If TRUE, normalizes for library size
#' @return An \code{\link{ascat.loadData}} object; ascat data structure
#' @export
prep_ascat = function(t_counts = NULL, n_counts = NULL, sample_name = NA, min_depth = 30, normalize = FALSE){

  if(any(is.null(t_counts) | is.null(n_counts))) stop("Missing tumor or normal read counts!")

  if(is.na(sample_name)){
    sample_name = gsub(pattern = "\\.tsv$", replacement = "", x = basename(path = t_counts))
  }

  counts = c(t_counts, n_counts)
  
  #library sizes
  tot_map_reads = lapply(counts, function(x){
    x = data.table::fread(input = x,nrows = 1)
    as.numeric(x$V2)
  })
  names(tot_map_reads) = c("tumor", "normal")
  map_ratio = tot_map_reads$tumor/tot_map_reads$normal
  message("Library sizes:")
  message("Tumor:  ", tot_map_reads$tumor)
  message("Normal: ", tot_map_reads$normal)
  message("Library size difference: ", round(map_ratio, digits = 3))
  message("------")
  
  counts = lapply(counts, function(x){
    message("Counts file: ", x)
    x = data.table::fread(input = x)
    x[,loci := gsub(pattern = "^chr", replacement = "", x = loci)]
    message("Markers: ", nrow(x))
    if(nrow(x[duplicated(loci)]) > 0){
      message("Removed ", nrow(x[duplicated(loci)]), " duplicated loci")
      x = x[!duplicated(loci)]
    }
    x[, tot_depth := apply(x[,.(A, T, G, C)], 1, sum)]
    x = x[tot_depth > min_depth]
    message("Markers > ", min_depth, ": ", nrow(x))
    message("------")
    x[, baf := apply(x[,.(A, T, G, C)], 1, function(r) {r = sort(r); r[4]/sum(r[4], r[3])})]
    #x[, baf := ifelse(test = baf < 0.5, yes = baf, no = 1 - baf)]
    #x$baf = ifelse(x$baf <0.5,x$baf,1-x$baf)
    x$baf = ifelse(runif(length(x$baf))<0.5,x$baf,1-x$baf)
    x = data.frame(data.table::tstrsplit(x = x$loci, split = ":"), x$baf, x$tot_depth, row.names = x$loci)
    colnames(x) = c("chr", "pos", "BAF", "depth")
    x$pos = as.numeric(as.character(x$pos))
    x
  })
  names(counts) = c("tumor", "normal")
  com_loci = intersect(rownames(counts[[1]]), rownames(counts[[2]]))
  counts = lapply(counts, function(x) x[com_loci,, drop = FALSE])
  message("Final number SNPs: ", nrow(counts[[1]]))

  #normalize for sequencing depth (might not be the best way) and estimate logR
  # t_depth = sum(counts[[1]][,"depth"])
  # n_depth = sum(counts[[2]][,"depth"])
  # counts[[2]][,"depth"] <- counts[[2]][,"depth"] * (n_depth/t_depth)
  if(normalize){
    counts[[2]][,"depth"] <- counts[[2]][,"depth"] * map_ratio  
  }
  

  counts[[2]][,"logR"] <- 0
  counts[[1]][,"logR"] <- round(log2(counts[[1]][,"depth"]/counts[[2]][,"depth"]), digits = 3)
  #counts[[1]][,"logR"] <- counts[[1]][,"logR"] - median(counts[[1]][,"logR"], na.rm = TRUE)

  counts = lapply(counts, function(x){
    x[,"BAF"] = ifelse(is.na(x[,"BAF"]) | is.nan(x[,"BAF"]), yes = NA, x[,"BAF"])
    x[,"logR"] = ifelse(is.na(x[,"logR"]) | is.nan(x[,"logR"]), yes = NA, x[,"logR"])
    x[,"logR"] = ifelse(is.infinite(x[,"logR"]), yes = NA, x[,"logR"])
    x
  })

  data.table::fwrite(x = counts[[1]][,c("chr", "pos", "BAF")], file = paste0(sample_name, ".tumour.BAF.txt"), sep = "\t", row.names = TRUE)
  data.table::fwrite(x = counts[[1]][,c("chr", "pos", "logR")], file = paste0(sample_name, ".tumour.logR.txt"), sep = "\t", row.names = TRUE)
  data.table::fwrite(x = counts[[2]][,c("chr", "pos", "BAF")], file = paste0(sample_name, ".normal.BAF.txt"), sep = "\t", row.names = TRUE)
  data.table::fwrite(x = counts[[2]][,c("chr", "pos", "logR")], file = paste0(sample_name, ".normal.logR.txt"), sep = "\t", row.names = TRUE)

  message("Generated following files:")
  message(paste0(sample_name, ".tumour.BAF.txt"))
  message(paste0(sample_name, ".tumour.logR.txt"))
  message(paste0(sample_name, ".normal.BAF.txt"))
  message(paste0(sample_name, ".normal.logR.txt"))
  message("------")

  ascat_obj = NULL
  if("ASCAT" %in% rownames(installed.packages())){
    message("Running ASCAT::ascat.loadData()")
    ascat_obj = ASCAT::ascat.loadData(Tumor_LogR_file = paste0(sample_name, ".tumour.logR.txt"),
                                      Tumor_BAF_file = paste0(sample_name, ".tumour.BAF.txt"),
                                      Germline_LogR_file = paste0(sample_name, ".normal.logR.txt"),
                                      Germline_BAF_file = paste0(sample_name, ".normal.BAF.txt"),
                                      chrs = c(1:22, "X", "Y"), sexchromosomes = c("X", "Y"))
    message("Running ASCAT::ascat.plotRawData()")
    ASCAT::ascat.plotRawData(ASCATobj = ascat_obj, img.prefix = sample_name)
    message("Returned ASCAT object!")
  }

  ascat_obj
}
CompEpigen/ezASCAT documentation built on Dec. 17, 2021, 3:04 p.m.