R/extract_summary.R

Defines functions extract_summary

Documented in extract_summary

#' Extract area under the curve for every peak from from given bigWig files.
#' @details This function is a wrapper around bwtool summary command and extracts peak intesities from bigWig files for every peak in an input BED file.
#' Once extraction is complete signal values are combined into a single data.table
#'
#' @param coldata Coldata generated from \code{\link{read_coldata}}
#' @param bed Input bed file. Can also be a gz compressed narrowPeak/BED
#' @param genome can be hg19 or mm10
#' @param startFrom NULL Default is to estimate complete AUC for each BED entry. Can be 'tss', 'tes' or 'center'
#' @param up Default 2500. Only applicable if \code{startFrom} is provided.
#' @param down Default 2500. Only applicable if \code{startFrom} is provided.
#' @param op_dir Directory to store results. Defult "./"
#' @param rmAfter remove matrix files generated by bwtool once data is read into R. Default TRUE.
#' @param keepBed Keep output BED as is? Default TRUE
#' @param bw_path path to bwtool. Default looks under system path.
#' @param bedHeader Does input BED file has header. Default FALSE
#' @param nthreads Threads to use. Default 4.
#' @param remove_dups Remove duplicated BED entries with same start and end coordinates. Default FALSE.
#'
#' @export
#'
extract_summary = function(coldata = NULL, bed = NULL, genome = NULL, startFrom = NULL, up = 2500, down = 2500, op_dir = "./",
                           rmAfter = TRUE, keepBed = TRUE, bwt_path = NULL,
                           bedHeader = FALSE, nthreads = 4, remove_dups = FALSE){

  check_bwtools(path = bwt_path, warn = FALSE)
  up = as.numeric(up)
  down = as.numeric(down)

  if(is.null(coldata)){
    stop("Missing coldata. Use read_coldata to generate one.")
  }

  if(is.null(bed)){
    if(is.null(genome)){
      stop("Please provide a BED file or a genome")
    }else{
      if(is.null(startFrom)){
        message("Estimating signal around tss +/- 2500")
        startFrom = "tss"
        bedSimple = make_genome_bed2(genome = genome, op_dir = op_dir, startFrom = "tss", up = 2500, down = 2500)
      }else{
        bedSimple = make_genome_bed2(genome = genome, op_dir = op_dir, startFrom = startFrom, up = up, down = down)
      }
    }
  }else{
    if(is.null(startFrom)){
      bedSimple = make_bed(bed = bed, op_dir = op_dir)
    }else{
      bedSimple = extend_bed(bed = bed, op_dir = op_dir, startFrom = startFrom, up = up, down = down)
    }
  }


  op_dir = paste0(op_dir, "/")
  if(!dir.exists(paths = op_dir)){
    dir.create(path = op_dir, showWarnings = FALSE, recursive = TRUE)
  }

  summaries = parallel::mclapply(seq_along(1:nrow(coldata)), FUN = function(i){
    #bn = gsub(pattern = "\\.bw$|\\.bigWig$", replacement = "", x = basename(bw))
    bn = coldata$sample_names[i]
    bw = coldata$files[i]
    message(paste0("Processing ", bn, " .."))
    cmd = paste("bwtool summary -with-sum -keep-bed -header", bedSimple, bw, paste0(op_dir, bn, ".summary"))
    system(command = cmd, intern = TRUE)
    paste0(op_dir, bn, ".summary")
  }, mc.cores = nthreads)


  #summaries = list.files(path = op_dir, pattern = "*\\.summary$", full.names = TRUE)
  summary_list = lapply(summaries, function(x){
      x = data.table::fread(x)
      colnames(x)[1] = 'chromosome'
      x = x[,.(chromosome, start, end, size, sum)]
      x[,id := paste0(chromosome, ":", start, "-", end)]

      if(remove_dups){
        if(nrow(x[duplicated(id)]) > 0){
          warning("Duplicated BED entries found. Retained entries with largest value.")
          x = x[order(sum, decreasing = TRUE)]
          x = x[!duplicated(id)]
        }
      }
      x
    })



  names(summary_list) = gsub(pattern = "*\\.summary$", replacement = "", x = basename(path = unlist(summaries)))
  #return(summary_list)

  if(rmAfter){
    lapply(summaries, function(x) system(command = paste0("rm ", x), intern = TRUE))
  }

  if(keepBed){
    if(!is.null(bed)){
      if(is.data.frame(x = bed)){
        b = bed
        colnames(b)[1:3] = c("chrom", "start", "end")
      }else{
        if(as.logical(length(grep(pattern = '\\.gz$', x = bed, fixed = FALSE)))){
          b = data.table::fread(cmd = paste0("zcat < ", bed), header = bedHeader)
          colnames(b)[1:3] = c("chrom", "start", "end")
        }else{
          b = data.table::fread(input = bed, header = bedHeader)
          colnames(b)[1:3] = c("chrom", "start", "end")
        }
      }
    }else{
      b = data.table::fread(input = bedSimple, header = bedHeader)
      colnames(b)[1:3] = c("chrom", "start", "end")
    }
  }

  if(remove_dups){
    sum_tbl = data.table::rbindlist(l = summary_list, idcol = "sample", use.names = TRUE, fill = TRUE)
    sum_tbl = data.table::dcast(data = sum_tbl, formula =  id ~ sample, value.var = "sum")
    #return(sum_tbl)
    keepBed = FALSE #FIX this later!!
    if(keepBed){
      sum_tbl[,chromosome := NULL][,start := NULL][, end := NULL]
      b[, id :=paste0(chrom, ":", start, "-", end)]
      b = b[!duplicated(id)][order(id)]
      print(head(b))
      print(head(sum_tbl[order(id)]))
      sum_tbl = merge(b, sum_tbl, by = "id")
    }

  }else{
    sum_tbl = data.table::as.data.table(lapply(summary_list, function(x) x[,sum]))
    sum_tbl = cbind(summary_list[[1]][,.(chromosome, start, end, size, id)], sum_tbl)
    keepBed = FALSE #FIX this later!!
    if(keepBed){
      sum_tbl[,chromosome := NULL][,start := NULL][, end := NULL]
      sum_tbl = cbind(b, sum_tbl)
    }
  }

  system(command = paste0("rm ", bedSimple))

  message("Done!")
  if(is.null(startFrom)){
    up = down = NULL
    return(list(summaries = sum_tbl, cdata = coldata, param = c(up = up, down = down, startFrom = startFrom, genome = genome)))
  }else{
    return(list(summaries = sum_tbl, cdata = coldata, param = c(up = up, down = down, startFrom = startFrom, genome = genome)))
  }
}
PoisonAlien/peakseason documentation built on May 14, 2019, 4:01 a.m.