#' Extract area under the curve for every peak from from given bigWig files.
#' @details This function is a wrapper around bwtool summary command and extracts peak intesities from bigWig files for every peak in an input BED file.
#' Once extraction is complete signal values are combined into a single data.table
#'
#' @param coldata Coldata generated from \code{\link{read_coldata}}
#' @param bed Input bed file. Can also be a gz compressed narrowPeak/BED
#' @param genome can be hg19 or mm10
#' @param startFrom NULL Default is to estimate complete AUC for each BED entry. Can be 'tss', 'tes' or 'center'
#' @param up Default 2500. Only applicable if \code{startFrom} is provided.
#' @param down Default 2500. Only applicable if \code{startFrom} is provided.
#' @param op_dir Directory to store results. Defult "./"
#' @param rmAfter remove matrix files generated by bwtool once data is read into R. Default TRUE.
#' @param keepBed Keep output BED as is? Default TRUE
#' @param bw_path path to bwtool. Default looks under system path.
#' @param bedHeader Does input BED file has header. Default FALSE
#' @param nthreads Threads to use. Default 4.
#' @param remove_dups Remove duplicated BED entries with same start and end coordinates. Default FALSE.
#'
#' @export
#'
extract_summary = function(coldata = NULL, bed = NULL, genome = NULL, startFrom = NULL, up = 2500, down = 2500, op_dir = "./",
rmAfter = TRUE, keepBed = TRUE, bwt_path = NULL,
bedHeader = FALSE, nthreads = 4, remove_dups = FALSE){
check_bwtools(path = bwt_path, warn = FALSE)
up = as.numeric(up)
down = as.numeric(down)
if(is.null(coldata)){
stop("Missing coldata. Use read_coldata to generate one.")
}
if(is.null(bed)){
if(is.null(genome)){
stop("Please provide a BED file or a genome")
}else{
if(is.null(startFrom)){
message("Estimating signal around tss +/- 2500")
startFrom = "tss"
bedSimple = make_genome_bed2(genome = genome, op_dir = op_dir, startFrom = "tss", up = 2500, down = 2500)
}else{
bedSimple = make_genome_bed2(genome = genome, op_dir = op_dir, startFrom = startFrom, up = up, down = down)
}
}
}else{
if(is.null(startFrom)){
bedSimple = make_bed(bed = bed, op_dir = op_dir)
}else{
bedSimple = extend_bed(bed = bed, op_dir = op_dir, startFrom = startFrom, up = up, down = down)
}
}
op_dir = paste0(op_dir, "/")
if(!dir.exists(paths = op_dir)){
dir.create(path = op_dir, showWarnings = FALSE, recursive = TRUE)
}
summaries = parallel::mclapply(seq_along(1:nrow(coldata)), FUN = function(i){
#bn = gsub(pattern = "\\.bw$|\\.bigWig$", replacement = "", x = basename(bw))
bn = coldata$sample_names[i]
bw = coldata$files[i]
message(paste0("Processing ", bn, " .."))
cmd = paste("bwtool summary -with-sum -keep-bed -header", bedSimple, bw, paste0(op_dir, bn, ".summary"))
system(command = cmd, intern = TRUE)
paste0(op_dir, bn, ".summary")
}, mc.cores = nthreads)
#summaries = list.files(path = op_dir, pattern = "*\\.summary$", full.names = TRUE)
summary_list = lapply(summaries, function(x){
x = data.table::fread(x)
colnames(x)[1] = 'chromosome'
x = x[,.(chromosome, start, end, size, sum)]
x[,id := paste0(chromosome, ":", start, "-", end)]
if(remove_dups){
if(nrow(x[duplicated(id)]) > 0){
warning("Duplicated BED entries found. Retained entries with largest value.")
x = x[order(sum, decreasing = TRUE)]
x = x[!duplicated(id)]
}
}
x
})
names(summary_list) = gsub(pattern = "*\\.summary$", replacement = "", x = basename(path = unlist(summaries)))
#return(summary_list)
if(rmAfter){
lapply(summaries, function(x) system(command = paste0("rm ", x), intern = TRUE))
}
if(keepBed){
if(!is.null(bed)){
if(is.data.frame(x = bed)){
b = bed
colnames(b)[1:3] = c("chrom", "start", "end")
}else{
if(as.logical(length(grep(pattern = '\\.gz$', x = bed, fixed = FALSE)))){
b = data.table::fread(cmd = paste0("zcat < ", bed), header = bedHeader)
colnames(b)[1:3] = c("chrom", "start", "end")
}else{
b = data.table::fread(input = bed, header = bedHeader)
colnames(b)[1:3] = c("chrom", "start", "end")
}
}
}else{
b = data.table::fread(input = bedSimple, header = bedHeader)
colnames(b)[1:3] = c("chrom", "start", "end")
}
}
if(remove_dups){
sum_tbl = data.table::rbindlist(l = summary_list, idcol = "sample", use.names = TRUE, fill = TRUE)
sum_tbl = data.table::dcast(data = sum_tbl, formula = id ~ sample, value.var = "sum")
#return(sum_tbl)
keepBed = FALSE #FIX this later!!
if(keepBed){
sum_tbl[,chromosome := NULL][,start := NULL][, end := NULL]
b[, id :=paste0(chrom, ":", start, "-", end)]
b = b[!duplicated(id)][order(id)]
print(head(b))
print(head(sum_tbl[order(id)]))
sum_tbl = merge(b, sum_tbl, by = "id")
}
}else{
sum_tbl = data.table::as.data.table(lapply(summary_list, function(x) x[,sum]))
sum_tbl = cbind(summary_list[[1]][,.(chromosome, start, end, size, id)], sum_tbl)
keepBed = FALSE #FIX this later!!
if(keepBed){
sum_tbl[,chromosome := NULL][,start := NULL][, end := NULL]
sum_tbl = cbind(b, sum_tbl)
}
}
system(command = paste0("rm ", bedSimple))
message("Done!")
if(is.null(startFrom)){
up = down = NULL
return(list(summaries = sum_tbl, cdata = coldata, param = c(up = up, down = down, startFrom = startFrom, genome = genome)))
}else{
return(list(summaries = sum_tbl, cdata = coldata, param = c(up = up, down = down, startFrom = startFrom, genome = genome)))
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.