R/extract_matrix.R

Defines functions extract_matrix

Documented in extract_matrix

#' Extract matrix of signal values from given bigWig files around region of interset - bed file.
#' @details This function is a wrapper around bwtool matrix command and extracts signals from bigWig files from an input BED file.
#' Once extraction is complete matrix can be summarized by mean or median signals.
#'
#' @param coldata Input coldata generated from \code{\link{read_coldata}}
#' @param bed Input bed file
#' @param genome Can be hg19 or mm10. Default NULL. If \code{bed} and \code{genome} arguments are given, \code{bed} will be used.
#' @param binSize Default 25bp
#' @param startFrom region of interest. Default from peak 'center'. Can be 'tss' or 'tes'.
#' @param up upsteram in bp. Default 2500.
#' @param down downstream in bp. Default 2500.
#' @param k  cluster regions with k-means where k is the number of clusters. Defualt NULL.
#' @param op_dir Directory to store results. Defult "./"
#' @param rmAfter remove matrix files generated by bwtool once data is read into R. Default TRUE.
#' @param bw_path path to bwtool. Default looks under system path.
#' @param nthreads threads to use. Default 4.
#'
#' @import data.table
#'
#' @export
#'
extract_matrix = function(coldata = NULL, bed = NULL, genome = NULL, binSize = 25, startFrom = "center",
                          up = 2500, down = 2500, k = NULL,
                          op_dir = "./", rmAfter = TRUE, bwt_path = NULL, nthreads = 4){

  check_bwtools(path = bwt_path, warn = FALSE)

  if(is.null(coldata)){
    stop("Missing coldata. Use read_coldata to generate one.")
  }

  if(!is.null(k)){
    k = as.numeric(as.character(k))
    if(k <2 | k >10){
      stop("k should be between 2 and 10")
    }
  }

  bigWigs = as.character(coldata$files)

  if(is.null(bed)){
    if(is.null(genome)){
      stop("Please provide either a BED file or genome")
    }else{
      if(length(genome) > 1){
        warning("Multiple genomes provided. Using first one..")
        genome = genome[1]
      }
      size = check_size(size = c(up, down))
      bed = make_genome_bed2(genome = genome, op_dir = op_dir, startFrom = startFrom, up = up, down = down)
    }
  }else{
    size = check_size(size = c(up, down))
    bed = make_bed(bed = bed, op_dir = op_dir)
  }


  if(is.null(k)){
    mats = parallel::mclapply(bigWigs, function(x){
      bwt_mats(bw = x, binSize = binSize, bed = bed, size = size, startFrom = startFrom, op_dir = op_dir)
    }, mc.cores = nthreads)
  }else{
    bws = paste0(bigWigs, collapse = ",")
    mats = bwt_mats_k(bw = bws, binSize = binSize, bed = bed, size = size, startFrom = startFrom, op_dir = op_dir, num_k = k)
  }


  mats = as.character(unlist(x = mats))
  sig_list = lapply(mats, data.table::fread)
  #names(sig_list) = gsub(pattern = "*\\.matrix$", replacement = "", x = basename(path = mats))
  if(is.null(k)){
    names(sig_list) = coldata$sample_names
  }else{
    names(sig_list) = "clustered_matrix"
  }

  if(rmAfter){
    lapply(mats, function(x) system(command = paste0("rm ", x), intern = TRUE))
  }

  sig_list$param = c(size = size, binSize = binSize, startFrom = startFrom, k_num = k)
  sig_list$cdata = coldata
  system(command = paste0("rm ", bed))

  sig_list
}
PoisonAlien/peakseason documentation built on May 14, 2019, 4:01 a.m.