R/loadBED12.R

#' loadBED12
#' 
#' Load CAGE data from a file in BED12 format.
#' 
#' Loads a BED12-formatted file in a \code{data.table}, and adds an extra column
#' holding the sample name (same for each row).  If no sample name is provided, 
#' it is created from the file name by removing the \code{.bed} and compression 
#' extensions.
#' 
#' We use the BED12 format to represent data related to the CAGE and CAGEscan 
#' methods.  In brief, if a file represents \sQuote{CAGEscan pairs}, then each 
#' line is one read count and the score is the sum of the mapping qualities of 
#' both reads.  In files representing \sQuote{CAGEscan fragments}, each line is 
#' one RNA molecule and the score is the number of CAGEscan pairs that were used
#' to build the fragment.  For \sQuote{CAGEscan clusters}, each line is one 
#' transcript model and the score is the number of molecules used to build the 
#' model.
#' 
#' @param file Name of the BED12 file or full path to it.  If multiple names are
#'   provided, multiple files will be loaded, but in that case sample names
#'   cannot be provided with the \code{samplename} argument (see below).
#' @param samplename Optional. Name of the sample represented by the file.
#'   
#' @return When the file is found, \code{loadBED12} always returns a 
#'   \code{data.table}, so that it can safely be used in an accumulator loop.  It
#'   will check if the file is empty, because our current pipeline produces
#'   such files when a sample contains no properly paired reads.  In that case,
#'   it will return an empty \code{data.table}.  If the file is not found, it
#'   aborts with an error.  The columns \code{chrom} and \code{library}
#'   are factors.  The other columns are numeric or character according to their
#'   contents. 
#'   
#' @seealso \code{\link{bedFieldNames}}, \code{\link{data.table}}
#'   
#' @examples 
#' fileA <- system.file("extdata", "BED12_A.bed", package="smallCAGEqc")
#' fileB <- system.file("extdata", "BED12_B.bed", package="smallCAGEqc")
#' fileC <- system.file("extdata", "BED12_C.bed", package="smallCAGEqc")
#' 
#' loadBED12(fileA, "A")
#' loadBED12(c(fileA, fileB, fileC))
#'   
#' @export loadBED12

loadBED12 <- function(file, samplename) {
  # Load one or several BED12 files into a single data.table carrying an extra
  # `library` column (the sample name).  `chrom` and `library` are converted to
  # factors just before returning.

  # Fail early with an explicit message; a zero-length `file` previously fell
  # through both branches below and died with "object 'bed' not found".
  if (length(file) == 0)
    stop("No file name provided")

  # A single `samplename` cannot describe several files; check before any I/O.
  if (length(file) > 1 && ! missing(samplename))
    stop("Sample names not yet supported when loading multiple files")

  # Load a single file.  Aborts if the file does not exist, and returns an
  # empty data.table when the file has a size of zero bytes.
  loadOneFile <- function(file, samplename) {
    if (! file.exists(file))
      stop(paste("Could not find file:", file))

    if (file.info(file)$size == 0)
      return(data.table::data.table())

    # Default sample name: the base name minus a trailing ".bed" extension and
    # an optional compression suffix.  The dots are escaped and the pattern is
    # anchored at the end, so that names merely containing "bed" (for example
    # "embedded_sample.bed") are not mangled.
    if (missing(samplename))
      samplename <- sub("\\.bed(\\.gz|\\.bz2|\\.xz)?$", "", basename(file))

    DT <- data.table::fread(file, sep = "\t")
    data.table::setnames(DT, bedFieldNames())
    DT$library <- samplename
    DT
  }

  if (length(file) == 1) {
    bed <- loadOneFile(file, samplename)
  } else {
    # Read every file first and bind once, instead of re-copying the growing
    # table for each additional file.  Each file gets its default sample name.
    bed <- data.table::rbindlist(lapply(file, loadOneFile), use.names = TRUE)
  }

  bed$chrom   <- factor(bed$chrom  )
  bed$library <- factor(bed$library)
  bed
}
charles-plessy/smallCAGEqc documentation built on May 13, 2019, 3:31 p.m.