R/read_bs_encode_haib.R

#' Read ENCODE HAIB bed formatted BS-Seq file
#'
#' \code{read_bs_encode_haib} reads a file containing methylation data from
#' BS-Seq experiments using the \code{\link{scan}} function. The BS-Seq file
#' should be in ENCODE HAIB \code{bed} format. Read the Important section below
#' on when to use this function.
#'
#' @param file The name of the file to read data values from.
#' @param chr_discarded A vector with chromosome names to be discarded.
#' @param is_GRanges Logical: if TRUE a GRanges object is returned, otherwise a
#'   data.frame object is returned.
#'
#' @return A \code{\link[GenomicRanges]{GRanges}} object if \code{is_GRanges} is
#'   TRUE, otherwise a \code{\link[data.table]{data.table}} object.
#'
#'   The GRanges object contains two additional metadata columns: \itemize{
#'   \item \code{total_reads}: total reads mapped to each genomic location.
#'   \item \code{meth_reads}: methylated reads mapped to each genomic location.
#'   } These columns can be accessed as follows:
#'   \code{granges_object$total_reads}
#'
#' @section Important: Unless you want to create a different workflow when
#'   processing the BS-Seq data, you should NOT call this function, since this
#'   is a helper function. Instead you should call the
#'   \code{\link{preprocess_bs_seq}} function.
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @references
#' \url{http://rohsdb.cmb.usc.edu/GBshape/cgi-bin/hgTables?db=hg19&hgta_group=regulation&hgta_track=wgEncodeHaibMethylRrbs&hgta_table=wgEncodeHaibMethylRrbsBcbreast0203015BiochainSitesRep2&hgta_doSchema=describe+table+schema}
#'
#' @seealso \code{\link{pool_bs_seq_rep}}, \code{\link{preprocess_bs_seq}}
#'
#' @examples
#' # Get the location of the RRBS file
#' rrbs_file <- system.file("extdata", "rrbs.bed", package = "processHTS")
#' bs_data <- read_bs_encode_haib(file = rrbs_file, chr_discarded = "chr1", is_GRanges = TRUE)
#'
#' @export
read_bs_encode_haib <- function(file, chr_discarded = NULL, is_GRanges = TRUE){
  message("Reading file ", file, " ...")
  data_raw <- scan(file=file,
                   skip=1,
                   sep="\t",
                   what=list("character",  # Reference chromosome or scaffold
                             integer(),    # Start position in chromosome
                             NULL,         # End position in chromosome
                             NULL,         # Name of item
                             integer(),    # Score from 0-1000. Capped number
                             "character",  # Strand : + or - or . for unknown
                             NULL,         # Start position
                             NULL,         # End position
                             NULL,         # Color value R,G,B
                             NULL,         # Number of reads or coverage
                             integer()     # Methylation percentage
                   ))


  # Convert to actual methylated reads -------------------------
  data_raw[[11]] <- as.integer(round(0.01 * data_raw[[5]] * data_raw[[11]]))


  # Store only required fields
  bs_data <- data.table::data.table(chr = data_raw[[1]],
                                    start = data_raw[[2]],
                                    strand = data_raw[[6]],
                                    total_reads = data_raw[[5]],
                                    meth_reads = data_raw[[11]])
  rm(data_raw)


  # Remove selected chromosomes  -------------------------------
  bs_data <- discard_chr(x = bs_data, chr_discarded = chr_discarded)


  # Sorting data -----------------------------------------------
  # With order priority: 1. chr, 2. start, 3. strand
  message("Sorting BS-Seq data ...")
  bs_data <- bs_data[order(bs_data$chr, bs_data$start, bs_data$strand)]


  if (is_GRanges){
    # Create a GRanges object -----------------------------------
    message("Creating GRanges object ...")
    bs_data <- GenomicRanges::GRanges(seqnames = bs_data$chr,
                      strand = bs_data$strand,
                      ranges = IRanges::IRanges(start=bs_data$start, width=1),
                      total_reads = bs_data$total_reads,
                      meth_reads  = bs_data$meth_reads)
  }
  message("Finished reading BS-Seq file!\n")
  return(bs_data)
}
andreaskapou/processHTS documentation built on May 12, 2019, 3:33 a.m.