R/read_data.R

Defines functions read_annotation read_methylation_report

Documented in read_annotation read_methylation_report

#' @title  Read a methylation report file 
#' 
#' @description \code{read_methylation_report} read a file containing methylation information using \code{readr}.
#'              In general, the metyylation report file was generated by \code{Bismark methylation extractor}.
#'              The methylation report file format is the following: 
#'              <chromosome> <position> <strand> <count methylated> <count unmethylated> <C-context> <trinucleotide context>
#'              Note: the 7th column is optional.
#' 
#' @param file the path of methylation report file
#' 
#' @param min_coverage filter out the sites which total reads < min_coverage. the default is 4.
#' 
#' @param type specify the methylation context type : "CG", "CHG" or "CHH",the default is "CG".
#'
#' @return A \code{GRanges} object. 
#' The GRanges object contains three additional metadata columns:
#'   \itemize{ \item{ \code{methylated_reads}: the number if methylated reads for each C sites.}  
#'   \item{ \code{total_reads}: Total number of reads for each C sites.} 
#'   \item{ \code{type}:the C-context type: CG,CHG, or CHH} 
#'   }
#' 
#' @author Hongen Kang  \email{geneprophet@163.com}
#' 
#' @export
#'
#' @examples
#' # Obtain the path to files
#' \dontrun{
#' #' file <- system.file("extdata", "example_human_chr22_CpG_report.txt", package = "BSDMR")
#' human_met <- read_methylation_report(file,min_coverage=4)
#' file <- system.file("extdata", "example_mouse_chr15_CpG_report.txt", package = "BSDMR")
#' mouse_met <- read_methylation_report(file,min_coverage=4)
#' }
#' 
read_methylation_report <- function(file,
                                    min_coverage = 4,
                                    type = "CG"){
  message("reading methylation report file ...")
  met_report <- readr::read_delim(file, "\t", 
                                  escape_double = FALSE, col_names = FALSE,
                                  col_types = cols(X1 = col_character(), X4 = col_integer(), X5 = col_integer()),
                                  trim_ws = TRUE)
  #filter sites which coverages under min_coverage and non-methylation
  met_report <- met_report %>% dplyr::filter(X4+X5>=min_coverage & X4>0 & X6 == type)
  
  CpG_gr <- GenomicRanges::GRanges(
    seqnames = paste0("chr",met_report$X1),
    ranges = IRanges::IRanges(start = met_report$X2, end = met_report$X2),
    strand = met_report$X3,
    methylated_reads = met_report$X4,
    total_reads = met_report$X4+met_report$X5,
    type = met_report$X6
  )
  
  return(CpG_gr)
  
}

#' @title Read a annotation file
#' 
#' @description \code{read_annotation} reads a file containing annotation data, which
#'   will be used to select genomic regions for downstream analysis.
#'   The annotation file format is the following: "chromosome", "EnsemblID", "name","start", "end",
#'   "strand".
#'
#' @param file the path of annotation file
#'
#' @return A \code{GRanges} object.
#' The GRanges object contains three additional metadata columns:
#'      \itemize{ \item{ \code{id}: EnsemblID}  
#'      \item{ \code{center}: the center of the region,(strat+end)/2} 
#'      \item{ \code{annotation}:the annotation of the region, it can be promoter,downstream,gene,UTR...} }
#' 
#' 
#' 
#' @author Hongen Kang  \email{geneprophet@163.com}
#' @export
#'
#' @examples
#' # Obtain the path to files
#' \dontrun{
#' #' file <- system.file("extdata", "human_chr22_annotation.txt", package = "BSDMR")
#' human_anno <- read_annotation(file)
#' file <- system.file("extdata", "mouse_chr15_annotation.txt", package = "BSDMR")
#' mouse_anno <- read_annotation(file)
#' }
#' 
read_annotation <- function(file){
  message("reading annotation file...")
  anno <- readr::read_delim(file, 
                            "\t", 
                            escape_double = FALSE,
                            col_names = FALSE, 
                            col_types = cols(X1 = col_character()), 
                            trim_ws = TRUE)
  anno_gr <- GenomicRanges::GRanges(
    seqnames = paste0("chr",anno$X1),
    ranges = IRanges::IRanges(start = anno$X4,end = anno$X5),
    strand = anno$X6,
    id = anno$X2,
    center = round((anno$X4+anno$X5)/2),
    annotation = anno$X3
  )
  return(anno_gr)
}
geneprophet/BSDMR documentation built on March 3, 2021, 5:50 a.m.