#' Wrapper method for processing ENCODE HAIB and Caltech HTS data
#'
#' \code{process_bismark_beatson_wrap} is a wrapper method for processing HTS
#' data and returning the methylation promoter regions and the corresponding
#' gene expression data for those promoter regions. Note that the format of
#' BS-Seq data should be in the bismark format and for the RNA-Seq data in
#' Beatson bed format.
#'
#' @param bs_files Filename (or vector of filenames if there are replicates) of
#' the BS-Seq '.bed' formatted data to read values from.
#' @param rna_files Filename of the RNA-Seq '.bed' formatted data to read values
#' from. Currently, this version does not support pooling RNA-Seq replicates.
#' @param chrom_size_file Optional filename containing genome chromosome sizes.
#' @param chr_discarded A vector with chromosome names to be discarded.
#' @param upstream Integer defining the length of bp upstream of TSS for
#' creating the promoter region.
#' @param downstream Integer defining the length of bp downstream of TSS for
#' creating the promoter region.
#' @param min_bs_cov The minimum number of reads mapping to each CpG site. CpGs
#' with less reads will be considered as noise and will be discarded.
#' @param max_bs_cov The maximum number of reads mapping to each CpG site. CpGs
#' with more reads will be considered as noise and will be discarded.
#' @param gene_log2_transf Logical, whether or not to log2 transform the gene
#' expression data.
#' @param gene_outl_thresh Logical, whehter or not to remove outlier gene
#' expression data.
#' @param gex_outlier Numeric, denoting the threshold above of which the gene
#' expression data (before the log2 transformation) are considered as noise.
#' @inheritParams create_methyl_region
#'
#' @return A \code{processHTS} object which contains following information:
#' \itemize{ \item{ \code{methyl_region}: A list containing methylation data,
#' where each entry in the list is an \eqn{L_{i} X 3} dimensional matrix,
#' where \eqn{L_{i}} denotes the number of CpGs found in region \code{i}. The
#' columns contain the following information: \enumerate{ \item{ 1st column:
#' Contains the locations of CpGs relative to TSS. Note that the actual
#' locations are scaled to the (fmin, fmax) region. } \item{ 2nd column:
#' Contains the total reads of each CpG in the corresponding location.} \item{
#' 3rd column: Contains the methylated reads each CpG in the corresponding
#' location.} } } \item{\code{gex}: A vector containing the corresponding gene
#' expression levels for each entry of the \code{methyl_region} list.} \item{
#' \code{prom_region}: A \code{\link[GenomicRanges]{GRanges}} object
#' containing corresponding annotated promoter regions for each entry of the
#' \code{methyl_region} list. The GRanges object has one additional metadata
#' column named \code{tss}, which stores the TSS of each promoter. } \item{
#' \code{rna_data}: A \code{\link[GenomicRanges]{GRanges}} object containing
#' the corresponding RNA-Seq data for each entry of the \code{methyl_region}
#' list. The GRanges object has three additional metadata columns which are
#' explained in \code{\link{read_rna_encode_caltech}}} \item{ \code{upstream}:
#' Integer defining the length of bp upstream of TSS.} \item{
#' \code{downstream}: Integer defining the length of bp downstream of TSS.}
#' \item{ \code{cpg_density}: Integer defining the minimum number of CpGs that
#' have to be in a methylated region. Regions with less than \code{n} CpGs are
#' discarded.} \item{ \code{sd_thresh}: Numeric defining the minimum standard
#' deviation of the methylation change in a region. This is used to filter
#' regions with no methylation change.} \item{ \code{fmin}: Minimum range
#' value for region location scaling.} \item{ \code{fmax}: Maximum range value
#' for region location scaling.} }
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @export
process_bismark_beatson_wrap <- function(bs_files, rna_files,
chrom_size_file = NULL,
chr_discarded = NULL, upstream = -7000,
downstream = 7000, min_bs_cov = 4,
max_bs_cov = 1000, cpg_density = 10,
sd_thresh = 10e-02,
ignore_strand = TRUE,
gene_log2_transf = TRUE,
gene_outl_thresh = TRUE,
gex_outlier = 300,
fmin = -1, fmax = 1){
# Process BS-Seq file and return data in the required format
bs_data <- preprocess_bs_seq(files = bs_files,
file_format = "bismark_cov",
chr_discarded = chr_discarded,
min_bs_cov = min_bs_cov,
max_bs_cov = max_bs_cov)
# Read the chromosome size file, if it is supplied
if (!is.null(chrom_size_file)){
chrom_size <- read_chrom_size(file = chrom_size_file)
}else{
chrom_size <- NULL
}
# Read RNA-Seq BED file
rna_data <- read_rna_beatson(file = rna_files,
chr_discarded = chr_discarded,
is_GRanges = TRUE)
# Create promoter regions
prom_reg <- create_prom_region(annot_data = rna_data,
chrom_size = chrom_size,
upstream = upstream,
downstream = downstream)
# Create methylation regions data
methyl_reg <- create_methyl_region(bs_data = bs_data,
prom_region = prom_reg,
cpg_density = cpg_density,
sd_thresh = sd_thresh,
ignore_strand = ignore_strand,
fmin = fmin,
fmax = fmax)
# # Keep only the corresponding gene expression data
# rna_data <- rna_data[methyl_reg$prom_ind]
# # Keep only the corresponding gene annotation data
# prom_reg <- prom_reg[methyl_reg$prom_ind]
proc_data <- preprocess_final_HTS_data(methyl_region = methyl_reg,
prom_reg = prom_reg,
rna_data = rna_data,
gene_log2_transf = gene_log2_transf,
gene_outl_thresh = gene_outl_thresh,
gex_outlier = gex_outlier)
# Create object
obj <- structure(list(methyl_region = proc_data$methyl_region,
gex = proc_data$gex,
prom_region = proc_data$prom_reg,
rna_data = proc_data$rna_data,
upstream = upstream,
downstream = downstream,
cpg_density = cpg_density,
sd_thresh = sd_thresh,
fmin = fmin,
fmax = fmax),
class = "processHTS")
return(obj)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.