#' Create methylation regions for each gene promoter.
#'
#' \code{create_methyl_region} creates methylation regions using BS-Seq and
#' annotated gene promoter regions. BS-Seq data give information for the
#' methylation of CpGs individually, and annotated data are used to locate the
#' TSS of each gene and its promoter region.
#'
#' @param bs_data \code{\link[GenomicRanges]{GRanges}} object containing the
#' BS-Seq data. The GRanges object should also have two additional metadata
#' columns named \code{total_reads} and \code{meth_reads}. A GRanges object
#' used in this function can be the output of
#' \code{\link{read_bs_encode_haib}} or its wrapper function
#' \code{\link{preprocess_bs_seq}}.
#' @param prom_region \code{\link[GenomicRanges]{GRanges}} object containing
#' promoter regions, i.e. N bp upstream and M bp downstream of TSS location.
#' The GRanges object should also have one additional metadata column named
#' \code{tss}. A GRanges object used in this function can be the output of
#' \code{\link{create_prom_region}}.
#' @param cpg_density Optional integer defining the minimum number of CpGs that
#' have to be in a methylated region. Regions with less than \code{n} CpGs are
#' discarded.
#' @param sd_thresh Optional numeric defining the minimum standard deviation of
#' the methylation change in a region. This is used to filter regions with no
#' methylation change.
#' @param ignore_strand Logical, whether or not to ignore strand information.
#' @param fmin Optional minimum range value for region location scaling. Under
#' this version, this parameter should be left to its default value.
#' @param fmax Optional maximum range value for region location scaling. Under
#' this version, this parameter should be left to its default value.
#'
#' @return A \code{methyl_region} object containing the following information:
#' \itemize{ \item{ \code{meth_data}: A list containing methylation data,
#' where each entry in the list is an \eqn{L_{i} X 3} dimensional matrix,
#' where \eqn{L_{i}} denotes the number of CpGs found in region \code{i}. The
#' columns contain the following information: \enumerate{ \item{ 1st column:
#' Contains the locations of CpGs relative to TSS. Note that the actual
#' locations are scaled to the (fmin, fmax) region. } \item{ 2nd column:
#' Contains the total reads of each CpG in the corresponding location.} \item{
#' 3rd column: Contains the methylated reads each CpG in the corresponding
#' location.} } } \item{ \code{prom_ind}: A vector storing the corresponding
#' promoter indices, so as to map each methylation region with its
#' corresponding gene promoter.} } The lengths of \code{meth_data} and
#' \code{prom_ind} should be the same.
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @seealso \code{\link{preprocess_bs_seq}}, \code{\link{create_prom_region}}
#'
#' @examples
#' # Load the gene annotation example dataset
#' ann_data <- annot_data
#' prom_reg <- create_prom_region(ann_data, upstream=-10000, downstream=10000)
#' bs_data <- rrbs_data
#'
#' # Create the meth_region object
#' meth_regions <- create_methyl_region(bs_data, prom_reg)
#' # Extract the promoter indices
#' prom_indices <- meth_regions$prom_ind
#'
#' # Extract the methylated data list
#' methylated_data <- meth_regions$meth_data
#' # Get the 1st methylated region
#' meth_reg <- methylated_data[[1]]
#'
#' @importFrom stats sd
#' @importFrom methods is
#' @export
create_methyl_region <- function(bs_data, prom_region, cpg_density = 1,
sd_thresh = 0, ignore_strand = FALSE,
fmin = -1, fmax = 1){
message("Creating methylation regions ...")
assertthat::assert_that(methods::is(bs_data, "GRanges"))
assertthat::assert_that(methods::is(prom_region, "GRanges"))
# Find overlaps between promoter regions and BS-Seq data -------
overlaps <- GenomicRanges::findOverlaps(query = prom_region,
subject = bs_data,
ignore.strand = ignore_strand)
if (length(overlaps) < 2){
stop("Not enough matches between the BS-Seq and RNA-Seq data.")
}
# Convert data in vector format for faster lookup --------------
query_hits <- S4Vectors::queryHits(overlaps)
subj_hits <- S4Vectors::subjectHits(overlaps)
# Indices of promoter locations
prom_loc <- unique(query_hits)
tss_loc <- prom_region$tss # TSS locations
tss_strand <- as.character(GenomicRanges::strand(prom_region))
cpg_loc <- GenomicRanges::ranges(bs_data)@start # CpG locations
tot_reads <- bs_data$total_reads # Total reads
meth_reads <- bs_data$meth_reads # Methylated reads
# Extract upstream and downstream lengths in bps
width <- GenomicRanges::ranges(prom_region)@width[1]
if (identical(tss_strand[1], "+")){
upstream <- GenomicRanges::ranges(prom_region)@start[1] - tss_loc[1]
downstream <- width + upstream - 1
}else if (identical(tss_strand[1], "-")){
downstream <- abs(GenomicRanges::ranges(prom_region)@start[1] - tss_loc[1])
upstream <- downstream - width + 1
}
# Initialize variables -----------------------------------------
n <- 1 # Data points counter
LABEL <- FALSE # Flag variable
meth_data <- list() # List where data will be stored
prom_counter <- 0 # Promoter counter
prom_ind <- vector(mode = "integer") # Vector of promoter indices
cpg_ind <- vector(mode = "integer") # Vector of CpG indices
cpg_ind <- c(cpg_ind, subj_hits[1]) # Add the first subject hit
for (i in 2:NROW(query_hits)){
# If query hits is the same as the previous one
if (query_hits[i] == query_hits[i - 1]){
cpg_ind <- c(cpg_ind, subj_hits[i]) # Add subject hit
}else{
prom_counter <- prom_counter + 1 # Increase promoter counter
LABEL <- TRUE
}
if(LABEL){
# Only keep regions that have more than 'n' CpGs
if (length(cpg_ind) > cpg_density){
# If standard deviation of the methylation level is above threshold
if (sd(meth_reads[cpg_ind] / tot_reads[cpg_ind]) > sd_thresh){
# Promoter indices
prom_ind <- c(prom_ind, prom_loc[prom_counter])
# Locations of CpGs in the genome
region <- cpg_loc[cpg_ind]
# TSS location for promoter 'promCount'
tss <- tss_loc[prom_loc[prom_counter]]
# Extract strand information, i.e. direction
strand_direction <- tss_strand[prom_loc[prom_counter]]
# Shift CpG locations relative to TSS
center_data <- .center_loc(region = region,
tss = tss,
strand_direction = strand_direction)
# In the "-" strand the order of the locations should change
Order <- base::order(center_data)
meth_data[[n]] <- matrix(0, nrow = length(cpg_ind), ncol = 3)
# Store normalized locations of methylated CpGs in (fmin,fmax) region
meth_data[[n]][ ,1] <- minmax_scaling(data = center_data[Order],
xmin = upstream,
xmax = downstream,
fmin = fmin,
fmax = fmax)
# Store total reads in the corresponding locations
meth_data[[n]][ ,2] <- tot_reads[cpg_ind][Order]
# Store methylated reads in the corresponding locations
meth_data[[n]][ ,3] <- meth_reads[cpg_ind][Order]
# Increase data points counter
n <- n + 1
}
}
LABEL <- FALSE
cpg_ind <- vector(mode="integer")
cpg_ind <- c(cpg_ind, subj_hits[i])
}
}
methyl_region <- structure(list(meth_data = meth_data,
prom_ind = prom_ind),
class = "methyl_region")
message("Done!\n")
return(methyl_region)
}
# Center CpG locations relative to TSS
#
# \code{center_loc} centera CpG locations relative to TSS
#
# @param region CpG locations
# @param tss TSS location
# @param strand_direction Strand direction
#
# @return Centered location data relative to TSS
#
.center_loc <- function(region, tss, strand_direction){
assertthat::assert_that(is.character(strand_direction))
center <- region - tss
if (identical(strand_direction, "-")){
center <- (-center) # If '-' strand, swap CpG locations
}
return(center)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.