R/process_data.R

Defines functions impute_bulk_met partition_bulk_dataset create_anno_region read_chrom_size read_expr create_region_object read_anno read_met

Documented in create_anno_region create_region_object impute_bulk_met partition_bulk_dataset read_anno read_chrom_size read_expr read_met

#' @title Read methylation file
#'
#' @description \code{read_met} reads a file containing methylation data using
#'   the \code{fread} function. Since there are different technologies, e.g.
#'   array, bulk BS-Seq, scBS-Seq, and still there is no standard file format,
#'   different options are available, check the Important section below on the
#'   file format for each type you choose. If a file format is not availabe, you
#'   need to read the file and create a \code{GRanges} object, where the data
#'   are ordered by chromosome and genomic location.
#'
#' @param file File name.
#' @param type Type of technology as character. Either "bulk_seq", "sc_seq" or
#'   "array". Check the Important section below for more details.
#' @param strand_info Logical, whether or not the file contains strand
#'   information.
#' @param chr_discarded Optional vector with chromosomes to be discarded.
#' @param min_bulk_cov Minimum number of reads mapping to each CpG site. Used
#'   only for "bulk_seq" and CpGs with less reads will be discarded as noise.
#' @param max_bulk_cov Maximum number of reads mapping to each CpG site. Used
#'   only for "bulk_seq" and CpGs with less reads will be discarded as noise.
#' @param delimiter Delimiter format the columns are splitted. Default is tab.
#'
#' @return A \code{GRanges} object.
#'
#'   The GRanges object contains one or two additional metadata columns:
#'   \itemize{ \item{ \code{met}: Methylation level. \itemize{ \item{For "array"
#'   this is the Beta or M-values} \item{For "sc_seq" this is either 0 or 1
#'   (unmethylated or methylated)} \item{For "bulk_seq" this contains the number
#'   of methylated reads for each CpG.} } }  \item{ \code{total}: Total number
#'   of reads for each CpG. Present only for "bulk_seq" type. } } These columns
#'   can be accessed as follows: \code{granges_obj$met}
#'
#' @section Important: Depending on technology \code{type} we assume different
#'   file formats. \itemize{ \item{ \code{"array"} File format: "chromosome",
#'   "start", "strand" (optional), "met" .} \item{ \code{"sc_seq"} File format:
#'   "chromosome", "start", "strand" (optional), "met" . Where "met" should
#'   contain only 1s or 0s. } \item{ \code{"bulk_seq"} File format:
#'   "chromosome", "start", "strand" (optional), "met", "total". } } By default
#'   columns are considered in tab delimited format.
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @seealso \code{\link{read_anno}}, \code{\link{read_expr}},
#'   \code{\link{create_region_object}}
#'
#' @examples
#' # Obtain the path to files
#' file <- system.file("extdata", "dummy_met.bed", package = "BPRMeth")
#' met_dt <- read_met(file)
#'
#' # Extract methylation level
#' met <- met_dt$met
#'
#' @export
read_met <- function(file, type = "sc_seq", strand_info = FALSE,
                     chr_discarded = NULL, min_bulk_cov = 4,
                     max_bulk_cov = 1000, delimiter = "\t"){

    chr <- NULL # So RMD CHECK does not complain
    message("Reading file ", file, " ...")
    met_dt <- data.table::fread(input = file, sep = delimiter,
                                stringsAsFactors = FALSE, showProgress = FALSE)
    if (identical(type, "sc_seq") || identical(type, "array")) {
        if (strand_info) {
            met_dt <- met_dt %>%
                data.table::setnames(c("chr", "start", "strand", "met")) %>%
                subset(!(chr %in% chr_discarded)) %>%
                .[, c("end") := list(start)] %>%
                .[,c("chr", "start", "end", "strand", "met")] %>%
                data.table::setkey(chr, start, end) %>% GRanges()
        }else {
            met_dt <- met_dt %>%
                data.table::setnames(c("chr", "start", "met")) %>%
                subset(!(chr %in% chr_discarded)) %>%
                .[, c("end") := list(start)] %>%
                .[,c("chr", "start", "end", "met")] %>%
                data.table::setkey(chr, start, end) %>% GRanges()
        }
    }else if (identical(type, "bulk_seq")) {
        if (strand_info) {
            met_dt <- met_dt %>%
                data.table::setnames(c("chr", "start", "strand", "met",
                                       "total")) %>%
                subset(!(chr %in% chr_discarded)) %>%
                .[, c("end") := list(start)] %>%
                .[, c("chr", "start", "end", "strand", "met", "total")] %>%
                data.table::setkey(chr, start, end) %>% GRanges()
        }else {
            met_dt <- met_dt %>%
                data.table::setnames(c("chr", "start", "met", "total")) %>%
                subset(!(chr %in% chr_discarded)) %>%
                .[, c("end") := list(start)] %>%
                .[, c("chr", "start", "end", "met", "total")] %>%
                data.table::setkey(chr, start, end) %>% GRanges()
        }
        met_dt <- subset(met_dt, met_dt$total >= min_bulk_cov)
        met_dt <- subset(met_dt, met_dt$total <= max_bulk_cov)
    }
    return(met_dt)
}


#' @title Read annotation file
#'
#' @description \code{read_anno} reads a file containing annotation data, which
#'   will be used to select genomic regions of interest for downstream analysis.
#'   The annotation file format is the following: "chromosome", "start", "end",
#'   "strand", "id", "name" (optional). Read the Important section below for
#'   more details.
#'
#' @param file File name.
#' @param chrom_size_file Optional file name to read genome chromosome sizes.
#' @param chr_discarded Optional vector with chromosomes to be discarded.
#' @param is_centre Logical, whether 'start' and 'end' locations are
#'   pre-centred. If TRUE, the mean of the locations will be chosen as centre.
#'   If FALSE, the 'start' will be chosen as the center; e.g. for genes the
#'   'start' denotes the TSS and we use this as centre to obtain K-bp upstream
#'   and downstream of TSS.
#' @param is_window Whether to consider a predefined window region around
#'   centre. If TRUE, then 'upstream' and 'downstream' parameters are used,
#'   otherwise we consider the whole region from start to end location.
#' @param upstream Integer defining the length of bp upstream of 'centre' for
#'   creating the genomic region. If is_window = FALSE, this parameter is
#'   ignored.
#' @param downstream Integer defining the length of bp downstream of 'centre'
#'   for creating the genomic region. If is_window = FALSE, this parameter is
#'   ignored.
#' @param is_anno_region Logical, whether or not to create genomic region. It
#'   should be set to TRUE, if you want to use the object as input to
#'   \code{\link{create_region_object}} function.
#' @param delimiter Delimiter format the columns are splitted. Default is tab.
#'
#' @return A \code{GRanges} object.
#'
#'   The GRanges object contains two or three additional metadata columns:
#'   \itemize{ \item{ \code{id}: Feature ID, e.g. Ensembl IDs for each gene.}
#'   \item{ \code{centre}: The central (middle) location of the genomic region.
#'   This is required when transforming 'methylation regions' in the (-1, 1)
#'   interval, the 'centre' would be at 0.} \item{\code{name} (Optional) the
#'   feature name.} } These columns can be accessed as follows:
#'   \code{granges_obj$id}
#'
#' @section Important: \itemize{ \item{When having strand information, and we
#'   are in the antisense strand ("-"), the 'start' is automatically switched
#'   with the 'end' location.} \item{By default columns are considered in tab
#'   delimited format.} \item{ The "name" feature is optional.} \item{When there
#'   is no "strand" info, this can be set to "*".}}
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @seealso \code{\link{read_met}}, \code{\link{create_anno_region}},
#'   \code{\link{create_region_object}}
#'
#' @examples
#' # Obtain the path to files
#' file <- system.file("extdata", "dummy_anno.bed", package = "BPRMeth")
#' anno_dt <- read_anno(file, chr_discarded = c("X"))
#'
#' # Extract feature id
#' anno_ids <- anno_dt$id
#'
#' @export
read_anno <- function(file, chrom_size_file = NULL, chr_discarded = NULL,
                      is_centre = FALSE, is_window = TRUE, upstream = -5000,
                      downstream = 5000, is_anno_region = TRUE,
                      delimiter = "\t") {

    if (!is_centre & !is_window) {
        stop("One of 'is_centre' or 'is_window' must be set to TRUE.")
    }
    chr <- NULL # So RMD CHECK does not complain
    message("Reading file ", file, " ...")
    anno_dt <- data.table::fread(input = file, sep = delimiter,
                                 stringsAsFactors = FALSE, showProgress = FALSE)
    # Read the chromosome size file, if it is supplied
    if (!is.null(chrom_size_file)) {
        chrom_size <- read_chrom_size(file = chrom_size_file)
    }else {chrom_size <- NULL }
    if (NCOL(anno_dt) == 5) {
        anno_dt <- anno_dt %>%
            data.table::setnames(c("chr", "start", "end", "strand", "id"))
    }else{
        anno_dt <- anno_dt %>%
            data.table::setnames(c("chr","start","end","strand","id","name"))
    }
    anno_dt <- anno_dt %>% subset(!(chr %in% chr_discarded)) %>%
        data.table::setkey(chr, start, end) %>% GRanges()
    if (is_anno_region) {
        # Create genomic region
        anno_dt <- create_anno_region(anno = anno_dt,chrom_size = chrom_size,
            is_centre = is_centre, is_window = is_window, upstream = upstream,
            downstream = downstream)
    }

    return(anno_dt)
}


#' @name create_region_object
#' @rdname create_region_object
#' @aliases genomic_region region_object met_region create_region
#'   create_region_obj
#'
#' @title Create genomic region data
#'
#' @description \code{create_region_object} creates genomic regions (e.g. forms
#'   methylation regions data) using as input methylation and annotation data
#'   with genomic regions of interest.
#'
#' @param met_dt A \code{GRanges} object with methylation data, whose format
#'   should be similar to \code{\link{read_met}} function.
#' @param anno_dt A \code{GRanges} object with annotation data, whose format
#'   should be similar to  \code{\link{read_anno}}.
#' @param cov Integer defining the minimum coverage of CpGs that each region
#'   must contain.
#' @param sd_thresh Optional numeric defining the minimum standard deviation of
#'   the methylation change in a region. This is used to filter regions with no
#'   methylation variability.
#' @param ignore_strand Logical, whether or not to ignore strand information.
#' @param filter_empty_region Logical, whether to discard genomic regions that
#'   have no CpG coverage or do not pass filtering options.
#' @param fmin Minimum range value for location scaling. Under this version, it
#'   should be left to its default value -1.
#' @param fmax Maximum range value for location scaling. Under this version, it
#'   should be left to its default value 1.
#'
#' @return A \code{list} object containing the two elements: \itemize{ \item{
#'   \code{met}: A list containing methylation region data, where each entry in
#'   the list is an \eqn{L_{i} X D} dimensional matrix, where \eqn{L_{i}}
#'   denotes the number of CpGs found in region \code{i}. The columns contain
#'   the following information: \enumerate{ \item{ 1st column: Contains the
#'   locations of CpGs relative to centre. Note that the actual locations are
#'   scaled to the (fmin, fmax) region. } \item{ 2nd column: If "bulk" data
#'   (i.e. binomial) it contains the total number of reads at each CpG location,
#'   otherwise the methylation level.} \item{ 3rd column: If "bulk" data, the
#'   methylated reads at each CpG location, otherwise this D = 2 and this column
#'   is absent.} }. Rownames of each matrix contain the actual CpG genomic
#'   coordinates as <chr>:<location>. } \item{ \code{anno}: The annotation
#'   object.} } Note: The lengths of \code{met} and \code{anno} should match.
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @examples
#' \dontrun{
#' # Download the files and change the working directory to that location
#' met_dt <- read_met("name_of_met_file")
#' anno_dt <- read_anno("name_of_anno_file")
#'
#' obj <- create_region_object(met_dt, anno_dt)
#'
#' # Extract methylation regions
#' met <- obj$met
#' }
#'
#' @seealso \code{\link{read_met}}, \code{\link{read_anno}}
#'
#' @importFrom methods is
#' @export
create_region_object <- function(met_dt, anno_dt, cov = 5, sd_thresh = 1e-1,
                                 ignore_strand = TRUE,
                                 filter_empty_region = TRUE,
                                 fmin = -1, fmax = 1){
    message("Creating methylation regions ...")
    assertthat::assert_that(methods::is(met_dt, "GRanges"))
    assertthat::assert_that(methods::is(anno_dt, "GRanges"))
    # Find overlaps between met and anno
    overlaps <- GenomicRanges::findOverlaps(query = anno_dt, subject = met_dt,
                                            ignore.strand = ignore_strand)
    if (length(overlaps) < 2) { stop("Low overlap between anno and met data!")}

    # Convert data in vector format for efficiency
    query_hits <- S4Vectors::queryHits(overlaps)
    subj_hits  <- S4Vectors::subjectHits(overlaps)
    gen_ind    <- unique(query_hits) # Indices of genomic locations
    centre     <- anno_dt$centre     # Central locations
    id         <- anno_dt$id         # (Ensembl) IDs
    chrom      <- as.character(anno_dt@seqnames) # Chrom info
    strand     <- as.character(GenomicRanges::strand(anno_dt))
    cpg_loc    <- GenomicRanges::ranges(met_dt)@start  # CpG locations
    met        <- met_dt$met         # Methylated read
    # Number of columns for each matrix
    if (is.null(met_dt$total)) { D <- 2 } else {total <- met_dt$total; D <- 3 }

    # Extract upstream and downstream lengths
    N <- NROW(anno_dt)
    up_anno <- vector(mode = "integer", N)    # Start location in chromosome
    down_anno <- vector(mode = "integer", N)  # End location in chromosome
    anno_start <- GenomicRanges::ranges(anno_dt)@start # Start info
    anno_end <- anno_start + GenomicRanges::ranges(anno_dt)@width - 1 # End info
    for (i in 1:N) {
        # Depending on the strand we change regions up or downstream of centre
        if (identical(strand[i], "-")) {
            up_anno[i] <- centre[i] - anno_end[i]
            down_anno[i] <- abs(anno_start[i] - centre[i])
        }else {
            up_anno[i] <- anno_start[i] - centre[i]
            down_anno[i] <- anno_end[i] - centre[i]
        }
    }
    rm(N, anno_start, anno_end)

    # # Extract upstream and downstream lengths in bps
    # width <- GenomicRanges::ranges(anno_dt)@width[1]
    # if (identical(strand[1], "-")) {
    #     downstream <- abs(GenomicRanges::ranges(anno_dt)@start[1] - centre[1])
    #     upstream   <- downstream - width + 1
    # }else{
    #     upstream   <- GenomicRanges::ranges(anno_dt)@start[1] - centre[1]
    #     downstream <- width + upstream - 1
    # }

    # Initialize variables -----------------------------------------
    met_region <- list()
    # Initilize list to NA
    for (j in 1:length(id)) { met_region[[id[j]]] <- NA }
    LABEL     <- FALSE                     # Flag variable
    reg_count <- 0                         # Region counter
    cpg_ind   <- vector(mode = "integer")  # Vector of CpG indices
    cpg_ind   <- c(cpg_ind, subj_hits[1])  # Add the first subject hit

    total_iter <- NROW(query_hits)
    for (i in 2:total_iter) {
        # If query hits is the same as the previous one
        if (query_hits[i] == query_hits[i - 1]) {
            cpg_ind <- c(cpg_ind, subj_hits[i])  # Add subject hit
            # In case we have the last region
            if (i == total_iter) { reg_count <- reg_count + 1; LABEL <- TRUE }
        }else {reg_count <- reg_count + 1; LABEL <- TRUE } # Increase counter

        if (LABEL) {
            # Central location for region 'reg_count'
            idx <- id[gen_ind[reg_count]]
            # Only keep regions that have more than 'n' CpGs
            if (length(cpg_ind) > cov) {
                # If sd of the methylation level is above threshold
                if (D == 2) { obs_var <- stats::sd(met[cpg_ind])
                }else {obs_var <- stats::sd(met[cpg_ind]/total[cpg_ind]) }
                if (obs_var > sd_thresh) {
                    # Locations of CpGs in the genome
                    region <- cpg_loc[cpg_ind]
                    # Middle location for region 'reg_count'
                    middle <- centre[gen_ind[reg_count]]
                    # Extract chromosome information
                    chrom_info<- chrom[gen_ind[reg_count]]
                    # Extract strand information, i.e. direction
                    strand_direction <- strand[gen_ind[reg_count]]
                    # Extract upstream information
                    upstream <- up_anno[gen_ind[reg_count]]
                    # Extract downstream information
                    downstream <- down_anno[gen_ind[reg_count]]
                    # Shift CpG locations relative to TSS
                    center_data  <- .do_centre_loc(region = region,
                        centre = middle, strand_direction = strand_direction)
                    # In "-" strand the order of the locations should change
                    Order <- base::order(center_data)
                    met_region[[idx]] <- matrix(0, length(cpg_ind), D)
                    # Store actual genomic coordinates of CpGs
                    rownames(met_region[[idx]]) <- paste(chrom_info,
                                                    region[Order], sep = ":")
                    # Store normalized locations of CpGs
                    met_region[[idx]][, 1] <- round(.minmax_scaling(
                        data = center_data[Order], xmin = upstream,
                        xmax = downstream, fmin = fmin, fmax = fmax), 4)
                    # Store reads in the corresponding locations
                    if (D == 2) { met_region[[idx]][, 2] <- met[cpg_ind][Order]
                    }else{
                        # Store total reads in the corresponding locations
                        met_region[[idx]][, 2] <- total[cpg_ind][Order]
                        # Store methylated reads in the corresponding locations
                        met_region[[idx]][, 3] <- met[cpg_ind][Order]
                    }
                }
            }
            LABEL   <- FALSE
            cpg_ind <- vector(mode = "integer")
            cpg_ind <- c(cpg_ind, subj_hits[i])
        }
    }
    # Keep only covered genomic regions
    if (filter_empty_region) {
        cov_ind <- which(!is.na(met_region))
        met_region <- met_region[cov_ind]
        anno_dt <- anno_dt[cov_ind, ]
    }
    return(structure(list(met = met_region, anno = anno_dt),
                     class = "region_object"))
}


#' @title Read expression data file
#'
#' @description \code{read_expr} reads a file containing expression data. It
#'   should contain two columns: "id", "expr" and by default columns are
#'   considered in tab delimited format.
#'
#' @param file File name
#' @param log2_transf Logical, whether to log2 transform the expression data.
#' @param delimiter Delimiter format the columns are splitted. Default is tab.
#'
#' @return A \code{\link[data.table]{data.table}} object.
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @seealso \code{\link{read_met}}, \code{\link{read_anno}}
#'
#' @examples
#' # Obtain the path to files
#' file <- system.file("extdata", "dummy_expr.bed", package = "BPRMeth")
#' expr_dt <- read_expr(file)
#'
#' # Extract feature id
#' expr_ids <- expr_dt$id
#'
#' @export
read_expr <- function(file, log2_transf = FALSE, delimiter = "\t"){
    expr_dt <- data.table::fread(input = file, sep = delimiter,
                                 stringsAsFactors = FALSE,
                                 col.names = c("id", "expr"))
    if (log2_transf) { expr_dt$expr <- log2(expr_dt$expr + 0.1) }
    return(expr_dt)
}


#' @title Read genome chromosome sizes file
#'
#' @description \code{read_chrom_size} reads a file containing genome chromosome
#'   sizes using the \code{\link[data.table]{fread}} function.
#'
#' @param file File name
#' @param delimiter Delimiter format the columns are splitted. Default is tab.
#'
#' @return A \code{\link[data.table]{data.table}} object.
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @seealso \code{\link{read_met}}, \code{\link{read_anno}}
#'
#' @examples
#' chr_file <- system.file("extdata", "hg19.chr.sizes", package = "BPRMeth")
#' chr_data <- read_chrom_size(chr_file)
#'
#' # Extract the size of chr1
#' chr_data[1]
#'
#' @export
read_chrom_size <- function(file, delimiter = "\t"){
    chr_size <- data.table::fread(input = file, sep = delimiter,
                                  stringsAsFactors = TRUE,
                                  col.names = c("chr", "size"))
    return(chr_size)
}


#' @title Create annotation regions
#'
#' @description \code{create_anno_region} creates annotation regions from
#'   annotation data, using the central point of the annotation features as
#'   ground truth labels we create genomic regions \code{N} bp upstream and
#'   \code{M} bp downstream of central location.
#'
#' @param anno A \code{GRanges} object containing the annotation data, this
#'   normally would be the output from \code{\link{read_anno}} function.
#' @param chrom_size Object containing genome chromosome sizes, normally would
#'   be the output of \code{\link{read_chrom_size}} function.
#' @inheritParams read_anno
#' @return A \code{GRanges} object containing the genomic regions.
#'
#'   The GRanges object contains two or three additional metadata column:
#'   \itemize{\item{\code{id}: Genomic region id.} \item{\code{centre}: Central
#'   location of each genomic region.} \item{\code{name}: (Optional) Genomic
#'   region name.} } This column can be accessed as follows:
#'   \code{granges_object$tss}
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @seealso \code{\link{create_region_object}}, \code{\link{read_anno}}
#'
#' @examples
#' # Obtain the path to files
#' file <- system.file("extdata", "dummy_anno.bed", package = "BPRMeth")
#' anno_dt <- read_anno(file, is_anno_region = FALSE)
#' # Create genomic region
#' gen_region <- create_anno_region(anno_dt)
#' # Extract ID
#' id <- gen_region$id
#'
#' @importFrom methods is
#' @export
create_anno_region <- function(anno, chrom_size = NULL, is_centre = FALSE,
                               is_window = TRUE, upstream = -5000,
                               downstream = 5000){
    assertthat::assert_that(methods::is(anno, "GRanges"))
    if (!is_centre & !is_window) {
        stop("One of 'is_centre' or 'is_window' must be set to TRUE.")
    }
    N <- NROW(anno)  # Number of features
    if (upstream > 0 ) { upstream <- -upstream }
    # Create empty vectors
    centre <- vector(mode = "integer", N)  # Centre location
    up     <- vector(mode = "integer", N)  # Start location in chromosome
    down   <- vector(mode = "integer", N)  # End location in chromosome
    chrom  <- as.character(anno@seqnames)                   # Chrom info
    strand <- as.character(GenomicRanges::strand(anno))     # Strand info
    start  <- GenomicRanges::ranges(anno)@start             # Start info
    end    <- start + GenomicRanges::ranges(anno)@width - 1 # End info

    for (i in 1:N) {
        # If we consider the centre of the region as centre or the beginning
        if (is_centre) {
            centre[i] <- floor((start[i] + end[i]) / 2)
        }else{
            # Depending on the strand we change regions up or downstream of centre
            if (identical(strand[i], "-")) { centre[i] <- end[i]
            } else{centre[i] <- start[i] }
        }
        if (!is_window) {
            # Fixed issue for negative lengths of IRanges
            up[i] <- start[i]
            down[i] <- end[i]
        }else{
            # Depending on the strand we change regions up or downstream of centre
            if (identical(strand[i], "-")) {
                # Set downstream bp promoter region
                up[i] <- max(0, centre[i] - downstream)
                # Set upstream bp promoter region
                if (is.null(chrom_size)) {down[i] <- centre[i] - upstream
                }else {down[i] <- min(chrom_size[chrom_size$chr == chrom[i]]$size,
                                      centre[i] - upstream) }
            }else {
                # Set upstream bp promoter region
                up[i] <- max(0, centre[i] + upstream)
                # Set downstream bp promoter region
                if (is.null(chrom_size)) { down[i] <- centre[i] + downstream
                }else {down[i] <- min(chrom_size[chrom_size$chr == chrom[i]]$size,
                                      centre[i] + downstream) }
            }
        }
    }

    # Create GRanges object
    if (is.null(anno$name)) {
        genomic_region <- GenomicRanges::GRanges(seqnames = chrom,
            ranges = IRanges::IRanges(start = up, end = down),
            strand = strand, id = anno$id, centre = centre)
    }else {
        genomic_region <- GenomicRanges::GRanges(seqnames = chrom,
            ranges = IRanges::IRanges(start = up, end = down),
            strand = strand, id = anno$id, centre = centre, name = anno$name)
    }
    return(genomic_region)
}


#' @title Partition bulk methylation dataset to training and test set
#'
#' @param dt_obj BPRMeth data `region_object` object
#' @param cpg_train_prcg Fraction of CpGs in each genomic region to keep for
#'   training set.
#'
#' @return The BPRMeth data `region_object` object with the following changes.
#'   The `met` element will now contain only the `training` data. An additional
#'   element called `met_test` will store the data that will be used during
#'   testing to evaluate the imputation performance. These data will not be seen
#'   from BPRMeth during inference.
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @examples
#' # Partition the synthetic data from BPRMeth package
#' dt <- partition_bulk_dataset(encode_met)
#'
#' @seealso \code{\link{create_region_object}}, \code{\link{read_met}},
#'   \code{\link{impute_bulk_met}}
#'
#' @export
#'
partition_bulk_dataset <- function(dt_obj, cpg_train_prcg = 0.5){
    assertthat::assert_that(cpg_train_prcg >= 0 & cpg_train_prcg <= 1)
    # If `met_test` element already exists, stop
    if (!is.null(dt_obj$met_test)) { stop("Stopping. Test data already exist!")}
    train = test <- dt_obj$met
    N <- length(dt_obj$met)       # Number of cgenomic regions
    for (n in seq_len(N)) {     # Iterate over region
        # Get fraction of CpGs covered
        pivot <- cpg_train_prcg * NROW(dt_obj$met[[n]])
        idx <- sort(sample(NROW(dt_obj$met[[n]]), round(pivot)))
        train[[n]] <- dt_obj$met[[n]][idx,,drop = FALSE]
        test[[n]] <- dt_obj$met[[n]][-idx,,drop = FALSE]
    }
    # Set the training data as the `met` element
    dt_obj$met <- train
    # Set the test data as the `met_test` element
    dt_obj$met_test <- test
    return(dt_obj)
}


#' @title Impute/predict bulk methylation states
#'
#' @description Make predictions of missing methylation states, i.e. perfrom
#'   imputation using BPRmeth This requires keepin a subset of data as a held
#'   out test set during BPRMeth inference or providing a different file that
#'   contains chromosome and CpG locations.
#'
#' @param obj Output of BPRMeth inference object.
#' @param anno A \code{GRanges} object with annotation data, whose format
#'   should be similar to  \code{\link{read_anno}}.
#' @param test_data Test data to evaluate performance.
#' @param return_test Whether or not to return a list with the predictions.
#'
#' @return A list containing two vectors, the true methylation state and the
#'   predicted/imputed methylation states.
#'
#' @author C.A.Kapourani \email{C.A.Kapourani@@ed.ac.uk}
#'
#' @examples
#' # Extract synthetic data
#' dt <- encode_met
#'
#' # Partition to train and test set
#' dt <- partition_bulk_dataset(dt)
#'
#' # Create basis object
#' basis_obj <- create_rbf_object(M = 3)
#'
#' # Run BPRMeth
#' fit <- infer_profiles_mle(X = dt$met, model = "binomial",
#'    basis = basis_obj, is_parallel = FALSE, opt_itnmax = 10)
#'
#' # Perform imputation
#' imputation_obj <- impute_bulk_met(obj = fit, anno = dt$anno,
#'                                   test_data = dt$met_test)
#'
#' @seealso \code{\link{partition_bulk_dataset}}
#'
#' @export
impute_bulk_met <- function(obj, anno, test_data = NULL, return_test = FALSE) {
    # Either test data or test file should be supplied
    assertthat::assert_that(!is.null(test_data))
    N         <- length(test_data)              # Number of genomic regions
    test_pred <- test_data                      # Copy test data
    act_obs   <- vector(mode = "list", length = N) # Actual CpG states
    pred_obs  <- vector(mode = "list", length = N) # Predicted CpG states
    # Iterate over each genomic region
    for (n in seq_len(N)) {
        # Predict expression
        pred <- eval_probit_function(obj = obj$basis,
                                     obs = test_data[[n]][, 1], w = obj$W[n,])
        if (NCOL(test_pred[[n]]) == 2) {
            test_pred[[n]][,2] <- pred
            # Actual CpG states
            act_obs[[n]]  <- test_data[[n]][, 2]
            # Predicted CpG states (i.e. function evaluations)
            pred_obs[[n]] <- test_pred[[n]][, 2]
        } else {
            out <- cbind(test_pred[[n]][,1], pred)
            test_pred[[n]] <- out
            # Actual CpG states
            act_obs[[n]]  <- test_data[[n]][, 3] / test_data[[n]][, 2]
            # Predicted CpG states (i.e. function evaluations)
            pred_obs[[n]] <- test_pred[[n]][, 2]
        }
    }
    if (return_test) {
        return(list(test_pred = test_pred, act_obs = do.call("c", act_obs),
                    pred_obs = do.call("c", pred_obs)))
    }else{
        return(list(act_obs = do.call("c", act_obs),
                    pred_obs = do.call("c", pred_obs)))
    }
}

Try the BPRMeth package in your browser

Any scripts or data that you put into this service are public.

BPRMeth documentation built on Nov. 8, 2020, 5:54 p.m.