Nothing
#' A class to manage metagene analysis.
#'
#' This class will allow to load, convert and normalize alignments and regions
#' files/data. Once the data is ready, the user can then chose to produce
#' metagene plots on the data (or a subset of the data).
#'
#' @section Constructor:
#' \describe{
#' \item{}{\code{mg <- metagene$new(regions, bam_files, padding_size = 0,
#' cores = SerialParam(), verbose = FALSE,
#' force_seqlevels = FALSE, paired_end = FALSE,
#' assay = 'chipseq'))}}
#' \item{regions}{Either a \code{vector} of BED, narrowPeak or broadPeak
#' filenames, a \code{GRanges} object or a
#' \code{GRangesList} object.}
#' \item{bam_files}{A \code{vector} of BAM filenames. The BAM files must be
#' indexed. i.e.: if a file is named file.bam, there must
#' be a file named file.bam.bai or file.bai in the same
#' directory.}
#' \item{padding_size}{The regions will be extended on each side by the
#' value of this parameter. The padding_size must be a
#' non-negative integer. Default = 0.}
#' \item{cores}{The number of cores available to parallelize the analysis.
#' Either a positive integer or a \code{BiocParallelParam}.
#' Default: \code{SerialParam()}.}
#' \item{verbose}{Print progression of the analysis. A logical constant.
#' Default: \code{FALSE}.}
#' \item{force_seqlevels}{If \code{TRUE}, Remove regions that are not found
#' in bam file header. Default: \code{FALSE}. TRUE and FALSE
#' respectively correspond to pruning.mode = "coarse"
#' and "error" in ?seqinfo.}
#' \item{paired_end}{If \code{TRUE}, metagene will deal with paired-ended
#' data. If \code{FALSE}, single-ended data are expected.
#' Default: \code{FALSE}}
#' \item{assay}{\code{'chipseq'} or \code{'rnaseq'}, the two available
#' options. Default: \code{'chipseq'}}
#' }
#'
#' \code{metagene$new} returns a \code{metagene} object that contains the
#' coverages for every BAM files in the regions from the \code{regions}
#' param.
#'
#' @return
#' \code{metagene$new} returns a \code{metagene} object which contains the
#' normalized coverage values for every regions and for every BAM files.
#'
#' @section Methods:
#' \describe{
#' \item{}{\code{mg$plot(region_names = NULL, design_names = NULL,
#' title = NULL, x_label = NULL)}}
#' \item{region_names}{The names of the regions to extract. If \code{NULL},
#' all the regions are returned. Default: \code{NULL}.}
#' \item{design_names}{The names of the experiments to extract. If a design
#' was added to the \code{metagene} object, \code{design_names}
#' correspond to the column names in the design, otherwise
#' \code{design_names} corresponds to the BAM name or the BAM
#' filename. If \code{NULL}, all the experiments are
#' returned. Default: \code{NULL}.}
#' \item{title}{A title to add to the graph. If \code{NULL}, will be
#' automatically created. Default: NULL}
#' \item{x_label}{X-axis label to add to the metagene plot. If \code{NULL},
#' metagene will use generic label. Default: \code{NULL}.}
#' }
#' \describe{
#' \item{}{\code{mg$produce_table(design, bin_count, noise_removal,
#' normalization, flip_regions, bin_size = NULL}}
#' \item{design}{A \code{data.frame} that describe to experiment to plot.
#' see \code{plot} function for more details. \code{NA} can
#' be used keep previous design value. Default: \code{NA}.}
#' \item{bin_count}{The number of bin to create. \code{NA} can be used to
#' keep previous bin_count value. A bin_count value of 100
#' will be used if no value is specified. Default:
#' \code{NA}.}
#' \item{noise_removal}{The algorithm to use to remove control(s). Possible
#' values are \code{NA}, \code{NULL} or "NCIS". By
#' default, value is \code{NULL}. Use \code{NA} keep
#' previous \code{noise_removal} value (i.e. if
#' \code{produce_table} was called before). See
#' Liand and Keles 2012 for the NCIS algorithm.}
#' \item{normalization}{The algorithm to use to normalize samples. Possible
#; values are \code{NA}, \code{NULL} or "RPM". By
#' default, value is \code{NULL} and no normalization
#' will be performed. Use \code{NA} keep
#' previous \code{normalization} value (i.e. if
#' \code{produce_table} was called before).}
#' \item{flip_regions}{Should regions on negative strand be flip_regions?
#' Default: \code{FALSE}.}
#' \item{bin_size}{Deprecated.}
#' }
#' \describe{
#' \item{}{\code{mg$produce_data_frame(alpha = 0.05, sample_count = 1000,
#' avoid_gaps = FALSE, gaps_threshold = 0)}}
#' \item{alpha}{The range of the estimation to be shown with the ribbon.
#' \code{1 - alpha / 2} and \code{alpha / 2} will be used.
#' Default: 0.05.}
#' \item{sample_count}{The number of draw to do in the bootstrap
#' calculation. Default: 1000.}
#' \item{avoid_gaps}{Provide the possibility to remove values = 0 and refit
#' the data_frame for this suppression.
#' Default : \code{FALSE}.}
#' \item{gaps_threshold}{It works with avoid_gaps argument. It lets to remove
#' values <= at gaps_threshold. Default : 0.}
#' }
#' \describe{
#' \item{}{mg$get_params()}
#' }
#' \describe{
#' \item{}{mg$get_design()}
#' }
#' \describe{
#' \item{}{mg$get_regions(region_names = NULL)}
#' \item{region_names}{The names of the regions to extract. If \code{NULL},
#' all the regions are returned. Default: \code{NULL}.}
#' }
#' \describe{
#' \item{}{mg$get_table = function()}
#' }
#' \describe{
#' \item{}{mg$get_matrices = function()}
#' }
#' \describe{
#' \item{}{mg$get_data_frame(region_names = NULL, design_names = NULL)}
#' \item{region_names}{The names of the regions to extract. If \code{NULL},
#' all the regions are returned. Default: \code{NULL}.}
#' \item{design_names}{The names of the experiments to extract. If a design
#' was added to the \code{metagene} object, \code{design_names}
#' correspond to the column names in the design, otherwise
#' \code{design_names} corresponds to the BAM name or the BAM
#' filename. If \code{NULL}, all the experiments are
#' returned. Default: \code{NULL}.}
#' }
#' \describe{
#' \item{}{get_plot = function()}
#' }
#' \describe{
#' \item{}{get_raw_coverages = function(filenames)}
#' \item{filenames}{The name of the file to extract raw coverages. Can be
#' the filename with the extension of the name of the bam
#' file (if a named bam files was used during the creation
#' of the metagene object). If \code{NULL}, returns the
#' coverage of every bam files. Default: \code{NULL}.}
#' }
#' \describe{
#' \item{}{get_normalized_coverages = function(filenames)}
#' \item{filenames}{The name of the file to extract normalized coverages
#' (in RPM). Can be the filename with the extension of
#' the name of the bam file (if a named bam files was used during
#' the creation of the metagene object). If \code{NULL},
#' returns the coverage every bam files. Default:
#' \code{NULL}.}
#' }
#' \describe{
#' \item{}{\code{mg$export(bam_file, region, file)}}
#' \item{bam_file}{The name of the bam file to export.}
#' \item{region}{The name of the region to export.}
#' \item{file}{The name of the ouput file.}
#' }
#' \describe{
#' \item{}{\code{mg$add_design(design = NULL, check_bam_files = FALSE)}}
#' \item{design}{A \code{data.frame} that describe to experiment to plot.
#' See \code{plot} function for more details. \code{NA} can be
#' used keep previous design value. Default: \code{NA}.}
#' \item{check_bam_files}{Force check that all the bam files from the first
#' columns of the design are present in current
#' metagene object. Default: \code{FALSE}}
#' }
#' \describe{
#' \item{}{\code{mg$unflip_regions()}}
#' }
#'
#' \describe{
#' \item{}{\code{mg$flip_regions()}}
#' }
#' \describe{
#' \item{}{\code{mg$unflip_regions()}}
#' }
#'
#' @examples
#' region <- get_demo_regions()[1]
#' bam_file <- get_demo_bam_files()[1]
#' mg <- metagene$new(regions = region, bam_files = bam_file)
#' \dontrun{
#' df <- metagene$plot()
#' }
#'
#' @importFrom R6 R6Class
#' @importFrom data.table data.table
#' @export
#' @format A metagene experiment manager
metagene <- R6Class("metagene",
public = list(
# Methods
initialize = function(regions, bam_files, padding_size = 0,
cores = SerialParam(), verbose = FALSE,
force_seqlevels = FALSE, paired_end = FALSE,
assay = 'chipseq') {
# Check params...
private$check_param(regions = regions, bam_files = bam_files,
padding_size = padding_size,
cores = cores, verbose = verbose,
force_seqlevels = force_seqlevels,
assay = assay)
# Save params
private$parallel_job <- Parallel_Job$new(cores)
private$params[["padding_size"]] <- padding_size
private$params[["verbose"]] <- verbose
if (is.null(names(bam_files))) {
new_names <- tools::file_path_sans_ext(basename(bam_files))
names(bam_files) <- new_names
}
private$params[["bin_count"]] <- NULL
private$params[["bam_files"]] <- bam_files
private$params[["force_seqlevels"]] <- force_seqlevels
private$params[["flip_regions"]] <- FALSE
private$params[["assay"]] <- tolower(assay)
private$params[["df_needs_update"]] <- TRUE
private$params[["df_arguments"]] <- ""
private$params[["table_needs_update"]] <- TRUE
# Prepare bam files
private$print_verbose("Prepare bam files...")
private$bam_handler <- Bam_Handler$new(bam_files,
cores = cores,
paired_end = paired_end)
# Prepare regions
private$print_verbose("Prepare regions...")
private$regions <- private$prepare_regions(regions)
# Parse bam files
private$print_verbose("Parse bam files...\n")
private$print_verbose("coverages...\n")
private$coverages <- private$produce_coverages()
},
get_bam_count = function(filename) {
# Parameters validation are done by Bam_Handler object
private$bam_handler$get_aligned_count(filename)
},
get_params = function() {
private$params
},
get_design = function() {
private$design
},
get_regions = function(region_names = NULL) {
if (is.null(region_names)) {
private$regions
} else {
new_names <- tools::file_path_sans_ext(
basename(region_names))
region_names <- new_names
stopifnot(all(region_names %in% names(private$regions)))
private$regions[region_names]
}
},
get_table = function() {
if (length(private$table) == 0) {
return(NULL)
}
if (private$params[['assay']] == 'chipseq'){
return(copy(private$table))
} else if (private$params[['assay']] == 'rnaseq'){
if('bin' %in% colnames(private$table)){
returnedCol <- c("region","exon","bam","design","nuc","bin",
"nuctot","exonsize","regionstartnuc",
"regionsize","value","strand")
} else {
returnedCol <- c("region","exon","bam","design","nuc",
"nuctot","exonsize","regionstartnuc",
"regionsize","value","strand")
}
return(copy(private$table[,returnedCol,with=FALSE]))
}
},
get_matrices = function() {
if (is.null(self$get_table())){
return(NULL)
}
if (private$params[['assay']] == 'chipseq') {
matrices <- list()
nbcol <- private$params[["bin_count"]]
nbrow <- vapply(self$get_regions(), length, numeric(1))
for (regions in names(self$get_regions())) {
matrices[[regions]] <- list()
for (design_name in colnames(self$get_design())[-1]) {
matrices[[regions]][[design_name]] <- list()
matrices[[regions]][[design_name]][["input"]] <-
matrix(private$table[region == regions &
design == design_name,]$value,
nrow=nbrow[regions], ncol=nbcol, byrow=TRUE)
}
}
return (matrices)
} else {
stop(paste('unsupported function for assay of type',
private$params[['assay']],
'. Only available for "chipseq" assay.'))
}
},
get_data_frame = function(region_names = NULL, design_names = NULL) {
if (nrow(private$df) == 0) {
NULL
} else if (is.null(region_names) & is.null(design_names)) {
return(copy(private$df))
} else {
if (!is.null(region_names)) {
stopifnot(is.character(region_names))
stopifnot(all(region_names %in%
unique(private$table$region)))
} else {
region_names <- names(private$regions)
}
if (!is.null(design_names)) {
stopifnot(is.character(design_names))
stopifnot(all(design_names %in%
unique(private$table$design)))
} else {
design_names <- colnames(private$design)[-1]
}
i <- (private$df$region %in% region_names &
private$df$design %in% design_names)
return(copy(private$df[i,]))
}
},
get_plot = function() {
if (is.character(private$graph)) {
NULL
} else {
private$graph
}
},
get_raw_coverages = function(filenames = NULL) {
if (is.null(filenames)) {
private$coverages
} else {
stopifnot(is.character(filenames))
stopifnot(length(filenames) > 0)
bam_names <- private$get_bam_names(filenames)
stopifnot(length(bam_names) == length(filenames))
private$coverages[bam_names]
}
},
get_normalized_coverages = function(filenames = NULL) {
normalize_coverage <- function(filename) {
count <- private$bam_handler$get_aligned_count(filename)
weight <- 1 / (count / 1000000)
coverages[[filename]] <- coverages[[filename]] * weight
}
coverages <- self$get_raw_coverages(filenames)
coverage_names <- names(coverages)
coverages <-
private$parallel_job$launch_job(data = coverage_names,
FUN = normalize_coverage)
names(coverages) <- coverage_names
coverages
},
add_design = function(design, check_bam_files = FALSE) {
private$design = private$fetch_design(design, check_bam_files)
private$params[["table_needs_update"]] <- TRUE
},
produce_table = function(design = NA, bin_count = NA, bin_size = NULL,
noise_removal = NA, normalization = NA,
flip_regions = FALSE) {
if (!is.null(bin_size)) {
warning("bin_size is now deprecated. Please use bin_count.")
bin_size <- NULL
}
design <- private$fetch_design(design)
private$check_produce_table_params(bin_count = bin_count,
bin_size = bin_size,
design = design,
noise_removal = noise_removal,
normalization = normalization,
flip_regions = flip_regions)
bin_count <- private$get_param_value(bin_count, "bin_count")
noise_removal <- private$get_param_value(noise_removal,
"noise_removal")
normalization <- private$get_param_value(normalization,
"normalization")
if(!is.null(normalization) && !is.null(noise_removal)) {
stop("NCIS background removal and RPM normalization are mutually exclusive.")
}
coverages <- private$coverages
#addition of private$params[["table_needs_update"]] comes from
#troubles in table update when adding a design with the add_design
#function and changing nothing else in produce_table parameters
if (private$table_need_update(design = design,
bin_count = bin_count,
bin_size = bin_size,
noise_removal = noise_removal,
normalization = normalization) |
private$params[["table_needs_update"]]) {
if (!is.null(normalization)) {
coverages <- private$normalize_coverages(coverages)
message('Normalization done')
}
if (private$params[['assay']] == 'rnaseq'){
# here the word 'gene' = 'region'
bam_files_names <- names(private$params[["bam_files"]])
# useful variables for caculations of
# standard data table structure (std_dt_struct)
gene_count <- length(private$regions)
gene_names <- names(private$regions)
exon_length_by_exon_by_gene <- width(private$regions)
exon_length_by_exon_by_gene_cum <- vapply(
exon_length_by_exon_by_gene, sum, numeric(1))
exon_count_by_gene <- vapply(private$regions, length,
numeric(1))
## standard data table structure for one replicat
col_gene <- rep(gene_names, times=unlist(map(
1:gene_count,
~sum(exon_length_by_exon_by_gene[[
gene_names[.x]]]))))
exon_names <- unlist(map(exon_count_by_gene, ~ 1:.x))
col_exon <- as.vector(rep(exon_names,
times=unlist(exon_length_by_exon_by_gene)))
col_exon_size <- rep(as.vector(#useful for flip function
unlist(exon_length_by_exon_by_gene)),
times=as.vector(unlist(
exon_length_by_exon_by_gene)))
gene_size <- unlist(map(1:length(self$get_regions()),
~sum(width(self$get_regions()[[.x]]))))
col_gene_size <- rep(gene_size, times = gene_size)
gene_length_cum <- c(0,
cumsum(gene_size)[-length(gene_size)])+1
col_gene_start_nuc <- rep(gene_length_cum,
times = gene_size)
col_nuc <- unlist(map(as.vector(unlist(
exon_length_by_exon_by_gene_cum)), ~ 1:.x))
exon_strand_by_exon_by_gene <- unlist(map(gene_names,
~as.vector(strand(private$regions[[.x]]))))
col_strand <- as.vector(rep(exon_strand_by_exon_by_gene,
times=unlist(exon_length_by_exon_by_gene)))
length_std_dt_struct = length(col_gene)
# the number of not empty cases in the design param
copies_count <- sum(replace(unlist(design[,-1]),
which(unlist(design[,-1]) == 2),1))
#multiplication of standard data table structure
col_gene <- rep(col_gene, copies_count)
col_exon <- rep(col_exon, copies_count)
col_nuctot <- 1:length(col_nuc)
col_nuctot <- rep(col_nuctot, copies_count)
col_nuc <- rep(col_nuc, copies_count)
col_exon_size <- rep(col_exon_size, copies_count)
col_gene_size <- rep(col_gene_size, copies_count)
col_strand <- rep(col_strand, copies_count)
col_gene_start_nuc <- rep(col_gene_start_nuc,
copies_count)
## other columns of data table
design_names <- colnames(design)[-1]
bam_names_in_design <- tools::file_path_sans_ext(
basename(
as.character(design[,1])))
bfile_names_by_design <- tools::file_path_sans_ext(
unlist(map(design_names ,
~design[which(design[,
which(colnames(
design) == .x)] > 0),1])))
col_bam <- rep(bfile_names_by_design,
each=length_std_dt_struct)
nb_bfile_by_design <- unlist(map(design_names ,
~length(which(design[,which(
colnames(design) == .x)] > 0))))
col_design <- rep(design_names,
times=(nb_bfile_by_design *
length_std_dt_struct))
## col_values
#NB : lapply(Views...) -> out of limits of view
grtot <- self$get_regions()
col_values <- list()
idx <- 1 #index for col_values list
idx_sd_loop <- 1
for(bam in bam_names_in_design) {
for (i in 1:length(grtot)){
gr <- grtot[[i]]
sq <- unique(as.character(seqnames(gr)))
val <- Views(
coverages[[bam]][[sq]],
start(gr),
end(gr))
col_values[[idx]] <- unlist(lapply(
val, as.numeric))
idx <- idx + 1
}
}
col_values <- unlist(col_values)
if (!is.null(bin_count)) {
message('produce data table : RNA-Seq binned')
col_bins <- trunc(
(col_nuc/(col_gene_size+1))*bin_count)+1
col_bins <- as.integer(col_bins)
private$table <- data.table(
region = col_gene,
exon = col_exon,
bam = col_bam,
design = col_design,
bin = col_bins,
nuc = col_nuc,
nuctot = col_nuctot,
exonsize = col_exon_size,
regionsize = col_gene_size,
regionstartnuc = col_gene_start_nuc,
value = col_values,
strand = col_strand)
} else {
message('produce data table : RNA-Seq')
private$table <- data.table(
region = col_gene,
exon = col_exon,
bam = col_bam,
design = col_design,
nuc = col_nuc,
nuctot = col_nuctot,
exonsize = col_exon_size,
regionsize = col_gene_size,
regionstartnuc = col_gene_start_nuc,
value = col_values,
strand = col_strand)
}
} else { # chipseq
if (!is.null(noise_removal)) {
coverages <- private$remove_controls(coverages, design)
} else {
coverages <- private$merge_chip(coverages, design)
}
message('produce data table : ChIP-Seq')
if (is.null(bin_count)) {
bin_count = 100
}
region_length <- vapply(self$get_regions(), length,
numeric(1))
col_regions <- names(self$get_regions()) %>%
map(~ rep(.x, length(coverages) * bin_count *
region_length[.x])) %>% unlist()
col_designs <- map(region_length, ~ rep(names(coverages),
each = bin_count * .x)) %>% unlist
col_bins <- rep(1:bin_count,
length(coverages) * sum(region_length))
pairs <- expand.grid(colnames(design)[-1],
names(self$get_regions()),
stringsAsFactors = FALSE)
col_values <- map2(pairs$Var1, pairs$Var2,
~ private$get_subtable(coverages[[.x]], .y,
bin_count)) %>% unlist
#TODO : improve col_strand production
col_strand <- list()
idx <- 1
for (region_names in unique(col_regions)){
col_strand[[idx]] <- rep(rep(
as.vector(strand(private$regions)[[region_names]]),
each=bin_count),length(unique(col_designs)))
idx <- idx + 1
}
col_strand <- unlist(col_strand)
private$table <- data.table(region = col_regions,
design = col_designs,
bin = col_bins,
value = col_values,
strand = col_strand)
}
private$params[["bin_size"]] <- bin_size
private$params[["bin_count"]] <- bin_count
private$params[["noise_removal"]] <- noise_removal
private$params[["normalization"]] <- normalization
private$params[["df_needs_update"]] <- TRUE
private$params[["table_needs_update"]] <- FALSE
private$design <- design
} else {
message(paste('WARNING : table is unchanged regarding',
'design, bin_count, noise_removal, normalization.'))
}
if (flip_regions == TRUE) {
self$flip_regions()
} else {
self$unflip_regions()
}
invisible(self)
},
produce_data_frame = function(alpha = 0.05, sample_count = 1000,
avoid_gaps = FALSE,
bam_name = NULL,
gaps_threshold = 0) {
#arguments checking
stopifnot(is.numeric(alpha))
stopifnot(is.numeric(sample_count))
stopifnot(alpha >= 0 & alpha <= 1)
stopifnot(sample_count > 0)
sample_count <- as.integer(sample_count)
stopifnot(is.logical(avoid_gaps))
if (!is.null(bam_name)){
stopifnot(is.character(bam_name))
bam_name <- tools::file_path_sans_ext(basename(bam_name))
bam_names <- tools::file_path_sans_ext(basename(
private$params[["bam_files"]]))
if (!bam_name %in% bam_names){
stop(paste("bam_name argument is no one of bam_names",
"provided to the metagene object"))
}
}
stopifnot(gaps_threshold >= 0)
#add checks
list_of_arguments <- paste(alpha,
sample_count,
avoid_gaps,
bam_name,
gaps_threshold)
if (private$params[["df_arguments"]] != list_of_arguments){
private$params[["df_arguments"]] <- list_of_arguments
private$params[["df_needs_update"]] <- TRUE
}
if (private$params[['df_needs_update']]){
# 1. Get the correctly formatted table
if (is.null(self$get_table())) {
self$produce_table()
}
# 2. Produce the data.frame
private$df <- data.table::copy(self$get_table())
if (private$params[['assay']] == 'chipseq') {
message('produce data frame : ChIP-Seq')
private$data_frame_need_update(alpha, sample_count)
sample_size <- self$get_table()[bin == 1,][
,.N, by = .(region, design)][
, .(min(N))]
sample_size <- as.integer(sample_size)
out_cols <- c("value", "qinf", "qsup")
bootstrap <- function(df) {
sampling <- matrix(df$value[sample(seq_along(
df$value),
sample_size * sample_count,
replace = TRUE)],
ncol = sample_size)
values <- colMeans(sampling)
res <- quantile(values, c(alpha/2, 1-(alpha/2)))
res <- c(mean(df$value), res)
names(res) <- out_cols
as.list(res)
}
private$df <- private$df[,
c(out_cols) := bootstrap(.SD),
by = .(region, design, bin)]
private$df <- unique(private$df)
} else if (private$params[['assay']] == 'rnaseq'
& !('bin' %in% colnames(private$df))){
message('produce data frame : RNA-Seq')
private$data_frame_need_update(alpha, sample_count)
sample_size <- self$get_table()[nuc == 1,][
,.N, by = .(region, design)][
, .(min(N))]
sample_size <- as.integer(sample_size)
out_cols <- c("value", "qinf", "qsup")
bootstrap <- function(df) {
sampling <- matrix(df$value[sample(
seq_along(df$value),
sample_size * sample_count,
replace = TRUE)],
ncol = sample_size)
values <- colMeans(sampling)
res <- quantile(values, c(alpha/2, 1-(alpha/2)))
res <- c(mean(df$value), res)
names(res) <- out_cols
as.list(res)
}
if(avoid_gaps){
if (!is.null(bam_name)){
private$data_frame_avoid_gaps_updates(bam_name,
gaps_threshold)
} else {
private$data_frame_avoid_gaps_updates(
private$df$bam[1],
gaps_threshold)
}
}
###################
if(all(rowSums(self$get_design()[,-1, drop=FALSE]) == 1) &
all(colSums(self$get_design()[,-1, drop=FALSE]) == 1)){
private$df$qinf <- private$df$value
private$df$qsup <- private$df$value
private$df$group <- paste0(private$df$design,'_',private$df$region)
private$df <- data.frame(private$df)
#return(private$df)
} else {
###################
private$df <- private$df[,
c(out_cols) := bootstrap(.SD),
by = .(region, design, nuctot)]
}
#filter to avoid duplicated ligne (to reduce df dims)
#it does not matter concerning the plot. Plot works !
private$df <- private$df[which(!duplicated(paste(
private$df$region,
private$df$design,
private$df$nuctot))),]
} else if (private$params[['assay']] == 'rnaseq'
& ('bin' %in% colnames(private$df))){
message('produce data frame : RNA-Seq binned')
private$data_frame_need_update(alpha, sample_count)
sample_size <- self$get_table()[bin == 1,][
,.N, by = .(design)][
, .(min(N))]
sample_size <- as.integer(sample_size)
out_cols <- c("value", "qinf", "qsup")
bootstrap <- function(df) {
sampling <- matrix(df$value[sample(
seq_along(df$value),
sample_size * sample_count,
replace = TRUE)],
ncol = sample_size)
values <- colMeans(sampling)
res <- quantile(values, c(alpha/2, 1-(alpha/2)))
res <- c(mean(df$value), res)
names(res) <- out_cols
as.list(res)
}
if(avoid_gaps){
message('Avoiding gaps')
if (!is.null(bam_name)){
private$data_frame_avoid_gaps_updates(bam_name,
gaps_threshold)
} else {
private$data_frame_avoid_gaps_updates(
private$df$bam[1],
gaps_threshold)
}
}
private$df <- private$df[,
c(out_cols) := bootstrap(.SD),
by = .(design, bin)]
# checked : ok !
private$df <- private$df[which(!duplicated(paste(
private$df$design,
private$df$bin))),]
}
private$df <- as.data.frame(private$df)
#private$df$design <- as.factor(private$df$design)
private$df$group <- paste(private$df$design,
private$df$region,
sep="_")
private$df$group <- as.factor(private$df$group)
private$params[["df_needs_update"]] <- FALSE
invisible(self)
}
},
plot = function(region_names = NULL, design_names = NULL, title = NULL,
x_label = NULL) {
# 1. Get the correctly formatted table
if (length(private$table) == 0) {
self$produce_table()
}
# 2. Produce the data frame
if (nrow(private$df) == 0) {
self$produce_data_frame()
}
df <- self$get_data_frame(region_names = region_names,
design_names = design_names)
# 3. Produce the graph
if (is.null(title)) {
title <- paste(unique(private$df[["group"]]), collapse=" vs ")
}
p <- private$plot_graphic(df = df, title = title,
x_label = x_label)
print(p)
private$graph <- p
invisible(self)
},
export = function(bam_file, region, file) {
region <- tools::file_path_sans_ext(basename(region))
region <- private$regions[[region]]
param <- Rsamtools::ScanBamParam(which = region)
alignments <- GenomicAlignments::readGAlignments(bam_file,
param = param)
weight <- 1 - private$bam_handler$get_rpm_coefficient(bam_file)
seqlevels(alignments) <- seqlevels(region)
# TODO: don't use the weight param of coverage
coverage <- GenomicAlignments::coverage(alignments, weight=weight)
rtracklayer::export(coverage, file, "BED")
invisible(coverage)
},
flip_regions = function() {
if (private$params[["flip_regions"]] == FALSE) {
private$flip_table()
private$params[["flip_regions"]] <- TRUE
}
invisible(self)
},
unflip_regions = function() {
if (private$params[["flip_regions"]] == TRUE) {
private$flip_table()
private$params[["flip_regions"]] <- FALSE
}
invisible(self)
}
),
private = list(
params = list(),
regions = GRangesList(),
table = data.table(),
design = data.frame(),
coverages = list(),
df = data.frame(),
graph = "",
bam_handler = "",
parallel_job = "",
check_param = function(regions, bam_files, padding_size,
cores, verbose, force_seqlevels, assay) {
# Check parameters validity
if (!is.character(assay)) {
stop("verbose must be a character value")
}
assayTypeAuthorized <- c('chipseq', 'rnaseq')
if (!(tolower(assay) %in% assayTypeAuthorized)) {
stop("assay values must be one of 'chipseq' or 'rnaseq'")
}
if (!is.logical(verbose)) {
stop("verbose must be a logicial value (TRUE or FALSE)")
}
if (!is.logical(force_seqlevels)) {
stop(paste("force_seqlevels must be a logicial ",
"value (TRUE or FALSE)",sep=""))
}
if (!(is.numeric(padding_size) || is.integer(padding_size)) ||
padding_size < 0 || as.integer(padding_size) != padding_size) {
stop("padding_size must be a non-negative integer")
}
isBiocParallel = is(cores, "BiocParallelParam")
isInteger = ((is.numeric(cores) || is.integer(cores)) &&
cores > 0 &&as.integer(cores) == cores)
if (!isBiocParallel && !isInteger) {
stop(paste0("cores must be a positive numeric or ",
"BiocParallelParam instance"))
}
if (!is.vector(bam_files, "character")) {
stop("bam_files must be a vector of BAM filenames")
}
if (!all(sapply(bam_files, file.exists))) {
stop("At least one BAM file does not exist")
}
if (!is(regions, "GRangesList") && !is.character(regions)
&& !is(regions, "GRanges") && !is.list(regions)) {
stop(paste0("regions must be either a vector of BED ",
"filenames, a GRanges object or a GrangesList object"))
}
# Validation specific to regions as a vector
if (is.character(regions) && !all(sapply(regions, file.exists))) {
stop("regions must be a list of existing BED files")
}
},
check_design = function(design, check_bam_files = FALSE) {
stopifnot(is.logical(check_bam_files))
if(!is.null(design) && !is.data.frame(design) &&
!identical(design, NA)) {
stop("design must be a data.frame object, NULL or NA")
}
if (is.data.frame(design)) {
if(ncol(design) < 2) {
stop("design must have at least 2 columns")
}
if (!(is.character(design[,1]) || is.factor(design[,1]))) {
stop("The first column of design must be BAM filenames")
}
if (!all(apply(design[, -1, drop=FALSE], MARGIN=2,
is.numeric))) {
stop(paste0("All design column, except the first one,",
" must be in numeric format"))
}
if (check_bam_files == TRUE) {
samples <- as.character(design[,1])
if (!all(private$check_bam_files(samples))) {
stop("Design contains bam files absent from metagene.")
}
}
}
},
check_produce_table_params = function(bin_count, bin_size, design,
noise_removal, normalization,
flip_regions) {
# At least one file must be used in the design
if (!identical(design, NA)) {
if (!is.null(design)) {
if (sum(rowSums(design[ , -1, drop=FALSE]) > 0) == 0) {
stop(paste("At least one BAM file must be ",
"used in the design",sep=""))
}
}
}
# Test only BAM file used in the design
if (!identical(design, NA)) {
if(!is.null(design) &&
!all(apply(design[rowSums(design[, -1, drop=FALSE]) > 0, 1,
drop=FALSE], MARGIN = 2,
FUN=private$check_bam_files))) {
stop("At least one BAM file does not exist")
}
}
# bin_count should be a numeric value without digits
if (!identical(bin_count, NA)) {
if (!is.null(bin_count)) {
if (!is.numeric(bin_count) || bin_count < 0 ||
as.integer(bin_count) != bin_count) {
stop("bin_count must be NULL or a positive integer")
}
}
}
if (!identical(noise_removal, NA)) {
if (!is.null(noise_removal)) {
if (!noise_removal %in% c("NCIS", "RPM")) {
msg <- 'noise_removal must be NA, NULL, "NCIS" or '
msg <- paste0(msg, '"RPM".')
stop(msg)
}
}
}
if (!identical(normalization, NA)) {
if (!is.null(normalization)) {
if (!normalization == "RPM") {
msg <- "normalization must be NA, NULL or \"RPM\"."
stop(msg)
}
}
}
if (!is.logical(flip_regions)) {
msg <- "flip_regions must be a logical."
stop(msg)
}
},
table_need_update = function(design, bin_count, bin_size,
noise_removal, normalization) {
if (!identical(private$design, design)) {
return(TRUE)
}
if (!identical(private$params[["bin_count"]], bin_count)) {
return(TRUE)
}
if (!identical(private$params[["noise_removal"]], noise_removal)) {
return(TRUE)
}
if (!identical(private$params[["normalization"]], normalization)) {
return(TRUE)
}
return(FALSE)
},
data_frame_need_update = function(alpha = NA, sample_count = NA) {
#need_update = FALSE
# Fetch saved values
alpha = private$get_param_value(alpha, "alpha")
sample_count = private$get_param_value(sample_count,
"sample_count")
# Add default, if needed
if (is.null(alpha)) {
alpha <- 0.05
}
if (is.null(sample_count)) {
sample_count <- 1000
}
if (nrow(private$df) == 0) {
private$params[['df_needs_update']] <- TRUE
#private$params[["alpha"]] <- alpha
#private$params[["sample_count"]] <- sample_count
} else {
# Check if data frame need update
if (!identical(private$params[["alpha"]], alpha)) {
private$params[['df_needs_update']] <- TRUE
#private$params[["alpha"]] <- alpha
}
if (!identical(private$params[["sample_count"]],
sample_count)) {
private$params[['df_needs_update']] <- TRUE
#private$params[["sample_count"]] <- sample_count
}
}
#need_update
},
get_param_value = function(param_value, param_name) {
param_name <- as.character(param_name)
if (identical(param_value, NA)) {
if (! param_name %in% names(private$params)) {
param_value <- NULL
} else {
param_value <- private$params[[param_name]]
}
}
return(param_value)
},
print_verbose = function(to_print) {
if (private$params[["verbose"]]) {
cat(paste0(to_print, "\n"))
}
},
get_subtable = function(coverages, region, bcount) {
gr <- private$regions[[region]]
grl <- split(gr, GenomeInfoDb::seqnames(gr))
i <- vapply(grl, length, numeric(1)) > 0
do.call("c", lapply(grl[i], private$get_view_means,
bcount = bcount, cov = coverages))
},
get_view_means = function(gr, bcount, cov) {
chr <- unique(as.character(GenomeInfoDb::seqnames(gr)))
gr <- intoNbins(gr, bcount)
stopifnot(length(chr) == 1)
views <- Views(cov[[chr]], start(gr), end(gr))
viewMeans(views)
},
prepare_regions = function(regions) {
if (is(regions, "character")) {
names <- sapply(regions, function(x)
file_path_sans_ext(basename(x)))
import_file <- function(region) {
ext <- tolower(tools::file_ext(region))
if (ext == "narrowpeak") {
extraCols <- c(signalValue = "numeric",
pValue = "numeric", qValue = "numeric",
peak = "integer")
rtracklayer::import(region, format = "BED",
extraCols = extraCols)
} else if (ext == "broadpeak") {
extraCols <- c(signalValue = "numeric",
pValue = "numeric", qValue = "numeric")
rtracklayer::import(region, format = "BED",
extraCols = extraCols)
} else if (ext == "gtf" | ext == "gff") {
split(rtracklayer::import(region),
rtracklayer::import(region)$gene_id)
} else {
rtracklayer::import(region)
}
}
regions <- private$parallel_job$launch_job(data = regions,
FUN = import_file)
names(regions) <- names
} else if (is(regions, "GRanges")) {
regions <- GRangesList("regions" = regions)
} else if (is(regions, "list")) {
regions <- GRangesList(regions)
}
if (is.null(names(regions))) {
names(regions) <- sapply(seq_along(regions), function(x) {
paste("region", x, sep = "_")
})
}
if (private$params[['assay']] == "rnaseq"){
stopifnot(all(sum(width(GenomicRanges::reduce(private$regions)))
== sum(width(private$regions))))
}
# TODO: Check if there is a id column in the mcols of every ranges.
# If not, add one by merging seqnames, start and end.
GRangesList(lapply(regions, function(x) {
# Add padding
start(x) <- start(x) - private$params$padding_size
start(x)[start(x) < 0] <- 1
end(x) <- end(x) + private$params$padding_size
# Clean seqlevels
x <- sortSeqlevels(x)
#seqlevels(x) <- unique(as.character(seqnames(x)))
x
}))
},
produce_coverages = function() {
regions <- GenomicRanges::reduce(BiocGenerics::unlist(private$regions))
res <- private$parallel_job$launch_job(
data = private$params[["bam_files"]],
FUN = private$bam_handler$get_coverage,
regions = regions,
force_seqlevels= private$params[["force_seqlevels"]])
names(res) <- names(private$params[["bam_files"]])
lapply(res, GenomeInfoDb::sortSeqlevels)
},
plot_graphic = function(df, title, x_label) {
# Prepare x label
if (is.null(x_label)) {
if (private$params[['assay']] == "chipseq") {
x_label <- "Distance in bins"
} else if (private$params[['assay']] == "rnaseq" &
!('bin' %in% colnames(private$table))) {
x_label <- "Distance in nucleotides"
} else if (private$params[['assay']] == "rnaseq" &
('bin' %in% colnames(private$table))) {
x_label <- "Distance in bins"
}
}
# Prepare y label
y_label <- "Mean coverage"
if (is.null(private$params[["normalization"]])) {
y_label <- paste(y_label, "(raw)")
} else {
y_label <- paste(y_label, "(RPM)")
}
# Produce plot
p <- plot_metagene(df) +
ylab(y_label) +
xlab(x_label) +
ggtitle(title)
p
},
fetch_design = function(design, check_bam_files = FALSE) {
private$check_design(design = design, check_bam_files)
get_complete_design <- function() {
bam_files <- names(private$params[["bam_files"]])
design <- data.frame(bam_files = bam_files, stringsAsFactors=FALSE)
for (bam_file in names(private$coverages)) {
colname <- file_path_sans_ext(basename(bam_file))
design[[colname]] <- rep(0, length(bam_files))
i <- bam_files == bam_file
design[[colname]][i] <- 1
}
design
}
if (is.null(design)) {
return(get_complete_design())
}
if (identical(design, NA)) {
if (all(dim(private$design) == c(0, 0))) {
return(get_complete_design())
} else {
return(private$design)
}
}
design[,1] <- as.character(design[,1])
return(design)
},
remove_controls = function(coverages, design) {
results <- list()
for (design_name in colnames(design)[-1]) {
i <- design[[design_name]] == 1
j <- design[[design_name]] == 2
chip_bam_files <- as.character(design[,1][i])
chip_names <- private$get_bam_names(chip_bam_files)
input_bam_files <- as.character(design[,1][j])
input_names <- private$get_bam_names(input_bam_files)
chip_coverages <- coverages[chip_names]
chip_coverages <- Reduce("+", chip_coverages)
if (length(input_bam_files) > 0) {
noise_ratio <-
private$bam_handler$get_noise_ratio(chip_names,
input_names)
input_coverages <- coverages[input_names]
input_coverages <- noise_ratio * Reduce("+",
input_coverages)
results[design_name] <- chip_coverages - input_coverages
i <- results[[design_name]] < 0
results[[design_name]][i] <- 0
} else {
results[design_name] <- chip_coverages
}
}
results
},
normalize_coverages = function(coverages) {
bam_names <- unlist(lapply(private$bam_handler$get_bam_files()$bam,
private$bam_handler$get_bam_name))
for (bam_name in bam_names) {
#which_rows = design[[design_name]]==1
#bam_files <- as.character(design[,1][which_rows])
count <- private$bam_handler$get_aligned_count(bam_name)
#count <- sum(unlist(counts))
weight <- 1 / (count / 1000000)
coverages[[bam_name]] <- coverages[[bam_name]] * weight
}
coverages
},
merge_chip = function(coverages, design) {
result <- list()
normalization = private$params[["normalization"]]
for (design_name in colnames(design)[-1]) {
i <- design[[design_name]] == 1
bam_files <- as.character(design[,1][i])
bam_names <- private$get_bam_names(bam_files)
cov <- coverages[bam_names]
result[[design_name]] <- Reduce("+", cov)
if(!is.null(normalization) && normalization =="RPM" && sum(i) > 1) {
result[[design_name]] = result[[design_name]] / sum(i)
}
}
result
},
flip_table = function() {
if(!all(private$table[,length(levels(as.factor(strand))),
by=region][,2] == 1) &
private$params[["assay"]] == 'rnaseq'){
stop(paste('Strands of exons in one gene/region',
'must have the same sign to be flipped.'))
}
if (private$params[['assay']] == 'chipseq'){
message('ChIP-Seq flip/unflip')
i <- which(private$table$strand == '-')
private$table$bin[i] <- (self$get_params()$bin_count + 1) -
private$table$bin[i]
private$table$bin <- as.integer(private$table$bin)
private$params[["df_needs_update"]] <- TRUE
} else if (private$params[['assay']] == 'rnaseq'){
message('RNA-Seq flip/unflip')
i <- which(private$table$strand == '-')
#col_nuc
private$table$nuc[i] <- (private$table$regionsize[i] + 1) -
private$table$nuc[i]
private$table$nuc <- as.integer(private$table$nuc)
#col_nuctot
private$table$nuctot[i] <- (private$table$regionsize[i] + 1) -
private$table$nuctot[i] +
private$table$regionstartnuc[i] * 2 - 2
private$table$nuctot <- as.integer(private$table$nuctot)
#col_bin
if(!is.null(private$params[["bin_count"]])){
private$table$bin[i] <- (self$get_params()$bin_count + 1) -
private$table$bin[i]
private$table$bin <- as.integer(private$table$bin)
}
private$params[["df_needs_update"]] <- TRUE
}
},
get_bam_names = function(filenames) {
if (all(filenames %in% colnames(private$design)[-1])) {
filenames
} else {
stopifnot(private$check_bam_files(filenames))
vapply(filenames,
private$bam_handler$get_bam_name,
character(1))
}
},
check_bam_files = function(bam_files) {
all(vapply(bam_files,
function(x) {
!is.null((private$bam_handler$get_bam_name(x)))
},
logical(1)))
},
data_frame_avoid_gaps_updates = function(bam_name, gaps_threshold) {
#bootstrap not executed at this point. Don't work on design !
#how_namy_by_exon_by_design
dfdt <- data.table::copy(private$df)
nb_nuc_removed <- dfdt[value <= gaps_threshold
& bam == bam_name, length(value),
by=c('exon', 'region')]
#assignment of new exonsize
for (i in 1:length(nb_nuc_removed$V1)){
#selected = lines of the ith region and exon of nb_nuc_removed
selected <- which(
private$df$region == nb_nuc_removed$region[i] &
private$df$exon == nb_nuc_removed$exon[i])
#retrieve the exonsize value of the ith region and exon
original_exonsize <- unique(private$df$exonsize[selected])
#replace former exonsixe
new_exonsize <- original_exonsize-nb_nuc_removed$V1[i]
private$df$exonsize[selected] <- new_exonsize
}
nb_nuc_removed_by_gene <- dfdt[value <= gaps_threshold
& bam == bam_name, length(value),
by=c('region')]
#assignment of new region/genesize
for (i in 1:length(unique(nb_nuc_removed_by_gene$region))){
#selected = lines of the ith region of nb_nuc_removed
selected <- which(
private$df$region == nb_nuc_removed_by_gene$region[i])
#retrieve the regionsize value of the ith region and exon
original_regionsize <- unique(private$df$regionsize[selected])
#replace former regionsize
new_regionsize <- (original_regionsize
- nb_nuc_removed_by_gene$V1[i])
private$df$regionsize[selected] <- new_regionsize
}
### removal of zero values
## stop if all bam haven't the same amount of lines in table
stopifnot(length(unique(private$table[, .N, by=bam]$N)) == 1)
bam_line_count <- private$table[bam == bam_name, .N]
#lines_to_remove for bam_name
lines_to_remove <- which(private$df$bam == bam_name &
private$df$value <= gaps_threshold)
# %% provide the idx for the first bam
lines_to_remove <- (lines_to_remove %% bam_line_count)
#to avoid 0 if there is a x %% x = 0
lines_to_remove <- replace(lines_to_remove,
which(lines_to_remove == 0),
bam_line_count)
bam_count <- length(unique(private$df$bam))
#lines_to_remove for all bam
lines_to_remove <- unlist(map((0:(bam_count-1)),
~ lines_to_remove + bam_line_count * .x))
private$df <- private$df[-lines_to_remove,]
#reinitialization of nuctot before flip in next section to
# clear gaps in nuctot number seauence
private$df$nuctot <- rep(1:length(which(
private$df$bam == bam_name)),
times = length(unique(private$df$bam)))
#reorder the nuc and nuctot variables
if(private$params[["flip_regions"]] == TRUE){
flip_by_bam_n_region <- map2(rep(unique(private$df$bam),
each=length(unique(private$df$region))),
rep(unique(private$df$region),
times=length(unique(private$df$bam))),
~which(private$df$bam == .x & private$df$region == .y
& private$df$strand == '-'))
not_empty_idx <- which(map(flip_by_bam_n_region,
~length(.x)) > 0)
if (length(not_empty_idx) > 0){
map(flip_by_bam_n_region[not_empty_idx],
~ (private$df$nuc[.x] <- length(.x):1))
map(flip_by_bam_n_region[not_empty_idx],
~ (private$df$nuctot[.x] <-
max(private$df$nuctot[.x]):
min(private$df$nuctot[.x])))
}
unflip_by_bam_n_region <- map2(rep(unique(private$df$bam),
each=length(unique(private$df$region))),
rep(unique(private$df$region),
times=length(unique(private$df$bam))),
~which(private$df$bam == .x & private$df$region == .y
& (private$df$strand == '+' |
private$df$strand == '*')))
not_empty_idx <- which(map(unflip_by_bam_n_region,
~length(.x)) > 0)
if (length(not_empty_idx) > 0){
map(unflip_by_bam_n_region[not_empty_idx],
~ (private$df$nuc[.x] <- 1:length(.x)))
map(unflip_by_bam_n_region[not_empty_idx],
~ (private$df$nuctot[.x] <-
min(private$df$nuctot[.x]):
max(private$df$nuctot[.x])))
}
} else { # if private$params[["flip_regions"]] == FALSE
by_bam_n_region <- map2(rep(unique(private$df$bam),
each=length(unique(private$df$region))),
rep(unique(private$df$region),
times=length(unique(private$df$bam))),
~which(private$df$bam == .x & private$df$region == .y))
not_empty_idx <- which(map(by_bam_n_region,
~length(.x)) > 0)
if (length(not_empty_idx) > 0){
map(by_bam_n_region[not_empty_idx],
~ (private$df$nuc[.x] <- 1:length(.x)))
map(by_bam_n_region[not_empty_idx],
~ (private$df$nuctot[.x] <-
min(private$df$nuctot[.x]):
max(private$df$nuctot[.x])))
}
}
if(!is.null(private$params[["bin_count"]])){
#reinitialization of region/gene_size to be able to rebuild
#bin column
length_by_region_n_bam <- private$df[,length(nuc),
by=c('region','bam')]$V1
private$df$regionsize <- rep(length_by_region_n_bam,
times=length_by_region_n_bam)
#rebuild the correct bin column
col_bins <- trunc((private$df$nuc/(private$df$regionsize+1))
*private$params[["bin_count"]])+1
private$df$bin <- as.integer(col_bins)
}
}
)
)
Any scripts or data that you put into this service are public.
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.