#' Ribosome occupancy metaprofiles at single-nucleotide resolution.
#'
#' This function generates metaprofiles displaying the abundance of P-sites
#' around the start and the stop codon of annotated CDSs. For each sample the
#' intensity of the signal in the metaprofiles corresponds, for each nucleotide,
#' to the sum of the number of P-sites (defined by their leftmost position)
#' mapping on that position for all transcripts. Multiple samples and replicates
#' can be handled.
#'
#' @param data Either list of data tables or GRangesList object from
#' \code{\link{psite_info}}.
#' @param annotation Data table as generated by \code{\link{create_annotation}}.
#' @param sample Either character string, character string vector or named list
#' of character string(s)/character string vector(s) specifying the name of
#' the sample(s) and replicate(s) of interest. If a list is provided, each
#' element of the list is considered as an independent sample associated with
#' one ore multiple replicates. Multiple samples and replicates are handled
#' and visualised according to \code{multisamples} and \code{plot_style}.
#' @param multisamples Either "average" or "independent". It specifies how to
#' handle multiple samples and replicates stored in \code{sample}:
#' * if \code{sample} is a character string vector and \code{multisamples} is
#' set to "average" the elements of the vector are considered as replicates
#' of one sample and a single metaprofile is returned.
#' * if \code{sample} is a character string vector and \code{multisamples} is
#' set to "independent", each element of the vector is analysed independently
#' of the others. The number of plots returned and their organization is
#' specified by \code{plot_style}.
#' * if \code{sample} is a list, \code{multisamples} must be set to "average".
#' Each element of the list is analysed independently of the others, its
#' replicates averaged and its name reported in the plot. The number of plots
#' returned and their organization is specified by \code{plot_style}.
#' Note: when this parameter is set to "average" the bar plot associated with
#' each sample displays the nucleotide-specific mean signal computed across
#' the replicates and the corresponding standard error is also reported.
#' Default is "average".
#' @param plot_style Either "split", "facet", "overlap" or "mirror". It specifies
#' how to organize and display multiple metaprofiles:
#' * "split": one metaprofile for each sample is returned as an independent
#' ggplot object;
#' * "facet": the metaprofiles are placed one below the other, in
#' independent boxes.
#' * "overlap": the metaprofiles are placed one on top of the other;
#' * "mirror": \code{sample} must be either a character string vector or
#' a list of exactly two elements and the resulting metaprofiles are mirrored
#' along the x axis.
#' Default is "split".
#' @param scale_factors Either "auto", a named numeric vector or "none". It
#' specifies how metaprofiles should be scaled before merging
#' multiple replicates (if any):
#' * "auto": each metaprofile is scaled so that the area under the curve is 1.
#' * named numeric vector: \code{scale_factors} must be the same length of
#' unlisted \code{sample} and each scale factor must be named after the
#' corresponding string in unlisted \code{sample}. No specific order is
#' required. Each metaprofile is multiplied by the matching scale factor.
#' * "none": no scaling is applied.
#' Default is "auto".
#' @param transcripts Character string vector listing the name of transcripts to
#' be included in the analysis. Default is NULL, i.e. all transcripts are
#' used. Please note: transcripts with either 5' UTR, coding sequence or 3'
#' UTR shorter than \code{utr5l}, \eqn{2*}\code{cdsl} and \code{utr3l},
#' respectively, are automatically discarded.
#' @param length_range Integer or integer vector for restricting the plot to a
#' chosen range of read lengths. Default is NULL, i.e. all read lengths are
#' used. If specified, this parameter prevails over \code{cl}.
#' @param cl Integer value in [1,100] specifying a confidence level for
#' restricting the plot to an automatically-defined range of read lengths. The
#' new range is computed according to the most frequent read lengths, which
#' accounts for the cl% of the sample and is defined by discarding the
#' (100-cl)% of read lengths falling in the tails of the read lengths
#' distribution. If multiple samples are analysed, a single range of read
#' lengths is computed such that at least the cl% of all samples is
#' represented. Default is 100.
#' @param utr5l Positive integer specifying the length (in nucleotides) of the
#' 5' UTR region flanking the start codon to be considered in the analysis.
#' Default is 25.
#' @param cdsl Positive integer specifying the length (in nucleotides) of the
#' CDS regions flanking both the start and stop codon to be considered in the
#' analysis. Default is 50.
#' @param utr3l Positive integer specifying the length (in nucleotides) of the
#' 3' UTR region flanking the stop codon to be considered in the analysis.
#' Default is 25.
#' @param colour Character string or character string vector specifying the
#' colour of the metaprofile(s). If \code{sample} is a list of multiple
#' elements and \code{multisamples} is set to "average", a colour for
#' each element of the list is required. If this parameter is not specified
#' the R default palette is employed. Default is NULL.
#' @return List containing: one or more ggplot object(s) and the data table with
#' the corresponding x- and y-axis values ("plot_dt"); an additional data
#' table with raw and scaled number of P-sites per codon in the selected
#' region for each sample ("count_dt").
#' @examples
#' ## data(reads_list)
#' ## data(mm81cdna)
#' ##
#' ## ## Generate fake samples and replicates
#' ## for(i in 2:6){
#' ## samp_name <- paste0("Samp", i)
#' ## set.seed(i)
#' ## reads_list[[samp_name]] <- reads_list[["Samp1"]][sample(.N, 5000)]
#' ## }
#' ##
#' ## ## Compute and add p-site details
#' ## psite_offset <- psite(reads_list, flanking = 6, extremity = "auto")
#' ## reads_psite_list <- psite_info(reads_list, psite_offset)
#' ##
#' ## ## Define the list of samples and replicate to use as input
#' ## input_samples <- list("S1" = c("Samp1", "Samp2"),
#' ## "S2" = c("Samp3", "Samp4", "Samp5"),
#' ## "S3" = c("Samp6"))
#' ##
#' ## ## Generate metaprofiles
#' ## example_metaprofile <- metaprofile_psite(reads_psite_list, mm81cdna,
#' ## sample = input_samples,
#' ## multisamples = "average",
#' ## plot_style = "facet",
#' ## utr5l = 20, cdsl = 40, utr3l = 20,
#' ## colour = c("#333f50", "#39827c", "gray70"))
#' @import data.table
#' @import ggplot2
#' @export
metaprofile_psite <- function(data, annotation, sample, multisamples = "average",
plot_style = "split", scale_factors = "auto",
transcripts = NULL, length_range = NULL, cl = 100,
utr5l = 25, cdsl = 50, utr3l = 25,
colour = NULL) {
if(class(data[[1]])[1] == "GRanges"){
data_tmp <- list()
for(i in names(data)){
data_tmp[[i]] <- as.data.table(data[[i]])[, c("width", "strand") := NULL
][, seqnames := as.character(seqnames)]
setnames(data_tmp[[i]], c("seqnames", "start", "end"), c("transcript", "end5", "end3"))
}
data <- data_tmp
}
check_sample <- setdiff(unlist(sample), names(data))
if(length(check_sample) != 0){
cat("\n")
stop(sprintf("incorrect sample name(s): \"%s\" not found\n\n",
paste(check_sample, collapse = ", ")))
}
if(length(sample) == 0){
cat("\n")
stop("at least one sample name must be spcified\n\n")
}
if(is.numeric(scale_factors)) {
if(!all(unlist(sample) %in% names(scale_factors))){
cat("\n")
stop("scale factor for one or more sample is missing\n\n")
}
}
if(length(length_range) != 0 & !inherits(length_range, "numeric") & !inherits(length_range, "integer")){
cat("\n")
warning("class of length_range is neither numeric nor integer. Set to default NULL\n", call. = FALSE)
length_range = NULL
}
if(!(multisamples %in% c("average", "independent"))){
cat("\n")
warning("parameter multisamples must be either \"average\" or \"independent\".
Set to default \"average\"\n", call. = FALSE)
multisamples <- "average"
}
if(multisamples == "independent" & is.list(sample)) {
cat("\n")
warning("parameter multisamples is set to \"independent\" but parameter sample is a list:
parameter multisamples will be coerced to default \"average\"\n", call. = FALSE)
multisamples <- "average"
}
if(length(sample) != 2 & plot_style == "mirror") {
cat("\n")
warning("parameter plot_style is set to \"mirror\" but parameter sample is a list of dimension > 2:
parameter plot_style will be coerced to default \"split\"\n", call. = FALSE)
plot_style <- "split"
}
if(is.character(sample) & length(sample) == 1) {
multisamples <- "independent"
plot_style <- "split"
}
if(is.character(sample) & length(sample) > 1 & multisamples == "average") {
sample <- list("Average" = sample)
plot_style <- "split"
cat("\n")
warning("Default name of averaged samples is \"Average\":
consider to use a named list of one element to provide a meaningful plot title\n", call. = FALSE)
}
if(is.list(sample) & length(sample) == 1){
plot_style <- "split"
}
if(!(plot_style %in% c("split", "facet", "overlap", "mirror"))){
cat("\n")
warning("parameter plot_style must be either \"split\", \"facet\", \"overlap\", or \"mirror\".
Set to default \"split\"\n", call. = FALSE)
plot_style <- "split"
}
#define color vector
if((length(colour) < length(sample)) &
((plot_style %in% c("overlap", "mirror")) |
(plot_style %in% c("split", "facet") & length(colour) != 1))){
if(length(colour) !=0){
warning("Not enough colors specified:
default ggplot color palette will be used\n", call. = FALSE)
}
default_gg_col <- function(n) {
hues = seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
colour <- default_gg_col(length(sample))
} else {
if(plot_style %in% c("split", "facet") & length(colour) == 1){
colour <- rep(colour, length(sample))
}
}
# select transcripts
l_transcripts <- as.character(annotation[l_utr5 >= utr5l &
l_cds > 2 * (cdsl + 1) &
l_utr3 >= utr3l, transcript])
if (length(transcripts) == 0) {
c_transcripts <- l_transcripts
} else {
c_transcripts <- intersect(l_transcripts, transcripts)
}
# define length range taking into account all (unlisted) samples
if(length(length_range) == 0){
for(samp in as.character(unlist(sample))){
dt <- data[[samp]][transcript %in% c_transcripts]
if(length(length_range) == 0){
length_range <- seq(quantile(dt$length, (1 - cl/100)/2),
quantile(dt$length, 1 - (1 - cl/100)/2))
} else {
xmin <- min(min(length_range), quantile(dt$length, (1 - cl/100)/2))
xmax <- max(max(length_range), quantile(dt$length, 1 - (1 - cl/100)/2))
length_range <- seq(xmin, xmax)
}
}
}
# check if all samples have reads of the specified lengths
# especially required if only one read length is passed
if(length(length_range) != 0){
if(is.list(sample)){
samp_dt <- data.table(stack(sample))
setnames(samp_dt, c("sample", "sample_l"))
} else {
samp_dt <- data.table("sample" = sample, "sample_l" = sample)
}
for(samp in samp_dt$sample){
dt <- data[[samp]][cds_start != 0 & cds_stop !=0]
if(length(c_transcripts) != 0) {
dt <- dt[transcript %in% c_transcripts]
}
len_check <- unique(dt$length)
if(sum(length_range %in% len_check) == 0) {
cat("\n")
warning(sprintf("\"%s\" doesn't contain any reads of the selected lengths: sample removed\n", samp), call. = FALSE)
#select element of sample which include the sample to be removed (useful if sample is a list)
sel_l_samp <- samp_dt[sample == samp, sample_l]
#remove the sample from the list/vector
if(is.list(samp)){
sample[[sel_l_samp]] <- sample[[sel_l_samp]][sample[[sel_l_samp]] != samp]
} else {
sample <- sample[sample != samp]
}
}
}
}
if(is.null(unlist(sample))){
cat("\n")
stop("none of the data tables listed in sample contains any reads of the specified lengths\n\n")
}
# compute count of reads of defined lengths and scale them
final_dt <- data.table()
for(samp in as.character(unlist(sample))){
if(length(c_transcripts) == 0) {
dt <- data[[samp]]
} else {
dt <- data[[samp]][transcript %in% c_transcripts &
length %in% length_range]
}
start_sub <- dt[psite_from_start %in% seq(-utr5l, cdsl)]
start_tab <- setkey(start_sub, psite_from_start
)[CJ(-utr5l:cdsl), .(count = .N), by = .EACHI
][, reg := "start"]
setnames(start_tab, c("distance", "count", "region"))
stop_sub <- dt[psite_from_stop %in% seq(-cdsl, utr3l)]
stop_tab <- setkey(stop_sub, psite_from_stop
)[CJ(-cdsl:utr3l), .(count = .N), by = .EACHI
][, reg := "stop"]
setnames(stop_tab, c("distance", "count", "region"))
samp_final_tab <- rbind(start_tab, stop_tab)
#scaling/normalization
if(is.character(scale_factors) & scale_factors[1] == "auto"){
samp_final_tab[, scaled_count := count / sum(count)]
y_title <- "P-site frequency"
} else {
y_title <- "# P-sites"
if(is.numeric(scale_factors)){
samp_final_tab[, scaled_count := count * scale_factors[samp]]
} else {
samp_final_tab[, scaled_count := count]
}
}
samp_final_tab[, tmp_samp := samp]
final_dt <- rbind(final_dt, samp_final_tab)
}
final_dt[, region := factor(region, levels = c("start", "stop"),
labels = c("Distance from start (nt)", "Distance from stop (nt)"))]
output <- list()
output[["count_dt"]] <- copy(final_dt[, c("tmp_samp", "region", "distance", "count", "scaled_count")])
if(is.character(scale_factors) & scale_factors[1] == "none"){
output[["count_dt"]][, scaled_count := NULL]
}
setnames(output[["count_dt"]], "tmp_samp", "sample")
# compute mean samples if a list is provided
if(is.list(sample)){
samp_dt <- data.table(stack(sample))
setnames(samp_dt, c("tmp_samp", "sample"))
# set names of samples as specified in parameter sample
final_dt <- merge.data.table(final_dt, samp_dt, sort = FALSE)[, tmp_samp := NULL]
# compute mean and se
plot_dt <- final_dt[, .(mean_scaled_count = mean(scaled_count),
se_scaled_count = sd(scaled_count/sqrt(.N))), by = .(region, distance, sample)]
if(any(lengths(sample) != 1)){
output[["plot_dt"]] <- copy(plot_dt[, c("sample", "region", "distance", "mean_scaled_count", "se_scaled_count")])
setnames(output[["plot_dt"]], c("distance", "mean_scaled_count", "se_scaled_count"), c("x", "y", "y_se"))
} else {
output[["plot_dt"]] <- copy(final_dt[, c("sample", "region", "distance", "scaled_count")])
setnames(output[["plot_dt"]], c("distance", "scaled_count"), c("x", "y"))
}
} else {
plot_dt <- final_dt[, sample := tmp_samp
][, se_scaled_count := NA]
setnames(plot_dt, "scaled_count", "mean_scaled_count")
output[["plot_dt"]] <- copy(plot_dt[, c("sample", "region", "distance", "mean_scaled_count")])
setnames(output[["plot_dt"]], c("distance", "mean_scaled_count"), c("x", "y"))
}
plot_dt[, sample := factor(sample, levels = unique(sample))]
# define data table for vertical lines
lines3nt <- data.table(region = rep(c("Distance from start (nt)", "Distance from stop (nt)"),
times = c(length(seq(3, cdsl, 3)), length(seq(-2, -cdsl, -3)))),
line = c(seq(3, cdsl, 3), rev(seq(-2, -cdsl, -3))))
linered <- data.table(region = c("Distance from start (nt)", "Distance from stop (nt)"), line =c(0, 1))
oldw <- getOption("warn")
options(warn=-1)
if(plot_style == "split"){
i <- 0
for(samp in unique(plot_dt$sample)){ # generate a plot for each sample and store it
i <- i + 1
sel_col = colour[i]
sub_plot_dt <- plot_dt[sample == samp]
plot <- ggplot(sub_plot_dt, aes(distance, mean_scaled_count)) +
geom_line(linewidth = 1.50, color = sel_col)
if(!is.na(sub_plot_dt$se_scaled_count[1])){
plot <- plot + geom_ribbon(aes(ymin = mean_scaled_count - se_scaled_count,
ymax = mean_scaled_count + se_scaled_count),
fill = sel_col, color = NA, alpha = 0.20, show.legend = F)
}
plot <- plot + geom_vline(data = linered, aes(xintercept = line), linetype = 1, color = "red") +
theme_bw(base_size = 30) +
facet_grid(. ~ region, switch = "x", scales = "free_x") +
theme(panel.grid.major.x = element_blank(), panel.grid.minor = element_blank(),
axis.title.x = element_blank(), plot.title = element_text(hjust = 0.5),
strip.background = element_blank(), strip.placement = "outside") +
geom_vline(data = lines3nt, aes(xintercept = line), linetype = 3, color = "gray60") +
labs(title = samp, y = y_title)
output[[paste0("plot_", samp)]] <- plot
}
} else {
if(plot_style == "mirror") {
plot_dt[sample == unique(plot_dt$sample)[2], mean_scaled_count := -mean_scaled_count]
}
plot <- ggplot(plot_dt, aes(distance, mean_scaled_count, color = sample, fill = sample)) +
geom_line(linewidth = 1.50) +
geom_ribbon(aes(ymin = mean_scaled_count - se_scaled_count,
ymax = mean_scaled_count + se_scaled_count),
color = NA, alpha = 0.20, show.legend = F)
if(identical(plot_style, "mirror")){
plot <- plot + geom_hline(yintercept = 0, linetype = 2, color = "gray20")
}
plot <- plot + labs(y = y_title) +
geom_vline(data = linered, aes(xintercept = line), linetype = 1, color = "red") +
theme_bw(base_size = 30)
if(plot_style == "facet"){
plot <- plot + facet_grid(sample ~ region, switch = "x", scales = "free_x")
} else {
plot <- plot + facet_grid(. ~ region, switch = "x", scales = "free_x")
}
plot <- plot + theme(panel.grid.major.x = element_blank(), panel.grid.minor = element_blank(),
axis.title.x = element_blank(), plot.title = element_text(hjust = 0.5),
strip.background = element_blank(), strip.placement = "outside") +
scale_fill_manual(values = colour) +
scale_color_manual(values = colour) +
scale_y_continuous(labels = abs) +
geom_vline(data = lines3nt, aes(xintercept = line), linetype = 3, color = "gray60")
if(uniqueN(colour) > 1 & plot_style != "facet"){
plot <- plot + theme(legend.position = c(0.98,1), legend.justification = c(1, 1),
legend.title = element_blank(), legend.background = element_blank())
} else {
plot <- plot + theme(legend.position = "none")
}
output[["plot"]] <- plot
}
options(warn = oldw)
return(output)
}
#' Ribosome occupancy metaheatmaps at single-nucleotide resolution.
#'
#' This function generates heatmap-like metaprofiles (metaheatmaps) displaying
#' the abundance of P-sites around the start and the stop codon of annotated
#' CDSs for multiple samples. It works similarly to
#' \code{\link{metaprofile_psite}} but the intensity of signal is represented by
#' a continuous color scale rather than by the height of a line chart. This
#' graphical option always returns one heatmap displaying all the specified
#' samples thus optimizing the visualization of several profiles in a small
#' area.
#'
#' @param data Either list of data tables or GRangesList object from
#' \code{\link{psite_info}}.
#' @param annotation Data table as generated by \code{\link{create_annotation}}.
#' @param sample Either character string, character string vector or named list
#' of character string(s)/character string vector(s) specifying the name of
#' the sample(s) and replicate(s) of interest. If a list is provided, each
#' element of the list is considered as an independent sample associated with
#' one ore multiple replicates. Multiple samples and replicates are handled
#' according to \code{multisamples}.
#' @param multisamples Either "average" or "independent". It specifies how to
#' handle multiple samples and replicates stored in \code{sample}:
#' * if \code{sample} is a character string vector and \code{multisamples} is
#' set to "average" the elements of the vector are considered as replicates
#' of one sample.
#' * if \code{sample} is a character string vector and \code{multisamples} is
#' set to "independent", each element of the vector is analysed independently
#' of the others.
#' * if \code{sample} is list, \code{multisamples} must be set to "average".
#' Each element of the list is analysed independently of the others, its
#' replicates averaged and its name reported in the plot.
#' Note: when this parameter is set to "average" the intensity of the profile
#' associated with each sample reports the nucleotide-specific mean signal
#' computed across the replicates.
#' Default is "average".
#' @param scale_factors Either "auto", a named numeric vector or "none". It
#' specifies how metaprofiles should be scaled before merging
#' multiple replicates (if any):
#' * "auto": each metaprofile is scaled so that the area under the curve is 1.
#' * named numeric vector: \code{scale_factors} must be the same length of
#' unlisted \code{sample} and each scale factor must be named after the
#' corresponding string in unlisted \code{sample}. No specific order is
#' required. Each metaprofile is multiplied by the matching scale factor.
#' * "none": no scaling is applied.
#' Default is "auto".
#' @param transcripts Character string vector listing the name of transcripts to
#' be included in the analysis. Default is NULL, i.e. all transcripts are
#' used. Please note: transcripts with either 5' UTR, coding sequence or 3'
#' UTR shorter than \code{utr5l}, \eqn{2*}\code{cdsl} and \code{utr3l},
#' respectively, are automatically discarded.
#' @param length_range Integer or integer vector for restricting the plot to a
#' chosen range of read lengths. Default is NULL, i.e. all read lengths are
#' used. If specified, this parameter prevails over \code{cl}.
#' @param cl Integer value in [1,100] specifying a confidence level for
#' restricting the plot to an automatically-defined range of read lengths. The
#' new range is computed according to the most frequent read lengths, which
#' accounts for the cl% of the sample and is defined by discarding the
#' (100-cl)% of read lengths falling in the tails of the read lengths
#' distribution. If multiple samples are analysed, a single range of read
#' lengths is computed such that at least the cl% of all sample are
#' represented. Default is 100.
#' @param utr5l Positive integer specifying the length (in nucleotides) of the
#' 5' UTR region flanking the start codon to be considered in the analysis.
#' Default is 25.
#' @param cdsl Positive integer specifying the length (in nucleotides) of the
#' CDS regions flanking both the start and stop codon to be considered in the
#' analysis. Default is 50.
#' @param utr3l Positive integer specifying the length (in nucleotides) of the
#' 3' UTR region flanking the stop codon to be considered in the analysis.
#' Default is 25.
#' @param colour Character string specifying the
#' colour of the plot. The colour scheme is as follow: tiles
#' corresponding to the lowest signal are always white, tiles corresponding to
#' the highest signal are of the specified colour and the progression between
#' these two colours follows either linear or logarithmic gradients (see
#' \code{log_colour}).Default is "black".
#' @param log_colour Logical value whether to use a logarithmic colour scale
#' (strongly suggested in case of large signal variations). Default is FALSE.
#' @return List containing: one or more ggplot object(s) and the data table with
#' the corresponding x- and y-axis values ("plot_dt"); an additional data
#' table with raw and scaled number of P-sites per codon in the selected
#' region for each sample ("count_dt").
#' @examples
#' ## data(reads_list)
#' ## data(mm81cdna)
#' ##
#' ## ## Generate fake samples and replicates
#' ## for(i in 2:6){
#' ## samp_name <- paste0("Samp", i)
#' ## set.seed(i)
#' ## reads_list[[samp_name]] <- reads_list[["Samp1"]][sample(.N, 5000)]
#' ## }
#' ##
#' ## ## Compute and add p-site details
#' ## psite_offset <- psite(reads_list, flanking = 6, extremity = "auto")
#' ## reads_psite_list <- psite_info(reads_list, psite_offset)
#' ##
#' ## ## Define the list of samples and replicate to use as input
#' ## input_samples <- list("S1" = c("Samp1", "Samp2"),
#' ## "S2" = c("Samp3", "Samp4", "Samp5"),
#' ## "S3" = c("Samp6"))
#' ##
#' ## Generate metaheatmap
#' ## example_metaheatmap <- metaheatmap_psite(reads_psite_list, mm81cdna,
#' ## sample = input_samples,
#' ## multisamples = "average",
#' ## utr5l = 20, cdsl = 40, utr3l = 20,
#' ## colour = "#333f50"))
#' @import data.table
#' @import ggplot2
#' @export
metaheatmap_psite <- function(data, annotation, sample, multisamples = "average",
scale_factors = "auto", transcripts = NULL,
length_range = NULL, cl = 100,
utr5l = 25, cdsl = 50, utr3l = 25,
colour = "black", log_colour = FALSE) {
if(class(data[[1]])[1] == "GRanges"){
data_tmp <- list()
for(i in names(data)){
data_tmp[[i]] <- as.data.table(data[[i]])[, c("width", "strand") := NULL
][, seqnames := as.character(seqnames)]
setnames(data_tmp[[i]], c("seqnames", "start", "end"), c("transcript", "end5", "end3"))
}
data <- data_tmp
}
check_sample <- setdiff(unlist(sample), names(data))
if(length(check_sample) != 0){
cat("\n")
stop(sprintf("incorrect sample name(s): \"%s\" not found\n\n",
paste(check_sample, collapse = ", ")))
}
if(length(sample) == 0){
cat("\n")
stop("at least one sample name must be spcified\n\n")
}
if(is.numeric(scale_factors)) {
if(!all(unlist(sample) %in% names(scale_factors))){
cat("\n")
stop("scale factor for one or more sample is missing\n\n")
}
}
if(length(length_range) != 0 & !inherits(length_range, "numeric") & !inherits(length_range, "integer")){
cat("\n")
warning("class of length_range is neither numeric nor integer. Set to default NULL\n", call. = FALSE)
length_range = NULL
}
if(!(multisamples %in% c("average", "independent"))){
cat("\n")
warning("parameter multisamples must be either \"average\" or \"independent\".
Set to default \"average\"\n", call. = FALSE)
multisamples <- "average"
}
if(multisamples == "independent" & is.list(sample)) {
cat("\n")
warning("parameter multisamples is set to \"independent\" but parameter sample is a list:
parameter multisamples will be coerced to default \"average\"\n", call. = FALSE)
multisamples <- "average"
}
if(is.character(sample) & length(sample) == 1) {
multisamples <- "independent"
}
if(is.character(sample) & length(sample) > 1 & multisamples == "average") {
sample <- list("Average" = sample)
cat("\n")
warning("Default name of averaged samples is \"Average\":
consider to use a named list of one element to provide a meaningful plot title\n", call. = FALSE)
}
# select transcripts
l_transcripts <- as.character(annotation[l_utr5 >= utr5l &
l_cds > 2 * (cdsl + 1) &
l_utr3 >= utr3l, transcript])
if (length(transcripts) == 0) {
c_transcripts <- l_transcripts
} else {
c_transcripts <- intersect(l_transcripts, transcripts)
}
# define length range taking into account all (unlisted) samples
if(length(length_range) == 0){
for(samp in as.character(unlist(sample))){
dt <- data[[samp]][transcript %in% c_transcripts]
if(length(length_range) == 0){
length_range <- seq(quantile(dt$length, (1 - cl/100)/2),
quantile(dt$length, 1 - (1 - cl/100)/2))
} else {
xmin <- min(min(length_range), quantile(dt$length, (1 - cl/100)/2))
xmax <- max(max(length_range), quantile(dt$length, 1 - (1 - cl/100)/2))
length_range <- seq(xmin, xmax)
}
}
}
# check if all samples have reads of the specified lengths
# especially required if only one read length is passed
if(length(length_range) != 0){
if(is.list(sample)){
samp_dt <- data.table(stack(sample))
setnames(samp_dt, c("sample", "sample_l"))
} else {
samp_dt <- data.table("sample" = sample, "sample_l" = sample)
}
for(samp in samp_dt$sample){
dt <- data[[samp]][cds_start != 0 & cds_stop !=0]
if(length(c_transcripts) != 0) {
dt <- dt[transcript %in% c_transcripts]
}
len_check <- unique(dt$length)
if(sum(length_range %in% len_check) == 0) {
cat("\n")
warning(sprintf("\"%s\" doesn't contain any reads of the selected lengths: sample removed\n", samp), call. = FALSE)
#select element of sample which include the sample to be removed (useful if sample is a list)
sel_l_samp <- samp_dt[sample == samp, sample_l]
#remove the sample from the list/vector
if(is.list(samp)){
sample[[sel_l_samp]] <- sample[[sel_l_samp]][sample[[sel_l_samp]] != samp]
} else {
sample <- sample[sample != samp]
}
}
}
}
if(is.null(unlist(sample))){
cat("\n")
stop("none of the data tables listed in sample contains any reads of the specified lengths\n\n")
}
# compute count of reads of defined lengths and scale them
final_dt <- data.table()
for(samp in as.character(unlist(sample))){
if(length(c_transcripts) == 0) {
dt <- data[[samp]]
} else {
dt <- data[[samp]][transcript %in% c_transcripts &
length %in% length_range]
}
start_sub <- dt[psite_from_start %in% seq(-utr5l, cdsl)]
start_tab <- setkey(start_sub, psite_from_start
)[CJ(-utr5l:cdsl), .(count = .N), by = .EACHI
][, reg := "start"]
setnames(start_tab, c("distance", "count", "region"))
stop_sub <- dt[psite_from_stop %in% seq(-cdsl, utr3l)]
stop_tab <- setkey(stop_sub, psite_from_stop
)[CJ(-cdsl:utr3l), .(count = .N), by = .EACHI
][, reg := "stop"]
setnames(stop_tab, c("distance", "count", "region"))
samp_final_tab <- rbind(start_tab, stop_tab)
#scaling/normalization
if(is.character(scale_factors) & scale_factors[1] == "auto"){
samp_final_tab[, scaled_count := count / sum(count)]
legend_title <- "P-site\nfrequency\n"
} else {
legend_title <- "# P-sites\n"
if(is.numeric(scale_factors)){
samp_final_tab[, scaled_count := count * scale_factors[samp]]
} else {
samp_final_tab[, scaled_count := count]
}
}
samp_final_tab[, tmp_samp := samp]
final_dt <- rbind(final_dt, samp_final_tab)
}
final_dt[, region := factor(region, levels = c("start", "stop"),
labels = c("Distance from start (nt)", "Distance from stop (nt)"))]
output <- list()
output[["count_dt"]] <- copy(final_dt[, c("tmp_samp", "region", "distance", "count", "scaled_count")])
if(is.character(scale_factors) & scale_factors[1] == "none"){
output[["count_dt"]][, scaled_count := NULL]
}
setnames(output[["count_dt"]], "tmp_samp", "sample")
# compute mean samples if a list is provided
if(is.list(sample)){
samp_dt <- data.table(stack(sample))
setnames(samp_dt, c("tmp_samp", "sample"))
# set names of samples as specified in parameter sample
final_dt <- merge.data.table(final_dt, samp_dt, sort = FALSE)[, tmp_samp := NULL]
# compute mean
plot_dt <- final_dt[, .(mean_scaled_count = mean(scaled_count)), by = .(region, distance, sample)]
} else {
plot_dt <- final_dt[, sample := tmp_samp]
setnames(plot_dt, "scaled_count", "mean_scaled_count")
}
output[["plot_dt"]] <- copy(plot_dt[, c("sample", "region", "distance", "mean_scaled_count")])
setnames(output[["plot_dt"]], c("distance", "mean_scaled_count"), c("x", "y"))
plot_dt[, sample := factor(sample, levels = rev(unique(sample)))]
# define data table for vertical lines
lines3nt <- data.table(region = rep(c("Distance from start (nt)", "Distance from stop (nt)"),
times = c(length(seq(3, cdsl, 3)), length(seq(-2, -cdsl, -3)))),
line = c(seq(3, cdsl, 3), rev(seq(-2, -cdsl, -3))))
linered <- data.table(region = c("Distance from start (nt)", "Distance from stop (nt)"), line =c(0, 1))
maxl <- max(plot_dt$mean_scaled_count)
oldw <- getOption("warn")
options(warn=-1)
plot <- ggplot(plot_dt, aes(as.numeric(as.character(distance)), sample)) +
geom_vline(data = lines3nt, aes(xintercept = line), linetype = 3, color = "gray60") +
geom_vline(data = linered, aes(xintercept = line), linetype = 1, color = "red") +
geom_tile(aes(fill = mean_scaled_count), height = 0.75) +
theme_bw(base_size = 30) +
theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
strip.placement = "outside", axis.title = element_blank(),
plot.title = element_blank(), strip.background = element_blank()) +
facet_grid(. ~ region, scales = "free", switch = "x")
if(log_colour == F) {
minl <- min(plot_dt$mean_scaled_count)
if(is.character(scale_factors) && scale_factors == "auto"){
breaks_v = c(minl, maxl)
labels_v = c(round(minl, 2), round(maxl, 2))
} else {
breaks_v = c(minl, minl/2 + maxl/2, maxl)
labels_v = c(round(minl, 2), round(minl/2 + maxl/2, 2), round(maxl, 2))
}
plot <- plot +
scale_fill_gradient(legend_title, low = "white", high = colour, na.value = "white",
limits = c(minl, maxl),
breaks = breaks_v,
labels = labels_v)
} else {
minl <- min(plot_dt[mean_scaled_count != 0, mean_scaled_count])
if(is.character(scale_factors) && scale_factors == "auto"){
breaks_v = c(minl, maxl)
labels_v = c(round(minl, 2), round(maxl, 2))
} else {
breaks_v = c(minl, 10^(log10(minl)/2 + log10(maxl)/2), maxl)
labels_v = c(round(minl, 2), round(10^(log10(minl)/2 + log10(maxl)/2), 2), round(maxl, 2))
}
plot <- plot +
scale_fill_gradient(legend_title, low = "white", high = colour, trans = "log", na.value = "transparent",
limits = c(minl, maxl),
breaks = breaks_v,
labels = labels_v)
}
output[["plot"]] <- plot
options(warn = oldw)
return(output)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.