#' @import data.table
#' @import Biostrings
#' @import ggplot2
#' @import ggrepel
#' @import dplyr
#' @import robustbase
#' @import qvalue
#' @import nortest
#' @import matrixStats
#' @import sm
#' @import corrplot
#' @import DescTools
#' @import GenomicAlignments
#' @import rlist
#' @import gdata
#' @import nlme
#' @import EnhancedVolcano
#' @import fitdistrplus
#' @title bam2count
#' @description Function to generate a read counts table from bam files
#' @param bamfolder Path to the folder containing bam files (one bam file per sample is expected)
#' @param annotation Annotation data table produced by \code{\link{read_annotation}} listing transcript names and lengths of their 5'UTR, CDS and 3'UTR segments.
#' It has five columns: transcript, l_tr, l_utr5, l_cds and l_utr3.
#' Transcript names and segment lengths must correspond to the reference sequences to which the reads were mapped.
#' @details This function is designed for bam files generated by mapping to a transcriptome (not a genome).
#' For each sample, it counts the number of reads mapping to different chromosomes (the 'seqname' bam field).
#' Annotation must be provided to ensure a complete transcript list, including those with zero counts.
#' Low count transcripts can be filtered out later using the \code{\link{min_count_filter}} function.
#' RNA and RPF bam files can be placed in the same folder and imported together using this function if CELP bias correction on RPF counts is not desired.
#' If CELP correction is desired, RNA and RPF bams should be placed in separate folders.
#' RNA bams should be imported using this function; RPF bams should be imported using the \code{\link{bamtolist_rW}} function following the CELP workflow.
#' @return A data frame where the first column contains transcript IDs and the remaining columns contain read counts in imported samples.
#' @examples
#' rna_count_LMCN <- bam2count(bamfolder = "./Data/Bam/RNA", annotation = annotation_human_cDNA)
#' rpf_count_LMCN <- bam2count(bamfolder = "./Data/Bam/RPF", annotation = annotation_human_cDNA)
#' @export
bam2count <- function(bamfolder, annotation){
annotation_tr <- as.data.frame(annotation$transcript)
names(annotation_tr) <- "transcript"
names <- list.files(path = bamfolder, pattern = ".bam$")
counts_list <- list()
counts_list[["annotation"]] <- annotation_tr
for (n_i in names){
filename <- paste(bamfolder, n_i, sep = "/")
bam_i <- GenomicAlignments::readGAlignments(filename)
counts_i <- as.data.frame(bam_i) %>% dplyr::count(seqnames)
sample_name_i <- unlist(strsplit(n_i, ".bam"))
names(counts_i) <- c("transcript", sample_name_i)
counts_list[[sample_name_i]] <- counts_i
}
counts_df <- Reduce(function(x, y) merge(x, y, by = "transcript", all=TRUE), counts_list)
counts_df[is.na(counts_df)] <- 0
return(counts_df)
}
#' @title gm_mean_z
#' @description Function to calculate geometric mean of a vector of non-negative numbers
#' @details Zeros will remain in the calculation and render the geometric mean zero.
#' Compare with \code{\link{gm_mean}}.
#' @param x Vector of non-negative numbers (may include zero)
#' @return Geometric mean
#' @export
gm_mean_z <- function(x){
exp(sum(log(x)) / length(x))
}
#' @title gm_mean
#' @description Function to calculate geometric mean of positive numbers in a vector
#' @details Zeros and negative numbers will be effectively treated as 1:
#' They will not contribute to the product of numbers but will be included in calculation of vector length.
#' Compare with \code{\link{gm_mean_z}}.
#' @param x A vector of numbers (may include zero or negative numbers)
#' @return Geometric mean of the vector with the zeros and negative numbers replaced by 1
#' @export
gm_mean = function(x, na.rm=TRUE){
exp(sum(log(x[x > 0]), na.rm=na.rm) / length(x))
}
#' @title normalize_median_of_ratios_append
#' @description Function to normalize a RNA-seq or ribo-seq dataset for library size variation
#' @details The original data columns are retained and normalized columns are added.
#' Compare with \code{\link{normalize_median_of_ratios2}}.
#'
#' Use the \code{data_columns} argument to exclude gene or transcript ID and other metadata columns from the
#' calculations, and, to normalize RNA and RPF counts separately while keeping them in the same data frame.
#' @param expression_data_frame A data frame containing RNA-seq, ribo-seq or similar data.
#' Rows are genes/transcripts and columns are samples.
#' The data frame may contain additional columns for gene/transcript ID or other metadata.
#' @param data_columns A vector of numbers specifying the columns to be normalized together.
#' @return An augmented data frame with all the original columns retained and normalized columns
#' appended to the end. New columns are named 'norm.<old_column>'.
#' @examples
#' rna_observed_normalized_LMCN <- normalize_median_of_ratios_append(rna_rpf_count_LMCN, c(2:9))
#' @export
normalize_median_of_ratios_append <- function (expression_data_frame, data_columns){
gm_mean_z <- function(x){
exp(sum(log(x)) / length(x))
}
edf <- expression_data_frame
geo.mean.vec <- apply(edf[,data_columns], 1, function(x) gm_mean_z(x))
ratios.df <- edf[,data_columns]/geo.mean.vec
# Division by 0 gm_mean creates NAs here.
normalization.factors <- apply(ratios.df, 2, function(x) median(x, na.rm=TRUE))
# NAs are removed from calculation of median here.
print("Normalization factors:")
print(normalization.factors)
normalized.edf <- t(t(edf[,data_columns])/normalization.factors)
colnames(normalized.edf) <- paste0("norm.", colnames(normalized.edf))
combined.df <- data.frame(edf,normalized.edf)
return(combined.df)
}
#' @title normalize_median_of_ratios
#' @description Function to normalize a RNA-seq or ribo-seq dataset for library size variation
#' @details The original columns of non-normalized counts are replaced in situ with normalized counts.
#' Compare with \code{\link{normalize_median_of_ratios_append}}.
#'
#' Use the \code{data_columns} argument to exclude gene or transcript ID and other metadata columns from the
#' calculations, and, to normalize RNA and RPF counts separately while keeping them in the same data frame.
#' @param expression_data_frame A data frame containing RNA-seq, ribo-seq or similar data.
#' Rows are genes/transcripts and columns are samples.
#' The data frame may contain additional columns for gene/transcript ID or other metadata.
#' @param data_columns A vector of numbers specifying the columns to be normalized together.
#' @return A data frame the same size as the input data frame where non-normalized counts in the 'data_columns'
#' are replaced in situ by normalized counts. Column names are not changed.
#' @examples
#' The data set rna_CELP_rpf_count_LMCN contains 17 columns: [1] transcript [2:9] RNA counts [10:17] RPF counts.
#' RNA and RPF counts are normalized separately.
#' rna_CELP_rpf_count_norm1_LMCN <- Ribolog::normalize_median_of_ratios(rna_CELP_rpf_count_LMCN, c(2:9))
#' rna_CELP_rpf_count_norm2_LMCN <- Ribolog::normalize_median_of_ratios(rna_CELP_rpf_count_norm1_LMCN, c(10:17))
#' @export
normalize_median_of_ratios <- function (expression_data_frame, data_columns){
if (length(data_columns) < 2){
print('Skipped Normalisation because less than two data columns specified.')
return(expression_data_frame)
}
gm_mean_z <- function(x){
exp(sum(log(x)) / length(x))
}
edf <- expression_data_frame
id.names <- names(edf)
geo.mean.vec <- apply(edf[,data_columns], 1, function(x) gm_mean_z(x))
ratios.df <- edf[,data_columns]/geo.mean.vec
# Division by 0 gm_mean will create NAs here.
normalization.factors <- apply(ratios.df, 2, function(x) median(x, na.rm=TRUE))
# NAs will be removed from calculation of median here.
print("Normalization factors:")
print(normalization.factors)
normalized.edf <- t(t(edf[,data_columns])/normalization.factors)
edf[, data_columns] <- normalized.edf
names(edf) <- id.names
return(edf)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.