#' Read length filtering.
#'
#' This function provides multiple options for filtering the reads according to
#' their length. Read lengths to keep are either specified by the user or
#' automatichally selected on the basis of the trinucleotide periodicity of reads
#' mapping on the CDS.
#'
#' @param data Either list of data tables or GRangesList object from
#' \code{\link{bamtolist}}, \code{\link{bedtolist}} or
#' \code{\link{duplicates_filter}}.
#' @param sample Character string or character string vector specifying the name
#' of the sample(s) to process. Default is NULL i.e. all samples are
#' processed.
#' @param length_filter_mode Either "periodicity" or "custom". It specifies how
#' read length selection should be performed. "periodicity": only read lengths
#' satisfying a periodicity threshold (see \code{periodicity_threshold}) are
#' kept. It ensures the removal of all reads with low or no periodicity;
#' "custom": only read lengths specified by the user are kept (see
#' \code{length_range}). Default is "periodicity".
#' @param periodicity_threshold Integer in [10, 100]. Only read lengths
#' satisfying this threshold (i.e. a higher percentage of read extremities
#' falls in one of the three reading frames along the CDS) are kept. This
#' parameter is considered only if \code{length_filter_mode} is set to
#' "periodicity". Default is 50.
#' @param length_range Integer or integer vector specifying one read
#' length or a range of read lengths to keep, respectively. This parameter is
#' considered only if \code{length_filter_mode} is set to "custom".
#' @param txt Logical value whether to write in a txt file statistics on the
#' filtering step. Similar information are displayed by default in the
#' console. Default is FALSE.
#' @param txt_file Character string specifying the path, name and extension
#' (e.g. "PATH/NAME.extension") of the plain text file where statistics on the
#' filtering step shuold be written. If the specified folder doesn't exist, it
#' is automatically created. If NULL (the default), the information are
#' written in \emph{"length_filtering.txt"}, saved in the working directory.
#' This parameter is considered only if \code{txt} is TRUE.
#' @param output_class Either "datatable" or "granges". It specifies the format
#' of the output i.e. a list of data tables or a GRangesList object. Default
#' is "datatable".
#' @return A list of data tables or a GRangesList object.
#' @examples
#' data(reads_list)
#'
#' ## Keep reads of length between 27 and 30 nucleotides (included):
#' filtered_list <- length_filter(reads_list, length_filter_mode = "custom",
#' length_range = 27:30)
#'
#' ## Keep reads of lengths satisfying a periodicity threshold (70%):
#' filtered_list <- length_filter(reads_list, length_filter_mode = "periodicity",
#' periodicity_threshold = 70)
#' @import data.table
#' @export
length_filter <- function(data, sample = NULL,
length_filter_mode = "periodicity",
periodicity_threshold = 50,
length_range = NULL, output_class = "datatable",
txt = FALSE, txt_file = NULL){
check_sample <- setdiff(unlist(sample), names(data))
if(length(check_sample) != 0){
cat("\n")
stop(sprintf("incorrect sample name(s): \"%s\" not found\n\n",
paste(check_sample, collapse = ", ")))
}
if(is.null(sample)){
sample <- names(data)
}
if(!(length_filter_mode %in% c("custom", "periodicity"))){
cat("\n")
warning("parameter length_filter_mode must be either \"periodicity\" or \"custom\"\nset to default \"periodicity\"\n", call. = FALSE)
length_filter_mode = "periodicity"
}
if(length_filter_mode == "custom" & !is.numeric(length_range)
& !is.integer(length_range)){
stop("length_range must be of class integer\n\n")
}
if(length_filter_mode == "periodicity" & ((!is.numeric(periodicity_threshold)
& !is.integer(periodicity_threshold)) | periodicity_threshold < 10
| periodicity_threshold > 100)){
stop("periodicity_threshold must be an integer between 10 and 100 \n\n")
}
if (txt == T | txt == TRUE) {
options(warn=-1)
if (length(txt_file) == 0) {
dir <- getwd()
txt_file <- paste0(dir, "/length_filtering.txt")
} else {
txt_file_split <- strsplit(txt_file, "/")[[1]]
txt_dir <- paste(txt_file_split[-length(txt_file_split)], collapse = "/")
if (!dir.exists(txt_dir)) {
dir.create(txt_dir, recursive = TRUE)
}
}
options(warn=0)
cat("sample\tinitial_reads\tfinal_reads\tpercentage_kept\tpercentage_removed\n", file = txt_file)
}
for(samp in names(data)){
dt <- data[[samp]]
if(samp %in% sample){
cat(sprintf("processing %s\n", samp))
if(class(dt)[1] == "GRanges"){
dt <- as.data.table(dt)[, c("width", "strand") := NULL
][, seqnames := as.character(seqnames)]
setnames(dt, c("seqnames", "start", "end"), c("transcript", "end5", "end3"))
}
nreads <- nrow(dt)
cat(sprintf("reads: %s M\n", format(round((nreads / 1000000), 2), nsmall = 2)))
if (txt == T | txt == TRUE) {
cat(sprintf("%s\t", samp), file = txt_file, append = TRUE)
cat(sprintf("%i\t", nreads), file = txt_file, append = TRUE)
}
if(identical(length_filter_mode, "custom")) {
dt <- dt[length %in% length_range]
} else {
if(identical(length_filter_mode, "periodicity")){
nreads <- nrow(dt)
subdt5 <- dt[cds_start != 0 &
(end5 - cds_start) >= 0 &
(cds_stop - end5) >= 0]
subdt5[, end5_frame := as.factor((end5 - cds_start) %% 3)]
t_end5 <- subdt5[, .N, by = list(length, end5_frame)
][, end5_perc := (N / sum(N)) * 100, by = length]
keep_length5 <- unique(t_end5[end5_perc >= periodicity_threshold, length])
subdt3 <- dt[cds_start != 0 &
(end3 - cds_start) >= 0 &
(cds_stop - end3) >= 0]
subdt3[, end3_frame := as.factor((end3 - cds_start) %% 3)]
t_end3 <- subdt3[, .N, by = list(length, end3_frame)
][, end3_perc := (N / sum(N)) * 100, by = length]
keep_length3 <- unique(t_end3[end3_perc >= periodicity_threshold, length])
keep_length <- intersect(keep_length5, keep_length3)
dt <- dt[length %in% keep_length]
}
}
cat(sprintf("%s M (%s %%) reads removed\n",
format(round((nreads - nrow(dt))/ 1000000, 2), nsmall = 2),
format(round(((nreads - nrow(dt))/nreads) * 100, 2), nsmall = 2) ))
cat(sprintf("reads (kept): %s M\n\n", format(round((nrow(dt) / 1000000), 2), nsmall = 2)))
if (txt == T | txt == TRUE) {
cat(sprintf("%i\t", nrow(dt)), file = txt_file, append = TRUE)
cat(sprintf("%.2f\t", round((nrow(dt) / nreads) * 100, 2)), file = txt_file, append = TRUE)
cat(sprintf("%.2f\n", round(((nreads - nrow(dt)) / nreads) * 100, 2)), file = txt_file, append = TRUE)
}
}
if(nrow(dt) == 0){
cat(sprintf("Sample %s has no more reads. It won't be included in the output", samp))
next
}
if(output_class == "granges"){
dt <- GenomicRanges::makeGRangesFromDataFrame(dt,
keep.extra.columns = TRUE,
ignore.strand = TRUE,
seqnames.field = c("transcript"),
start.field = "end5",
end.field = "end3",
strand.field = "strand",
starts.in.df.are.0based = FALSE)
GenomicRanges::strand(dt) <- "+"
}
data[[samp]] <- dt
}
if(output_class == "granges"){
data <- GenomicRanges::GRangesList(data)
}
return(data)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.