#' @title Pipeline to eliminate false positive predictions of retrotransposons
#' @description This function takes an \code{\link{LTRpred}} output table as input
#' and eliminates false positive predictions.
#' @param pred \code{LTRpred.tbl} generated with \code{\link{LTRpred}}
#' @param sim LTR similarity threshold. Only putative LTR transposons that fulfill this
#' LTR similarity threshold will be retained.
#' @param n.orfs minimum number of ORFs detected in the putative LTR transposon.
#' @param strategy quality filter strategy. Options are
#' \itemize{
#' \item \code{strategy = "default"} : see section \code{Quality Control}
#' \item \code{strategy = "stringent"} : in addition to filter criteria specified in section \code{Quality Control},
#' the filter criteria \code{!is.na(protein_domain)) | (dfam_target_name != "unknown")} is applied
#' }
#' @author Hajk-Georg Drost
#' @details
#' \strong{Quality Control}
#'
#' \itemize{
#' \item \code{ltr.similarity}: Minimum similarity between LTRs. All TEs not matching this
#' criteria are discarded.
#' \item \code{n.orfs}: minimum number of Open Reading Frames that must be found between the
#' LTRs. All TEs not matching this criteria are discarded.
#' \item \code{PBS or Protein Match}: elements must either have a predicted Primer Binding
#' Site or a protein match of at least one protein (Gag, Pol, Rve, ...) between their LTRs. All TEs not matching this criteria are discarded.
#' \item The relative number of N's (= nucleotide not known) in TE <= 0.1. The relative number of N's is computed as follows: absolute number of N's in TE / width of TE.
#' }
#' @examples
#' # example prediction file generated by LTRpred
#' pred.file <- system.file("Athaliana_TAIR10_chr_all_LTRpred_DataSheet.csv", package = "LTRpred")
#' # read LTRpred generated prediction file (data sheet)
#' pred <- read.ltrpred(pred.file)
#' # apply quality filter
#' pred <- quality.filter(pred, sim = 70, n.orfs = 1)
#' @return A quality filtered \code{LTRpred.tbl}.
#' @seealso \code{\link{LTRpred}}, \code{\link{LTRpred.meta}}, \code{\link{read.ltrpred}}, \code{\link{quality.filter.meta}}
#' @export
quality.filter <- function(pred, sim, n.orfs, strategy = "default"){
if (!is.element(strategy, c("default", "stringent")))
stop(
"Please choose a quality filter strategy that is supported, e.g. strategy = 'default' or strategy = 'stringent'."
)
# try to reduce false positives by filtering for PBS and ORFs
ltr_similarity <- PBS_start <- protein_domain <- orfs <- NULL
TE_N_abs <- width <- dfam_target_name <- NULL
message(
"The LTRpred prediction table has been filtered (default) to remove potential false positives. Predicted LTRs must have an PBS or Protein Domain and must fulfill thresholds: sim = ",
sim,
"%; #orfs = ",
n.orfs,
". Furthermore, TEs having more than 10% of N's in their sequence have also been removed."
)
message("/n")
message("Input non-unique #TEs: ", length(pred$ID))
message("Input unique #TEs: ", length(unique(pred$ID)))
filtered.res <- dplyr::filter(
pred,
(TE_N_abs / width) <= 0.1,
ltr_similarity >= sim,
(!is.na(PBS_start)) |
(!is.na(protein_domain)),
orfs >= n.orfs
)
if (strategy == "stringent") {
filtered.res <-
dplyr::filter(filtered.res, (!is.na(protein_domain)) |
(dfam_target_name != "unknown"))
}
message("Output non-unique #TEs: ", length(filtered.res$ID))
message("Output unique #TEs: ", length(unique(filtered.res$ID)))
return(filtered.res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.