LTRpred: De novo functional annotation of retrotransposons

Documented in quality.filter

#' @title Pipeline to eliminate false positive predictions of retrotransposons
#' @description This function takes an \code{\link{LTRpred}} output table as input
#' and eliminates false positive predictions.
#' @param pred \code{LTRpred.tbl} generated with \code{\link{LTRpred}}
#' @param sim LTR similarity threshold. Only putative LTR transposons that fulfill this 
#' LTR similarity threshold will be retained.
#' @param n.orfs minimum number of ORFs detected in the putative LTR transposon.
#' @param strategy quality filter strategy. Options are
#' \itemize{
#' \item \code{strategy = "default"} : see section \code{Quality Control} 
#' \item \code{strategy = "stringent"} : in addition to filter criteria specified in section \code{Quality Control},
#' the filter criteria \code{!is.na(protein_domain)) | (dfam_target_name != "unknown")} is applied
#' }
#' @author Hajk-Georg Drost
#' @details 
#' \strong{Quality Control}
#' 
#' \itemize{
#' \item \code{ltr.similarity}: Minimum similarity between LTRs. All TEs not matching this
#'  criteria are discarded.
#'  \item \code{n.orfs}: minimum number of Open Reading Frames that must be found between the
#'   LTRs. All TEs not matching this criteria are discarded.
#'  \item \code{PBS or Protein Match}: elements must either have a predicted Primer Binding
#'  Site or a protein match of at least one protein (Gag, Pol, Rve, ...) between their LTRs. All TEs not matching this criteria are discarded.
#'  \item The relative number of N's (= nucleotide not known) in TE <= 0.1. The relative number of N's is computed as follows: absolute number of N's in TE / width of TE.
#' }
#' @examples 
#' # example prediction file generated by LTRpred 
#' pred.file <- system.file("Athaliana_TAIR10_chr_all_LTRpred_DataSheet.csv", package = "LTRpred")
#' # read LTRpred generated prediction file (data sheet)
#' pred <- read.ltrpred(pred.file)
#' # apply quality filter
#' pred <- quality.filter(pred, sim = 70, n.orfs = 1)
#' @return A quality filtered \code{LTRpred.tbl}.
#' @seealso \code{\link{LTRpred}}, \code{\link{LTRpred.meta}}, \code{\link{read.ltrpred}}, \code{\link{quality.filter.meta}} 
#' @export 

quality.filter <- function(pred, sim, n.orfs, strategy = "default"){
    
  if (!is.element(strategy, c("default", "stringent")))
    stop(
      "Please choose a quality filter strategy that is supported, e.g. strategy = 'default' or strategy = 'stringent'."
    )
  
  # try to reduce false positives by filtering for PBS and ORFs
  ltr_similarity <- PBS_start <- protein_domain <- orfs <- NULL
  TE_N_abs <- width <- dfam_target_name <- NULL
  
  message(
    "The LTRpred prediction table has been filtered (default) to remove potential false positives. Predicted LTRs must have an PBS or Protein Domain and must fulfill thresholds: sim = ",
    sim,
    "%; #orfs = ",
    n.orfs,
    ". Furthermore, TEs having more than 10% of N's in their sequence have also been removed."
  )
  message("/n")
  message("Input non-unique #TEs: ", length(pred$ID))
  message("Input unique #TEs: ", length(unique(pred$ID)))
  filtered.res <- dplyr::filter(
    pred,
    (TE_N_abs / width) <= 0.1,
    ltr_similarity >= sim,
    (!is.na(PBS_start)) |
      (!is.na(protein_domain)),
    orfs >= n.orfs
  )
  
  if (strategy == "stringent") {
    filtered.res <-
      dplyr::filter(filtered.res, (!is.na(protein_domain)) |
                      (dfam_target_name != "unknown"))
  }
  
  message("Output non-unique #TEs: ", length(filtered.res$ID))
  message("Output unique #TEs: ", length(unique(filtered.res$ID)))
  return(filtered.res)   
}