R/LTRpred.R

Defines functions LTRpred

Documented in LTRpred

#' @title Predict LTR retrotransposons in a given genome
#' @description Main pipeline to perform
#' sufficient LTR retrotransposon predictions for any genome of interest.
#' @param genome.file path to the genome file in \code{fasta} format.
#' @param centromere_start a numeric vector storing the centromere start coordinates in the \code{genome.file}. The position in the numeric vector should correspond to the chromosome name in the \code{genome.file} fasta file. If \code{centromere_start = NULL} (default), then no centromeres will be drawn.
#' @param centromere_end a numeric vector storing the centromere end coordinates in the \code{genome.file}. The position in the numeric vector should correspond to the chromosome name in the \code{genome.file} fasta file. If \code{centromere_end = NULL} (default), then no centromeres will be drawn.
#' @param ltr_age_estimation a logical value indicating wether or not ltr-age estimation shall be performed (default \code{ltr_age_estimation = TRUE}).
#' @param model a model as specified in \code{\link[ape]{dist.dna}}: a character string specifying the evolutionary model to be used - must be one of
#'  \itemize{
#' \item  \code{K80} (the default)
#' \item \code{raw}
#' \item  \code{N}
#' \item  \code{TS}
#' \item  \code{TV}
#' \item  \code{JC69}
#' \item  \code{F81} 
#' \item \code{K81}
#' \item \code{F84}
#' \item \code{BH87}
#' \item \code{T92}
#' \item \code{TN93}
#' \item \code{GG95}
#' \item \code{logdet}
#' \item \code{paralin}
#' }
#' @param mutation_rate a mutation rate per site per year. For retrotransposons the default is \eqn{mutation_rate = 1.3 * 10E-8} (Wicker and Keller, 2007).
#' @param index.file.harvest specify the name of the enhanced suffix array index file that is computed
#'  by \code{suffixerator} for the use of \code{LTRharvest}. This often can be used in case the suffix file was previously 
#'  generated, e.g. during a previous call of this function. In this case the suffix array index
#'  file does not need to be re-computed for new analyses. This is particularly useful when 
#'  running \code{LTRpred} with different parameter settings. A possible index file name could be \code{BASENAME_index.fsa}.
#' @param index.file.digest specify the name of the enhanced suffix array index file that is computed
#'  by \code{suffixerator} for the use of \code{LTRdigest}. This often can be used in case the suffix file was previously 
#'  generated, e.g. during a previous call of this function. In this case the suffix array index
#'  file does not need to be re-computed for new analyses. This is particularly useful when 
#'  running \code{LTRpred} with different parameter settings. A possible index file name could be \code{BASENAME_index_ltrdigest.fsa}.
#' @param LTRdigest.gff path to the LTRdigest generated GFF file, in case LTRdigest files were pre-computed previously.
#' @param LTRpred.folder name/path of/to an existing LTRpred folder.
#' @param tabout.file path to the LTRdigest generated tabout file file, in case LTRdigest files were pre-computed previously.
#' @param LTRharvest.folder either \code{LTRharvest.folder = NULL} (default) or \code{LTRharvest.folder = "skip"} to skip moving a LTRharvest folder if it is not present.
#' @param orf.file path to the file generated by \code{\link{ORFpred}}, in case the orf prediction file was generated previously.
#' @param annotate annotation database that shall be queried to annotate predicted LTR transposons.
#' Default is \code{annotate = NULL} indicating that no annotation query is being performed.
#' Possible options are: \code{annotate = "Dfam"} (here the Dfam database must be stored locally and a nhammer search is performed against the Dfam database) or \code{annotate = "Repbase"} (here the Repbase database must be stored locally and a blastn search is performed against the Repbase database). Please consult the vignettes for details.
#' @param Dfam.db folder path to the local Dfam database or \code{Dfam.db = "download"} in case the Dfam
#'  database shall be automatically downloaded before performing query analyses.
#' @param dfam.eval E-value threshhold to perform HMMer search against the Dfam database.
#' @param dfam.file path to pre-computed \code{\link{dfam.query}} output file. Can only be used in combination
#' with \code{annotate = "Dfam"}.
#' @param cluster shall predicted transposons be clustered with \code{\link{CLUSTpred}}.
#' @param clust.sim cluster reject if sequence similarity is lower than this threshold when performing clustering with \code{\link{CLUSTpred}}.
#' @param clust.file file path to pre-computed clustering file generated by \code{\link{CLUSTpred}}.
#' @param copy.number.est shall copy number estimation (including solo LTR prediction) be performed? Default is \code{copy.number.est = FALSE}.
#' @param fix.chr.name shall chromosome names be fixed?
#' @param cn.eval evalue for copy number estimation (BLAST hit threshold).
#' @param range define the genomic interval in which predicted LTR transposons shall be reported
#' . In case \code{range[1] = 1000} and \code{range[2] = 10000} then candidates are only 
#' reported if they start after position 1000 and end before position 10000 in their respective 
#' sequence coordinates. If \code{range[1] = 0} and \code{range[2] = 0}, 
#' so \code{range = c(0,0)} (default) then the entire genome is being scanned.
#' @param seed  the minimum length for the exact maximal repeats. Only repeats with the specified minimum length are considered in all subsequent analyses. Default is \code{seed = 30}.
#' @param minlenltr minimum LTR length. Default is \code{minlenltr = 100}. 
#' @param maxlenltr maximum LTR length. Default is \code{maxlenltr = 3500}.
#' @param mindistltr minimum distance of LTR starting positions. Default is \code{mindistltr = 4000}.
#' @param maxdistltr maximum distance of LTR starting positions. Default is \code{maxdistltr = 25000}.
#' @param similar minimum similarity value between the two LTRs in percent. \code{similar = 70}.
#' @param mintsd minimum target site duplications (TSDs) length. If no search for TSDs
#' shall be performed, then specify \code{mintsd = NULL}. Default is \code{mintsd = 4}.
#' @param maxtsd maximum target site duplications (TSDs) length. If no search for TSDs
#' shall be performed, then specify \code{maxtsd = NULL}. Default is \code{maxtsd = 20}.
#' @param vic number of nucleotide positions left and right (the vicinity) of the predicted
#'  boundary of a LTR that will be searched for TSDs and/or one motif (if specified). 
#'  Default is \code{vic = 60}.
#' @param overlaps specify how overlapping LTR retrotransposon predictions shall be treated. 
#' If \code{overlaps = "no"} is selected, then neither nested nor overlapping predictions will be reported in the output. In case \code{overlaps = "best"} is selected then in the case of two or more nested or overlapping predictions, solely the LTR retrotransposon prediction with
#' the highest similarity between its LTRs will be reported.
#' If \code{overlaps = "all"} is selected then all LTR retrotransposon predictions 
#' will be reported whether there are nested and/or overlapping predictions or not. 
#' Default is \code{overlaps = "best"}.
#' @param xdrop specify the xdrop value (> 0) for extending a seed repeat in both directions
#'  allowing for matches, mismatches, insertions, and deletions. The xdrop extension process
#'   stops as soon as the extension involving matches, mismatches, insersions, and deletions 
#'   has a score smaller than T - X, where T denotes the largest score seen so far. Default is \code{xrop = 5}.
#' @param mat specify the positive match score for the X-drop extension process. Default is \code{mat = 2}.
#' @param mis specify the negative mismatch score for the X-drop extension process. Default is \code{mis = -2}.
#' @param ins specify the negative insertion score for the X-drop extension process. Default is \code{ins = -3}.
#' @param del specify the negative deletion score for the X-drop extension process. Default is \code{del = -3}.
#' @param motif specify 2 nucleotides for the starting motif and 2 nucleotides for the ending
#'  motif at the beginning and the ending of each LTR, respectively.
#'  Only palindromic motif sequences - where the motif sequence is equal to its complementary
#'  sequence read backwards - are allowed, e.g. \code{motif = "tgca"}. Type the nucleotides without any space
#'  separating them. If this option is not selected by the user, candidate pairs will not be
#'  screened for potential motifs. If this options is set but no allowed number of
#'  mismatches is specified by the argument \code{motifmis} and a search for the exact 
#'  motif will be conducted. If \code{motif = NULL} then no explicit motif is being specified.
#' @param motifmis allowed number of mismatches in the TSD motif specified in \code{motif}.
#' The number of mismatches needs to be between [0,3].  Default is \code{motifmis = 0}.
#' @param aaout shall the protein sequence of the HMM matches to the predicted LTR transposon 
#' be generated as fasta file or not. Options are \code{aaout = "yes"} or \code{aaout = "no"}.
#' @param aliout shall the alignment of the protein sequence of the HMM matches to the predicted LTR transposon 
#' be generated as fasta file or not. Options are \code{aaout = "yes"} or \code{aaout = "no"}.
#' @param pptlen a two dimensional numeric vector specifying the minimum and maximum allowed
#' lengths for PPT predictions. If a purine-rich region that does not fulfill this range is
#' found, it will be discarded. Default is \code{pptlen = c(8,30)} (minimum = 8; maximum = 30).
#' @param uboxlen a two dimensional numeric vector specifying the minimum and maximum allowed
#' lengths for U-box predictions. If a T-rich region preceding a PPT that does not fulfill the PPT length criteria is
#' found, it will be discarded. Default is \code{uboxlen = c(3,30)} (minimum = 3; maximum = 30).
#' @param pptradius a numeric value specifying the area around the 3' LTR beginning to be 
#' considered when searching for PPT. Default value is \code{pptradius = 30}.
#' @param trnas path to the fasta file storing the unique tRNA sequences that shall be matched to the
#' predicted LTR transposon (tRNA library). 
#' @param pbsalilen a two dimensional numeric vector specifying the minimum and maximum allowed
#' lengths for PBS/tRNA alignments. If the local alignments are shorter or longer than this
#' range, it will be discarded. Default is \code{pbsalilen = c(11,30)} (minimum = 11; maximum = 30).
#' @param pbsoffset a two dimensional numeric vector specifying the minimum and maximum allowed
#' distance between the start of the PBS and the 3' end of the 5' LTR. Local alignments not 
#' fulfilling this criteria will be discarded. Default is \code{pbsoffset = c(0,5)} (minimum = 0; maximum = 5).
#' @param pbstrnaoffset a two dimensional numeric vector specifying the minimum and maximum allowed
#' PBS/tRNA alignment offset from the 3' end of the tRNA. Local alignments not 
#' fulfilling this criteria will be discarded. Default is \code{pbstrnaoffset = c(0,5)} (minimum = 0; maximum = 5).
#' @param pbsmaxedist a numeric value specifying the maximal allowed unit edit distance in a
#' local PBS/tRNA alignment.
#' @param pbsradius a numeric value specifying the area around the 5' LTR end to be 
#' considered when searching for PBS Default value is \code{pbsradius = 30}.
#' @param hmms a character string or a character vector storing either the hmm files for
#' searching internal domains between the LTRs of predicted LTR transposons or a vector of
#' Pfam IDs from http://pfam.xfam.org/ that are downloaded and used to search for corresponding protein domains
#' within the predicted LTR transposons. As an option users can rename all of their hmm files
#' so that they start for example with the name \code{hmms = "hmm_*"}. This way all files starting with 
#' \code{hmm_} will be considered for the subsequent protein domain search. In case Pfam IDs 
#' are specified, the \code{LTRpred} function will automatically download the corresponding 
#' HMM files and use them for further protein domain searches. In case users prefer to specify 
#' Pfam IDs please specify them in the \code{pfam.ids} parmeter and choose \code{hmms = NULL}.  
#' @param pdomevalcutoff a numeric value specifying the E-value cutoff for corresponding HMMER searches. All hits that do not fulfill this criteria are discarded. Default is \code{pdomevalcutoff = 1E-5}.
#' @param pbsmatchscore specify the match score used in the PBS/tRNA Smith-Waterman alignment.
#' Default is \code{pbsmatchscore = 5}.
#' @param pbsmismatchscore specify the mismatch score used in the PBS/tRNA Smith-Waterman alignment.
#' Default is \code{pbsmismatchscore = -10}.
#' @param pbsinsertionscore specify the insertion score used in the PBS/tRNA Smith-Waterman alignment.
#' Default is \code{pbsinsertionscore = -20}.
#' @param pbsdeletionscore specify the deletion score used in the PBS/tRNA Smith-Waterman alignment.
#' Default is \code{pbsdeletionscore = -20}.
#' @param pfam.ids a character vector storing the Pfam IDs from http://pfam.xfam.org/
#' that shall be downloaded and used to perform protein domain searches within the sequences
#' between the predicted LTRs.
#' @param cores number of cores to be used for multicore processing.
#' @param dfam.cores number of cores to be used for multicore processing when running Dfam query (in case \code{annotate = "Dfam"}).
#' @param hmm.cores number of cores to be used for multicore processing when performing hmmer protein search with \code{\link{LTRdigest}}.
#' @param orf.style type of predicting open reading frames (see documentation of USEARCH).
#' @param min.codons minimum number of codons in the predicted open reading frame.
#' @param trans.seqs logical value indicating wheter or not predicted open reading frames
#' shall be translated and the corresponding protein sequences stored in the output folder.
#' @param output.path a path/folder to store all results returned by \code{\link{LTRharvest}}, \code{\link{LTRdigest}}, and \code{LTRpred}. 
#' If \code{output.path = NULL} (Default) then a folder with the name of the input genome file
#' will be generated in the current working directory of R and all results are then stored in this folder.
#' @param quality.filter shall false positives be filtered out as much as possible? Default is \code{quality.filter = TRUE}. 
#' See \code{Description} for details.
#' @param n.orfs minimum number of Open Reading Frames that must be found between the LTRs (if \code{quality.filter = TRUE}). See \code{Details} for further information on quality control.
#' @param job_num a job number in case this function is run in parallel mode in \code{\link{LTRpred.meta}}.
#' @param verbose shall further information be printed on the console or not. 
#' @author Hajk-Georg Drost
#' @details This function provides the main pipeline to perform \code{de novo} LTR transposon
#' predictions.
#' 
#' \strong{Quality Control}
#' 
#' \itemize{
#' \item \code{ltr.similarity}: Minimum similarity between LTRs. All TEs not matching this
#'  criteria are discarded.
#'  \item \code{n.orfs}: minimum number of Open Reading Frames that must be found between the
#'   LTRs. All TEs not matching this criteria are discarded.
#'  \item \code{PBS or Protein Match}: elements must either have a predicted Primer Binding
#'  Site or a protein match of at least one protein (Gag, Pol, Rve, ...) between their LTRs. All TEs not matching this criteria are discarded.
#'  \item The relative number of N's (= nucleotide not known) in TE <= 0.1. The relative number of N's is computed as follows: absolute number of N's in TE / width of TE.
#' }
#' 
#' @seealso \code{\link{LTRharvest}}, \code{\link{LTRdigest}}, 
#' \code{\link{read.prediction}}, \code{\link{read.tabout}}, \code{\link{read.seqs}},
#' \code{\link{pred2fasta}}, \code{\link{pred2gff}}
#' @importFrom magrittr %>%
#' @return 
#' The \code{LTRpred} function generates the following output files:
#' 
#' \itemize{
#' \item *_BetweenLTRSeqs.fsa : DNA sequences predicted LTR retrotransposons, in particular of the region between the LTRs in fasta format. 
#' \item *_Details.tsv : A spread sheet containing detailed information about the predicted LTRs.
#' \item *_FullLTRRetrotransposonSeqs.fsa : DNA sequences of the entire predicted LTR retrotransposon.
#' \item *_index.fsa.* : The suffixarray index file used to predict putative LTR retrotransposons.
#' \item *_Prediction.gff : A spread sheet containing detailed additional information about the predicted LTRs (partially redundant with the *_Details.tsv file).
#' \item *_index_ltrdigest.fsa : The suffixarray index file used to predict putative LTR retrotransposonswith \code{LTRdigest}.
#' \item *_LTRdigestPrediction.gff : A spread sheet containing detailed information about the predicted LTRs.
#' \item *-ltrdigest_tabout.csv : A spread sheet containing additional detailed information about the predicted LTRs.
#' \item *-ltrdigest_complete.fas : The full length DNA sequences of all predicted LTR transposons.
#' \item *-ltrdigest_conditions.csv : Contains information about the parameters used for a given
#' \code{LTRdigest} run.
#' \item *-ltrdigest_pbs.fas : Stores the predicted PBS sequences for the putative LTR retrotransposons.
#' \item *-ltrdigest_ppt.fas : Stores the predicted PPT sequences for the putative LTR retrotransposons.
#' \item *-ltrdigest_5ltr.fas and *-ltrdigest_3ltr.fas: Stores the predicted 5' and 3' LTR sequences. Note: If the direction of the putative retrotransposon could be predicted, these
#' files will contain the corresponding 3' and 5' LTR sequences. If no direction could be
#' predicted, forward direction with regard to the original sequence will be assumed by 
#' \code{LTRdigest}, i.e. the 'left' LTR will be considered the 5' LTR.
#' \item *-ltrdigest_pdom_<domainname>.fas : Stores the DNA sequences of the HMM matches
#' to the LTR retrotransposon candidates. 
#' \item *-ltrdigest_pdom_<domainname>_aa.fas : Stores the concatenated protein sequences of 
#' the HMM matches to the LTR retrotransposon candidates.
#' \item *-ltrdigest_pdom_<domainname>_ali.fas : Stores the alignment information for all matches of the given protein domain model to the translations of all candidates.
#' \item *_ORF_prediction_nt.fsa : Stores the predicted open reading frames within the predicted LTR transposons as DNA sequence.
#' \item *_ORF_prediction_aa.fsa : Stores the predicted open reading frames within the predicted LTR transposons as protein sequence.
#' \item *_LTRpred.gff : Stores the \code{LTRpred} predicted LTR transposons in GFF format.
#' \item *_LTRpred.bed : Stores the \code{LTRpred} predicted LTR transposons in BED format. 
#' \item *_LTRpred_DataSheet.tsv : Stores the output table as data sheet.
#' }
#' The ' * ' is an place holder for the name of the input genome file.
#' 
#' \strong{If annotate = "Dfam"}
#' In case the Dfam annotation option is specified the following
#' additional files are generated and stored in the LTRpred result folder:
#' 
#' \itemize{
#' \item *-ltrdigest_complete.fas_DfamAnnotation.out : a data table storing the output of the HMMer search of predicted retrotransposons against the Dfam database. 
#' }
#' 
#' \strong{if cluster = TRUE}
#' \itemize{
#' \item *-ltrdigest_complete.fas_CLUSTpred.blast6out : usearch cluster result in BLAST table format.
#' \item *-ltrdigest_complete.fas_CLUSTpred.log : log file of usearch run.
#' \item *-ltrdigest_complete.fas_CLUSTpred.uc : usearch cluster output file.
#' }  
#'       
#' @examples 
#' 
#' \dontrun{
#' # generate de novo LTR transposon prediction
#' LTRpred(genome.file = system.file("Hsapiens_ChrY.fa", package = "LTRpred"),
#'           trnas       = system.file("hg38-tRNAs.fa", package = "LTRpred"),
#'           hmms        = paste0(system.file("HMMs/", package = "LTRpred"),"hmm_*"))
#' 
#' # run LTRpred with pre-computed predictions from LTRdigest()  
#' genome <- system.file("TAIR10_chr_all_LTRdigestPrediction.gff",package = "LTRpred")   
#' tabout <- system.file("TAIR10_chr_all-ltrdigest_tabout.csv",package = "LTRpred")
#' orf.pred <- system.file("nt.fa",package = "LTRpred")              
#' LTRpred(LTRdigest.gff = genome,
#'         tabout.file   = tabout,
#'         orf.file      = orf.pred)
#'}
#'
#' @references 
#' R Edgar. Search and clustering orders of magnitude faster than BLAST. Bioinformatics (2010) 26 (19): 2460-2461.
#' 
#' D Ellinghaus, S Kurtz and U Willhoeft. LTRharvest, an efficient and flexible software for de novo detection of LTR retrotransposons. BMC Bioinformatics (2008). 9:18.
#' 
#' S Steinbiss et al. Fine-grained annotation and classification of de novo predicted LTR retrotransposons. Nucl. Acids Res. (2009) 37 (21): 7002-7013.
#' @export

LTRpred <- function(genome.file       = NULL,
                    centromere_start = NULL,
                    centromere_end = NULL,
                    ltr_age_estimation = TRUE,
                    model = "K80",
                    mutation_rate = 1.3 * 1e-07,
                    index.file.harvest = NULL,
                    index.file.digest = NULL,
                    LTRdigest.gff     = NULL,
                    tabout.file       = NULL,
                    LTRharvest.folder = NULL,
                    LTRpred.folder    = NULL,
                    orf.file          = NULL,
                    annotate          = NULL,
                    Dfam.db           = NULL,
                    dfam.eval         = 1e-3,
                    dfam.file         = NULL,
                    cluster           = FALSE,
                    clust.sim         = 0.9,
                    clust.file        = NULL,
                    copy.number.est   = FALSE,
                    fix.chr.name      = FALSE,
                    cn.eval           = 1e-5,
                    range             = c(0,0),
                    seed              = 30,
                    minlenltr         = 100,
                    maxlenltr         = 3500,
                    mindistltr        = 4000,
                    maxdistltr        = 25000,
                    similar           = 70,
                    mintsd            = 4,
                    maxtsd            = 20,
                    vic               = 60,
                    overlaps          = "no",
                    xdrop             = 5,
                    mat               = 2,
                    mis               = -2,
                    ins               = -3,
                    del               = -3,
                    motif             = NULL,
                    motifmis          = 0,
                    aaout             = "yes",
                    aliout            = "yes",
                    pptlen            = c(8,30),
                    uboxlen           = c(3,30),
                    pptradius         = 30,
                    trnas             = NULL,
                    pbsalilen         = c(11,30),
                    pbsoffset         = c(0,5),
                    pbstrnaoffset     = c(0,5),
                    pbsmaxedist       = 1,
                    pbsradius         = 30,
                    hmms              = NULL,
                    pdomevalcutoff    = 1E-5,
                    pbsmatchscore     = 5,
                    pbsmismatchscore  = -10,
                    pbsinsertionscore = -20,
                    pbsdeletionscore  = -20,
                    pfam.ids          = NULL,
                    cores             = 1,
                    dfam.cores        = NULL,
                    hmm.cores         = NULL,
                    orf.style         = 7,
                    min.codons        = 200,
                    trans.seqs        = FALSE,
                    output.path       = NULL,
                    quality.filter    = TRUE,
                    n.orfs            = 0,
                    job_num           = 1,
                    verbose           = TRUE) {
  
    test_installation_gt()  
    test_installation_hmmer()
    test_installation_usearch()
    test_installation_vsearch()
    test_installation_blast()
    test_installation_perl()
    
    message("Running LTRpred on genome '", genome.file, "' with ", cores, " core(s) and searching for retrotransposons using the overlaps option (overlaps = '", overlaps, "') ...")
    message("\n")
    
    if (!is.null(LTRharvest.folder))
        if (LTRharvest.folder != "skip")
            stop(
                "The argument 'LTRharvest.folder' can only either be 'NULL' (default) or 'skip' when LTRharvest folder movement shall be skipped.",
                call. = FALSE
            )
    
    if (!is.null(annotate)) {
        if (!is.element(annotate, c("Repbase", "Dfam")))
            stop("Only Dfam or Repbase can be used to annotate predicted LTR transposons!",
                 call. = FALSE)
    }
    
    if (parallel::detectCores() < cores)
        stop("Your system does not provide the number of cores you specified.",
             call. = FALSE)
    
    if (!is.null(dfam.file) & is.null(annotate))
        stop("Please specify annotate = 'Dfam' when providing a dfam.file path.",
             call. = FALSE)
    
    if (!is.null(genome.file)){
      if (!file.exists(genome.file))
        stop("The genome.file '", genome.file, "' does not seem to exist. Please provide a valid path to a genome assembly file in fasta format.", call. = FALSE)
    }
      
    if (is.null(dfam.cores) && is.null(hmm.cores)) {
        dfam.cores <- hmm.cores <- cores
    }
    
    if ((!is.null(dfam.cores)) && is.null(hmm.cores)) {
        hmm.cores <- cores
    }
    
    if (is.null(dfam.cores) && (!is.null(hmm.cores))) {
        dfam.cores <- cores
    }
    
    if (is.null(hmms)) {
      message("No hmm files were specified, thus the internal HMM library will be used! See '",
              paste0(system.file("HMMs/", package = "LTRpred"), "hmm_*"),"' for details.")
      hmms <- paste0(system.file("HMMs/", package = "LTRpred"), "hmm_*")
    }
      
      
    if (is.null(trnas)) {
      message("No tRNA files were specified, thus the internal tRNA library will be used! See '",
              system.file("tRNAs/tRNA_library.fa", package = "LTRpred"),"' for details.")
      trnas <- system.file("tRNAs/tRNA_library.fa", package = "LTRpred")
    }
      
    
    chromosome_ltrharvest <- start <- end <- query_name <- bits <- dfam_target_name <- NULL
    Type <- Cluster <- Perc_Ident <- Clust_Cluster <- Clust_Target <- Clust_cn <- NULL
    query_id <- cn_3ltr <- cn_5ltr <- orfs <- species <- ltr_similarity <- similarity <- NULL
    protein_domain <- strand <- annotation <- pred_tool <- frame <- score <- NULL
    lLTR_start <- `PBS/tRNA_edist` <- repeat_region_length <- PBS_length <- dfam_acc <- NULL
    Query <- Target <- n <- NULL
    
    if (is.null(output.path)) {
        if (!is.null(genome.file)) {
            output.path <-
                file.path(getwd(), paste0(unlist(stringr::str_split(basename(
                    genome.file
                ), "[.]"))[1], "_ltrpred"))
        } 
        
        if (is.null(genome.file) & is.null(LTRpred.folder)) {
            output.path <-
                file.path(getwd(), paste0(unlist(stringr::str_split(basename(
                    LTRdigest.gff
                ), "[.]"))[1], "_ltrpred"))
        }
        
        if (is.null(genome.file) & !is.null(LTRpred.folder)) {
            output.path <- LTRpred.folder
        }
        
      if (!is.null(output.path)) {
        if (dir.exists(output.path)) {
          message(
            "The output folder '",
            output.path,
            "' seems to exist already and will be used to store LTRpred results ..."
          )
          #unlink(output.path, recursive = TRUE)
          #dir.create(output.path)
        } else {
          message("The output folder '",
                  output.path,
                  "' does not seem to exist yet and will be created ...")
          dir.create(output.path)
        }
      }
    }
    
    rTSD_end <- lTSD_start <- PPT_end <- PPT_start <- PBS_end <- PBS_start <- NULL
    ID <- chromosome <- width <- orf.id <- NULL
    
    message("\n")
    message("LTRpred - Step 1:")
    
    if (!is.null(genome.file) &
        is.null(LTRdigest.gff) & is.null(tabout.file)) {
        LTRharvest(
            genome.file,
            index.file  = index.file.harvest,
            range       = range,
            seed        = seed,
            minlenltr   = minlenltr,
            maxlenltr   = maxlenltr,
            mindistltr  = mindistltr,
            maxdistltr  = maxdistltr,
            similar     = similar,
            mintsd      = mintsd,
            maxtsd      = maxtsd,
            vic         = vic,
            overlaps    = overlaps,
            xdrop       = xdrop,
            mat         = mat,
            mis         = mis,
            ins         = ins,
            del         = del,
            motif       = motif,
            motifmis    = motifmis,
            output.path = NULL,
            verbose     = verbose
        )
        
        chopped.foldername <-
            unlist(stringr::str_split(basename(genome.file), "[.]"))[1]
        
        message("\n")
        message("LTRpred - Step 2:")
        
        LTRdigest(
            input.gff3        = file.path(
                paste0(chopped.foldername, "_ltrharvest"),
                paste0(chopped.foldername, "_Prediction.gff")
            ),
            genome.file       = genome.file,
            aaout             = aaout,
            aliout            = aliout,
            pptlen            = pptlen,
            uboxlen           = uboxlen,
            pptradius         = pptradius,
            trnas             = trnas,
            pbsalilen         = pbsalilen,
            pbsoffset         = pbsoffset,
            pbstrnaoffset     = pbstrnaoffset,
            pbsmaxedist       = pbsmaxedist,
            pbsradius         = pbsradius,
            hmms              = hmms,
            pdomevalcutoff    = pdomevalcutoff,
            pbsmatchscore     = pbsmatchscore,
            pbsmismatchscore  = pbsmismatchscore,
            pbsinsertionscore = pbsinsertionscore,
            pbsdeletionscore  = pbsdeletionscore,
            pfam.ids          = pfam.ids,
            cores             = hmm.cores,
            index.file        = index.file.digest,
            output.path       = NULL
        )
        
        message("\n")
        message("LTRpred - Step 3:")
        message("Import LTRdigest Predictions...")

        LTRdigestOutput <-
            read.prediction(
                gff.file    = file.path(
                    paste0(chopped.foldername, "_ltrdigest"),
                    paste0(chopped.foldername, "_LTRdigestPrediction.gff")
                ),
                tabout.file = file.path(
                    paste0(chopped.foldername, "_ltrdigest"),
                    paste0(chopped.foldername, "-ltrdigest_tabout.csv")
                ),
                program     = "LTRdigest"
            )
    } else {
        LTRdigestOutput <- read.prediction(gff.file    = LTRdigest.gff,
                                           tabout.file = tabout.file,
                                           program     = "LTRdigest")
    }
    
    if (is.null(orf.file) & !is.null(LTRdigestOutput)) {
        chopped.foldername <-
            unlist(stringr::str_split(basename(genome.file), "[.]"))[1]
        
        if (is.null(LTRdigest.gff)) {
            folder_path <- getwd()
        } else {
            folder_path <-
                unlist(stringr::str_split(LTRdigest.gff, .Platform$file.sep))
            exclude <- c(length(folder_path) - 1, length(folder_path))
            if (exclude[1] == 1) {
                folder_path <- getwd()
            } else {
                folder_path <-
                    stringr::str_c(folder_path[-exclude], collapse = .Platform$file.sep)
            }
        }
        
        if (any(is.na(LTRdigestOutput)))
          return(paste0("No prediction for ", genome.file))
     
        message("\n")
        message("LTRpred - Step 4:")
        message("Perform ORF Prediction using 'usearch -fastx_findorfs' ...")
        ORFTable <-
            ORFpred(
                seq.file = file.path(
                    folder_path,
                    paste0(chopped.foldername, "_ltrdigest"),
                    paste0(chopped.foldername, "-ltrdigest_complete.fas")
                ),
                orf.style  = orf.style,
                min.codons = min.codons,
                trans.seqs = trans.seqs,
                output     = output.path
            )
    } else {
        if (is.null(LTRdigestOutput)) {
            chopped.foldername <-
                unlist(stringr::str_split(basename(genome.file), "[.]"))[1]
            folder_path <-
                unlist(stringr::str_split(LTRdigest.gff, .Platform$file.sep))
            exclude <- c(length(folder_path) - 1, length(folder_path))
            if (exclude[1] == 1) {
                folder_path <- getwd()
            } else {
                folder_path <-
                    stringr::str_c(folder_path[-exclude], collapse = .Platform$file.sep)
            }
            message(
                file.path(
                    folder_path,
                    paste0(chopped.foldername, "_ltrdigest")
                ),
                " is empty and therefore ORF prediction has been omitted."
            )
        } else {
            ORFTable <- read.orfs(input.file = orf.file)
        }
    }
    
    if (!is.null(LTRdigestOutput)) {
        
        if (nrow(LTRdigestOutput$ltr.retrotransposon) > 0) {
            LTRdigestOutput$ltr.retrotransposon <-
                dplyr::mutate(LTRdigestOutput$ltr.retrotransposon,
                              orf.id = paste0(chromosome_ltrharvest, "_", start, "_", end))
            
            if (nrow(ORFTable) > 0) {
              LTRdigestOutput$ltr.retrotransposon <-
                dplyr::full_join(LTRdigestOutput$ltr.retrotransposon,
                                 ORFTable,
                                 by = c("orf.id" = "seq.id"))
              
              message(
                "Join ORF Prediction table: nrow(df) = ",
                nrow(LTRdigestOutput$ltr.retrotransposon),
                " candidates."
              )
              message("unique(ID) = ", length(unique(LTRdigestOutput$ltr.retrotransposon$ID)), " candidates.")
              message("unique(orf.id) = ", length(unique(
                LTRdigestOutput$ltr.retrotransposon$orf.id
              )), " candidates.")
            }
            
            LTRdigestOutput$ltr.retrotransposon <-
                dplyr::mutate(LTRdigestOutput$ltr.retrotransposon,
                              repeat_region_length = ifelse(!is.na(rTSD_end), (
                                  as.numeric(rTSD_end) - as.numeric(lTSD_start)
                              ) + 1L, NA))
            
            LTRdigestOutput$ltr.retrotransposon <-
                dplyr::mutate(LTRdigestOutput$ltr.retrotransposon,
                              PPT_length = ifelse(!is.na(PPT_end), (
                                  as.numeric(PPT_end) - as.numeric(PPT_start)
                              ) + 1L, NA))
            
            LTRdigestOutput$ltr.retrotransposon <-
                dplyr::mutate(LTRdigestOutput$ltr.retrotransposon,
                              PBS_length = ifelse(!is.na(PBS_end), (
                                  as.numeric(PBS_end) - as.numeric(PBS_start)
                              ) + 1L, NA))
            
            element_start <-
                element_length <- `width.y` <- `start.y` <- `end.y` <- NULL
            
            res <-
                dplyr::select(
                    LTRdigestOutput$ltr.retrotransposon,
                    -c(
                        element_start,
                        element_length,
                        `width.y`,
                        `start.y`,
                        `end.y`
                    )
                )
            names(res)[c(4, 5, 9)] <- c("start", "end", "width")
            
        } else {
            warning("The LTRgigest prediction file contains no entries! Therefore, no ORF prediction result is joined with the LTRgigest prediction file.")
        }
        
        
        
        #    ProteinMatch <- dplyr::select(LTRdigestOutput$protein.match,ID, start, end, match_width, reading_frame)
        #    colnames(ProteinMatch) <- c("ID","protein_domain_start","protein_domain_end","protein_domain_match_width","protein_domain_reading_frame")
        #
        #    suppressWarnings(LTRdigestOutput$ltr.retrotransposon <- dplyr::inner_join(LTRdigestOutput$ltr.retrotransposon, ProteinMatch, by = "ID"))
        #
        
        # if (nrow(LTRdigestOutput$RR_tract) > 0) {
        #     RR_tract <-
        #         dplyr::select(LTRdigestOutput$RR_tract, ID, start, end, width)
        #     colnames(RR_tract) <-
        #         c("ID",
        #           "RR_tract_start",
        #           "RR_tract_end",
        #           "RR_tract_length")
        #     suppressWarnings(
        #         LTRdigestOutput$ltr.retrotransposon <-
        #             dplyr::inner_join(LTRdigestOutput$ltr.retrotransposon, RR_tract, by = "ID")
        #     )
        # }
        # 
  
        
        if (!is.null(genome.file)) {
            chopped.foldername <-
                unlist(stringr::str_split(basename(genome.file), "[.]"))[1]
            if (is.null(LTRdigest.gff)) {
                folder_path <- getwd()
            } else {
                folder_path <-
                    unlist(stringr::str_split(LTRdigest.gff, .Platform$file.sep))
                exclude <- c(length(folder_path) - 1, length(folder_path))
                if (exclude[1] == 1) {
                    folder_path <- getwd()
                } else {
                    folder_path <-
                        stringr::str_c(folder_path[-exclude], collapse = .Platform$file.sep)
                }
            }
            
            # annotate predicted LTRtransposons
            if (!is.null(annotate)) {
                if (annotate == "Dfam") {
                    if (!is.null(dfam.file)) {
                        Dfam.tbl <- read.dfam(dfam.file = dfam.file)
                    }
                    
                    if (is.null(dfam.file)) {
                        message("A HMMer search against the Dfam database located at '", Dfam.db, "' using ", dfam.cores, " cores is performed to annotate de novo predicted retrotransposons ...")
                        # perform query against the Dfam database
                        predSeqs <-
                            file.path(
                                folder_path,
                                paste0(chopped.foldername, "_ltrdigest"),
                                paste0(
                                    chopped.foldername,
                                    "-ltrdigest_complete.fas"
                                )
                            )
                        dfam.query(
                            seq.file      = predSeqs,
                            Dfam.db       = Dfam.db,
                            eval          = dfam.eval,
                            cores         = dfam.cores,
                            output.folder = folder_path
                        )
                        
                        Dfam.tbl <-
                            read.dfam(dfam.file = file.path(
                                folder_path,
                                paste0(
                                    basename(predSeqs),
                                    "_DfamAnnotation.out"
                                )
                            ))
                    }
                    
                    
                    if (!is.null(Dfam.tbl)) {
                        # join Dfam prediction table with LTRpred table
                        # choose hit in Dfam with highest bit score
                        Dfam.tbl.grouped <-
                            dplyr::filter(dplyr::group_by(Dfam.tbl, query_name),
                                          bits == max(bits))
                        names(Dfam.tbl.grouped)[3] <- "orf.id"
                        names(Dfam.tbl.grouped)[-3] <-
                            paste0("dfam_", names(Dfam.tbl.grouped)[-3])
                        
                        suppressWarnings(res <-
                            dplyr::left_join(res, Dfam.tbl.grouped, by = "orf.id"))
                        
                        message("Join Dfam Annotation table: nrow(df) = ",nrow(res), " candidates.")
                        message("unique(ID) = ",length(unique(res$ID)), " candidates.")
                        message("unique(orf.id) = ",length(unique(res$orf.id)), " candidates.")

                        res <-
                            dplyr::mutate(res,
                                          dfam_target_name = ifelse(
                                              is.na(dfam_target_name),
                                              "unknown",
                                              as.character(dfam_target_name)
                                          ))
                    }
                    
                    if (is.null(Dfam.tbl)) {
                        warning("The Dfam file has not been joined with the prediction file!") 
                    }
                    
                }
            }
            if (is.null(annotate)) {
                res <- dplyr::mutate(
                    res,
                    dfam_target_name = rep(NA, nrow(res)),
                    dfam_acc = rep(NA, nrow(res)),
                    dfam_bits = rep(NA, nrow(res)),
                    dfam_e_value = rep(NA, nrow(res)),
                    dfam_bias = rep(NA, nrow(res)),
                    `dfam_hmm-st` = rep(NA, nrow(res)),
                    `dfam_hmm-en` = rep(NA, nrow(res)),
                    `dfam_strand` = rep(NA, nrow(res)),
                    `dfam_ali-st` = rep(NA, nrow(res)),
                    `dfam_ali-en` = rep(NA, nrow(res)),
                    `dfam_env-st` = rep(NA, nrow(res)),
                    `dfam_env-en` = rep(NA, nrow(res)),
                    `dfam_modlen` = rep(NA, nrow(res)),
                    dfam_target_description = rep(NA, nrow(res))
                )
            }
                # if (annotate == "Repbase") {
                #     repbase.clean()
                #     repbase.query()
                #     repbase.filter()
                # }
            
            
            # perform transposon clustering
            if (!cluster &&
                !is.null(clust.file))
                warning(
                    "Please specify 'cluster = TRUE' when specifying 'clust.file'! Since 'cluster = FALSE' no clustering will be performed!"
                )
            if (cluster) {
                message("Perform clustering of similar LTR transposons using 'vsearch --cluster_fast' ...")
                predSeqs <-
                    file.path(
                        folder_path,
                        paste0(chopped.foldername, "_ltrdigest"),
                        paste0(chopped.foldername, "-ltrdigest_complete.fas")
                    )
                
                if (is.null(clust.file)) {
                    cluster.file <- CLUSTpred(
                                            file       = predSeqs,
                                            similarity = clust.sim,
                                            cores      = cores,
                                            output     = output.path
                                            )
                    
                    # # read cluster output generated by CLUSTpred()
                    # if (is.null(output.path)) {
                    #     cluster.file <- read.uc("CLUSTpred.uc")
                    # }
                    # if (!is.null(output.path)) {
                    #     cluster.file <- read.uc(file.path(output.path, "CLUSTpred.uc"))
                    # }
                }
                
                if (!is.null(clust.file)) {
                    # read cluster output generated by CLUSTpred()
                    cluster.file <- read.uc(clust.file)
                }
                
                # check if cluster file is empty
                if (nrow(cluster.file) > 0) {
                    cluster.file.type.H <-
                        dplyr::filter(cluster.file, Type == "H")
                    cluster.file.type.H <-
                        dplyr::select(cluster.file.type.H,
                                      Cluster,
                                      Query,
                                      Target,
                                      Perc_Ident)
                    names(cluster.file.type.H) <-
                        paste0("Clust_", names(cluster.file.type.H))
                    names(cluster.file.type.H)[2] <- "orf.id"
                    
                    res <-
                        dplyr::left_join(res, cluster.file.type.H, by = "orf.id")
                    
                    message("Join Cluster table: nrow(df) = ",nrow(res), " candidates.")
                    message("unique(ID) = ",length(unique(res$ID)), " candidates.")
                    message("unique(orf.id) = ",length(unique(res$orf.id)), " candidates.")

                    res <-
                        dplyr::mutate(
                            res,
                            Clust_Cluster = ifelse(
                                is.na(Clust_Cluster),
                                "unique",
                                as.character(paste0("cl_", Clust_Cluster))
                            ),
                            Clust_Target = ifelse(
                                is.na(Clust_Target),
                                "none",
                                as.character(Clust_Target)
                            )
                        )
                    
                    # compute copy number of clustered elements
                    Clust.cn <-
                        dplyr::filter(
                            dplyr::summarise(dplyr::group_by(res, Clust_Cluster), Clust_cn = dplyr::n()),
                            Clust_Cluster != "unique"
                        )
                    
                    # join copy number information of clustered elements with result table
                    suppressWarnings(res <-
                        dplyr::left_join(res, Clust.cn, by = "Clust_Cluster"))
                    
                    message("Join Cluster Copy Number table: nrow(df) = ",nrow(res), " candidates.")
                    message("unique(ID) = ",length(unique(res$ID)), " candidates.")
                    message("unique(orf.id)) = ",length(unique(res$orf.id)), " candidates.")

                    res <-
                        dplyr::mutate(res, Clust_cn = ifelse(is.na(Clust_cn), 1, as.numeric(Clust_cn) + 1))
                }
                
                if (nrow(cluster.file) == 0) {
                    warning("CLUSTpred: The cluster file was empty and therefore has not been joined with the prediction file.", call. = FALSE)   
                }
            } else {
                res <- dplyr::mutate(res, Clust_Cluster = rep(NA, nrow(res)),
                                     Clust_Target = rep(NA, nrow(res)),
                                     Clust_Perc_Ident = rep(NA, nrow(res)),
                                     Clust_cn = rep(NA, nrow(res))
                                     )
                
            }
            
            # perform methylation mark counting
            message("\n")
            message("LTRpred - Step 5:")
            message("Perform methylation context quantification..")

            full.te.seq <-
                file.path(
                    folder_path,
                    paste0(chopped.foldername, "_ltrdigest"),
                    paste0(chopped.foldername, "-ltrdigest_complete.fas")
                )
            seq_3ltr <-
                file.path(
                    folder_path,
                    paste0(chopped.foldername, "_ltrdigest"),
                    paste0(chopped.foldername, "-ltrdigest_3ltr.fas")
                )
            
            seq_5ltr <-
                file.path(
                    folder_path,
                    paste0(chopped.foldername, "_ltrdigest"),
                    paste0(chopped.foldername, "-ltrdigest_3ltr.fas")
                )
            
            if ((!file.exists(full.te.seq)) || (!file.exists(seq_3ltr)) || (!file.exists(seq_5ltr))) {
                warning("At least one of the files: '",full.te.seq,"', '",seq_3ltr,"', '",seq_5ltr,"' does not exist and therefore Methylation Context Quantification is omitted!", call. = FALSE)
            } else {
                # count CG: absolute
                full.te.seq.CG.abs <-
                    motif.count(seq.file = full.te.seq,
                                motif    = "CG")
                seq_3ltr.CG.abs <- motif.count(seq.file = seq_3ltr,
                                               motif    = "CG")
                seq_5ltr.CG.abs <- motif.count(seq.file = seq_5ltr,
                                               motif    = "CG")
                # count CG: relative
                full.te.seq.CG.rel <-
                    motif.count(seq.file = full.te.seq,
                                motif    = "CG",
                                as.ratio = TRUE)
                seq_3ltr.CG.rel <- motif.count(seq.file = seq_3ltr,
                                               motif    = "CG",
                                               as.ratio = TRUE)
                seq_5ltr.CG.rel <- motif.count(seq.file = seq_5ltr,
                                               motif    = "CG",
                                               as.ratio = TRUE)
                
                
                # count CHG: absolute
                full.te.seq.CHG.abs <-
                    motif.count(seq.file = full.te.seq,
                                motif    = "CHG")
                seq_3ltr.CHG.abs <- motif.count(seq.file = seq_3ltr,
                                                motif    = "CHG")
                seq_5ltr.CHG.abs <- motif.count(seq.file = seq_5ltr,
                                                motif    = "CHG")
                
                # count CHG: relative
                full.te.seq.CHG.rel <-
                    motif.count(seq.file = full.te.seq,
                                motif    = "CHG",
                                as.ratio = TRUE)
                seq_3ltr.CHG.rel <- motif.count(seq.file = seq_3ltr,
                                                motif    = "CHG",
                                                as.ratio = TRUE)
                seq_5ltr.CHG.rel <- motif.count(seq.file = seq_5ltr,
                                                motif    = "CHG",
                                                as.ratio = TRUE)
                
                # count CHH: absolute
                full.te.seq.CHH.abs <-
                    motif.count(seq.file = full.te.seq,
                                motif    = "CHH")
                seq_3ltr.CHH.abs <- motif.count(seq.file = seq_3ltr,
                                                motif    = "CHH")
                seq_5ltr.CHH.abs <- motif.count(seq.file = seq_5ltr,
                                                motif    = "CHH")
                
                # count CHH: relative
                full.te.seq.CHH.rel <-
                    motif.count(seq.file = full.te.seq,
                                motif    = "CHH",
                                as.ratio = TRUE)
                seq_3ltr.CHH.rel <- motif.count(seq.file = seq_3ltr,
                                                motif    = "CHH",
                                                as.ratio = TRUE)
                seq_5ltr.CHH.rel <- motif.count(seq.file = seq_5ltr,
                                                motif    = "CHH",
                                                as.ratio = TRUE)
                
                # count CCG: absolute
                full.te.seq.CCG.abs <-
                    motif.count(seq.file = full.te.seq,
                                motif    = "CCG")
                seq_3ltr.CCG.abs <- motif.count(seq.file = seq_3ltr,
                                                motif    = "CCG")
                seq_5ltr.CCG.abs <- motif.count(seq.file = seq_5ltr,
                                                motif    = "CCG")
                
                # count CCG: relative
                full.te.seq.CCG.rel <-
                    motif.count(seq.file = full.te.seq,
                                motif    = "CCG",
                                as.ratio = TRUE)
                seq_3ltr.CCG.rel <- motif.count(seq.file = seq_3ltr,
                                                motif    = "CCG",
                                                as.ratio = TRUE)
                seq_5ltr.CCG.rel <- motif.count(seq.file = seq_5ltr,
                                                motif    = "CCG",
                                                as.ratio = TRUE)
                
                # count N's: to assess the quality of the corresponding sequences
                full.te.seq.N.abs <-
                    motif.count(seq.file = full.te.seq,
                                motif    = "N",
                                as.ratio = FALSE)
                seq_3ltr.N.abs <- motif.count(seq.file = seq_3ltr,
                                              motif    = "N",
                                              as.ratio = FALSE)
                seq_5ltr.N.abs <- motif.count(seq.file = seq_5ltr,
                                              motif    = "N",
                                              as.ratio = FALSE)
                
                
                if (!identical(names(full.te.seq.CHG.rel), names(seq_3ltr.CHG.rel)))
                    stop("Order of names in files: '",
                         full.te.seq,
                         "' and '",
                         seq_3ltr,
                         "' do not match.",
                         call. = FALSE
                    )
                
                # construct count data.frame
                count.df <- data.frame(
                    orf.id = names(full.te.seq.CHG.rel),
                    TE_CG_abs = full.te.seq.CG.abs,
                    TE_CG_rel = full.te.seq.CG.rel,
                    TE_CHG_abs = full.te.seq.CHG.abs,
                    TE_CHG_rel = full.te.seq.CHG.rel,
                    TE_CHH_abs = full.te.seq.CHH.abs,
                    TE_CHH_rel = full.te.seq.CHH.rel,
                    TE_CCG_abs = full.te.seq.CCG.abs,
                    TE_CCG_rel = full.te.seq.CCG.rel,
                    TE_N_abs   = full.te.seq.N.abs,
                    CG_3ltr_abs = seq_3ltr.CG.abs,
                    CG_3ltr_rel = seq_3ltr.CG.rel,
                    CHG_3ltr_abs = seq_3ltr.CHG.abs,
                    CHG_3ltr_rel = seq_3ltr.CHG.rel,
                    CHH_3ltr_abs = seq_3ltr.CHH.abs,
                    CHH_3ltr_rel = seq_3ltr.CHH.rel,
                    CCG_3ltr_abs = seq_3ltr.CCG.abs,
                    CCG_3ltr_rel = seq_3ltr.CCG.rel,
                    N_3ltr_abs = seq_3ltr.N.abs,
                    CG_5ltr_abs = seq_5ltr.CG.abs,
                    CG_5ltr_rel = seq_5ltr.CG.rel,
                    CHG_5ltr_abs = seq_5ltr.CHG.abs,
                    CHG_5ltr_rel = seq_5ltr.CHG.rel,
                    CHH_5ltr_abs = seq_5ltr.CHH.abs,
                    CHH_5ltr_rel = seq_5ltr.CHH.rel,
                    CCG_5ltr_abs = seq_5ltr.CCG.abs,
                    CCG_5ltr_rel = seq_5ltr.CCG.rel,
                    N_5ltr_abs = seq_5ltr.N.abs
                )
                
                suppressWarnings(res <-
                    dplyr::left_join(res, dplyr::tbl_df(count.df), by = "orf.id"))
                message("Join methylation context (CG, CHG, CHH, CCG) count table: nrow(df) = ",nrow(res), " candidates.")
                message("unique(ID) = ",length(unique(res$ID)), " candidates.")
                message("unique(orf.id) = ",length(unique(res$orf.id)), " candidates.")
            }
            
            message("Copy files to result folder '",output.path,"'.")
            # perform internal file handling
            if (is.null(LTRharvest.folder)) {
                if (file.exists(file.path(
                    folder_path,
                    paste0(chopped.foldername, "_ltrharvest")
                ))) {
                    if (!file.exists(file.path(
                        output.path,
                        paste0(chopped.foldername, "_ltrharvest")
                    ))) {
                        file.move(file.path(
                            folder_path,
                            paste0(chopped.foldername, "_ltrharvest")
                        ),
                        file.path(
                            output.path,
                            paste0(chopped.foldername, "_ltrharvest")
                        ))   
                    }
                }
            }
            
            if (file.exists(file.path(
                folder_path,
                paste0(chopped.foldername, "_ltrdigest")
            ))) {
                if (!file.exists(file.path(
                    output.path,
                    paste0(chopped.foldername, "_ltrdigest")
                ))) {
                    file.move(file.path(
                        folder_path,
                        paste0(chopped.foldername, "_ltrdigest")
                    ),
                    file.path(
                        output.path,
                        paste0(chopped.foldername, "_ltrdigest")
                    ))   
                }
            }
            
            pred2gff(res, file.path(
                output.path,
                paste0(chopped.foldername, "_LTRpred.gff")
            ))
            pred2bed(res, file.path(
                output.path,
                paste0(chopped.foldername, "_LTRpred.bed")
            ))
          
            if ((!is.null(annotate)) && (annotate == "Dfam")) {
              if (is.null(dfam.file)) {
                  file.move(file.path(
                      folder_path,
                      paste0(chopped.foldername, "-ltrdigest_complete.fas", "_DfamAnnotation.out")
                  ),
                  file.path(
                      output.path,
                      paste0(chopped.foldername, "-ltrdigest_complete.fas", "_DfamAnnotation.out")
                  ))
              }
              if (!is.null(dfam.file)) {
                if (file.exists(dfam.file)) {
                    file.move(dfam.file,
                              file.path(
                                  output.path,
                                  basename(dfam.file)
                              ))
                  }
               }
            }
        } 
        
        if (is.null(genome.file)) {
            if (file.exists(LTRdigest.gff))
                file.move(LTRdigest.gff, file.path(output.path, LTRdigest.gff))
            
            if (file.exists(tabout.file))
                file.move(tabout.file, file.path(output.path, tabout.file))
            
            pred2gff(res, file.path(output.path, paste0(
                basename(LTRdigest.gff), "_LTRpred.gff"
            )))
            pred2bed(res, file.path(output.path, paste0(
                basename(LTRdigest.gff), "_LTRpred.bed"
            )))
            
            if ((!is.null(annotate)) && (annotate == "Dfam")) {
                if (is.null(dfam.file)) {
                    if (file.exists(file.path(
                        folder_path,
                        paste0(chopped.foldername, "-ltrdigest_complete.fas", "_DfamAnnotation.out")
                    ))) {
                        file.move(file.path(
                            folder_path,
                            paste0(chopped.foldername, "-ltrdigest_complete.fas", "_DfamAnnotation.out")
                        ),
                        file.path(
                            output.path,
                            paste0(chopped.foldername, "-ltrdigest_complete.fas", "_DfamAnnotation.out")
                        )) 
                    }
                }
                if (!is.null(dfam.file)) {
                    if (file.exists(dfam.file)) {
                        file.move(dfam.file,
                                  file.path(
                                      output.path,
                                      basename(dfam.file)
                                  ))
                    }
                }
            }
        }
        
        #return(res)
    }
    
    if (is.null(LTRdigestOutput)) {
        if (!is.null(genome.file)) {
            chopped.foldername <-
                unlist(stringr::str_split(basename(genome.file), "[.]"))[1]
            folder_path <-
                unlist(stringr::str_split(LTRdigest.gff, .Platform$file.sep))
            exclude <- c(length(folder_path) - 1, length(folder_path))
            if (exclude[1] == 1) {
                folder_path <- getwd()
            } else {
                folder_path <-
                    stringr::str_c(folder_path[-exclude], collapse = .Platform$file.sep)
            }
            
            if (is.null(LTRharvest.folder)) {
                if (file.exists(file.path(
                    folder_path,
                    paste0(chopped.foldername, "_ltrharvest")
                ))) {
                    file.move(file.path(
                        folder_path,
                        paste0(chopped.foldername, "_ltrharvest")
                    ),
                    file.path(
                        output.path,
                        paste0(chopped.foldername, "_ltrharvest")
                    ))
                }
            }
            
            if (file.exists(file.path(
                folder_path,
                paste0(chopped.foldername, "_ltrdigest")
            ))) {
                file.move(file.path(
                    folder_path,
                    paste0(chopped.foldername, "_ltrdigest")
                ),
                file.path(
                    output.path,
                    paste0(chopped.foldername, "_ltrdigest")
                ))
            }
        } else {
            if (file.exists(LTRdigest.gff))
                file.move(LTRdigest.gff, file.path(output.path, LTRdigest.gff))
            
            if (file.exists(tabout.file))
                file.move(tabout.file, file.path(output.path, tabout.file))
        }
    }
    
    res <- dplyr::mutate(res, pred_tool = rep("LTRpred", nrow(res)))
    res <- dplyr::mutate(res, species = rep(chopped.foldername, nrow(res)))
    res <- dplyr::mutate(res, ID = paste0(chopped.foldername,"_",ID))
    
    if (nrow(ORFTable) > 0) {
      res <- dplyr::mutate(res, orfs = ifelse(is.na(orfs), 0, orfs))
    } else {
      res <- dplyr::mutate(res, orfs = rep(0, nrow(res)))
    }
  
    if (fix.chr.name) {
        # generate accurate chromosome naming (sometimes chromosome names are chopped because the naming convention is violated)
      message("Chromosome names are being fixed ...")
        chr.ids <- unlist(lapply(res$orf.id, function(x) {
            
            s <- unlist(stringr::str_split(x,"_"))
            l <- length(s)
            
            if (l >= 3) {
              s <- s[-c(l - 1, l)]
            }
            
            final_s <- ifelse(!is.na(stringr::str_c(s, collapse = "")), stringr::str_c(s, collapse = ""), "none")
            
            return(final_s)
        }))
        
        if (nrow(res) != length(chr.ids))
          stop("Something went wrong when trying to fix chromosome names!", call. = FALSE)
        
        res <- dplyr::mutate(res, chromosome = chr.ids)
    }
    
    res <-
        dplyr::mutate(res, cn_3ltr = rep(NA, nrow(res)), cn_5ltr = rep(NA, nrow(res)))
    
    message("\n")
    message("LTRpred - Step 6:")
    
    if (ltr_age_estimation) {
      seqs_3ltr <-
        file.path(
          output.path,
          paste0(chopped.foldername, "_ltrdigest"),
          paste0(chopped.foldername, "-ltrdigest_3ltr.fas")
        )
      seqs_5ltr <-
        file.path(
          output.path,
          paste0(chopped.foldername, "_ltrdigest"),
          paste0(chopped.foldername, "-ltrdigest_5ltr.fas")
        )
      
      ltr_age_estimation_res <- ltr_age_estimation(
        res,
        ltr_seqs_3_prime = seqs_3ltr,
        ltr_seqs_5_prime = seqs_5ltr,
        model = model,
        mutation_rate = mutation_rate
      )
      
      ltr_age_mya <- ltr_evo_distance <- NULL
      res <-
        dplyr::left_join(
          res,
          dplyr::select(
            ltr_age_estimation_res,
            orf.id,
            ltr_age_mya,
            ltr_evo_distance
          ),
          by = "orf.id"
        )
      
      res <-
        dplyr::select(
          res,
          species,
          ID,
          dfam_target_name,
          ltr_similarity,
          ltr_age_mya,
          ltr_evo_distance,
          similarity,
          protein_domain,
          orfs,
          chromosome,
          start,
          end,
          strand,
          width,
          annotation,
          pred_tool,
          frame,
          score,
          lLTR_start:`PBS/tRNA_edist`,
          orf.id,
          repeat_region_length:PBS_length,
          dfam_acc:cn_5ltr
        )
    }
    
    if (!ltr_age_estimation){
      message("\n")
      message("This step was skipped, because: 'ltr_age_estimation = ", ltr_age_estimation, "'.")
    }
    
    message("\n")
    message("LTRpred - Step 7:")
    
    if (quality.filter) {
        res <- quality.filter(res, sim = similar, n.orfs = n.orfs)
    }
    
    if (!quality.filter) {
        message("No quality filters were applied, please be aware that your prediction file may include some false positive predictions of retrotransposons. You can still apply quality control criteria using the function LTRpred::quality.filter() ...")
    }
        
    pred2tsv(res, file.path(
        output.path,
        paste0(chopped.foldername, "_LTRpred_DataSheet.tsv")
    ))
    
    # perform copy number estimation analysis
    if (copy.number.est) {
        data_sheet <-
            file.path(output.path,
                      paste0(chopped.foldername, "_LTRpred_DataSheet.tsv"))
        seqs_3ltr <-
            file.path(
                output.path,
                paste0(chopped.foldername, "_ltrdigest"),
                paste0(chopped.foldername, "-ltrdigest_3ltr.fas")
            )
        seqs_5ltr <-
            file.path(
                output.path,
                paste0(chopped.foldername, "_ltrdigest"),
                paste0(chopped.foldername, "-ltrdigest_5ltr.fas")
            )
        
        message("Perform solo LTR Copy Number Estimation....")
        solo.ltr.cn <- ltr.cn(
            data.sheet     = data_sheet,
            LTR.fasta_3ltr = seqs_3ltr,
            LTR.fasta_5ltr = seqs_5ltr,
            genome         = genome.file,
            ltr.similarity = similar,
            eval           = cn.eval,
            cores          = cores
        )
        
        # quantify copy number of LTRs for 3 prime LTR and 5 prime LTR separately
        
        if ((nrow(solo.ltr.cn$pred_3ltr) > 0) && (nrow(solo.ltr.cn$pred_5ltr) > 0) && (!is.null(solo.ltr.cn$pred_3ltr)) && (!is.null(solo.ltr.cn$pred_5ltr))) {
            
            # write estimated solo LTR loci to LTRpred output folder
            cn2bed(
                solo.ltr.cn$pred_3ltr,
                type = "solo",
                filename = paste0(chopped.foldername,"_soloLTRs_3ltr"),
                output = output.path
            )
            
            cn2bed(
                solo.ltr.cn$pred_5ltr,
                type = "solo",
                filename = paste0(chopped.foldername,"_solo_LTRs_5ltr"),
                output = output.path
            )
            
            solo.ltr.cn.n_3ltr <-
                dplyr::summarise(dplyr::group_by(solo.ltr.cn$pred_3ltr, query_id),
                                 cn_3ltr = dplyr::n())
            solo.ltr.cn.n_5ltr <-
                dplyr::summarise(dplyr::group_by(solo.ltr.cn$pred_5ltr, query_id),
                                 cn_5ltr = dplyr::n())
            
            names(solo.ltr.cn.n_3ltr)[1] <- "orf.id"
            names(solo.ltr.cn.n_5ltr)[1] <- "orf.id"
            
            res <- dplyr::left_join(res, solo.ltr.cn.n_3ltr, by = "orf.id")
            
            message("Join solo LTR Copy Number Estimation table: nrow(df) = ",nrow(res), " candidates.")
            message("unique(ID) = ",length(unique(res$ID)), " candidates.")
            message("unique(orf.id) = ",length(unique(res$orf.id)), " candidates.")
            
            res <-
                dplyr::mutate(res, cn_3ltr = ifelse(is.na(cn_3ltr), 0, as.numeric(cn_3ltr)))
            suppressWarnings(res <- dplyr::full_join(res, solo.ltr.cn.n_5ltr, by = "orf.id"))
            res <-
                dplyr::mutate(res, cn_5ltr = ifelse(is.na(cn_5ltr), 0, as.numeric(cn_5ltr)))
            
        } else {
            warning("The LTR copy number estimation returned an empty file. This suggests that there were no solo LTRs found in the input genome sequence. ", call. = FALSE)
            res <-
                dplyr::mutate(res, cn_3ltr = rep(NA, nrow(res)), cn_5ltr = rep(NA, nrow(res)))
        }
    } else {
        res <-
            dplyr::mutate(res, cn_3ltr = rep(NA, nrow(res)), cn_5ltr = rep(NA, nrow(res)))
    }
    
  
    unlink(file.path(
        output.path,
        paste0(chopped.foldername, "_LTRpred_DataSheet.tsv")
    ))
    
    pred2tsv(res, file.path(
        output.path,
        paste0(chopped.foldername, "_LTRpred_DataSheet.tsv")
    ))
    
    pred2gff(res, file.path(
      output.path,
      paste0(chopped.foldername, "_LTRpred.gff")
    ))
    
    pred2bed(res, file.path(
      output.path,
      paste0(chopped.foldername, "_LTRpred.bed")
    ))
    
    # if (!file.exists(file.path(output.path, "Plots")))
    #   dir.create(file.path(output.path, "Plots"))
    # 
    # plot_genomic_distr <- plot_element_distr_along_chromosome(pred = res, genome.file = genome.file, centromere_start = centromere_start, centromere_end = centromere_end)
    
    message("\n")
    # message("Please cite the following paper when using LTRpred for your own research: Benoit, Drost et al., 2019 PloS Genetics, 15(9):e1008370.")
    message("\n")
    message("LTRpred finished all analyses successfully. All output files were stored at '", output.path,"'.")
    
    return(paste0("Successful job ", job_num," ."))
}
HajkD/LTRpred documentation built on April 22, 2022, 4:35 p.m.