#' @title Predict LTR retrotransposons in a given genome
#' @description Main pipeline to perform
#' sufficient LTR retrotransposon predictions for any genome of interest.
#' @param genome.file path to the genome file in \code{fasta} format.
#' @param centromere_start a numeric vector storing the centromere start coordinates in the \code{genome.file}. The position in the numeric vector should correspond to the chromosome name in the \code{genome.file} fasta file. If \code{centromere_start = NULL} (default), then no centromeres will be drawn.
#' @param centromere_end a numeric vector storing the centromere end coordinates in the \code{genome.file}. The position in the numeric vector should correspond to the chromosome name in the \code{genome.file} fasta file. If \code{centromere_end = NULL} (default), then no centromeres will be drawn.
#' @param ltr_age_estimation a logical value indicating wether or not ltr-age estimation shall be performed (default \code{ltr_age_estimation = TRUE}).
#' @param model a model as specified in \code{\link[ape]{dist.dna}}: a character string specifying the evolutionary model to be used - must be one of
#' \itemize{
#' \item \code{K80} (the default)
#' \item \code{raw}
#' \item \code{N}
#' \item \code{TS}
#' \item \code{TV}
#' \item \code{JC69}
#' \item \code{F81}
#' \item \code{K81}
#' \item \code{F84}
#' \item \code{BH87}
#' \item \code{T92}
#' \item \code{TN93}
#' \item \code{GG95}
#' \item \code{logdet}
#' \item \code{paralin}
#' }
#' @param mutation_rate a mutation rate per site per year. For retrotransposons the default is \eqn{mutation_rate = 1.3 * 10E-8} (Wicker and Keller, 2007).
#' @param index.file.harvest specify the name of the enhanced suffix array index file that is computed
#' by \code{suffixerator} for the use of \code{LTRharvest}. This often can be used in case the suffix file was previously
#' generated, e.g. during a previous call of this function. In this case the suffix array index
#' file does not need to be re-computed for new analyses. This is particularly useful when
#' running \code{LTRpred} with different parameter settings. A possible index file name could be \code{BASENAME_index.fsa}.
#' @param index.file.digest specify the name of the enhanced suffix array index file that is computed
#' by \code{suffixerator} for the use of \code{LTRdigest}. This often can be used in case the suffix file was previously
#' generated, e.g. during a previous call of this function. In this case the suffix array index
#' file does not need to be re-computed for new analyses. This is particularly useful when
#' running \code{LTRpred} with different parameter settings. A possible index file name could be \code{BASENAME_index_ltrdigest.fsa}.
#' @param LTRdigest.gff path to the LTRdigest generated GFF file, in case LTRdigest files were pre-computed previously.
#' @param LTRpred.folder name/path of/to an existing LTRpred folder.
#' @param tabout.file path to the LTRdigest generated tabout file file, in case LTRdigest files were pre-computed previously.
#' @param LTRharvest.folder either \code{LTRharvest.folder = NULL} (default) or \code{LTRharvest.folder = "skip"} to skip moving a LTRharvest folder if it is not present.
#' @param orf.file path to the file generated by \code{\link{ORFpred}}, in case the orf prediction file was generated previously.
#' @param annotate annotation database that shall be queried to annotate predicted LTR transposons.
#' Default is \code{annotate = NULL} indicating that no annotation query is being performed.
#' Possible options are: \code{annotate = "Dfam"} (here the Dfam database must be stored locally and a nhammer search is performed against the Dfam database) or \code{annotate = "Repbase"} (here the Repbase database must be stored locally and a blastn search is performed against the Repbase database). Please consult the vignettes for details.
#' @param Dfam.db folder path to the local Dfam database or \code{Dfam.db = "download"} in case the Dfam
#' database shall be automatically downloaded before performing query analyses.
#' @param dfam.eval E-value threshhold to perform HMMer search against the Dfam database.
#' @param dfam.file path to pre-computed \code{\link{dfam.query}} output file. Can only be used in combination
#' with \code{annotate = "Dfam"}.
#' @param cluster shall predicted transposons be clustered with \code{\link{CLUSTpred}}.
#' @param clust.sim cluster reject if sequence similarity is lower than this threshold when performing clustering with \code{\link{CLUSTpred}}.
#' @param clust.file file path to pre-computed clustering file generated by \code{\link{CLUSTpred}}.
#' @param copy.number.est shall copy number estimation (including solo LTR prediction) be performed? Default is \code{copy.number.est = FALSE}.
#' @param fix.chr.name shall chromosome names be fixed?
#' @param cn.eval evalue for copy number estimation (BLAST hit threshold).
#' @param range define the genomic interval in which predicted LTR transposons shall be reported
#' . In case \code{range[1] = 1000} and \code{range[2] = 10000} then candidates are only
#' reported if they start after position 1000 and end before position 10000 in their respective
#' sequence coordinates. If \code{range[1] = 0} and \code{range[2] = 0},
#' so \code{range = c(0,0)} (default) then the entire genome is being scanned.
#' @param seed the minimum length for the exact maximal repeats. Only repeats with the specified minimum length are considered in all subsequent analyses. Default is \code{seed = 30}.
#' @param minlenltr minimum LTR length. Default is \code{minlenltr = 100}.
#' @param maxlenltr maximum LTR length. Default is \code{maxlenltr = 3500}.
#' @param mindistltr minimum distance of LTR starting positions. Default is \code{mindistltr = 4000}.
#' @param maxdistltr maximum distance of LTR starting positions. Default is \code{maxdistltr = 25000}.
#' @param similar minimum similarity value between the two LTRs in percent. \code{similar = 70}.
#' @param mintsd minimum target site duplications (TSDs) length. If no search for TSDs
#' shall be performed, then specify \code{mintsd = NULL}. Default is \code{mintsd = 4}.
#' @param maxtsd maximum target site duplications (TSDs) length. If no search for TSDs
#' shall be performed, then specify \code{maxtsd = NULL}. Default is \code{maxtsd = 20}.
#' @param vic number of nucleotide positions left and right (the vicinity) of the predicted
#' boundary of a LTR that will be searched for TSDs and/or one motif (if specified).
#' Default is \code{vic = 60}.
#' @param overlaps specify how overlapping LTR retrotransposon predictions shall be treated.
#' If \code{overlaps = "no"} is selected, then neither nested nor overlapping predictions will be reported in the output. In case \code{overlaps = "best"} is selected then in the case of two or more nested or overlapping predictions, solely the LTR retrotransposon prediction with
#' the highest similarity between its LTRs will be reported.
#' If \code{overlaps = "all"} is selected then all LTR retrotransposon predictions
#' will be reported whether there are nested and/or overlapping predictions or not.
#' Default is \code{overlaps = "best"}.
#' @param xdrop specify the xdrop value (> 0) for extending a seed repeat in both directions
#' allowing for matches, mismatches, insertions, and deletions. The xdrop extension process
#' stops as soon as the extension involving matches, mismatches, insersions, and deletions
#' has a score smaller than T - X, where T denotes the largest score seen so far. Default is \code{xrop = 5}.
#' @param mat specify the positive match score for the X-drop extension process. Default is \code{mat = 2}.
#' @param mis specify the negative mismatch score for the X-drop extension process. Default is \code{mis = -2}.
#' @param ins specify the negative insertion score for the X-drop extension process. Default is \code{ins = -3}.
#' @param del specify the negative deletion score for the X-drop extension process. Default is \code{del = -3}.
#' @param motif specify 2 nucleotides for the starting motif and 2 nucleotides for the ending
#' motif at the beginning and the ending of each LTR, respectively.
#' Only palindromic motif sequences - where the motif sequence is equal to its complementary
#' sequence read backwards - are allowed, e.g. \code{motif = "tgca"}. Type the nucleotides without any space
#' separating them. If this option is not selected by the user, candidate pairs will not be
#' screened for potential motifs. If this options is set but no allowed number of
#' mismatches is specified by the argument \code{motifmis} and a search for the exact
#' motif will be conducted. If \code{motif = NULL} then no explicit motif is being specified.
#' @param motifmis allowed number of mismatches in the TSD motif specified in \code{motif}.
#' The number of mismatches needs to be between [0,3]. Default is \code{motifmis = 0}.
#' @param aaout shall the protein sequence of the HMM matches to the predicted LTR transposon
#' be generated as fasta file or not. Options are \code{aaout = "yes"} or \code{aaout = "no"}.
#' @param aliout shall the alignment of the protein sequence of the HMM matches to the predicted LTR transposon
#' be generated as fasta file or not. Options are \code{aaout = "yes"} or \code{aaout = "no"}.
#' @param pptlen a two dimensional numeric vector specifying the minimum and maximum allowed
#' lengths for PPT predictions. If a purine-rich region that does not fulfill this range is
#' found, it will be discarded. Default is \code{pptlen = c(8,30)} (minimum = 8; maximum = 30).
#' @param uboxlen a two dimensional numeric vector specifying the minimum and maximum allowed
#' lengths for U-box predictions. If a T-rich region preceding a PPT that does not fulfill the PPT length criteria is
#' found, it will be discarded. Default is \code{uboxlen = c(3,30)} (minimum = 3; maximum = 30).
#' @param pptradius a numeric value specifying the area around the 3' LTR beginning to be
#' considered when searching for PPT. Default value is \code{pptradius = 30}.
#' @param trnas path to the fasta file storing the unique tRNA sequences that shall be matched to the
#' predicted LTR transposon (tRNA library).
#' @param pbsalilen a two dimensional numeric vector specifying the minimum and maximum allowed
#' lengths for PBS/tRNA alignments. If the local alignments are shorter or longer than this
#' range, it will be discarded. Default is \code{pbsalilen = c(11,30)} (minimum = 11; maximum = 30).
#' @param pbsoffset a two dimensional numeric vector specifying the minimum and maximum allowed
#' distance between the start of the PBS and the 3' end of the 5' LTR. Local alignments not
#' fulfilling this criteria will be discarded. Default is \code{pbsoffset = c(0,5)} (minimum = 0; maximum = 5).
#' @param pbstrnaoffset a two dimensional numeric vector specifying the minimum and maximum allowed
#' PBS/tRNA alignment offset from the 3' end of the tRNA. Local alignments not
#' fulfilling this criteria will be discarded. Default is \code{pbstrnaoffset = c(0,5)} (minimum = 0; maximum = 5).
#' @param pbsmaxedist a numeric value specifying the maximal allowed unit edit distance in a
#' local PBS/tRNA alignment.
#' @param pbsradius a numeric value specifying the area around the 5' LTR end to be
#' considered when searching for PBS Default value is \code{pbsradius = 30}.
#' @param hmms a character string or a character vector storing either the hmm files for
#' searching internal domains between the LTRs of predicted LTR transposons or a vector of
#' Pfam IDs from http://pfam.xfam.org/ that are downloaded and used to search for corresponding protein domains
#' within the predicted LTR transposons. As an option users can rename all of their hmm files
#' so that they start for example with the name \code{hmms = "hmm_*"}. This way all files starting with
#' \code{hmm_} will be considered for the subsequent protein domain search. In case Pfam IDs
#' are specified, the \code{LTRpred} function will automatically download the corresponding
#' HMM files and use them for further protein domain searches. In case users prefer to specify
#' Pfam IDs please specify them in the \code{pfam.ids} parmeter and choose \code{hmms = NULL}.
#' @param pdomevalcutoff a numeric value specifying the E-value cutoff for corresponding HMMER searches. All hits that do not fulfill this criteria are discarded. Default is \code{pdomevalcutoff = 1E-5}.
#' @param pbsmatchscore specify the match score used in the PBS/tRNA Smith-Waterman alignment.
#' Default is \code{pbsmatchscore = 5}.
#' @param pbsmismatchscore specify the mismatch score used in the PBS/tRNA Smith-Waterman alignment.
#' Default is \code{pbsmismatchscore = -10}.
#' @param pbsinsertionscore specify the insertion score used in the PBS/tRNA Smith-Waterman alignment.
#' Default is \code{pbsinsertionscore = -20}.
#' @param pbsdeletionscore specify the deletion score used in the PBS/tRNA Smith-Waterman alignment.
#' Default is \code{pbsdeletionscore = -20}.
#' @param pfam.ids a character vector storing the Pfam IDs from http://pfam.xfam.org/
#' that shall be downloaded and used to perform protein domain searches within the sequences
#' between the predicted LTRs.
#' @param cores number of cores to be used for multicore processing.
#' @param dfam.cores number of cores to be used for multicore processing when running Dfam query (in case \code{annotate = "Dfam"}).
#' @param hmm.cores number of cores to be used for multicore processing when performing hmmer protein search with \code{\link{LTRdigest}}.
#' @param orf.style type of predicting open reading frames (see documentation of USEARCH).
#' @param min.codons minimum number of codons in the predicted open reading frame.
#' @param trans.seqs logical value indicating wheter or not predicted open reading frames
#' shall be translated and the corresponding protein sequences stored in the output folder.
#' @param output.path a path/folder to store all results returned by \code{\link{LTRharvest}}, \code{\link{LTRdigest}}, and \code{LTRpred}.
#' If \code{output.path = NULL} (Default) then a folder with the name of the input genome file
#' will be generated in the current working directory of R and all results are then stored in this folder.
#' @param quality.filter shall false positives be filtered out as much as possible? Default is \code{quality.filter = TRUE}.
#' See \code{Description} for details.
#' @param n.orfs minimum number of Open Reading Frames that must be found between the LTRs (if \code{quality.filter = TRUE}). See \code{Details} for further information on quality control.
#' @param job_num a job number in case this function is run in parallel mode in \code{\link{LTRpred.meta}}.
#' @param verbose shall further information be printed on the console or not.
#' @author Hajk-Georg Drost
#' @details This function provides the main pipeline to perform \code{de novo} LTR transposon
#' predictions.
#'
#' \strong{Quality Control}
#'
#' \itemize{
#' \item \code{ltr.similarity}: Minimum similarity between LTRs. All TEs not matching this
#' criteria are discarded.
#' \item \code{n.orfs}: minimum number of Open Reading Frames that must be found between the
#' LTRs. All TEs not matching this criteria are discarded.
#' \item \code{PBS or Protein Match}: elements must either have a predicted Primer Binding
#' Site or a protein match of at least one protein (Gag, Pol, Rve, ...) between their LTRs. All TEs not matching this criteria are discarded.
#' \item The relative number of N's (= nucleotide not known) in TE <= 0.1. The relative number of N's is computed as follows: absolute number of N's in TE / width of TE.
#' }
#'
#' @seealso \code{\link{LTRharvest}}, \code{\link{LTRdigest}},
#' \code{\link{read.prediction}}, \code{\link{read.tabout}}, \code{\link{read.seqs}},
#' \code{\link{pred2fasta}}, \code{\link{pred2gff}}
#' @importFrom magrittr %>%
#' @return
#' The \code{LTRpred} function generates the following output files:
#'
#' \itemize{
#' \item *_BetweenLTRSeqs.fsa : DNA sequences predicted LTR retrotransposons, in particular of the region between the LTRs in fasta format.
#' \item *_Details.tsv : A spread sheet containing detailed information about the predicted LTRs.
#' \item *_FullLTRRetrotransposonSeqs.fsa : DNA sequences of the entire predicted LTR retrotransposon.
#' \item *_index.fsa.* : The suffixarray index file used to predict putative LTR retrotransposons.
#' \item *_Prediction.gff : A spread sheet containing detailed additional information about the predicted LTRs (partially redundant with the *_Details.tsv file).
#' \item *_index_ltrdigest.fsa : The suffixarray index file used to predict putative LTR retrotransposonswith \code{LTRdigest}.
#' \item *_LTRdigestPrediction.gff : A spread sheet containing detailed information about the predicted LTRs.
#' \item *-ltrdigest_tabout.csv : A spread sheet containing additional detailed information about the predicted LTRs.
#' \item *-ltrdigest_complete.fas : The full length DNA sequences of all predicted LTR transposons.
#' \item *-ltrdigest_conditions.csv : Contains information about the parameters used for a given
#' \code{LTRdigest} run.
#' \item *-ltrdigest_pbs.fas : Stores the predicted PBS sequences for the putative LTR retrotransposons.
#' \item *-ltrdigest_ppt.fas : Stores the predicted PPT sequences for the putative LTR retrotransposons.
#' \item *-ltrdigest_5ltr.fas and *-ltrdigest_3ltr.fas: Stores the predicted 5' and 3' LTR sequences. Note: If the direction of the putative retrotransposon could be predicted, these
#' files will contain the corresponding 3' and 5' LTR sequences. If no direction could be
#' predicted, forward direction with regard to the original sequence will be assumed by
#' \code{LTRdigest}, i.e. the 'left' LTR will be considered the 5' LTR.
#' \item *-ltrdigest_pdom_<domainname>.fas : Stores the DNA sequences of the HMM matches
#' to the LTR retrotransposon candidates.
#' \item *-ltrdigest_pdom_<domainname>_aa.fas : Stores the concatenated protein sequences of
#' the HMM matches to the LTR retrotransposon candidates.
#' \item *-ltrdigest_pdom_<domainname>_ali.fas : Stores the alignment information for all matches of the given protein domain model to the translations of all candidates.
#' \item *_ORF_prediction_nt.fsa : Stores the predicted open reading frames within the predicted LTR transposons as DNA sequence.
#' \item *_ORF_prediction_aa.fsa : Stores the predicted open reading frames within the predicted LTR transposons as protein sequence.
#' \item *_LTRpred.gff : Stores the \code{LTRpred} predicted LTR transposons in GFF format.
#' \item *_LTRpred.bed : Stores the \code{LTRpred} predicted LTR transposons in BED format.
#' \item *_LTRpred_DataSheet.tsv : Stores the output table as data sheet.
#' }
#' The ' * ' is an place holder for the name of the input genome file.
#'
#' \strong{If annotate = "Dfam"}
#' In case the Dfam annotation option is specified the following
#' additional files are generated and stored in the LTRpred result folder:
#'
#' \itemize{
#' \item *-ltrdigest_complete.fas_DfamAnnotation.out : a data table storing the output of the HMMer search of predicted retrotransposons against the Dfam database.
#' }
#'
#' \strong{if cluster = TRUE}
#' \itemize{
#' \item *-ltrdigest_complete.fas_CLUSTpred.blast6out : usearch cluster result in BLAST table format.
#' \item *-ltrdigest_complete.fas_CLUSTpred.log : log file of usearch run.
#' \item *-ltrdigest_complete.fas_CLUSTpred.uc : usearch cluster output file.
#' }
#'
#' @examples
#'
#' \dontrun{
#' # generate de novo LTR transposon prediction
#' LTRpred(genome.file = system.file("Hsapiens_ChrY.fa", package = "LTRpred"),
#' trnas = system.file("hg38-tRNAs.fa", package = "LTRpred"),
#' hmms = paste0(system.file("HMMs/", package = "LTRpred"),"hmm_*"))
#'
#' # run LTRpred with pre-computed predictions from LTRdigest()
#' genome <- system.file("TAIR10_chr_all_LTRdigestPrediction.gff",package = "LTRpred")
#' tabout <- system.file("TAIR10_chr_all-ltrdigest_tabout.csv",package = "LTRpred")
#' orf.pred <- system.file("nt.fa",package = "LTRpred")
#' LTRpred(LTRdigest.gff = genome,
#' tabout.file = tabout,
#' orf.file = orf.pred)
#'}
#'
#' @references
#' R Edgar. Search and clustering orders of magnitude faster than BLAST. Bioinformatics (2010) 26 (19): 2460-2461.
#'
#' D Ellinghaus, S Kurtz and U Willhoeft. LTRharvest, an efficient and flexible software for de novo detection of LTR retrotransposons. BMC Bioinformatics (2008). 9:18.
#'
#' S Steinbiss et al. Fine-grained annotation and classification of de novo predicted LTR retrotransposons. Nucl. Acids Res. (2009) 37 (21): 7002-7013.
#' @export
LTRpred <- function(genome.file = NULL,
centromere_start = NULL,
centromere_end = NULL,
ltr_age_estimation = TRUE,
model = "K80",
mutation_rate = 1.3 * 1e-07,
index.file.harvest = NULL,
index.file.digest = NULL,
LTRdigest.gff = NULL,
tabout.file = NULL,
LTRharvest.folder = NULL,
LTRpred.folder = NULL,
orf.file = NULL,
annotate = NULL,
Dfam.db = NULL,
dfam.eval = 1e-3,
dfam.file = NULL,
cluster = FALSE,
clust.sim = 0.9,
clust.file = NULL,
copy.number.est = FALSE,
fix.chr.name = FALSE,
cn.eval = 1e-5,
range = c(0,0),
seed = 30,
minlenltr = 100,
maxlenltr = 3500,
mindistltr = 4000,
maxdistltr = 25000,
similar = 70,
mintsd = 4,
maxtsd = 20,
vic = 60,
overlaps = "no",
xdrop = 5,
mat = 2,
mis = -2,
ins = -3,
del = -3,
motif = NULL,
motifmis = 0,
aaout = "yes",
aliout = "yes",
pptlen = c(8,30),
uboxlen = c(3,30),
pptradius = 30,
trnas = NULL,
pbsalilen = c(11,30),
pbsoffset = c(0,5),
pbstrnaoffset = c(0,5),
pbsmaxedist = 1,
pbsradius = 30,
hmms = NULL,
pdomevalcutoff = 1E-5,
pbsmatchscore = 5,
pbsmismatchscore = -10,
pbsinsertionscore = -20,
pbsdeletionscore = -20,
pfam.ids = NULL,
cores = 1,
dfam.cores = NULL,
hmm.cores = NULL,
orf.style = 7,
min.codons = 200,
trans.seqs = FALSE,
output.path = NULL,
quality.filter = TRUE,
n.orfs = 0,
job_num = 1,
verbose = TRUE) {
test_installation_gt()
test_installation_hmmer()
test_installation_usearch()
test_installation_vsearch()
test_installation_blast()
test_installation_perl()
message("Running LTRpred on genome '", genome.file, "' with ", cores, " core(s) and searching for retrotransposons using the overlaps option (overlaps = '", overlaps, "') ...")
message("\n")
if (!is.null(LTRharvest.folder))
if (LTRharvest.folder != "skip")
stop(
"The argument 'LTRharvest.folder' can only either be 'NULL' (default) or 'skip' when LTRharvest folder movement shall be skipped.",
call. = FALSE
)
if (!is.null(annotate)) {
if (!is.element(annotate, c("Repbase", "Dfam")))
stop("Only Dfam or Repbase can be used to annotate predicted LTR transposons!",
call. = FALSE)
}
if (parallel::detectCores() < cores)
stop("Your system does not provide the number of cores you specified.",
call. = FALSE)
if (!is.null(dfam.file) & is.null(annotate))
stop("Please specify annotate = 'Dfam' when providing a dfam.file path.",
call. = FALSE)
if (!is.null(genome.file)){
if (!file.exists(genome.file))
stop("The genome.file '", genome.file, "' does not seem to exist. Please provide a valid path to a genome assembly file in fasta format.", call. = FALSE)
}
if (is.null(dfam.cores) && is.null(hmm.cores)) {
dfam.cores <- hmm.cores <- cores
}
if ((!is.null(dfam.cores)) && is.null(hmm.cores)) {
hmm.cores <- cores
}
if (is.null(dfam.cores) && (!is.null(hmm.cores))) {
dfam.cores <- cores
}
if (is.null(hmms)) {
message("No hmm files were specified, thus the internal HMM library will be used! See '",
paste0(system.file("HMMs/", package = "LTRpred"), "hmm_*"),"' for details.")
hmms <- paste0(system.file("HMMs/", package = "LTRpred"), "hmm_*")
}
if (is.null(trnas)) {
message("No tRNA files were specified, thus the internal tRNA library will be used! See '",
system.file("tRNAs/tRNA_library.fa", package = "LTRpred"),"' for details.")
trnas <- system.file("tRNAs/tRNA_library.fa", package = "LTRpred")
}
chromosome_ltrharvest <- start <- end <- query_name <- bits <- dfam_target_name <- NULL
Type <- Cluster <- Perc_Ident <- Clust_Cluster <- Clust_Target <- Clust_cn <- NULL
query_id <- cn_3ltr <- cn_5ltr <- orfs <- species <- ltr_similarity <- similarity <- NULL
protein_domain <- strand <- annotation <- pred_tool <- frame <- score <- NULL
lLTR_start <- `PBS/tRNA_edist` <- repeat_region_length <- PBS_length <- dfam_acc <- NULL
Query <- Target <- n <- NULL
if (is.null(output.path)) {
if (!is.null(genome.file)) {
output.path <-
file.path(getwd(), paste0(unlist(stringr::str_split(basename(
genome.file
), "[.]"))[1], "_ltrpred"))
}
if (is.null(genome.file) & is.null(LTRpred.folder)) {
output.path <-
file.path(getwd(), paste0(unlist(stringr::str_split(basename(
LTRdigest.gff
), "[.]"))[1], "_ltrpred"))
}
if (is.null(genome.file) & !is.null(LTRpred.folder)) {
output.path <- LTRpred.folder
}
if (!is.null(output.path)) {
if (dir.exists(output.path)) {
message(
"The output folder '",
output.path,
"' seems to exist already and will be used to store LTRpred results ..."
)
#unlink(output.path, recursive = TRUE)
#dir.create(output.path)
} else {
message("The output folder '",
output.path,
"' does not seem to exist yet and will be created ...")
dir.create(output.path)
}
}
}
rTSD_end <- lTSD_start <- PPT_end <- PPT_start <- PBS_end <- PBS_start <- NULL
ID <- chromosome <- width <- orf.id <- NULL
message("\n")
message("LTRpred - Step 1:")
if (!is.null(genome.file) &
is.null(LTRdigest.gff) & is.null(tabout.file)) {
LTRharvest(
genome.file,
index.file = index.file.harvest,
range = range,
seed = seed,
minlenltr = minlenltr,
maxlenltr = maxlenltr,
mindistltr = mindistltr,
maxdistltr = maxdistltr,
similar = similar,
mintsd = mintsd,
maxtsd = maxtsd,
vic = vic,
overlaps = overlaps,
xdrop = xdrop,
mat = mat,
mis = mis,
ins = ins,
del = del,
motif = motif,
motifmis = motifmis,
output.path = NULL,
verbose = verbose
)
chopped.foldername <-
unlist(stringr::str_split(basename(genome.file), "[.]"))[1]
message("\n")
message("LTRpred - Step 2:")
LTRdigest(
input.gff3 = file.path(
paste0(chopped.foldername, "_ltrharvest"),
paste0(chopped.foldername, "_Prediction.gff")
),
genome.file = genome.file,
aaout = aaout,
aliout = aliout,
pptlen = pptlen,
uboxlen = uboxlen,
pptradius = pptradius,
trnas = trnas,
pbsalilen = pbsalilen,
pbsoffset = pbsoffset,
pbstrnaoffset = pbstrnaoffset,
pbsmaxedist = pbsmaxedist,
pbsradius = pbsradius,
hmms = hmms,
pdomevalcutoff = pdomevalcutoff,
pbsmatchscore = pbsmatchscore,
pbsmismatchscore = pbsmismatchscore,
pbsinsertionscore = pbsinsertionscore,
pbsdeletionscore = pbsdeletionscore,
pfam.ids = pfam.ids,
cores = hmm.cores,
index.file = index.file.digest,
output.path = NULL
)
message("\n")
message("LTRpred - Step 3:")
message("Import LTRdigest Predictions...")
LTRdigestOutput <-
read.prediction(
gff.file = file.path(
paste0(chopped.foldername, "_ltrdigest"),
paste0(chopped.foldername, "_LTRdigestPrediction.gff")
),
tabout.file = file.path(
paste0(chopped.foldername, "_ltrdigest"),
paste0(chopped.foldername, "-ltrdigest_tabout.csv")
),
program = "LTRdigest"
)
} else {
LTRdigestOutput <- read.prediction(gff.file = LTRdigest.gff,
tabout.file = tabout.file,
program = "LTRdigest")
}
if (is.null(orf.file) & !is.null(LTRdigestOutput)) {
chopped.foldername <-
unlist(stringr::str_split(basename(genome.file), "[.]"))[1]
if (is.null(LTRdigest.gff)) {
folder_path <- getwd()
} else {
folder_path <-
unlist(stringr::str_split(LTRdigest.gff, .Platform$file.sep))
exclude <- c(length(folder_path) - 1, length(folder_path))
if (exclude[1] == 1) {
folder_path <- getwd()
} else {
folder_path <-
stringr::str_c(folder_path[-exclude], collapse = .Platform$file.sep)
}
}
if (any(is.na(LTRdigestOutput)))
return(paste0("No prediction for ", genome.file))
message("\n")
message("LTRpred - Step 4:")
message("Perform ORF Prediction using 'usearch -fastx_findorfs' ...")
ORFTable <-
ORFpred(
seq.file = file.path(
folder_path,
paste0(chopped.foldername, "_ltrdigest"),
paste0(chopped.foldername, "-ltrdigest_complete.fas")
),
orf.style = orf.style,
min.codons = min.codons,
trans.seqs = trans.seqs,
output = output.path
)
} else {
if (is.null(LTRdigestOutput)) {
chopped.foldername <-
unlist(stringr::str_split(basename(genome.file), "[.]"))[1]
folder_path <-
unlist(stringr::str_split(LTRdigest.gff, .Platform$file.sep))
exclude <- c(length(folder_path) - 1, length(folder_path))
if (exclude[1] == 1) {
folder_path <- getwd()
} else {
folder_path <-
stringr::str_c(folder_path[-exclude], collapse = .Platform$file.sep)
}
message(
file.path(
folder_path,
paste0(chopped.foldername, "_ltrdigest")
),
" is empty and therefore ORF prediction has been omitted."
)
} else {
ORFTable <- read.orfs(input.file = orf.file)
}
}
if (!is.null(LTRdigestOutput)) {
if (nrow(LTRdigestOutput$ltr.retrotransposon) > 0) {
LTRdigestOutput$ltr.retrotransposon <-
dplyr::mutate(LTRdigestOutput$ltr.retrotransposon,
orf.id = paste0(chromosome_ltrharvest, "_", start, "_", end))
if (nrow(ORFTable) > 0) {
LTRdigestOutput$ltr.retrotransposon <-
dplyr::full_join(LTRdigestOutput$ltr.retrotransposon,
ORFTable,
by = c("orf.id" = "seq.id"))
message(
"Join ORF Prediction table: nrow(df) = ",
nrow(LTRdigestOutput$ltr.retrotransposon),
" candidates."
)
message("unique(ID) = ", length(unique(LTRdigestOutput$ltr.retrotransposon$ID)), " candidates.")
message("unique(orf.id) = ", length(unique(
LTRdigestOutput$ltr.retrotransposon$orf.id
)), " candidates.")
}
LTRdigestOutput$ltr.retrotransposon <-
dplyr::mutate(LTRdigestOutput$ltr.retrotransposon,
repeat_region_length = ifelse(!is.na(rTSD_end), (
as.numeric(rTSD_end) - as.numeric(lTSD_start)
) + 1L, NA))
LTRdigestOutput$ltr.retrotransposon <-
dplyr::mutate(LTRdigestOutput$ltr.retrotransposon,
PPT_length = ifelse(!is.na(PPT_end), (
as.numeric(PPT_end) - as.numeric(PPT_start)
) + 1L, NA))
LTRdigestOutput$ltr.retrotransposon <-
dplyr::mutate(LTRdigestOutput$ltr.retrotransposon,
PBS_length = ifelse(!is.na(PBS_end), (
as.numeric(PBS_end) - as.numeric(PBS_start)
) + 1L, NA))
element_start <-
element_length <- `width.y` <- `start.y` <- `end.y` <- NULL
res <-
dplyr::select(
LTRdigestOutput$ltr.retrotransposon,
-c(
element_start,
element_length,
`width.y`,
`start.y`,
`end.y`
)
)
names(res)[c(4, 5, 9)] <- c("start", "end", "width")
} else {
warning("The LTRgigest prediction file contains no entries! Therefore, no ORF prediction result is joined with the LTRgigest prediction file.")
}
# ProteinMatch <- dplyr::select(LTRdigestOutput$protein.match,ID, start, end, match_width, reading_frame)
# colnames(ProteinMatch) <- c("ID","protein_domain_start","protein_domain_end","protein_domain_match_width","protein_domain_reading_frame")
#
# suppressWarnings(LTRdigestOutput$ltr.retrotransposon <- dplyr::inner_join(LTRdigestOutput$ltr.retrotransposon, ProteinMatch, by = "ID"))
#
# if (nrow(LTRdigestOutput$RR_tract) > 0) {
# RR_tract <-
# dplyr::select(LTRdigestOutput$RR_tract, ID, start, end, width)
# colnames(RR_tract) <-
# c("ID",
# "RR_tract_start",
# "RR_tract_end",
# "RR_tract_length")
# suppressWarnings(
# LTRdigestOutput$ltr.retrotransposon <-
# dplyr::inner_join(LTRdigestOutput$ltr.retrotransposon, RR_tract, by = "ID")
# )
# }
#
if (!is.null(genome.file)) {
chopped.foldername <-
unlist(stringr::str_split(basename(genome.file), "[.]"))[1]
if (is.null(LTRdigest.gff)) {
folder_path <- getwd()
} else {
folder_path <-
unlist(stringr::str_split(LTRdigest.gff, .Platform$file.sep))
exclude <- c(length(folder_path) - 1, length(folder_path))
if (exclude[1] == 1) {
folder_path <- getwd()
} else {
folder_path <-
stringr::str_c(folder_path[-exclude], collapse = .Platform$file.sep)
}
}
# annotate predicted LTRtransposons
if (!is.null(annotate)) {
if (annotate == "Dfam") {
if (!is.null(dfam.file)) {
Dfam.tbl <- read.dfam(dfam.file = dfam.file)
}
if (is.null(dfam.file)) {
message("A HMMer search against the Dfam database located at '", Dfam.db, "' using ", dfam.cores, " cores is performed to annotate de novo predicted retrotransposons ...")
# perform query against the Dfam database
predSeqs <-
file.path(
folder_path,
paste0(chopped.foldername, "_ltrdigest"),
paste0(
chopped.foldername,
"-ltrdigest_complete.fas"
)
)
dfam.query(
seq.file = predSeqs,
Dfam.db = Dfam.db,
eval = dfam.eval,
cores = dfam.cores,
output.folder = folder_path
)
Dfam.tbl <-
read.dfam(dfam.file = file.path(
folder_path,
paste0(
basename(predSeqs),
"_DfamAnnotation.out"
)
))
}
if (!is.null(Dfam.tbl)) {
# join Dfam prediction table with LTRpred table
# choose hit in Dfam with highest bit score
Dfam.tbl.grouped <-
dplyr::filter(dplyr::group_by(Dfam.tbl, query_name),
bits == max(bits))
names(Dfam.tbl.grouped)[3] <- "orf.id"
names(Dfam.tbl.grouped)[-3] <-
paste0("dfam_", names(Dfam.tbl.grouped)[-3])
suppressWarnings(res <-
dplyr::left_join(res, Dfam.tbl.grouped, by = "orf.id"))
message("Join Dfam Annotation table: nrow(df) = ",nrow(res), " candidates.")
message("unique(ID) = ",length(unique(res$ID)), " candidates.")
message("unique(orf.id) = ",length(unique(res$orf.id)), " candidates.")
res <-
dplyr::mutate(res,
dfam_target_name = ifelse(
is.na(dfam_target_name),
"unknown",
as.character(dfam_target_name)
))
}
if (is.null(Dfam.tbl)) {
warning("The Dfam file has not been joined with the prediction file!")
}
}
}
if (is.null(annotate)) {
res <- dplyr::mutate(
res,
dfam_target_name = rep(NA, nrow(res)),
dfam_acc = rep(NA, nrow(res)),
dfam_bits = rep(NA, nrow(res)),
dfam_e_value = rep(NA, nrow(res)),
dfam_bias = rep(NA, nrow(res)),
`dfam_hmm-st` = rep(NA, nrow(res)),
`dfam_hmm-en` = rep(NA, nrow(res)),
`dfam_strand` = rep(NA, nrow(res)),
`dfam_ali-st` = rep(NA, nrow(res)),
`dfam_ali-en` = rep(NA, nrow(res)),
`dfam_env-st` = rep(NA, nrow(res)),
`dfam_env-en` = rep(NA, nrow(res)),
`dfam_modlen` = rep(NA, nrow(res)),
dfam_target_description = rep(NA, nrow(res))
)
}
# if (annotate == "Repbase") {
# repbase.clean()
# repbase.query()
# repbase.filter()
# }
# perform transposon clustering
if (!cluster &&
!is.null(clust.file))
warning(
"Please specify 'cluster = TRUE' when specifying 'clust.file'! Since 'cluster = FALSE' no clustering will be performed!"
)
if (cluster) {
message("Perform clustering of similar LTR transposons using 'vsearch --cluster_fast' ...")
predSeqs <-
file.path(
folder_path,
paste0(chopped.foldername, "_ltrdigest"),
paste0(chopped.foldername, "-ltrdigest_complete.fas")
)
if (is.null(clust.file)) {
cluster.file <- CLUSTpred(
file = predSeqs,
similarity = clust.sim,
cores = cores,
output = output.path
)
# # read cluster output generated by CLUSTpred()
# if (is.null(output.path)) {
# cluster.file <- read.uc("CLUSTpred.uc")
# }
# if (!is.null(output.path)) {
# cluster.file <- read.uc(file.path(output.path, "CLUSTpred.uc"))
# }
}
if (!is.null(clust.file)) {
# read cluster output generated by CLUSTpred()
cluster.file <- read.uc(clust.file)
}
# check if cluster file is empty
if (nrow(cluster.file) > 0) {
cluster.file.type.H <-
dplyr::filter(cluster.file, Type == "H")
cluster.file.type.H <-
dplyr::select(cluster.file.type.H,
Cluster,
Query,
Target,
Perc_Ident)
names(cluster.file.type.H) <-
paste0("Clust_", names(cluster.file.type.H))
names(cluster.file.type.H)[2] <- "orf.id"
res <-
dplyr::left_join(res, cluster.file.type.H, by = "orf.id")
message("Join Cluster table: nrow(df) = ",nrow(res), " candidates.")
message("unique(ID) = ",length(unique(res$ID)), " candidates.")
message("unique(orf.id) = ",length(unique(res$orf.id)), " candidates.")
res <-
dplyr::mutate(
res,
Clust_Cluster = ifelse(
is.na(Clust_Cluster),
"unique",
as.character(paste0("cl_", Clust_Cluster))
),
Clust_Target = ifelse(
is.na(Clust_Target),
"none",
as.character(Clust_Target)
)
)
# compute copy number of clustered elements
Clust.cn <-
dplyr::filter(
dplyr::summarise(dplyr::group_by(res, Clust_Cluster), Clust_cn = dplyr::n()),
Clust_Cluster != "unique"
)
# join copy number information of clustered elements with result table
suppressWarnings(res <-
dplyr::left_join(res, Clust.cn, by = "Clust_Cluster"))
message("Join Cluster Copy Number table: nrow(df) = ",nrow(res), " candidates.")
message("unique(ID) = ",length(unique(res$ID)), " candidates.")
message("unique(orf.id)) = ",length(unique(res$orf.id)), " candidates.")
res <-
dplyr::mutate(res, Clust_cn = ifelse(is.na(Clust_cn), 1, as.numeric(Clust_cn) + 1))
}
if (nrow(cluster.file) == 0) {
warning("CLUSTpred: The cluster file was empty and therefore has not been joined with the prediction file.", call. = FALSE)
}
} else {
res <- dplyr::mutate(res, Clust_Cluster = rep(NA, nrow(res)),
Clust_Target = rep(NA, nrow(res)),
Clust_Perc_Ident = rep(NA, nrow(res)),
Clust_cn = rep(NA, nrow(res))
)
}
# perform methylation mark counting
message("\n")
message("LTRpred - Step 5:")
message("Perform methylation context quantification..")
full.te.seq <-
file.path(
folder_path,
paste0(chopped.foldername, "_ltrdigest"),
paste0(chopped.foldername, "-ltrdigest_complete.fas")
)
seq_3ltr <-
file.path(
folder_path,
paste0(chopped.foldername, "_ltrdigest"),
paste0(chopped.foldername, "-ltrdigest_3ltr.fas")
)
seq_5ltr <-
file.path(
folder_path,
paste0(chopped.foldername, "_ltrdigest"),
paste0(chopped.foldername, "-ltrdigest_3ltr.fas")
)
if ((!file.exists(full.te.seq)) || (!file.exists(seq_3ltr)) || (!file.exists(seq_5ltr))) {
warning("At least one of the files: '",full.te.seq,"', '",seq_3ltr,"', '",seq_5ltr,"' does not exist and therefore Methylation Context Quantification is omitted!", call. = FALSE)
} else {
# count CG: absolute
full.te.seq.CG.abs <-
motif.count(seq.file = full.te.seq,
motif = "CG")
seq_3ltr.CG.abs <- motif.count(seq.file = seq_3ltr,
motif = "CG")
seq_5ltr.CG.abs <- motif.count(seq.file = seq_5ltr,
motif = "CG")
# count CG: relative
full.te.seq.CG.rel <-
motif.count(seq.file = full.te.seq,
motif = "CG",
as.ratio = TRUE)
seq_3ltr.CG.rel <- motif.count(seq.file = seq_3ltr,
motif = "CG",
as.ratio = TRUE)
seq_5ltr.CG.rel <- motif.count(seq.file = seq_5ltr,
motif = "CG",
as.ratio = TRUE)
# count CHG: absolute
full.te.seq.CHG.abs <-
motif.count(seq.file = full.te.seq,
motif = "CHG")
seq_3ltr.CHG.abs <- motif.count(seq.file = seq_3ltr,
motif = "CHG")
seq_5ltr.CHG.abs <- motif.count(seq.file = seq_5ltr,
motif = "CHG")
# count CHG: relative
full.te.seq.CHG.rel <-
motif.count(seq.file = full.te.seq,
motif = "CHG",
as.ratio = TRUE)
seq_3ltr.CHG.rel <- motif.count(seq.file = seq_3ltr,
motif = "CHG",
as.ratio = TRUE)
seq_5ltr.CHG.rel <- motif.count(seq.file = seq_5ltr,
motif = "CHG",
as.ratio = TRUE)
# count CHH: absolute
full.te.seq.CHH.abs <-
motif.count(seq.file = full.te.seq,
motif = "CHH")
seq_3ltr.CHH.abs <- motif.count(seq.file = seq_3ltr,
motif = "CHH")
seq_5ltr.CHH.abs <- motif.count(seq.file = seq_5ltr,
motif = "CHH")
# count CHH: relative
full.te.seq.CHH.rel <-
motif.count(seq.file = full.te.seq,
motif = "CHH",
as.ratio = TRUE)
seq_3ltr.CHH.rel <- motif.count(seq.file = seq_3ltr,
motif = "CHH",
as.ratio = TRUE)
seq_5ltr.CHH.rel <- motif.count(seq.file = seq_5ltr,
motif = "CHH",
as.ratio = TRUE)
# count CCG: absolute
full.te.seq.CCG.abs <-
motif.count(seq.file = full.te.seq,
motif = "CCG")
seq_3ltr.CCG.abs <- motif.count(seq.file = seq_3ltr,
motif = "CCG")
seq_5ltr.CCG.abs <- motif.count(seq.file = seq_5ltr,
motif = "CCG")
# count CCG: relative
full.te.seq.CCG.rel <-
motif.count(seq.file = full.te.seq,
motif = "CCG",
as.ratio = TRUE)
seq_3ltr.CCG.rel <- motif.count(seq.file = seq_3ltr,
motif = "CCG",
as.ratio = TRUE)
seq_5ltr.CCG.rel <- motif.count(seq.file = seq_5ltr,
motif = "CCG",
as.ratio = TRUE)
# count N's: to assess the quality of the corresponding sequences
full.te.seq.N.abs <-
motif.count(seq.file = full.te.seq,
motif = "N",
as.ratio = FALSE)
seq_3ltr.N.abs <- motif.count(seq.file = seq_3ltr,
motif = "N",
as.ratio = FALSE)
seq_5ltr.N.abs <- motif.count(seq.file = seq_5ltr,
motif = "N",
as.ratio = FALSE)
if (!identical(names(full.te.seq.CHG.rel), names(seq_3ltr.CHG.rel)))
stop("Order of names in files: '",
full.te.seq,
"' and '",
seq_3ltr,
"' do not match.",
call. = FALSE
)
# construct count data.frame
count.df <- data.frame(
orf.id = names(full.te.seq.CHG.rel),
TE_CG_abs = full.te.seq.CG.abs,
TE_CG_rel = full.te.seq.CG.rel,
TE_CHG_abs = full.te.seq.CHG.abs,
TE_CHG_rel = full.te.seq.CHG.rel,
TE_CHH_abs = full.te.seq.CHH.abs,
TE_CHH_rel = full.te.seq.CHH.rel,
TE_CCG_abs = full.te.seq.CCG.abs,
TE_CCG_rel = full.te.seq.CCG.rel,
TE_N_abs = full.te.seq.N.abs,
CG_3ltr_abs = seq_3ltr.CG.abs,
CG_3ltr_rel = seq_3ltr.CG.rel,
CHG_3ltr_abs = seq_3ltr.CHG.abs,
CHG_3ltr_rel = seq_3ltr.CHG.rel,
CHH_3ltr_abs = seq_3ltr.CHH.abs,
CHH_3ltr_rel = seq_3ltr.CHH.rel,
CCG_3ltr_abs = seq_3ltr.CCG.abs,
CCG_3ltr_rel = seq_3ltr.CCG.rel,
N_3ltr_abs = seq_3ltr.N.abs,
CG_5ltr_abs = seq_5ltr.CG.abs,
CG_5ltr_rel = seq_5ltr.CG.rel,
CHG_5ltr_abs = seq_5ltr.CHG.abs,
CHG_5ltr_rel = seq_5ltr.CHG.rel,
CHH_5ltr_abs = seq_5ltr.CHH.abs,
CHH_5ltr_rel = seq_5ltr.CHH.rel,
CCG_5ltr_abs = seq_5ltr.CCG.abs,
CCG_5ltr_rel = seq_5ltr.CCG.rel,
N_5ltr_abs = seq_5ltr.N.abs
)
suppressWarnings(res <-
dplyr::left_join(res, dplyr::tbl_df(count.df), by = "orf.id"))
message("Join methylation context (CG, CHG, CHH, CCG) count table: nrow(df) = ",nrow(res), " candidates.")
message("unique(ID) = ",length(unique(res$ID)), " candidates.")
message("unique(orf.id) = ",length(unique(res$orf.id)), " candidates.")
}
message("Copy files to result folder '",output.path,"'.")
# perform internal file handling
if (is.null(LTRharvest.folder)) {
if (file.exists(file.path(
folder_path,
paste0(chopped.foldername, "_ltrharvest")
))) {
if (!file.exists(file.path(
output.path,
paste0(chopped.foldername, "_ltrharvest")
))) {
file.move(file.path(
folder_path,
paste0(chopped.foldername, "_ltrharvest")
),
file.path(
output.path,
paste0(chopped.foldername, "_ltrharvest")
))
}
}
}
if (file.exists(file.path(
folder_path,
paste0(chopped.foldername, "_ltrdigest")
))) {
if (!file.exists(file.path(
output.path,
paste0(chopped.foldername, "_ltrdigest")
))) {
file.move(file.path(
folder_path,
paste0(chopped.foldername, "_ltrdigest")
),
file.path(
output.path,
paste0(chopped.foldername, "_ltrdigest")
))
}
}
pred2gff(res, file.path(
output.path,
paste0(chopped.foldername, "_LTRpred.gff")
))
pred2bed(res, file.path(
output.path,
paste0(chopped.foldername, "_LTRpred.bed")
))
if ((!is.null(annotate)) && (annotate == "Dfam")) {
if (is.null(dfam.file)) {
file.move(file.path(
folder_path,
paste0(chopped.foldername, "-ltrdigest_complete.fas", "_DfamAnnotation.out")
),
file.path(
output.path,
paste0(chopped.foldername, "-ltrdigest_complete.fas", "_DfamAnnotation.out")
))
}
if (!is.null(dfam.file)) {
if (file.exists(dfam.file)) {
file.move(dfam.file,
file.path(
output.path,
basename(dfam.file)
))
}
}
}
}
if (is.null(genome.file)) {
if (file.exists(LTRdigest.gff))
file.move(LTRdigest.gff, file.path(output.path, LTRdigest.gff))
if (file.exists(tabout.file))
file.move(tabout.file, file.path(output.path, tabout.file))
pred2gff(res, file.path(output.path, paste0(
basename(LTRdigest.gff), "_LTRpred.gff"
)))
pred2bed(res, file.path(output.path, paste0(
basename(LTRdigest.gff), "_LTRpred.bed"
)))
if ((!is.null(annotate)) && (annotate == "Dfam")) {
if (is.null(dfam.file)) {
if (file.exists(file.path(
folder_path,
paste0(chopped.foldername, "-ltrdigest_complete.fas", "_DfamAnnotation.out")
))) {
file.move(file.path(
folder_path,
paste0(chopped.foldername, "-ltrdigest_complete.fas", "_DfamAnnotation.out")
),
file.path(
output.path,
paste0(chopped.foldername, "-ltrdigest_complete.fas", "_DfamAnnotation.out")
))
}
}
if (!is.null(dfam.file)) {
if (file.exists(dfam.file)) {
file.move(dfam.file,
file.path(
output.path,
basename(dfam.file)
))
}
}
}
}
#return(res)
}
if (is.null(LTRdigestOutput)) {
if (!is.null(genome.file)) {
chopped.foldername <-
unlist(stringr::str_split(basename(genome.file), "[.]"))[1]
folder_path <-
unlist(stringr::str_split(LTRdigest.gff, .Platform$file.sep))
exclude <- c(length(folder_path) - 1, length(folder_path))
if (exclude[1] == 1) {
folder_path <- getwd()
} else {
folder_path <-
stringr::str_c(folder_path[-exclude], collapse = .Platform$file.sep)
}
if (is.null(LTRharvest.folder)) {
if (file.exists(file.path(
folder_path,
paste0(chopped.foldername, "_ltrharvest")
))) {
file.move(file.path(
folder_path,
paste0(chopped.foldername, "_ltrharvest")
),
file.path(
output.path,
paste0(chopped.foldername, "_ltrharvest")
))
}
}
if (file.exists(file.path(
folder_path,
paste0(chopped.foldername, "_ltrdigest")
))) {
file.move(file.path(
folder_path,
paste0(chopped.foldername, "_ltrdigest")
),
file.path(
output.path,
paste0(chopped.foldername, "_ltrdigest")
))
}
} else {
if (file.exists(LTRdigest.gff))
file.move(LTRdigest.gff, file.path(output.path, LTRdigest.gff))
if (file.exists(tabout.file))
file.move(tabout.file, file.path(output.path, tabout.file))
}
}
res <- dplyr::mutate(res, pred_tool = rep("LTRpred", nrow(res)))
res <- dplyr::mutate(res, species = rep(chopped.foldername, nrow(res)))
res <- dplyr::mutate(res, ID = paste0(chopped.foldername,"_",ID))
if (nrow(ORFTable) > 0) {
res <- dplyr::mutate(res, orfs = ifelse(is.na(orfs), 0, orfs))
} else {
res <- dplyr::mutate(res, orfs = rep(0, nrow(res)))
}
if (fix.chr.name) {
# generate accurate chromosome naming (sometimes chromosome names are chopped because the naming convention is violated)
message("Chromosome names are being fixed ...")
chr.ids <- unlist(lapply(res$orf.id, function(x) {
s <- unlist(stringr::str_split(x,"_"))
l <- length(s)
if (l >= 3) {
s <- s[-c(l - 1, l)]
}
final_s <- ifelse(!is.na(stringr::str_c(s, collapse = "")), stringr::str_c(s, collapse = ""), "none")
return(final_s)
}))
if (nrow(res) != length(chr.ids))
stop("Something went wrong when trying to fix chromosome names!", call. = FALSE)
res <- dplyr::mutate(res, chromosome = chr.ids)
}
res <-
dplyr::mutate(res, cn_3ltr = rep(NA, nrow(res)), cn_5ltr = rep(NA, nrow(res)))
message("\n")
message("LTRpred - Step 6:")
if (ltr_age_estimation) {
seqs_3ltr <-
file.path(
output.path,
paste0(chopped.foldername, "_ltrdigest"),
paste0(chopped.foldername, "-ltrdigest_3ltr.fas")
)
seqs_5ltr <-
file.path(
output.path,
paste0(chopped.foldername, "_ltrdigest"),
paste0(chopped.foldername, "-ltrdigest_5ltr.fas")
)
ltr_age_estimation_res <- ltr_age_estimation(
res,
ltr_seqs_3_prime = seqs_3ltr,
ltr_seqs_5_prime = seqs_5ltr,
model = model,
mutation_rate = mutation_rate
)
ltr_age_mya <- ltr_evo_distance <- NULL
res <-
dplyr::left_join(
res,
dplyr::select(
ltr_age_estimation_res,
orf.id,
ltr_age_mya,
ltr_evo_distance
),
by = "orf.id"
)
res <-
dplyr::select(
res,
species,
ID,
dfam_target_name,
ltr_similarity,
ltr_age_mya,
ltr_evo_distance,
similarity,
protein_domain,
orfs,
chromosome,
start,
end,
strand,
width,
annotation,
pred_tool,
frame,
score,
lLTR_start:`PBS/tRNA_edist`,
orf.id,
repeat_region_length:PBS_length,
dfam_acc:cn_5ltr
)
}
if (!ltr_age_estimation){
message("\n")
message("This step was skipped, because: 'ltr_age_estimation = ", ltr_age_estimation, "'.")
}
message("\n")
message("LTRpred - Step 7:")
if (quality.filter) {
res <- quality.filter(res, sim = similar, n.orfs = n.orfs)
}
if (!quality.filter) {
message("No quality filters were applied, please be aware that your prediction file may include some false positive predictions of retrotransposons. You can still apply quality control criteria using the function LTRpred::quality.filter() ...")
}
pred2tsv(res, file.path(
output.path,
paste0(chopped.foldername, "_LTRpred_DataSheet.tsv")
))
# perform copy number estimation analysis
if (copy.number.est) {
data_sheet <-
file.path(output.path,
paste0(chopped.foldername, "_LTRpred_DataSheet.tsv"))
seqs_3ltr <-
file.path(
output.path,
paste0(chopped.foldername, "_ltrdigest"),
paste0(chopped.foldername, "-ltrdigest_3ltr.fas")
)
seqs_5ltr <-
file.path(
output.path,
paste0(chopped.foldername, "_ltrdigest"),
paste0(chopped.foldername, "-ltrdigest_5ltr.fas")
)
message("Perform solo LTR Copy Number Estimation....")
solo.ltr.cn <- ltr.cn(
data.sheet = data_sheet,
LTR.fasta_3ltr = seqs_3ltr,
LTR.fasta_5ltr = seqs_5ltr,
genome = genome.file,
ltr.similarity = similar,
eval = cn.eval,
cores = cores
)
# quantify copy number of LTRs for 3 prime LTR and 5 prime LTR separately
if ((nrow(solo.ltr.cn$pred_3ltr) > 0) && (nrow(solo.ltr.cn$pred_5ltr) > 0) && (!is.null(solo.ltr.cn$pred_3ltr)) && (!is.null(solo.ltr.cn$pred_5ltr))) {
# write estimated solo LTR loci to LTRpred output folder
cn2bed(
solo.ltr.cn$pred_3ltr,
type = "solo",
filename = paste0(chopped.foldername,"_soloLTRs_3ltr"),
output = output.path
)
cn2bed(
solo.ltr.cn$pred_5ltr,
type = "solo",
filename = paste0(chopped.foldername,"_solo_LTRs_5ltr"),
output = output.path
)
solo.ltr.cn.n_3ltr <-
dplyr::summarise(dplyr::group_by(solo.ltr.cn$pred_3ltr, query_id),
cn_3ltr = dplyr::n())
solo.ltr.cn.n_5ltr <-
dplyr::summarise(dplyr::group_by(solo.ltr.cn$pred_5ltr, query_id),
cn_5ltr = dplyr::n())
names(solo.ltr.cn.n_3ltr)[1] <- "orf.id"
names(solo.ltr.cn.n_5ltr)[1] <- "orf.id"
res <- dplyr::left_join(res, solo.ltr.cn.n_3ltr, by = "orf.id")
message("Join solo LTR Copy Number Estimation table: nrow(df) = ",nrow(res), " candidates.")
message("unique(ID) = ",length(unique(res$ID)), " candidates.")
message("unique(orf.id) = ",length(unique(res$orf.id)), " candidates.")
res <-
dplyr::mutate(res, cn_3ltr = ifelse(is.na(cn_3ltr), 0, as.numeric(cn_3ltr)))
suppressWarnings(res <- dplyr::full_join(res, solo.ltr.cn.n_5ltr, by = "orf.id"))
res <-
dplyr::mutate(res, cn_5ltr = ifelse(is.na(cn_5ltr), 0, as.numeric(cn_5ltr)))
} else {
warning("The LTR copy number estimation returned an empty file. This suggests that there were no solo LTRs found in the input genome sequence. ", call. = FALSE)
res <-
dplyr::mutate(res, cn_3ltr = rep(NA, nrow(res)), cn_5ltr = rep(NA, nrow(res)))
}
} else {
res <-
dplyr::mutate(res, cn_3ltr = rep(NA, nrow(res)), cn_5ltr = rep(NA, nrow(res)))
}
unlink(file.path(
output.path,
paste0(chopped.foldername, "_LTRpred_DataSheet.tsv")
))
pred2tsv(res, file.path(
output.path,
paste0(chopped.foldername, "_LTRpred_DataSheet.tsv")
))
pred2gff(res, file.path(
output.path,
paste0(chopped.foldername, "_LTRpred.gff")
))
pred2bed(res, file.path(
output.path,
paste0(chopped.foldername, "_LTRpred.bed")
))
# if (!file.exists(file.path(output.path, "Plots")))
# dir.create(file.path(output.path, "Plots"))
#
# plot_genomic_distr <- plot_element_distr_along_chromosome(pred = res, genome.file = genome.file, centromere_start = centromere_start, centromere_end = centromere_end)
message("\n")
# message("Please cite the following paper when using LTRpred for your own research: Benoit, Drost et al., 2019 PloS Genetics, 15(9):e1008370.")
message("\n")
message("LTRpred finished all analyses successfully. All output files were stored at '", output.path,"'.")
return(paste0("Successful job ", job_num," ."))
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.