#' @title Query the RepBase to annotate putative LTRs
#' @description Validate or annotate putative LTR
#' transposons that have been predicted using LTRharvest or LTRdigest.
#' @param query file path to the putative LTR transposon sequences in \code{fasta} format.
#' @param repbase.path file path to the RepBase file in \code{fasta} format.
#' @param output file name of the BLAST output.
#' @param max.target.seqs maximum number of hits that shall be retrieved that still fulfill the e-value criterium.
#' Default is \code{max.target.seqs = 10000}.
#' @param eval e-value threshold for BLAST hit detection. Default is \code{eval = 1E-10}.
#' @param cores number of cores to use to perform parallel computations.
#' @author Hajk-Georg Drost
#' @details
#' The RepBase database provides a collection of curated transposable element annotations.
#'
#' This function allows users to validate or annotate putative LTR
#' transposons that have been predicted using LTRharvest or LTRdigest by blasting predicted LTR transposons
#' to transposons known (annotated) in other species (e.g. such as Arabidopsis thaliana).
#'
#' Internally, this function performs a \code{blastn} search of the putative LTR transposons predicted
#' by LTRharvest or LTRdigest against the Repbase fasta file that is specified by the user.
#'
#' For this purpose it is required that the user has a working version of BLAST+ running on his or her machine.
#' @examples
#' \dontrun{
#' # Example annotation run against the A thaliana RepBase using 4 cores
#' q <- repbase.query(seq.file = "path/to/LTRtransposonSeqs.fasta",
#' repbase.path = "path/to/Athaliana_repbase.ref",
#' cores = 4)
#'
#' Annot <- dplyr::select(dplyr::filter(dplyr::group_by(q,query_id),
#' (bit_score == max(bit_score))),
#' query_id:q_len,evalue,bit_score,scope)
#' # select only hits with a scope > 0.1
#' Annot.HighMatches <- dplyr::filter(Annot, scope >= 0.1)
#' # Annotate the proportion of hits
#' barplot(sort(table(unlist(lapply(stringr::str_split(
#' names(table(Annot.HighMatches$subject_id)),"_"),
#' function(x) x[2]))), decreasing = TRUE))
#' }
#' @references
#' http://www.girinst.org/repbase/
#'
#' Altschul, S.F., Gish, W., Miller, W., Myers, E.W. & Lipman, D.J. (1990) "Basic local alignment search tool." J. Mol. Biol. 215:403-410.
#'
#' Gish, W. & States, D.J. (1993) "Identification of protein coding regions by database similarity search." Nature Genet. 3:266-272.
#'
#' Madden, T.L., Tatusov, R.L. & Zhang, J. (1996) "Applications of network BLAST server" Meth. Enzymol. 266:131-141.
#'
#' Altschul, S.F., Madden, T.L., Schaeffer, A.A., Zhang, J., Zhang, Z., Miller, W. & Lipman, D.J. (1997) "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs." Nucleic Acids Res. 25:3389-3402.
#'
#' Zhang Z., Schwartz S., Wagner L., & Miller W. (2000), "A greedy algorithm for aligning DNA sequences" J Comput Biol 2000; 7(1-2):203-14.
#' @export
blast_repbase <- function(query,
repbase.path,
output = "RepbaseOutput.txt",
max.target.seqs = 10000,
eval = 1E-10,
cores = 1) {
is_blast_installed()
s_len <- alig_length <- NULL
# Create a blast-able database of repbase and perform blastn search of
# LTR transposons against this blast formatted repbase file
system(
paste0(
"makeblastdb -in ",
repbase.path,
" -parse_seqids -input_type fasta -dbtype nucl"
)
)
system(
paste0(
"blastn -db ",
repbase.path,
" -query ",
query,
" -out ",
output ,
" ",
"-evalue ",
eval,
" -max_target_seqs ",
max.target.seqs,
" -num_threads ",
cores,
" -dust no -outfmt '6 qseqid sseqid pident nident
length mismatch gapopen gaps positive ppos qstart qend qlen qcovs qcovhsp qseq sstart send slen sseq evalue bitscore score'"
)
)
# Define the column names of the BLAST output
BLASTColNames <- c(
"query_id",
"subject_id",
"perc_identity",
"num_ident_matches",
"alig_length",
"mismatches",
"gap_openings",
"n_gaps",
"pos_match",
"ppos",
"q_start",
"q_end",
"q_len",
"qcov",
"qcovhsp",
"query_seq",
"s_start",
"s_end",
"s_len",
"subject_seq",
"evalue",
"bit_score",
"score_raw"
)
# Read the BLAST output and define the columns
BLASTRepBaseOutput <- readr::read_tsv(output, col_names = FALSE)
colnames(BLASTRepBaseOutput) <- BLASTColNames
# Add an additional variable named 'scope' to the BLAST output
# Scope is defined by the formula: 1 - (abs(s_len - alig_length) / s_len)
# and is computed for each hit of the BLAST output
# This scope measure allows to quantify the percentage of sequence similarity
# over the entire query and subject sequence length.
# A scope of 0.5 can be interpreted as a 50% coverage (in terms of sequence similarity)
# found between the query and subject DNA string
BLASTRepBaseOutput <-
dplyr::mutate(BLASTRepBaseOutput, scope = 1 - (abs(s_len - alig_length) / s_len))
return(BLASTRepBaseOutput)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.