metablastr: Perform Massive Local BLAST Searches

Documented in blast_repbase

#' @title Query the RepBase to annotate putative LTRs
#' @description Validate or annotate putative LTR
#' transposons that have been predicted using LTRharvest or LTRdigest.
#' @param query file path to the putative LTR transposon sequences in \code{fasta} format.
#' @param repbase.path file path to the RepBase file in \code{fasta} format.
#' @param output file name of the BLAST output.
#' @param max.target.seqs maximum number of hits that shall be retrieved that still fulfill the e-value criterium.
#' Default is \code{max.target.seqs = 10000}.
#' @param eval e-value threshold for BLAST hit detection. Default is \code{eval = 1E-10}.
#' @param cores number of cores to use to perform parallel computations.
#' @author Hajk-Georg Drost
#' @details
#' The RepBase database provides a collection of curated transposable element annotations.
#'
#' This function allows users to validate or annotate putative LTR
#' transposons that have been predicted using LTRharvest or LTRdigest by blasting predicted LTR transposons
#' to transposons known (annotated) in other species (e.g. such as Arabidopsis thaliana).
#'
#' Internally, this function performs a \code{blastn} search of the putative LTR transposons predicted
#' by LTRharvest or LTRdigest against the Repbase fasta file that is specified by the user.
#'
#' For this purpose it is required that the user has a working version of BLAST+ running on his or her machine.
#' @examples
#' \dontrun{
#' # Example annotation run against the A thaliana RepBase using 4 cores
#' q <- repbase.query(seq.file     = "path/to/LTRtransposonSeqs.fasta",
#'                   repbase.path = "path/to/Athaliana_repbase.ref",
#'                   cores        = 4)
#'
#' Annot <- dplyr::select(dplyr::filter(dplyr::group_by(q,query_id),
#'                                     (bit_score == max(bit_score))),
#'                                      query_id:q_len,evalue,bit_score,scope)
#' # select only hits with a scope > 0.1
#' Annot.HighMatches <- dplyr::filter(Annot, scope >= 0.1)
#' # Annotate the proportion of hits
#' barplot(sort(table(unlist(lapply(stringr::str_split(
#'         names(table(Annot.HighMatches$subject_id)),"_"),
#'         function(x) x[2]))), decreasing = TRUE))
#' }
#' @references
#' http://www.girinst.org/repbase/
#'
#' Altschul, S.F., Gish, W., Miller, W., Myers, E.W. & Lipman, D.J. (1990) "Basic local alignment search tool." J. Mol. Biol. 215:403-410.
#'
#' Gish, W. & States, D.J. (1993) "Identification of protein coding regions by database similarity search." Nature Genet. 3:266-272.
#'
#' Madden, T.L., Tatusov, R.L. & Zhang, J. (1996) "Applications of network BLAST server" Meth. Enzymol. 266:131-141.
#'
#' Altschul, S.F., Madden, T.L., Schaeffer, A.A., Zhang, J., Zhang, Z., Miller, W. & Lipman, D.J. (1997) "Gapped BLAST and PSI-BLAST: a new generation of protein database search programs." Nucleic Acids Res. 25:3389-3402.
#'
#' Zhang Z., Schwartz S., Wagner L., & Miller W. (2000), "A greedy algorithm for aligning DNA sequences" J Comput Biol 2000; 7(1-2):203-14.
#' @export

blast_repbase <- function(query,
                          repbase.path,
                          output   = "RepbaseOutput.txt",
                          max.target.seqs = 10000,
                          eval     = 1E-10,
                          cores    = 1) {
  
  is_blast_installed()
  
  s_len <- alig_length <- NULL
  # Create a blast-able database of repbase and perform blastn search of
  # LTR transposons against this blast formatted repbase file
  system(
    paste0(
      "makeblastdb -in ",
      repbase.path,
      " -parse_seqids -input_type fasta -dbtype nucl"
    )
  )
  system(
    paste0(
      "blastn -db ",
      repbase.path,
      " -query ",
      query,
      " -out ",
      output ,
      " ",
      "-evalue ",
      eval,
      " -max_target_seqs ",
      max.target.seqs,
      " -num_threads ",
      cores,
      " -dust no -outfmt '6 qseqid sseqid pident nident
                   length mismatch gapopen gaps positive ppos qstart qend qlen qcovs qcovhsp qseq sstart send slen sseq evalue bitscore score'"
    )
  )
  
  # Define the column names of the BLAST output
  BLASTColNames <- c(
    "query_id",
    "subject_id",
    "perc_identity",
    "num_ident_matches",
    "alig_length",
    "mismatches",
    "gap_openings",
    "n_gaps",
    "pos_match",
    "ppos",
    "q_start",
    "q_end",
    "q_len",
    "qcov",
    "qcovhsp",
    "query_seq",
    "s_start",
    "s_end",
    "s_len",
    "subject_seq",
    "evalue",
    "bit_score",
    "score_raw"
  )
  # Read the BLAST output and define the columns
  BLASTRepBaseOutput <- readr::read_tsv(output, col_names = FALSE)
  colnames(BLASTRepBaseOutput) <- BLASTColNames
  # Add an additional variable named 'scope' to the BLAST output
  # Scope is defined by the formula: 1 - (abs(s_len - alig_length) / s_len)
  # and is computed for each hit of the BLAST output
  # This scope measure allows to quantify the percentage of sequence similarity
  # over the entire query and subject sequence length.
  # A scope of 0.5 can be interpreted as a 50% coverage (in terms of sequence similarity)
  # found between the query and subject DNA string
  BLASTRepBaseOutput <-
    dplyr::mutate(BLASTRepBaseOutput, scope = 1 - (abs(s_len - alig_length) / s_len))
  
  return(BLASTRepBaseOutput)
}