R/blast.R

#*  Copyright (C) 2018 the DEUS contributors.
#*  Website: https://github.com/timjeske/DEUS
#*
#*  This file is part of the DEUS R package.
#*
#*  The DEUS R package is free software: you can redistribute it and/or modify
#*  it under the terms of the GNU General Public License as published by
#*  the Free Software Foundation, either version 3 of the License, or
#*  (at your option) any later version.
#*
#*  This program is distributed in the hope that it will be useful,
#*  but WITHOUT ANY WARRANTY; without even the implied warranty of
#*  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
#*  GNU General Public License for more details.
#*
#*  You should have received a copy of the GNU General Public License
#*  along with this program.  If not, see <http://www.gnu.org/licenses/>.



#' Run BLAST for small RNA sequences
#'
#' This function takes a vector of sequences in FASTA style as input and applies BLAST running on a local blast database.
#' By default, the parameters are configured for blasting small RNA sequences of strand-specific RNA-seq experiments with 100\% identity over the whole query sequence.
#'
#' @param blast_exec Path to your installation of the 'blastn' binary
#' @param blast_db Path to your local BLAST database
#' @param ncores Number of cores used by BLAST
#' @param sig_sequences Vector of sequences in FASTA style generated by the \link[DEUS]{sequencesAsFasta}
#' @param strand Query strand orientation to search against database.
#' By default 'plus', assuming underlying strand-specific RNA-seq experiments.
#' @param identity Identity cutoff applied for filtering BLAST results.
#' By default 100\%, to find only exact matches over the whole length of the query sequence.
#' @param dust The parameter specifies whether low complexity filtering is applied to the query sequence.
#' By default 'no', to prevent false-negative annotations of sequences of interest.
#' @param soft_masking The parameter specifies whether low complexity filter is applied to the database.
#' By default 'no', to prevent false-negative annotations of sequences of interest.
#' @return Each row of the resulting data frame corresponds to a BLAST hit including the columns 'qseqid', 'qlen', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart', 'send', 'evalue' and 'bitscore'.
#' @export

runBlast <- function(blast_exec, blast_db, ncores, sig_sequences, strand = 'plus', identity=100, dust='no', soft_masking='false') {
  blast_f6 <- c('qseqid', 'qlen', 'sseqid', 'pident', 'length', 'mismatch', 'gapopen', 'qstart', 'qend', 'sstart' ,'send', 'evalue', 'bitscore')
  command <- paste("-db",blast_db, "-num_threads", ncores , "-perc_identity", identity, "-dust", dust, "-soft_masking", soft_masking, "-strand", strand, "-task blastn-short -qcov_hsp_perc 100 -max_target_seqs 2000 -outfmt", sprintf(" '6 %s'",paste(collapse=" ",blast_f6)),sep=" ")
  message("Blastn configuration:")
  message(paste(blast_exec,command))
  blast_out <- system2(blast_exec,command, input=sig_sequences, stdout=TRUE)
  blast_out_df <- `names<-`(read.table(quote="",sep='\t',textConnection(blast_out)),blast_f6)

  # rename qsedid to SequenceID, length to Length, evalue to BlastEvalue, sseqid to Annotation
  colnames(blast_out_df) <- gsub("qseqid", "SequenceID",colnames(blast_out_df))
  colnames(blast_out_df) <- gsub("length", "Length", colnames(blast_out_df))
  colnames(blast_out_df) <- gsub("evalue", "BlastEvalue", colnames(blast_out_df))
  colnames(blast_out_df) <- gsub("sseqid", "Annotation", colnames(blast_out_df))
  return(blast_out_df)
}
timjeske/DEUS documentation built on June 6, 2019, 12:59 p.m.