#' @title Main Orthology Inference Function
#' @description This function takes nucleotide or protein sequences for a set of organisms
#' and performs orthology inference to detect orthologous genes within the given organisms
#' based on selected orthology inference programs.
#' @param query_file a character string specifying the path to the sequence file of interest (query organism).
#' @param subject_files a character string specifying the paths to the sequence files of interest (subject organisms).
#' Different orthology inference methods can detect orthologs using multiple subject organisms, e.g. "OrthoMCL", and "PO" (ProteinOrtho).
#' @param outgroup_file a character string specifying the paths to the sequence files of interest (outgroup organisms).
#' This argument is only used by \code{InParanoid}.
#' @param eval a numeric value specifying the E-Value cutoff for BLAST hit detection.
#' @param seq_type a character string specifying the sequence type stored in the input file.
#' Options are are: "cds", "protein", or "dna". In case of "cds", sequence are translated to protein sequences,
#' in case of "dna", cds prediction is performed on the corresponding sequences which subsequently are
#' translated to protein sequences. Default is \code{seq_type} = "protein".
#' @param format a character string specifying the file format of the sequence file, e.g. "fasta", "gbk". Default is "fasta".
#' @param ortho_detection a character string specifying the orthology inference method that shall be performed
#' to detect orthologous genes. Default is \code{ortho_detection} = "RBH" (BLAST reciprocal best hit).
#' Further methods are: "RBH" (BLAST reciprocal best hit), "PO" (ProteinOrtho), and "OrthoMCL.
#' @param delete_corrupt_cds a logical value indicating whether sequences with corrupt base triplets should be removed from the input \code{file}. This is the case when the length of coding sequences cannot be divided by 3 and thus the coding sequence contains at least one corrupt base triplet.
#' @param cdd.path path to the cdd database folder (specify when using \code{ortho_detection} = \code{"DELTA"}).
#' @param path a character string specifying the path to the corresponding orthology inference tool.
#' For "BH" and "RBH": path to BLAST, "PO": path to ProteinOrtho 5.07, "OrthoMCL": path to OrthoMCL.
#' @param add_params a character string specifying additional parameters that shall be handed to the orthology inference method (tool).
#' Default is \code{add_params} = \code{NULL}.
#' @param comp_cores a numeric value specifying the number of cores to be used for multicore computations.
#' @param quiet a logical value specifying whether a successful interface call shall be printed out.
#' @param clean_folders a boolean value spefiying whether all internall folders storing the output of used programs
#' shall be removed. Default is \code{clean_folders} = \code{FALSE}.
#' @details
#' This function takes sequence files of a query organism and a subject organism and performs orthology inference
#' using a defined orthology inference method to dectect orthologous genes.
#'
#' The following interfaces are implemented in the \code{orthologs} function:
#'
#' BLAST based methods:
#'
#' \itemize{
#' \item BLAST best hit (BH)
#' \item BLAST reciprocal best hit (RBH)
#' \item DELTA-BLAST reciprocal best hit (DELTA)
#' }
#'
#'
#' @author Hajk-Georg Drost
#' @return A data.table storing the query_ids of orthologous genes in the first column, the subject_ids of orthologous genes
#' in the second column and the amino acid sequences in the third column.
#' @examples \dontrun{
#'
#' ### BLAST Best Hit
#'
#' # perform orthology inference using BLAST best hit
#' # and fasta sequence files storing protein sequences
#' orthologs(query_file = system.file('seqs/ortho_thal_aa.fasta', package = 'orthologr'),
#' subject_files = system.file('seqs/ortho_lyra_aa.fasta', package = 'orthologr'),
#' seq_type = "protein",
#' ortho_detection = "BH")
#'
#'
#' ### BLAST Reciprocal Best Hit
#'
#' # perform orthology inference using BLAST reciprocal best hit
#' # and fasta sequence files storing protein sequences
#' orthologs(query_file = system.file('seqs/ortho_thal_aa.fasta', package = 'orthologr'),
#' subject_files = system.file('seqs/ortho_lyra_aa.fasta', package = 'orthologr'),
#' seq_type = "protein",
#' ortho_detection = "RBH")
#'
#'
#' # multicore version
#' orthologs(query_file = system.file('seqs/ortho_thal_aa.fasta', package = 'orthologr'),
#' subject_files = system.file('seqs/ortho_lyra_aa.fasta', package = 'orthologr'),
#' seq_type = "protein",
#' ortho_detection = "RBH",
#' comp_cores = 2)
#' }
#' @seealso \code{\link{blast_rec}}, \code{\link{dNdS}}
#' @export
orthologs <- function(query_file,
subject_files,
seq_type = "protein",
outgroup_file = NULL,
eval = "1E-5",
format = "fasta",
ortho_detection = "RBH",
delete_corrupt_cds = FALSE,
cdd.path = NULL,
path = NULL,
add_params = NULL,
comp_cores = 1,
quiet = FALSE,
clean_folders = FALSE){
if(!is.element(ortho_detection,
c("BH", "RBH", "Orthofinder2")))
stop("Please choose a orthology detection method that is supported by this function.", call. = FALSE)
i <- NULL
if (seq_type == "cds") {
f_sep <- .Platform$file.sep
filename_qry <-
unlist(strsplit(
query_file,
f_sep,
fixed = FALSE,
perl = TRUE,
useBytes = FALSE
))
filename_qry <- filename_qry[length(filename_qry)]
write.proteome(proteome = cds2aa(
read.cds(query_file, format = format, delete_corrupt_cds = delete_corrupt_cds)
),
file.name = file.path(tempdir(), paste0(filename_qry, "_translated.fasta")))
if (length(subject_files) > 1) {
subj_short.names <- vector("character", length(subject_files))
for (organism in 1:length(subject_files)) {
short.name <-
unlist(
strsplit(
subject_files[organism],
f_sep,
fixed = FALSE,
perl = TRUE,
useBytes = FALSE
)
)
short.name <-
short.name[length(short.name)]
subj_short.names[i] <- short.name
write.proteome(
proteome = cds2aa(
read.cds(subject_files[organism], format = format, delete_corrupt_cds = delete_corrupt_cds), delete_corrupt_cds = delete_corrupt_cds),
file.name = file.path(
tempdir(),
paste0(short.name, "_translated.fasta")
)
)
}
subject_files <-
file.path(tempdir(),
paste0(short.name, "_translated.fasta"))
} else {
filename_subj <-
unlist(
strsplit(
subject_files,
f_sep,
fixed = FALSE,
perl = TRUE,
useBytes = FALSE
)
)
filename_subj <-
filename_subj[length(filename_subj)]
write.proteome(
proteome = cds2aa(
read.cds(
subject_files,
format = format,
delete_corrupt_cds = delete_corrupt_cds
),
delete_corrupt_cds = delete_corrupt_cds
),
file.name = file.path(tempdir(),
paste0(filename_subj, "_translated.fasta"))
)
subject_files <-
file.path(tempdir(),
paste0(filename_subj, "_translated.fasta"))
}
query_file <-
file.path(tempdir(), paste0(filename_qry, "_translated.fasta"))
}
if (ortho_detection == "BH") {
if (length(subject_files) > 1)
stop("The BLAST best hit method is only defined for pairwise comparisons.", call. = FALSE)
ortho_tbl <- data.table::copy(
blast_best(query_file = query_file,
subject_file = subject_files,
path = path,
delete_corrupt_cds = delete_corrupt_cds,
comp_cores = comp_cores,
eval = eval,
blast_params = add_params,
seq_type = seq_type,
format = format )
)
if (clean_folders)
clean_all_folders(file.path(tempdir(),"_blast_db"))
}
if(ortho_detection == "RBH"){
if (length(subject_files) > 1)
stop("The BLAST best reciprocal hit method is only defined for pairwise comparisons.", call. = FALSE)
ortho_tbl <- data.table::copy(
blast_rec( query_file = query_file,
subject_file = subject_files,
path = path,
delete_corrupt_cds = delete_corrupt_cds,
comp_cores = comp_cores,
eval = eval,
blast_params = add_params,
seq_type = seq_type,
format = format )
)
if (clean_folders)
clean_all_folders(file.path(tempdir(),"_blast_db"))
}
if (ortho_detection == "Orthofinder2") {
message("This option is currently under development and will be available soon.", call. = FALSE)
# ortho_tbl <- data.table::copy(
#
# ProteinOrtho( query_file = query_file,
# subject_files = subject_files,
# po_path = path,
# eval = eval,
# comp_cores = comp_cores,
# po_params = add_params,
# seq_type = seq_type,
# format = format )
#
# )
#
#
# if(clean_folders)
# clean_all_folders(file.path(tempdir(),"_ProteinOrtho"))
}
return(ortho_tbl)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.