#' @title Genome-wide pairwise inference of synonymous versus non-synonymous substitution rates (dnds) across multiple subject organisms
#' @description This function allows you to compute dnds maps between a query organism
#' and a set subject organisms stored in the same folder. The corresponding dnds maps are then stored in an output folder.
#' @param query a character string specifying the path to the CDS file of the query organism.
#' @param subjects_folder a character string specifying the path to the folder where CDS files of the subject organisms are stored.
#' @param output_folder a character string specifying the path to the folder where output dnds maps should be stored.
#' @param format a character string specifying the file format of the sequence file, e.g. \code{format = "fasta"}, \code{format = "gbk"}. See
#' \code{\link{import_proteome}} for more details.
#' @param ortho_detection a character string specifying the orthology inference method that shall be performed to detect orthologous genes. Options are:
#' \itemize{
#' \item \code{ortho_detection ="DIAMOND-BH"}: DIAMOND unidirectional best hit.
#' \item \code{ortho_detection = "DIAMOND-RBH"}: DIAMOND reciprocal/bidirectional best hit (default).
#' }
#' @param delete_corrupt_cds a logical value indicating whether sequences with corrupt base triplets should be removed from the input \code{file}. This is the case when the length of coding sequences cannot be divided by 3 and thus the coding sequence contains at least one corrupt base triplet.
#' @param store_locally a logical value indicating whether or not alignment files shall be stored locally rather than in \code{tempdir()}.
#' @param sensitivity_mode specify the level of alignment sensitivity. The higher the sensitivity level, the more deep homologs can be found, but at the cost of reduced computational speed.
#' \itemize{
#' \item \code{sensitivity_mode = "fast"} : fastest alignment mode, but least sensitive (default). Designed for finding hits of >70% identity and short read alignment such as NGS reads.
#' \item \code{sensitivity_mode = "mid-sensitive"} : fast alignments between the \code{fast} mode and the sensitive mode in sensitivity.
#' \item \code{sensitivity_mode = "sensitive"} : fast alignments, but full sensitivity for hits >40% identity.
#' \item \code{sensitivity_mode = "more-sensitive"} : more sensitive than the \code{sensitive} mode.
#' \item \code{sensitivity_mode = "very-sensitive"} : sensitive alignment mode.
#' \item \code{sensitivity_mode = "ultra-sensitive"} : most sensitive alignment mode (sensitivity as high as BLASTP).
#' }
#' @param out_format a character string specifying the format of the file in which the DIAMOND results shall be stored.
#' Available options are:
#' \itemize{
#' \item \code{out_format = "pair"} : Pairwise
#' \item \code{out_format = "xml"} : XML
#' \item \code{out_format = "csv"} : Comma-separated file
#' }
#' @param hard_mask shall low complexity regions be hard masked with TANTAN? Default is \code{db_hard_mask = TRUE}.
#' @param diamond_exec_path a path to the DIAMOND executable or \code{conda/miniconda} folder.
#' @param add_makedb_options a character string specifying additional makedb options that shall be passed on to the diamond makedb command line call, e.g. \code{add_make_options = "--taxonnames"} (Default is \code{add_diamond_options = NULL}).
#' @param add_diamond_options a character string specifying additional diamond options that shall be passed on to the diamond command line call, e.g. \code{add_diamond_options = "--block-size 4.0 --compress 1 --no-self-hits"} (Default is \code{add_diamond_options = NULL}).
#' @param evalue a character string specifying the e-value for DIAMOND based orthology inference that is performed
#' in the process of dnds computations. Please use the scientific notation.
#' @param max_target_seqs maximum number of aligned sequences that shall be retained. Please be aware that \code{max_target_seqs} selects best hits based on the database entry and not by the best e-value. See details here: https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/bty833/5106166 .
#' @param min_qry_coverage_hsp minimum \code{qcovhsp} (= query coverage of the HSP) of an orthologous hit (a value between 1 and 100).
#' @param min_qry_perc_identity minimum \code{perc_identity} (= percent sequence identity between query and selected HSP) of an orthologous hit (a value between 1 and 100).
#' @param aa_aln_type a character string specifying the amino acid alignement type: \code{aa_aln_type = "multiple"} or \code{aa_aln_type = "pairwise"}.
#' Default is \code{aa_aln_type = "pairwise"}.
#' @param aa_aln_tool a character string specifying the program that should be used e.g. "clustalw".
#' @param codon_aln_tool a character string specifying the codon alignment tool that shall be used.
#' Default is \code{codon_aln_tool = "pal2nal"}. Right now only "pal2nal" can be selected as codon alignment tool.
#' @param dnds_estimation a character string specifying the dnds estimation method, e.g. \code{dnds_estimation = "Li"} (default).
#' @param cores number of computing cores that shall be used to perform parallelized computations.
#' @param progress_bar should a progress bar be shown. Default is \code{progress_bar = TRUE}.
#' @param sep a file separator that is used to store maps as csv file.
#' @param diamond_hit_tables a file path to a folder where DIAMOND hit tables internally generated with \code{\link{diamond_best_hits}} or \code{\link{diamond_reciprocal_best_hits}} will be stored.
#' @param ... additional parameters that shall be passed to \code{\link{dnds}}.
#' @details
#' Given a query organism and a set of subject organisms that are stored in the same folder,
#' this function crawls through all subject organism and as a first step computes the pairwise
#' dnds Maps between query and subject organism and as a second step stores the corresponding Map
#' in an output folder.
#' @author Hajk-Georg Drost
#' @seealso \code{\link{dnds}}, \code{\link{import_dnds_across_multiple_species}}, \code{\link{import_dnds_tbl}}
#' @examples
#' \dontrun{
#' # running dnds across several species
#' dnds_across_multiple_species(
#' query = system.file('seqs/ortho_thal_cds.fasta', package = 'homologr'),
#' subjects_folder = system.file('seqs/map_gen_example', package = 'homologr'),
#' aa_aln_type = "pairwise",
#' aa_aln_tool = "NW",
#' codon_aln_tool = "pal2nal",
#' dnds_estimation = "Li",
#' output_folder = "homologr_dnds_maps",
#' quiet = TRUE,
#' cores = 1
#' )
#'
#' # running dnds across several species using DIAMOND executable
#' # path '/opt/miniconda3/bin/'
#' dnds_across_multiple_species(
#' query = system.file('seqs/ortho_thal_cds.fasta', package = 'homologr'),
#' subjects_folder = system.file('seqs/map_gen_example', package = 'homologr'),
#' diamond_exec_path = "/opt/miniconda3/bin/",
#' aa_aln_type = "pairwise",
#' aa_aln_tool = "NW",
#' codon_aln_tool = "pal2nal",
#' dnds_estimation = "Li",
#' output_folder = "homologr_dnds_maps",
#' quiet = TRUE,
#' cores = 1
#' )
#' }
#' @export
dnds_across_multiple_species <- function(query,
subjects_folder,
output_folder,
format = "fasta",
ortho_detection = "DIAMOND-RBH",
delete_corrupt_cds = FALSE,
store_locally = FALSE,
sensitivity_mode = "ultra-sensitive",
out_format = "csv",
evalue = "1E-5",
max_target_seqs = 5000,
cores = 1,
hard_mask = TRUE,
diamond_exec_path = NULL,
add_makedb_options = NULL,
add_diamond_options = NULL,
min_qry_coverage_hsp = 50,
min_qry_perc_identity = 10,
aa_aln_type = "pairwise",
aa_aln_tool = "NW",
codon_aln_tool = "pal2nal",
dnds_estimation = "Li",
progress_bar = TRUE,
diamond_hit_tables = file.path(output_folder, "diamond_hit_tables"),
sep = ";",
... ){
if (!file.exists(subjects_folder))
stop("The specified folder path '", subjects_folder, "' does not seem to exist. Please provide a valid path to the folder storing all subject genomes.", call. = FALSE)
# retrieve all subject files within a given folder
subj.files <- list.files(subjects_folder)
if (length(subj.files) == 0)
stop("Your subjects_folder ", subjects_folder, " seems to be empty...", call. = FALSE)
if (!dplyr::between(min_qry_coverage_hsp, 1, 100))
stop("Please provide a valid min_qry_coverage_hsp value between 1 and 100.", call. = FALSE)
if (!dplyr::between(min_qry_perc_identity, 1, 100))
stop("Please provide a valid min_qry_perc_identity value between 1 and 100.", call. = FALSE)
message("Starting pairwise genome comparisons (orthology inference and dnds estimation) between query species: ", basename(query), " and subject species: ", paste0(subj.files, collapse = ", "))
# initialize progress bar
if (progress_bar & (length(subj.files) > 1))
pb <- utils::txtProgressBar(1, length(subj.files), style = 3)
if (!file.exists(output_folder)){
message("\n")
message("Creating dnds output folder: ", output_folder)
dir.create(output_folder)
}
if (!file.exists(diamond_hit_tables)){
message("\n")
message("Creating DIAMOND hits output folder: ", diamond_hit_tables)
dir.create(diamond_hit_tables, recursive = TRUE)
}
qcovhsp <- perc_identity <- NULL
message("\n")
for (i in seq_len(length(subj.files))) {
# compute pairwise dnds maps between query and all subject files
OrgQuery_vs_OrgSubj <- dnds(
query = query,
subject = file.path(subjects_folder, subj.files[i]),
format = format,
ortho_detection = ortho_detection,
delete_corrupt_cds = delete_corrupt_cds,
store_locally = store_locally,
sensitivity_mode = sensitivity_mode,
out_format = out_format,
evalue = evalue,
max_target_seqs = max_target_seqs,
cores = cores,
hard_mask = hard_mask,
diamond_exec_path = diamond_exec_path,
add_makedb_options = add_makedb_options,
add_diamond_options = add_diamond_options,
aa_aln_type = aa_aln_type,
aa_aln_tool = aa_aln_tool,
codon_aln_tool = codon_aln_tool,
dnds_estimation = dnds_estimation,
output_path = diamond_hit_tables,
print_citation = FALSE,
...
)
message("Filtering for DIAMOND hits with min_qry_coverage_hsp >= ", min_qry_coverage_hsp, " and min_qry_perc_identity >= ", min_qry_perc_identity, " ...")
OrgQuery_vs_OrgSubj <- dplyr::filter(OrgQuery_vs_OrgSubj, qcovhsp >= min_qry_coverage_hsp, perc_identity >= min_qry_perc_identity)
dnds_output_i <- file.path(
output_folder,
paste0(
"dnds_query=",
basename(query),
"_subject=",
subj.files[i],
"_ortho_method=",
ortho_detection,
"_evalue=",
evalue,
"_dnds_estimation=",
dnds_estimation,
"_min_qry_coverage_hsp=",
min_qry_coverage_hsp,
"_min_qry_perc_identity=",
min_qry_perc_identity,
".csv"
))
message("\n")
message("Storing final dnds table in: ", dnds_output_i)
readr::write_delim(
OrgQuery_vs_OrgSubj,
dnds_output_i,
delim = sep,
col_names = TRUE
)
if (progress_bar)
utils::setTxtProgressBar(pb, i)
}
message("\n")
message("Citation:")
message("Please cite the following paper when using homologr for your own research:")
message(
"B Buchfink and HG Drost. Methods in Molecular Biology (2021)."
)
message("\n")
message("All maps are stored in ", output_folder, ".")
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.