#' @title Import \code{\link{dnds}} tables by gene locus and splice varaint for a set of species
#' @description Given an input folder storing dnds tables generated by \code{\link{dnds_across_multiple_species}} and
#' annotation files stored in an annotation folder for the query (one annotation file) and subject species in \code{gtf} or \code{gff}
#' file format, this function selects the best DIAMOND hit to represent either a gene locus (e.g. the splice variant of the gene locus with lowest e-value) or the best DIAMOND hit for a splice varaint.
#' @param dnds_output_folder file path to folder storing \code{dnds tables} generated with \code{\link{dnds_across_multiple_species}} and
#' stored conform with \code{\link{import_dnds_tbl}}.
#' @param annotation_file_query file path to the annotation file of the query species in \code{gtf} or \code{gff} file format.
#' @param annotation_folder_subject file path to a folder storing the annotation files of the subject species in \code{gtf} or \code{gff} file format.
#' @param output_folder file path to a folder in which orthologs tables shall be stored.
#' @param output_type type of ortholog table that shall be printed out (or stored in a variable).
#' Available options are:
#' \itemize{
#' \item \code{output_type = "gene_locus"} (default): find for each gene locus a representative splice variant that
#' maximizes the sequence homology (in terms of smallest e-value and longest splice variant in case of same evalue)
#' to the subject gene locus and its representative splice variant. The output table contains only once representative splice variant per gene locus.
#' \item \code{output_type = "splice_variant"}: for each homologous gene locus determine for each splice variant their
#' respective splice variant homolog. he output table contains several splice variants and their homologous splice variants per gene locus.
#' }
#' @param format a vector of length 2 storing the annotation file formats of the query annotation file and subject annotation file: either \code{gtf} or \code{gff} format. E.g. \code{format = c("gtf","gtf")}.
#' @author Hajk-Georg Drost
#' @examples \dontrun{
#' # running dnds across several species using DIAMOND executable
#' # path '/opt/miniconda3/bin/'
#' dnds_across_multiple_species(
#' query = system.file('seqs/ortho_thal_cds.fasta', package = 'homologr'),
#' subjects_folder = system.file('seqs/map_gen_example', package = 'homologr'),
#' diamond_exec_path = "/opt/miniconda3/bin/",
#' aa_aln_type = "pairwise",
#' aa_aln_tool = "NW",
#' codon_aln_tool = "pal2nal",
#' dnds_estimation = "Li",
#' output_folder = file.path(tempdir(), "homologr_dnds_maps"),
#' quiet = TRUE,
#' cores = 1
#' )
#' # Import dnds tables by gene locus and splice varaint for a set of species
#' import_dnds_tables <- import_dnds_across_multiple_species(
#' dnds_output_folder = "homologr_dnds_maps",
#' annotation_file_query = "system.file('seqs/ortho_thal.gtf', package = 'homologr')",
#' annotation_folder_subject = system.file('seqs/plants_subject_files', package = 'homologr'),
#' output_folder = file.path(tempdir(), "homologr_dnds_maps", "orthologs_tables"),
#' output_type = "gene_locus",
#' format = c("gtf", "gtf"))
#'
#' # look at results
#' lapply(import_dnds_tables, head)
#' }
#' @seealso \code{\link{dnds_across_multiple_species}}, \code{\link{dnds}}
#' @export
import_dnds_across_multiple_species <-
function(dnds_output_folder,
annotation_file_query,
annotation_folder_subject,
output_folder,
output_type = "gene_locus",
format = c("gtf", "gtf")) {
if (!file.exists(dnds_output_folder))
stop("The dnds_output_folder folder '", dnds_output_folder, "' does not seem to exist. Please specify a valid folder path.", call. = FALSE)
if (!file.exists(annotation_folder_subject))
stop("The subject annotation folder '", annotation_folder_subject, "' does not seem to exist. Please specify a valid folder path.", call. = FALSE)
dnds_files <- file.path(dnds_output_folder, list.files(dnds_output_folder, pattern = ".*\\.(csv|tsv)"))
dnds_species <- basename(dnds_files)
qry_species <- unlist(stringr::str_split(basename(annotation_file_query), "[.]"))[1]
sbj_files <- file.path(annotation_folder_subject, list.files(annotation_folder_subject))
sbj_species <- unlist(sapply(basename(sbj_files), function(x) unlist(stringr::str_split(x, "[.]"))[1]))
message("Generating single splice variant dnds tables for qry: ", qry_species, " and subject species: ", paste0(sbj_species, collapse = ", "))
if (length(dnds_files) != length(sbj_files))
stop("The number of files stored in the dnds_folder and the number of files stored in annotation_folder_subject do not match.",
" Please make sure that you provide dnds tables and annotation files for all species.", call. = FALSE)
if (length(annotation_file_query) > 1)
stop("This function only supports one query annotation file as input.", call. = FALSE)
res <- vector("list", length = length(dnds_files))
for (i in seq_len(length(dnds_files))) {
message("Processing dnds_file '", dnds_files[i], "' qry_annotation '", annotation_file_query, "' and subj_annotation '", sbj_files[i], "' ...")
if (!stringr::str_detect(dnds_species[i], sbj_species[i]))
stop("The species in the dnds_file used for this computation does not match with the species of the subject annotation file ... Please fix this.", call. = FALSE)
res[i] <- list(internal_import_dnds_across_multiple_species(
dnds_output_folder = dnds_files[i],
annotation_file_query = annotation_file_query,
annotation_file_subject = sbj_files[i],
output_folder = output_folder,
output_type = output_type,
format = format))
message("\n")
}
names(res) <- paste0(qry_species, "_vs_", sbj_species)
message("All ortholog tables were successfully built.")
res <- dplyr::bind_rows(res)
# add scope column
q_len <- alig_length <- NULL
res <- dplyr::mutate(res, scope = 1 - (abs(q_len - alig_length) / q_len))
return(res)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.