R/import_dnds_across_multiple_species.R

Defines functions import_dnds_across_multiple_species

Documented in import_dnds_across_multiple_species

#' @title Import \code{\link{dnds}} tables by gene locus and splice varaint for a set of species
#' @description Given an input folder storing dnds tables generated by \code{\link{dnds_across_multiple_species}} and
#' annotation files stored in an annotation folder for the query (one annotation file) and subject species in \code{gtf} or \code{gff}
#' file format, this function selects the best DIAMOND hit to represent either a gene locus (e.g. the splice variant of the gene locus with lowest e-value) or the best DIAMOND hit for a splice varaint.
#' @param dnds_output_folder file path to folder storing \code{dnds tables} generated with \code{\link{dnds_across_multiple_species}} and
#' stored conform with \code{\link{import_dnds_tbl}}.
#' @param annotation_file_query file path to the annotation file of the query species in \code{gtf} or \code{gff} file format.
#' @param annotation_folder_subject file path to a folder storing the annotation files of the subject species in \code{gtf} or \code{gff} file format.
#' @param output_folder file path to a folder in which orthologs tables shall be stored.
#' @param output_type type of ortholog table that shall be printed out (or stored in a variable).
#' Available options are:
#' \itemize{
#' \item \code{output_type = "gene_locus"} (default): find for each gene locus a representative splice variant that
#' maximizes the sequence homology (in terms of smallest e-value and longest splice variant in case of same evalue)
#' to the subject gene locus and its representative splice variant. The output table contains only once representative splice variant per gene locus.
#' \item \code{output_type = "splice_variant"}: for each homologous gene locus determine for each splice variant their
#' respective splice variant homolog. he output table contains several splice variants and their homologous splice variants per gene locus.
#' }
#' @param format a vector of length 2 storing the annotation file formats of the query annotation file and subject annotation file: either \code{gtf} or \code{gff} format. E.g. \code{format = c("gtf","gtf")}.
#' @author Hajk-Georg Drost
#' @examples \dontrun{
#' # running dnds across several species using DIAMOND executable
#' # path '/opt/miniconda3/bin/'
#' dnds_across_multiple_species(
#'   query      = system.file('seqs/ortho_thal_cds.fasta', package = 'homologr'),
#'   subjects_folder = system.file('seqs/map_gen_example', package = 'homologr'),
#'   diamond_exec_path = "/opt/miniconda3/bin/",
#'   aa_aln_type      = "pairwise",
#'   aa_aln_tool      = "NW",
#'   codon_aln_tool   = "pal2nal",
#'   dnds_estimation  = "Li",
#'   output_folder   = file.path(tempdir(), "homologr_dnds_maps"),
#'   quiet           = TRUE,
#'   cores           = 1
#' )
#' # Import dnds tables by gene locus and splice varaint for a set of species
#' import_dnds_tables <- import_dnds_across_multiple_species(
#' dnds_output_folder = "homologr_dnds_maps",
#' annotation_file_query = "system.file('seqs/ortho_thal.gtf', package = 'homologr')",
#' annotation_folder_subject = system.file('seqs/plants_subject_files', package = 'homologr'),
#' output_folder = file.path(tempdir(), "homologr_dnds_maps", "orthologs_tables"),
#' output_type = "gene_locus",
#' format = c("gtf", "gtf"))
#'
#' # look at results
#' lapply(import_dnds_tables, head)
#' }
#' @seealso \code{\link{dnds_across_multiple_species}}, \code{\link{dnds}}
#' @export
import_dnds_across_multiple_species <-
  function(dnds_output_folder,
           annotation_file_query,
           annotation_folder_subject,
           output_folder,
           output_type = "gene_locus",
           format = c("gtf", "gtf")) {

    if (!file.exists(dnds_output_folder))
      stop("The dnds_output_folder folder '", dnds_output_folder, "' does not seem to exist. Please specify a valid folder path.", call. = FALSE)
    if (!file.exists(annotation_folder_subject))
      stop("The subject annotation folder '", annotation_folder_subject, "' does not seem to exist. Please specify a valid folder path.", call. = FALSE)

    dnds_files <- file.path(dnds_output_folder, list.files(dnds_output_folder, pattern = ".*\\.(csv|tsv)"))
    dnds_species <- basename(dnds_files)
    qry_species <- unlist(stringr::str_split(basename(annotation_file_query), "[.]"))[1]
    sbj_files <- file.path(annotation_folder_subject, list.files(annotation_folder_subject))
    sbj_species <- unlist(sapply(basename(sbj_files), function(x) unlist(stringr::str_split(x, "[.]"))[1]))

    message("Generating single splice variant dnds tables for qry: ", qry_species, " and subject species: ", paste0(sbj_species, collapse = ", "))

    if (length(dnds_files) != length(sbj_files))
      stop("The number of files stored in the dnds_folder and the number of files stored in annotation_folder_subject do not match.",
           " Please make sure that you provide dnds tables and annotation files for all species.", call. = FALSE)

    if (length(annotation_file_query) > 1)
      stop("This function only supports one query annotation file as input.", call. = FALSE)

    res <- vector("list", length = length(dnds_files))

    for (i in seq_len(length(dnds_files))) {
      message("Processing dnds_file '", dnds_files[i], "' qry_annotation '", annotation_file_query, "' and subj_annotation '", sbj_files[i], "' ...")
      if (!stringr::str_detect(dnds_species[i], sbj_species[i]))
        stop("The species in the dnds_file used for this computation does not match with the species of the subject annotation file ... Please fix this.", call. = FALSE)

      res[i] <- list(internal_import_dnds_across_multiple_species(
        dnds_output_folder = dnds_files[i],
        annotation_file_query = annotation_file_query,
        annotation_file_subject = sbj_files[i],
        output_folder = output_folder,
        output_type = output_type,
        format = format))

      message("\n")
    }

    names(res) <- paste0(qry_species, "_vs_", sbj_species)
    message("All ortholog tables were successfully built.")
    res <- dplyr::bind_rows(res)
    # add scope column
    q_len <- alig_length <- NULL
    res <- dplyr::mutate(res, scope = 1 - (abs(q_len - alig_length) / q_len))

    return(res)
  }
drostlab/homologr documentation built on Sept. 28, 2020, 12:44 a.m.