R/generate_ortholog_tables_all.R

Defines functions generate_ortholog_tables_all

Documented in generate_ortholog_tables_all

#' @title Generate ortholog tables by gene locus and splice varaint for a set of species
#' @description Given an input folder storing dNdS tables generated by \code{\link{dNdS}} and
#' annotation files stored in an annotation folder for the query (one annotation file) and subject species in \code{gtf} or \code{gff}
#' file format, this function selects the best BLAST hit to represent either a gene locus (e.g. the splice variant of the gene locus with lowest e-value) or the best BLAST hit for a splice varaint.
#' @param dNdS_folder file path to folder storing a \code{dNdS tables} generated with \code{\link{dNdS}} and
#' stored conform with \code{\link{read.dnds.tbl}}.
#' @param annotation_file_query file path to the annotation file of the query species in \code{gtf} or \code{gff} file format.
#' @param annotation_folder_subject file path to a folder storing the annotation files of the subject species in \code{gtf} or \code{gff} file format.
#' @param output_folder file path to a folder in which orthologs tables shall be stored.
#' @param output_type type of ortholog table that shall be printed out (or stored in a variable).
#' Available options are:
#' \itemize{
#' \item \code{output_type = "gene_locus"} (Default): find for each gene locus a representative splice variant that
#' maximizes the sequence homology (in terms of smalles e-value and longest splice variant in case of same evalue) 
#' to the subject gene locus and its representative splice variant. The output table contains only once representative splice variant per gene locus.
#' \item \code{output_type = "splice_variant"}: for each homologous gene locus determine for each splice variant their
#' respective splice variant homolog. he output table contains several splice variants and their homologous splice variants per gene locus.
#' }
#' @param format a vector of length 2 storing the annotation file formats of the query annotation file and subject annotation file: either \code{gtf} or \code{gff} format. E.g. \code{format = c("gtf","gtf")}.
#' @author Hajk-Georg Drost
#' @export
generate_ortholog_tables_all <-
        function(dNdS_folder,
                 annotation_file_query,
                 annotation_folder_subject,
                 output_folder,
                 output_type = "gene_locus",
                 format = c("gtf", "gtf")) {
                
                if (!file.exists(dNdS_folder))
                        stop("The dNdS folder '", dNdS_folder, "' does not seem to exist. Please specify a valid folder path.", call. = FALSE)
                if (!file.exists(annotation_folder_subject))
                        stop("The subject annotation folder '", annotation_folder_subject, "' does not seem to exist. Please specify a valid folder path.", call. = FALSE)
                
                dnds_files <- file.path(dNdS_folder, list.files(dNdS_folder, pattern = ".*\\.(csv|tsv)"))
                dnds_species <- basename(dnds_files)
                qry_species <- unlist(stringr::str_split(basename(annotation_file_query), "[.]"))[1]    
                sbj_files <- file.path(annotation_folder_subject, list.files(annotation_folder_subject))
                sbj_species <- unlist(sapply(basename(sbj_files), function(x) unlist(stringr::str_split(x, "[.]"))[1]))
                
                message("Generating ortholog tables for qry: ", qry_species, " and subject species: ", paste0(sbj_species, collapse = ", "))
                
                if (length(dnds_files) != length(sbj_files))
                        stop("The number of files stored in the dnds_folder and the number of files stored in annotation_folder_subject do not match.",
                             " Please make sure that you provide dnds tables and annotation files for all species.", call. = FALSE)
                
                if (length(annotation_file_query) > 1)
                        stop("This function only supports one query annotation file as input.", call. = FALSE)
                
                res <- vector("list", length = length(dnds_files))
                
                for (i in seq_len(length(dnds_files))) {
                        message("Processing dnds_file '", dnds_files[i], "' qry_annotation '", annotation_file_query, "' and subj_annotation '", sbj_files[i], "' ...")
                        if (!stringr::str_detect(dnds_species[i], sbj_species[i]))
                                stop("The species in the dnds_file used for this computation does not match with the species of the subject annotation file ... Please fix this.", call. = FALSE)
                        
                        res[i] <- list(generate_ortholog_tables(
                                dNdS_file = dnds_files[i],
                                annotation_file_query = annotation_file_query,
                                annotation_file_subject = sbj_files[i],
                                output_folder = output_folder,
                                output_type = output_type,
                                format = format))
                                
                                message("\n")    
                }
                
                names(res) <- paste0(qry_species, "_vs_", sbj_species)
                message("All ortholog tables were successfully built.")
                res <- dplyr::bind_rows(res)
                # add scope column
                q_len <- alig_length <- NULL
                res <- dplyr::mutate(res, scope = 1 - (abs(q_len - alig_length) / q_len))
                
                return(res)
        }
HajkD/orthologr documentation built on Oct. 13, 2023, 12:11 a.m.