R/generate_ortholog_tables.R

Defines functions generate_ortholog_tables

Documented in generate_ortholog_tables

#' @title Generate ortholog tables by gene locus and splice varaint
#' @description Given an input dNdS table generated by \code{\link{dNdS}} and
#' annotation files for the query and subject species in \code{gtf} or \code{gff}
#' file format, this function selects the best BLAST hit to represent either a gene locus (e.g. the splice variant of the gene locus with lowest e-value) or the best BLAST hit for a splice varaint.
#' @param dNdS_file file path to a \code{dNdS_tbl} generated with \code{\link{dNdS}} and
#' stored conform with \code{\link{read.dnds.tbl}}.
#' @param annotation_file_query file path to the annotation file of the query species in \code{gtf} or \code{gff} file format.
#' @param annotation_file_subject file path to the annotation file of the subject species in \code{gtf} or \code{gff} file format.
#' @param output_folder file path to a folder in which orthologs tables shall be stored.
#' @param output_type type of ortholog table that shall be printed out (or stored in a variable).
#' Available options are:
#' \itemize{
#' \item \code{output_type = "gene_locus"} (Default): find for each gene locus a representative splice variant that
#' maximizes the sequence homology (in terms of smalles e-value and longest splice variant in case of same evalue) 
#' to the subject gene locus and its representative splice variant. The output table contains only once representative splice variant per gene locus.
#' \item \code{output_type = "splice_variant"}: for each homologous gene locus determine for each splice variant their
#' respective splice variant homolog. he output table contains several splice variants and their homologous splice variants per gene locus.
#' }
#' @param format a vector of length 2 storing the annotation file formats of the query annotation file and subject annotation file: either \code{gtf} or \code{gff} format. E.g. \code{format = c("gtf","gtf")}.
#' @author Hajk-Georg Drost
#' @export
generate_ortholog_tables <-
        function(dNdS_file,
                 annotation_file_query,
                 annotation_file_subject,
                 output_folder = getwd(),
                 output_type = "gene_locus",
                 format) {
                
                if (!file.exists(dNdS_file))
                        stop("Please provide a valid path to the dNdS_file.", call. = FALSE)
                
                if (!file.exists(annotation_file_query))
                        stop("Please provide a valid path to the annotation_file_query", call. = FALSE)
                
                if (!file.exists(annotation_file_subject))
                        stop("Please provide a valid path to the annotation_file_subject", call. = FALSE)
                
                if (!file.exists(output_folder))
                        stop("Please provide a valid path to the output_folder", call. = FALSE)
                
                if (!is.element(output_type, c("gene_locus", "splice_variant")))
                        stop("Please specify an output_type that is supported by this function.", call. = FALSE)
                
                
                message("Generating orthologs table for ", basename(annotation_file_query), " vs ", basename(annotation_file_subject), " ...")
                
                qry_species_name <-
                        unlist(stringr::str_split(basename(annotation_file_query), "[.]"))[1]
                sbj_species_name <-
                        unlist(stringr::str_split(basename(annotation_file_subject), "[.]"))[1]
                
                select_orthologs_by_gene_locus_output_name <-
                        paste0("orthologs_by_gene_",
                               qry_species_name,
                               "_vs_",
                               sbj_species_name,
                               ".csv")
                
                select_orthologs_by_splice_variant_output_name <-
                        paste0(
                                "orthologs_by_splice_variant_",
                                qry_species_name,
                                "_vs_",
                                sbj_species_name,
                                ".csv"
                        )
                
                # import dNdS file
                dNdS_tbl <-
                        read.dnds.tbl(dNdS_file)
                
                # check if annotation file is corrupt and remove outliers
                annotation_file_query <-
                        check_annotation(annotation_file_query, remove_annotation_outliers = TRUE)
                
                annotation_file_subject <- check_annotation(annotation_file_subject, remove_annotation_outliers = TRUE)
                
                # select orthologs by gene_locus
                select_orthologs_by_gene_locus <- select_orthologs(
                        dnds_tbl = dNdS_tbl,
                        annotation_file_query = annotation_file_query,
                        annotation_file_subject = annotation_file_subject,
                        collapse_by = "gene_locus",
                        format = format
                )
                
                # select orthologs by splice_variant
                select_orthologs_splice_variant <- select_orthologs(
                        dnds_tbl = dNdS_tbl,
                        annotation_file_query = annotation_file_query,
                        annotation_file_subject = annotation_file_subject,
                        collapse_by = "splice_variant",
                        format = format
                )
                
                if (!file.exists(output_folder))
                        dir.create(output_folder)
                
                by_gene_output_folder <- file.path(output_folder, paste0("orthologs_by_gene_locus_qry_", stringr::str_to_lower(qry_species_name)))
                by_splice_variant_output_folder <- file.path(output_folder, paste0("orthologs_by_splice_variant_qry_", stringr::str_to_lower(qry_species_name)))
                
                message("\n")
                message(
                        "Store ortholog by gene locus table at '",
                        file.path(by_gene_output_folder, select_orthologs_by_gene_locus_output_name),
                        "'."
                )
                
                
                if (!dir.exists(by_gene_output_folder))
                        dir.create(by_gene_output_folder)
                
                if (!dir.exists(by_splice_variant_output_folder))
                        dir.create(by_splice_variant_output_folder)
                
                
                readr::write_delim(
                        select_orthologs_by_gene_locus,
                        file.path(by_gene_output_folder, select_orthologs_by_gene_locus_output_name),
                        col_names = TRUE,
                        delim = ";"
                )
                
                message("\n")
                message(
                        "Store ortholog by splice variant table at '",
                        file.path(by_splice_variant_output_folder, select_orthologs_by_splice_variant_output_name),
                        "'."
                )
                readr::write_delim(
                        select_orthologs_splice_variant,
                        file.path(by_splice_variant_output_folder, select_orthologs_by_splice_variant_output_name),
                        col_names = TRUE,
                        delim = ";"
                )
                
                if (output_type == "gene_locus")
                        return(select_orthologs_by_gene_locus)
                if (output_type == "splice_variant")
                        return(select_orthologs_splice_variant)
        }
drostlab/orthologr documentation built on Oct. 13, 2023, 3:26 a.m.