R/internal_import_dnds_across_multiple_species.R

Defines functions internal_import_dnds_across_multiple_species

Documented in internal_import_dnds_across_multiple_species

#' @title Generate ortholog tables by gene locus and splice varaint
#' @description Given an input dNdS table generated by \code{\link{dNdS}} and
#' annotation files for the query and subject species in \code{gtf} or \code{gff}
#' file format, this function selects the best BLAST hit to represent either a gene locus (e.g. the splice variant of the gene locus with lowest e-value) or the best BLAST hit for a splice varaint.
#' @param dnds_output_folder file path to a \code{dNdS_tbl} generated with \code{\link{dNdS}} and
#' stored conform with \code{\link{read.dnds.tbl}}.
#' @param annotation_file_query file path to the annotation file of the query species in \code{gtf} or \code{gff} file format.
#' @param annotation_file_subject file path to the annotation file of the subject species in \code{gtf} or \code{gff} file format.
#' @param output_folder file path to a folder in which orthologs tables shall be stored.
#' @param output_type type of ortholog table that shall be printed out (or stored in a variable).
#' Available options are:
#' \itemize{
#' \item \code{output_type = "gene_locus"} (default): find for each gene locus a representative splice variant that
#' maximizes the sequence homology (in terms of smallest e-value and longest splice variant in case of same evalue)
#' to the subject gene locus and its representative splice variant. The output table contains only once representative splice variant per gene locus.
#' \item \code{output_type = "splice_variant"}: for each homologous gene locus determine for each splice variant their
#' respective splice variant homolog. he output table contains several splice variants and their homologous splice variants per gene locus.
#' }
#' @param format a vector of length 2 storing the annotation file formats of the query annotation file and subject annotation file: either \code{gtf} or \code{gff} format. E.g. \code{format = c("gtf","gtf")}.
#' @author Hajk-Georg Drost
#' @export
internal_import_dnds_across_multiple_species <-
  function(dnds_output_folder,
           annotation_file_query,
           annotation_file_subject,
           output_folder = getwd(),
           output_type = "gene_locus",
           format) {

    if (!file.exists(dnds_output_folder))
      stop("Please provide a valid path to the dnds_output_folder.", call. = FALSE)

    if (!file.exists(annotation_file_query))
      stop("Please provide a valid path to the annotation_file_query.", call. = FALSE)

    if (!file.exists(annotation_file_subject))
      stop("Please provide a valid path to the annotation_file_subject.", call. = FALSE)

    if (!file.exists(output_folder))
      stop("Please provide a valid path to the output_folder.", call. = FALSE)

    if (!is.element(output_type, c("gene_locus", "splice_variant")))
      stop("Please specify an output_type that is supported by this function.", call. = FALSE)


    message("Generating orthologs table for ", basename(annotation_file_query), " vs ", basename(annotation_file_subject), " ...")

    qry_species_name <-
      unlist(stringr::str_split(basename(annotation_file_query), "[.]"))[1]
    sbj_species_name <-
      unlist(stringr::str_split(basename(annotation_file_subject), "[.]"))[1]

    select_orthologs_by_gene_locus_output_name <-
      paste0("orthologs_by_gene_",
             qry_species_name,
             "_vs_",
             sbj_species_name,
             ".csv")

    select_orthologs_by_splice_variant_output_name <-
      paste0(
        "orthologs_by_splice_variant_",
        qry_species_name,
        "_vs_",
        sbj_species_name,
        ".csv"
      )

    # import dNdS file
    dNdS_tbl <- import_dnds_tbl(dnds_output_folder)

    # check if annotation file is corrupt and remove outliers
    annotation_file_query <-
      check_annotation_file_completeness(annotation_file_query, remove_annotation_outliers = TRUE)

    annotation_file_subject <- check_annotation_file_completeness(annotation_file_subject, remove_annotation_outliers = TRUE)

    # select orthologs by gene_locus
    select_orthologs_by_gene_locus <- select_orthologs(
      dnds_tbl = dnds_output_folder,
      annotation_file_query = annotation_file_query,
      annotation_file_subject = annotation_file_subject,
      collapse_by = "gene_locus",
      format = format
    )

    # select orthologs by splice_variant
    select_orthologs_splice_variant <- select_orthologs(
      dnds_tbl = dNdS_tbl,
      annotation_file_query = annotation_file_query,
      annotation_file_subject = annotation_file_subject,
      collapse_by = "splice_variant",
      format = format
    )

    if (!file.exists(output_folder))
      dir.create(output_folder)

    by_gene_output_folder <- file.path(output_folder, paste0("orthologs_by_gene_locus_qry_", stringr::str_to_lower(qry_species_name)))
    by_splice_variant_output_folder <- file.path(output_folder, paste0("orthologs_by_splice_variant_qry_", stringr::str_to_lower(qry_species_name)))

    message("\n")
    message(
      "Store ortholog by gene locus table at '",
      file.path(by_gene_output_folder, select_orthologs_by_gene_locus_output_name),
      "'."
    )


    if (!dir.exists(by_gene_output_folder))
      dir.create(by_gene_output_folder)

    if (!dir.exists(by_splice_variant_output_folder))
      dir.create(by_splice_variant_output_folder)


    readr::write_delim(
      select_orthologs_by_gene_locus,
      file.path(by_gene_output_folder, select_orthologs_by_gene_locus_output_name),
      col_names = TRUE,
      delim = ";"
    )

    message("\n")
    message(
      "Store ortholog by splice variant table at '",
      file.path(by_splice_variant_output_folder, select_orthologs_by_splice_variant_output_name),
      "'."
    )
    readr::write_delim(
      select_orthologs_splice_variant,
      file.path(by_splice_variant_output_folder, select_orthologs_by_splice_variant_output_name),
      col_names = TRUE,
      delim = ";"
    )

    if (output_type == "gene_locus")
      return(select_orthologs_by_gene_locus)
    if (output_type == "splice_variant")
      return(select_orthologs_splice_variant)
  }
drostlab/homologr documentation built on Sept. 28, 2020, 12:44 a.m.