R/select_best_splice_Variants.R

Defines functions select_best_splice_variants

Documented in select_best_splice_variants

#' @title Select best splice variant representing orthologous gene loci
#' @description This function selects orthologs based on either gene locus or splice variant.
#' @param dnds_tbl a tibble returned by \code{\link{dnds}}.
#' @param annotation_file_query file path to query annotation file in either \code{gtf} or \code{gff} format.
#' @param annotation_file_subject file path to query annotation file in either \code{gtf} or \code{gff} format.
#' @param collapse_by locus type by which orthologs should be determined. Options are:
#' \itemize{
#' \item \code{type = "gene_locus"}: the splice variant having the lowest e-value of the BLAST search is chosen to represent the gene locus for the orthology relationship.
#' \item \code{type = "splice_variant"}: all splice variants of homologous gene loci will be returned.
#' }
#' @param format a vector of length 2 storing the annotation file formats of the query annotation file and subject annotation file: either \code{gtf} or \code{gff} format. E.g. \code{format = c("gtf","gtf")}.
#' @seealso \code{\link{dnds}}, \code{\link{internal_import_dnds_across_multiple_species}}, \code{\link{import_dnds_across_multiple_species}},
#' \code{\link{dnds_across_multiple_species}}
#' @author Hajk-Georg Drost
#' @export
select_best_splice_variants <-
  function(dnds_tbl,
           annotation_file_query,
           annotation_file_subject,
           collapse_by = "gene_locus",
           format = c("gtf", "gtf")) {
    if (!is.element(collapse_by, c("gene_locus", "splice_variant")))
      stop(
        "Please specify a collapse method provided by this function: collapse_by = 'gene_locus' or collapse_by = 'splice_variant'.",
        call. = FALSE
      )

    if (!all(is.element(format, c("gtf", "gff"))))
      stop(
        "Please provide a vector of length 2 that contains available formats: either 'gtf' or 'gff'.",
        call. = FALSE
      )

    if (length(format) != 2)
      stop(
        "Please provide a vector of length 2 to specify the query and subject annotation file formats. E.g. format = c('gtf', 'gtf').",
        call. = FALSE
      )

    if (!file.exists(annotation_file_query))
      stop(
        "The file '",
        annotation_file_query,
        " does not seem to exist. Please check the path to your annotation file.",
        call. = FALSE
      )

    if (!file.exists(annotation_file_subject))
      stop(
        "The file '",
        annotation_file_subject,
        " does not seem to exist. Please check the path to your annotation file.",
        call. = FALSE
      )

    '.' <- NULL

    temp_store <-
      file.path(tempdir(),
                paste0(
                  "final_join_dnds_annotation_tbl_",
                  basename(annotation_file_query),
                  "_vs_",
                  basename(annotation_file_subject)
                ))

    if (file.exists(temp_store)) {
      message("Start ortholog selection process by ",
              collapse_by,
              " ...")

      message("Import pre-computed file final_join_dnds_annotation_tbl ...")
      final_join_dnds_annotation_tbl <-
        readRDS(temp_store)

    } else {
      message("Start ortholog selection process by ",
              collapse_by,
              " ...")

      message(
        "Import query annotation file '",
        annotation_file_query,
        "' using format ",
        format[1],
        " ..."
      )

      tryCatch({
        # import annotation for query
        imported_annotation_qry <- tibble::as_tibble(rtracklayer::import(annotation_file_query, format = format[1]))
      }, error = function(e) {stop("In select_best_splice_variants() something went wrong when trying to import rtracklayer::import('",annotation_file_query,"') ... Please make sure that the gff/gtf annotation file is not corrupted.", call. = FALSE)})

      if (nrow(imported_annotation_qry) == 0)
        stop("The annotation file '", annotation_file_query, "' seems to be empty. Please provide a non-empty query annotation file.", call. = FALSE)

      message("Select orthologs in query species ...")

      if (format[1] == "gff") {
        Name <- NULL
        extracted_features_qry <-
          dplyr::do(
            dplyr::group_by(imported_annotation_qry, Name),
            extract_features(. , format = format[1])
          )

        colnames(extracted_features_qry)[2] <-
          "query_gene_locus_id"

        extracted_features_qry <-
          dplyr::select(
            dplyr::ungroup(extracted_features_qry),
            "query_gene_locus_id",
            "query_id"
          )

      }

      if (format[1] == "gtf") {
        gene_id <- NULL
        # extracted_features_qry <-
        #         dplyr::do(
        #                 dplyr::group_by(imported_annotation_qry, gene_id),
        #                 extract_features(. , format = format[1])
        #         )


        select_qry_ids <- unique(imported_annotation_qry[imported_annotation_qry$transcript_id %in% dnds_tbl$query_id,]$gene_id)

        extracted_features_qry <-
          dplyr::do(
            dplyr::group_by(imported_annotation_qry[imported_annotation_qry$gene_id %in% select_qry_ids,],
                            gene_id),
            extract_features(. , format = format[1])
          )

        colnames(extracted_features_qry)[2] <-
          "query_gene_locus_id"

        extracted_features_qry <-
          dplyr::select(
            dplyr::ungroup(extracted_features_qry),
            "query_gene_locus_id",
            "query_id"
          )
      }

      message(
        "Import subject annotation file '",
        annotation_file_subject,
        "' using format ",
        format[2],
        " ..."
      )

      tryCatch({
        imported_annotation_sbj <-
          tibble::as_tibble(
            rtracklayer::import(annotation_file_subject, format = format[2])
          )}, error = function(e) {stop("In select_best_splice_variants() something went wrong when trying to import rtracklayer::import('",annotation_file_subject,"') ... Please make sure that the gff/gtf annotation file is not corrupted.", call. = FALSE)})
      # import annotation for subject

      if (nrow(imported_annotation_sbj) == 0)
        stop("The annotation file '", annotation_file_subject, "' seems to be empty. Please provide a non-empty subject annotation file.", call. = FALSE)

      message("Select orthologs in subject species ...")

      if (format[2] == "gff") {
        Name <- NULL
        extracted_features_sbj <-
          dplyr::do(
            dplyr::group_by(imported_annotation_sbj, Name),
            extract_features(. , format = format[2])
          )
        colnames(extracted_features_sbj)[2] <-
          "subject_gene_locus_id"
        colnames(extracted_features_sbj)[3] <-
          "subject_id"

        extracted_features_sbj <-
          dplyr::select(
            dplyr::ungroup(extracted_features_sbj),
            "subject_gene_locus_id",
            "subject_id"
          )

      }

      if (format[2] == "gtf") {
        gene_id <- NULL
        # extracted_features_sbj <-
        #         dplyr::do(
        #                 dplyr::group_by(imported_annotation_sbj, gene_id),
        #                 extract_features(. , format = format[2])
        #         )

        select_sbj_ids <- unique(imported_annotation_sbj[imported_annotation_sbj$transcript_id %in% dnds_tbl$subject_id,]$gene_id)

        extracted_features_sbj <-
          dplyr::do(
            dplyr::group_by(imported_annotation_sbj[imported_annotation_sbj$gene_id %in% select_sbj_ids,], gene_id),
            extract_features(. , format = format[2])
          )

        colnames(extracted_features_sbj)[2] <-
          "subject_gene_locus_id"
        colnames(extracted_features_sbj)[3] <-
          "subject_id"

        extracted_features_sbj <-
          dplyr::select(
            dplyr::ungroup(extracted_features_sbj),
            "subject_gene_locus_id",
            "subject_id"
          )
      }

      if (nrow(extracted_features_qry) == 0)
        stop("Something went wrong during the feature extraction of 'query_gene_locus_id' ...", call. = FALSE)
      if (nrow(extracted_features_sbj) == 0)
        stop("Something went wrong during the feature extraction of 'subject_gene_locus_id' ...", call. = FALSE)

      message("Join dnds and annotation tables ...")
      # join dnds tbl with query annotation
      joined_dnds_annotation_tbl_qry <-
        dplyr::inner_join(dnds_tbl,
                          extracted_features_qry,
                          by = "query_id")

      if (nrow(joined_dnds_annotation_tbl_qry) == 0)
        stop("When joining the 'dnds_tbl' and the 'extracted_features_qry' table by 'query_id', no entries were retained. Thus, this join created an empty table. Please check what might have gone wrong during the query feature extraction process.", call. = FALSE)

      # join dnds tbl with subject annotation
      joined_dnds_annotation_tbl_sbj <-
        dplyr::inner_join(dnds_tbl,
                          extracted_features_sbj,
                          by = "subject_id")

      if (nrow(joined_dnds_annotation_tbl_sbj) == 0)
        stop("When joining the 'dnds_tbl' and the 'extracted_features_sbj' table by 'query_id', no entries were retained. Thus, this join created an empty table. Please check what might have gone wrong during the subject feature extraction process.", call. = FALSE)
      joined_dnds_annotation_tbl_sbj <-
        dplyr::select(
          joined_dnds_annotation_tbl_sbj,
          "query_id",
          "subject_gene_locus_id",
          "subject_id"
        )

      # join query and subject tables
      final_join_dnds_annotation_tbl <-
        dplyr::inner_join(
          joined_dnds_annotation_tbl_qry,
          joined_dnds_annotation_tbl_sbj,
          by = c("query_id", "subject_id")
        )

      if (nrow(final_join_dnds_annotation_tbl) == 0)
        stop("When joining the 'joined_dnds_annotation_tbl_qry' and the 'joined_dnds_annotation_tbl_sbj' tables by 'query_id' and 'subject_id', no entries were retained. Thus, this join created an empty table. Please check what might have gone wrong during the joining process.", call. = FALSE)

      if (!file.exists(temp_store)) {
        saveRDS(final_join_dnds_annotation_tbl,
                temp_store)
      }
    }

    if (collapse_by == "gene_locus") {
      message(
        "Select splice variant with smallest e-value for each gene locus of query species ..."
      )
      query_gene_locus_id <-
        subject_gene_locus_id <- NULL
      res_qry <-
        dplyr::do(
          dplyr::group_by(
            final_join_dnds_annotation_tbl,
            query_gene_locus_id
          ),
          filter_best_hits(.)
        )

      if (nrow(res_qry) == 0)
        stop("After best splice variant filtering no results were retained. The output was empty ...", call. = FALSE)

      message(
        "Select splice variant with smallest e-value for each gene locus of subject species ..."
      )
      res_sbj <-
        dplyr::ungroup(dplyr::do(
          dplyr::group_by(
            final_join_dnds_annotation_tbl,
            subject_gene_locus_id
          ),
          filter_best_hits(.)
        ))

      if (nrow(res_sbj) == 0)
        stop("After best splice variant filtering no results were retained. The output was empty ...", call. = FALSE)
      res_sbj <-
        dplyr::select(dplyr::ungroup(res_sbj),
                      "query_id",
                      "subject_id")

      res <-
        dplyr::inner_join(dplyr::ungroup(res_qry),
                          res_sbj,
                          by = c("query_id", "subject_id"))


      if (nrow(res) == 0)
        stop("After joining the best splice variants of the query and subject by 'query_id' and 'subject_id' no overlaps were found. The output was empty ...", call. = FALSE)

      if (length(unique(res$query_id)) == length(unique(res$query_gene_locus_id))) {
        message("Good News: ")
        message(
          "Number of unique query_ids matches the number of unique query_gene_locus_ids. Thus, each gene locus has now a representative splice variant. The one having the smallest evalue and longest alignment length if two splice variants have the same evalue."
        )
        message("\n")
      } else {
        message(
          "Please check your input data. It seems that the number of unique query_ids DOES NOT match the number of unique query_gene_locus_ids. Thus, some query gene loci seem to have multiple splice variants as representatives."
        )
        message("\n")
      }

      if (length(unique(res$subject_id)) == length(unique(res$subject_gene_locus_id))) {
        message("Good news:")
        message(
          "Number of unique subject_ids matches the number of unique subject_gene_locus_ids. Thus, each gene locus has now a representative splice variant. The one having the smallest evalue and longest alignment length if two splice variants have the same evalue."
        )
        message("\n")
      } else {
        message(
          "Please check your input data. It seems that the number of unique subject_ids DOES NOT match the number of unique subject_gene_locus_ids Thus, some subject gene loci seem to have multiple splice variants as representatives."
        )
        message("\n")
      }

      if (length(unique(dnds_tbl$query_id)) < length(unique(res$query_id)))
        stop(
          "Something must have gone wrong... The unique number of query_ids in the dNdS_tbl is smaller than the unique number of query_ids in the collapsed output file. Please check your input data.",
          call. = FALSE
        )

    }

    if (collapse_by == "splice_variant") {
      res <- final_join_dnds_annotation_tbl
      query_id <- subject_id <- NULL
      res <- dplyr::distinct(
        res,
        query_id,
        query_gene_locus_id,
        subject_id,
        subject_gene_locus_id,
        .keep_all = TRUE
      )
    }

    qry_species_name <-
      unlist(stringr::str_split(basename(annotation_file_query), "[.]"))[1]
    sbj_species_name <-
      unlist(stringr::str_split(basename(annotation_file_subject), "[.]"))[1]
    res <-
      dplyr::mutate(
        res,
        query_species = rep(qry_species_name, nrow(res)),
        subject_species = rep(sbj_species_name, nrow(res))
      )

    evalue <- bit_score <- perc_identity <- NULL
    s_end <-
      dN <-
      dS <- query_species <- subject_species <- score_raw <- NULL
    res <- dplyr::select(
      res,
      query_species,
      subject_species,
      query_id,
      query_gene_locus_id,
      subject_id,
      subject_gene_locus_id,
      dN,
      dS,
      dNdS,
      evalue,
      bit_score,
      perc_identity:score_raw
    )

    message("... best splice variant selection finished successfully!")
    return(res)
  }
drostlab/homologr documentation built on Sept. 28, 2020, 12:44 a.m.