R/homologue_functions.R

Defines functions select_and_filter map_to_ensembl_homologues_with_biomart map_to_homologues_oneway map_to_homologues

Documented in map_to_ensembl_homologues_with_biomart map_to_homologues map_to_homologues_oneway select_and_filter

#' map_to_homologues
#'
#' Takes gene ids from one species and maps them to gene ids in another
#' species. Can convert entrez ids and ensembl ids.
#'
#' Note: This function is used in project `dtg_rnaseq`. But the version there
#' uses dot-separated argument names. The argnames have been converted to
#' underscore-separated. If this function-script is copied back into the
#' `dtg_rnaseq` project you will have to double-check the function-calls.
#'
#' @param        gene_ids      A vector of gene identifiers for species `sp1`
#'   and of type `idtype_sp1`.
#' @param        dataset_sp1   The `biomaRt` dataset that houses the data for
#'   species `sp1`.
#' @param        dataset_sp2   The `biomaRt` dataset that houses the data for
#'   species `sp2`.
#' @param        sp1           The name of species 1 (using ensembl prefixes
#'   like `hsapiens` / `mmusculus`).
#' @param        sp2           The name of species 2 (using ensembl prefixes
#'   like `hsapiens` / `mmusculus`).
#' @param        idtype_sp1    The type of identifier used for species `sp1`;
#'   this should match one of the field / column names in the biomaRt dataset
#'   for species `sp1`.
#' @param        idtype_sp2    The type of identifier used for species `sp2`;
#'   this should match one of the field / column names in the biomaRt dataset
#'   for species `sp2`.
#' @param        one_to_one    Boolean. Should the function only return
#'   one-to-one homology mappings?
#'
#' @return       A `data.frame` with columns `id_sp1` and `id_sp2`.
#'
#' @export
#'

map_to_homologues <- function(gene_ids = character(0),
                              dataset_sp1 = NULL,
                              dataset_sp2 = NULL,
                              sp1 = "hsapiens",
                              sp2 = "mmusculus",
                              idtype_sp1 = "ensembl_gene_id",
                              idtype_sp2 = "ensembl_gene_id",
                              one_to_one = FALSE) {
  stopifnot(idtype_sp1 %in% c("ensembl_gene_id", "entrezgene"))
  stopifnot(idtype_sp2 %in% c("ensembl_gene_id", "entrezgene"))

  # Check the validity of the input gene_ids and return an empty result if
  #   no gene_ids were provided
  #
  if (is.null(gene_ids) || length(gene_ids) == 0) {
    null_results <- data.frame(
      id_sp1 = character(0),
      id_sp2 = character(0),
      stringsAsFactors = FALSE
    )
    return(null_results)
  }
  stopifnot(is.character(gene_ids))

  # Check that the one_to_one argument is TRUE/FALSE
  stopifnot(is.logical(one_to_one) && length(one_to_one) == 1)

  # Map from ids in species 1 to ids in species 2
  forward_map <- map_to_homologues_oneway(
    gene_ids = gene_ids,
    dataset_sp1 = dataset_sp1,
    dataset_sp2 = dataset_sp2,
    sp1 = sp1,
    sp2 = sp2,
    idtype_sp1 = idtype_sp1,
    idtype_sp2 = idtype_sp2
  )

  # If the user wants only homologues that are one:one from one species to the
  # other, we have to compare the mappings from sp1->sp2 and the mappings from
  # sp2->sp1, keeping only the genes in sp1 that map to a single gene in sp2
  # and the genes in sp2 that map to a single gene in sp1
  if (one_to_one) {
    # Make mapping from second species back to first and
    # Filter the forward map based on 1:many maps in both forward and reverse
    # map
    reverse_map <- map_to_homologues_oneway(
      gene_ids = forward_map %>%
        drop_incomplete_cases() %>%
        magrittr::extract2("id_sp2") %>%
        unique() %>%
        sort(),
      dataset_sp1 = dataset_sp2,
      dataset_sp2 = dataset_sp1,
      sp1 = sp2,
      sp2 = sp1,
      idtype_sp1 = idtype_sp2,
      idtype_sp2 = idtype_sp1
    )
    return(
      keep_only_one_to_one_homologues(forward_map, reverse_map)
    )
  } else {
    return(
      forward_map
    )
  }
}

###############################################################################

#' map_to_homologues_oneway
#'
#' Takes gene ids from one species and maps them to gene ids in another
#' species. Can convert entrez ids and ensembl ids
#'
#'  --- Non-exported function ---
#'  --- All inputs should have been validity-checked by map_to_homologues ---
#'
#'  Selects all genes from the biomart dataset for species1 that have
#'    homologues in species 2
#'  1) Map input-ids in species 1 to ensembl gene in species 2
#'  2) Map ensembl gene in species 2 to output-ids in species 2
#'
#'  Map species 1 gene_ids to ensembl-ids in species 2
#'  - Since dataset_sp1 is the mart for species 1, we don't have to pass the
#'  sp1, host or mart.name arguments into map_to_ensembl_homologues...
#'
#' @inheritParams   map_to_homologues
#'
#' @importFrom   dplyr         arrange   bind_rows   filter
#' @importFrom   magrittr      %>%
#' @importFrom   rlang         .data
#'

map_to_homologues_oneway <- function(gene_ids = character(0),
                                     dataset_sp1 = NULL,
                                     dataset_sp2 = NULL,
                                     sp1 = "hsapiens",
                                     sp2 = "mmusculus",
                                     idtype_sp1 = "ensembl_gene_id",
                                     idtype_sp2 = "ensembl_gene_id") {
  sp1_to_sp2_ensembl <- map_to_ensembl_homologues_with_biomart(
    gene_ids = gene_ids,
    dataset_sp1 = dataset_sp1,
    sp2 = sp2,
    idtype_sp1 = idtype_sp1
  )

  # Map species 2 ensembl-ids to species 2 output idtype
  sp2_ensembl_to_output <- if (idtype_sp2 == "ensembl_gene_id") {
    ens_sp2 <- unique(sp1_to_sp2_ensembl[, "ensembl_gene_sp2"])
    data.frame(
      ensembl_gene_id = ens_sp2,
      id_sp2 = ens_sp2,
      stringsAsFactors = FALSE
    )
  } else {
    select_and_filter(
      filters = "ensembl_gene_id",
      values = sp1_to_sp2_ensembl[, "ensembl_gene_sp2"],
      attributes = c("ensembl_gene_id", idtype_sp2),
      mart = dataset_sp2
    ) %>%
      setNames(c("ensembl_gene_id", "id_sp2"))
  }

  # Merge species 1 gene_ids with species 2 gene_ids of type idtype_sp2
  # - removing the intermediate ensembl-ids in species 2
  sp1_to_sp2_initial <- merge(
    x = sp1_to_sp2_ensembl, y = sp2_ensembl_to_output,
    by.x = "ensembl_gene_sp2", by.y = "ensembl_gene_id", all.x = TRUE
  )
  sp1_to_sp2 <- unique(sp1_to_sp2_initial[c("id_sp1", "id_sp2")])

  # Remove any rows where the sp2.entrez is NA AND the sp1.entrez has at least
  # one non-NA value in sp2.entrez
  has_valid <- with(
    sp1_to_sp2,
    id_sp1[which(!is.na(id_sp2))]
  )

  sp1_to_sp2 %>%
    dplyr::filter(.data[["id_sp1"]] %in% has_valid &
      !is.na(.data[["id_sp2"]])) %>%
    dplyr::bind_rows(
      dplyr::filter(sp1_to_sp2, !.data[["id_sp1"]] %in% has_valid)
    ) %>%
    dplyr::arrange(
      .data[["id_sp1"]], .data[["id_sp2"]]
    ) %>%
    as.data.frame(stringsAsFactors = FALSE)
}

###############################################################################

#' map_to_ensembl_homologues_with_biomart
#'
#' A function to map from a set of `gene_ids` in one species (sp1) to the
#'   ensembl-gene id of their homologues in a separate species (sp2). The
#'   mappings are performed using biomart databases.
#'
#' @inheritParams   map_to_homologues
#'
#' @return       A data.frame with columns id_sp1 and ensembl_gene_sp2.
#'
#' @importFrom   magrittr      %>%
#' @importFrom   dplyr         arrange
#' @importFrom   rlang         .data
#'
#' @include      homologue_db.R
#'

map_to_ensembl_homologues_with_biomart <- function(gene_ids = character(0),
                                                   dataset_sp1 = NULL,
                                                   sp2 = "mmusculus",
                                                   idtype_sp1 =
                                                     "ensembl_gene_id") {
  results_empty <- data.frame(
    id_sp1 = character(0),
    ensembl_gene_sp2 = character(0),
    stringsAsFactors = FALSE
  )

  # Check that the idtype is a valid input to the function
  # - it must be either "entrezgene" or "ensembl_gene_id"
  stopifnot(idtype_sp1 %in% c("entrezgene", "ensembl_gene_id"))

  # Check that gene_ids is a vector (it can't be missing)
  # - If gene_ids is empty, return empty results
  # - If gene_ids is NULL or is not a vector, throw an error
  stopifnot(!is.null(gene_ids) && is.vector(gene_ids))
  if (length(gene_ids) == 0) {
    return(results_empty)
  }

  # The homologue column for sp2 in the biomaRt object for sp1 is obtained:
  homologue_field <- get_ensembl_homologue_field(sp2)

  # Checks on dataset_sp1
  # - If it is defined, it must
  #   a) be a Mart; b) have a homologue field for sp2; and c) have a field for
  #   the input idtype_sp1

  # TODO:
  # - move mart-validity checks into select_and_filter

  if (is.null(dataset_sp1)) {
    stop("Mart should be provided as `dataset_sp1`")
  }

  stopifnot(
    is_valid_mart(dataset_sp1, idtype_sp1) &&
      is_valid_mart(dataset_sp1, homologue_field)
  )

  # If the input ids are non-ensembl, they have to be mapped to ensembl-ids
  #   before use in homology searches
  ensembl_ids_sp1 <- if (idtype_sp1 == "ensembl_gene_id") {
    gene_ids
  } else {
    # Map non-ensembl ids for sp1 to ensembl_gene_id
    id_to_ensembl_sp1 <- select_and_filter(
      filters = idtype_sp1, # should be entrezgene as of 2017/03/03
      values = gene_ids,
      attributes = c(idtype_sp1, "ensembl_gene_id"),
      mart = dataset_sp1
    )
    id_to_ensembl_sp1[, "ensembl_gene_id"]
  }

  # Map ensembl_gene_id for sp1 to ensembl_gene_id for sp2
  ensemblmap_sp1_sp2 <- select_and_filter(
    filters = "ensembl_gene_id",
    values = ensembl_ids_sp1,
    attributes = c("ensembl_gene_id", homologue_field),
    mart = dataset_sp1
  )

  # Map input_id for sp1 to ensembl_gene_id for sp2
  id_to_ensembl_sp1_sp2 <- if (idtype_sp1 == "ensembl_gene_id") {
    ensemblmap_sp1_sp2
  } else {
    merge(
      x = id_to_ensembl_sp1,
      y = ensemblmap_sp1_sp2,
      by = "ensembl_gene_id"
    )
  }
  # An entry should be present in the output for every gene that was in the
  #   input; even if if maps to NA
  missing_inputs <- setdiff(
    gene_ids,
    id_to_ensembl_sp1_sp2[, idtype_sp1]
  )

  # Output should be sorted by input id and then by output id, and should have
  #   column names id_sp1 and ensembl_gene_sp2
  data.frame(
    id_sp1 = c(
      id_to_ensembl_sp1_sp2[, idtype_sp1],
      missing_inputs
    ),
    ensembl_gene_sp2 = c(
      id_to_ensembl_sp1_sp2[, homologue_field],
      rep(NA, length(missing_inputs))
    ),
    stringsAsFactors = FALSE
  ) %>%
    dplyr::arrange(
      .data[["id_sp1"]],
      .data[["ensembl_gene_sp2"]]
    )
}

###############################################################################

#' select_and_filter : extracts data from biomaRt, removes missing values
#'
#' @param        filters       Vector of filters for use in a biomaRt select().
#' @param        values        List (if multiple filters used) or vector of
#'   values for use in biomaRt::select().
#' @param        attributes    Vector of field-names for return from a biomaRt
#'   query.
#' @param        mart          A biomaRt `mart` object.
#'
#' @return       A data.frame containing columns for each of the `attributes`
#'   that were selected; and where only complete observations were kept (ie,
#'   there is no missing values in the returned data).
#'
#' @importFrom   biomaRt       getBM
#' @importFrom   stats         na.omit   setNames
#' @importFrom   magrittr      %>%
#'

select_and_filter <- function(filters,
                              values,
                              attributes,
                              mart) {
  # TODO: test this with no input `values` - it throws a warning at present

  # If none of the provided values contain valid (non-NA) values, return an
  #   empty dataframe that has the same column names as the requested
  #   attributes
  # Otherwise, search biomaRt using the values, and filter to keep only the
  #   results that are 'complete' (ie, have no NAs)

  null_result <- attributes %>%
    lapply(
      function(x) character(0)
    ) %>%
    data.frame(
      stringsAsFactors = FALSE
    ) %>%
    setNames(attributes)

  values_filtered <- na.omit(values)

  if (length(values_filtered) == 0) {
    return(null_result)
  }

  bm_result <- biomaRt::getBM(
    filters = filters,
    values = values_filtered,
    attributes = attributes,
    mart = mart
  ) %>%
    lapply(
      function(x) {
        x <- as.character(x)
        x[x == ""] <- NA
        x
      }
    ) %>%
    as.data.frame(stringsAsFactors = FALSE) %>%
    na.omit()

  bm_result
}

###############################################################################
russHyde/homologiser documentation built on May 19, 2020, 8:20 p.m.