R/xref_species.R

Defines functions xref_gidb_species xref_ah_species

Documented in xref_ah_species xref_gidb_species

#' Cross reference the taxonomy data from AnnotationHubData::getSpeciesList()
#'
#' Previously, the logic of this function resided in download_eupath_metadata(),
#' but I want to be able to test and poke at it separately to more effectively
#' ensure as many taxa as possible pass.  Therefore, I split it into its own
#' function.
#'
#' Three candidate names are tried against ah_species, from most to least
#' specific: the unmodified taxon name, the GenomeInfoDb derived genus/species,
#' and finally the genus/species generated from the metadata.  The first
#' candidate found in ah_species is returned.
#'
#' @param metadatum Metadata for a single entry; anything which supports `[[`
#'  with the column names below (e.g. a named list or a one-row data frame).
#' @param ah_species Vector of species names to match against, e.g. the result
#'  of AnnotationHubData::getSpeciesList().
#' @param verbose Print some information about what is found?
#' @param metadata_taxon_column metadata column with the taxonomy information.
#' @param metadata_species_column metadata column with my generated species name.
#' @param gidb_species_column metadata column with the genus/species taken
#'  from GenomeInfoDb.
#' @param xref_column Currently unused; kept so that existing calls which pass
#'  it continue to work.
#' @return List with a 'status' element and, when something matched, an 'ID'
#'  element containing the matched name.  The status is one of 'exact_taxon',
#'  'exact_gidb_species', 'exact_genus_species', or 'unexpected_gidb' when
#'  nothing matched (in which case there is no 'ID' element).
xref_ah_species <- function(metadatum, ah_species, verbose = FALSE,
                            metadata_taxon_column = "TaxonUnmodified",
                            metadata_species_column = "GenusSpecies",
                            gidb_species_column = "GIDB_Genus_Species",
                            xref_column = "TaxonXref") {

  ## In the ideal world, the matched name would always be identical to the
  ## taxonomy names provided by the EuPathDB.  This is not an ideal world, so
  ## some of them will get filled in with just the species names.
  ## If we cannot match even that, then the entry is reported as unmatched so
  ## that the caller may set it aside for future examination.
  retlist <- list(
    "ID" = NULL,
    "status" = "unmatched")

  ## isTRUE() is used throughout so that a missing column (NULL %in% x is
  ## logical(0)) is treated as 'no match' rather than an error.
  initial_valid <- metadatum[[metadata_taxon_column]] %in% ah_species
  if (isTRUE(initial_valid)) {
    retlist[["ID"]] <- metadatum[[metadata_taxon_column]]
    retlist[["status"]] <- "exact_taxon"
    return(retlist)
  }

  ## We have previously filled in the 'Species' column with information from GenomeInfoDb.
  ## So let us query that now and see if we pick up more valid entries.
  gidb_valid <- metadatum[[gidb_species_column]] %in% ah_species
  if (isTRUE(gidb_valid)) {
    if (isTRUE(verbose)) {
      message("Found a match between the GIDB genus/species and the AH species list.")
    }
    retlist[["ID"]] <- metadatum[[gidb_species_column]]
    retlist[["status"]] <- "exact_gidb_species"
    return(retlist)
  }

  ## If we get here, see if AH has genus/species.
  ## This step used to sit after an if/else which always returned and so
  ## could never run.
  species_valid <- metadatum[[metadata_species_column]] %in% ah_species
  if (isTRUE(species_valid)) {
    if (isTRUE(verbose)) {
      message("Found a match between the metadata genus/species and the AH species list.")
    }
    retlist[["ID"]] <- metadatum[[metadata_species_column]]
    retlist[["status"]] <- "exact_genus_species"
    return(retlist)
  }

  ## Nothing matched.  Assigning NULL drops the 'ID' element, which is what
  ## was returned for unmatched entries before.
  retlist[["ID"]] <- NULL
  retlist[["status"]] <- "unexpected_gidb"
  retlist
}

#' Cross reference information from the taxonDB vs. the downloaded metadata.
#'
#' Look up the taxonomy ID of one metadata entry in the 'tax_id' column of
#' all_taxa_ids and, when it is found, build a "genus species" name from the
#' 'genus' and 'species' columns of the matching row.
#'
#' @param metadatum The downloaded metadata for an individual species.
#' @param all_taxa_ids The taxonDB result; it must provide the columns
#'  'tax_id', 'genus', and 'species'.
#' @param taxon_number_column The metadatum column which should contain the relevant number.
#' @param verbose Be a chatty catty?  Currently unused by this function.
#' @return List with a 'status' element and usually an 'ID' element:
#'  'unmatched' when the metadatum has no usable taxonomy ID (ID is whatever
#'  was in the metadatum), 'mismatched' when the ID is not in all_taxa_ids
#'  (no ID element), and 'exact_match' or 'multi_match' when one or several
#'  rows matched (ID is the "genus species" of the first matching row).
xref_gidb_species <- function(metadatum, all_taxa_ids,
                              taxon_number_column = "TaxonomyID",
                              verbose = FALSE) {
  id <- metadatum[[taxon_number_column]]
  retlist <- list(
    "ID" = id,
    "status" = "unmatched")
  ## A missing column gives NULL and an empty entry gives a zero-length vector;
  ## neither may be handed to is.na() inside if(), so check the length first.
  if (length(id) != 1 || is.na(id)) {
    return(retlist)
  }

  ## which() discards NA comparisons, so NA entries in tax_id cannot poison
  ## the number of matches.
  hits <- which(all_taxa_ids[["tax_id"]] == id)
  n_hits <- length(hits)
  if (n_hits == 0) {
    ## Assigning NULL drops the 'ID' element from the result.
    retlist[["ID"]] <- NULL
    retlist[["status"]] <- "mismatched"
    return(retlist)
  }

  if (n_hits > 1) {
    message("Found more than one taxonomy ID match, returning the first match.")
    retlist[["status"]] <- "multi_match"
  } else {
    retlist[["status"]] <- "exact_match"
  }
  ## Always take the first *matching* row, not the first row of the table.
  first_hit <- hits[[1]]
  retlist[["ID"]] <- paste0(all_taxa_ids[["genus"]][first_hit], " ",
                            all_taxa_ids[["species"]][first_hit])
  retlist
}
khughitt/EuPathDB documentation built on Nov. 4, 2023, 4:19 a.m.