R/resolve_taxnomy.R

Defines functions resolve_taxonomy

# Resolve the best available identification for every beetle record.
#
# Updates `scientificName`, `taxonID`, `taxonRank` and `morphospeciesID` using
# assignments from parataxonomy and expert taxonomy. For each of those four
# columns the value is chosen in increasing order of precedence:
#   1. the value recorded in the sorting table;
#   2. the parataxonomist value, but only for rows whose `sampleType` is
#      "other carabid" and only when that value is not missing;
#   3. the expert taxonomist value, whenever it is not missing.
# `nativeStatusCode` and `sampleCondition` take the expert value when present
# and otherwise keep the value already on the joined sorting/para row.
#
# Arguments:
#   sorting  Data frame of sorted subsamples. The code reads `subsampleID`,
#            `sampleType`, `individualCount`, `nativeStatusCode`,
#            `sampleCondition` and the four taxonomy columns from it.
#   para     Data frame of parataxonomist identifications. Only `subsampleID`,
#            `individualID` and the four taxonomy columns are used.
#   expert   Data frame of expert identifications keyed by `individualID`.
#            It must contain `uid`, `namedLocation`, `domainID`, `siteID`,
#            `collectDate`, `plotID` and `setDate` (they are dropped before
#            joining), and the joined result must provide a `family` column.
#
# Returns:
#   One row per distinct joined record whose `sampleType` contains "carabid"
#   and whose `family` is "Carabidae" or missing. Every column carrying a
#   ".x"/".y" join suffix and `individualCount` are removed; a `morphospecies`
#   column is added.
#
# Note: `%>%` is not namespace-qualified here, so it has to be available in
# the environment this function is defined in.
resolve_taxonomy <- function(sorting, para, expert){

  # Take `primary` where it is present, otherwise fall back to `fallback`.
  prefer <- function(primary, fallback) {
    ifelse(is.na(primary), fallback, primary)
  }

  # Take the parataxonomist value only for "other carabid" rows that have one.
  # `%in%` never returns NA, so a missing `sampleType` falls back to the
  # sorting value instead of turning the result into NA.
  prefer_para <- function(para_value, sorting_value, sample_type) {
    ifelse(sample_type %in% "other carabid" & !is.na(para_value),
           para_value,
           sorting_value)
  }

  para_ids <- dplyr::select(para, subsampleID, individualID, scientificName,
                            taxonRank, taxonID, morphospeciesID)

  ## why are there so many other shared columns (siteID, collectDate, etc?)
  ## and why don't they match!?  We drop them from `expert` to avoid these.
  expert_ids <- dplyr::select(expert, -uid, -namedLocation, -domainID, -siteID,
                              -collectDate, -plotID, -setDate)

  # After the first join the sorting columns carry ".x" and the para columns
  # ".y"; the expert taxonomy columns then arrive without a suffix.
  taxonomy <- sorting %>%
    dplyr::left_join(para_ids, by = "subsampleID") %>%
    dplyr::left_join(expert_ids, by = "individualID") %>%
    dplyr::distinct() %>%
    ## Prefer the para table cols over the sorting table cols only for
    ## sampleType == "other carabid"
    dplyr::mutate(
      taxonRank.x = prefer_para(taxonRank.y, taxonRank.x, sampleType),
      scientificName.x = prefer_para(scientificName.y, scientificName.x,
                                     sampleType),
      taxonID.x = prefer_para(taxonID.y, taxonID.x, sampleType),
      morphospeciesID.x = prefer_para(morphospeciesID.y, morphospeciesID.x,
                                      sampleType)
    ) %>%
    ## Prefer expert values where available
    dplyr::mutate(
      taxonRank = prefer(taxonRank, taxonRank.x),
      scientificName = prefer(scientificName, scientificName.x),
      taxonID = prefer(taxonID, taxonID.x),
      morphospeciesID = prefer(morphospeciesID, morphospeciesID.x),
      nativeStatusCode = prefer(nativeStatusCode.y, nativeStatusCode.x),
      sampleCondition = prefer(sampleCondition.y, sampleCondition.x)
    ) %>%
    dplyr::select(-dplyr::ends_with(".x"), -dplyr::ends_with(".y"),
                  -individualCount)
  ## individualCount could now be misleading, because it is tied to
  ## subsampleID, but subsampleID is repeated for each individualID.
  ## Most of the time the individuals of a subsample all share the same expert
  ## ID, but not always. In cases where the subsample is split into separate
  ## taxa by experts, the individualCount must also be split, and there is no
  ## certain way to split the part of the subsample that was not pinned.
  ## For computing richness alone, we do not need individualCount anyway.

  #### Should we add a "species" column, using morphospecies or the best
  #### available?
  ## Use morphospecies if available for higher-rank-only classifications,
  ## otherwise binomialize the scientific name:
  higher_ranks <- c("subgenus", "genus", "family", "order")
  taxonomy <- taxonomy %>%
    dplyr::mutate(
      morphospecies = ifelse(
        taxonRank %in% higher_ranks & !is.na(morphospeciesID),
        morphospeciesID,
        taxadb::clean_names(scientificName)
      )
    )

  ## Beetles must be identified as carabids by both the sorting table and the
  ## taxonomists (~3 non-Carabidae slip through in sorting)
  taxonomy %>%
    dplyr::filter(grepl("carabid", sampleType),
                  family == "Carabidae" | is.na(family))
}
eco4cast/NEONeco4cast documentation built on Aug. 30, 2020, 12:03 a.m.