knitr::opts_chunk$set(echo = TRUE)
library(here)
library(tidyverse)
library(lazyeval)
library(rdataretriever)
library(taxadb)

source(here::here("data-raw", "helper_functions.R"))

Get mammal and trait data

bird_trait <- read_csv(system.file("extdata", "elton_traits/elton_traits_BirdFuncDat.csv", package = "biodivTS"))
mamm_trait <- read.csv(system.file("extdata", "elton_traits/elton_traits_MammFuncDat.csv", package = "biodivTS")) %>%
  rename(nocturnal = activity_nocturnal, crepuscular = activity_crepuscular, diurnal = activity_diurnal)

Check which authority would result in the most matches

##Counts show that OTT is the best authority for both data sets
get_match_counts(bird_trait, "scientific")
get_match_counts(mamm_trait, "scientific")

Get id's for the Elton trait data

get_trait_ids <- function(data, provider, id, common = TRUE){
  #id <- lazyeval::as_name(id)
  sci_match <-  by_name(unique(data$scientific), provider) %>%
    drop_na(acceptedNameUsageID) %>%
    #get original specid so we can figure out ho w many don't have matches
    left_join(data %>% select_(id, "scientific"), by = c("input" = "scientific")) %>%
    mutate(match_type = "scientific")

  if(common){
    com_names <- data %>% 
      #the id argument should be inplace of specid, but it doesn't work for some reason
      filter(!specid %in% sci_match$specid) %>%
      pull(english) %>%
      unique() %>%
      by_common("itis") %>%
      #mutate(match_type = "common")
      #filter(taxonRank == "species") %>% #only want ID's to species, since that's the level of the trait data
      drop_na(acceptedNameUsageID) %>%
      left_join(bird_trait %>% select_(id, "english"), by = c("input" = "english"))%>%
      mutate(match_type = "common")

    #there are both sci and common names, only want common name 
    return(bind_rows(sci_match, com_names))
  }else{ return(sci_match)}
}

bird_ids <- get_trait_ids(bird_trait, "itis", "specid")
mammal_ids <- get_trait_ids(mamm_trait, "itis", "msw3_id", FALSE)

Are there any unresolved ids for birds? Yes - just one, which is a synonym to two different accepted id's

unres_trait <- get_dupe_ids(bird_ids, "specid") %>%
  left_join(bird_ids)

unres_trait

#resolve to the desired ID (in this case the one with complete hierarchy data)
bird_ids <- unres_trait %>% 
  filter(!is.na(kingdom)) %>%
  bind_rows(bird_ids %>% filter(!sort %in% unres_trait$sort))

What about for mammals? Yes - also a synonym to two different accepted id's. Further research shows that the Callospermophilus saturatus uses the subgenus, so we'll keep the name with the genus

unres_mamm_trait <- get_dupe_ids(mammal_ids, "msw3_id") %>% 
  left_join(mammal_ids)

unres_mamm_trait

mammal_ids <- unres_mamm_trait %>% 
  filter(acceptedNameUsageID == "ITIS:632452") %>%
  bind_rows(mammal_ids %>% filter(!sort %in% unres_mamm_trait$sort))

Join bird ids back to original trait data

#only want one column for old name and one column for new name, these are the old ID columns to remove
col_ex <- c("passnonpass", "iocorder", "blfamilylatin", "blfamilyenglish", "blfamsequid", "taxo")

#join on scientific and common
sci_matches <- bird_ids  %>%
  filter(match_type == "scientific") %>%
  select(id = acceptedNameUsageID, input, scientificName) %>%
  distinct() %>%
  right_join(bird_trait %>% select(.dots = -col_ex), by = c("input" = "scientific"), na_matches = "never") %>%
  #drop_na(id) %>%
  distinct() %>%
  select(-english)

comm_matches <- bird_ids  %>%
  filter(match_type == "common") %>%
  select(id = acceptedNameUsageID, input, scientificName) %>%
  distinct() %>%
  right_join(bird_trait %>% filter(!specid %in% sci_matches$specid) %>% select(.dots = -col_ex), by = c("input" = "english"), na_matches = "never") %>%
  distinct() %>%
  select(-scientific)

#there are more rows after ID data is joined because for species that matched on common names there may be more than one scientificName 
elton_bird <- bind_rows(sci_matches, comm_matches) %>% rename(sourceName = input)

Do the same for mammal trait data

elton_mamm <- mammal_ids %>%
  select(id = acceptedNameUsageID, input, scientificName) %>%
  right_join(mamm_trait %>% select(-mswfamilylatin), by = c("input" = "scientific"), na_matches = "never") %>%
  distinct() %>%
  rename(sourceName = input)

Check against alternative providers, first for mammals

syn_res <- match_providers(elton_mamm, "itis")

#names that resolve to more than one id
alt_dupes <- syn_res %>%
  select(acceptedNameUsageID, input) %>%
  distinct() %>%
  group_by(input) %>%
  filter(n() > 1)

add_syn_res <- syn_res %>% 
  filter(!input %in% alt_dupes$input) %>% #excluding duplicate matched species
  select(id = acceptedNameUsageID, sourceName = input, scientificName) %>%
  distinct() %>%
  group_by(sourceName) %>%
  top_n(1, scientificName) %>%
  left_join(elton_mamm %>% select(-c(id, scientificName)), by = "sourceName", na_matches =  "never") %>%
  ungroup() %>%
  distinct()

elton_mamm <- elton_mamm %>% 
  filter(!sourceName %in% add_syn_res$sourceName) %>%
  bind_rows(add_syn_res)

then birds

syn_res <- match_providers(elton_bird, "itis")

#names that resolve to more than one id
alt_dupes <- syn_res %>%
  select(acceptedNameUsageID, input) %>%
  distinct() %>%
  group_by(input) %>%
  filter(n() > 1)

add_syn_res <- syn_res %>% 
  filter(!input %in% alt_dupes$input) %>% #excluding duplicate matched species
  select(id = acceptedNameUsageID, sourceName = input, scientificName) %>%
  distinct() %>%
  group_by(sourceName) %>%
  top_n(1, scientificName) %>%
  left_join(elton_bird %>% select(-c(id, scientificName)), by = "sourceName", na_matches =  "never") %>%
  ungroup() %>%
  distinct()

elton_bird <- elton_bird %>% 
  filter(!sourceName %in% add_syn_res$sourceName) %>%
  bind_rows(add_syn_res)

Grab only the trait columns we're interested in, standardize column names

elton_bird <- elton_bird %>% 
  select(-specid, -ends_with("source"), -ends_with("enteredby"), -ends_with("certainty"), -ends_with("speclevel"), -ends_with("comment"))

elton_mamm <- elton_mamm %>%
  select(-msw3_id, -forstrat_value, -ends_with("source"), -ends_with("certainty"), -ends_with("speclevel"), -ends_with("comment"))

Check for multiple matches to the same ID

elton_bird <- undupe_ids(elton_bird)

elton_mamm <- undupe_ids(elton_mamm)
#save locally
usethis::use_data(elton_bird, elton_mamm)


karinorman/biodivTS documentation built on Dec. 23, 2024, 10 p.m.