knitr::opts_chunk$set(echo = TRUE)
library(here) library(tidyverse) library(lazyeval) library(taxadb) library(DBI) library(dbplyr) source(here::here("data-raw", "helper_functions.R"))
Access database
# #personal computer # con <- dbConnect( # drv = RMariaDB::MariaDB(), # dbname = "biotime", # username = "root", # password = "Ka1172353!", # host = "localhost" # ) # # #thelio # con <- DBI::dbConnect(RMariaDB::MariaDB(), # host="mariadb", # user="root", # password="password", # dbname="biotime") # # biotime_sql <- tbl(con, "allrawdata") %>% # collect() %>% # rename_all(.funs = tolower) %>% # select(-id_all_raw_data) # ``` # ```r # data <- biotime_sql %>% # group_by(id_species, plot, year, study_id) %>% # summarise(year_abun = sum(na.omit(abundance)), # year_biomass = sum(na.omit(biomass))) %>% # left_join(tbl(con, "datasets") %>% select(taxa = TAXA, study_id = STUDY_ID) %>% collect()) %>% # left_join(tbl(con, "species") %>% select(id_species = ID_SPECIES, genus_species = GENUS_SPECIES) %>% collect()) %>% # ungroup() # # #should be zero to be appropriately aggregated # #data %>% group_by(study_id, plot, year, id_species) %>% filter(n()>1) %>% View()
Replace NA plots with a plot label (these are studies with no plot subdivision)
# #replace plot NA's that are really studies with only one plot with a plot label # data <- data %>% # mutate(plot = case_when( # is.na(plot) ~ "A", # TRUE ~ plot # ))
#file created by 01_biotime_grid.R load(here::here("data/bt_grid_collate.rda")) data <- bt_grid_collate %>% rename_with(tolower) %>% mutate(species = str_replace(species, "_", " ")) rm(bt_grid_collate)
Check which authority would result in the most matches
##Counts show that OTT is the best authority get_match_counts(data %>% filter(taxa == "Birds"), "genus_species") get_match_counts(data, "genus_species")
OTT is the best authority but doesn't have common names, so we use the next best ITIS
#Getting matches by sci name and common name biotime_ids <- filter_name(unique(data$species), "itis") %>% bind_rows(filter_common(unique(data$species), "itis")) %>% filter(taxonRank == "species") %>% #only want ID's to species, since that's the level of the trait data drop_na(acceptedNameUsageID)
Some names match to multiple acceptedUsage ID's, so we have to manually choose which one we want.
unres <- get_dupe_ids(biotime_ids, "sort") unres
Add the unresolved ID to all the other ids and join with biotime data
biotime_ids <- unres %>% left_join(biotime_ids) %>% #join ID's back with all the columns filter(acceptedNameUsageID %in% c( "ITIS:1026896", #accepted on the ITIS website "ITIS:172921", "ITIS:508923", "ITIS:527684", "ITIS:782604", "ITIS:159807", "ITIS:28034", "ITIS:71868", "ITIS:683103" )) %>% bind_rows(biotime_ids %>% filter(!sort %in% unres$sort)) #join to resolved ID's, excluding the observations we resolved manually biotime_data <- biotime_ids %>% select(id = acceptedNameUsageID,input) %>% distinct() %>% right_join(data, by = c("input" = "species")) %>% rename(sourceName = input) %>% mutate(scientificName = taxadb::get_names(id)) %>% mutate(row_num = row_number())
Some ID's matched to more than one species in biotime, so we need to consolidate those entries so there is only one per ID (by removing the sourceName)
#find the entries for species that match to the same ID as another species dupe_id_entry <- biotime_data %>% group_by(id) %>% filter(n_distinct(sourceName) > 1) biotime_data <- biotime_data %>% filter(!row_num %in% dupe_id_entry$row_num) %>% #setdiff(biotime_data, dupe_id_entry) %>% bind_rows(dupe_id_entry %>% select(-sourceName) %>% group_by(across(c(-row_num, -abundance, -biomass))) %>% summarize(abundance = sum(abundance), biomass = sum(biomass))) %>% select(-row_num)
Do any unmatched species have matches with other providers?
NOT EXECUTED FOR CURRENT ITERATION (9/4/20)
# # #Check for potential matches from synonyms found in the other providers # syn_res <- match_providers(biotime_data, "itis", common = TRUE) # # #some names have multiple matches (same ID but multiple scinames), have to pick one # ## in this case all the duplicates have an accepted and a synonym, but there are some single matches that are just synonyms # ## so we have to find duplicate ID's and then filter # alt_dupes <- syn_res %>% # select(acceptedNameUsageID, input) %>% # distinct() %>% # group_by(input) %>% # filter(n() > 1) # # # #filter duplicates for accepted, and replace old entries # # syn_res <- alt_dupes %>% # # filter(acceptedNameUsageID %in% c( # # ITIS:331277, # # )) %>% # # bind_rows(syn_res %>% # # filter(!acceptedNameUsageID %in% alt_dupes$acceptedNameUsageID)) %>% # # #then finish clean up # # select(id = acceptedNameUsageID, input, scientificName) %>% # # distinct() %>% # # left_join(data %>% select(-species, -genus), by = c("input" = "species"), na_matches = "never") %>% # # rename(sourceName = input) %>% # # ungroup() # # #Just get rid of those with duplicates, deal with them later # add_syn_res <- syn_res %>% # filter(!input %in% alt_dupes$input) %>% #excluding duplicate matched species # select(id = acceptedNameUsageID, sourceName = input, scientificName) %>% # distinct() %>% # group_by(sourceName) %>% # top_n(1, scientificName) %>% #some have multiple sci names for same id, just pick on # left_join(biotime_data%>% select(-c(id, scientificName)), by = "sourceName", na_matches = "never") %>% # ungroup() %>% # distinct() # # biotime_data <- biotime_data %>% # filter(!sourceName %in% add_syn_res$sourceName) %>% # bind_rows(add_syn_res)
#save locally usethis::use_data(biotime_data)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.