knitr::opts_chunk$set(echo = TRUE)
library(here)
library(tidyverse)
library(lazyeval)
library(taxadb)
library(DBI)
library(dbplyr)

source(here::here("data-raw", "helper_functions.R"))

Get Biotime Occurrence

Option for Getting data from SQL query (not grid filtered)

Access database

# #personal computer
# con <- dbConnect(
#   drv = RMariaDB::MariaDB(), 
#   dbname = "biotime",
#   username = "root",
#   password = "Ka1172353!", 
#   host = "localhost"
# )
# 
# #thelio
# con <- DBI::dbConnect(RMariaDB::MariaDB(), 
#                       host="mariadb", 
#                       user="root", 
#                       password="password", 
#                       dbname="biotime")
# 
# biotime_sql <- tbl(con, "allrawdata") %>% 
#   collect() %>% 
#   rename_all(.funs = tolower) %>%
#   select(-id_all_raw_data)
# ```
# ```r
# data <- biotime_sql %>% 
#   group_by(id_species, plot, year, study_id) %>% 
#   summarise(year_abun = sum(na.omit(abundance)), 
#             year_biomass = sum(na.omit(biomass))) %>%
#   left_join(tbl(con, "datasets") %>% select(taxa = TAXA, study_id = STUDY_ID) %>% collect()) %>%
#   left_join(tbl(con, "species") %>% select(id_species = ID_SPECIES, genus_species = GENUS_SPECIES) %>% collect()) %>%
#   ungroup()
# 
# #should be zero to be appropriately aggregated
# #data %>% group_by(study_id, plot, year, id_species) %>% filter(n()>1) %>% View()

Replace NA plots with a plot label (these are studies with no plot subdivision)

# #replace plot NA's that are really studies with only one plot with a plot label 
# data <- data %>% 
#   mutate(plot = case_when(
#     is.na(plot) ~ "A",
#     TRUE ~ plot
#   ))

Or, from the grid processing script

#file created by 01_biotime_grid.R
load(here::here("data/bt_grid_collate.rda"))

data <- bt_grid_collate %>%
  rename_with(tolower) %>%
  mutate(species = str_replace(species, "_", " "))

rm(bt_grid_collate)

Now get id's

Check which authority would result in the most matches

##Counts show that OTT is the best authority 
get_match_counts(data %>% filter(taxa == "Birds"), "genus_species")
get_match_counts(data, "genus_species")

OTT is the best authority but doesn't have common names, so we use the next best ITIS

#Getting matches by sci name and common name 
biotime_ids <- filter_name(unique(data$species), "itis") %>%
  bind_rows(filter_common(unique(data$species), "itis")) %>%
  filter(taxonRank == "species") %>% #only want ID's to species, since that's the level of the trait data
  drop_na(acceptedNameUsageID)

Some names match to multiple acceptedUsage ID's, so we have to manually choose which one we want.

unres <- get_dupe_ids(biotime_ids, "sort")
unres 

Add the unresolved ID to all the other ids and join with biotime data

biotime_ids <- unres %>%
  left_join(biotime_ids) %>% #join ID's back with all the columns
  filter(acceptedNameUsageID %in% c(
    "ITIS:1026896", #accepted on the ITIS website 
    "ITIS:172921", 
    "ITIS:508923",
    "ITIS:527684", 
    "ITIS:782604", 
    "ITIS:159807", 
    "ITIS:28034",
    "ITIS:71868",
    "ITIS:683103"
  )) %>%
  bind_rows(biotime_ids %>% filter(!sort %in% unres$sort)) #join to resolved ID's, excluding the observations we resolved manually

biotime_data <- biotime_ids %>%
  select(id = acceptedNameUsageID,input) %>%
  distinct() %>%
  right_join(data, by = c("input" = "species")) %>%
  rename(sourceName = input) %>%
  mutate(scientificName = taxadb::get_names(id)) %>%
  mutate(row_num = row_number())

Some ID's matched to more than one species in biotime, so we need to consolidate those entries so there is only one per ID (by removing the sourceName)

#find the entries for species that match to the same ID as another species
dupe_id_entry <- biotime_data %>% 
  group_by(id) %>% 
  filter(n_distinct(sourceName) > 1) 

biotime_data <- biotime_data %>%
  filter(!row_num %in% dupe_id_entry$row_num) %>% #setdiff(biotime_data, dupe_id_entry) %>%
  bind_rows(dupe_id_entry %>% 
              select(-sourceName) %>% 
              group_by(across(c(-row_num, -abundance, -biomass))) %>%
              summarize(abundance = sum(abundance), biomass = sum(biomass))) %>%
  select(-row_num)

Do any unmatched species have matches with other providers?

NOT EXECUTED FOR CURRENT ITERATION (9/4/20)

# 
# #Check for potential matches from synonyms found in the other providers
# syn_res <- match_providers(biotime_data, "itis", common = TRUE)
# 
# #some names have multiple matches (same ID but multiple scinames), have to pick one
# ## in this case all the duplicates have an accepted and a synonym, but there are some single matches that are just synonyms
# ## so we have to find duplicate ID's and then filter
# alt_dupes <- syn_res %>%
#   select(acceptedNameUsageID, input) %>%
#   distinct() %>%
#   group_by(input) %>%
#   filter(n() > 1)
# 
# # #filter duplicates for accepted, and replace old entries 
# # syn_res <- alt_dupes %>% 
# #   filter(acceptedNameUsageID %in% c(
# #     ITIS:331277, 
# #   )) %>% 
# #   bind_rows(syn_res %>% 
# #               filter(!acceptedNameUsageID %in% alt_dupes$acceptedNameUsageID)) %>%
# # #then finish clean up 
# #   select(id = acceptedNameUsageID, input, scientificName) %>% 
# #   distinct() %>%
# #   left_join(data %>% select(-species, -genus), by = c("input" = "species"), na_matches =  "never") %>%
# #   rename(sourceName = input) %>%
# #   ungroup()
# 
# #Just get rid of those with duplicates, deal with them later
# add_syn_res <- syn_res %>% 
#   filter(!input %in% alt_dupes$input) %>% #excluding duplicate matched species
#   select(id = acceptedNameUsageID, sourceName = input, scientificName) %>%
#   distinct() %>%
#   group_by(sourceName) %>%
#   top_n(1, scientificName) %>% #some have multiple sci names for same id, just pick on 
#   left_join(biotime_data%>% select(-c(id, scientificName)), by = "sourceName", na_matches =  "never") %>%
#   ungroup() %>%
#   distinct()
# 
# biotime_data <- biotime_data %>% 
#   filter(!sourceName %in% add_syn_res$sourceName) %>%
#   bind_rows(add_syn_res)
#save locally
usethis::use_data(biotime_data)


karinorman/biodivTS documentation built on Dec. 23, 2024, 10 p.m.