## code to prepare `EMBSeCBIO` dataset goes here
`%>%` <- magrittr::`%>%`
# all_taxa <- readr::read_csv("inst/extdata/all_taxa.csv") %>%
# dplyr::select(1:3)
# embsecbio_taxa <- readr::read_csv("inst/extdata/embsecbio_taxa.csv")
# mpd_taxa <- readr::read_csv("inst/extdata/empdv2_taxa.csv")
# ref_taxa <- embsecbio_taxa %>%
# dplyr::bind_rows(all_taxa) %>%
# dplyr::distinct(taxon_name, .keep_all = TRUE)
# ref_taxa <- readr::read_csv("inst/extdata/taxa_clean.csv")
ref_taxa <- smpds::clean_taxa()
# EMBSeCBIO records in the SMPDSv1
smpdsv1_embsecbio <- smpds::SMPDSv1 %>%
dplyr::filter(toupper(source) %>% stringr::str_detect("EMB|EMBS")) %>%
smpds::total_taxa(1:14)
# embsecbio::site %>%
# dplyr::filter(site_name %in% smpdsv1_embsecbio$site_name)
# Esmeralda's updates to the EMBSeCBIO
ID_ENTITY_ECS <- c(
1157, # Updated depths and age errors,
8, 479, 1062, 1104, 1111, 1114, 1322, 1323, 1324, 1330, 1338, 1346, 1351, 1354, 1359, 1413, 1825, 1828, 2063, 2065, 2066, 2067, 2069, 2070, 2071, 2073, # New age models
479,1067,1114,1354,1323,1324, # Added missing dated ages
1827, # Updated depth units
2105, # Update comments
2, 3, 4, 5, 6, 7, 9, 10, 11, 1053, 1059, 1074, 1091, 1107, 1109, 1110, 1113, 1115, 1118, 1132, 1148, 1157, 1158, 1342, 1361, 1411, 1412, # New age models
2074, 2075, 2076, 2077, 2079, 2080, # New age models
1115, # Update dates and age model
2115, # New record
1688, # avg_depth updated
1053, # depths units changed
1505, # Added new reference
1320, # Added new reference
1114, # Added new reference
1694, # Added new reference
1713, # New dating information added
1103, # Uploaded new age model
1062, # Age model reviewed
1354, # Possible duplicated sampling depth identified
479, # Possible duplicated sampling depth identified
1154, # Comment removed
1108,
1052,
1713,
1692,
1506, # Updated ages
2116:2124 # New entities
) %>%
sort() %>%
unique()
embsecbio_metadata <-
readr::read_csv("inst/extdata/embsecbio_metadata.csv") #%>%
# dplyr::filter(ID_ENTITY >= 2115)
# dplyr::group_by(entity_name) %>%
# dplyr::mutate(n = length(ID_SAMPLE),
# entity_name = ifelse(n > 1,
# paste0(entity_name, "_", seq_along(n)),
# entity_name)) %>%
# dplyr::select(-n)
embsecbio_counts <-
readr::read_csv("inst/extdata/embsecbio_counts.csv") %>%
dplyr::mutate(taxon_name = taxon_name %>%
stringr::str_remove_all("undiff\\.") %>%
stringr::str_squish()) %>%
dplyr::filter(ID_SAMPLE %in% embsecbio_metadata$ID_SAMPLE)
embsecbio_pubs <-
readr::read_csv("inst/extdata/embsecbio_pubs.csv") %>%
dplyr::filter(ID_ENTITY %in% embsecbio_metadata$ID_ENTITY) %>%
dplyr::group_by(ID_ENTITY) %>%
dplyr::mutate(publication = citation %>%
unique() %>%
sort() %>%
paste0(collapse = ";\n")) %>%
dplyr::ungroup() %>%
dplyr::distinct(ID_ENTITY, .keep_all = TRUE) %>%
dplyr::select(-ID_ENTITY_PUB,
-ID_PUB,
-citation)
embsecbio_unused <- embsecbio_metadata %>%
dplyr::filter(age_BP < -72)
embsecbio_metadata2 <- embsecbio_metadata %>%
dplyr::filter(ID_ENTITY >= 2115)
# dplyr::filter(ID_ENTITY %in% ID_ENTITY_ECS)
# dplyr::filter(ID_SAMPLE %in% embsecbio_counts$ID_SAMPLE)
embsecbio_counts2 <- embsecbio_counts %>%
dplyr::filter(ID_SAMPLE %in% embsecbio_metadata2$ID_SAMPLE) %>%
dplyr::distinct(.keep_all = TRUE)
embsecbio_counts_missing_taxa <- embsecbio_counts2 %>%
dplyr::left_join(ref_taxa, by = "taxon_name") %>%
dplyr::filter(is.na(action))
# embsecbio_missing_taxa <- embsecbio_counts_missing_taxa$taxon_name %>%
# unique() %>%
# sort()
# tibble::tibble(taxon_name = embsecbio_counts_missing_taxa$taxon_name %>%
# unique() %>%
# sort(),
# clean_name = taxon_name) %>%
# readr::write_csv("~/Downloads/SMPDSv2/EMBSeCBIO_taxon_names_2020-08-31.csv", na = "")
embsecbio_counts3 <- embsecbio_counts2 %>%
dplyr::left_join(ref_taxa, by = "taxon_name") %>%
dplyr::filter(!is.na(action), action != "delete") %>%
dplyr::select(-action) %>%
dplyr::rename(taxon_name_original = taxon_name,
taxon_name = clean_name) %>%
# dplyr::group_by(entity_name, taxon_name, age_BP) %>%
dplyr::group_by(ID_SAMPLE, taxon_name) %>%
dplyr::mutate(n = length(taxon_count),
taxon_count = sum(as.double(taxon_count), na.rm = TRUE)) %>%
dplyr::ungroup() %>%
dplyr::mutate(taxon_name = ifelse(is.na(taxon_name),
taxon_name_original,
taxon_name)) %>%
dplyr::distinct(ID_SAMPLE, taxon_name, .keep_all = TRUE) %>%
# dplyr::distinct(entity_name, taxon_name, age_BP, .keep_all = TRUE) %>%
dplyr::select(-taxon_name_original, -n) %>%
dplyr::left_join(embsecbio_pubs,
by = "ID_ENTITY") %>%
dplyr::relocate(publication,
.before = taxon_count)
embsecbio_counts_wide <- embsecbio_counts3 %>%
tidyr::pivot_wider(1:16,
names_from = "taxon_name",
values_from = "taxon_count") %>%
smpds::sort_taxa(1:16) %>%
smpds::rm_na_taxa(1:16) %>%
dplyr::arrange(entity_name) %>%
dplyr::group_by(entity_name) %>%
dplyr::mutate(n = length(entity_name),
entity_name2 = ifelse(n > 1,
paste0(entity_name, "_", seq_along(n)),
entity_name),
.before = latitude) %>%
dplyr::ungroup() %>%
dplyr::select(-n) %>%
smpds::parallel_extract_biome(cpus = 2) %>%
progressr::with_progress()
EMBSeCBIO <- embsecbio_counts_wide %>%
dplyr::select(-ID_SITE,
-ID_ENTITY,
-ID_SAMPLE,
-entity_name,
-sample_type,
-avg_depth) %>%
dplyr::relocate(ID_BIOME, .before = age_BP) %>%
dplyr::rename(entity_name = entity_name2) %>%
dplyr::mutate(source = ifelse(is.na(source), "EMBSeCBIO", source)) %>%
dplyr::relocate(source, .before = 1)
usethis::use_data(EMBSeCBIO, overwrite = TRUE)
# ---- Comparisons -------------------------------------------------------------
# Find records in the SMPDSv1 from the EMBSeCBIO
count_decimals <- function(x) {
as.character(x) %>%
purrr::map_dbl(~.x %>% stringr::str_split_fixed("\\.", 2) %>% .[2] %>% nchar())
}
# Esmeralda's updates to the EMBSeCBIO
ID_ENTITY <- c(
1157, # Updated depths and age errors,
8, 479, 1062, 1104, 1111, 1114, 1322, 1323, 1324, 1330, 1338, 1346, 1351, 1354, 1359, 1413, 1825, 1828, 2063, 2065, 2066, 2067, 2069, 2070, 2071, 2073, # New age models
479,1067,1114,1354,1323,1324, # Added missing dated ages
1827, # Updated depth units
2105, # Update comments
2, 3, 4, 5, 6, 7, 9, 10, 11, 1053, 1059, 1074, 1091, 1107, 1109, 1110, 1113, 1115, 1118, 1132, 1148, 1157, 1158, 1342, 1361, 1411, 1412, # New age models
2074, 2075, 2076, 2077, 2079, 2080, # New age models
1115, # Update dates and age model
2115, # New record
1688, # avg_depth updated
1053, # depths units changed
1505, # Added new reference
1320, # Added new reference
1114, # Added new reference
1694, # Added new reference
1713, # New dating information added
1103, # Uploaded new age model
1062, # Age model reviewed
1354, # Possible duplicated sampling depth identified
479, # Possible duplicated sampling depth identified
1154, # Comment removed
1108,
1052,
1713,
1692,
1506, # Updated ages
2116:2124 # New entities
) %>%
sort() %>%
unique()
embsecbio_metadata_ECS <- embsecbio::entity %>% #embsecbio_metadata %>%
dplyr::filter(ID_ENTITY %in% !!ID_ENTITY)
matched_by_entity_name_ECS <- embsecbio_metadata_ECS %>%
dplyr::filter(entity_name %in% smpds::SMPDSv1$entity_name)
unmatched_by_entity_name_ECS <- embsecbio_metadata_ECS %>%
dplyr::filter(!(entity_name %in% matched_by_entity_name_ECS$entity_name))
unmatched_by_entity_name_ECS %>%
readr::write_excel_csv("~/Downloads/SMPDSv2/EMBSeCBIO_records_updated_by_ECS.csv", na = "")
digits <- 4
aux_ECS <- smpds::compare_latlonelv(smpdsv1_embsecbio, #smpds::SMPDSv1,
unmatched_by_entity_name_ECS,
digits = digits)
aux_ECS2 <- aux_ECS %>%
dplyr::filter(count_decimals(latitude.x) >= digits,
count_decimals(latitude.y) >= digits,
count_decimals(longitude.x) >= digits,
count_decimals(longitude.x) >= digits)
## All the records
matched_by_entity_name_db <- embsecbio::entity %>%
dplyr::filter(entity_name %in% smpds::SMPDSv1$entity_name)
unmatched_by_entity_name_db <- embsecbio::entity %>%
dplyr::filter(!(entity_name %in% matched_by_entity_name_db$entity_name))
# matched_by_site_name_db <- unmatched_by_entity_name_db %>%
# dplyr::filter(site_name %in% smpds::SMPDSv1$site_name)
digits <- 4
aux_db <- smpds::compare_latlon(smpdsv1_embsecbio, # smpds::SMPDSv1,
unmatched_by_entity_name_db,
join_method = dplyr::right_join,
digits = digits)
aux_db <- smpds::compare_latlonelv(smpdsv1_embsecbio, # smpds::SMPDSv1,
unmatched_by_entity_name_db,
join_method = dplyr::right_join,
digits = digits)
aux_db2 <- aux_db %>%
dplyr::filter(count_decimals(latitude.x) >= digits,
count_decimals(latitude.y) >= digits,
count_decimals(longitude.x) >= digits,
count_decimals(longitude.x) >= digits)
## Subset
matched_by_entity_name_sub <- embsecbio_metadata %>%
dplyr::filter(entity_name %in% smpds::SMPDSv1$entity_name)
unmatched_by_entity_name_sub <- embsecbio_metadata %>%
dplyr::filter(!(entity_name %in% matched_by_entity_name_sub$entity_name))
# matched_by_site_name_sub <- unmatched_by_entity_name_sub %>%
# dplyr::filter(site_name %in% smpds::SMPDSv1$site_name)
digits <- 4
smpds::compare_latlon(smpdsv1_embsecbio, # smpds::SMPDSv1,
unmatched_by_entity_name_sub,
digits = digits)
aux_sub <- smpds::compare_latlonelv(smpdsv1_embsecbio, # smpds::SMPDSv1,
unmatched_by_entity_name_sub,
digits = digits)
aux_sub2 <- aux_sub %>%
dplyr::filter(count_decimals(latitude.x) >= digits,
count_decimals(latitude.y) >= digits,
count_decimals(longitude.x) >= digits,
count_decimals(longitude.x) >= digits)
counts <- table(aux_sub2$ID_ENTITY)
idx <- table(aux_sub2$ID_ENTITY) == 1
aux2 <- aux_sub2 %>%
dplyr::filter(ID_ENTITY %in% names(counts[idx]))
# ------------------------------------------------------------------------------
# | Sandbox |
# ------------------------------------------------------------------------------
embsecbio_taxa <- readxl::read_xlsx("~/Downloads/SMPDSv2/EMBSeCBIO/EMBSECBIO_clean taxon names.xlsx",
sheet = 1) %>%
magrittr::set_names(c("taxon_name", "clean_name")) %>%
dplyr::mutate(action = ifelse(stringr::str_detect(tolower(clean_name),
"exclude"),
"delete", "update"),
clean_name = ifelse(stringr::str_detect(tolower(clean_name),
"exclude"),
NA, clean_name),
clean_name = clean_name %>%
stringr::str_squish()) %>%
dplyr::arrange(dplyr::desc(action), taxon_name) %>%
dplyr::filter(!is.na(taxon_name)) %>%
dplyr::distinct(taxon_name, clean_name, .keep_all = TRUE)
embsecbio_taxa %>%
readr::write_excel_csv("inst/extdata/embsecbio_taxa.csv", na = "")
embsecbio_samples <- embsecbio::sample %>%
dplyr::filter(ID_ENTITY %in% embsecbio_metadata$ID_ENTITY) %>%
dplyr::select(-sample_name, -sample_type)
embsecbio_counts <- embsecbio::pollen_data %>%
dplyr::filter(ID_SAMPLE %in% embsecbio_samples$ID_SAMPLE)
# Extract data ------------------------------------------------------------
embsecbio_metadata <- embsecbio::age_model %>%
dplyr::mutate(age_BP = dplyr::coalesce(est_age_bacon_intcal20_mean,
est_age_original,
est_age_provided),
.after = 1) %>%
dplyr::filter(age_BP <= 50) %>%
dplyr::distinct(ID_SAMPLE, .keep_all = TRUE) %>%
# dplyr::filter(est_age_bacon_intcal20_median <= 50) %>%
dplyr::left_join(embsecbio::sample, by = "ID_SAMPLE") %>%
dplyr::left_join(embsecbio::entity, by = "ID_ENTITY") %>%
dplyr::left_join(embsecbio::site %>%
dplyr::select(ID_SITE, site_name, site_type, basin_size),
by = "ID_SITE") %>%
dplyr::select(ID_SITE,
ID_ENTITY,
ID_SAMPLE,
site_name,
entity_name,
sample_name,
latitude,
longitude,
elevation,
site_type,
entity_type,
sample_type,
basin_size,
source,
age_BP,
# age_BP = est_age_bacon_intcal20_median,
avg_depth) %>%
dplyr::arrange(ID_SITE, ID_ENTITY, ID_SAMPLE)
embsecbio_metadata %>%
readr::write_excel_csv("inst/extdata/embsecbio_metadata.csv", na = "")
embsecbio_counts <- embsecbio_metadata %>%
dplyr::left_join(embsecbio::pollen_data, by = "ID_SAMPLE") %>%
dplyr::select(-ID_SAMPLE_TAXA) %>%
dplyr::rename(taxon_name = taxon_clean) %>%
dplyr::filter(!is.na(taxon_name))
embsecbio_counts %>%
readr::write_excel_csv("inst/extdata/embsecbio_counts.csv", na = "")
embsecbio_pubs <- embsecbio::entity_pub %>%
dplyr::filter(ID_ENTITY %in% embsecbio_metadata$ID_ENTITY) %>%
dplyr::left_join(embsecbio::pub,
by = "ID_PUB")
embsecbio_pubs %>%
readr::write_excel_csv("inst/extdata/embsecbio_pubs.csv", na = "")
embsecbio_metadata <-
readr::read_csv("~/Downloads/SMPDSv2/EMBSeCBIO/modern_samples_EMBSECBIO_30Aug2021.csv") %>%
dplyr::rename(age_BP = age) %>%
dplyr::left_join(embsecbio::entity) %>%
dplyr::select(-mod_or_0ka_class, -comments) %>%
dplyr::left_join(embsecbio::site) %>%
dplyr::select(-catch_size)
embsecbio_counts <- embsecbio_metadata %>%
dplyr::left_join(embsecbio::sample,
by = c("ID_ENTITY", "avg_depth")) %>%
dplyr::filter(ID_ENTITY %in% embsecbio_metadata$ID_ENTITY) %>%
dplyr::left_join(embsecbio::pollen_data, by = "ID_SAMPLE") %>%
dplyr::select(-ID_SAMPLE_TAXA, -sample_name, -sample_type) %>%
dplyr::rename(taxon_name = taxon_clean) %>%
dplyr::filter(!is.na(taxon_name))
# Export Excel workbook ----
wb <- openxlsx::createWorkbook()
openxlsx::addWorksheet(wb, "metadata")
openxlsx::writeData(wb, "metadata",
EMBSeCBIO %>%
dplyr::select(source:publication))
openxlsx::addWorksheet(wb, "amalgamated")
openxlsx::writeData(wb, "amalgamated",
EMBSeCBIO %>%
dplyr::select(-c(source:publication)) #%>%
# tidyr::unnest(clean)
)
# openxlsx::addWorksheet(wb, "intermediate")
# openxlsx::writeData(wb, "intermediate",
# EMPDv2 %>%
# dplyr::select(ID_EMPDv2, intermediate) %>%
# tidyr::unnest(intermediate))
# openxlsx::addWorksheet(wb, "amalgamated")
# openxlsx::writeData(wb, "amalgamated",
# EMPDv2 %>%
# dplyr::select(ID_EMPDv2, amalgamated) %>%
# tidyr::unnest(amalgamated))
openxlsx::addWorksheet(wb, "taxon_list")
openxlsx::writeData(wb, "taxon_list",
"inst/extdata/embsecbio_taxa.csv" %>%
readr::read_csv())
openxlsx::saveWorkbook(wb,
paste0("data-raw/EMBSeCBIO_",
Sys.Date(),
".xlsx"))
# SPH revisions ----
"The entities and their counts were manually inspected by SPH"
`%>%` <- magrittr::`%>%`
## Load data ----
EMBSeCBIO_all <-
"data-raw/GLOBAL/D_embsecbio_records_additions_SPH.xlsx" %>%
readxl::read_excel(sheet = 1) %>%
dplyr::rename(basin_size = `basin_size (km)`) %>%
dplyr::mutate(
publication = "Harrison, S.P., Marinova, E. and Cruz-Silva, E., 2021. EMBSeCBIO pollen database. University of Reading. Dataset.",
doi = "10.17864/1947.309",
.after = Publication,
) %>%
dplyr::mutate(ID_SAMPLE = seq_along(entity_name), .after = doi) %>%
dplyr::select(-Publication)
### Metadata ----
EMBSeCBIO_metadata <-
EMBSeCBIO_all %>%
dplyr::select(source:ID_SAMPLE)
### Pollen counts ----
EMBSeCBIO_counts <-
EMBSeCBIO_all %>%
dplyr::select(ID_SAMPLE) %>%
dplyr::bind_cols(
EMBSeCBIO_all %>% # Convert columns with counts to numeric type
dplyr::select(-c(source:ID_SAMPLE)) %>%
purrr::map_dfc(~.x %>% as.numeric)
) %>%
magrittr::set_names(
colnames(.) %>%
stringr::str_replace_all("\\.\\.\\.", "#")
) %>%
tidyr::pivot_longer(-ID_SAMPLE,
names_to = "taxon_name",
values_to = "taxon_count") %>%
dplyr::mutate(taxon_name = taxon_name %>%
stringr::str_remove_all("\\#[0-9]+$") %>%
stringr::str_squish()) %>%
dplyr::group_by(ID_SAMPLE, taxon_name) %>%
dplyr::mutate(taxon_count = sum(taxon_count, na.rm = TRUE)) %>%
dplyr::distinct() %>%
dplyr::ungroup()
### Amalgamations ----
EMBSeCBIO_taxa_amalgamation <-
"data-raw/GLOBAL/D_embsecbio_records_additions_SPH.xlsx" %>%
readxl::read_excel(sheet = 2) %>%
magrittr::set_names(c(
"clean", "intermediate", "amalgamated"
)) %>%
dplyr::distinct() %>%
purrr::map_df(stringr::str_squish)
### Combine counts and amalgamation ----
EMBSeCBIO_taxa_counts_amalgamation <-
EMBSeCBIO_counts %>%
dplyr::left_join(EMBSeCBIO_taxa_amalgamation,
by = c("taxon_name" = "clean")) %>%
dplyr::relocate(taxon_count, .after = amalgamated) %>%
dplyr::relocate(ID_SAMPLE, .before = 1) %>%
dplyr::rename(clean = taxon_name)
# dplyr::select(-taxon_name)
### Additional taxonomic corrections (SPH - May 20th) ----
taxonomic_corrections <- "data-raw/GLOBAL/taxonomic_corrections.xlsx" %>%
readxl::read_excel(sheet = 1) %>%
purrr::map_df(stringr::str_squish)
EMBSeCBIO_taxa_counts_amalgamation_rev <-
EMBSeCBIO_taxa_counts_amalgamation %>%
dplyr::mutate(ID_COUNT = seq_along(ID_SAMPLE)) %>%
dplyr::left_join(taxonomic_corrections %>%
dplyr::filter(level %in% c("clean", "all")),
by = c("clean" = "original_taxon")) %>%
dplyr::mutate(clean = dplyr::coalesce(corrected_taxon_name,
clean)) %>%
dplyr::select(-corrected_taxon_name, -level) %>%
dplyr::left_join(taxonomic_corrections %>%
dplyr::filter(level %in% c("intermediate", "all")),
by = c("clean" = "original_taxon")) %>%
dplyr::mutate(intermediate = dplyr::coalesce(corrected_taxon_name,
intermediate)) %>%
dplyr::select(-corrected_taxon_name, -level) %>%
dplyr::left_join(taxonomic_corrections %>%
dplyr::filter(level %in% c("amalgamated", "all")),
by = c("clean" = "original_taxon")) %>%
dplyr::mutate(amalgamated = dplyr::coalesce(corrected_taxon_name,
amalgamated)) %>%
dplyr::select(-corrected_taxon_name, -level) %>%
dplyr::left_join(taxonomic_corrections %>%
dplyr::filter(level %in% c("all")),
by = c("clean" = "original_taxon")) %>%
dplyr::mutate(clean = dplyr::coalesce(corrected_taxon_name,
clean)) %>%
dplyr::select(-corrected_taxon_name, -level) %>%
dplyr::left_join(taxonomic_corrections %>%
dplyr::filter(level %in% c("all")),
by = c("intermediate" = "original_taxon")) %>%
dplyr::mutate(intermediate = dplyr::coalesce(corrected_taxon_name,
intermediate)) %>%
dplyr::select(-corrected_taxon_name, -level) %>%
dplyr::left_join(taxonomic_corrections %>%
dplyr::filter(level %in% c("all")),
by = c("amalgamated" = "original_taxon")) %>%
dplyr::mutate(amalgamated = dplyr::coalesce(corrected_taxon_name,
amalgamated)) %>%
dplyr::select(-corrected_taxon_name, -level)
EMBSeCBIO_taxa_counts_amalgamation_rev %>%
dplyr::group_by(ID_COUNT) %>%
dplyr::mutate(n = dplyr::n()) %>%
dplyr::filter(n > 1)
waldo::compare(EMBSeCBIO_taxa_counts_amalgamation %>%
dplyr::distinct(clean, intermediate, amalgamated),
EMBSeCBIO_taxa_counts_amalgamation_rev %>%
dplyr::distinct(clean, intermediate, amalgamated),
max_diffs = Inf)
EMBSeCBIO_taxa_counts_amalgamation <- EMBSeCBIO_taxa_counts_amalgamation_rev %>%
dplyr::filter(!is.na(taxon_count), taxon_count > 0) %>%
dplyr::select(-ID_COUNT)
EMBSeCBIO_taxa_counts_amalgamation %>%
dplyr::filter(is.na(clean) | is.na(intermediate) | is.na(amalgamated)) %>%
dplyr::distinct(clean, intermediate, amalgamated)
## Find DOIs ----
EMBSeCBIO_metadata_pubs <-
EMBSeCBIO_metadata %>%
dplyr::distinct(publication, doi) %>%
dplyr::arrange(publication) %>%
dplyr::mutate(DOI = publication %>%
stringr::str_extract_all("\\[DOI\\s*(.*?)\\s*\\](;|$)") %>%
purrr::map_chr(~.x %>%
stringr::str_remove_all("^\\[DOI:") %>%
stringr::str_remove_all("\\]\\s*;\\s*$") %>%
stringr::str_remove_all("\\]$") %>%
stringr::str_remove_all("doi:") %>%
stringr::str_squish() %>%
stringr::str_c(collapse = ";\n"))
) %>%
dplyr::mutate(ID_PUB = seq_along(publication)) %>%
dplyr::mutate(updated_publication = NA, .before = publication) %>%
dplyr::mutate(updated_DOI = NA, .before = DOI)
# EMBSeCBIO_metadata_pubs %>%
# readr::write_excel_csv("data-raw/GLOBAL/EMBSeCBIO_modern-references.csv")
### Load cleaned publications list ----
EMBSeCBIO_clean_publications <-
"data-raw/GLOBAL/EMBSeCBIO_modern-references_clean.csv" %>%
readr::read_csv() %>%
dplyr::select(-DOI)
## Append clean publications ----
EMBSeCBIO_metadata_2 <-
EMBSeCBIO_metadata %>%
dplyr::left_join(EMBSeCBIO_metadata_pubs %>%
dplyr::select(-DOI, -doi, -dplyr::contains("updated")),
by = "publication") %>%
dplyr::left_join(EMBSeCBIO_clean_publications,
by = "ID_PUB") %>%
dplyr::select(-publication.x, -publication.y, -doi, -ID_PUB) %>%
dplyr::rename(doi = updated_DOI,
publication = updated_publication)
## Extract PNV/BIOME ----
EMBSeCBIO_metadata_3 <-
EMBSeCBIO_metadata_2 %>%
dplyr::select(-dplyr::starts_with("ID_BIOME")) %>%
smpds::parallel_extract_biome(cpus = 10) %>%
# smpds::biome_name() %>%
dplyr::relocate(ID_BIOME, .after = doi) %>%
smpds::pb()
EMBSeCBIO_metadata_3 %>%
smpds::plot_biome(xlim = range(.$longitude, na.rm = TRUE) * c(0.9, 1.1),
ylim = range(.$latitude, na.rm = TRUE) * c(0.9, 1.1))
## Create count tables ----
### Clean ----
EMBSeCBIO_clean <-
EMBSeCBIO_taxa_counts_amalgamation %>%
dplyr::select(-intermediate, -amalgamated) %>%
dplyr::rename(taxon_name = clean) %>%
dplyr::group_by(ID_SAMPLE, taxon_name) %>%
dplyr::mutate(taxon_count = sum(taxon_count, na.rm = TRUE)) %>%
dplyr::ungroup() %>%
dplyr::distinct() %>%
tidyr::pivot_wider(ID_SAMPLE,
names_from = taxon_name,
values_from = taxon_count,
values_fill = 0,
names_sort = TRUE) %>%
dplyr::arrange(ID_SAMPLE)
### Intermediate ----
EMBSeCBIO_intermediate <-
EMBSeCBIO_taxa_counts_amalgamation %>%
dplyr::select(-clean, -amalgamated) %>%
dplyr::rename(taxon_name = intermediate) %>%
dplyr::group_by(ID_SAMPLE, taxon_name) %>%
dplyr::mutate(taxon_count = sum(taxon_count, na.rm = TRUE)) %>%
dplyr::ungroup() %>%
dplyr::distinct() %>%
tidyr::pivot_wider(ID_SAMPLE,
names_from = taxon_name,
values_from = taxon_count,
values_fill = 0,
names_sort = TRUE) %>%
dplyr::arrange(ID_SAMPLE)
### Amalgamated ----
EMBSeCBIO_amalgamated <-
EMBSeCBIO_taxa_counts_amalgamation %>%
dplyr::select(-clean, -intermediate) %>%
dplyr::rename(taxon_name = amalgamated) %>%
dplyr::group_by(ID_SAMPLE, taxon_name) %>%
dplyr::mutate(taxon_count = sum(taxon_count, na.rm = TRUE)) %>%
dplyr::ungroup() %>%
dplyr::distinct() %>%
tidyr::pivot_wider(ID_SAMPLE,
names_from = taxon_name,
values_from = taxon_count,
values_fill = 0,
names_sort = TRUE) %>%
dplyr::arrange(ID_SAMPLE)
# Extract missing elevations ----
# EMBSeCBIO_metadata_4 <-
# EMBSeCBIO_metadata_3 %>%
# dplyr::rename(elevation_original = elevation) %>%
# smpds:::get_elevation(cpus = 12)
#
# EMBSeCBIO_metadata_4 %>%
# dplyr::select(ID_SAMPLE, entity_name, latitude, longitude, elevation_new = elevation, elevation_original) %>%
# dplyr::mutate(diff =
# abs(elevation_new - elevation_original) / elevation_original) %>%
# # readr::write_csv("data-raw/GLOBAL/EMBSeCBIO_elevations_only.csv", na = "")
# dplyr::filter(diff >= 0.5)
# dplyr::mutate(within_90p =
# dplyr::between(elevation,
# min(c(0.9, 1.1) * elevation_original),
# max(c(0.9, 1.1) * elevation_original)),
# within_95p =
# dplyr::between(elevation,
# min(c(0.95, 1.05) * elevation_original),
# max(c(0.95, 1.05) * elevation_original)),
# within_975p =
# dplyr::between(elevation,
# min(c(0.975, 1.025) * elevation_original),
# max(c(0.975, 1.025) * elevation_original))
# ) %>%
# dplyr::filter(!within_975p)
# Store subsets ----
EMBSeCBIO <-
EMBSeCBIO_metadata_3 %>%
dplyr::mutate(
clean = EMBSeCBIO_clean %>%
dplyr::select(-c(ID_SAMPLE)),
intermediate = EMBSeCBIO_intermediate %>%
dplyr::select(-c(ID_SAMPLE)),
amalgamated = EMBSeCBIO_amalgamated %>%
dplyr::select(-c(ID_SAMPLE))
) %>%
dplyr::mutate(
basin_size_num = basin_size %>%
as.numeric() %>%
round(digits = 6) %>%
as.character(),
basin_size = dplyr::coalesce(
basin_size_num,
basin_size
),
basin_size = basin_size %>%
stringr::str_to_lower() %>%
stringr::str_squish() %>%
stringr::str_replace_all("unknown|Unknown", "not known"),
entity_type = entity_type %>%
stringr::str_to_lower() %>%
stringr::str_squish() %>%
stringr::str_replace_all("unknown|Unknown", "not known") %>%
stringr::str_to_lower(),
site_type = site_type %>%
stringr::str_to_lower() %>%
stringr::str_squish() %>%
stringr::str_replace_all("estuarine", "coastal, estuarine") %>%
stringr::str_replace_all("drained/dry lake", "lacustrine, drained lake") %>%
stringr::str_replace_all("terrestrial, other sediments", "terrestrial") %>%
stringr::str_replace_all("terrestrial, soil", "soil") %>%
stringr::str_replace_all("unknown", "not known")
) %>%
dplyr::relocate(ID_SAMPLE, .before = clean) %>%
# dplyr::relocate(basin_size_old, .after = basin_size) %>%
dplyr::select(-basin_size_num) %>%
dplyr::mutate(source = "EMBSeCBIO", .before = 1)
usethis::use_data(EMBSeCBIO, overwrite = TRUE, compress = "xz")
# Inspect enumerates ----
### basin_size -----
EMBSeCBIO$basin_size %>%
unique() %>% sort()
### site_type ----
EMBSeCBIO$site_type %>%
unique() %>% sort()
### entity_type ----
EMBSeCBIO$entity_type %>%
unique() %>% sort()
# Export Excel workbook ----
wb <- openxlsx::createWorkbook()
openxlsx::addWorksheet(wb, "metadata")
openxlsx::writeData(wb, "metadata",
EMBSeCBIO %>%
dplyr::select(source:ID_SAMPLE))
openxlsx::addWorksheet(wb, "clean")
openxlsx::writeData(wb, "clean",
EMBSeCBIO %>%
dplyr::select(ID_SAMPLE, clean) %>%
tidyr::unnest(clean))
openxlsx::addWorksheet(wb, "intermediate")
openxlsx::writeData(wb, "intermediate",
EMBSeCBIO %>%
dplyr::select(ID_SAMPLE, intermediate) %>%
tidyr::unnest(intermediate))
openxlsx::addWorksheet(wb, "amalgamated")
openxlsx::writeData(wb, "amalgamated",
EMBSeCBIO %>%
dplyr::select(ID_SAMPLE, amalgamated) %>%
tidyr::unnest(amalgamated))
openxlsx::saveWorkbook(wb,
paste0("data-raw/GLOBAL/EMBSeCBIO_",
Sys.Date(),
".xlsx"))
# Load climate reconstructions ----
climate_reconstructions <-
"data-raw/reconstructions/EMBSeCBIO_climate_reconstructions_2022-05-12.csv" %>%
readr::read_csv()
# Load daily values for precipitation to compute MAP (mean annual precipitation)
climate_reconstructions_pre <-
"data-raw/reconstructions/EMBSeCBIO_climate_reconstructions_pre_2022-05-12.csv" %>%
readr::read_csv() %>%
dplyr::rowwise() %>%
dplyr::mutate(map = sum(dplyr::c_across(T1:T365), na.rm = TRUE), .before = T1)
climate_reconstructions_2 <- climate_reconstructions %>%
dplyr::bind_cols(climate_reconstructions_pre %>%
dplyr::select(map))
climate_reconstructions_with_counts <-
EMBSeCBIO %>%
# smpds::EMBSeCBIO %>%
# dplyr::select(-c(mi:mtwa)) %>%
dplyr::bind_cols(
climate_reconstructions_2 %>%
dplyr::select(sn = site_name,
en = entity_name,
new_elevation = elevation,
mi:map)
) %>%
dplyr::relocate(mi:map, .before = clean) %>%
dplyr::mutate(elevation = dplyr::coalesce(elevation, new_elevation))
climate_reconstructions_with_counts %>%
dplyr::filter(site_name != sn | entity_name != en)
waldo::compare(smpds::EMBSeCBIO,
climate_reconstructions_with_counts %>%
dplyr::select(-c(mi:map, sn, en, new_elevation))
)
EMBSeCBIO <- climate_reconstructions_with_counts %>%
dplyr::select(-sn, -en, -new_elevation)
usethis::use_data(EMBSeCBIO, overwrite = TRUE, compress = "xz")
waldo::compare(smpds::EMBSeCBIO, EMBSeCBIO, max_diffs = Inf)
climate_reconstructions_2 %>%
smpds::plot_climate_countour(
var = "mat",
xlim = range(.$longitude, na.rm = TRUE),
ylim = range(.$latitude, na.rm = TRUE)
)
climate_reconstructions_2 %>%
smpds::plot_climate(
var = "map",
xlim = range(.$longitude, na.rm = TRUE),
ylim = range(.$latitude, na.rm = TRUE)
)
rm(climate_reconstructions,
climate_reconstructions_2,
climate_reconstructions_pre,
climate_reconstructions_with_counts)
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.