special.epd: SPECIAL Research Group's Version of the European Pollen Database (EPD)

`%>%` <- magrittr::`%>%`
# Download file ----
epd_repatriation_id <- "1knnWB6XQF_QAI9iumyUAdoFm37_7iTZh"
epd_repatriation_tmp_file <- tempfile(fileext = ".xslx")
googledrive::as_id(epd_repatriation_id) %>%
  googledrive::drive_download(path = epd_repatriation_tmp_file, overwrite = TRUE)

# Open DB connection ----
conn <- dabr::open_conn_mysql("SPECIAL-EPD",
                              password = rstudioapi::askForPassword())
conn_rpd <- dabr::open_conn_mysql("RPD-latest",
                                  password = rstudioapi::askForPassword())

# Load datasets ----
load("data/EPD/EPD_COUNTS.rda")
load("data/EPD/EPD_DATES.rda")
load("data/EPD/EPD_DATES_coretops.rda")
load("data/EPD/EPD_METADATA.rda")

a <- dplyr::bind_rows(
  EPD_DATES,
  EPD_DATES_coretops
)

a %>%
  dplyr::filter(!is.na(depth)) %>%
  dplyr::arrange(entity_name, depth) %>%
  dplyr::group_by(entity_name) %>%
  dplyr::summarise(n = dplyr::n()) %>%
  dplyr::filter(n < 2)

# Helper functions ----
to_bool <- function(x) {
  dplyr::case_when(is.na(x) ~ NA,
                   stringr::str_starts(x, "Y|y") ~ TRUE,
                   stringr::str_starts(x, "N|n") ~ FALSE,
                   TRUE ~ NA)
}

extract_embsecbio <- function(ID_ENTITY = NULL) {
  if (missing(ID_ENTITY))
    return(NULL)
  entity_tb <- embsecbio::entity %>%
    dplyr::filter(ID_ENTITY %in% !!ID_ENTITY) %>%
    dplyr::select(-mod_or_0ka_class, -comments) %>%
    dplyr::left_join(embsecbio::entity_pub) %>%
    dplyr::left_join(embsecbio::pub) %>%
    dplyr::group_by(ID_ENTITY) %>%
    dplyr::mutate(publication = citation %>%
                    stringr::str_c(collapse = ";\n")) %>%
    dplyr::ungroup() %>%
    dplyr::select(-ID_ENTITY_PUB, -ID_PUB, -citation) %>%
    dplyr::distinct()
  if (nrow(entity_tb) == 0)
    return(NULL)
  site_tb <- embsecbio::site %>%
    dplyr::filter(ID_SITE %in% entity_tb$ID_SITE) %>%
    dplyr::select(ID_SITE, site_name, site_type)
    # dplyr::select(-basin_size, -catch_size)
  metadata_tb <- site_tb %>%
    dplyr::right_join(entity_tb)
  date_info_tb <- embsecbio::date_info %>%
    dplyr::filter(ID_ENTITY %in% entity_tb$ID_ENTITY) %>%
    dplyr::rename(depth = avg_depth,
                  age_c14 = dated_age,
                  material_dated = mat_dated) %>%
    dplyr::rowwise() %>%
    dplyr::mutate(error = max(error_positive, error_negative)) %>%
    dplyr::ungroup() %>%
    dplyr::select(-error_negative, -error_positive)
  sample_tb <- embsecbio::sample %>%
    dplyr::filter(ID_ENTITY %in% entity_tb$ID_ENTITY) %>%
    dplyr::rename(depth =  avg_depth) %>%
    dplyr::select(-sample_name) %>%
    dplyr::mutate(depth = depth * 100) %>%
    dplyr::rename(external_ID_ENTITY = ID_ENTITY)
  pollen_tb <- embsecbio::pollen_data %>%
    dplyr::filter(ID_SAMPLE %in% sample_tb$ID_SAMPLE) %>%
    dplyr::select(-ID_SAMPLE_TAXA) #%>%
    # tidyr::pivot_wider(names_from = "taxon_clean", values_from = "taxon_count")
  age_model_tb <- embsecbio::age_model %>%
    dplyr::filter(ID_SAMPLE %in% sample_tb$ID_SAMPLE) %>%
    dplyr::select(-dplyr::contains("intcal13")) %>%
    dplyr::rename(age = est_age_provided) %>%
    magrittr::set_names(colnames(.) %>%
                          stringr::str_remove_all("est_age_bacon_intcal20_") %>%
                          stringr::str_replace_all("uncert", "UNCERT")) #%>%
    # dplyr::left_join(sample_tb %>%
    #                    dplyr::select(ID_SAMPLE, external_ID_ENTITY, depth)) %>%
    # dplyr::relocate(external_ID_ENTITY, depth, .before = 1)
  sample_pollen_tb <- sample_tb %>%
    # dplyr::select(ID_SAMPLE, external_ID_ENTITY, depth) %>%
    dplyr::right_join(pollen_tb, by = "ID_SAMPLE")
  sample_age_model_tb <- sample_tb %>%
    # dplyr::select(ID_SAMPLE, external_ID_ENTITY, depth) %>%
    dplyr::right_join(age_model_tb, by = "ID_SAMPLE")
  list(metadata = metadata_tb,
       # site = site_tb,
       # entity = entity_tb,
       date_info = date_info_tb,
       sample = sample_tb,
       pollen = sample_pollen_tb,
       age_model = sample_age_model_tb)
}

extract_iberia <- function(entity_name = NULL, v2 = FALSE) {
  if (missing(entity_name))
    return(NULL)
  if (v2) {
    message("Loading version 3...")
    data("IBERIA_pollen_v2")
    data("IBERIA_pollen_dates_v2")
    IBERIA_pollen <- IBERIA_pollen_v2
    IBERIA_pollen_dates <- IBERIA_pollen_dates_v2
    rm("IBERIA_pollen_v2", "IBERIA_pollen_dates_v2")
  } else {
    message("Loading version 3...")
    data("IBERIA_pollen_v3")
    data("IBERIA_pollen_dates_v3")
    IBERIA_pollen <- IBERIA_pollen_v3
    IBERIA_pollen_dates <- IBERIA_pollen_dates_v3
    rm("IBERIA_pollen_v3", "IBERIA_pollen_dates_v3")
  }
  # data("IBERIA_pollen")
  # data("IBERIA_pollen_dates")
  entity_age_model_tb <- IBERIA_pollen %>%
    dplyr::filter(entity_name %in% !!entity_name) %>%
    dplyr::select(1:17) %>%
    dplyr::rename(depth = `avg_depth..cm.`) %>%
    dplyr::select(-`IPE.age..cal.`)
  if (nrow(entity_age_model_tb) == 0)
    return(NULL)

  date_info_tb <- IBERIA_pollen_dates %>%
    dplyr::filter(entity_name %in% !!entity_name) %>%
    dplyr::select(-dplyr::starts_with("type"))
  sample_tb <- IBERIA_pollen %>%
    dplyr::filter(entity_name %in% !!entity_name) %>%
    dplyr::select(1:17) %>%
    dplyr::rename(depth = `avg_depth..cm.`) %>%
    dplyr::select(-`IPE.age..cal.`)
  age_model_tb <- IBERIA_pollen %>%
    dplyr::filter(entity_name %in% !!entity_name) %>%
    dplyr::select(1:17) %>%
    dplyr::rename(depth = `avg_depth..cm.`,
                  age = `IPE.age..cal.`) %>%
    magrittr::set_names(colnames(.) %>%
                          stringr::str_remove_all("INTCAL2020_") %>%
                          stringr::str_replace_all("uncert", "UNCERT"))
  list(metadata = entity_age_model_tb %>%
         dplyr::select(1:9) %>%
         dplyr::distinct(),
       date_info = date_info_tb,
       sample = sample_tb,
       age_model = age_model_tb)
}

extract_rpd <- function(ID_ENTITY = NULL) {
  if (missing(ID_ENTITY))
    return(NULL)
  entity_tb <- rpdata::entity %>%
    dplyr::filter(ID_ENTITY %in% !!ID_ENTITY) %>%
    dplyr::select(-core_location, -measurement_method, -last_updated) %>%
    dplyr::rename(entity_type = TYPE) %>%
    dplyr::left_join(rpdata::entity_link_publication) %>%
    dplyr::left_join(rpdata::publication) %>%
    dplyr::group_by(ID_ENTITY) %>%
    dplyr::mutate(publication = citation %>%
                    stringr::str_c(collapse = ";\n")) %>%
    dplyr::ungroup() %>%
    dplyr::select(-ID_PUB, -citation, -pub_DOI_URL, -bibentry, -ID_UNIT) %>%
    dplyr::distinct()
  if (nrow(entity_tb) == 0)
    return(NULL)
  site_tb <- rpdata::site %>%
    dplyr::filter(ID_SITE %in% entity_tb$ID_SITE) %>%
    dplyr::select(ID_SITE, site_name, site_type) # %>%
    # dplyr::select(-water_depth, -basin_size_class, -catch_size_class, -flow_type, -basin_size_km2, -catch_size_km2)
  metadata_tb <- site_tb %>%
    dplyr::right_join(entity_tb)
  date_info_tb <- rpdata::date_info %>%
    dplyr::filter(ID_ENTITY %in% entity_tb$ID_ENTITY) %>%
    dplyr::rename(depth = avg_depth,
                  age_c14 = age_C14) %>%
    dplyr::mutate(depth = depth * 100, # Convert units: m -> cm
                  thickness = thickness * 100) %>%
    dplyr::mutate(depth = ifelse(round(depth) == round(-777777 * 100) |
                                   round(depth) == round(-888888 * 100) |
                                   round(depth) == round(-999999 * 100),
                                 round(depth / 100),
                                 depth),
                  thickness = ifelse(round(thickness) == round(-777777 * 100) |
                                       round(thickness) == round(-888888 * 100) |
                                       round(thickness) == round(-999999 * 100),
                                     round(thickness / 100),
                                     thickness)) %>%
    dplyr::select(-correlation_info)
  sample_tb <- rpdata::sample %>%
    dplyr::filter(ID_ENTITY %in% entity_tb$ID_ENTITY) %>%
    dplyr::rename(depth =  avg_depth,
                  thickness = sample_thickness) %>%
    dplyr::select(-analytical_sample_size, -analytical_sample_size_unit) %>%
    dplyr::mutate(depth = depth * 100, # Convert units: m -> cm
                  thickness = thickness * 100) %>%
    dplyr::mutate(depth = ifelse(round(depth) == round(-777777 * 100) |
                                   round(depth) == round(-888888 * 100) |
                                   round(depth) == round(-999999 * 100),
                                 round(depth / 100),
                                 depth),
                  thickness = ifelse(round(thickness) == round(-777777 * 100) |
                                       round(thickness) == round(-888888 * 100) |
                                       round(thickness) == round(-999999 * 100),
                                     round(thickness / 100),
                                     thickness)) %>%
    dplyr::rename(external_ID_ENTITY = ID_ENTITY) %>%
    dplyr::left_join(rpdata::chronology,
                     by = "ID_SAMPLE") %>%
    dplyr::left_join(rpdata::model_name,
                     by = "ID_MODEL") %>%
    dplyr::rename(age = original_est_age,
                  model_name_original = model_name) %>%
    dplyr::select(-ID_MODEL)
  age_model_tb <- rpdata::age_model %>%
    dplyr::filter(ID_SAMPLE %in% sample_tb$ID_SAMPLE) %>%
    dplyr::left_join(rpdata::model_name,
                     by = "ID_MODEL") %>%
    dplyr::select(-ID_MODEL) %>%
    dplyr::relocate(model_name, .before = 1)
  sample_age_model_tb <- sample_tb %>%
    dplyr::right_join(age_model_tb, by = "ID_SAMPLE")
  list(metadata = metadata_tb,
       date_info = date_info_tb,
       sample = sample_tb,
       age_model = sample_age_model_tb)
}

get_id_date_info <- function(conn) {
  suppressWarnings({
    ID_DATE_INFO <- dabr::select_all(conn, "date_info", quiet = TRUE) %>%
      .$ID_DATE_INFO %>%
      max()
  })
  if (is.na(ID_DATE_INFO) | is.infinite(ID_DATE_INFO))
    return(1)
  return(ID_DATE_INFO)
}

get_id_entity <- function(conn) {
  suppressWarnings({
    ID_ENTITY <- dabr::select_all(conn, "entity", quiet = TRUE) %>%
      .$ID_ENTITY %>%
      max()
  })
  if (is.na(ID_ENTITY) | is.infinite(ID_ENTITY))
    return(1)
  return(ID_ENTITY)
}

get_id_sample <- function(conn) {
  suppressWarnings({
    ID_SAMPLE <- dabr::select_all(conn, "sample", quiet = TRUE) %>%
      .$ID_SAMPLE %>%
      max()
  })
  if (is.na(ID_SAMPLE) | is.infinite(ID_SAMPLE))
    return(1)
  return(ID_SAMPLE)
}

get_id_site <- function(conn) {
  suppressWarnings({
    ID_SITE <- dabr::select_all(conn, "entity", quiet = TRUE) %>%
      .$ID_SITE %>%
      max()
  })
  if (is.na(ID_SITE) | is.infinite(ID_SITE))
    return(1)
  return(ID_SITE)
}

find_age_models <- function(path, entity_name) {
  outputs <- path %>%
    list.files(pattern = "\\.csv", recursive = TRUE, full.names = TRUE) %>%
    stringr::str_subset("bacon")
  outputs_tb <- tibble::tibble(path = outputs) %>%
    dplyr::mutate(entity_name = path %>%
                    purrr::map_chr(~.x %>% dirname %>% basename),
                  subfolder = path %>%
                    purrr::map_chr(~.x %>% dirname %>% dirname %>% basename),
                  .before = 1)
  if (missing(entity_name))
    return(outputs_tb)
  outputs_tb %>%
    dplyr::filter(entity_name %in% !!entity_name) %>%
    dplyr::mutate(am = path %>%
                    purrr::map(~suppressMessages(readr::read_csv(.x))))
}

idx_pairs <- function(max, step) {
  tibble::tibble(x = seq(1, max, step), y = c(x[-1] - 1, max))
}

upload_age_model <- function(conn, entity_name, am, ...) {
  rpd:::msg(entity_name, limit = 78, nl = FALSE)
  # Select records from the sample-charcoal table
  sample_tb <- dabr::select(conn,
                            "SELECT * FROM sample INNER JOIN entity ON",
                            "sample.ID_ENTITY = entity.ID_ENTITY WHERE",
                            "entity_name = ",
                            dabr::quote(entity_name),
                            quiet = TRUE)
  # if (nrow(am) %% nrow(sample_tb) == 0) { # In case there are duplicated samples
  #   am <- am %>%
  #     dplyr::distinct(depths, mean, median, uncert_5, uncert_95, uncert_25, uncert_75, .keep_all = TRUE)
  # }
  am <- am %>%
    dplyr::distinct(depths, mean, median, uncert_5, uncert_95, uncert_25, uncert_75, .keep_all = TRUE) %>%
    dplyr::left_join(sample_tb %>%
                       dplyr::select(ID_SAMPLE, depth),
                     by = c("depths" = "depth"))
  # Verify all the samples exist in the DB
  if (nrow(sample_tb) == nrow(am)) {
    # Check depths
    depth_diff <- abs(am$depths - sample_tb$depth) > 0.001
    if (sum(depth_diff) == nrow(sample_tb)) { # DB has the wrong units
      depth_diff <- abs(am$depths - sample_tb$depth) > 0.001
      if (sum(depth_diff) != nrow(sample_tb)) {
        rpd:::msg("The records in the DB have the wrong units, `cm`.")
        return(tibble::tibble(entity_name,
                              am = list(am),
                              status = FALSE,
                              reason = "The records in the DB have the wrong units, `cm`.",
                              ...))
      }
    }
    if (sum(depth_diff) > 0) {
      rpd:::msg("Mistmatch between depths detected.")
      return(tibble::tibble(entity_name,
                            am = list(am),
                            status = FALSE,
                            reason = "Mistmach in the depths.",
                            ...))
    }
    # Prepare raw data for insertion
    am <- am %>%
      dplyr::mutate(ID_MODEL = 8, # BACON INTCAL20
                    ID_SAMPLE = sample_tb$ID_SAMPLE) %>%
      dplyr::select(-c(sample_ids, depths)) %>%
      purrr::map_df(as.integer) # Convert columns to integer
    # Delete any existing age models
    all_samples_tb <- dabr::select(conn,
                                   "SELECT * FROM sample",
                                   "LEFT JOIN entity ON entity.ID_ENTITY = sample.ID_ENTITY",
                                   "WHERE entity_name = ",
                                   dabr::quote(entity_name),
                                   quiet = TRUE) %>%
      dplyr::arrange(ID_ENTITY)
    if (nrow(all_samples_tb) == 0) {
      print("NO SAMPLES WHERE FOUND")
    }
    if (nrow(all_samples_tb) > 0) {
      dabr::delete(conn,
                   "DELETE FROM age_model WHERE ID_SAMPLE IN (",
                   paste0(all_samples_tb$ID_SAMPLE, collapse = ", "),
                   ") AND ID_MODEL = 8")
    }
    rpd:::update_records(conn, "age_model", am, PK = c(7:8), quiet = TRUE)

    # Check inserted records
    age_model_tb <-
      dabr::select(conn,
                   "SELECT * FROM age_model WHERE ID_SAMPLE IN (",
                   paste0(am$ID_SAMPLE, collapse = ", "),
                   ") AND ID_MODEL = 8",
                   quiet = TRUE) %>%
      tibble::as_tibble()
    # Change case of columns
    colnames(age_model_tb) <- tolower(colnames(age_model_tb))
    colnames(am) <- tolower(colnames(am))
    # Change column order
    am <- am[, colnames(age_model_tb)]
    print(waldo::compare(am, age_model_tb))#, tolerance = 0.01))
  } else {
    rpd:::msg(paste0("Mistmatch in the number of samples: ", entity_name,". "))
    return(tibble::tibble(entity_name,
                          am = list(am),
                          status = FALSE,
                          reason = "Mistmatch in the number of samples.",
                          ...))
  }

  return(tibble::tibble(entity_name,
                        am = list(am),
                        status = TRUE,
                        reason = NA_character_,
                        ...))
  # return(tibble::tibble(am = list(am), status = TRUE, reason = NA_character_))
}

upload_age_model2 <- function(conn, entity_name, am, ...) {
  rpd:::msg(entity_name, limit = 78, nl = FALSE)
  # Select records from the sample-charcoal table
  sample_tb <- dabr::select(conn,
                            "SELECT * FROM sample INNER JOIN entity ON",
                            "sample.ID_ENTITY = entity.ID_ENTITY WHERE",
                            "entity_name = ",
                            dabr::quote(entity_name),
                            quiet = TRUE)
  # browser()
  am <- am %>%
    dplyr::distinct(depths, mean, median, uncert_5, uncert_95, uncert_25, uncert_75, .keep_all = TRUE) %>%
    dplyr::left_join(sample_tb %>%
                       dplyr::select(ID_SAMPLE, depth),
                     by = c("depths" = "depth"))
  # Verify all the samples exist in the DB
  if (nrow(sample_tb) == nrow(am)) {
    # Check depths
    depth_diff <- abs(am$depths - sample_tb$depth) > 0.001
    if (sum(depth_diff) == nrow(sample_tb)) { # DB has the wrong units
      depth_diff <- abs(am$depths - sample_tb$depth) > 0.001
      if (sum(depth_diff) != nrow(sample_tb)) {
        rpd:::msg("The records in the DB have the wrong units, `cm`.")
        return(tibble::tibble(entity_name,
                              am = list(am),
                              status = FALSE,
                              reason = "The records in the DB have the wrong units, `cm`.",
                              ...))
      }
    }
    if (sum(depth_diff) > 0) {
      rpd:::msg("Mistmatch between depths detected.")
      return(tibble::tibble(entity_name,
                            am = list(am),
                            status = FALSE,
                            reason = "Mistmach in the depths.",
                            ...))
    }
    # Prepare raw data for insertion
    am <- am %>%
      dplyr::mutate(ID_MODEL = 8, # BACON INTCAL20
                    ID_SAMPLE = sample_tb$ID_SAMPLE) %>%
      dplyr::select(-c(sample_ids, depths)) %>%
      purrr::map_df(as.integer) # Convert columns to integer
    # Delete any existing age models
    all_samples_tb <- dabr::select(conn,
                                   "SELECT * FROM sample",
                                   "LEFT JOIN entity ON entity.ID_ENTITY = sample.ID_ENTITY",
                                   "WHERE entity_name = ",
                                   dabr::quote(entity_name),
                                   quiet = TRUE) %>%
      dplyr::arrange(ID_ENTITY)
    if (nrow(all_samples_tb) == 0) {
      print("NO SAMPLES WHERE FOUND")
    }
    if (nrow(all_samples_tb) > 0) {
      dabr::delete(conn,
                   "DELETE FROM age_model WHERE ID_SAMPLE IN (",
                   paste0(all_samples_tb$ID_SAMPLE, collapse = ", "),
                   ") AND ID_MODEL = 8")
    }
    rpd:::update_records(conn, "age_model", am, PK = c(7:8), quiet = TRUE)

    # Check inserted records
    age_model_tb <-
      dabr::select(conn,
                   "SELECT * FROM age_model WHERE ID_SAMPLE IN (",
                   paste0(am$ID_SAMPLE, collapse = ", "),
                   ") AND ID_MODEL = 8",
                   quiet = TRUE) %>%
      tibble::as_tibble()
    # Change case of columns
    colnames(age_model_tb) <- tolower(colnames(age_model_tb))
    colnames(am) <- tolower(colnames(am))
    # Change column order
    am <- am[, colnames(age_model_tb)]
    print(waldo::compare(am, age_model_tb))#, tolerance = 0.01))
  } else {
    rpd:::msg(paste0("Mistmatch in the number of samples: ", entity_name,". "))
    return(tibble::tibble(entity_name,
                          am = list(am),
                          status = FALSE,
                          reason = "Mistmatch in the number of samples.",
                          ...))
  }

  return(tibble::tibble(entity_name,
                        am = list(am),
                        status = TRUE,
                        reason = NA_character_,
                        ...))
}

path <- "~/Downloads/special_epd_am"
find_age_models(path)

# Age models ----
epd_age_models_id <- "1wzGY-qFOsrll5MbNipgNjjaKklHldZxO"
epd_age_models_tmp_file <- tempfile(fileext = ".xslx")
googledrive::as_id(epd_age_models_id) %>%
  googledrive::drive_download(path = epd_age_models_tmp_file, overwrite = TRUE)
epd_age_models <- epd_age_models_tmp_file %>%
  readxl::read_excel(sheet = 1) %>%
  janitor::clean_names() %>%
  dplyr::mutate(site_id = as.integer(site_id %>%
                                       stringr::str_squish()),
                age_model_run_successfully_ready_to_check_bool =
                  to_bool(age_model_run_successfully_ready_to_check),
                age_model_checked_bool =
                  to_bool(age_model_checked),
                ready_to_upload_bool =
                  to_bool(ready_to_upload))

epd_age_models_ready_to_upload <- epd_age_models %>%
  dplyr::filter(ready_to_upload_bool)
epd_age_models_ignored_entities <- epd_age_models %>%
  dplyr::filter(ready_to_upload %>%
                  stringr::str_to_lower() %>%
                  stringr::str_detect("ignore"))

epd_age_models_extracted_from_embsecbio <- epd_age_models %>%
  dplyr::filter(age_model_run_successfully_ready_to_check %>%
                  stringr::str_detect("can be extracted from EMBSECBIO|EMBESCBIO"))

epd_age_models_not_uploaded <- epd_age_models %>%
  dplyr::filter(ready_to_upload %>%
                  stringr::str_to_lower() %>%
                  stringr::str_detect("ignore", negate = TRUE)) %>%
  dplyr::filter(is.na(ready_to_upload_bool)) %>%
  dplyr::filter(age_model_run_successfully_ready_to_check %>%
                  stringr::str_detect("can be extracted from EMBSECBIO|EMBESCBIO",
                                      negate = TRUE)) %>%
  dplyr::filter(!(entity_name %in% c("DURANK3", "KILOMYR"))) %>%
  dplyr::filter(age_model_run_successfully_ready_to_check %>%
                  stringr::str_detect("one date", negate = TRUE))

# epd_age_models_not_uploaded %>%
#   readr::write_excel_csv("~/Downloads/epd_age_models_not_uploaded.csv")

find_age_models(path) %>%
  dplyr::filter(entity_name %in% epd_age_models_ready_to_upload$entity_name)

# Subsets ----
## RPD ----
rpd_repatriation <- epd_repatriation_tmp_file %>%
  readxl::read_excel(sheet = 1) %>%
  janitor::clean_names() %>%
  dplyr::filter(!is.na(rpd_id_entity)) %>%
  dplyr::mutate(dates_to_be_extracted_from_rpd =
                  to_bool(dates_to_be_extracted_from_rpd),
                age_model_to_be_extracted_from_rpd =
                  to_bool(age_model_to_be_extracted_from_rpd),
                matched_lab_no = to_bool(matched_lab_no)) %>%
  dplyr::rename(entity_name = entity_name_7,
                RPD_entity_name = entity_name_12,
                RPD_ID_ENTITY = rpd_id_entity)
rpd_repatriation

### RPD dates (113) ----
rpd_repatriation_dates <- rpd_repatriation %>%
  dplyr::filter(dates_to_be_extracted_from_rpd)

rpd_repatriated_dates_info <-
  rpd_repatriation_dates$RPD_ID_ENTITY %>%
  extract_rpd()
rpd_repatriated_dates_info_2 <- rpd_repatriation_dates %>%
  dplyr::select(neotoma_ID_SITE = site_id,
                neotoma_site_name = site_name,
                neotoma_entity_name = entity_name,
                ID_ENTITY = RPD_ID_ENTITY) %>%
  dplyr::inner_join(rpd_repatriated_dates_info$metadata,
                    by = "ID_ENTITY") %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY,
                external_ID_SITE =  ID_SITE,
                external_site_name = site_name,
                external_entity_name = entity_name)
rpd_repatriated_dates_info_3 <- EPD_METADATA %>%
  dplyr::select(1:4, 6, 10) %>%
  dplyr::right_join(rpd_repatriated_dates_info_2 %>%
                      dplyr::select(1:6, 8),
                    by = c("entity_name" = "neotoma_entity_name"))

#### External links ----
meta_neo_res <- rpd_repatriated_dates_info_3 %>%
  dplyr::filter(external_entity_name %>% stringr::str_detect("Solso")) %>%
  nrow() %>%
  seq_len() %>%
  purrr::map(function(i) {
    rpd_repatriated_dates_info_3 %>%
      dplyr::filter(external_entity_name %>% stringr::str_detect("Solso")) %>%
      dplyr::slice(i) %>%
      dplyr::select(ID_SITE,
                    ID_ENTITY,
                    external_ID_SITE,
                    external_ID_ENTITY,
                    external_site_name,
                    external_entity_name) %>%
      dplyr::mutate(external_source = "RPD") %>%
      rpd:::add_records(conn = conn, table = "external_link", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_METADATA_NEO_DB <- dabr::select_all(conn, "external_link") %>%
  dplyr::filter(external_ID_ENTITY %in%
                  rpd_repatriated_dates_info_3$external_ID_ENTITY,
                external_source == "RPD") %>%
  dplyr::select(ID_SITE,
                ID_ENTITY,
                external_ID_SITE,
                external_ID_ENTITY,
                external_site_name,
                external_entity_name)
waldo::compare(rpd_repatriated_dates_info_3 %>%
                 dplyr::select(ID_SITE,
                               ID_ENTITY,
                               external_ID_SITE,
                               external_ID_ENTITY,
                               external_site_name,
                               external_entity_name) %>%
                 dplyr::mutate(external_ID_SITE = as.integer(external_ID_SITE),
                               external_ID_ENTITY = as.integer(external_ID_ENTITY)),
               EPD_METADATA_NEO_DB,
               tolerance = 1E-9)

#### Dates ----
rpd_repatriated_dates_info_4 <-
  rpd_repatriated_dates_info$date_info %>%
  dplyr::select(-ID_DATE_INFO) %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY) %>%
  # dplyr::mutate(age_used = ifelse(is.na(reason_age_not_used), "yes", "no")) %>%
  dplyr::left_join(rpd_repatriated_dates_info_3 %>%
                     dplyr::select(ID_ENTITY, external_ID_ENTITY)) %>%
  dplyr::select(-external_ID_ENTITY) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::rename(lab_num = lab_number) #%>%
  # dplyr::mutate(ID_DATE_INFO = c(
  #   1607:1620,
  #   1627:2018,
  #   2026:2628,
  #   2631:2777
  # ))
  #dplyr::filter(ID_ENTITY %in% c(838)) # New records

special.epd::snapshot(conn, ID_ENTITY = 838)
meta_neo_res <- seq_len(nrow(rpd_repatriated_dates_info_4)) %>%
  purrr::map(function(i) {
    rpd_repatriated_dates_info_4[i, ] %>%
      rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_DATES_NEO_DB <- dabr::select_all(conn, "date_info") %>%
  dplyr::filter(ID_ENTITY %in% rpd_repatriated_dates_info_4$ID_ENTITY)

corrected_dates <- unique(rpd_repatriated_dates_info_4$ID_ENTITY) %>%
  sort() %>%
  purrr::map_df(function(id) {
    RPD <- rpd_repatriated_dates_info_4 %>%
      dplyr::filter(ID_ENTITY == id)
    DB <- EPD_DATES_NEO_DB %>%
      dplyr::filter(ID_ENTITY == id)
    waldo::compare(RPD %>%
                     .[order(colnames(.))],
                   DB %>%
                     .[order(colnames(.))] %>%
                     magrittr::set_class(c("date_info", class(.))),
                   max_diffs = Inf,
                   tolerance = 1E-9) %>%
      print()
    DB %>%
      dplyr::select(ID_DATE_INFO,
                    lab = lab_num,
                    old_depth = depth,
                    old_thick = thickness) %>%
      dplyr::bind_cols(RPD %>%
                         dplyr::select(ID_ENTITY, lab_num, depth, thickness))
  })

corrected_dates %>%
  dplyr::filter(is.na(ID_DATE_INFO) |
                  is.na(ID_ENTITY) |
                  old_depth * 100 != depth |
                  lab != lab_num)
corrected_dates_2 <- corrected_dates %>%
  dplyr::mutate(thickness = ifelse(old_thick < -888887, round(old_thick), thickness))
corrected_dates_2 %>%
  dplyr::select(ID_ENTITY, ID_DATE_INFO, depth, thickness) %>%
  rpd:::update_records(conn = conn, table = "date_info",
                       dry_run = TRUE, PK = 1:2)

waldo::compare(rpd_repatriated_dates_info_4 %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               thickness = round(thickness, 3)) %>%
                 dplyr::arrange(ID_ENTITY, depth),
                 # dplyr::arrange(ID_ENTITY, ID_DATE_INFO),
               EPD_DATES_NEO_DB %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               thickness = round(thickness, 3)) %>%
                 dplyr::arrange(ID_ENTITY, depth)%>%
                 dplyr::select(-ID_DATE_INFO) %>%
                 # dplyr::arrange(ID_ENTITY, ID_DATE_INFO) %>%
                 magrittr::set_class(c("date_info", class(.))),
               max_diffs = Inf,
               tolerance = 1E-9)

#### DURANK3 ----
"The entity called 'DURANK3' was identified in both the EMBSeCBIO and RPD."
"The age models will be inspected to determine which version to keep."
a <- special.epd::get_entity(conn, 780)
special.epd::snapshot(conn, ID_ENTITY = 780)
dabr::select(conn,
             "SELECT * FROM external_link WHERE",
             "external_ID_SITE = 620 AND",
             "external_ID_ENTITY = 712 AND",
             "external_source = 'RPD'")
dabr::delete(conn,
             "DELETE FROM external_link WHERE",
             "external_ID_SITE = 620 AND",
             "external_ID_ENTITY = 712 AND",
             "external_source = 'RPD'")
# Results: 1 record was deleted.
dabr::select(conn,
             "SELECT * FROM date_info WHERE",
             "ID_ENTITY = 780")
dabr::update(conn,
             "UPDATE date_info SET material_dated = 'charcoal' WHERE",
             "ID_ENTITY = 780 AND",
             "ID_DATE_INFO IN (30, 31, 32, 33)")
# Results: 4 records were updated.
dabr::update(conn,
             "UPDATE date_info SET material_dated = 'undifferentiated plant macrofossil' WHERE",
             "ID_ENTITY = 780 AND",
             "ID_DATE_INFO IN (34)")
# Results: 1 record was updated.
dabr::update(conn,
             "UPDATE date_info SET date_type = 'Top of core known' WHERE",
             "ID_ENTITY = 780 AND",
             "ID_DATE_INFO = 1623") # Originally a 'Pollen correlation'
# Results: 1 record was updated.

# Delete duplicated dates
dabr::select(conn,
             "SELECT * FROM date_info WHERE",
             "ID_ENTITY = 780 AND",
             "ID_DATE_INFO IN (1621, 1622, 1624, 1625, 1626)")
dabr::delete(conn,
             "DELETE FROM date_info WHERE",
             "ID_ENTITY = 780 AND",
             "ID_DATE_INFO IN (1621, 1622, 1624, 1625, 1626)")
# Results: 5 records were deleted.

### non-RPD dates (7) ----
non_rpd_repatriation_dates <- rpd_repatriation %>%
  dplyr::filter(!dates_to_be_extracted_from_rpd) %>%
  dplyr::filter(entity_name != "DURANK3")

non_rpd_repatriated_dates_info <-
  non_rpd_repatriation_dates$RPD_ID_ENTITY %>%
  extract_rpd()
non_rpd_repatriated_dates_info_2 <- non_rpd_repatriation_dates %>%
  dplyr::select(neotoma_ID_SITE = site_id,
                neotoma_site_name = site_name,
                neotoma_entity_name = entity_name,
                ID_ENTITY = RPD_ID_ENTITY) %>%
  dplyr::inner_join(non_rpd_repatriated_dates_info$metadata,
                    by = "ID_ENTITY") %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY,
                external_ID_SITE =  ID_SITE,
                external_site_name = site_name,
                external_entity_name = entity_name)
non_rpd_repatriated_dates_info_3 <- EPD_METADATA %>%
  dplyr::select(1:4, 6, 10) %>%
  dplyr::right_join(non_rpd_repatriated_dates_info_2 %>%
                      dplyr::select(1:6, 8),
                    by = c("entity_name" = "neotoma_entity_name"))

#### Dates ----
non_rpd_repatriated_dates_info_4 <- EPD_DATES %>%
  dplyr::filter(entity_name %in% non_rpd_repatriated_dates_info_3$entity_name) %>%
  # dplyr::rename(external_ID_SITE = site_id) %>%
  dplyr::left_join(non_rpd_repatriated_dates_info_3 %>%
                     dplyr::select(ID_ENTITY, site_id, entity_name)) %>%
  dplyr::select(-ages_already, -site_id, -site_name, -site_name_clean, -entity_name) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::rename(age_calib = age_cal)

# Check for existing dates
special.epd::snapshot(conn, ID_ENTITY = non_rpd_repatriated_dates_info_4$ID_ENTITY)
meta_neo_res <- seq_len(nrow(non_rpd_repatriated_dates_info_4)) %>%
  purrr::map(function(i) {
    non_rpd_repatriated_dates_info_4[i, ] %>%
      rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_DATES_NEO_DB <- dabr::select_all(conn, "date_info") %>%
  dplyr::filter(ID_ENTITY %in% non_rpd_repatriated_dates_info_4$ID_ENTITY)
waldo::compare(non_rpd_repatriated_dates_info_4 %>%
                 .[order(colnames(.))] %>%
                 dplyr::arrange(ID_ENTITY, depth),
               EPD_DATES_NEO_DB %>%
                 .[order(colnames(.))] %>%
                 dplyr::select(-ID_DATE_INFO) %>%
                 dplyr::arrange(ID_ENTITY, depth),
               max_diffs = Inf)

### RPD age models (47) ----
rpd_repatriation_am <- rpd_repatriation %>%
  dplyr::filter(age_model_to_be_extracted_from_rpd)

rpd_repatriated_am_info <-
  rpd_repatriation_am$RPD_ID_ENTITY %>%
  extract_rpd()

rpd_repatriated_am_info_2 <- rpd_repatriation_am %>%
  dplyr::select(neotoma_ID_SITE = site_id,
                neotoma_site_name = site_name,
                neotoma_entity_name = entity_name,
                ID_ENTITY = RPD_ID_ENTITY) %>%
  dplyr::inner_join(rpd_repatriated_am_info$metadata,
                    by = "ID_ENTITY") %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY,
                external_ID_SITE =  ID_SITE,
                external_site_name = site_name,
                external_entity_name = entity_name)

rpd_repatriated_am_info_3 <- EPD_METADATA %>%
  dplyr::select(1:4, 6, 10) %>%
  dplyr::right_join(rpd_repatriated_am_info_2 %>%
                      dplyr::select(1:6, 8),
                    by = c("entity_name" = "neotoma_entity_name"))

#### Samples ----
rpd_repatriated_am_info_EPD_COUNTS <- EPD_COUNTS %>%
  dplyr::filter(dataset_id %in% rpd_repatriated_am_info_3$dataset_id) %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-chronology_id) %>%
  dplyr::left_join(EPD_METADATA %>%
                     dplyr::select(1:6, 10),
                   by = c("site_id", "site_name", "site_name_clean", "dataset_id", "entity_name")) %>%
  dplyr::relocate(ID_SITE, ID_ENTITY, .before = 1)

rpd_repatriated_am_info_4 <- rpd_repatriated_am_info$sample %>%
  dplyr::left_join(rpd_repatriated_am_info$age_model) %>%
  # dplyr::mutate(age = dplyr::coalesce(age, est_age_original),
  #               age_original = est_age_original) %>%
  # dplyr::select(-est_age_original, -comment) %>%
  dplyr::left_join(rpd_repatriated_am_info_3 %>%
                     dplyr::select(ID_ENTITY, external_ID_ENTITY),
                   by = "external_ID_ENTITY") %>%
  # dplyr::filter(ID_ENTITY != 18) %>%
  dplyr::select(-ID_SAMPLE, -external_ID_ENTITY) %>%
  dplyr::relocate(ID_ENTITY, .before = 1)

# # Extract thickness from the EPD data
# rpd_repatriated_am_info_5 <- rpd_repatriated_am_info_4 %>%
#   dplyr::mutate(depth2 = round(depth, 3)) %>%
#   dplyr::left_join(rpd_repatriated_am_info_EPD_COUNTS %>%
#                      dplyr::select(ID_ENTITY, depth, thickness, chronology_name,
#                                    age_EPD = age, age_younger, age_older, age_type) %>%
#                      dplyr::mutate(depth = round(depth, 3)),
#                    by = c("ID_ENTITY",  "depth2" = "depth")) %>%
#   dplyr::select(-depth2) %>%
#   dplyr::relocate(#thickness,
#                   # age_original,
#                   age_EPD,
#                   chronology_name,
#                   age_younger,
#                   age_older,
#                   age_type,
#                   .before = mean) %>%
#   # dplyr::filter(age != age_EPD)
#   # dplyr::group_by(ID_ENTITY) %>%
#   dplyr::mutate(lower = round(age_EPD) * 0.99,
#                 upper = round(age_EPD) * 1.01,
#                 same_age = (round(age) >= lower & round(age) <= upper) | (is.na(age) & is.na(age_EPD)),
#                 chronology_name = ifelse(same_age, chronology_name, NA),
#                 age_younger = ifelse(same_age, age_younger, NA),
#                 age_older = ifelse(same_age, age_older, NA),
#                 age_type = ifelse(same_age, age_type, NA)) %>%
#   dplyr::select(-age_original, -age_EPD, -lower, -upper, -same_age) %>%
#   dplyr::mutate(ID_SAMPLE = seq_along(ID_ENTITY), .before = 1)
# # dplyr::mutate(ID_SAMPLE = get_id_sample(conn))
#
# # Verify if the EPD has more samples than the RPD
# tibble::tibble(
#   ID_ENTITY = names(table(rpd_repatriated_am_info_EPD_COUNTS$ID_ENTITY)),
#   EPD = table(rpd_repatriated_am_info_EPD_COUNTS$ID_ENTITY),
#   EMB = table(rpd_repatriated_am_info_4$ID_ENTITY)
# ) %>%
#   dplyr::filter(EPD > EMB)

rpd_repatriated_am_info_3 %>%
  dplyr::filter(!(ID_ENTITY %in% rpd_repatriated_am_info_4$ID_ENTITY))

external_links <- dabr::select_all(conn, "external_link") %>%
  dplyr::filter(ID_ENTITY %in% rpd_repatriated_am_info_3$ID_ENTITY)

aux <- conn %>%
  special.epd::snapshot(ID_ENTITY = rpd_repatriated_am_info_4$ID_ENTITY)
rpd_repatriated_am_info_3 %>%
  dplyr::filter(!(ID_ENTITY %in% aux$date_info$ID_ENTITY))

meta_neo_res <- seq_len(nrow(rpd_repatriated_am_info_4)) %>%
  purrr::map(function(i) {
    rpd_repatriated_am_info_4[i, ] %>%
      dplyr::mutate(chronology_name =
                      stringr::str_c("Original age model: ",
                                     model_name_original)) %>%
      dplyr::select(ID_ENTITY, depth, thickness, chronology_name, age) %>%
      rpd:::add_records(conn = conn, table = "sample", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_DATES_NEO_DB <- dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_ENTITY %in% rpd_repatriated_am_info_4$ID_ENTITY)
waldo::compare(rpd_repatriated_am_info_4 %>%
                 dplyr::mutate(chronology_name =
                                 stringr::str_c("Original age model: ",
                                                model_name_original)) %>%
                 dplyr::select(ID_ENTITY, depth, thickness, chronology_name, age) %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               age = round(age)),
               EPD_DATES_NEO_DB %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               age = round(age)) %>%
                 dplyr::select(-age_older, -age_type, -age_younger,
                               -ID_SAMPLE, -sample_type, -count_type),
               tolerance = 2)

#### Age models (PENDING 17 AM) ----
# NOTE: There are 17 entities with pending AM
rpd_records_wo_am <- tibble::tribble(
  ~ID_SITE, ~ID_ENTITY,                ~site_name, ~entity_name, ~external_ID_SITE, ~external_ID_ENTITY,           ~external_site_name,      ~external_entity_name, ~external_source,
  196L,       217L,              "Bruckmisse",       "BRM1",              814L,               1340L,                  "Bruckmisse",          "Bruckmisse_BRM1",            "RPD",
  196L,       218L,              "Bruckmisse",       "BRM3",              814L,               1341L,                  "Bruckmisse",          "Bruckmisse_BRM3",            "RPD",
  236L,       265L,             "Champ Gazon",   "CHAGAZON",             1922L,               2381L,                 "Champ Gazon",         "Champ Gazon core",            "RPD",
  365L,       405L,      "Etang de la Gruere",      "EGR2A",            15746L,              40452L,          "Etang de la Gruère",                    "EGR2A",        "NEOTOMA",
  365L,       406L,      "Etang de la Gruere",      "EGR2G",            15746L,              40484L,          "Etang de la Gruère",                    "EGR2G",        "NEOTOMA",
  422L,       469L,             "Glaswaldsee",       "GWA1",              820L,               1354L,                 "Glaswaldsee",         "Glaswaldsee core",            "RPD",
  430L,       478L,             "Gorgo Basso",   "GORGOBAS",              609L,                701L,                 "Gorgo Basso",     "Gorgo Basso core GL3",            "RPD",
  433L,       481L,            "Gourte Mires",     "GOURTE",             1636L,               2075L,                "Gourte Mires",        "Gourte Mires core",            "RPD",
  690L,       776L,               "Lake Bled",      "BLEDC",             1332L,               1717L,                   "Lake Bled",           "Lake Bled core",            "RPD",
  786L,       890L,    "Le Verny des Brulons",   "VERNYBRU",             1836L,               2291L,           "Verny des Brulons",   "Verny des Brulons core",            "RPD",
  852L,       962L,             "Long Breach",   "LONGBREA",              547L,               1437L,                 "Long Breach",         "Long Breach Core",            "RPD",
  1086L,      1206L,         "Pryskyricny dul",   "PRYSKYRI",             1985L,               2458L,             "Pryskyricny dul",            "PRYSKYRI core",            "RPD",
  1301L,      1443L,                "Torveraz",   "TORVERA2",              808L,                905L,                    "Torveraz",          "Torveraz core 2",            "RPD",
  1354L,      1506L,             "Vaike Juusa",      "VJUUS",             1355L,               2466L,                 "Vaike Juusa",                      "J20",            "RPD",
  1374L,      1530L,    "Vestre Oykjamyrtjorn",      "OYKJA",             1840L,               2295L,         "Vestre Oykjamyrtorn", "Vestre Oykjamyrtorn core",            "RPD",
  1410L,      1576L, "Wilder See am Ruhestein",       "WILA",              802L,               1414L,     "Wilder See am Ruhestein",          "Wilder See core",            "RPD",
  1412L,      1579L, "Wildseemoor-Kaltenbronn",       "KWI2",              801L,               1415L, "Wildseemoor bei Kaltenbronn",    "Wildseemoor core KWI2",            "RPD"
)

# aux2 <- rpd_repatriated_am_info_5 %>%
#   dplyr::mutate(ID_MODEL = 8, .before = 2) %>% # Bacon IntCal20
#   dplyr::select(ID_MODEL, ID_SAMPLE, mean, median,
#                 UNCERT_5, UNCERT_25, UNCERT_75, UNCERT_95) %>%
#   dplyr::filter(is.na(mean), is.na(median))
# rpd_records_wo_am <- dabr::select_all(conn, "entity")  %>%
#   dplyr::filter(ID_ENTITY %in% (dabr::select_all(conn, "sample") %>%
#                   dplyr::filter(ID_SAMPLE %in% aux2$ID_SAMPLE) %>%
#                   .$ID_ENTITY %>% unique())) %>%
#   dplyr::select(1:4) %>%
#   dplyr::left_join(dabr::select_all(conn, "external_link")) %>%
#   dplyr::filter(entity_name %>% stringr::str_detect("EGR2") |
#                   external_source == "RPD")
rpd_records_wo_am #%>%
  # readr::write_excel_csv("~/Downloads/rpd_records_without_age_models.csv")

am <- dabr::select_all(conn_rpd, "age_model") %>%
  dplyr::left_join(dabr::select_all(conn_rpd, "sample") %>%
                     dplyr::select(ID_SAMPLE, ID_ENTITY)) %>%
  dplyr::filter(ID_ENTITY %in% rpd_records_wo_am$external_ID_ENTITY)

# Extract ID_SAMPLE
aux <- dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_ENTITY %in% rpd_repatriated_am_info_4$ID_ENTITY)
rpd_repatriated_am_info_5 <- rpd_repatriated_am_info_4 %>%
  # dplyr::left_join(aux[1:3]) %>%
  dplyr::bind_cols(aux[2]) %>%
  dplyr::relocate(ID_SAMPLE, .before = 2)
# Verify that the ID_SAMPLEs have been assigned to the correct records
waldo::compare(aux[1:4], rpd_repatriated_am_info_5[1:4], tolerance = 0.001)
# Create subset of entities with IntCal20 age models
rpd_repatriated_am_info_6 <- rpd_repatriated_am_info_5 %>%
  dplyr::filter(model_name %>%
                  stringr::str_to_lower() %>%
                  stringr::str_detect("bacon_intcal20"))
# Check if any entries in the mean and median columns are missing
rpd_repatriated_am_info_6 %>%
  dplyr::filter(is.na(mean) | is.na(median))
meta_neo_res <- seq_len(nrow(rpd_repatriated_am_info_6))[-1] %>%
  purrr::map(function(i) {
    rpd_repatriated_am_info_6[i, ] %>%
      dplyr::mutate(ID_MODEL = 8, .before = 2) %>% # Bacon IntCal20
      dplyr::select(ID_MODEL, ID_SAMPLE, mean, median,
                    UNCERT_5, UNCERT_25, UNCERT_75, UNCERT_95) %>%
      rpd:::add_records(conn = conn, table = "age_model", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate ----
EPD_DATES_NEO_DB <- dabr::select_all(conn, "age_model") %>%
  dplyr::filter(ID_SAMPLE %in% rpd_repatriated_am_info_6$ID_SAMPLE)
waldo::compare(rpd_repatriated_am_info_6 %>%
                 dplyr::select(2, 9:14) %>%
                 dplyr::mutate(ID_MODEL = 8, .before = 2) %>%
                 .[order(colnames(.))],
               EPD_DATES_NEO_DB %>%
                 .[order(colnames(.))],
               tolerance = 2)


### non-RPD age models (89) ----
non_rpd_repatriation_am <- rpd_repatriation %>%
  dplyr::filter(!age_model_to_be_extracted_from_rpd) %>%
  dplyr::filter(entity_name != "DURANK3") # Repatriated from the EMBSeCBIO

non_rpd_repatriated_am_info <- non_rpd_repatriation_am %>%
  dplyr::select(site_id, site_name, entity_name) %>%
  dplyr::left_join(dabr::select_all(conn, "external_link"),
                   by = c("site_id" = "external_ID_SITE",
                          "entity_name" = "external_entity_name"))
# non_rpd_repatriated_am_info <-
#   non_rpd_repatriation_am$RPD_ID_ENTITY %>%
#   extract_rpd()
#
# non_rpd_repatriated_am_info_2 <- non_rpd_repatriation_am %>%
#   dplyr::select(neotoma_ID_SITE = site_id,
#                 neotoma_site_name = site_name,
#                 neotoma_entity_name = entity_name,
#                 ID_ENTITY = RPD_ID_ENTITY) %>%
#   dplyr::inner_join(non_rpd_repatriated_am_info$metadata,
#                     by = "ID_ENTITY") %>%
#   dplyr::rename(external_ID_ENTITY = ID_ENTITY,
#                 external_ID_SITE =  ID_SITE,
#                 external_site_name = site_name,
#                 external_entity_name = entity_name)
#
# non_rpd_repatriated_am_info_3 <- EPD_METADATA %>%
#   dplyr::select(1:4, 6, 10) %>%
#   dplyr::right_join(non_rpd_repatriated_am_info_2 %>%
#                       dplyr::select(1:6, 8),
#                     by = c("entity_name" = "neotoma_entity_name"))

#### Samples ----
non_rpd_repatriated_am_info_EPD_COUNTS <- EPD_COUNTS %>%
  dplyr::filter(dataset_id %in% non_rpd_repatriated_am_info$external_ID_ENTITY) %>%
  # dplyr::filter(dataset_id %in% non_rpd_repatriated_am_info_3$dataset_id) %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-chronology_id) %>%
  dplyr::left_join(EPD_METADATA %>%
                     dplyr::select(1:3, 6, 10),
                   by = c("site_id", "dataset_id", "entity_name")) %>%
  dplyr::relocate(ID_SITE, ID_ENTITY, .before = 1) %>%
  dplyr::mutate(depth = ifelse(ID_ENTITY == 365 & depth == 0, 0.5, depth)) # Update depth GDU: 0 -> 0.5
non_rpd_repatriated_am_info_EPD_COUNTS

# Note: remove additional sample linked to GDU (ID_ENTITY = 365, depth = 0)
# ID_SAMPLE = 21612
special.epd::snapshot(conn, ID_ENTITY = 365)
# dabr::select(conn, "SELECT * FROM SAMPLE WHERE ID_ENTITY = 365 AND depth = 0")
# dabr::delete(conn, "DELETE FROM SAMPLE WHERE ID_ENTITY = 365 AND depth = 0")


# non_rpd_repatriated_am_info_EPD_COUNTS %>%
#   dplyr::distinct(ID_SITE, ID_ENTITY, site_id, site_name, dataset_id, entity_name) %>%
#   dplyr::select(site_id, site_name, entity_name, ID_SITE, ID_ENTITY, external_ID_ENTITY = dataset_id) %>%
#   dplyr::arrange(ID_SITE, ID_ENTITY) %>%
#   waldo::compare(non_rpd_repatriated_am_info %>%
#                    dplyr::arrange(ID_SITE, ID_ENTITY))

# non_rpd_repatriated_am_info_4 <- non_rpd_repatriated_am_info$sample %>%
#   dplyr::left_join(non_rpd_repatriated_am_info$age_model) %>%
#   # dplyr::mutate(age = dplyr::coalesce(age, est_age_original),
#   #               age_original = est_age_original) %>%
#   # dplyr::select(-est_age_original, -comment) %>%
#   dplyr::left_join(non_rpd_repatriated_am_info_3 %>%
#                      dplyr::select(ID_ENTITY, external_ID_ENTITY),
#                    by = "external_ID_ENTITY") %>%
#   # dplyr::filter(ID_ENTITY != 18) %>%
#   dplyr::select(-ID_SAMPLE, -external_ID_ENTITY) %>%
#   dplyr::relocate(ID_ENTITY, .before = 1)


non_rpd_repatriated_am_info_2 <- non_rpd_repatriated_am_info_EPD_COUNTS %>%
  dplyr::mutate(ID_SAMPLE = 18260 + seq_along(ID_ENTITY), # get_id_sample(conn)
                .after = ID_ENTITY) %>%
  dplyr::select(-ID_SITE, -site_id, -site_name, -site_name_clean, -dataset_id, -dataset_name, -entity_name, -sample_id, -unit_name)

# Check if the "new" records are already in th DB:
special.epd::snapshot(conn, ID_ENTITY = non_rpd_repatriated_am_info_2$ID_ENTITY)
dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_SAMPLE %in% non_rpd_repatriated_am_info_2$ID_SAMPLE |
                  ID_ENTITY %in% non_rpd_repatriated_am_info_2$ID_ENTITY)
meta_neo_res <- seq_len(nrow(non_rpd_repatriated_am_info_2)) %>%
  purrr::map(function(i) {
    non_rpd_repatriated_am_info_2[i, ] %>%
      dplyr::select(1:9) %>%
      rpd:::add_records(conn = conn, table = "sample", dry_run = TRUE)
  })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_DATES_NEO_DB <- dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_ENTITY %in% non_rpd_repatriated_am_info_2$ID_ENTITY)
waldo::compare(non_rpd_repatriated_am_info_2 %>%
                 dplyr::select(1:9) %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               age = round(age)),
               EPD_DATES_NEO_DB %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               age = round(age)) %>%
                 dplyr::select(-count_type, -sample_type),
               tolerance = 2)

#### (HERE) Age models (67) ----
non_rpd_repatriated_am_new <-
  find_age_models(path, non_rpd_repatriated_am_info$entity_name)
non_rpd_repatriated_am_new2 <- non_rpd_repatriated_am_new %>%
  purrr::pmap_df(upload_age_model, conn = conn)

all(non_rpd_repatriated_am_new2$status)
non_rpd_repatriated_am_new2
non_rpd_repatriated_am_new3 <-
  non_rpd_repatriated_am_new2$am %>%
  purrr::map_df(~.x) %>%
  magrittr::set_names(colnames(.) %>% stringr::str_to_upper())

EPD_AM <- dabr::select(conn,
                       "SELECT * FROM age_model WHERE ID_SAMPLE IN (",
                       paste0(non_rpd_repatriated_am_new3$ID_SAMPLE, collapse = ", "),
                       ") AND ID_MODEL = 8")
waldo::compare(non_rpd_repatriated_am_new3 %>%
                 dplyr::arrange(ID_SAMPLE),
               EPD_AM %>%
                 magrittr::set_names(colnames(.) %>% stringr::str_to_upper()) %>%
                 dplyr::arrange(ID_SAMPLE))

special.epd::snapshot(conn, entity_name = non_rpd_repatriated_am_new$entity_name)

# NOTE: Pending age models
non_rpd_repatriated_am_info %>%
  dplyr::filter(!(entity_name %in% non_rpd_repatriated_am_new2$entity_name))

tibble::tribble(
  ~site_id,                  ~site_name, ~entity_name, ~ID_SITE, ~ID_ENTITY, ~external_ID_ENTITY,         ~external_site_name, ~external_source,
  3234,           "Lilla Gloppsjön",   "LGLOPPSJ",     804L,       910L,               4200L,           "Lilla Gloppsjön",        "NEOTOMA",
  3245,               "Lobsigensee",    "LOBHOLO",     825L,       935L,               4213L,               "Lobsigensee",        "NEOTOMA",
  3265,               "Lake Maardu",      "MAASV",     712L,       802L,              45381L,               "Lake Maardu",        "NEOTOMA",
  13390,              "Trettetj√∏rn",     "TRETTE",    1333L,      1477L,              20042L,               "Trettetjørn",        "NEOTOMA",
  14551,      "Lac du Verney-Dessus",   "DESVERNE",     647L,       727L,              22810L,      "Lac du Verney-Dessus",        "NEOTOMA",
  14613, "Lago Piccolo di Avigliana",   "LAVPICCO",     667L,       751L,              22908L, "Lago Piccolo di Avigliana",        "NEOTOMA",
  16133,   "Paleochenal de Neublans",   "NEUBLANS",    1025L,      1143L,              24927L,   "Paleochenal de Neublans",        "NEOTOMA",
  16267,            "Peyre peat-bog",    "PEYREII",    1048L,      1166L,              25213L,            "Peyre peat-bog",        "NEOTOMA",
  23495,                  "Bibersee",      "BIBER",     143L,       162L,              40569L,                  "Bibersee",        "NEOTOMA",
  23895,             "Bay of Biscay",   "MD042845",     115L,       132L,              41446L,             "Bay of Biscay",        "NEOTOMA",
  26125,               "Svartvatnet",     "OTTSVA",    1256L,      1396L,              45343L,               "Svartvatnet",        "NEOTOMA",
  26143,                "Lake Ruila",        "RUI",     731L,       826L,              45379L,                "Lake Ruila",        "NEOTOMA",
  26144,               "Lake Kahala",       "KAHA",     703L,       791L,              45383L,               "Lake Kahala",        "NEOTOMA",
  26145,          "Viitna Linaj√§rv",       "VIPI",    1375L,      1532L,              45385L,           "Viitna Linajärv",        "NEOTOMA",
  26150,    "Lake Plaani K√ºlaj√§rv",        "PLA",     728L,       823L,              45398L,      "Lake Plaani Külajärv",        "NEOTOMA",
  26156,            "Lake Jarveotsa",   "JARVEOTS",     702L,       790L,              45410L,            "Lake Jarveotsa",        "NEOTOMA",
  26217,                "Grostjørna",    "SETESGR",     450L,       503L,              45719L,                "Grostjørna",        "NEOTOMA",
  26991,       "Pr√°≈°ilsk√© jezero",     "PRA-15",    1078L,      1198L,              47520L,          "Prášilské jezero",        "NEOTOMA",
  27384,                  "Yaksha 2",      "YAK2A",    1426L,      1595L,              49139L,                  "Yaksha 2",        "NEOTOMA",
  27385,                  "Yaksha 3",      "YAK3A",    1427L,      1596L,              49141L,                  "Yaksha 3",        "NEOTOMA",
  27526,                "Zeller See",   "ZELLERSE",    1440L,      1610L,              49444L,                "Zeller See",        "NEOTOMA",
  27563,             "Lake Sidi Ali",   "SALI0110",     738L,       833L,              49519L,             "Lake Sidi Ali",        "NEOTOMA"
)

aux <- non_rpd_repatriated_am_new2 %>%
  dplyr::left_join(EPD_METADATA %>%
                     dplyr::select(site_id, entity_name)) %>%
  dplyr::arrange(site_id)


# non_rpd_repatriated_am_new2 %>%
#   dplyr::filter(entity_name %in%
#                   (epd_age_models %>%
#                      dplyr::filter(ready_to_upload) %>%
#                      .$entity_name))

# ##### Export data ----
# conn %>%
#   special.epd::snapshot(entity_name = non_rpd_repatriated_am_new2$entity_name) %>%
#   special.epd::write_csvs(prefix = "~/Downloads/special_epd_snapshot_2022-03-09_")
# entity_tb <- readr::read_csv("special_epd_snapshot_2022-03-09__metadata.csv")
# dates_tb <- readr::read_csv("special_epd_snapshot_2022-03-09__dates.csv")
# samples_tb <- readr::read_csv("special_epd_snapshot_2022-03-09__samples.csv")
# age_model_tb <- readr::read_csv("special_epd_snapshot_2022-03-09__age_model.csv")
# pollen_counts_clean_tb <- readr::read_csv("special_epd_snapshot_2022-03-09__pollen_counts_clean.csv")
# pollen_counts_intermediate_tb <- readr::read_csv("special_epd_snapshot_2022-03-09__pollen_counts_intermediate.csv")
# pollen_counts_amalgamated_tb <- readr::read_csv("special_epd_snapshot_2022-03-09__pollen_counts_amalgamated.csv")

#### Counts ----
non_rpd_repatriated_am_info_3 <- non_rpd_repatriated_am_info_2 %>%
  # dplyr::filter(ID_SAMPLE == 21612) %>%
  dplyr::select(-ID_ENTITY, -depth, -thickness, -chronology_name, -age_type, -age, -age_younger, -age_older)
taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
non_rpd_repatriated_am_info_4 <-
  seq_len(nrow(non_rpd_repatriated_am_info_3)) %>%
  purrr::map_df(function(i) {
    non_rpd_repatriated_am_info_3[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      # dplyr::select(ID_SAMPLE, taxon_name = clean_name, count, epd_taxa) %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      # dplyr::filter(n > 1)
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only

  })

non_rpd_repatriated_am_info_5 <- non_rpd_repatriated_am_info_4
meta_neo_res <- seq_len(nrow(non_rpd_repatriated_am_info_5)) %>%
  purrr::map(function(i) {
    if (i %% 10000 == 0)
      print(i)
    non_rpd_repatriated_am_info_5[i, ] %>%

      rpd:::add_records(conn = conn, table = "pollen_count", dry_run = TRUE, quiet = TRUE)
  })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- dabr::select_all(conn, "pollen_count") %>%
  dplyr::filter(ID_SAMPLE %in% non_rpd_repatriated_am_info_3$ID_SAMPLE,
                amalgamation_level == 0)
waldo::compare(non_rpd_repatriated_am_info_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 2)

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

EPD_taxa_amalgamation_stage2 %>%
  dplyr::filter(intermediate != intermediate2)

non_rpd_repatriated_am_info_6 <-
  unique(non_rpd_repatriated_am_info_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    non_rpd_repatriated_am_info_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  })

meta_neo_res <- seq_len(nrow(non_rpd_repatriated_am_info_6)) %>%
  purrr::map(function(i) {
    if (i %% 10000 == 0)
      print(i)
    non_rpd_repatriated_am_info_6[i, ] %>%
      rpd:::add_records(conn = conn, table = "pollen_count", dry_run = TRUE, quiet = TRUE)
  })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- dabr::select_all(conn, "pollen_count") %>%
  dplyr::filter(ID_SAMPLE %in% non_rpd_repatriated_am_info_3$ID_SAMPLE,
                amalgamation_level == 1)
waldo::compare(non_rpd_repatriated_am_info_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 2)

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

EPD_taxa_amalgamation_stage3 %>%
  dplyr::filter(amalgamated != amalgamated2)

non_rpd_repatriated_am_info_7 <-
  unique(non_rpd_repatriated_am_info_6$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    non_rpd_repatriated_am_info_6 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  })

meta_neo_res <- seq_len(nrow(non_rpd_repatriated_am_info_7)) %>%
  purrr::map(function(i) {
    if (i %% 10000 == 0)
      print(i)
    non_rpd_repatriated_am_info_7[i, ] %>%
      rpd:::add_records(conn = conn, table = "pollen_count", dry_run = TRUE, quiet = TRUE)
  })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- dabr::select_all(conn, "pollen_count") %>%
  dplyr::filter(ID_SAMPLE %in% non_rpd_repatriated_am_info_3$ID_SAMPLE,
                amalgamation_level == 2)
waldo::compare(non_rpd_repatriated_am_info_7 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 2)

aux <- special.epd::snapshot(conn, ID_ENTITY = non_rpd_repatriated_am_info_2$ID_ENTITY)
aux2 <- aux$entity %>%
  dplyr::filter(ID_ENTITY %in% aux$sample$ID_ENTITY)

##### Inspection ----
tmp <- non_rpd_repatriated_am_info_5 %>%
  dplyr::group_by(ID_SAMPLE) %>%
  dplyr::summarise(n_clean = dplyr::n()) %>%
  dplyr::left_join(non_rpd_repatriated_am_info_6 %>%
                     dplyr::group_by(ID_SAMPLE) %>%
                     dplyr::summarise(n_intermediate = dplyr::n()),
                   by = "ID_SAMPLE") %>%
  dplyr::left_join(non_rpd_repatriated_am_info_7 %>%
                     dplyr::group_by(ID_SAMPLE) %>%
                     dplyr::summarise(n_amalgamated = dplyr::n()),
                   by = "ID_SAMPLE") %>%
  dplyr::mutate(diff_1 = n_clean - n_intermediate,
                diff_2 = n_clean - n_amalgamated) %>%
  dplyr::arrange(dplyr::desc(diff_2), dplyr::desc(diff_1))
tmp

# Manual inspection of a record
r0 <- non_rpd_repatriated_am_info_3 %>%
  dplyr::filter(ID_SAMPLE %in% 24032) %>%
  smpds::rm_na_taxa()
r1 <- non_rpd_repatriated_am_info_5 %>%
  dplyr::filter(ID_SAMPLE %in% 24032) %>%
  dplyr::left_join(taxon_name_tb,
                   by = "ID_TAXON") %>%
  dplyr::select(-ID_TAXON, -amalgamation_level) %>%
  tidyr::pivot_wider(ID_SAMPLE, names_from = taxon_name, values_from = count)

r2 <- non_rpd_repatriated_am_info_6 %>%
  dplyr::filter(ID_SAMPLE %in% 24032) %>%
  dplyr::left_join(taxon_name_tb,
                   by = "ID_TAXON") %>%
  dplyr::select(-ID_TAXON, -amalgamation_level) %>%
  tidyr::pivot_wider(ID_SAMPLE, names_from = taxon_name, values_from = count)

r3 <- non_rpd_repatriated_am_info_7 %>%
  dplyr::filter(ID_SAMPLE %in% 24032) %>%
  dplyr::left_join(taxon_name_tb,
                   by = "ID_TAXON") %>%
  dplyr::select(-ID_TAXON, -amalgamation_level) %>%
  tidyr::pivot_wider(ID_SAMPLE, names_from = taxon_name, values_from = count)

tmp2 <- dplyr::bind_rows(r0, r1, r2, r3) %>%
  dplyr::select(-ID_SAMPLE)
tmp3 <- tmp2[order(colnames(tmp2))]


## EMBSeCBIO ----
# Import the taxa amalgamation table
embsecbio_taxa_amalgamation <-
  readxl::read_excel("data-raw/embsecbio_taxon_list_clean_names_SPH.xlsx",
                     skip = 1) %>%
  janitor::clean_names() %>%
  dplyr::rename(taxon_name = taxon_clean,
                clean_name = epd_clean_names) %>%
  dplyr::filter(clean_name %>%
                  stringr::str_to_lower() %>%
                  stringr::str_squish() %>%
                  stringr::str_detect("exclude", negate = TRUE)) %>%
  dplyr::mutate(clean_name = clean_name %>%
                  stringr::str_squish())

embsecbio_repatriation <- epd_repatriation_tmp_file %>%
  readxl::read_excel(sheet = 2) %>%
  janitor::clean_names() %>%
  dplyr::filter(!is.na(site_in_embsecbio)) %>%
  dplyr::mutate(dates_to_be_extracted_from_embsecbio =
                  to_bool(dates_to_be_extracted_from_embsecbio),
                age_model_to_be_extracted_from_embsecbio =
                  to_bool(age_model_to_be_extracted_from_embsecbio)) %>%
  dplyr::rename(EMBSeCBIO_ID_ENTITY = embsecbio_id_entity,
                entity_name = entity_name_7,
                EMBSeCBIO_entity_name = entity_name_12)

### EMBSeCBIO dates (12) ----
embsecbio_repatriation_dates <- embsecbio_repatriation %>%
  dplyr::filter(dates_to_be_extracted_from_embsecbio)

bkg <- conn %>%
  special.epd::snapshot(entity_name = embsecbio_repatriation_dates$entity_name)
bkg$entity
bkg$date_info %>%
  split(.$ID_ENTITY) %>%
  names()
bkg$sample %>%
  split(.$ID_ENTITY) %>%
  names()
bkg$entity %>%
  dplyr::filter(!(ID_ENTITY %in% c(2L, 31L, 64L, 266L, 476L, 583L, 779L, 780L, 810L, 821L)))
special.epd::snapshot(conn, entity_name = c("KARAMIK", "BH2"))

embsecbio_repatriated_dates_info <-
  embsecbio_repatriation_dates$EMBSeCBIO_ID_ENTITY %>%
  extract_embsecbio()
embsecbio_repatriated_dates_info_2 <- embsecbio_repatriation_dates %>%
  dplyr::select(neotoma_ID_SITE = site_id,
                neotoma_site_name = site_name,
                neotoma_entity_name = entity_name,
                ID_ENTITY = EMBSeCBIO_ID_ENTITY) %>%
  dplyr::inner_join(embsecbio_repatriated_dates_info$metadata,
                    by = "ID_ENTITY") %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY,
                external_ID_SITE =  ID_SITE,
                external_site_name = site_name,
                external_entity_name = entity_name)
embsecbio_repatriated_dates_info_3 <- EPD_METADATA %>%
  dplyr::select(1:4, 6, 10) %>%
  dplyr::right_join(embsecbio_repatriated_dates_info_2 %>%
                      dplyr::select(1:6, 8),
                    by = c("entity_name" = "neotoma_entity_name"))

#### External links ----
meta_neo_res <- seq_len(nrow(embsecbio_repatriated_dates_info_3)) %>%
  purrr::map(function(i) {
    embsecbio_repatriated_dates_info_3[i, ] %>%
      dplyr::select(ID_SITE,
                    ID_ENTITY,
                    external_ID_SITE,
                    external_ID_ENTITY,
                    external_site_name,
                    external_entity_name) %>%
      dplyr::mutate(external_source = "EMBSECBIO") %>%
      rpd:::add_records(conn = conn, table = "external_link", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_METADATA_NEO_DB <- dabr::select_all(conn, "external_link") %>%
  dplyr::filter(external_ID_ENTITY %in%
                  embsecbio_repatriated_dates_info_3$external_ID_ENTITY,
                external_source == "EMBSECBIO") %>%
  dplyr::select(ID_SITE,
                ID_ENTITY,
                external_ID_SITE,
                external_ID_ENTITY,
                external_site_name,
                external_entity_name)
waldo::compare(embsecbio_repatriated_dates_info_3 %>%
                 dplyr::select(ID_SITE,
                               ID_ENTITY,
                               external_ID_SITE,
                               external_ID_ENTITY,
                               external_site_name,
                               external_entity_name) %>%
                 dplyr::mutate(external_ID_SITE = as.integer(external_ID_SITE),
                               external_ID_ENTITY = as.integer(external_ID_ENTITY)),
               EPD_METADATA_NEO_DB,
               tolerance = 1E-9)

#### Dates ----
embsecbio_repatriated_dates_info_4 <-
  embsecbio_repatriated_dates_info$date_info %>%
  dplyr::select(-ID_DATE_INFO) %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY,
                reason_age_not_used = date_comments) %>%
  dplyr::mutate(age_used = ifelse(is.na(reason_age_not_used), "yes", "no")) %>%
  dplyr::left_join(embsecbio_repatriated_dates_info_3 %>%
                     dplyr::select(ID_ENTITY, external_ID_ENTITY)) %>%
  dplyr::select(-external_ID_ENTITY) %>%
  dplyr::relocate(ID_ENTITY, .before = 1)
meta_neo_res <- seq_len(nrow(embsecbio_repatriated_dates_info_4)) %>%
  purrr::map(function(i) {
    embsecbio_repatriated_dates_info_4[i, ] %>%
      rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

##### Validate -----
EPD_DATES_NEO_DB <- dabr::select_all(conn, "date_info") %>%
  dplyr::filter(ID_ENTITY %in% embsecbio_repatriated_dates_info_4$ID_ENTITY)
waldo::compare(embsecbio_repatriated_dates_info_4 %>%
                 .[order(colnames(.))],
               EPD_DATES_NEO_DB %>%
                 .[order(colnames(.))] %>%
                 dplyr::select(-age_calib, -ID_DATE_INFO, -notes),
               tolerance = 1E-9)

# #### Samples ---
# aux <- EPD_COUNTS %>%
#   dplyr::filter(dataset_id %in% embsecbio_repatriated_dates_info_3$dataset_id) %>%
#   smpds::rm_na_taxa(1:16) %>%
#   dplyr::select(-chronology_id, -chronology_name) %>%
#   dplyr::left_join(EPD_METADATA %>%
#                      dplyr::select(1:6, 10),
#                    by = c("site_id", "site_name", "site_name_clean", "dataset_id", "entity_name")) %>%
#   dplyr::relocate(ID_SITE, ID_ENTITY, .before = 1)
#
# embsecbio_repatriated_dates_info$age_model %>%
#   dplyr::select(-est_age_original, -comment) %>%
#   dplyr::left_join(embsecbio_repatriated_dates_info_3 %>%
#                      dplyr::select(ID_ENTITY, external_ID_ENTITY))
# embsecbio_repatriated_dates_info$pollen %>%
#   dplyr::left_join(embsecbio_repatriated_dates_info_3 %>%
#                      dplyr::select(ID_ENTITY, external_ID_ENTITY))


### non-EMBSeCBIO / EPD dates (106) ----
epd_embsecbio_repatriation_dates <- embsecbio_repatriation %>%
  dplyr::filter(!dates_to_be_extracted_from_embsecbio)
epd_embsecbio_repatriated_dates_info <- EPD_DATES %>%
  dplyr::filter(entity_name %in% epd_embsecbio_repatriation_dates$entity_name) %>%
  dplyr::filter(!is.na(site_id), !is.na(depth))
  # epd_embsecbio_repatriation_dates$EMBSeCBIO_ID_ENTITY %>%
  # extract_embsecbio()

epd_embsecbio_external_links <- dabr::select_all(conn, "external_link") %>%
  dplyr::filter(external_entity_name %in%
                  epd_embsecbio_repatriated_dates_info$entity_name)

epd_embsecbio_repatriated_dates_info_2 <- epd_embsecbio_repatriation_dates %>%
  dplyr::select(external_ID_SITE = site_id,
                external_site_name = site_name,
                external_entity_name = entity_name) %>%
  dplyr::left_join(epd_embsecbio_external_links,
                   by = c("external_ID_SITE",
                          "external_site_name",
                          "external_entity_name")) %>%
  dplyr::right_join(epd_embsecbio_repatriated_dates_info,
                    by = c(
                      "external_ID_SITE" = "site_id",
                      "external_site_name" = "site_name",
                      "external_entity_name" = "entity_name"
                    ))

epd_embsecbio_repatriated_dates_info_3 <- epd_embsecbio_repatriated_dates_info_2

#### Dates ----
epd_embsecbio_repatriated_dates_info_4 <-
  epd_embsecbio_repatriated_dates_info_3 %>%
  dplyr::select(-c(1:3, 6:7)) %>%
  dplyr::arrange(ID_ENTITY, depth) %>%
  dplyr::select(-ID_SITE, -ages_already, -site_name_clean) %>%
  dplyr::rename(age_calib = age_cal)
  # epd_embsecbio_repatriated_dates_info$date_info %>%
  # dplyr::select(-ID_DATE_INFO) %>%
  # dplyr::rename(external_ID_ENTITY = ID_ENTITY,
  #               reason_age_not_used = date_comments) %>%
  # dplyr::mutate(age_used = ifelse(is.na(reason_age_not_used), "yes", "no")) %>%
  # dplyr::left_join(epd_embsecbio_repatriated_dates_info_3 %>%
  #                    dplyr::select(ID_ENTITY, external_ID_ENTITY)) %>%
  # dplyr::select(-external_ID_ENTITY) %>%
  # dplyr::relocate(ID_ENTITY, .before = 1)

epd_embsecbio_repatriated_dates_info_4_EPD <- EPD_DATES %>%
  dplyr::filter(entity_name %in%
                  epd_embsecbio_repatriated_dates_info_3$external_entity_name) %>% #,
                # is.na(ages_already) | ages_already == "EMBSECBIO") %>%
  dplyr::filter(!is.na(site_id), !is.na(depth)) %>%
  dplyr::left_join(EPD_METADATA %>%
                     dplyr::select(1:6),
                   by = c("site_id", "site_name", "site_name_clean", "entity_name")) %>%
  dplyr::relocate(ID_SITE, ID_ENTITY, .before = 1) %>%
  dplyr::arrange(ID_ENTITY, depth) %>%
  dplyr::select(-ID_SITE, -ages_already, -site_id, -site_name, -site_name_clean, -entity_name) %>%
  dplyr::rename(age_calib = age_cal)
#
# waldo::compare(epd_embsecbio_repatriated_dates_info_4, epd_embsecbio_repatriated_dates_info_4_EPD)
dplyr::bind_rows(epd_embsecbio_repatriated_dates_info_4,
                 epd_embsecbio_repatriated_dates_info_4_EPD) %>%
  dplyr::mutate(lab_num = stringr::str_squish(lab_num) %>%
                  stringr::str_remove_all("\\s|-")) %>%
  dplyr::group_by(ID_ENTITY, lab_num) %>%
  dplyr::mutate(n = length(lab_num)) %>%
  dplyr::arrange(ID_ENTITY, depth) %>%
  dplyr::filter(n != 2)

dabr::select_all(conn, "date_info") %>%
  dplyr::filter(ID_ENTITY %in% epd_embsecbio_repatriated_dates_info_4_EPD$ID_ENTITY)

waldo::compare(epd_embsecbio_repatriated_dates_info_4_EPD %>%
                 dplyr::arrange(ID_ENTITY, depth),
               epd_embsecbio_repatriated_dates_info_4 %>%
                 dplyr::arrange(ID_ENTITY, depth),
               tolerance = 1E-9)
meta_neo_res <- seq_len(nrow(epd_embsecbio_repatriated_dates_info_4_EPD)) %>%
  purrr::map(function(i) {
    epd_embsecbio_repatriated_dates_info_4_EPD[i, ] %>%
      rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_DATES_NEO_DB <- dabr::select_all(conn, "date_info") %>%
  dplyr::filter(ID_ENTITY %in% epd_embsecbio_repatriated_dates_info_4_EPD$ID_ENTITY)
waldo::compare(epd_embsecbio_repatriated_dates_info_4_EPD %>%
                 .[order(colnames(.))] %>%
                 dplyr::arrange(ID_ENTITY, depth),
               EPD_DATES_NEO_DB %>%
                 .[order(colnames(.))] %>%
                 dplyr::select(-ID_DATE_INFO) %>%
                 dplyr::arrange(ID_ENTITY, depth),
               max_diffs = Inf,
               tolerance = 1E-9)


special.epd::snapshot(conn,
                      ID_ENTITY = epd_embsecbio_repatriated_dates_info_4_EPD$ID_ENTITY)
# epd_embsecbio_repatriated_dates_info_4_EPD %>%
#   dplyr::filter(is.na(ID_ENTITY))
# epd_embsecbio_repatriated_dates_info_4_EPD %>%
#   dplyr::filter((lab_num %in% EPD_DATES_NEO_DB$lab_num))
#   dplyr::filter(!(ID_ENTITY %in% EPD_DATES_NEO_DB$ID_ENTITY &
#                     lab_num %in% EPD_DATES_NEO_DB$lab_num))
#   # dplyr::filter(!(ID_ENTITY %in% EPD_DATES_NEO_DB$ID_ENTITY &
#   #                   depth %in% EPD_DATES_NEO_DB$depth))


### EMBSeCBIO age models (106) ----
embsecbio_repatriation_am <- embsecbio_repatriation %>%
  dplyr::filter(age_model_to_be_extracted_from_embsecbio)

embsecbio_repatriated_am_info <-
  embsecbio_repatriation_am$EMBSeCBIO_ID_ENTITY %>%
  extract_embsecbio()

embsecbio_taxon_names <- embsecbio_repatriated_am_info$pollen$taxon_clean %>%
  unique()
# tibble::tibble(
#   taxon_clean = embsecbio_repatriated_am_info$pollen$taxon_clean %>% unique() %>% sort()
# ) %>% readr::write_excel_csv("~/Downloads/embsecbio_taxon_list_clean_names.csv")

embsecbio_repatriated_am_info_2 <- embsecbio_repatriation_am %>%
  dplyr::select(neotoma_ID_SITE = site_id,
                neotoma_site_name = site_name,
                neotoma_entity_name = entity_name,
                ID_ENTITY = EMBSeCBIO_ID_ENTITY) %>%
  dplyr::inner_join(embsecbio_repatriated_am_info$metadata,
                    by = "ID_ENTITY") %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY,
                external_ID_SITE =  ID_SITE,
                external_site_name = site_name,
                external_entity_name = entity_name)

embsecbio_repatriated_am_info_3 <- EPD_METADATA %>%
  dplyr::select(1:4, 6, 10) %>%
  dplyr::right_join(embsecbio_repatriated_am_info_2 %>%
                      dplyr::select(1:6, 8),
                    by = c("entity_name" = "neotoma_entity_name"))

#### Samples ----
embsecbio_repatriated_am_info_EPD_COUNTS <- EPD_COUNTS %>%
  dplyr::filter(dataset_id %in% embsecbio_repatriated_am_info_3$dataset_id) %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-chronology_id) %>%
  dplyr::left_join(EPD_METADATA %>%
                     dplyr::select(1:6, 10),
                   by = c("site_id", "site_name", "site_name_clean", "dataset_id", "entity_name")) %>%
  dplyr::relocate(ID_SITE, ID_ENTITY, .before = 1)

embsecbio_repatriated_am_info_4 <- embsecbio_repatriated_am_info$age_model %>%
  dplyr::mutate(age = dplyr::coalesce(age, est_age_original),
                age_original = est_age_original) %>%
  dplyr::select(-est_age_original, -comment) %>%
  dplyr::left_join(embsecbio_repatriated_am_info_3 %>%
                     dplyr::select(ID_ENTITY, external_ID_ENTITY)) %>%
  # dplyr::filter(ID_ENTITY != 18) %>%
  dplyr::select(-ID_SAMPLE, -external_ID_ENTITY) %>%
  dplyr::relocate(ID_ENTITY, .before = 1)

# Extract thickness from the EPD data
embsecbio_repatriated_am_info_5 <- embsecbio_repatriated_am_info_4 %>%
  dplyr::mutate(depth2 = round(depth, 3)) %>%
  dplyr::left_join(embsecbio_repatriated_am_info_EPD_COUNTS %>%
                     dplyr::select(ID_ENTITY, depth, thickness, chronology_name,
                                   age_EPD = age, age_younger, age_older, age_type) %>%
                     dplyr::mutate(depth = round(depth, 3)),
                   by = c("ID_ENTITY",  "depth2" = "depth")) %>%
  dplyr::select(-depth2) %>%
  dplyr::relocate(thickness,
                  # age_original,
                  age_EPD,
                  chronology_name,
                  age_younger,
                  age_older,
                  age_type,
                  .before = mean) %>%
  # dplyr::filter(age != age_EPD)
  # dplyr::group_by(ID_ENTITY) %>%
  dplyr::mutate(lower = round(age_EPD) * 0.99,
                upper = round(age_EPD) * 1.01,
                same_age = (round(age) >= lower & round(age) <= upper) | (is.na(age) & is.na(age_EPD)),
                chronology_name = ifelse(same_age, chronology_name, NA),
                age_younger = ifelse(same_age, age_younger, NA),
                age_older = ifelse(same_age, age_older, NA),
                age_type = ifelse(same_age, age_type, NA)) %>%
  dplyr::select(-age_original, -age_EPD, -lower, -upper, -same_age) %>%
  dplyr::mutate(ID_SAMPLE = seq_along(ID_ENTITY), .before = 1)
  # dplyr::mutate(ID_SAMPLE = get_id_sample(conn))

# Verify if the EPD has more samples than the EMBSeCBIO
tibble::tibble(
  ID_ENTITY = names(table(embsecbio_repatriated_am_info_EPD_COUNTS$ID_ENTITY)),
  EPD = table(embsecbio_repatriated_am_info_EPD_COUNTS$ID_ENTITY),
  EMB = table(embsecbio_repatriated_am_info_4$ID_ENTITY)
) %>%
  dplyr::filter(EPD > EMB)

# Missing samples (the entity 'GALINI' does not have an age model and only
# original_est_age at 2575 cm).
embsecbio_missing_samples <- embsecbio_repatriated_am_info$sample %>%
  dplyr::filter(!(ID_SAMPLE %in% embsecbio_repatriated_am_info$age_model$ID_SAMPLE)) %>%
  dplyr::left_join(embsecbio_repatriated_am_info_3 %>%
                     dplyr::select(ID_ENTITY, external_ID_ENTITY),
                   by = "external_ID_ENTITY") %>%
  dplyr::select(-ID_SAMPLE, -external_ID_ENTITY) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::mutate(depth2 = round(depth, 3)) %>%
  dplyr::left_join(embsecbio_repatriated_am_info_EPD_COUNTS %>%
                     dplyr::select(ID_ENTITY, depth, thickness, chronology_name,
                                   age_EPD = age, age_younger, age_older, age_type) %>%
                     dplyr::mutate(depth = round(depth, 3)),
                   by = c("ID_ENTITY",  "depth2" = "depth")) %>%
  dplyr::select(-depth2) %>%
  dplyr::relocate(thickness,
                  age_EPD,
                  chronology_name,
                  age_younger,
                  age_older,
                  age_type,
                  .after = depth) %>%
  dplyr::mutate(ID_SAMPLE = 101316:101360)

embsecbio_missing_samples %>%
  rpd:::add_records(conn = conn, table = "sample", dry_run = TRUE)
# Results: 45 records were inserted.

meta_neo_res <- seq_len(nrow(embsecbio_repatriated_am_info_5)) %>%
  purrr::map(function(i) {
    embsecbio_repatriated_am_info_5[i, ] %>%
      dplyr::select(1:11) %>%
      rpd:::add_records(conn = conn, table = "sample", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_DATES_NEO_DB <- dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_ENTITY %in% embsecbio_repatriated_am_info_5$ID_ENTITY)
waldo::compare(embsecbio_repatriated_am_info_5 %>%
                 dplyr::select(1:11) %>%
                 dplyr::bind_rows(embsecbio_missing_samples) %>%
                 dplyr::arrange(ID_ENTITY, depth) %>%
                 dplyr::select(-ID_SAMPLE) %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               age = round(age)) %>%
                 magrittr::set_class(class(.)[-1]),
               EPD_DATES_NEO_DB %>%
                 .[order(colnames(.))] %>%
                 dplyr::arrange(ID_ENTITY, depth) %>%
                 dplyr::select(-ID_SAMPLE) %>%
                 dplyr::mutate(depth = round(depth, 3),
                               age = round(age)),
               tolerance = 1)

#### Age models ----
meta_neo_res <- seq_len(nrow(embsecbio_repatriated_am_info_5)) %>%
  purrr::map(function(i) {
    embsecbio_repatriated_am_info_5[i, ] %>%
      dplyr::select(1, 12:17) %>%
      dplyr::mutate(ID_MODEL = 8, .before = 2) %>% # Bacon IntCal20
      rpd:::add_records(conn = conn, table = "age_model", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate ----
EPD_DATES_NEO_DB <- dabr::select_all(conn, "age_model") %>%
  dplyr::filter(ID_SAMPLE %in% embsecbio_repatriated_am_info_5$ID_SAMPLE)
waldo::compare(embsecbio_repatriated_am_info_5 %>%
                 dplyr::select(1, 12:17) %>%
                 dplyr::mutate(ID_MODEL = 8, .before = 2) %>%
                 .[order(colnames(.))] %>%
                 magrittr::set_class(class(.)[-1]),
               EPD_DATES_NEO_DB %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)

# New age model: 'BOZOVA' - NOT AVAILABLE
find_age_models("~/Downloads/special_epd_am/", entity_name = "BOZOVA")

#### Counts ----
EPD_SAMPLES_NEO_DB <- dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_ENTITY %in% embsecbio_repatriated_am_info_5$ID_ENTITY) %>%
  dplyr::select(ID_ENTITY,
                ID_SAMPLE,
                depth)
embsecbio_pollen_counts <- embsecbio_repatriated_am_info$pollen %>%
  dplyr::select(-ID_SAMPLE) %>%
  dplyr::left_join(embsecbio_repatriated_am_info_3 %>%
                     dplyr::select(ID_ENTITY, entity_name, external_ID_ENTITY),
                   by = "external_ID_ENTITY") %>%
  dplyr::mutate(depth_rnd = round(depth, 3)) %>%
  dplyr::left_join(EPD_SAMPLES_NEO_DB %>%
                     dplyr::mutate(depth_rnd = round(depth, 3)),
                   by = c("ID_ENTITY", "depth_rnd"))
embsecbio_pollen_counts %>% dplyr::filter(is.na(ID_SAMPLE))
# embsecbio_pollen_counts %>%
#   dplyr::group_by(ID_SAMPLE, ID_ENTITY, taxon_clean, depth.x)
# extract_embsecbio(1132)

aux <- conn %>%
  special.epd::snapshot(ID_ENTITY = embsecbio_pollen_counts$ID_ENTITY)
aux$date_info$ID_ENTITY %>% unique() %>% length()
aux$sample$ID_ENTITY %>% unique() %>% length()
aux$age_model$ID_SAMPLE %>% unique() %>% length()
embsecbio_pollen_counts$ID_SAMPLE %>% unique() %>% length()
tibble::tibble(
  ID_SAMPLE = embsecbio_pollen_counts$ID_SAMPLE %>% unique(),
) %>%
  dplyr::anti_join(
    tibble::tibble(
      ID_SAMPLE = aux$sample$ID_SAMPLE %>% unique()
    )
  )

# special.epd::snapshot(ID_ENTITY = epd_repatriated_samples_2$ID_ENTITY)
existing_pollen_counts <-
  aux$pollen_count %>%
  purrr::map_df(~.x) %>% # Combine the three pollen count tables
  dplyr::select(ID_SAMPLE) %>%
  dplyr::group_by(ID_SAMPLE) %>%
  dplyr::summarise(n = dplyr::n())

existing_pollen_counts %>%
  dplyr::filter(n != 3) # Verify if sample does not have the 3 tables

# epd_repatriated_samples_3 <- epd_repatriated_samples_2 %>%
#   dplyr::select(-ID_ENTITY, -depth, -thickness, -chronology_name, -age_type, -age, -age_younger, -age_older)
#
# epd_repatriated_samples_3 %>%
#   dplyr::filter(ID_SAMPLE %in% existing_pollen_counts$ID_SAMPLE)
taxon_name_tb <- dabr::select_all(conn, "taxon_name")
# Check if all the EMBSeCBIO taxons are in the `taxon_name` table
new_taxon_names <-
  c(embsecbio_taxa_amalgamation$clean_name,
  embsecbio_taxa_amalgamation$intermediate,
  embsecbio_taxa_amalgamation$amalgamated) %>%
  unique() %>%
  sort() %>%
  tibble::tibble() %>%
  magrittr::set_names("taxon_name") %>%
  dplyr::left_join(taxon_name_tb) %>%
  dplyr::filter(is.na(ID_TAXON))
new_taxon_names %>%
  rpd:::add_records(conn = conn, table = "taxon_name", dry_run = TRUE)
# Results: 47 records were inserted.

# Remove taxons not in the `embsecbio_taxa_amalgamation` table
embsecbio_pollen_counts_excluded <- embsecbio_pollen_counts %>%
  dplyr::filter(!(taxon_clean %in% embsecbio_taxa_amalgamation$taxon_name))
embsecbio_pollen_counts_2 <- embsecbio_pollen_counts %>%
  dplyr::filter(taxon_clean %in% embsecbio_taxa_amalgamation$taxon_name)
##### Clean ----
# oplan <- future::plan(future::multisession, workers = 8)
# options(future.globals.maxSize = 2000*1024^2)
embsecbio_pollen_counts_3 <-
  sort(unique(embsecbio_pollen_counts$ID_SAMPLE)) %>%
  # furrr::future_map_dfr(function(i) {
  purrr::map_df(function(i) {
    embsecbio_pollen_counts_2 %>%
      dplyr::filter(ID_SAMPLE == i) %>%
      dplyr::select(ID_SAMPLE, taxon_name = taxon_clean, count = taxon_count) %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(embsecbio_taxa_amalgamation,
                       by = "taxon_name") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)
# future::plan(oplan)

embsecbio_pollen_counts_3 %>%
  dplyr::filter(is.na(ID_TAXON))
dim(embsecbio_pollen_counts_3)


embsecbio_pollen_counts_3 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx <- idx_pairs(nrow(embsecbio_pollen_counts_3), 2000)
pb <- progress::progress_bar$new(total = nrow(idx))
# meta_neo_res <- seq_len(nrow(embsecbio_pollen_counts)) %>%
meta_neo_res <-
  purrr::map2(idx$x,
              idx$y,
              ~ {
                pb$tick()
                embsecbio_pollen_counts_3[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count", dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- dabr::select_all(conn, "pollen_count") %>%
  dplyr::filter(ID_SAMPLE %in% embsecbio_pollen_counts_3$ID_SAMPLE,
                amalgamation_level == 0)
waldo::compare(embsecbio_pollen_counts_3 %>%
                 # dplyr::arrange(ID_SAMPLE, ID_TAXON) %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1e-9)
# 165980
##### Intermediate ----
embsecbio_taxa_amalgamation_stage2 <- embsecbio_taxa_amalgamation %>%
  dplyr::select(-taxon_name) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

embsecbio_pollen_counts_4 <-
  sort(unique(embsecbio_pollen_counts_3$ID_SAMPLE)) %>%
  purrr::map_df(function(i) {
    embsecbio_pollen_counts_3 %>%
      dplyr::filter(ID_SAMPLE == i) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(embsecbio_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

embsecbio_pollen_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON))

embsecbio_pollen_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(embsecbio_pollen_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                embsecbio_pollen_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count", dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- dabr::select_all(conn, "pollen_count") %>%
  dplyr::filter(ID_SAMPLE %in% embsecbio_pollen_counts_4$ID_SAMPLE,
                amalgamation_level == 1)
waldo::compare(embsecbio_pollen_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1e-9)
# 162962

##### Amalgamated ----
embsecbio_taxa_amalgamation_stage3 <- embsecbio_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

embsecbio_pollen_counts_5 <-
  unique(embsecbio_pollen_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    embsecbio_pollen_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(embsecbio_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

embsecbio_pollen_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)|is.na(count))

embsecbio_pollen_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(embsecbio_pollen_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                embsecbio_pollen_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count", dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
# 152998
###### Validate -----
EPD_TAXA <- dabr::select_all(conn, "pollen_count") %>%
  dplyr::filter(ID_SAMPLE %in% embsecbio_pollen_counts_5$ID_SAMPLE,
                amalgamation_level == 2)
waldo::compare(embsecbio_pollen_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1e-9)

embsecbio_pollen_counts %>%
  dplyr::distinct(entity_name, .keep_all = TRUE)


### non-EMBSeCBIO / EPD age models (12) ----
non_embsecbio_repatriation_am <- embsecbio_repatriation %>%
  dplyr::filter(!age_model_to_be_extracted_from_embsecbio)

non_embsecbio_repatriated_am_info <- non_embsecbio_repatriation_am %>%
  dplyr::select(site_id, site_name, entity_name) %>%
  dplyr::left_join(dabr::select_all(conn, "external_link"),
                   by = c("site_id" = "external_ID_SITE",
                          "entity_name" = "external_entity_name"))

#### Samples ----
non_embsecbio_repatriated_am_info_EPD_COUNTS <- EPD_COUNTS %>%
  dplyr::filter(dataset_id %in% non_embsecbio_repatriated_am_info$external_ID_ENTITY) %>%
  # dplyr::filter(dataset_id %in% non_embsecbio_repatriated_am_info_3$dataset_id) %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-chronology_id) %>%
  dplyr::left_join(EPD_METADATA %>%
                     dplyr::select(1:3, 6, 10),
                   by = c("site_id", "dataset_id", "entity_name")) %>%
  dplyr::relocate(ID_SITE, ID_ENTITY, .before = 1) #%>%
  # dplyr::mutate(depth = ifelse(ID_ENTITY == 365 & depth == 0, 0.5, depth)) # Update depth GDU: 0 -> 0.5
non_embsecbio_repatriated_am_info_EPD_COUNTS

# # Note: remove additional sample linked to GDU (ID_ENTITY = 365, depth = 0)
# # ID_SAMPLE = 21612
# special.epd::snapshot(conn, ID_ENTITY = 365)


non_embsecbio_repatriated_am_info_2 <- non_embsecbio_repatriated_am_info_EPD_COUNTS %>%
  dplyr::mutate(ID_SAMPLE = 28481 + seq_along(ID_ENTITY), # get_id_sample(conn)
                .after = ID_ENTITY) %>%
  dplyr::select(-ID_SITE, -site_id, -site_name, -site_name_clean, -dataset_id, -dataset_name, -entity_name, -sample_id, -unit_name)

# Check if the "new" records are already in th DB:
special.epd::snapshot(conn, ID_ENTITY = non_embsecbio_repatriated_am_info_2$ID_ENTITY)
dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_SAMPLE %in% non_embsecbio_repatriated_am_info_2$ID_SAMPLE |
                  ID_ENTITY %in% non_embsecbio_repatriated_am_info_2$ID_ENTITY)
meta_neo_res <- seq_len(nrow(non_embsecbio_repatriated_am_info_2)) %>%
  purrr::map(function(i) {
    non_embsecbio_repatriated_am_info_2[i, ] %>%
      dplyr::select(1:9) %>%
      rpd:::add_records(conn = conn, table = "sample", dry_run = TRUE)
  })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_DATES_NEO_DB <- dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_ENTITY %in% non_embsecbio_repatriated_am_info_2$ID_ENTITY)
waldo::compare(non_embsecbio_repatriated_am_info_2 %>%
                 dplyr::select(1:9) %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               age = round(age)),
               EPD_DATES_NEO_DB %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               age = round(age)) %>%
                 dplyr::select(-count_type, -sample_type),
               tolerance = 2)

#### Age models (11) ----
non_embsecbio_repatriated_am_new <-
  find_age_models(path, non_embsecbio_repatriated_am_info$entity_name)

# Check if the AM are ready to be uploaded
non_embsecbio_repatriated_am_new %>%
  dplyr::filter(entity_name %in% epd_age_models_ready_to_upload$entity_name)

# Check if the age models already exist in the DB
special.epd::snapshot(conn, entity_name = non_embsecbio_repatriated_am_new$entity_name)

non_embsecbio_repatriated_am_new2 <- non_embsecbio_repatriated_am_new %>%
  purrr::pmap_df(upload_age_model, conn = conn)
# Check if all the age models were uploaded
all(non_embsecbio_repatriated_am_new2$status)
non_embsecbio_repatriated_am_new2
non_embsecbio_repatriated_am_new3 <-
  non_embsecbio_repatriated_am_new2$am %>%
  purrr::map_df(~.x) %>%
  magrittr::set_names(colnames(.) %>% stringr::str_to_upper())

EPD_AM <- dabr::select(conn,
                       "SELECT * FROM age_model WHERE ID_SAMPLE IN (",
                       paste0(non_embsecbio_repatriated_am_new3$ID_SAMPLE, collapse = ", "),
                       ") AND ID_MODEL = 8")
waldo::compare(non_embsecbio_repatriated_am_new3 %>%
                 dplyr::arrange(ID_SAMPLE),
               EPD_AM %>%
                 magrittr::set_names(colnames(.) %>% stringr::str_to_upper()) %>%
                 dplyr::arrange(ID_SAMPLE))

special.epd::snapshot(conn, entity_name = non_embsecbio_repatriated_am_new$entity_name)

# NOTE: Pending age models
non_embsecbio_repatriated_am_info %>%
  dplyr::filter(!(entity_name %in% non_embsecbio_repatriated_am_new2$entity_name))

# This entity only has one date, so no age model was created
tibble::tribble(
  ~site_id,    ~site_name, ~entity_name, ~ID_SITE, ~ID_ENTITY, ~external_ID_ENTITY, ~external_site_name, ~external_source,
  3033, "Bozova Lake",     "BOZOVA",     185L,       206L,               3950L,       "Bozova Lake",        "NEOTOMA"
)


aux <- non_embsecbio_repatriated_am_new2 %>%
  dplyr::left_join(EPD_METADATA %>%
                     dplyr::select(site_id, entity_name)) %>%
  dplyr::arrange(site_id)

#### Counts ----
non_embsecbio_repatriated_am_info_3 <- non_embsecbio_repatriated_am_info_2 %>%
  # dplyr::filter(ID_SAMPLE == 21612) %>%
  dplyr::select(-ID_ENTITY, -depth, -thickness, -chronology_name, -age_type, -age, -age_younger, -age_older)
taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
non_embsecbio_repatriated_am_info_4 <-
  seq_len(nrow(non_embsecbio_repatriated_am_info_3)) %>%
  purrr::map_df(function(i) {
    non_embsecbio_repatriated_am_info_3[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      # dplyr::select(ID_SAMPLE, taxon_name = clean_name, count, epd_taxa) %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      # dplyr::filter(n > 1)
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

meta_neo_res <- seq_len(nrow(non_embsecbio_repatriated_am_info_4)) %>%
  purrr::map(function(i) {
    if (i %% 10000 == 0)
      print(i)
    non_embsecbio_repatriated_am_info_4[i, ] %>%

      rpd:::add_records(conn = conn, table = "pollen_count", dry_run = TRUE, quiet = TRUE)
  })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- dabr::select_all(conn, "pollen_count") %>%
  dplyr::filter(ID_SAMPLE %in% non_embsecbio_repatriated_am_info_4$ID_SAMPLE,
                amalgamation_level == 0)
waldo::compare(non_embsecbio_repatriated_am_info_4 %>%
                 # dplyr::arrange(ID_SAMPLE, ID_TAXON) %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1e-9)

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

EPD_taxa_amalgamation_stage2 %>%
  dplyr::filter(intermediate != intermediate2)

non_embsecbio_repatriated_am_info_5 <-
  unique(non_embsecbio_repatriated_am_info_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    non_embsecbio_repatriated_am_info_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

meta_neo_res <- seq_len(nrow(non_embsecbio_repatriated_am_info_5)) %>%
  purrr::map(function(i) {
    if (i %% 10000 == 0)
      print(i)
    non_embsecbio_repatriated_am_info_5[i, ] %>%
      rpd:::add_records(conn = conn, table = "pollen_count", dry_run = TRUE, quiet = TRUE)
  })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- dabr::select_all(conn, "pollen_count") %>%
  dplyr::filter(ID_SAMPLE %in% non_embsecbio_repatriated_am_info_5$ID_SAMPLE,
                amalgamation_level == 1)
waldo::compare(non_embsecbio_repatriated_am_info_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1e-9)

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

EPD_taxa_amalgamation_stage3 %>%
  dplyr::filter(amalgamated != amalgamated2)

non_embsecbio_repatriated_am_info_6 <-
  unique(non_embsecbio_repatriated_am_info_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    non_embsecbio_repatriated_am_info_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

meta_neo_res <- seq_len(nrow(non_embsecbio_repatriated_am_info_6)) %>%
  purrr::map(function(i) {
    if (i %% 10000 == 0)
      print(i)
    non_embsecbio_repatriated_am_info_6[i, ] %>%
      rpd:::add_records(conn = conn, table = "pollen_count", dry_run = F, quiet = TRUE)
  })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- dabr::select_all(conn, "pollen_count") %>%
  dplyr::filter(ID_SAMPLE %in% non_embsecbio_repatriated_am_info_6$ID_SAMPLE,
                amalgamation_level == 2)
waldo::compare(non_embsecbio_repatriated_am_info_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1e-9)

aux <- special.epd::snapshot(conn,
                             ID_ENTITY = non_embsecbio_repatriated_am_info_2$ID_ENTITY)
aux$entity %>%
  dplyr::filter(ID_ENTITY %in% aux$sample$ID_ENTITY)
non_embsecbio_repatriated_am_info %>%
  dplyr::filter(ID_ENTITY %in% non_embsecbio_repatriated_am_info_2$ID_ENTITY)
aux2

## EMBSeCBIO (extra) ----
embsecbio_extra_repatriation <- epd_repatriation_tmp_file %>%
  readxl::read_excel(sheet = 3) %>%
  janitor::clean_names() %>%
  dplyr::mutate(dates_to_be_extracted_from_embsecbio =
                  to_bool(dates_to_be_extracted_from_embsecbio),
                age_model_to_be_extracted_from_embsecbio =
                  to_bool(age_model_to_be_extracted_from_embsecbio)) %>%
  dplyr::rename(EMBSeCBIO_ID_ENTITY = embsecbio_id_entity,
                entity_name = entity_name_7,
                EMBSeCBIO_entity_name = entity_name_12)

### EMBSeCBIO dates (8) ----
embsecbio_extra_repatriation_dates <- embsecbio_extra_repatriation %>%
  dplyr::filter(dates_to_be_extracted_from_embsecbio)

embsecbio_extra_repatriated_dates_info <-
  embsecbio_extra_repatriation_dates$EMBSeCBIO_ID_ENTITY %>%
  extract_embsecbio()

embsecbio_taxon_names <- embsecbio_repatriated_am_info$pollen$taxon_clean %>%
  unique()
embsecbio_extra_taxon_names <- embsecbio_extra_repatriated_dates_info$pollen$taxon_clean %>%
  unique()
# tibble::tibble(
#   taxon_clean = c(embsecbio_taxon_names,
#                   embsecbio_extra_taxon_names) %>%
#     unique() %>%
#     sort()
# ) %>% readr::write_excel_csv("~/Downloads/embsecbio_taxon_list_clean_names.csv")

#### Entities ----
embsecbio_extra_repatriated_dates_info_entities <-
  embsecbio_extra_repatriated_dates_info$metadata %>%
  dplyr::arrange(site_name, entity_name) %>%
  dplyr::filter(entity_name %>% stringr::str_detect("GS05", TRUE)) %>%
  dplyr::mutate(ID_SITE = c(1449, 1450, 1451, 1451, 1451, 1451, 1452, 1453),
                ID_ENTITY = 1620:1627) %>%
  # dplyr::mutate(ID_SITE = c(1448, 1449, 1450, 1451, 1451, 1451, 1451, 1452, 1453),
  #               ID_ENTITY = 1619:1627) %>%
  dplyr::select(-entity_type)
  # dplyr::mutate(ID_SITE = c(1, 2, 3, 4, 4, 4, 4, 5, 6) + get_id_site(conn),
  #               ID_ENTITY = seq_along(ID_ENTITY) + get_id_entity(conn))

# (2022-03-10): ID_ENTITY = 1619 This is a duplicated entry of GS05
# Change depth of GS05 (ID_ENTITY = 509) from -518 to -26
special.epd::snapshot(conn, ID_ENTITY = c(509, 1619))
# special.epd::snapshot(conn, entity_name = "GS05")
dabr::select(conn,
             "SELECT * FROM entity WHERE ID_ENTITY = 1619 AND entity_name LIKE '%Caspian%GS05%'")
dabr::delete(conn,
             "DELETE FROM entity WHERE ID_ENTITY = 1619 AND entity_name LIKE '%Caspian%GS05%'")
# Results: 1 record was deleted.
dabr::select(conn,
             "SELECT * FROM external_link WHERE external_ID_ENTITY = 1830 AND external_entity_name LIKE '%Caspian%GS05%'")
dabr::delete(conn,
             "DELETE FROM external_link WHERE external_ID_ENTITY = 1830 AND external_entity_name LIKE '%Caspian%GS05%'")
# Results: 1 record was deleted.

meta_neo_res <- seq_len(nrow(embsecbio_extra_repatriated_dates_info_entities)) %>%
  purrr::map(function(i) {
    embsecbio_extra_repatriated_dates_info_entities[i, ] %>%
      rpd:::add_records(conn = conn, table = "entity", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_NEO_DB <- dabr::select_all(conn, "entity") %>%
  dplyr::filter(ID_ENTITY %in% embsecbio_extra_repatriated_dates_info_entities$ID_ENTITY)
waldo::compare(embsecbio_extra_repatriated_dates_info_entities %>%
                 .[order(colnames(.))] %>%
                 magrittr::set_class(class(.)[-1]),
               EPD_NEO_DB %>%
                 .[order(colnames(.))] %>%
                 dplyr::select(-doi),
               tolerance = 1E-9)

embsecbio_extra_repatriated_dates_info_2 <- embsecbio_extra_repatriation_dates %>%
  dplyr::select(neotoma_ID_SITE = site_id,
                neotoma_site_name = site_name,
                neotoma_entity_name = site_name,
                # neotoma_entity_name = entity_name,
                ID_ENTITY = EMBSeCBIO_ID_ENTITY) %>%
  dplyr::inner_join(embsecbio_extra_repatriated_dates_info$metadata,
                    by = "ID_ENTITY") %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY,
                external_ID_SITE =  ID_SITE,
                external_site_name = site_name,
                external_entity_name = entity_name) %>%
  dplyr::filter(external_entity_name %>% stringr::str_detect("GS05", TRUE))
embsecbio_extra_repatriated_dates_info_3 <- embsecbio_extra_repatriated_dates_info_entities %>%
  dplyr::select(ID_SITE, ID_ENTITY, site_name, entity_name) %>%
  dplyr::right_join(embsecbio_extra_repatriated_dates_info_2 %>%
                      dplyr::select(1:6, 8),
                    by = c("entity_name" = "neotoma_entity_name"))

#### External links ----
meta_neo_res <- seq_len(nrow(embsecbio_extra_repatriated_dates_info_3)) %>%
  purrr::map(function(i) {
    embsecbio_extra_repatriated_dates_info_3[i, ] %>%
      dplyr::select(ID_SITE,
                    ID_ENTITY,
                    external_ID_SITE,
                    external_ID_ENTITY,
                    external_site_name,
                    external_entity_name) %>%
      dplyr::mutate(external_source = "EMBSECBIO") %>%
      rpd:::add_records(conn = conn, table = "external_link", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_NEO_DB <- dabr::select_all(conn, "external_link") %>%
  dplyr::filter(external_ID_ENTITY %in%
                  embsecbio_extra_repatriated_dates_info_3$external_ID_ENTITY,
                external_source == "EMBSECBIO") %>%
  dplyr::select(ID_SITE,
                ID_ENTITY,
                external_ID_SITE,
                external_ID_ENTITY,
                external_site_name,
                external_entity_name)
waldo::compare(embsecbio_extra_repatriated_dates_info_3 %>%
                 dplyr::select(ID_SITE,
                               ID_ENTITY,
                               external_ID_SITE,
                               external_ID_ENTITY,
                               external_site_name,
                               external_entity_name) %>%
                 dplyr::mutate(external_ID_SITE =
                                 as.integer(external_ID_SITE),
                               external_ID_ENTITY =
                                 as.integer(external_ID_ENTITY)) %>%
                 magrittr::set_class(class(.)[-1]),
               EPD_NEO_DB,
               tolerance = 1E-9)

#### Dates ----
embsecbio_extra_repatriated_dates_info_4 <-
  embsecbio_extra_repatriated_dates_info$date_info %>%
  dplyr::select(-ID_DATE_INFO) %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY,
                reason_age_not_used = date_comments) %>%
  dplyr::mutate(age_used = ifelse(is.na(reason_age_not_used), "yes", "no")) %>%
  dplyr::left_join(embsecbio_extra_repatriated_dates_info_3 %>%
                     dplyr::select(ID_ENTITY, external_ID_ENTITY)) %>%
  dplyr::select(-external_ID_ENTITY) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::filter(!is.na(ID_ENTITY))
meta_neo_res <- seq_len(nrow(embsecbio_extra_repatriated_dates_info_4)) %>%
  purrr::map(function(i) {
    embsecbio_extra_repatriated_dates_info_4[i, ] %>%
      rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_NEO_DB <- dabr::select_all(conn, "date_info") %>%
  dplyr::filter(ID_ENTITY %in%
                  embsecbio_extra_repatriated_dates_info_4$ID_ENTITY)
waldo::compare(embsecbio_extra_repatriated_dates_info_4 %>%
                 .[order(colnames(.))],
               EPD_NEO_DB %>%
                 .[order(colnames(.))] %>%
                 dplyr::select(-age_calib, -ID_DATE_INFO, -notes))

#### Samples ----
embsecbio_extra_repatriated_am_info_4 <- embsecbio_extra_repatriated_dates_info$age_model %>%
  dplyr::mutate(age = dplyr::coalesce(age, est_age_original),
                age_original = est_age_original) %>%
  dplyr::select(-est_age_original, -comment) %>%
  dplyr::left_join(embsecbio_extra_repatriated_dates_info_3 %>%
                     dplyr::select(ID_ENTITY, external_ID_ENTITY)) %>%
  # dplyr::filter(ID_ENTITY != 18) %>%
  dplyr::select(-ID_SAMPLE, -external_ID_ENTITY) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::filter(!is.na(ID_ENTITY))

meta_neo_res <- seq_len(nrow(embsecbio_extra_repatriated_am_info_4)) %>%
  purrr::map(function(i) {
    embsecbio_extra_repatriated_am_info_4[i, ] %>%
      dplyr::select(1:5) %>%
      rpd:::add_records(conn = conn, table = "sample", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_NEO_DB <- dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_ENTITY %in% embsecbio_extra_repatriated_am_info_4$ID_ENTITY)
waldo::compare(embsecbio_extra_repatriated_am_info_4 %>%
                 dplyr::select(1:5) %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               age = round(age)) %>%
                 magrittr::set_class(class(.)[-1]),
               EPD_NEO_DB %>%
                 dplyr::select(-age_older, -age_younger, -age_type, -ID_SAMPLE, -chronology_name, -thickness) %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               age = round(age)),
               tolerance = 1)

#### Age models ----
EPD_NEO_DB <- dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_ENTITY %in% embsecbio_extra_repatriated_am_info_4$ID_ENTITY)
embsecbio_extra_repatriated_am_info_5 <- embsecbio_extra_repatriated_am_info_4 %>%
  dplyr::mutate(ID_SAMPLE = EPD_NEO_DB$ID_SAMPLE, .before = 1) %>%
  # dplyr::filter(!(ID_ENTITY %in% c(1626, 1627))) %>% # Need further analysis
  dplyr::select(-c(2:6)) %>%
  dplyr::select(-age_original)
meta_neo_res <- seq_len(nrow(embsecbio_extra_repatriated_am_info_5)) %>%
  purrr::map(function(i) {
    embsecbio_extra_repatriated_am_info_5[i, ] %>%
      dplyr::mutate(ID_MODEL = 8, .before = 2) %>% # Bacon IntCal20
      rpd:::add_records(conn = conn, table = "age_model", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate ----
EPD_NEO_DB <- dabr::select_all(conn, "age_model") %>%
  dplyr::filter(ID_SAMPLE %in% embsecbio_extra_repatriated_am_info_5$ID_SAMPLE)
waldo::compare(embsecbio_extra_repatriated_am_info_5 %>%
                 dplyr::mutate(ID_MODEL = 8, .before = 2) %>%
                 .[order(colnames(.))] %>%
                 magrittr::set_class(class(.)[-1]),
               EPD_NEO_DB %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)

#### HERE (PENDING) ----
"The samples linked to the entities 1626 and 1627 might need to be deleted (2022-02-17)"
dabr::select_all(conn, "entity") %>%
  dplyr::filter(ID_ENTITY %in% c(1626, 1627))
dabr::select_all(conn, "external_link") %>%
  dplyr::filter(ID_ENTITY %in% c(1626, 1627))
dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_ENTITY %in% c(1626, 1627))
extract_embsecbio(c(1341, 1342))
dabr::select(conn, "SELECT * FROM age_model WHERE ID_SAMPLE BETWEEN 5813 AND 5870")

#### Counts ----
EPD_SAMPLES_NEO_DB <- dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_ENTITY %in% embsecbio_extra_repatriated_am_info_4$ID_ENTITY) %>%
  dplyr::select(ID_ENTITY,
                ID_SAMPLE,
                depth)

embsecbio_extra_pollen_counts <- embsecbio_extra_repatriated_dates_info$pollen %>%
  dplyr::select(-ID_SAMPLE) %>%
  dplyr::left_join(embsecbio_extra_repatriated_dates_info_3 %>%
                     dplyr::select(ID_ENTITY, entity_name, external_ID_ENTITY),
                   by = "external_ID_ENTITY") %>%
  dplyr::mutate(depth_rnd = round(depth, 3)) %>%
  dplyr::left_join(EPD_SAMPLES_NEO_DB %>%
                     dplyr::mutate(depth_rnd = round(depth, 3)),
                   by = c("ID_ENTITY", "depth_rnd"))
embsecbio_extra_pollen_counts %>% dplyr::filter(is.na(ID_SAMPLE))

aux <- conn %>%
  special.epd::snapshot(ID_ENTITY = embsecbio_extra_pollen_counts$ID_ENTITY)
aux$date_info$ID_ENTITY %>% unique() %>% length()
aux$sample$ID_ENTITY %>% unique() %>% length()
aux$age_model$ID_SAMPLE %>% unique() %>% length()
embsecbio_extra_pollen_counts$ID_SAMPLE %>% unique() %>% length()
tibble::tibble(
  ID_SAMPLE = embsecbio_extra_pollen_counts$ID_SAMPLE %>% unique(),
) %>%
  dplyr::anti_join(
    tibble::tibble(
      ID_SAMPLE = aux$sample$ID_SAMPLE %>% unique()
    )
  )

# special.epd::snapshot(ID_ENTITY = epd_repatriated_samples_2$ID_ENTITY)
existing_pollen_counts <-
  aux$pollen_count %>%
  purrr::map_df(~.x) %>% # Combine the three pollen count tables
  dplyr::select(ID_SAMPLE) %>%
  dplyr::group_by(ID_SAMPLE) %>%
  dplyr::summarise(n = dplyr::n())

existing_pollen_counts %>%
  dplyr::filter(n != 3) # Verify if sample does not have the 3 tables

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
# Check if all the EMBSeCBIO taxons are in the `taxon_name` table
new_taxon_names <-
  c(embsecbio_taxa_amalgamation$clean_name,
    embsecbio_taxa_amalgamation$intermediate,
    embsecbio_taxa_amalgamation$amalgamated) %>%
  unique() %>%
  sort() %>%
  tibble::tibble() %>%
  magrittr::set_names("taxon_name") %>%
  dplyr::left_join(taxon_name_tb) %>%
  dplyr::filter(is.na(ID_TAXON))
new_taxon_names %>%
  rpd:::add_records(conn = conn, table = "taxon_name", dry_run = TRUE)

# Remove taxons not in the `embsecbio_taxa_amalgamation` table
embsecbio_extra_pollen_counts_excluded <- embsecbio_extra_pollen_counts %>%
  dplyr::filter(!(taxon_clean %in% embsecbio_taxa_amalgamation$taxon_name))
embsecbio_extra_pollen_counts_2 <- embsecbio_extra_pollen_counts %>%
  dplyr::filter(taxon_clean %in% embsecbio_taxa_amalgamation$taxon_name)
##### Clean ----
embsecbio_extra_pollen_counts_3 <-
  sort(unique(embsecbio_extra_pollen_counts$ID_SAMPLE)) %>%
  purrr::map_df(function(i) {
    embsecbio_extra_pollen_counts_2 %>%
      dplyr::filter(ID_SAMPLE == i) %>%
      dplyr::select(ID_SAMPLE, taxon_name = taxon_clean, count = taxon_count) %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(embsecbio_taxa_amalgamation,
                       by = "taxon_name") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

embsecbio_extra_pollen_counts_3 %>%
  dplyr::filter(is.na(ID_TAXON))
dim(embsecbio_extra_pollen_counts_3)
embsecbio_extra_pollen_counts_3 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()

idx <- idx_pairs(nrow(embsecbio_extra_pollen_counts_3), 2000)
pb <- progress::progress_bar$new(total = nrow(idx))
meta_neo_res <-
  purrr::map2(idx$x,
              idx$y,
              ~ {
                pb$tick()
                embsecbio_extra_pollen_counts_3[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count", dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- dabr::select_all(conn, "pollen_count") %>%
  dplyr::filter(ID_SAMPLE %in% embsecbio_extra_pollen_counts_3$ID_SAMPLE,
                amalgamation_level == 0)
waldo::compare(embsecbio_extra_pollen_counts_3 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1e-9)
# 9822
##### Intermediate ----
embsecbio_taxa_amalgamation_stage2 <- embsecbio_taxa_amalgamation %>%
  dplyr::select(-taxon_name) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

embsecbio_extra_pollen_counts_4 <-
  sort(unique(embsecbio_extra_pollen_counts_3$ID_SAMPLE)) %>%
  purrr::map_df(function(i) {
    embsecbio_extra_pollen_counts_3 %>%
      dplyr::filter(ID_SAMPLE == i) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(embsecbio_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

embsecbio_extra_pollen_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON))

embsecbio_extra_pollen_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(embsecbio_extra_pollen_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                embsecbio_extra_pollen_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count", dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- dabr::select_all(conn, "pollen_count") %>%
  dplyr::filter(ID_SAMPLE %in% embsecbio_extra_pollen_counts_4$ID_SAMPLE,
                amalgamation_level == 1)
waldo::compare(embsecbio_extra_pollen_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1e-9)
# 9688

##### Amalgamated ----
embsecbio_taxa_amalgamation_stage3 <- embsecbio_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

embsecbio_extra_pollen_counts_5 <-
  unique(embsecbio_extra_pollen_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    embsecbio_extra_pollen_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(embsecbio_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

embsecbio_extra_pollen_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)|is.na(count))

embsecbio_extra_pollen_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(embsecbio_extra_pollen_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                embsecbio_extra_pollen_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count", dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
###### Validate -----
EPD_TAXA <- dabr::select_all(conn, "pollen_count") %>%
  dplyr::filter(ID_SAMPLE %in% embsecbio_extra_pollen_counts_5$ID_SAMPLE,
                amalgamation_level == 2)
waldo::compare(embsecbio_extra_pollen_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1e-9)
# 9312

embsecbio_extra_pollen_counts %>%
  dplyr::distinct(entity_name, .keep_all = TRUE)


## IBERIA ----
# data("IBERIA_pollen")
# data("IBERIA_pollen_dates")
iberia_repatriation <- epd_repatriation_tmp_file %>%
  readxl::read_excel(sheet = 4) %>%
  janitor::clean_names() %>%
  dplyr::mutate(site_in_iberia = to_bool(site_in_iberia),
                dates_to_be_extracted_from_iberia =
                  to_bool(dates_to_be_extracted_from_iberia),
                age_model_to_be_extracted_from_iberia =
                  to_bool(age_model_to_be_extracted_from_iberia)) %>%
  dplyr::rename(IBERIA_ID_ENTITY = iberia_id_entity,
                IBERIA_site_name = site_name_in_iberia,
                IBERIA_entity_name = entity_name_in_iberia) %>%
  dplyr::mutate(IBERIA_site_name = IBERIA_site_name %>%
                  stringr::str_replace_all("Eix", "Elx"),
                IBERIA_entity_name = IBERIA_entity_name %>%
                  stringr::str_replace_all("EIX", "ELX"))

# iberia_repatriation %>%
#   dplyr::arrange(IBERIA_site_name, IBERIA_entity_name) %>%
#   dplyr::distinct(IBERIA_site_name) %>%
#   dplyr::mutate(IBERIA_ID_SITE = seq_along(IBERIA_site_name))
  # dplyr::mutate(IBERIA_ID_ENTITY = seq_along(IBERIA_entity_name)) %>%

# IBERIA_pollen %>%
#   dplyr::select(1:5) %>%
#   dplyr::distinct() %>%
#   dplyr::filter(entity_name %in% iberia_repatriation_dates$IBERIA_entity_name)

### IBERIA dates (84) ----
iberia_repatriation_dates <- iberia_repatriation %>%
  dplyr::filter(dates_to_be_extracted_from_iberia)

iberia_repatriated_dates_info <-
  iberia_repatriation_dates$IBERIA_entity_name %>%
  stringr::str_replace_all("EIX", "ELX") %>%
  extract_iberia()

iberia_repatriated_dates_info_2 <- iberia_repatriation_dates %>%
  dplyr::select(neotoma_ID_SITE = site_id,
                neotoma_site_name = site_name,
                neotoma_entity_name = entity_name,
                entity_name = IBERIA_entity_name) %>%
  dplyr::inner_join(iberia_repatriated_dates_info$metadata,
                    by = "entity_name") %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY,
                external_ID_SITE =  ID_SITE,
                external_site_name = site_name,
                external_entity_name = entity_name)
iberia_repatriated_dates_info_3 <- EPD_METADATA %>%
  dplyr::select(1:4, 6, 10) %>%
  dplyr::right_join(iberia_repatriated_dates_info_2 %>%
                      dplyr::select(1:7),
                    by = c("entity_name" = "neotoma_entity_name"))

#### External links ----
dabr::select(conn,
             "SELECT * FROM external_link WHERE ID_ENTITY = 387 AND",
             "external_source = 'IBERIA'")
dabr::update(conn,
             "UPDATE external_link SET external_site_name = 'Elx',",
             "external_entity_name = 'ELX' WHERE",
             "ID_SITE = 351 AND ID_ENTITY = 387 AND external_source = 'IBERIA'")
# Results: 1 record was updated.

meta_neo_res <- seq_len(nrow(iberia_repatriated_dates_info_3)) %>%
  purrr::map(function(i) {
    iberia_repatriated_dates_info_3[i, ] %>%
      dplyr::select(ID_SITE,
                    ID_ENTITY,
                    external_ID_SITE,
                    external_ID_ENTITY,
                    external_site_name,
                    external_entity_name) %>%
      dplyr::mutate(external_source = "IBERIA") %>%
      rpd:::add_records(conn = conn, table = "external_link", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_NEO_DB <- dabr::select_all(conn, "external_link") %>%
  dplyr::filter(external_ID_ENTITY %in%
                  iberia_repatriated_dates_info_3$external_ID_ENTITY,
                external_source == "IBERIA") %>%
  dplyr::select(ID_SITE,
                ID_ENTITY,
                external_ID_SITE,
                external_ID_ENTITY,
                external_site_name,
                external_entity_name)
waldo::compare(iberia_repatriated_dates_info_3 %>%
                 dplyr::select(ID_SITE,
                               ID_ENTITY,
                               external_ID_SITE,
                               external_ID_ENTITY,
                               external_site_name,
                               external_entity_name) %>%
                 dplyr::mutate(external_ID_SITE = as.integer(external_ID_SITE),
                               external_ID_ENTITY = as.integer(external_ID_ENTITY)),
               EPD_NEO_DB,
               tolerance = 1E-9)

#### Dates ----
# Pull existing ID_DATE_INFO
EPD_DATES_NEO_DB <- dabr::select_all(conn, "date_info") %>%
  dplyr::filter(ID_ENTITY %in% iberia_repatriated_dates_info_4$ID_ENTITY)
unique(EPD_DATES_NEO_DB$ID_DATE_INFO)
unique(EPD_DATES_NEO_DB$ID_ENTITY)
# readr::read_csv("inst/extdata/epd-special-iberia-v2-dates_2022-03-16.csv")
# EPD_DATES_NEO_DB %>%
#   readr::write_excel_csv("inst/extdata/epd-special-iberia-v2-dates_2022-03-16.csv")

# dabr::select(conn,
#              "SELECT * FROM date_info WHERE ID_ENTITY IN (",
#              paste0(EPD_DATES_NEO_DB$ID_ENTITY, collapse = ", "),
#              ")")
# dabr::delete(conn,
#              "DELETE FROM date_info WHERE ID_ENTITY IN (",
#              paste0(EPD_DATES_NEO_DB$ID_ENTITY, collapse = ", "),
#              ")")
# Results: 618 records were deleted.

iberia_repatriated_dates_info_4 <-
  iberia_repatriated_dates_info$date_info %>%
  dplyr::select(-ID_SITE) %>%
  dplyr::mutate(
    lab_num = dplyr::case_when(
      stringr::str_detect(lab_num, "NULL") ~ NA_character_,
      TRUE ~ lab_num
    ),
    material_dated = dplyr::case_when(
      stringr::str_detect(material_dated, "NULL") ~ NA_character_,
      TRUE ~ material_dated
    )) %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY,
                reason_age_not_used = notes,
                age_calib = age_cal) %>%
  dplyr::mutate(age_used = ifelse(is.na(reason_age_not_used), "yes", "no")) %>%
  dplyr::left_join(iberia_repatriated_dates_info_3 %>%
                     dplyr::select(ID_ENTITY, external_ID_ENTITY)) %>%
  dplyr::select(-external_ID_ENTITY, -entity_name) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::mutate(ID_DATE_INFO = 9938 + seq_along(ID_ENTITY)) # get_id_date_info(conn)
  # dplyr::mutate(ID_DATE_INFO = c(657:1010, 1471, 1472, 1011:1272))
  # dplyr::mutate(ID_DATE_INFO = c(657:1272, 1471, 1472))
# IDs 1471 and 1472 are linked to Lake Saloio (SALOIO, 828)

iberia_repatriated_dates_info_4 %>%
  dplyr::filter(is.na(ID_ENTITY) | is.na(ID_DATE_INFO))
# Find gaps in the table (i.e. non-continuous ID_DATE_INFO)
aa <- dabr::select_all(conn, "date_info")
tibble::tibble(ID_DATE_INFO = seq_len(max(aa$ID_ENTITY))) %>%
  dplyr::left_join(aa) %>%
  dplyr::filter(is.na(ID_ENTITY)) %>%
  .$ID_DATE_INFO

meta_neo_res <- seq_len(nrow(iberia_repatriated_dates_info_4)) %>%
  purrr::map(function(i) {
    iberia_repatriated_dates_info_4[i, ] %>%
      rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_DATES_NEO_DB <- dabr::select_all(conn, "date_info") %>%
  dplyr::filter(ID_ENTITY %in% iberia_repatriated_dates_info_4$ID_ENTITY)
waldo::compare(iberia_repatriated_dates_info_4 %>%
                 .[order(colnames(.))] %>%
                 dplyr::arrange(ID_ENTITY, depth),
               EPD_DATES_NEO_DB %>%
                 .[order(colnames(.))] %>%
                 dplyr::select(-notes) %>%
                 dplyr::arrange(ID_ENTITY, depth),
               tolerance = 1E-9,
               max_diffs = Inf)

### IBERIA age models (69) ----
iberia_repatriation_am <- iberia_repatriation %>%
  dplyr::filter(age_model_to_be_extracted_from_iberia)

iberia_repatriated_am_info <-
  iberia_repatriation_am$IBERIA_entity_name %>%
  stringr::str_replace_all("EIX", "ELX") %>%
  extract_iberia()

iberia_repatriated_am_info_2 <- iberia_repatriation_am %>%
  dplyr::select(neotoma_ID_SITE = site_id,
                neotoma_site_name = site_name,
                neotoma_entity_name = entity_name,
                entity_name = IBERIA_entity_name) %>%
  # ID_ENTITY = IBERIA_ID_ENTITY) %>%
  dplyr::left_join(iberia_repatriated_am_info$metadata,
                    by = "entity_name") %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY,
                external_ID_SITE =  ID_SITE,
                external_site_name = site_name,
                external_entity_name = entity_name)

iberia_repatriated_am_info_3 <- EPD_METADATA %>%
  dplyr::select(1:4, 6, 10) %>%
  dplyr::right_join(iberia_repatriated_am_info_2 %>%
                      dplyr::select(1:7),
                    by = c("entity_name" = "neotoma_entity_name"))

#### Samples ----
iberia_repatriated_am_info_EPD_COUNTS <- EPD_COUNTS %>%
  dplyr::filter(dataset_id %in% iberia_repatriated_am_info_3$dataset_id) %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-chronology_id) %>%
  dplyr::left_join(EPD_METADATA %>%
                     dplyr::select(1:6, 10),
                   by = c("site_id", "site_name", "site_name_clean", "dataset_id", "entity_name")) %>%
  dplyr::relocate(ID_SITE, ID_ENTITY, .before = 1)

iberia_repatriated_am_info_4 <- iberia_repatriated_am_info$age_model %>%
  dplyr::select(-ID_SITE) %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY) %>%
  dplyr::left_join(iberia_repatriated_am_info_3 %>%
                     dplyr::select(ID_ENTITY, external_ID_ENTITY)) %>%
  dplyr::select(-external_ID_ENTITY) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::select(-site_name, -entity_name, -latitude, -longitude, -elevation, -source, -publication)

# Extract thickness from the EPD data
iberia_repatriated_am_info_5 <- iberia_repatriated_am_info_4 %>%
  dplyr::mutate(depth2 = round(depth, 3)) %>%
  dplyr::left_join(iberia_repatriated_am_info_EPD_COUNTS %>%
                     dplyr::select(ID_ENTITY, depth, thickness, chronology_name,
                                   age_EPD = age, age_younger, age_older, age_type) %>%
                     dplyr::mutate(depth = round(depth, 3)),
                   by = c("ID_ENTITY",  "depth2" = "depth")) %>%
  dplyr::select(-depth2) %>%
  dplyr::relocate(thickness,
                  # age_original,
                  age_EPD,
                  chronology_name,
                  age_younger,
                  age_older,
                  age_type,
                  .before = mean) %>%
  # dplyr::filter(age != age_EPD)
  # dplyr::group_by(ID_ENTITY) %>%
  dplyr::mutate(age = ifelse(is.na(age), round(age_EPD), age)) %>%
  dplyr::mutate(lower = round(age_EPD) * 0.99,
                upper = round(age_EPD) * 1.01,
                same_age = (round(age) >= lower & round(age) <= upper) | (is.na(age) & is.na(age_EPD)),
                chronology_name = ifelse(same_age, chronology_name, NA),
                age_younger = ifelse(same_age, age_younger, NA),
                age_older = ifelse(same_age, age_older, NA),
                age_type = ifelse(same_age, age_type, NA)) %>%
  dplyr::select(-age_EPD, -lower, -upper, -same_age) %>%
  dplyr::mutate(ID_SAMPLE = c(6233L:7041L, 7144L:8130L, 8230L:9824L, 9857L:11040L, 12409L:12432L),
                .before = 1)
  # dplyr::mutate(ID_SAMPLE = c(6233:7041, 7144:7334, 7335:8130, 8230:8433, 8434:9433, 9434:9824, 9857:10465, 10466:11040, 12409:12432),
  #                 # 6232 + seq_along(ID_ENTITY), # get_id_sample(conn)
  #               .before = 1)

# Dump old samples:
# dabr::select(conn,
#              "SELECT * FROM sample WHERE ID_SAMPLE IN (",
#              paste0(c(6233L:7041L, 7144L:8130L, 8230L:9824L, 9857L:11040L, 12409L:12432L),
#                     collapse = ", "),
#              ")") %>%
#   readr::write_excel_csv("inst/extdata/epd-special-iberia-v2-samples_2022-03-16.csv")

# dabr::delete(conn,
#              "DELETE FROM sample WHERE ID_SAMPLE IN (",
#              paste0(c(6233L:7041L, 7144L:8130L, 8230L:9824L, 9857L:11040L, 12409L:12432L),
#                     collapse = ", "),
#              ")")
# # Results: 4599 records were deleted.
dabr::select(conn,
             "SELECT * FROM sample WHERE ID_SAMPLE IN (",
             paste0(c(6233L:7041L, 7144L:8130L, 8230L:9824L, 9857L:11040L, 12409L:12432L),
                    collapse = ", "),
             ")")

# Verify if the EPD has more samples than the IBERIA
tibble::tibble(
  ID_ENTITY = names(table(iberia_repatriated_am_info_EPD_COUNTS$ID_ENTITY)),
  EPD = table(iberia_repatriated_am_info_EPD_COUNTS$ID_ENTITY),
  EMB = table(iberia_repatriated_am_info_4$ID_ENTITY)
) %>%
  dplyr::filter(EPD > EMB)

# Verify if the "new" samples exist in the DB
dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_SAMPLE %in% iberia_repatriated_am_info_5$ID_SAMPLE)
dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_ENTITY %in% iberia_repatriated_am_info_5$ID_ENTITY)

meta_neo_res <- seq_len(nrow(iberia_repatriated_am_info_5)) %>%
  purrr::map(function(i) {
    iberia_repatriated_am_info_5[i, ] %>%
      dplyr::select(1:9) %>%
      rpd:::add_records(conn = conn, table = "sample", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_DATES_NEO_DB <- dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_ENTITY %in% iberia_repatriated_am_info_5$ID_ENTITY)
waldo::compare(iberia_repatriated_am_info_5 %>%
                 dplyr::select(1:9) %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               age = round(age)) %>%
                 dplyr::arrange(ID_ENTITY, depth),
               EPD_DATES_NEO_DB %>%
                 dplyr::select(-count_type, -sample_type) %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               age = round(age)) %>%
                 dplyr::arrange(ID_ENTITY, depth),
               tolerance = 1E-9,
               max_diffs = Inf)

# sort(unique(EPD_DATES_NEO_DB$ID_ENTITY))[36:38]
# EPD_DATES_NEO_DB %>%
#   dplyr::filter(ID_ENTITY == 827) %>% .$ID_SAMPLE %>% range()
# iberia_repatriated_am_info_5 %>%
#   dplyr::filter(ID_ENTITY == 827) %>% .$ID_SAMPLE %>% range()
# aux <- sort(unique(EPD_DATES_NEO_DB$ID_ENTITY)) %>%
#   purrr::map(function(i) {
#     db <- EPD_DATES_NEO_DB %>%
#       dplyr::filter(ID_ENTITY == i) %>%
#       dplyr::select(-count_type, -sample_type) %>%
#       .[order(colnames(.))]
#     ss <- iberia_repatriated_am_info_5 %>%
#       dplyr::filter(ID_ENTITY == i) %>%
#       dplyr::select(1:9) %>%
#       .[order(colnames(.))]
#     waldo::compare(ss, db, x_arg = "SS", y_arg = "DB", tolerance = 1E-9)
#   })

#### Age models ----
meta_neo_res <- seq_len(nrow(iberia_repatriated_am_info_5)) %>%
  purrr::map(function(i) {
    iberia_repatriated_am_info_5[i, ] %>%
      dplyr::select(1, 10:15) %>%
      dplyr::mutate(ID_MODEL = 8, .before = 2) %>% # Bacon IntCal20
      rpd:::add_records(conn = conn, table = "age_model", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate ----
EPD_DATES_NEO_DB <- dabr::select_all(conn, "age_model") %>%
  dplyr::filter(ID_SAMPLE %in% iberia_repatriated_am_info_5$ID_SAMPLE)
waldo::compare(iberia_repatriated_am_info_5 %>%
                 dplyr::select(1, 10:15) %>%
                 dplyr::mutate(ID_MODEL = 8, .before = 2) %>%
                 .[order(colnames(.))],
               EPD_DATES_NEO_DB %>%
                 .[order(colnames(.))], tolerance = 2)

#### HERE: COUNTS ----

### non-IBERIA age models (15) ----
# NOTE: even though these entities were marked as "non-IBERIAN", the latest
# version of the Iberian subset contains all the necessary updates.
non_iberia_repatriation_am <- iberia_repatriation %>%
  dplyr::filter(!age_model_to_be_extracted_from_iberia)

non_iberia_repatriated_am_info <-
  non_iberia_repatriation_am$IBERIA_entity_name %>%
  extract_iberia()

non_iberia_repatriated_am_info_2 <- non_iberia_repatriation_am %>%
  dplyr::select(neotoma_ID_SITE = site_id,
                neotoma_site_name = site_name,
                neotoma_entity_name = entity_name,
                entity_name = IBERIA_entity_name) %>%
  dplyr::left_join(non_iberia_repatriated_am_info$metadata,
                   by = "entity_name") %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY,
                external_ID_SITE =  ID_SITE,
                external_site_name = site_name,
                external_entity_name = entity_name)

non_iberia_repatriated_am_info_3 <- EPD_METADATA %>%
  dplyr::select(1:4, 6, 10) %>%
  dplyr::right_join(non_iberia_repatriated_am_info_2 %>%
                      dplyr::select(1:7),
                    by = c("entity_name" = "neotoma_entity_name"))

non_iberia_repatriated_am_info_EPD_COUNTS <- EPD_COUNTS %>%
  dplyr::filter(entity_name %in%
                  non_iberia_repatriated_am_info_3$entity_name) %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-chronology_id) %>%
  dplyr::left_join(EPD_METADATA %>%
                     dplyr::select(1:3, 6, 10),
                   by = c("site_id", "dataset_id", "entity_name")) %>%
  dplyr::relocate(ID_SITE, ID_ENTITY, .before = 1)

# non_iberia_repatriated_am_info_2 <- non_iberia_repatriated_am_info_EPD_COUNTS %>%
#   dplyr::mutate(ID_SAMPLE = 28481 + seq_along(ID_ENTITY), # get_id_sample(conn)
#                 .after = ID_ENTITY) %>%
#   dplyr::select(-ID_SITE, -site_id, -site_name, -site_name_clean, -dataset_id, -dataset_name, -entity_name, -sample_id, -unit_name)
#
# # Check if the "new" records are already in th DB:
# special.epd::snapshot(conn, ID_ENTITY = non_iberia_repatriated_am_info_2$ID_ENTITY)
# dabr::select_all(conn, "sample") %>%
#   dplyr::filter(ID_SAMPLE %in% non_iberia_repatriated_am_info_2$ID_SAMPLE |
#                   ID_ENTITY %in% non_iberia_repatriated_am_info_2$ID_ENTITY)


#### Samples ----
# non_iberia_repatriated_am_info_EPD_COUNTS <- EPD_COUNTS %>%
#   dplyr::filter(dataset_id %in%
#                   non_iberia_repatriated_am_info$external_ID_ENTITY) %>%
#   smpds::rm_na_taxa(1:16) %>%
#   dplyr::select(-chronology_id) %>%
#   dplyr::left_join(EPD_METADATA %>%
#                      dplyr::select(1:3, 6, 10),
#                    by = c("site_id", "dataset_id", "entity_name")) %>%
#   dplyr::relocate(ID_SITE, ID_ENTITY, .before = 1) #%>%
# # dplyr::mutate(depth = ifelse(ID_ENTITY == 365 & depth == 0, 0.5, depth)) # Update depth GDU: 0 -> 0.5
# non_iberia_repatriated_am_info_EPD_COUNTS


non_iberia_repatriated_am_info_4 <- non_iberia_repatriated_am_info$age_model %>%
  dplyr::select(-ID_SITE) %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY) %>%
  dplyr::select(-entity_name) %>%
  dplyr::left_join(non_iberia_repatriated_am_info_3 %>%
                     dplyr::select(ID_ENTITY, external_ID_ENTITY, entity_name),
                   by = "external_ID_ENTITY") %>%
  dplyr::select(-external_ID_ENTITY) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::select(-site_name, #-entity_name,
                -latitude, -longitude, -elevation, -source, -publication)

dim(non_iberia_repatriated_am_info_EPD_COUNTS)
dim(non_iberia_repatriated_am_info_4)

# a <- non_iberia_repatriated_am_info_EPD_COUNTS %>%
#   dplyr::group_by(entity_name) %>%
#   dplyr::select(1:16) %>%
#   dplyr::mutate(n_epd = length(ID_ENTITY),
#                 depths_epd = tibble::tibble(source = "EPD",
#                                             depth) %>% list()) %>%
#   dplyr::distinct(entity_name, n_epd, depths_epd)
#
# b <- non_iberia_repatriated_am_info_4 %>%
#   dplyr::group_by(entity_name) %>%
#   dplyr::select(1:10) %>%
#   dplyr::mutate(n_iberia = length(ID_ENTITY),
#                 depths_iberia = tibble::tibble(source = "IBERIA",
#                                                depth) %>% list()) %>%
#   dplyr::distinct(entity_name, n_iberia, depths_iberia)
#
# dplyr::full_join(a, b) %>%
#   dplyr::arrange(entity_name)
# dplyr::full_join(a, b) %>%
#   dplyr::arrange(entity_name) %>%
#   purrr::pmap_df(function(entity_name, depths_epd, depths_iberia, ...) {
#     dplyr::full_join(depths_epd, depths_iberia, by = "depth") %>%
#       dplyr::filter(is.na(source.x) | is.na(source.y)) %>%
#       dplyr::mutate(entity_name, .before = 1)
#   })

# Extract thickness from the EPD data
non_iberia_repatriated_am_info_5 <- non_iberia_repatriated_am_info_4 %>%
  dplyr::mutate(depth2 = round(depth, 3)) %>%
  dplyr::left_join(non_iberia_repatriated_am_info_EPD_COUNTS %>%
                     dplyr::select(ID_ENTITY, depth, thickness, chronology_name,
                                   age_EPD = age, age_younger, age_older, age_type) %>%
                     dplyr::mutate(depth = round(depth, 3)),
                   by = c("ID_ENTITY",  "depth2" = "depth")) %>%
  dplyr::select(-depth2) %>%
  dplyr::relocate(thickness,
                  # age_original,
                  age_EPD,
                  chronology_name,
                  age_younger,
                  age_older,
                  age_type,
                  .before = mean) %>%
  dplyr::mutate(age = ifelse(is.na(age), round(age_EPD), age)) %>%
  dplyr::mutate(lower = round(age_EPD) * 0.99,
                upper = round(age_EPD) * 1.01,
                same_age = (round(age) >= lower & round(age) <= upper) | (is.na(age) & is.na(age_EPD)),
                chronology_name = ifelse(same_age, chronology_name, NA),
                age_younger = ifelse(same_age, age_younger, NA),
                age_older = ifelse(same_age, age_older, NA),
                age_type = ifelse(same_age, age_type, NA)) %>%
  dplyr::select(-age_EPD, -lower, -upper, -same_age)


# Find existing samples
existing_samples <- dabr::select(conn,
                                 "SELECT * FROM sample WHERE ID_ENTITY IN (",
                                 paste0(non_iberia_repatriated_am_info_5$ID_ENTITY,
                                        collapse = ", "),
                                 ")")

samples_already_in_the_db <- non_iberia_repatriated_am_info_5 %>%
  dplyr::left_join(existing_samples %>%
                     dplyr::select(ID_ENTITY, ID_SAMPLE, depth)) %>%
  dplyr::filter(!is.na(ID_SAMPLE)) %>% #.$ID_ENTITY %>% unique()
  dplyr::relocate(ID_SAMPLE, .after = 1)

waldo::compare(existing_samples %>%
                 .[order(colnames(.))] %>%
                 dplyr::select(-count_type, -sample_type),
               samples_already_in_the_db[1:9] %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)

non_iberia_repatriated_am_info_6 <- non_iberia_repatriated_am_info_5 %>%
  dplyr::left_join(existing_samples %>%
                     dplyr::select(ID_ENTITY, ID_SAMPLE, depth)) %>%
  dplyr::filter(is.na(ID_SAMPLE)) %>% #.$ID_ENTITY %>% unique()
  dplyr::relocate(ID_SAMPLE, .after = 1) %>%
  dplyr::mutate(ID_SAMPLE = 101360 + seq_along(ID_ENTITY)) # get_id_sample(conn)

# Check if any of the samples exists
dabr::select(conn,
             "SELECT * FROM sample WHERE ID_SAMPLE IN (",
             paste0(non_iberia_repatriated_am_info_6$ID_SAMPLE,
                    collapse = ", "),
             ")")

meta_neo_res <- seq_len(nrow(non_iberia_repatriated_am_info_6)) %>%
  purrr::map(function(i) {
    non_iberia_repatriated_am_info_6[i, ] %>%
      dplyr::select(1:9) %>%
      rpd:::add_records(conn = conn, table = "sample", dry_run = TRUE)
  })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_DATES_NEO_DB <- dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_ENTITY %in% non_iberia_repatriated_am_info_6$ID_ENTITY)
waldo::compare(non_iberia_repatriated_am_info_6 %>%
                 dplyr::select(1:9) %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               age = round(age)),
               EPD_DATES_NEO_DB %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               age = round(age)) %>%
                 dplyr::select(-count_type, -sample_type),
               tolerance = 1E-9)

#### Age models (15) ----
# non_iberia_repatriated_am_new <-
#   find_age_models(path, non_iberia_repatriated_am_info$metadata$entity_name)
#
# # Check if the AM are ready to be uploaded
# non_iberia_repatriated_am_new %>%
#   dplyr::filter(entity_name %in% epd_age_models_ready_to_upload$entity_name)
#
# # Check if the age models already exist in the DB
# special.epd::snapshot(conn, entity_name = non_iberia_repatriated_am_new$entity_name)
#
# non_iberia_repatriated_am_new2 <- non_iberia_repatriated_am_new %>%
#   purrr::pmap_df(upload_age_model, conn = conn)
# # Check if all the age models were uploaded
# all(non_iberia_repatriated_am_new2$status)
# non_iberia_repatriated_am_new2
# non_iberia_repatriated_am_new3 <-
#   non_iberia_repatriated_am_new2$am %>%
#   purrr::map_df(~.x) %>%
#   magrittr::set_names(colnames(.) %>% stringr::str_to_upper())
samples_already_in_the_db
non_iberia_repatriated_am_info_6
# Delete old age models
dabr::select(conn,
             "SELECT * FROM age_model WHERE ID_SAMPLE IN (",
             paste0(samples_already_in_the_db$ID_SAMPLE, collapse = ", "),
             ") AND ID_MODEL = 8")
# dabr::delete(conn,
#              "DELETE FROM age_model WHERE ID_SAMPLE IN (",
#              paste0(samples_already_in_the_db$ID_SAMPLE, collapse = ", "),
#              ") AND ID_MODEL = 8")
# special.epd::snapshot(conn,
#                       entity_name = non_iberia_repatriated_am_new$entity_name)

samples_already_in_the_db %>%
  dplyr::select(ID_SAMPLE, mean, median, dplyr::starts_with("UNCERT")) %>%
  dplyr::mutate(ID_MODEL = 8, .before = 2) %>%
  rpd:::add_records(conn = conn, table = "age_model", dry_run = TRUE)
non_iberia_repatriated_am_info_6 %>%
  dplyr::select(ID_SAMPLE, mean, median, dplyr::starts_with("UNCERT")) %>%
  dplyr::mutate(ID_MODEL = 8, .before = 2) %>%
  rpd:::add_records(conn = conn, table = "age_model", dry_run = TRUE)
##### Validate ----
EPD_NEO_DB <- dabr::select_all(conn, "age_model") %>%
  dplyr::filter(ID_SAMPLE %in% samples_already_in_the_db$ID_SAMPLE)
waldo::compare(samples_already_in_the_db %>%
                 dplyr::select(ID_SAMPLE, mean, median,
                               dplyr::starts_with("UNCERT")) %>%
                 dplyr::mutate(ID_MODEL = 8, .before = 2) %>%
                 .[order(colnames(.))],
               EPD_NEO_DB %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)

EPD_NEO_DB <- dabr::select_all(conn, "age_model") %>%
  dplyr::filter(ID_SAMPLE %in% non_iberia_repatriated_am_info_6$ID_SAMPLE)
waldo::compare(non_iberia_repatriated_am_info_6 %>%
                 dplyr::select(ID_SAMPLE, mean, median,
                               dplyr::starts_with("UNCERT")) %>%
                 dplyr::mutate(ID_MODEL = 8, .before = 2) %>%
                 .[order(colnames(.))],
               EPD_NEO_DB %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)

## IBERIA - COUNTS ----
special.epd::snapshot(conn,
                      entity_name = iberia_extra_repatriation_dates$entity_name)
data("IBERIA_counts_v3")
IBERIA_counts_v3

iberia_repatriated_samples <- conn %>%
  dabr::select("SELECT ID_ENTITY, ID_SAMPLE, depth FROM sample WHERE",
               "ID_ENTITY IN (",
               paste0(unique(IBERIA_counts_v3$ID_ENTITY), collapse = ","),
               ")")

iberia_repatriated_counts <-
  iberia_repatriated_samples %>%
  dplyr::select(ID_ENTITY, ID_SAMPLE, depth) %>%
  dplyr::mutate(depth_rnd = round(depth, 3)) %>%
  dplyr::right_join(IBERIA_counts_v3 %>%
                      dplyr::mutate(depth_rnd = round(depth, 3)),
                    by = c("ID_ENTITY", "depth_rnd"))
dim(iberia_repatriated_counts)
dim(IBERIA_counts_v3)

# NOTE: there are some entities with hiatuses, so the samples were not included
## in the database
iberia_repatriated_counts_not_used <- iberia_repatriated_counts %>%
  dplyr::filter(is.na(ID_SAMPLE)) %>%#,
                # entity_name %in% c("Navamuno_S3",
                #                    "PERAFITA",
                #                    "PozoN_2015 core"))
  dplyr::select(-taxon_name, -clean, -intermediate, -amalgamated) %>%
  dplyr::distinct(depth_rnd, .keep_all = TRUE)
iberia_repatriated_counts_not_used %>%
  View()

iberia_repatriated_counts_not_used %>%
  dplyr::distinct(ID_ENTITY, depth_rnd, .keep_all = TRUE)
iberia_repatriated_counts_2 <- iberia_repatriated_counts %>%
  dplyr::filter(!is.na(ID_SAMPLE)) #%>%
# dplyr::filter(entity_name %>% stringr::str_detect("LdlMo", negate = FALSE))
iberia_repatriated_counts_2 %>%
  dplyr::filter(is.na(ID_ENTITY) | is.na(ID_SAMPLE)) %>%
  dplyr::distinct(ID_ENTITY, depth_rnd, .keep_all = TRUE)

aux <- conn %>%
  special.epd::snapshot(ID_ENTITY = iberia_repatriated_counts$ID_ENTITY)
aux$date_info$ID_ENTITY %>% unique() %>% length()
aux$sample$ID_ENTITY %>% unique() %>% length()
aux$age_model$ID_SAMPLE %>% unique() %>% length()
iberia_repatriated_counts_2$ID_SAMPLE %>% unique() %>% length()

# Samples that don't have pollen counts
aux$sample %>%
  dplyr::select(ID_ENTITY, ID_SAMPLE, depth) %>%
  dplyr::anti_join(
    tibble::tibble(
      ID_SAMPLE = iberia_repatriated_counts_2$ID_SAMPLE %>% unique(),
    )
  ) %>%
  dplyr::left_join(
    iberia_repatriated_counts %>%
      dplyr::select(ID_ENTITY, site_name, entity_name) %>%
      dplyr::distinct(ID_ENTITY, .keep_all = TRUE)
  ) %>%
  View()

# special.epd::snapshot(ID_ENTITY = epd_repatriated_samples_2$ID_ENTITY)
existing_pollen_counts <-
  aux$pollen_count %>%
  purrr::map_df(~.x) %>% # Combine the three pollen count tables
  dplyr::select(ID_SAMPLE) %>%
  dplyr::group_by(ID_SAMPLE) %>%
  dplyr::summarise(n = dplyr::n())

existing_pollen_counts %>%
  dplyr::filter(n != 3) # Verify if sample does not have the 3 tables

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
# Check if all the IBERIA extra taxons are in the `taxon_name` table
new_taxon_names <-
  c(iberia_repatriated_counts_2$clean,
    iberia_repatriated_counts_2$intermediate,
    iberia_repatriated_counts_2$amalgamated) %>%
  unique() %>%
  sort() %>%
  tibble::tibble() %>%
  magrittr::set_names("taxon_name") %>%
  dplyr::left_join(taxon_name_tb) %>%
  dplyr::filter(is.na(ID_TAXON))
new_taxon_names %>%
  rpd:::add_records(conn = conn, table = "taxon_name", dry_run = TRUE)
# Results: 15 records were inserted.

##### Clean ----
iberia_repatriated_counts_3 <-
  sort(unique(iberia_repatriated_counts_2$ID_SAMPLE)) %>%
  purrr::map_df(function(i) {
    iberia_repatriated_counts_2 %>%
      dplyr::filter(ID_SAMPLE == i) %>%
      dplyr::select(ID_SAMPLE, taxon_name = clean, count) %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

iberia_repatriated_counts_3 %>%
  dplyr::filter(is.na(ID_TAXON))

iberia_repatriated_counts_3 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()

idx <- idx_pairs(nrow(iberia_repatriated_counts_3), 2000)
pb <- progress::progress_bar$new(total = nrow(idx))
meta_neo_res <-
  purrr::map2(idx$x,
              idx$y,
              ~ {
                pb$tick()
                iberia_repatriated_counts_3[.x:.y, ] %>%
                  rpd:::add_records(conn = conn,
                                    table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(iberia_repatriated_counts_3$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(iberia_repatriated_counts_3 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 465925

##### Intermediate ----
iberia_repatriated_counts_4 <-
  sort(unique(iberia_repatriated_counts_2$ID_SAMPLE)) %>%
  purrr::map_df(function(i) {
    iberia_repatriated_counts_2 %>%
      dplyr::filter(ID_SAMPLE == i) %>%
      dplyr::select(ID_SAMPLE, taxon_name = intermediate, count) %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 1, .before = count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

iberia_repatriated_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON))

iberia_repatriated_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(iberia_repatriated_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                iberia_repatriated_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn,
                                    table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(iberia_repatriated_counts_4$ID_SAMPLE),
                      collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(iberia_repatriated_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 452309

##### Amalgamated ----
iberia_repatriated_counts_5 <-
  unique(iberia_repatriated_counts_2$ID_SAMPLE) %>%
  purrr::map_df(function(i) {
    iberia_repatriated_counts_2 %>%
      dplyr::filter(ID_SAMPLE == i) %>%
      dplyr::select(ID_SAMPLE, taxon_name = amalgamated, count) %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 2, .before = count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

iberia_repatriated_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)|is.na(count))

iberia_repatriated_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(iberia_repatriated_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                iberia_repatriated_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn,
                                    table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(iberia_repatriated_counts_5$ID_SAMPLE),
                      collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(iberia_repatriated_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 380529

iberia_repatriated_counts_2 %>%
  dplyr::distinct(entity_name, .keep_all = TRUE)

## IBERIA (extra) ----
iberia_extra_repatriation <- epd_repatriation_tmp_file %>%
  readxl::read_excel(sheet = 5) %>%
  janitor::clean_names() %>%
  dplyr::mutate(site_in_iberia = to_bool(site_in_iberia),
                dates_to_be_extracted_from_iberia =
                  to_bool(dates_to_be_extracted_from_iberia),
                age_model_to_be_extracted_from_iberia =
                  to_bool(age_model_to_be_extracted_from_iberia)) %>%
  dplyr::rename(entity_name = entity_name_7) %>%
  dplyr::select(-site_id, -iberia_id_entity, -entity_name_12)

### IBERIA dates (38) ----
iberia_extra_repatriation_dates <- iberia_extra_repatriation %>%
  dplyr::filter(dates_to_be_extracted_from_iberia)

iberia_extra_repatriated_dates_info <-
  iberia_extra_repatriation_dates$entity_name %>%
  extract_iberia()

#### Entities ----
iberia_extra_repatriated_dates_info_entities <-
  iberia_extra_repatriated_dates_info$metadata %>%
  dplyr::arrange(site_name, entity_name) %>%
  # dplyr::mutate(ID_SITE = seq_along(ID_SITE) + get_id_site(conn),
  #               ID_ENTITY = seq_along(ID_ENTITY) + get_id_entity(conn))
  dplyr::mutate(ID_SITE = c(1454:1476, 1478:1492),
                ID_ENTITY = c(1628:1650, 1652:1666))

##### Verify if the "new" entities exist in the DB
dabr::select_all(conn, "entity") %>%
  dplyr::filter(ID_SITE %in%
                  iberia_extra_repatriated_dates_info_entities$ID_SITE)
dabr::select_all(conn, "entity") %>%
  dplyr::filter(ID_ENTITY %in%
                  iberia_extra_repatriated_dates_info_entities$ID_ENTITY)

meta_neo_res <- seq_len(nrow(iberia_extra_repatriated_dates_info_entities)) %>%
  purrr::map(function(i) {
    iberia_extra_repatriated_dates_info_entities[i, ] %>%
      rpd:::add_records(conn = conn, table = "entity", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_NEO_DB <- dabr::select_all(conn, "entity") %>%
  dplyr::filter(ID_ENTITY %in% iberia_extra_repatriated_dates_info_entities$ID_ENTITY)
waldo::compare(iberia_extra_repatriated_dates_info_entities %>%
                 .[order(colnames(.))],
               EPD_NEO_DB %>%
                 .[order(colnames(.))] %>%
                 dplyr::select(-doi, -site_type))


iberia_extra_repatriated_dates_info_2 <-
  iberia_extra_repatriated_dates_info$metadata %>%
  dplyr::select(external_ID_ENTITY = ID_ENTITY,
                external_ID_SITE =  ID_SITE,
                external_site_name = site_name,
                external_entity_name = entity_name)
  # iberia_extra_repatriation_dates %>%
  # dplyr::select(neotoma_ID_SITE = site_id,
  #               neotoma_site_name = site_name,
  #               neotoma_entity_name = site_name,
  #               # neotoma_entity_name = entity_name,
  #               ID_ENTITY = IBERIA_ID_ENTITY) %>%
  # dplyr::inner_join(iberia_extra_repatriated_dates_info$metadata,
  #                   by = "ID_ENTITY") %>%
  # dplyr::rename(external_ID_ENTITY = ID_ENTITY,
  #               external_ID_SITE =  ID_SITE,
  #               external_site_name = site_name,
  #               external_entity_name = entity_name)
iberia_extra_repatriated_dates_info_3 <- iberia_extra_repatriated_dates_info_entities %>%
  dplyr::select(ID_SITE, ID_ENTITY, external_entity_name = entity_name) %>%
  dplyr::right_join(iberia_extra_repatriated_dates_info_2,
                    by = c("external_entity_name"))
iberia_extra_repatriated_dates_info_3


#### External links ----
meta_neo_res <- seq_len(nrow(iberia_extra_repatriated_dates_info_3)) %>%
  purrr::map(function(i) {
    iberia_extra_repatriated_dates_info_3[i, ] %>%
      dplyr::select(ID_SITE,
                    ID_ENTITY,
                    external_ID_SITE,
                    external_ID_ENTITY,
                    external_site_name,
                    external_entity_name) %>%
      dplyr::mutate(external_source = "IBERIA") %>%
      # rpd:::update_records(conn = conn, table = "external_link", PK = c(1, 2, 3, 4))
      rpd:::add_records(conn = conn, table = "external_link", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

dabr::select_all(conn, "external_link") %>%
  dplyr::filter(external_entity_name %in%
                  iberia_extra_repatriated_dates_info_3$external_entity_name) %>%
  dplyr::filter(external_source %in% c("EMBSECBIO")) %>%
  purrr::pwalk(function(ID_SITE,
                        ID_ENTITY,
                        external_ID_SITE,
                        external_ID_ENTITY,
                        external_site_name,
                        external_entity_name,
                        external_source) {
    dabr::delete(conn,
                 "DELETE FROM external_link WHERE",
                 "ID_SITE = ", ID_SITE,
                 "AND ID_ENTITY = ", ID_ENTITY,
                 "AND external_ID_SITE = ", external_ID_SITE,
                 "AND external_ID_ENTITY = ", external_ID_ENTITY,
                 "AND external_site_name = ", dabr::quote(external_site_name),
                 "AND external_entity_name = ", dabr::quote(external_entity_name),
                 "AND external_source = ", dabr::quote(external_source))
  })

##### Validate -----
EPD_NEO_DB <- dabr::select_all(conn, "external_link") %>%
  dplyr::filter(external_ID_ENTITY %in%
                  iberia_extra_repatriated_dates_info_3$external_ID_ENTITY,
                external_source == "IBERIA") %>%
  dplyr::select(ID_SITE,
                ID_ENTITY,
                external_ID_SITE,
                external_ID_ENTITY,
                external_site_name,
                external_entity_name)
waldo::compare(iberia_extra_repatriated_dates_info_3 %>%
                 dplyr::select(ID_SITE,
                               ID_ENTITY,
                               external_ID_SITE,
                               external_ID_ENTITY,
                               external_site_name,
                               external_entity_name) %>%
                 dplyr::mutate(external_ID_SITE = as.integer(external_ID_SITE),
                               external_ID_ENTITY = as.integer(external_ID_ENTITY)),
               EPD_NEO_DB)

#### Dates ----
# iberia_repatriated_dates_info_4 <-
#   iberia_repatriated_dates_info$date_info %>%
#   dplyr::select(-ID_SITE) %>%
#   dplyr::rename(external_ID_ENTITY = ID_ENTITY,
#                 reason_age_not_used = notes,
#                 age_calib = age_cal) %>%
#   dplyr::mutate(age_used = ifelse(is.na(reason_age_not_used), "yes", "no")) %>%
#   dplyr::left_join(iberia_repatriated_dates_info_3 %>%
#                      dplyr::select(ID_ENTITY, external_ID_ENTITY)) %>%
#   dplyr::select(-external_ID_ENTITY, -site_name, -entity_name) %>%
#   dplyr::relocate(ID_ENTITY, .before = 1) %>%
#   dplyr::mutate(ID_DATE_INFO = 657:1272)

# Find existing dates
old_dates <- dabr::select_all(conn, "date_info") %>%
  dplyr::filter(ID_ENTITY %in%
                  iberia_extra_repatriated_dates_info_3$ID_ENTITY) #%>%
  # readr::write_excel_csv("inst/extdata/epd-special-iberia-extra-v2-dates_2022-03-17.csv")
dabr::select(conn,
             "SELECT * FROM date_info WHERE ID_DATE_INFO IN (",
             paste0(old_dates$ID_DATE_INFO, collapse = ", "),
             ")")
# dabr::delete(conn,
#              "DELETE FROM date_info WHERE ID_DATE_INFO IN (",
#              paste0(old_dates$ID_DATE_INFO, collapse = ", "),
#              ")")
# # Results: 332 records were deleted.

iberia_extra_repatriated_dates_info_4 <-
  iberia_extra_repatriated_dates_info$date_info %>%
  dplyr::select(-ID_SITE) %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY,
                reason_age_not_used = notes,
                age_calib = age_cal) %>%
  dplyr::mutate(age_used = ifelse(is.na(reason_age_not_used), "yes", "no")) %>%
  dplyr::left_join(iberia_extra_repatriated_dates_info_3 %>%
                     dplyr::select(ID_ENTITY, external_ID_ENTITY)) %>%
  dplyr::select(-external_ID_ENTITY, #-site_name,
                -entity_name) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::mutate(ID_DATE_INFO = 10556 + seq_along(ID_ENTITY)) #1273:1606)

  # dplyr::mutate(ID_DATE_INFO = seq_along(ID_ENTITY) + get_id_date_info(conn))

meta_neo_res <- seq_len(nrow(iberia_extra_repatriated_dates_info_4)) %>%
  purrr::map(function(i) {
    iberia_extra_repatriated_dates_info_4[i, ] %>%
      rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_NEO_DB <- dabr::select_all(conn, "date_info") %>%
  dplyr::filter(ID_ENTITY %in% iberia_extra_repatriated_dates_info_4$ID_ENTITY)
waldo::compare(iberia_extra_repatriated_dates_info_4 %>%
                 .[order(colnames(.))],
               EPD_NEO_DB %>%
                 .[order(colnames(.))] %>%
                 dplyr::select(-notes),
               tolerance = 1E-9)

#### Samples ----
dabr::select(conn,
             "SELECT * FROM sample WHERE ID_ENTITY IN (",
             paste0(iberia_extra_repatriated_dates_info_4$ID_ENTITY,
                    collapse = ", "),
             ")")
# dabr::delete(conn,
#              "DELETE FROM sample WHERE ID_ENTITY IN (",
#              paste0(iberia_extra_repatriated_dates_info_4$ID_ENTITY,
#                     collapse = ", "),
#              ")")
# # Results: 2699 records were deleted.

iberia_extra_repatriated_am_info_4 <-
  iberia_extra_repatriated_dates_info$age_model %>%
  dplyr::select(-ID_SITE) %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY) %>%
  dplyr::left_join(iberia_extra_repatriated_dates_info_3 %>%
                     dplyr::select(ID_ENTITY, external_ID_ENTITY)) %>%
  dplyr::select(-external_ID_ENTITY) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::select(-site_name, -entity_name, -latitude, -longitude, -elevation, -source, -publication) %>%
  dplyr::mutate(ID_SAMPLE = c(11041:12408, 12433:13763),
                  # seq_along(ID_ENTITY) + get_id_sample(conn),
                .before = 2)

meta_neo_res <- seq_len(nrow(iberia_extra_repatriated_am_info_4)) %>%
  purrr::map(function(i) {
    iberia_extra_repatriated_am_info_4[i, ] %>%
      dplyr::select(1:4) %>%
      rpd:::add_records(conn = conn, table = "sample", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_NEO_DB <- dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_ENTITY %in% iberia_extra_repatriated_am_info_4$ID_ENTITY)
waldo::compare(iberia_extra_repatriated_am_info_4 %>%
                 dplyr::select(1:4) %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               age = round(age)),
               EPD_NEO_DB %>%
                 dplyr::select(-count_type, -age_older, -age_younger, -age_type, -chronology_name, -thickness, -sample_type) %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               age = round(age)),
               tolerance = 1E-9)

#### Age models ----
dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_ENTITY %in% iberia_extra_repatriated_am_info_4$ID_ENTITY)
dabr::select_all(conn, "age_model") %>%
  dplyr::filter(ID_SAMPLE %in% iberia_extra_repatriated_am_info_4$ID_SAMPLE)

iberia_extra_repatriated_am_info_5 <- iberia_extra_repatriated_am_info_4 %>%
  dplyr::select(-c(1, 3:4))
meta_neo_res <- seq_len(nrow(iberia_extra_repatriated_am_info_5)) %>%
  purrr::map(function(i) {
    iberia_extra_repatriated_am_info_5[i, ] %>%
      dplyr::mutate(ID_MODEL = 8, .before = 2) %>% # Bacon IntCal20
      rpd:::add_records(conn = conn, table = "age_model", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate ----
EPD_NEO_DB <- dabr::select_all(conn, "age_model") %>%
  dplyr::filter(ID_SAMPLE %in% iberia_extra_repatriated_am_info_5$ID_SAMPLE)
waldo::compare(iberia_extra_repatriated_am_info_5 %>%
                 dplyr::mutate(ID_MODEL = 8, .before = 2) %>%
                 .[order(colnames(.))],
               EPD_NEO_DB %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)

#### COUNTS ----
special.epd::snapshot(conn,
                      entity_name = iberia_extra_repatriation_dates$entity_name)
data("IBERIA_extra_counts_v3")
IBERIA_extra_counts_v3

iberia_extra_repatriated_counts <-
  iberia_extra_repatriated_am_info_4 %>%
  dplyr::select(ID_ENTITY, ID_SAMPLE, depth) %>%
  dplyr::mutate(depth_rnd = round(depth, 3)) %>%
  dplyr::right_join(IBERIA_extra_counts_v3 %>%
                     dplyr::mutate(depth_rnd = round(depth, 3)),
                   by = c("ID_ENTITY", "depth_rnd"))
  # dplyr::right_join(IBERIA_extra_counts_v3,
  #                   by = c("ID_ENTITY", "depth"))
dim(iberia_extra_repatriated_counts)
dim(IBERIA_extra_counts_v3)

id_sample_with_counts <- unique(iberia_extra_repatriated_counts$ID_SAMPLE)
id_sample_with_am <- unique(iberia_extra_repatriated_am_info_4$ID_SAMPLE)
id_samples_without_counts <-
  id_sample_with_am[!(id_sample_with_am %in% id_sample_with_counts)]
dabr::select(conn,
             "SELECT * FROM sample WHERE ID_SAMPLE IN (",
             paste0(id_samples_without_counts, collapse = ","),
             ")") %>%
  dplyr::select(1:7) %>%
  dplyr::left_join(dabr::select_all(conn, "entity")) %>%
  dplyr::select(1:10)

# NOTE: there are some entities with hiatuses, so the samples were not included
## in the database
iberia_extra_repatriated_counts_not_used <- iberia_extra_repatriated_counts %>%
  dplyr::filter(is.na(ID_SAMPLE),
                entity_name %in% c("Navamuno_S3",
                                   "PERAFITA",
                                   "PozoN_2015 core"))
iberia_extra_repatriated_counts_not_used %>%
  dplyr::distinct(ID_ENTITY, depth_rnd, .keep_all = TRUE)
iberia_extra_repatriated_counts_2 <- iberia_extra_repatriated_counts %>%
  dplyr::filter(!is.na(ID_SAMPLE)) #%>%
  # dplyr::filter(entity_name %>% stringr::str_detect("LdlMo", negate = FALSE))
iberia_extra_repatriated_counts_2 %>%
  dplyr::filter(is.na(ID_ENTITY) | is.na(ID_SAMPLE)) %>%
  dplyr::distinct(ID_ENTITY, depth_rnd, .keep_all = TRUE)

aux <- conn %>%
  special.epd::snapshot(ID_ENTITY = iberia_extra_repatriated_counts$ID_ENTITY)
aux$date_info$ID_ENTITY %>% unique() %>% length()
aux$sample$ID_ENTITY %>% unique() %>% length()
aux$age_model$ID_SAMPLE %>% unique() %>% length()
iberia_extra_repatriated_counts$ID_SAMPLE %>% unique() %>% length()
tibble::tibble(
  ID_SAMPLE = iberia_extra_repatriated_counts_2$ID_SAMPLE %>% unique(),
) %>%
  dplyr::anti_join(
    tibble::tibble(
      ID_SAMPLE = aux$sample$ID_SAMPLE %>% unique()
    )
  )

# Samples that don't have pollen counts
aux$sample %>%
  dplyr::select(ID_ENTITY, ID_SAMPLE, depth) %>%
  dplyr::anti_join(
    tibble::tibble(
      ID_SAMPLE = iberia_extra_repatriated_counts_2$ID_SAMPLE %>% unique(),
    )
  ) %>%
  dplyr::left_join(
    iberia_extra_repatriated_counts %>%
      dplyr::select(ID_ENTITY, site_name, entity_name) %>%
      dplyr::distinct(ID_ENTITY, .keep_all = TRUE)
  )


# special.epd::snapshot(ID_ENTITY = epd_repatriated_samples_2$ID_ENTITY)
existing_pollen_counts <-
  aux$pollen_count %>%
  purrr::map_df(~.x) %>% # Combine the three pollen count tables
  dplyr::select(ID_SAMPLE) %>%
  dplyr::group_by(ID_SAMPLE) %>%
  dplyr::summarise(n = dplyr::n())

existing_pollen_counts %>%
  dplyr::filter(n != 3) # Verify if sample does not have the 3 tables

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
# Check if all the IBERIA extra taxons are in the `taxon_name` table
new_taxon_names <-
  c(iberia_extra_repatriated_counts_2$clean,
    iberia_extra_repatriated_counts_2$intermediate,
    iberia_extra_repatriated_counts_2$amalgamated) %>%
  unique() %>%
  sort() %>%
  tibble::tibble() %>%
  magrittr::set_names("taxon_name") %>%
  dplyr::left_join(taxon_name_tb) %>%
  dplyr::filter(is.na(ID_TAXON))
new_taxon_names %>%
  rpd:::add_records(conn = conn, table = "taxon_name", dry_run = TRUE)
# Results: 61 records were inserted.

# NOTE (2022-03-22): SPH requested that blank cells were replaced by ZEROS in
# the counts file, which were originally excluded, so this filter only keeps
# counts = 0, to be added to the database.
iberia_extra_repatriated_counts_2 <- iberia_extra_repatriated_counts_2 %>%
  dplyr::filter(count == 0)
existing_pollen_counts_long <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(iberia_extra_repatriated_counts_2$ID_SAMPLE),
                      collapse = ","),
               ")") %>%
  dplyr::mutate(pk = paste0(ID_SAMPLE, "-", ID_TAXON, "-", amalgamation_level))

iberia_extra_repatriated_counts_2 %>%
  dplyr::filter(is.na(taxon_name) | is.na(clean))

##### Clean ----
iberia_extra_repatriated_counts_3 <-
  sort(unique(iberia_extra_repatriated_counts_2$ID_SAMPLE)) %>%
  purrr::map_df(function(i) {
    iberia_extra_repatriated_counts_2 %>%
      dplyr::filter(ID_SAMPLE == i) %>%
      dplyr::select(ID_SAMPLE, taxon_name = clean, count) %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

iberia_extra_repatriated_counts_3 %>%
  dplyr::filter(is.na(ID_TAXON))

# Filter existing records
iberia_extra_repatriated_counts_3 <- iberia_extra_repatriated_counts_3 %>%
  dplyr::mutate(pk = paste0(ID_SAMPLE, "-", ID_TAXON, "-", amalgamation_level)) %>%
  dplyr::filter(!(pk %in% existing_pollen_counts_long$pk)) %>%
  dplyr::select(-pk)

dim(iberia_extra_repatriated_counts_3)
iberia_extra_repatriated_counts_3 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()

idx <- idx_pairs(nrow(iberia_extra_repatriated_counts_3), 2000)
pb <- progress::progress_bar$new(total = nrow(idx))
meta_neo_res <-
  purrr::map2(idx$x,
              idx$y,
              ~ {
                pb$tick()
                iberia_extra_repatriated_counts_3[.x:.y, ] %>%
                  rpd:::add_records(conn = conn,
                                    table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(iberia_extra_repatriated_counts_3$ID_SAMPLE, collapse = ", "),
               ") and amalgamation_level = 0") %>%
  dplyr::mutate(pk = paste0(ID_SAMPLE, "-", ID_TAXON, "-", amalgamation_level)) %>%
  dplyr::filter(!(pk %in% existing_pollen_counts_long$pk)) %>%
  dplyr::select(-pk)
waldo::compare(iberia_extra_repatriated_counts_3 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 149520 + 7373 + 58884

##### Intermediate ----
iberia_extra_repatriated_counts_4 <-
  sort(unique(iberia_extra_repatriated_counts_2$ID_SAMPLE)) %>%
  purrr::map_df(function(i) {
    iberia_extra_repatriated_counts_2 %>%
      dplyr::filter(ID_SAMPLE == i) %>%
      dplyr::select(ID_SAMPLE, taxon_name = intermediate, count) %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 1, .before = count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

iberia_extra_repatriated_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON))

# Filter existing records
iberia_extra_repatriated_counts_4 <- iberia_extra_repatriated_counts_4 %>%
  dplyr::mutate(pk = paste0(ID_SAMPLE, "-", ID_TAXON, "-", amalgamation_level)) %>%
  dplyr::filter(!(pk %in% existing_pollen_counts_long$pk)) %>%
  dplyr::select(-pk)

iberia_extra_repatriated_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(iberia_extra_repatriated_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                iberia_extra_repatriated_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn,
                                    table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(iberia_extra_repatriated_counts_4$ID_SAMPLE, collapse = ", "),
               ") and amalgamation_level = 1") %>%
  dplyr::mutate(pk = paste0(ID_SAMPLE, "-", ID_TAXON, "-", amalgamation_level)) %>%
  dplyr::filter(!(pk %in% existing_pollen_counts_long$pk)) %>%
  dplyr::select(-pk)
waldo::compare(iberia_extra_repatriated_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 145129 + 7081 + 61519

##### Amalgamated ----
iberia_extra_repatriated_counts_5 <-
  unique(iberia_extra_repatriated_counts_2$ID_SAMPLE) %>%
  purrr::map_df(function(i) {
    iberia_extra_repatriated_counts_2 %>%
      dplyr::filter(ID_SAMPLE == i) %>%
      dplyr::select(ID_SAMPLE, taxon_name = amalgamated, count) %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 2, .before = count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

iberia_extra_repatriated_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)|is.na(count))

# Filter existing records
iberia_extra_repatriated_counts_5 <- iberia_extra_repatriated_counts_5 %>%
  dplyr::mutate(pk = paste0(ID_SAMPLE, "-", ID_TAXON, "-", amalgamation_level)) %>%
  dplyr::filter(!(pk %in% existing_pollen_counts_long$pk)) %>%
  dplyr::select(-pk)

iberia_extra_repatriated_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(iberia_extra_repatriated_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                iberia_extra_repatriated_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn,
                                    table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(iberia_extra_repatriated_counts_5$ID_SAMPLE, collapse = ", "),
               ") and amalgamation_level = 2") %>%
  dplyr::mutate(pk = paste0(ID_SAMPLE, "-", ID_TAXON, "-", amalgamation_level)) %>%
  dplyr::filter(!(pk %in% existing_pollen_counts_long$pk)) %>%
  dplyr::select(-pk)
waldo::compare(iberia_extra_repatriated_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 126646 + 5621 + 45208

iberia_extra_repatriated_counts_2 %>%
  dplyr::distinct(entity_name, .keep_all = TRUE)

##### Laguna de la Mosca ----
# The counts for this entity had to be manually inspected. Kept the samples
# at 15 and 18 cm from the "short core".
laguna_de_la_mosca <- IBERIA_pollen_v3 %>%
  dplyr::filter(entity_name == "LdlMo composite",
                `avg_depth..cm.` %in% c(15, 18)) %>%
  smpds::rm_na_taxa(1:17)
laguna_de_la_mosca %>%
  dplyr::select(-c(1:9, 11:17)) %>%
  dplyr::rename(depth = `avg_depth..cm.`) %>%
  tidyr::pivot_longer(-1) %>%
  tidyr::pivot_wider(names_from = "depth")

# New sites ----
# This sites were provided directly by the authors.
## Entities ----
new_entities <- "~/Downloads/iberian_extra_sites/iberia_extra_new_sites.xlsx" %>%
  readxl::read_excel(sheet = 1) %>%
  janitor::clean_names() %>%
  dplyr::mutate(ID_SITE = 1492 + seq_along(site_name), # get_id_site(conn)
                ID_ENTITY = 1666 + seq_along(entity_name), # get_id_entity(conn)
                .before = 1)
# Check if the new entities exist in the DB
dabr::select(conn,
             "SELECT * FROM entity WHERE ID_ENTITY IN (",
             paste0(new_entities$ID_ENTITY, collapse = ", "),
             ")")
conn %>%
  rpd:::add_records(table = "entity", data = new_entities, dry_run = TRUE)
# Results: 3 records were inserted.
### Validate ----
EPD_NEO_DB <- dabr::select(conn,
                           "SELECT * FROM entity WHERE ID_ENTITY IN (",
                           paste0(new_entities$ID_ENTITY, collapse = ", "),
                           ")")
waldo::compare(new_entities,
               EPD_NEO_DB,
               tolerance = 1E-9)

## Dates ----
new_entities_dates <- "~/Downloads/iberian_extra_sites/iberia_extra_new_sites.xlsx" %>%
  readxl::read_excel(sheet = 2) %>%
  janitor::clean_names() %>%
  dplyr::left_join(new_entities %>%
                     dplyr::select(1:4),
                   by = c("site_name", "entity_name")) %>%
  dplyr::mutate(ID_DATE_INFO = 10888 + seq_along(entity_name)) %>% # get_id_date_info(conn)
  dplyr::relocate(ID_SITE, ID_ENTITY, ID_DATE_INFO, .before = 1) %>%
  dplyr::select(-ID_SITE, -site_name, -entity_name)
# Check if the new dates exist in the DB
dabr::select(conn,
             "SELECT * FROM date_info WHERE ID_DATE_INFO IN (",
             paste0(new_entities_dates$ID_DATE_INFO, collapse = ", "),
             ")")
conn %>%
  rpd:::add_records(table = "date_info",
                    data = new_entities_dates,
                    dry_run = TRUE)
# Results: 18 records were inserted.
### Validate ----
EPD_NEO_DB <- dabr::select(conn,
                           "SELECT * FROM date_info WHERE ID_DATE_INFO IN (",
                           paste0(new_entities_dates$ID_DATE_INFO, collapse = ", "),
                           ")")
waldo::compare(new_entities_dates,
               EPD_NEO_DB,
               tolerance = 1E-9)

## Samples ----
new_entities_samples <- "~/Downloads/iberian_extra_sites/iberia_extra_new_sites.xlsx" %>%
  readxl::read_excel(sheet = 3) %>%
  janitor::clean_names() %>%
  dplyr::left_join(new_entities %>%
                     dplyr::select(1:4),
                   by = c("site_name", "entity_name")) %>%
  dplyr::mutate(ID_SAMPLE = 102608 + seq_along(entity_name)) %>% # get_id_sample(conn)
  dplyr::relocate(ID_SITE, ID_ENTITY, ID_SAMPLE, .before = 1) %>%
  dplyr::select(-ID_SITE, -site_name, -entity_name)
# Check if the new samples exist in the DB
dabr::select(conn,
             "SELECT * FROM sample WHERE ID_SAMPLE IN (",
             paste0(new_entities_samples$ID_SAMPLE, collapse = ", "),
             ")")
conn %>%
  rpd:::add_records(table = "sample",
                    data = new_entities_samples,
                    dry_run = TRUE)
# Results: 127 records were inserted.
### Validate ----
EPD_NEO_DB <- dabr::select(conn,
                           "SELECT * FROM sample WHERE ID_SAMPLE IN (",
                           paste0(new_entities_samples$ID_SAMPLE, collapse = ", "),
                           ")")
waldo::compare(new_entities_samples,
               EPD_NEO_DB,
               tolerance = 1E-9)

## Age models ----
new_entities_am <-
  find_age_models("~/Downloads/special_epd_am/",
                  entity_name = new_entities$entity_name)

# Check if the age models already exist in the DB
special.epd::snapshot(conn, entity_name = new_entities_am$entity_name)

new_entities_am2 <- new_entities_am %>%
  purrr::pmap_df(upload_age_model, conn = conn)
# Check if all the age models were uploaded
all(new_entities_am2$status)
new_entities_am2
new_entities_am3 <-
  new_entities_am2$am %>%
  purrr::map_df(~.x) %>%
  magrittr::set_names(colnames(.) %>% stringr::str_to_upper())

EPD_AM <- dabr::select(conn,
                       "SELECT * FROM age_model WHERE ID_SAMPLE IN (",
                       paste0(new_entities_am3$ID_SAMPLE, collapse = ", "),
                       ") AND ID_MODEL = 8")
waldo::compare(new_entities_am3 %>%
                 dplyr::arrange(ID_SAMPLE),
               EPD_AM %>%
                 magrittr::set_names(colnames(.) %>% stringr::str_to_upper()) %>%
                 dplyr::arrange(ID_SAMPLE))

## Counts ----
# data("IBERIA_new_counts_v3")
IBERIA_new_counts_v3_2 <- IBERIA_new_counts_v3 %>%
  dplyr::left_join(new_entities %>%
                     dplyr::select(1:3),
                   by = c("site_name")) %>%
  dplyr::relocate(ID_SITE, ID_ENTITY, .before = 1) %>%
  dplyr::select(-ID_SITE, -site_name)
new_entities_counts <- new_entities_samples %>%
  dplyr::select(ID_ENTITY, ID_SAMPLE, depth) %>%
  dplyr::mutate(depth_rnd = round(depth, 3)) %>%
  dplyr::right_join(IBERIA_new_counts_v3_2 %>%
                      dplyr::mutate(depth_rnd = round(depth, 3)),
                    by = c("ID_ENTITY", "depth_rnd")) %>%
  dplyr::select(-dplyr::starts_with("depth"))
new_entities_counts %>%
  dplyr::filter(is.na(ID_ENTITY) | is.na(ID_SAMPLE))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
# Check if all the NEW taxons are in the `taxon_name` table
new_taxon_names <-
  c(new_entities_counts$clean,
    new_entities_counts$intermediate,
    new_entities_counts$amalgamated) %>%
  unique() %>%
  sort() %>%
  tibble::tibble() %>%
  magrittr::set_names("taxon_name") %>%
  dplyr::left_join(taxon_name_tb) %>%
  dplyr::filter(is.na(ID_TAXON))
new_taxon_names %>%
  rpd:::add_records(conn = conn, table = "taxon_name", dry_run = TRUE)
# Results: 6 records were inserted.

##### Clean ----
new_entities_counts_3 <-
  sort(unique(new_entities_counts$ID_SAMPLE)) %>%
  purrr::map_df(function(i) {
    new_entities_counts %>%
      dplyr::filter(ID_SAMPLE == i) %>%
      dplyr::select(ID_SAMPLE, taxon_name = clean, count) %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

new_entities_counts_3 %>%
  dplyr::filter(is.na(ID_TAXON))
dim(new_entities_counts_3)
new_entities_counts_3 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()

idx <- idx_pairs(nrow(new_entities_counts_3), 2000)
pb <- progress::progress_bar$new(total = nrow(idx))
meta_neo_res <-
  purrr::map2(idx$x,
              idx$y,
              ~ {
                pb$tick()
                new_entities_counts_3[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count", dry_run = !TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(new_entities_counts_3$ID_SAMPLE, collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(new_entities_counts_3 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 7740
##### Intermediate ----
new_entities_counts_4 <-
  sort(unique(new_entities_counts$ID_SAMPLE)) %>%
  purrr::map_df(function(i) {
    new_entities_counts %>%
      dplyr::filter(ID_SAMPLE == i) %>%
      dplyr::select(ID_SAMPLE, taxon_name = intermediate, count) %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 1, .before = count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

new_entities_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON))

new_entities_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(new_entities_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                new_entities_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count", dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(new_entities_counts_4$ID_SAMPLE, collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(new_entities_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 7389

##### Amalgamated ----
new_entities_counts_5 <-
  unique(new_entities_counts$ID_SAMPLE) %>%
  purrr::map_df(function(i) {
    new_entities_counts %>%
      dplyr::filter(ID_SAMPLE == i) %>%
      dplyr::select(ID_SAMPLE, taxon_name = amalgamated, count) %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 2, .before = count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

new_entities_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)|is.na(count))

new_entities_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(new_entities_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                new_entities_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count", dry_run = !TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(new_entities_counts_5$ID_SAMPLE, collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(new_entities_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1e-9)
# 6291




# No dates in the RPD ----
no_dates_rpd <- readxl::read_excel("~/Downloads/No dates in RPD.xlsx")
rpd_site_entity_tbs <- rpdata::site[1:2] %>%
  dplyr::left_join(rpdata::entity)
rpd_site_entity_tbs %>%
  dplyr::filter(ID_ENTITY %in% no_dates_rpd$ID_ENTITY)
aux <- no_dates_rpd %>%
  dplyr::left_join(rpd_site_entity_tbs,
                   by = "ID_ENTITY")

aux <- rpd_site_entity_tbs[1:10] %>%
  dplyr::right_join(rpdata::date_info %>%
                      dplyr::filter(ID_ENTITY %in% no_dates_rpd$ID_ENTITY)) %>%
  dplyr::arrange(site_name, entity_name, avg_depth)
aux %>%
  readr::write_excel_csv("~/Downloads/rpd_records_for_LS_to_inspect_2022-02-10.csv", na = "")

# Pending entities ----
## LGLOPPSJ ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "LGLOPPSJ")
aux
#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "LGLOPPSJ")
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## LOBHOLO ----
"Extract dates and samples from the EPD"
aux <- conn %>%
  special.epd::snapshot(entity_name = "LOBHOLO")
aux
#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "LOBHOLO") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(ID_ENTITY = 935,
                ID_DATE_INFO = 10906 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 10 records were inserted.

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "LOBHOLO")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## MAASV ----
"Extract samples from the RPD and dates from the EPD"
aux <- conn %>%
  special.epd::snapshot(entity_name = "MAASV")
aux
#### Samples ----
# Update sample depths and thickness using the RPD data.
aux$sample %>%
  dplyr::select(ID_ENTITY, ID_SAMPLE, depth, thickness) %>%
  dplyr::mutate(depth = depth - 0.5,
                thickness = 1) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = TRUE, PK = 1:2)
# Results: 94 records were updated.

#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "MAASV") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(ID_ENTITY = 802,
                ID_DATE_INFO = 10916 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 9 records were inserted.

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "MAASV")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## TRETTE ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "TRETTE")
aux
#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "TRETTE") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(ID_ENTITY = 1477,
                ID_DATE_INFO = 10925 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 10 records were inserted.

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "TRETTE")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)


## OYKJA ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "OYKJA")
aux
#### Samples ----
# Extract samples from the EPD
## Delete existing counts (extracted from the RPD)
dabr::select(conn,
             "SELECT * FROM sample WHERE ID_ENTITY = 1530 AND ID_SAMPLE",
             "BETWEEN 16063 AND 16095")
# dabr::delete(conn,
#              "DELETE FROM sample WHERE ID_ENTITY = 1530 AND ID_SAMPLE",
#              "BETWEEN 16063 AND 16095")
# # Results: 33 records were deleted.
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "OYKJA") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 1530,
                ID_SAMPLE = 102735 + seq_along(ID_ENTITY),
                .before = 1)
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::add_records(conn = conn, table = "sample", dry_run = TRUE)
# Results: 99 records were inserted.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(aux_samples_counts_4$ID_SAMPLE, collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 10098

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(aux_samples_counts_5$ID_SAMPLE, collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 8712

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(aux_samples_counts_6$ID_SAMPLE, collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 6534

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "OYKJA")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## PRYSKYRI ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "PRYSKYRI")
aux
#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "PRYSKYRI") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(ID_ENTITY = 1206,
                ID_DATE_INFO = 10935 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 9 records were inserted.

#### Samples ----
extract_rpd(2458)
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "PRYSKYRI") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 1206,
                ID_SAMPLE = 17282:17333,
                .before = 1)

#### (PENDING) Counts ----
"Ask SPH if we should repatriate these counts, even though the samples were extracted from the RPD (they are the same)"
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(aux_samples_counts_4$ID_SAMPLE, collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 10098

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(aux_samples_counts_5$ID_SAMPLE, collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 8712

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(aux_samples_counts_6$ID_SAMPLE, collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 6534

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "PRYSKYRI")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## DESVERNE ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "DESVERNE")
aux
#### Samples ----
aux_rpd <- extract_rpd(ID_ENTITY = 2462)
aux_rpd$sample
aux$sample

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "DESVERNE")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)


## TORVERA2 ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "TORVERA2")
aux
#### Samples ----
aux_rpd <- extract_rpd(ID_ENTITY = 905)
aux_rpd$sample
aux$sample

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "TORVERA2")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)


## LAVPICCO ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "LAVPICCO")
aux
#### Samples ----
aux_rpd <- extract_rpd(ID_ENTITY = 336)
aux_rpd$sample
aux$sample

#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "LAVPICCO") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(ID_ENTITY = 751,
                ID_DATE_INFO = 10944 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 16 records were inserted.

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "LAVPICCO")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)


## EGR2G ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "EGR2G")
aux
#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "EGR2G") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(ID_ENTITY = 406,
                ID_DATE_INFO = 10960 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 31 records were inserted.

#### Samples ----
aux_rpd <- extract_rpd(ID_ENTITY = 1459)
aux_rpd$sample
aux$sample
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "EGR2G") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 406,
                ID_SAMPLE = 14951:14994,
                .before = 1)
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = TRUE, PK = 1:2)
# Results: 44 records were updated.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(aux_samples_counts_4$ID_SAMPLE, collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 5104

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(aux_samples_counts_5$ID_SAMPLE, collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 4576

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(aux_samples_counts_6$ID_SAMPLE, collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 3476

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "EGR2G")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## EGR2A ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "EGR2A")
aux

#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "EGR2A") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(ID_ENTITY = 405,
                ID_DATE_INFO = 10991 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 31 records were inserted.

# Update dates based on manual inspections by SPH:
aux$date_info %>%
  dplyr::mutate(depth = c(55, 71, 105, 129, 175, 195, 213, 247, 259, 275, 285, 291, 311,
                          327, 343, 355, 363, 371, 377, 385, 391, 401, 415, 417, 431, 465,
                          499, 525, 547, 573, 649),
                thickness = 2) %>%
  dplyr::select(ID_ENTITY, ID_DATE_INFO, depth, thickness) %>%
  rpd:::update_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 31 records were updated.

#### Samples ----
aux_rpd <- extract_rpd(ID_ENTITY = 1458)
aux_rpd$sample
aux$sample
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "EGR2A") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 405,
                ID_SAMPLE = 14810:14950,
                .before = 1)
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = TRUE, PK = 1:2)
# Results: 141 records were updated.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 23406

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 21573

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 13677

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "EGR2A")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)


## NEUBLANS ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "NEUBLANS")
aux
#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "NEUBLANS") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(ID_ENTITY = 1143,
                ID_DATE_INFO = 11022 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 3 records were inserted.

#### Citation ----
neublans_publications <-
  c("Gauthier, E., 2000. Mise en évidence pollinique de la culture de la vigne au 1er siècle après J.-C. dans la plaine du Doubs (Neublans, Jura). ArchéoSciences, revue d'Archéométrie, 24(1), pp.63-69.",
    "Gauthier, E., 2001. Evolution de l'impact de l'homme sur la végétation du massif jurassien au cours des quatre derniers millénaires. Nouvelles données palynologiques.. Doctoral dissertation. Université de Franche-Comté, Besançon, France."
  ) %>%
  paste0(collapse = ";\n")
aux$entity
dabr::select(conn,
             "SELECT * FROM entity WHERE ID_SITE = 1025 AND ID_ENTITY = 1143") %>%
  .$publication %>% cat()
# dabr::update(conn,
#              "UPDATE entity SET publication = ",
#              paste0('"', neublans_publications, '"'),
#              "WHERE ID_SITE = 1025 AND ID_ENTITY = 1143")

#### Samples ----
aux_rpd <- extract_rpd(ID_ENTITY = 382)
aux_rpd$sample
aux$sample
# extract_rpd(382)
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "NEUBLANS") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) #%>%
# dplyr::mutate(ID_ENTITY = 1206,
#               ID_SAMPLE = 17282:17333,
#               .before = 1)

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "NEUBLANS")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## PEYREII ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "PEYREII")
aux

#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "PEYREII") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(ID_ENTITY = 1166,
                ID_DATE_INFO = 11025 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 6 records were inserted.

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "PEYREII")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## CHAGAZON ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "CHAGAZON")
aux

#### Samples ----
aux_rpd <- extract_rpd(ID_ENTITY = 2381)
aux_rpd$sample
aux$sample
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "CHAGAZON") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 265,
                ID_SAMPLE = 16332:16434,
                .before = 1)
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = TRUE, PK = 1:2)
# Results: 103 records were updated.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 11227

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 10815

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 8961

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "CHAGAZON")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)


## VERNYBRU ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "VERNYBRU")
aux

#### Samples ----
aux_rpd <- extract_rpd(ID_ENTITY = 2291)
aux$date_info
aux_rpd$sample
aux$sample
# Delete old samples (RPD samples)
dabr::select(conn,
             "SELECT * FROM sample WHERE ID_SAMPLE BETWEEN 16050 AND 16062")
# dabr::delete(conn,
#              "DELETE FROM sample WHERE ID_SAMPLE BETWEEN 16050 AND 16062",
#              "AND ID_ENTITY = 890")
# # Results: 13 records were deleted.

aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "VERNYBRU") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 890,
                ID_SAMPLE = 102834 + seq_along(ID_ENTITY),
                .before = 1)
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = TRUE, PK = 1:2)
# Results: 37 records were updated.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = !TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 3441

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 3330

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 2775

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "VERNYBRU")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## BIBER ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "BIBER")
aux
#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "BIBER") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(ID_ENTITY = 162,
                ID_DATE_INFO = 11031 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 26 records were inserted.

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "BIBER")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## LONGBREA ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "LONGBREA")
aux

#### Samples ----
aux_rpd <- extract_rpd(ID_ENTITY = 1437)
aux_rpd$sample
aux$sample$ID_SAMPLE %>% dput
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "LONGBREA") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 962,
                ID_SAMPLE = 14763:14809,
                .before = 1)
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = !TRUE, PK = 1:2)
# Results: 47 records were updated.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 3243

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 3102

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 2726

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "LONGBREA")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## GOURTE ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "GOURTE")
aux

#### Samples* ----
aux_rpd <- extract_rpd(ID_ENTITY = 2075)
aux_rpd$sample
aux$sample
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "GOURTE") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 481,
                ID_SAMPLE = 15068:15115,
                .before = 1)
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = TRUE, PK = 1:2)
# Results:  records were updated.

#### Counts* ----
"NOTE: the RPD has 3 additional samples, so I will double check with SPH to
determine whether we want to import pollen for the matching samples (48)"
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 23406

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 21573

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 13677

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "GOURTE")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## MD042845 ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "MD042845")
aux

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "MD042845")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## GORGOBAS ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "GORGOBAS")
aux

#### Samples ----
aux_rpd <- extract_rpd(ID_ENTITY = 701)
aux_rpd$sample
aux$sample$ID_SAMPLE %>% dput
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "GORGOBAS") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 478,
                ID_SAMPLE = 13844:13930,
                .before = 1)
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = TRUE, PK = 1:2)
# Results: 87 records were updated.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 19488

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 18009

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 10527

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "GORGOBAS")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## BRM3 ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "BRM3")
aux

#### Samples ----
aux_rpd <- extract_rpd(ID_ENTITY = 1341)
aux_rpd$sample
aux$sample
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "BRM3") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 218,
                ID_SAMPLE = 14130:14171,
                .before = 1)
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = TRUE, PK = 1:2)
# Results: 42 records were updated.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 3864

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 3696

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 2940

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "BRM3")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## BRM1 ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "BRM1")
aux

#### Samples ----
aux_rpd <- extract_rpd(ID_ENTITY = 1340)
aux_rpd$sample
aux$sample$ID_SAMPLE %>% dput
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "BRM1") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 217,
                ID_SAMPLE = 14101:14129,
                .before = 1)
aux_samples_counts$depth - aux$sample$depth
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = TRUE, PK = 1:2)
# Results: 29 records were updated.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 2494

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 2320

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 1972

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "BRM1")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## GWA1 ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "GWA1")
aux

#### Samples ----
aux_rpd <- extract_rpd(ID_ENTITY = 1354)
aux_rpd$sample
aux$sample
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "GWA1") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 469,
                ID_SAMPLE = 14172:14324,
                .before = 1)
aux_samples_counts$depth - aux$sample$depth
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = TRUE, PK = 1:2)
# Results: 153 records were updated.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 23562

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 22185

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 13158

# Delete objects with counts
rm(aux_samples_counts, aux_samples_counts_2, aux_samples_counts_4)
rm(aux_samples_counts_5, aux_samples_counts_6)

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "GWA1")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## WILA ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "WILA")
aux

#### Samples ----
aux_rpd <- extract_rpd(ID_ENTITY = 1414)
aux_rpd$sample
aux$sample$ID_SAMPLE %>% dput
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "WILA") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 1576,
                ID_SAMPLE = 14325:14583,
                .before = 1)
aux_samples_counts$depth - aux$sample$depth
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = !TRUE, PK = 1:2)
# Results: 259 records were updated.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 48433

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 45325

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = !TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 24864

# Delete objects with counts
rm(aux_samples_counts, aux_samples_counts_2, aux_samples_counts_4)
rm(aux_samples_counts_5, aux_samples_counts_6)

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "WILA")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## KWI2 ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "KWI2")
aux

#### Samples ----
aux_rpd <- extract_rpd(ID_ENTITY = 1415)
aux_rpd$sample
aux$sample
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "KWI2") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 1579,
                ID_SAMPLE = 14584:14762,
                .before = 1)
aux_samples_counts$depth - aux$sample$depth
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = !TRUE, PK = 1:2)
# Results: 179 records were updated.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 20048

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 18795

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 13604

# Delete objects with counts
rm(aux_samples_counts, aux_samples_counts_2, aux_samples_counts_4)
rm(aux_samples_counts_5, aux_samples_counts_6)

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "KWI2")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## GKSA ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "GKSA")
aux

#### Samples ----
aux
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "GKSA") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 495,
                ID_SAMPLE = 102871 + seq_along(ID_ENTITY),
                .before = 1)
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = !TRUE, PK = 1:2)
# Results: 160 records were updated.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 13920

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 13440

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 11360

# Delete objects with counts
rm(aux_samples_counts, aux_samples_counts_2, aux_samples_counts_4)
rm(aux_samples_counts_5, aux_samples_counts_6)

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "GKSA")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## GKREBSEE ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "GKREBSEE")
aux

#### Samples ----
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "GKREBSEE") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 494,
                ID_SAMPLE = 103031 + seq_along(ID_ENTITY),
                .before = 1)
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = TRUE, PK = 1:2)
# Results: 125 records were updated.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 11875

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 11125

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 9375

# Delete objects with counts
rm(aux_samples_counts, aux_samples_counts_2, aux_samples_counts_4)
rm(aux_samples_counts_5, aux_samples_counts_6)

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "GKREBSEE")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## GKSE ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "GKSE")
aux

#### Samples ----
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "GKSE") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 496,
                ID_SAMPLE = 103156 + seq_along(ID_ENTITY),
                .before = 1)
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = TRUE, PK = 1:2)
# Results: 44 records were updated.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 3828

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 3652

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 3036

# Delete objects with counts
rm(aux_samples_counts, aux_samples_counts_2, aux_samples_counts_4)
rm(aux_samples_counts_5, aux_samples_counts_6)

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "GKSE")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## BLEDC ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "BLEDC")
aux

#### Samples ----
aux_rpd <- extract_rpd(ID_ENTITY = 1717)
aux_rpd$sample
aux$sample
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "BLEDC") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 776,
                ID_SAMPLE = 14995:15040,
                .before = 1)
aux_samples_counts$depth - aux$sample$depth
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = TRUE, PK = 1:2)
# Results: 46 records were updated.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 2070

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 2070

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 1978

# Delete objects with counts
rm(aux_samples_counts, aux_samples_counts_2, aux_samples_counts_4)
rm(aux_samples_counts_5, aux_samples_counts_6)

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "BLEDC")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## OTTSVA ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "OTTSVA")
aux

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "OTTSVA")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## RUI ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "RUI")
aux
#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "RUI") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(depth = depth - 90) %>% # Account for water level
  dplyr::mutate(ID_ENTITY = 826,
                ID_DATE_INFO = 11126 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 15 records were inserted.

#### Samples ----
aux$sample %>%
  dplyr::mutate(depth = depth - 90) %>% # Account for water level
  dplyr::select(ID_ENTITY, ID_SAMPLE, depth) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = TRUE, PK = 1:2)
# Results: 46 records were inserted.

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "RUI")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## KAHA ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "KAHA")
aux

get_links(conn, 791)
# Update name from KAHA to KAHA 1-94
dabr::select(conn,
             "SELECT * from entity WHERE ID_ENTITY = 791")
dabr::update(conn,
             "UPDATE entity SET entity_name = 'KAHA 1-94' WHERE",
             "ID_ENTITY = 791 AND entity_name = 'KAHA'")
# Results: 1 record was updated.

#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "KAHA") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(ID_ENTITY = 791,
                ID_DATE_INFO = 11057 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 14 records were inserted.

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "KAHA")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## VIPI ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "VIPI")
aux
#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "VIPI") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(ID_ENTITY = 1532,
                ID_DATE_INFO = 11071 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 8 records were inserted.

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "VIPI")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## VJUUS ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "VJUUS")
aux

#### Samples ----
aux_rpd <- extract_rpd(ID_ENTITY = 2466)
aux_rpd$sample
aux$sample
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "VJUUS") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 1506,
                ID_SAMPLE = 17370:17443,
                .before = 1)
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = TRUE, PK = 1:2)
# Results: 74 records were updated.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 2664

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 2664

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 2516

# Delete objects with counts
rm(aux_samples_counts, aux_samples_counts_2, aux_samples_counts_4)
rm(aux_samples_counts_5, aux_samples_counts_6)

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "VJUUS")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## PLA ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "PLA")
aux

#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "PLA") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(ID_ENTITY = 823,
                ID_DATE_INFO = 11079 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 9 records were inserted.

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "PLA")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## JARVEOTS ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "JARVEOTS")
aux

#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "JARVEOTS") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(depth = depth - 265) %>%
  dplyr::mutate(ID_ENTITY = 790,
                ID_DATE_INFO = 11088 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::update_records(conn = conn, table = "date_info",
                       dry_run = TRUE, PK = 1:2)
# Results: 4 records were inserted.

#### Samples ----
aux$sample
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "JARVEOTS") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::mutate(depth = depth - 265) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(ID_ENTITY = 790,
                ID_SAMPLE = 25606:25643,
                .before = 1)
aux_samples_counts$depth - aux$sample$depth
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = !TRUE, PK = 1:2)
# Results: 38 records were updated.

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "JARVEOTS")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## SETESGR ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "SETESGR")
aux

# Delete old dates (from the RPD)
aux$date_info
dabr::select(conn,
             "SELECT * FROM date_info WHERE ID_DATE_INFO BETWEEN 2778 AND 2784")
# dabr::delete(conn,
#              "DELETE FROM date_info WHERE ID_DATE_INFO BETWEEN 2778 AND 2784",
#              "AND ID_ENTITY = 503")
# # Results: 7 records were deleted.

#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "SETESGR") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(ID_ENTITY = 503,
                ID_DATE_INFO = 11141 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 8 records were inserted.

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "SETESGR")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## PRA-15 ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "PRA-15")
aux

#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "PRA-15") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(ID_ENTITY = 1198,
                ID_DATE_INFO = 11092 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 19 records were inserted.

#### Age model* ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "PRA-15")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## YAK2A ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "YAK2A")
aux

#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "YAK2A") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::bind_rows(
    tibble::tibble(
      date_type = "AMS",
      depth = 163.25,
      thickness = 0.5,
      lab_num = "ULA-6960",
      age_c14 = 7605,
      error = 20,
      material_dated = "bulk sediment",
      age_used = "no"
    )
  ) %>%
  dplyr::arrange(depth) %>%
  dplyr::mutate(ID_ENTITY = 1595,
                ID_DATE_INFO = 11111 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 7 records were inserted.

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "YAK2A")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## YAK3A ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "YAK3A")
aux

#### Dates ----
EPD_DATES %>%
  dplyr::filter(entity_name == "YAK3A") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(ID_ENTITY = 1596,
                ID_DATE_INFO = 11118 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = !TRUE)
# Results: 8 records were inserted.

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "YAK3A")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## ZELLERSE ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "ZELLERSE")
aux

#### Dates ----
aux_rpd <- extract_rpd(ID_ENTITY = 2479)
aux_rpd$date_info
EPD_DATES %>%
  dplyr::filter(entity_name == "ZELLERSE") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  # dplyr::bind_rows(aux_rpd$date_info[1, ]) %>%
  dplyr::arrange(depth) %>%
  dplyr::mutate(ID_ENTITY = 1610,
                ID_DATE_INFO = 11149 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = !TRUE)
# Results: 13 records were inserted.

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "ZELLERSE")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## SALI0110 ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "SALI0110")
aux$date_info %>%
  dplyr::arrange(depth)

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "SALI0110")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)


## KILOMYR ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "KILOMYR")
aux
aux$date_info %>%
  dplyr::arrange(depth)
#### Dates ----
dabr::select(conn,
             "SELECT * FROM date_info WHERE ID_ENTITY = 1089 AND ID_DATE_INFO = 2022")
dabr::update(conn,
             "UPDATE date_info SET depth = 743.75, thickness = 1.5 WHERE",
             "ID_ENTITY = 1089 AND ID_DATE_INFO = 2022 AND depth = 743.3")
# Results: 1 record was updated.

#### Samples ----
aux_rpd <- extract_rpd(ID_ENTITY = 2267)
aux_rpd$sample
aux$sample
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "KILOMYR") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(thickness = 1) %>%
  dplyr::mutate(ID_ENTITY = 1089,
                ID_SAMPLE = c(15742:15798, 17446:17451),
                .before = 1)
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = TRUE, PK = 1:2)
# Results: 63 records were updated.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 5859

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 5103

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 3654

# Delete objects with counts
rm(aux_samples_counts, aux_samples_counts_2, aux_samples_counts_4)
rm(aux_samples_counts_5, aux_samples_counts_6)

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "KILOMYR")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

## JELENILO ----
aux <- conn %>%
  special.epd::snapshot(entity_name = "JELENILO")
aux

#### Dates ----
aux_rpd <- extract_rpd(ID_ENTITY = 2455)
aux_rpd$date_info
EPD_DATES %>%
  dplyr::filter(entity_name == "JELENILO") %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::mutate(ID_ENTITY = 599,
                ID_DATE_INFO = 11162 + seq_along(ID_ENTITY),
                .before = 1) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 6 records were inserted.

#### Samples ----
aux_rpd$sample
aux$sample$ID_SAMPLE %>% dput
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name == "JELENILO") %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:6)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::mutate(thickness = 1) %>%
  dplyr::mutate(ID_ENTITY = 599,
                ID_SAMPLE = 17221:17259,
                .before = 1)
aux_samples_counts %>%
  dplyr::select(1:9) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = TRUE, PK = 1:2)
# Results: 39 records were updated.

#### Counts ----
aux_samples_counts_2 <- aux_samples_counts %>%
  dplyr::select(-c(1, 3:9))

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_2[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 2379

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 2301

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 2067

# Delete objects with counts
rm(aux_samples_counts, aux_samples_counts_2, aux_samples_counts_4)
rm(aux_samples_counts_5, aux_samples_counts_6)

#### Age model ----
am <- find_age_models("~/Downloads/21.3.22 outstanding models/", "JELENILO")
am
am %>%
  purrr::pmap_df(upload_age_model, conn = conn)

# EPD exclusive ----
## (HERE) Upload age models ----
entities_with_am <- c("TELAKKO",
                      "BALLYNAH",
                      "BORIE",
                      "CBB75",
                      "DISTRE",
                      "FOUGS942",
                      "GARATOUE",
                      "GL",
                      "GLUBOKOE",
                      "GRAECH51",
                      "HALSJON",
                      "MD992348",
                      "STONETOR",
                      "WILDMOOS"
                      )
pending_am <-
  find_age_models("~/Downloads/special_epd_am/",
                  entity_name = entities_with_am)
pending_am
# Check if the age models already exist in the DB
special.epd::snapshot(conn, entity_name = entities_with_am) %>% .$age_model

pending_am2 <- pending_am %>%
  purrr::pmap_df(upload_age_model, conn = conn)
# Check if all the age models were uploaded
all(pending_am2$status)
pending_am2
pending_am3 <-
  pending_am2$am %>%
  purrr::map_df(~.x) %>%
  magrittr::set_names(colnames(.) %>% stringr::str_to_upper())
pending_am3 %>%
  dplyr::group_by(ID_SAMPLE) %>%
  dplyr::mutate(n = length(ID_SAMPLE)) %>%
  dplyr::filter(n > 1)

EPD_AM <- dabr::select(conn,
                       "SELECT * FROM age_model WHERE ID_SAMPLE IN (",
                       paste0(pending_am3$ID_SAMPLE, collapse = ", "),
                       ") AND ID_MODEL = 8")
waldo::compare(pending_am3 %>%
                 dplyr::arrange(ID_SAMPLE),
               EPD_AM %>%
                 magrittr::set_names(colnames(.) %>% stringr::str_to_upper()) %>%
                 dplyr::arrange(ID_SAMPLE))

## Summary ----
epd_exclusive_repatriation <- epd_repatriation_tmp_file %>%
  readxl::read_excel(sheet = 6) %>%
  janitor::clean_names()

epd_exclusive_repatriation_db_dump <- conn %>%
  special.epd::snapshot(entity_name = epd_exclusive_repatriation$entity_name)
epd_exclusive_repatriaition_entities_with_dates <-
  epd_exclusive_repatriation_db_dump$date_info %>%
  # dplyr::distinct(ID_ENTITY) %>%
  dplyr::group_by(ID_ENTITY) %>%
  dplyr::summarise(n_dates = dplyr::n()) %>%
  dplyr::left_join(epd_exclusive_repatriation_db_dump$entity %>%
                     dplyr::select(1:4),
                   by = "ID_ENTITY") %>%
  dplyr::mutate(DATES = TRUE)

epd_exclusive_repatriaition_entities_with_samples <-
  epd_exclusive_repatriation_db_dump$sample %>%
  # dplyr::distinct(ID_ENTITY) %>%
  dplyr::group_by(ID_ENTITY) %>%
  dplyr::summarise(n_samples = dplyr::n()) %>%
  dplyr::left_join(epd_exclusive_repatriation_db_dump$entity %>%
                     dplyr::select(1:4),
                   by = "ID_ENTITY") %>%
  dplyr::mutate(SAMPLES = TRUE)

epd_exclusive_repatriaition_entities_with_am <-
  epd_exclusive_repatriation_db_dump$age_model %>%
  dplyr::distinct(ID_SAMPLE) %>%
  dplyr::left_join(epd_exclusive_repatriation_db_dump$sample,
                   by = "ID_SAMPLE") %>%
  dplyr::group_by(ID_ENTITY) %>%
  dplyr::summarise(n_am = dplyr::n()) %>%
  dplyr::left_join(epd_exclusive_repatriation_db_dump$entity %>%
                     dplyr::select(1:4),
                   by = "ID_ENTITY") %>%
  dplyr::mutate(AM = TRUE)

epd_exclusive_repatriaition_entities_with_counts_0 <-
  epd_exclusive_repatriation_db_dump$pollen_count$clean %>%
  dplyr::distinct(ID_SAMPLE) %>%
  dplyr::left_join(epd_exclusive_repatriation_db_dump$sample,
                   by = "ID_SAMPLE") %>%
  dplyr::group_by(ID_ENTITY) %>%
  dplyr::summarise(n_counts_0 = dplyr::n()) %>%
  dplyr::left_join(epd_exclusive_repatriation_db_dump$entity %>%
                     dplyr::select(1:4),
                   by = "ID_ENTITY") %>%
  dplyr::mutate(COUNTS_0 = TRUE)
epd_exclusive_repatriaition_entities_with_counts_1 <-
  epd_exclusive_repatriation_db_dump$pollen_count$intermediate %>%
  dplyr::distinct(ID_SAMPLE) %>%
  dplyr::left_join(epd_exclusive_repatriation_db_dump$sample,
                   by = "ID_SAMPLE") %>%
  dplyr::group_by(ID_ENTITY) %>%
  dplyr::summarise(n_counts_1 = dplyr::n()) %>%
  dplyr::left_join(epd_exclusive_repatriation_db_dump$entity %>%
                     dplyr::select(1:4),
                   by = "ID_ENTITY") %>%
  dplyr::mutate(COUNTS_1 = TRUE)
epd_exclusive_repatriaition_entities_with_counts_2 <-
  epd_exclusive_repatriation_db_dump$pollen_count$amalgamated %>%
  dplyr::distinct(ID_SAMPLE) %>%
  dplyr::left_join(epd_exclusive_repatriation_db_dump$sample,
                   by = "ID_SAMPLE") %>%
  dplyr::group_by(ID_ENTITY) %>%
  dplyr::summarise(n_counts_2 = dplyr::n()) %>%
  dplyr::left_join(epd_exclusive_repatriation_db_dump$entity %>%
                     dplyr::select(1:4),
                   by = "ID_ENTITY") %>%
  dplyr::mutate(COUNTS_2 = TRUE)


epd_exclusive_repatriation %>%
  dplyr::select(-site_name) %>%
  dplyr::left_join(epd_exclusive_repatriaition_entities_with_dates) %>%
  dplyr::left_join(epd_exclusive_repatriaition_entities_with_samples) %>%
  dplyr::left_join(epd_exclusive_repatriaition_entities_with_am) %>%
  dplyr::left_join(epd_exclusive_repatriaition_entities_with_counts_0) %>%
  dplyr::left_join(epd_exclusive_repatriaition_entities_with_counts_1) %>%
  dplyr::left_join(epd_exclusive_repatriaition_entities_with_counts_2) %>%
  dplyr::mutate(has_DATES = ifelse(is.na(DATES), FALSE, DATES),
                has_SAMPLES = ifelse(is.na(SAMPLES), FALSE, SAMPLES),
                has_AM = ifelse(is.na(AM), FALSE, AM),
                COUNTS_0 = ifelse(is.na(COUNTS_0), FALSE, COUNTS_0),
                COUNTS_1 = ifelse(is.na(COUNTS_1), FALSE, COUNTS_1),
                COUNTS_2 = ifelse(is.na(COUNTS_2), FALSE, COUNTS_2),
                has_COUNTS = COUNTS_0 & COUNTS_1 & COUNTS_2) %>%
  dplyr::left_join(epd_age_models %>%
                     dplyr::select(-site_id, -site_name_clean)) %>%
  dplyr::relocate(n_dates, n_samples, n_am,
                  .before = has_DATES) %>%
  readr::write_excel_csv("~/Downloads/special-epd_epd-exclusive_summary.csv")



external_link_tb <- dabr::select_all(conn, "external_link") %>%
  dplyr::group_by(ID_ENTITY) %>%
  dplyr::mutate(n = dplyr::n())

records_from_rpd <- tibble::tibble(entity_name = rpd_repatriation$entity_name, db = "RPD")
records_from_emb <- tibble::tibble(entity_name = embsecbio_repatriation$entity_name, db = "EMBSeCBIO")
records_from_ibe <- tibble::tibble(entity_name = iberia_repatriation$entity_name, db = "IBERIA")
records_from_ib2 <- tibble::tibble(entity_name = iberia_extra_repatriation$entity_name, db = "IBERIA")

records_from_other_sources <- dplyr::bind_rows(
  records_from_rpd,
  records_from_emb,
  records_from_ibe,
  records_from_ib2
) %>%
  dplyr::left_join(dabr::select(conn,
                                "SELECT ID_ENTITY, entity_name",
                                "FROM entity"),
                   by = "entity_name") %>%
  dplyr::relocate(ID_ENTITY, .before = 1)

EPD_exclusive <- external_link_tb %>%
  dplyr::filter(n == 1)
# Check if any records have been already repatriated
EPD_exclusive_snap <- conn %>%
  special.epd::snapshot(ID_ENTITY = EPD_exclusive$ID_ENTITY)
with_dates <- EPD_exclusive_snap$date_info %>%
  dplyr::filter(ID_ENTITY %in% records_from_other_sources$ID_ENTITY) %>%
  dplyr::distinct(ID_ENTITY) %>%
  dplyr::arrange(ID_ENTITY) %>%
  purrr::flatten_dbl()
  # unique(EPD_exclusive_snap$date_info$ID_ENTITY) %>% sort()
with_samples <- EPD_exclusive_snap$sample %>%
  dplyr::filter(ID_ENTITY %in% records_from_other_sources$ID_ENTITY) %>%
  dplyr::distinct(ID_ENTITY) %>%
  dplyr::arrange(ID_ENTITY) %>%
  purrr::flatten_dbl()
  # unique(EPD_exclusive_snap$sample$ID_ENTITY) %>% sort()

entities_from_other_dbs <- external_link_tb %>%
  dplyr::filter(n > 1)
# EPD_exclusive %>%
#   dplyr::filter(!(ID_ENTITY %in% records_from_other_sources$ID_ENTITY)) %>%
#   readr::write_excel_csv("~/Downloads/epd.csv", na = "")

## Dates ----
epd_repatriated_dates_info <- EPD_DATES %>%
  dplyr::filter(entity_name %in% EPD_exclusive$external_entity_name) %>%
  dplyr::left_join(dabr::select(conn,
                                "SELECT ID_ENTITY, entity_name",
                                "FROM entity"),
                   by = "entity_name") %>%
  # dplyr::left_join(non_rpd_repatriated_dates_info_3 %>%
  #                    dplyr::select(ID_ENTITY, site_id, entity_name)) %>%
  dplyr::select(-ages_already, -site_id, -site_name, -site_name_clean, -entity_name) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::filter(!(ID_ENTITY %in% records_from_other_sources$ID_ENTITY)) %>%
  dplyr::filter(!(ID_ENTITY %in% with_dates)) %>%
  dplyr::mutate(ID_DATE_INFO = 2830 + seq_along(ID_ENTITY), .after = 1) #get_id_date_info(conn)

if (nrow(epd_repatriated_dates_info) == 0)
  epd_repatriated_dates_info <- dabr::select(conn,
                                             "SELECT * FROM date_info WHERE ID_DATE_INFO BETWEEN 2831 AND 9938")
# epd_repatriated_dates_info <- conn %>%
#   dabr::select("SELECT * FROM date_info WHERE ID_DATE_INFO BETWEEN 2831 AND 9938")

# Check for existing dates
special.epd::snapshot(conn, ID_ENTITY = epd_repatriated_dates_info$ID_ENTITY)
meta_neo_res <- seq_len(nrow(epd_repatriated_dates_info)) %>%
  purrr::map(function(i) {
    epd_repatriated_dates_info[i, ] %>%
      rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
  })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_DATES_NEO_DB <- dabr::select_all(conn, "date_info") %>%
  dplyr::filter(ID_ENTITY %in% epd_repatriated_dates_info$ID_ENTITY)
waldo::compare(epd_repatriated_dates_info %>%
                 .[order(colnames(.))] %>%
                 dplyr::arrange(ID_ENTITY, depth),
               EPD_DATES_NEO_DB %>%
                 .[order(colnames(.))] %>%
                 # dplyr::select(-ID_DATE_INFO) %>%
                 dplyr::arrange(ID_ENTITY, depth),
               tolerance = 1E-9,
               max_diffs = Inf)
EPD_DATES_NEO_DB %>%
  dplyr::filter(ID_ENTITY %in% records_from_other_sources$ID_ENTITY)


##Samples ----
epd_repatriated_samples <- EPD_COUNTS %>%
  dplyr::filter(entity_name %in% EPD_exclusive$external_entity_name) %>%
  # smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-chronology_id) %>%
  dplyr::left_join(EPD_METADATA %>%
                     dplyr::select(1:3, 6, 10),
                   by = c("site_id", "dataset_id", "entity_name")) %>%
  dplyr::relocate(ID_SITE, ID_ENTITY, .before = 1) %>%
  dplyr::filter(!(ID_ENTITY %in% records_from_other_sources$ID_ENTITY)) %>%
  dplyr::filter(!(ID_ENTITY %in% with_samples))
if (nrow(epd_repatriated_samples) == 0)
  epd_repatriated_dates_info <- dabr::select(conn,
                                             "SELECT * FROM sample WHERE ID_SAMPLE BETWEEN 29059 AND 101315")

epd_repatriated_samples

epd_repatriated_samples_2 <- epd_repatriated_samples %>%
  dplyr::mutate(ID_SAMPLE = 29059 + seq_along(ID_ENTITY), # get_id_sample(conn)
                .after = ID_ENTITY) %>%
  dplyr::select(-ID_SITE, -site_id, -site_name, -site_name_clean, -dataset_id, -dataset_name, -entity_name, -sample_id, -unit_name)

# Check if the "new" records are already in th DB:
special.epd::snapshot(conn, ID_ENTITY = epd_repatriated_samples_2$ID_ENTITY)
dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_SAMPLE %in% epd_repatriated_samples_2$ID_SAMPLE |
                  ID_ENTITY %in% epd_repatriated_samples_2$ID_ENTITY)
meta_neo_res <- seq_len(nrow(epd_repatriated_samples_2)) %>%
  purrr::map(function(i) {
    epd_repatriated_samples_2[i, ] %>%
      dplyr::select(1:9) %>%
      rpd:::add_records(conn = conn, table = "sample", dry_run = TRUE)
  })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()
##### Validate -----
EPD_SAMPLES_NEO_DB <- dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_ENTITY %in% epd_repatriated_samples_2$ID_ENTITY)
waldo::compare(epd_repatriated_samples_2 %>%
                 dplyr::select(1:9) %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               age = round(age)),
               EPD_SAMPLES_NEO_DB %>%
                 .[order(colnames(.))] %>%
                 dplyr::mutate(depth = round(depth, 3),
                               age = round(age)) %>%
                 dplyr::select(-count_type, -sample_type),
               tolerance = 2)

## (HERE) Age models ----
epd_repatriated_new_am <-
  find_age_models(path, EPD_exclusive$external_entity_name)
# Find duplicated entries
epd_repatriated_new_am %>%
  dplyr::group_by(entity_name) %>%
  dplyr::mutate(n = dplyr::n()) %>%
  dplyr::mutate(path = path %>%
                  stringr::str_remove_all("/Users/roberto.villegas-diaz/Downloads/special_epd_am/")) %>%
  dplyr::filter(n > 1) %>%
  dplyr::arrange(entity_name, subfolder) #%>%
  # dplyr::select(entity_name, subfolder, path) %>%
  # readr::write_excel_csv("~/Downloads/epd_new_age_models_duplicated.csv")
epd_repatriated_new_am %>%
  dplyr::filter(entity_name %in% epd_age_models_ready_to_upload$entity_name)
epd_repatriated_new_am2 <- epd_repatriated_new_am %>%
  dplyr::filter(entity_name %in% epd_age_models_ready_to_upload$entity_name) %>%
  purrr::pmap_df(upload_age_model, conn = conn)

all(epd_repatriated_new_am2$status)
# if(!all(epd_repatriated_new_am2$status)) {
#   epd_repatriated_new_am2_v2 <- epd_repatriated_new_am2 %>%
#     dplyr::filter(!status) %>%
#     dplyr::select(-status, -reason) %>%
#     purrr::pmap_df(upload_age_model2, conn = conn)
# }
#
# all(epd_repatriated_new_am2_v2$status)
# if(!all(epd_repatriated_new_am2_v2$status)) {
#   epd_repatriated_new_am2_v3 <- epd_repatriated_new_am %>%
#     dplyr::filter(entity_name %in%
#                     (epd_repatriated_new_am2_v2 %>%
#                        dplyr::filter(!status) %>%
#                        .$entity_name)) %>%
#     purrr::pmap_df(upload_age_model2, conn = conn)
# }
# all(epd_repatriated_new_am2_v3$status)
#
# epd_repatriated_new_am2 %>%
#   dplyr::filter(!status) %>%
#   .$am %>%
#   purrr::map_df(~.x)

epd_repatriated_new_am3 <-
  epd_repatriated_new_am2 %>%
  dplyr::filter(status) %>%
  # dplyr::bind_rows(epd_repatriated_new_am2_v2 %>% dplyr::filter(status)) %>%#,
                   # epd_repatriated_new_am2_v3 %>% dplyr::filter(status)) %>%
  .$am %>%
  purrr::map_df(~.x) %>%
  magrittr::set_names(colnames(.) %>%
                        stringr::str_to_upper() %>%
                        stringr::str_replace_all("MEAN", "mean") %>%
                        stringr::str_replace_all("MEDIAN", "median")) %>%
  dplyr::distinct(ID_SAMPLE, .keep_all = TRUE) %>%
  dplyr::arrange(ID_SAMPLE)

EPD_AM <- dabr::select(conn,
                       "SELECT * FROM age_model WHERE ID_SAMPLE IN (",
                       paste0(epd_repatriated_new_am3$ID_SAMPLE, collapse = ", "),
                       ") AND ID_MODEL = 8")
waldo::compare(epd_repatriated_new_am3,
               EPD_AM)

# a <- epd_repatriated_new_am3[-seq_len(40000),]
# b <- EPD_AM[-seq_len(40000),]
# a[378,]
# b[378,]
# waldo::compare(a[seq_len(1000),],
#                b[seq_len(1000),])

special.epd::snapshot(conn, entity_name = epd_repatriated_new_am$entity_name)

# NOTE: Pending age models
epd_repatriated_new_am %>%
  dplyr::filter(!(entity_name %in% epd_age_models_ready_to_upload$entity_name)) %>%
  dplyr::select(-am)

tibble::tribble(
  ~entity_name,                   ~subfolder,                                                                                                            ~path,
  "ZS-9",              "Coastal sites",                  "/Users/roberto.villegas-diaz/Downloads/special_epd_am/Coastal sites/ZS-9/bacon_chronology.csv",
  "TELAKKO",        "PL Coastal entities",         "/Users/roberto.villegas-diaz/Downloads/special_epd_am/PL Coastal entities/TELAKKO/bacon_chronology.csv",
  "VK12",        "PL Coastal entities",            "/Users/roberto.villegas-diaz/Downloads/special_epd_am/PL Coastal entities/VK12/bacon_chronology.csv",
  "BXBX", "YS Batch2 (IBERIA-missing)",     "/Users/roberto.villegas-diaz/Downloads/special_epd_am/YS Batch2 (IBERIA-missing)/BXBX/bacon_chronology.csv",
  "MD992348", "YS Batch2 (IBERIA-missing)", "/Users/roberto.villegas-diaz/Downloads/special_epd_am/YS Batch2 (IBERIA-missing)/MD992348/bacon_chronology.csv",
  "TG8", "YS Batch2 (IBERIA-missing)",      "/Users/roberto.villegas-diaz/Downloads/special_epd_am/YS Batch2 (IBERIA-missing)/TG8/bacon_chronology.csv"
)


## (HERE) Counts ----
# Find samples with existing pollen records
# existing_pollen_counts <- conn %>%
#   dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
#                paste0(epd_repatriated_samples_2$ID_SAMPLE, collapse = ", "),
#                ")",
#                quiet = TRUE)

aux <- conn %>%
  special.epd::snapshot(ID_ENTITY = EPD_exclusive$ID_ENTITY)
  # special.epd::snapshot(ID_ENTITY = epd_repatriated_samples_2$ID_ENTITY)
existing_pollen_counts <-
  aux$pollen_count %>%
  purrr::map_df(~.x) %>% # Combine the three pollen count tables
  dplyr::select(ID_SAMPLE) %>%
  dplyr::group_by(ID_SAMPLE) %>%
  dplyr::summarise(n = dplyr::n())

existing_pollen_counts %>%
  dplyr::filter(n != 3) # Verify if sample does not have the 3 tables

epd_repatriated_samples_3 <- epd_repatriated_samples_2 %>%
  dplyr::select(-ID_ENTITY, -depth, -thickness, -chronology_name, -age_type, -age, -age_younger, -age_older)

epd_repatriated_samples_3 %>%
  dplyr::filter(ID_SAMPLE %in% existing_pollen_counts$ID_SAMPLE)
taxon_name_tb <- dabr::select_all(conn, "taxon_name")
data("EPD_taxa_amalgamation")
##### Clean ----
oplan <- future::plan(future::multisession, workers = 8)
options(future.globals.maxSize = 2000*1024^2)
epd_repatriated_samples_4 <-
  seq_len(nrow(epd_repatriated_samples_3)) %>%
  furrr::future_map_dfr(function(i) {
    epd_repatriated_samples_3[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)
future::plan(oplan)

epd_repatriated_samples_4 %>%
  dplyr::filter(is.na(ID_TAXON))
dim(epd_repatriated_samples_4)

# IDS <- unique(epd_repatriated_samples_4$ID_SAMPLE) %>% sort()
# idx <- idx_pairs(length(IDS), 1000)
# purrr::map2(idx$x, idx$y, ~dabr::delete(conn,
#                                         "DELETE FROM pollen_count WHERE ID_SAMPLE IN (",
#                                         paste0(IDS[.x:.y], collapse = ","),
#                                         ")"))
aux <- special.epd::snapshot(conn,
                             ID_ENTITY = epd_repatriated_samples_2$ID_ENTITY)
epd_repatriated_samples_4 %>%
  dplyr::filter(ID_SAMPLE %in% aux$pollen_count$clean$ID_SAMPLE)
# dabr::delete(conn,
#              "DELETE FROM pollen_count WHERE ID_SAMPLE IN (",
#              paste0(unique(aux$pollen_count$clean$ID_SAMPLE), collapse = ","),
#              ")")


epd_repatriated_samples_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx <- idx_pairs(nrow(epd_repatriated_samples_4), 1000)
pb <- progress::progress_bar$new(total = nrow(idx))
# meta_neo_res <- seq_len(nrow(epd_repatriated_samples_4)) %>%
meta_neo_res <-
  purrr::map2(idx$x,
              idx$y,
              ~ {
                pb$tick()
                epd_repatriated_samples_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count", dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- dabr::select_all(conn, "pollen_count") %>%
  dplyr::filter(ID_SAMPLE %in% epd_repatriated_samples_4$ID_SAMPLE,
                amalgamation_level == 0)
waldo::compare(epd_repatriated_samples_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1e-9)
# 5655448
##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

# EPD_taxa_amalgamation_stage2 %>%
# dplyr::filter(intermediate != intermediate2)
oplan <- future::plan(future::multisession, workers = 8)
epd_repatriated_samples_5 <-
  unique(epd_repatriated_samples_4$ID_SAMPLE) %>%
  furrr::future_map_dfr(function(ID_SAMPLE) {
    epd_repatriated_samples_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)
future::plan(oplan)

epd_repatriated_samples_5 %>%
  dplyr::filter(is.na(ID_TAXON))

epd_repatriated_samples_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(epd_repatriated_samples_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                epd_repatriated_samples_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count", dry_run = TRUE, quiet = TRUE)
              })

# meta_neo_res <- seq_len(nrow(epd_repatriated_samples_5)) %>%
#   purrr::map(function(i) {
#     if (i %% 10000 == 0)
#       print(i)
#     epd_repatriated_samples_5[i, ] %>%
#       rpd:::add_records(conn = conn, table = "pollen_count", dry_run = TRUE, quiet = TRUE)
#   })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- dabr::select_all(conn, "pollen_count") %>%
  dplyr::filter(ID_SAMPLE %in% epd_repatriated_samples_5$ID_SAMPLE,
                amalgamation_level == 1)
waldo::compare(epd_repatriated_samples_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1e-9)

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

# EPD_taxa_amalgamation_stage3 %>%
#   dplyr::filter(amalgamated != amalgamated2)
oplan <- future::plan(future::multisession, workers = 8)
epd_repatriated_samples_6 <-
  unique(epd_repatriated_samples_5$ID_SAMPLE) %>%
  furrr::future_map_dfr(function(ID_SAMPLE) {
    epd_repatriated_samples_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)
future::plan(oplan)

epd_repatriated_samples_6 %>%
  dplyr::filter(is.na(ID_TAXON)|is.na(count))

epd_repatriated_samples_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(epd_repatriated_samples_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                epd_repatriated_samples_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count", dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- dabr::select_all(conn, "pollen_count") %>%
  dplyr::filter(ID_SAMPLE %in% epd_repatriated_samples_6$ID_SAMPLE,
                amalgamation_level == 2)
waldo::compare(epd_repatriated_samples_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1e-9)

aux <- special.epd::snapshot(conn,
                             ID_ENTITY = epd_repatriated_samples_2$ID_ENTITY)
aux$entity %>%
  dplyr::filter(ID_ENTITY %in% aux$sample$ID_ENTITY) %>%
  .$ID_ENTITY %>% dput
non_embsecbio_repatriated_am_info %>%
  dplyr::filter(ID_ENTITY %in% epd_repatriated_samples_2$ID_ENTITY)
aux2

entities_with_pollen_counts <-
  c(1L, 3L, 4L, 6L, 7L, 8L, 9L, 10L, 12L, 13L, 14L, 15L, 16L, 17L,
    19L, 20L, 22L, 23L, 24L, 25L, 26L, 27L, 28L, 29L, 30L, 32L, 33L,
    35L, 36L, 37L, 38L, 40L, 41L, 43L, 44L, 45L, 46L, 47L, 48L, 50L,
    52L, 53L, 54L, 55L, 56L, 57L, 58L, 59L, 60L, 61L, 62L, 63L, 68L,
    69L, 70L, 71L, 72L, 73L, 74L, 75L, 77L, 81L, 85L, 86L, 87L, 88L,
    91L, 92L, 95L, 96L, 97L, 98L, 99L, 101L, 103L, 104L, 105L, 106L,
    107L, 108L, 109L, 110L, 111L, 112L, 113L, 114L, 115L, 116L, 117L,
    118L, 119L, 120L, 121L, 122L, 123L, 124L, 125L, 126L, 129L, 130L,
    131L, 133L, 134L, 135L, 136L, 137L, 138L, 139L, 142L, 144L, 145L,
    146L, 147L, 148L, 149L, 150L, 152L, 153L, 154L, 155L, 156L, 158L,
    160L, 163L, 164L, 165L, 166L, 167L, 168L, 169L, 170L, 171L, 172L,
    175L, 179L, 180L, 181L, 182L, 183L, 184L, 185L, 186L, 187L, 188L,
    189L, 190L, 191L, 193L, 194L, 195L, 196L, 197L, 198L, 199L, 200L,
    201L, 204L, 205L, 207L, 208L, 209L, 210L, 211L, 212L, 213L, 214L,
    215L, 216L, 220L, 222L, 225L, 227L, 228L, 229L, 230L, 231L, 232L,
    233L, 236L, 238L, 239L, 243L, 244L, 245L, 246L, 247L, 248L, 249L,
    250L, 251L, 252L, 253L, 254L, 255L, 256L, 257L, 259L, 260L, 261L,
    262L, 263L, 264L, 267L, 268L, 269L, 270L, 271L, 272L, 273L, 275L,
    276L, 277L, 278L, 279L, 280L, 281L, 282L, 283L, 284L, 285L, 286L,
    287L, 288L, 289L, 290L, 291L, 292L, 293L, 294L, 295L, 296L, 297L,
    298L, 299L, 300L, 301L, 302L, 303L, 304L, 305L, 306L, 307L, 308L,
    309L, 310L, 311L, 312L, 313L, 315L, 316L, 317L, 318L, 319L, 320L,
    321L, 322L, 325L, 326L, 327L, 328L, 329L, 333L, 334L, 335L, 336L,
    337L, 338L, 339L, 341L, 342L, 345L, 346L, 347L, 348L, 350L, 351L,
    352L, 353L, 354L, 355L, 356L, 357L, 358L, 359L, 360L, 361L, 362L,
    363L, 364L, 366L, 367L, 368L, 369L, 370L, 371L, 372L, 374L, 375L,
    376L, 377L, 378L, 382L, 386L, 388L, 389L, 390L, 391L, 392L, 393L,
    394L, 395L, 396L, 397L, 399L, 402L, 403L, 404L, 407L, 408L, 409L,
    410L, 411L, 412L, 413L, 414L, 415L, 416L, 417L, 418L, 419L, 420L,
    421L, 422L, 423L, 424L, 425L, 426L, 428L, 429L, 430L, 431L, 433L,
    434L, 435L, 436L, 437L, 438L, 439L, 441L, 442L, 443L, 444L, 445L,
    446L, 447L, 448L, 449L, 451L, 452L, 453L, 455L, 456L, 457L, 458L,
    459L, 460L, 461L, 462L, 463L, 465L, 466L, 467L, 468L, 470L, 471L,
    472L, 473L, 474L, 475L, 477L, 479L, 480L, 482L, 483L, 484L, 485L,
    486L, 487L, 488L, 489L, 491L, 492L, 493L, 497L, 498L, 500L, 504L,
    505L, 506L, 507L, 508L, 511L, 512L, 513L, 514L, 515L, 516L, 517L,
    519L, 520L, 521L, 522L, 523L, 524L, 525L, 526L, 527L, 528L, 529L,
    530L, 533L, 534L, 535L, 536L, 537L, 538L, 539L, 540L, 542L, 543L,
    544L, 545L, 546L, 547L, 548L, 550L, 551L, 552L, 553L, 554L, 556L,
    557L, 558L, 559L, 560L, 561L, 564L, 565L, 566L, 568L, 573L, 574L,
    575L, 576L, 577L, 578L, 579L, 580L, 581L, 584L, 586L, 587L, 588L,
    590L, 591L, 592L, 593L, 594L, 595L, 596L, 597L, 598L, 600L, 601L,
    602L, 603L, 604L, 605L, 606L, 607L, 608L, 609L, 610L, 611L, 612L,
    613L, 614L, 615L, 616L, 617L, 619L, 620L, 622L, 624L, 625L, 626L,
    630L, 631L, 632L, 633L, 634L, 635L, 637L, 638L, 639L, 641L, 642L,
    643L, 645L, 646L, 647L, 648L, 649L, 650L, 651L, 654L, 655L, 656L,
    657L, 658L, 659L, 660L, 661L, 662L, 663L, 664L, 666L, 670L, 671L,
    672L, 673L, 674L, 675L, 676L, 677L, 679L, 680L, 681L, 682L, 683L,
    684L, 685L, 686L, 687L, 688L, 689L, 691L, 692L, 693L, 694L, 695L,
    696L, 697L, 698L, 699L, 701L, 703L, 704L, 705L, 706L, 707L, 708L,
    709L, 710L, 711L, 712L, 713L, 714L, 715L, 716L, 717L, 718L, 719L,
    720L, 721L, 722L, 723L, 724L, 725L, 726L, 728L, 729L, 730L, 731L,
    732L, 733L, 734L, 735L, 736L, 737L, 738L, 739L, 742L, 744L, 745L,
    746L, 747L, 748L, 749L, 750L, 752L, 753L, 757L, 763L, 764L, 767L,
    768L, 769L, 770L, 771L, 777L, 781L, 782L, 783L, 785L, 786L, 787L,
    788L, 789L, 792L, 793L, 794L, 795L, 796L, 797L, 798L, 799L, 800L,
    801L, 803L, 804L, 806L, 807L, 808L, 809L, 811L, 812L, 813L, 814L,
    815L, 816L, 817L, 818L, 822L, 824L, 825L, 829L, 830L, 834L, 836L,
    837L, 839L, 841L, 843L, 846L, 852L, 854L, 855L, 856L, 857L, 858L,
    859L, 860L, 862L, 865L, 866L, 867L, 868L, 869L, 870L, 871L, 872L,
    873L, 874L, 875L, 876L, 877L, 878L, 879L, 880L, 881L, 882L, 883L,
    884L, 885L, 886L, 887L, 888L, 891L, 892L, 893L, 894L, 895L, 896L,
    897L, 898L, 899L, 900L, 902L, 903L, 905L, 906L, 907L, 908L, 909L,
    911L, 913L, 914L, 915L, 916L, 917L, 918L, 919L, 920L, 921L, 922L,
    924L, 927L, 928L, 930L, 931L, 932L, 933L, 934L, 936L, 938L, 939L,
    940L, 942L, 943L, 944L, 945L, 946L, 948L, 949L, 950L, 951L, 952L,
    953L, 954L, 955L, 956L, 957L, 958L, 959L, 960L, 961L, 963L, 964L,
    965L, 966L, 967L, 968L, 969L, 970L, 971L, 972L, 973L, 974L, 977L,
    979L, 980L, 981L, 982L, 983L, 984L, 985L, 986L, 987L, 989L, 990L,
    991L, 992L, 995L, 996L, 997L, 998L, 999L, 1000L, 1001L, 1002L,
    1003L, 1004L, 1005L, 1006L, 1007L, 1008L, 1009L, 1010L, 1011L,
    1012L, 1013L, 1014L, 1015L, 1017L, 1018L, 1020L, 1021L, 1022L,
    1023L, 1024L, 1025L, 1026L, 1027L, 1028L, 1029L, 1030L, 1031L,
    1032L, 1033L, 1034L, 1035L, 1039L, 1040L, 1041L, 1042L, 1043L,
    1044L, 1045L, 1046L, 1047L, 1048L, 1049L, 1050L, 1051L, 1053L,
    1054L, 1055L, 1056L, 1057L, 1058L, 1059L, 1060L, 1061L, 1062L,
    1065L, 1066L, 1067L, 1068L, 1069L, 1070L, 1072L, 1073L, 1074L,
    1075L, 1076L, 1077L, 1080L, 1081L, 1082L, 1083L, 1084L, 1085L,
    1087L, 1090L, 1092L, 1094L, 1097L, 1099L, 1100L, 1101L, 1102L,
    1103L, 1104L, 1105L, 1106L, 1109L, 1110L, 1111L, 1112L, 1113L,
    1114L, 1116L, 1118L, 1119L, 1120L, 1121L, 1122L, 1123L, 1124L,
    1125L, 1126L, 1127L, 1128L, 1129L, 1130L, 1131L, 1133L, 1134L,
    1135L, 1137L, 1138L, 1139L, 1140L, 1141L, 1142L, 1144L, 1146L,
    1147L, 1148L, 1151L, 1156L, 1160L, 1161L, 1162L, 1163L, 1164L,
    1165L, 1167L, 1168L, 1169L, 1170L, 1171L, 1173L, 1174L, 1175L,
    1178L, 1179L, 1180L, 1181L, 1182L, 1183L, 1184L, 1185L, 1186L,
    1187L, 1189L, 1190L, 1191L, 1192L, 1194L, 1195L, 1196L, 1200L,
    1202L, 1204L, 1205L, 1207L, 1212L, 1213L, 1214L, 1215L, 1216L,
    1220L, 1221L, 1222L, 1223L, 1224L, 1225L, 1226L, 1227L, 1229L,
    1230L, 1231L, 1233L, 1234L, 1235L, 1236L, 1238L, 1239L, 1241L,
    1242L, 1243L, 1244L, 1245L, 1246L, 1247L, 1249L, 1250L, 1251L,
    1252L, 1253L, 1254L, 1255L, 1256L, 1257L, 1258L, 1259L, 1260L,
    1261L, 1262L, 1263L, 1264L, 1265L, 1266L, 1267L, 1268L, 1269L,
    1270L, 1271L, 1273L, 1274L, 1275L, 1277L, 1278L, 1279L, 1280L,
    1281L, 1282L, 1283L, 1284L, 1285L, 1286L, 1287L, 1288L, 1289L,
    1290L, 1291L, 1292L, 1293L, 1298L, 1299L, 1300L, 1301L, 1302L,
    1303L, 1304L, 1305L, 1306L, 1307L, 1308L, 1309L, 1310L, 1311L,
    1312L, 1313L, 1314L, 1315L, 1316L, 1317L, 1318L, 1319L, 1320L,
    1321L, 1323L, 1325L, 1326L, 1329L, 1330L, 1331L, 1332L, 1334L,
    1335L, 1338L, 1339L, 1340L, 1342L, 1343L, 1344L, 1345L, 1346L,
    1347L, 1348L, 1349L, 1350L, 1351L, 1352L, 1353L, 1354L, 1355L,
    1357L, 1358L, 1359L, 1360L, 1361L, 1362L, 1363L, 1364L, 1365L,
    1366L, 1367L, 1368L, 1370L, 1371L, 1372L, 1373L, 1375L, 1377L,
    1381L, 1382L, 1383L, 1384L, 1385L, 1386L, 1387L, 1388L, 1389L,
    1391L, 1392L, 1394L, 1397L, 1398L, 1399L, 1401L, 1402L, 1403L,
    1404L, 1405L, 1406L, 1407L, 1408L, 1409L, 1410L, 1411L, 1413L,
    1414L, 1415L, 1416L, 1417L, 1418L, 1419L, 1420L, 1421L, 1422L,
    1423L, 1424L, 1425L, 1426L, 1427L, 1428L, 1429L, 1430L, 1431L,
    1433L, 1434L, 1435L, 1437L, 1438L, 1439L, 1440L, 1441L, 1444L,
    1445L, 1446L, 1447L, 1449L, 1450L, 1452L, 1453L, 1454L, 1455L,
    1456L, 1457L, 1458L, 1459L, 1460L, 1461L, 1462L, 1463L, 1464L,
    1465L, 1466L, 1467L, 1468L, 1469L, 1471L, 1472L, 1474L, 1475L,
    1476L, 1478L, 1479L, 1480L, 1481L, 1482L, 1483L, 1486L, 1488L,
    1490L, 1491L, 1492L, 1493L, 1494L, 1495L, 1496L, 1497L, 1498L,
    1499L, 1500L, 1501L, 1502L, 1503L, 1504L, 1505L, 1508L, 1509L,
    1510L, 1511L, 1512L, 1513L, 1514L, 1515L, 1516L, 1517L, 1519L,
    1520L, 1521L, 1522L, 1523L, 1524L, 1525L, 1526L, 1528L, 1529L,
    1531L, 1533L, 1535L, 1536L, 1537L, 1538L, 1540L, 1541L, 1542L,
    1543L, 1544L, 1545L, 1546L, 1547L, 1548L, 1551L, 1552L, 1553L,
    1554L, 1555L, 1556L, 1557L, 1558L, 1559L, 1560L, 1561L, 1562L,
    1563L, 1564L, 1565L, 1566L, 1567L, 1568L, 1569L, 1570L, 1571L,
    1572L, 1573L, 1574L, 1575L, 1577L, 1580L, 1581L, 1582L, 1583L,
    1584L, 1585L, 1586L, 1587L, 1588L, 1589L, 1590L, 1591L, 1592L,
    1594L, 1597L, 1599L, 1600L, 1602L, 1604L, 1605L, 1606L, 1607L,
    1608L, 1609L, 1611L, 1612L, 1613L, 1614L, 1615L, 1616L, 1617L,
    1618L)

# Final clean-ups ----
## Remove empty dates ----
date_info_tb <- dabr::select_all(conn, "date_info")
date_info_tb %>%
  dplyr::filter(is.na(date_type),
                is.na(depth),
                is.na(thickness),
                is.na(age_used)) %>%
  .$ID_DATE_INFO %>% dput
dabr::select(conn,
             "SELECT * FROM date_info WHERE ID_DATE_INFO IN (",
             paste0(2864:2890, collapse = ", "),
             ")")
# dabr::delete(conn,
#              "DELETE FROM date_info WHERE ID_DATE_INFO IN (",
#              paste0(2864:2890, collapse = ", "),
#              ") AND date_type IS NULL")
# # Results: 27 records were deleted.

date_info_tb %>%
  dplyr::group_by(ID_ENTITY) %>%
  dplyr::filter(depth < 1)


# Update dates extracted from the RPD (wrong units: depth and thickness) ----
### Dates ----
date_info_tb <-
  dabr::select_all(conn, "date_info")

date_info_tb_dodgy_dates <-
  date_info_tb %>%
  dplyr::filter(depth < 1 | thickness < 1)

date_info_tb_dodgy_dates_external_links <-
  dabr::select_all(conn, "external_link") %>%
  dplyr::filter(ID_ENTITY %in% date_info_tb_dodgy_dates$ID_ENTITY) %>%
  dplyr::filter(external_source != "NEOTOMA")

date_info_tb_dodgy_dates_entities <-
  dabr::select_all(conn, "entity") %>%
  dplyr::filter(ID_ENTITY %in% date_info_tb_dodgy_dates$ID_ENTITY) %>%
  dplyr::left_join(date_info_tb_dodgy_dates_external_links) %>%
  dplyr::mutate(external_source =
                  ifelse(is.na(external_source), "NEOTOMA",
                         external_source))

##### RPD ----
date_info_tb_dodgy_dates_entities_RPD <-
  date_info_tb_dodgy_dates_entities %>%
  dplyr::filter(external_source == "RPD") %>%
  .$external_ID_ENTITY %>%
  extract_rpd()

date_info_tb_dodgy_dates_entities_RPD_EPD_dates <-
  date_info %>%
  dplyr::filter(ID_ENTITY %in% (
    date_info_tb_dodgy_dates_entities %>%
      dplyr::filter(external_source == "RPD") %>%
      .$ID_ENTITY
  ))

date_info_tb_dodgy_dates_entities_RPD_NEOTOMA_dates <-
  EPD_DATES %>%
  dplyr::filter(entity_name %in% (
    date_info_tb_dodgy_dates_entities %>%
      dplyr::filter(external_source == "RPD") %>%
      .$entity_name
  )) %>%
  dplyr::left_join(dabr::select(conn,
                                "SELECT ID_ENTITY, entity_name",
                                "FROM entity"),
                   by = "entity_name") %>%
  dplyr::select(-ages_already, -site_id, -site_name, -site_name_clean, -entity_name) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::rename(age_calib = age_cal)

date_info_tb_dodgy_dates_entities_RPD_dates <-
  date_info_tb_dodgy_dates_entities_RPD$date_info %>%
  dplyr::select(-ID_DATE_INFO) %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY) %>%
  dplyr::left_join(external_links_rpd %>%
                     dplyr::select(ID_ENTITY, external_ID_ENTITY)) %>%
  dplyr::select(-external_ID_ENTITY) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::rename(lab_num = lab_number)

waldo::compare(date_info_tb_dodgy_dates_entities_RPD_dates %>%
                 .[order(colnames(.))],
               date_info_tb_dodgy_dates_entities_RPD_EPD_dates %>%
                 .[order(colnames(.))] %>%
                 dplyr::select(-ID_DATE_INFO),
               tolerance = 1E-3)

# This difference was a change requested by SPH
date_info_tb_dodgy_dates_entities_RPD_dates[202,]
date_info_tb_dodgy_dates_entities_RPD_EPD_dates[202,]

##### IBERIA ----
date_info_tb_dodgy_dates_entities_IBERIA <-
  date_info_tb_dodgy_dates_entities %>%
  dplyr::filter(external_source == "IBERIA") %>%
  .$external_entity_name %>%
  extract_iberia()

date_info_tb_dodgy_dates_entities_IBERIA_EPD_dates <-
  date_info %>%
  dplyr::filter(ID_ENTITY %in% (
    date_info_tb_dodgy_dates_entities %>%
      dplyr::filter(external_source == "IBERIA") %>%
      .$ID_ENTITY
  ))

date_info_tb_dodgy_dates_entities_IBERIA_NEOTOMA_dates <-
  EPD_DATES %>%
  dplyr::filter(entity_name %in% (
    date_info_tb_dodgy_dates_entities %>%
      dplyr::filter(external_source == "IBERIA") %>%
      .$entity_name
  )) %>%
  dplyr::left_join(dabr::select(conn,
                                "SELECT ID_ENTITY, entity_name",
                                "FROM entity"),
                   by = "entity_name") %>%
  dplyr::select(-ages_already, -site_id, -site_name, -site_name_clean, -entity_name) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::rename(age_calib = age_cal)

date_info_tb_dodgy_dates_entities_IBERIA_dates <-
  date_info_tb_dodgy_dates_entities_IBERIA$date_info %>%
  dplyr::select(-ID_SITE) %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY) %>%
  dplyr::left_join(dabr::select_all(conn, "external_link") %>%
                     dplyr::filter(external_source == "IBERIA") %>%
                     dplyr::select(ID_ENTITY, external_ID_ENTITY)) %>%
  dplyr::select(-external_ID_ENTITY, -entity_name) %>%
  dplyr::mutate(age_used = ifelse(is.na(notes), "yes", "no")) %>%
  dplyr::rename(reason_age_not_used = notes) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::rename(age_calib = age_cal) %>%
  purrr::map_dfc(~ifelse(.x == "NULL", NA, .x)) %>%
  dplyr::arrange(ID_ENTITY)

waldo::compare(date_info_tb_dodgy_dates_entities_IBERIA_dates %>%
                 .[order(colnames(.))],
               date_info_tb_dodgy_dates_entities_IBERIA_EPD_dates %>%
                 .[order(colnames(.))] %>%
                 dplyr::select(-ID_DATE_INFO, -notes) %>%
                 magrittr::set_class(class(.)[-1]),
               tolerance = 1E-3,
               max_diffs = Inf)

##### EMBSECBIO ----
date_info_tb_dodgy_dates_entities_EMBSECBIO <-
  date_info_tb_dodgy_dates_entities %>%
  dplyr::filter(external_source == "EMBSECBIO") %>%
  .$external_ID_ENTITY %>%
  extract_embsecbio()

date_info_tb_dodgy_dates_entities_EMBSECBIO_EPD_dates <-
  date_info %>%
  dplyr::filter(ID_ENTITY %in% (
    date_info_tb_dodgy_dates_entities %>%
      dplyr::filter(external_source == "EMBSECBIO") %>%
      .$ID_ENTITY
  )) %>%
  dplyr::arrange(ID_ENTITY, depth)

date_info_tb_dodgy_dates_entities_EMBSECBIO_NEOTOMA_dates <-
  EPD_DATES %>%
  dplyr::filter(entity_name %in% (
    date_info_tb_dodgy_dates_entities %>%
      dplyr::filter(external_source == "EMBSECBIO") %>%
      .$entity_name
  )) %>%
  dplyr::left_join(dabr::select(conn,
                                "SELECT ID_ENTITY, entity_name",
                                "FROM entity"),
                   by = "entity_name") %>%
  dplyr::select(-ages_already, -site_id, -site_name, -site_name_clean, -entity_name) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::rename(age_calib = age_cal)

date_info_tb_dodgy_dates_entities_EMBSECBIO_dates <-
  date_info_tb_dodgy_dates_entities_EMBSECBIO$date_info %>%
  dplyr::select(-ID_DATE_INFO) %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY) %>%
  dplyr::left_join(dabr::select_all(conn, "external_link") %>%
                     dplyr::filter(external_source == "EMBSECBIO") %>%
                     dplyr::select(ID_ENTITY, external_ID_ENTITY)) %>%
  dplyr::select(-external_ID_ENTITY) %>%
  dplyr::mutate(age_used = ifelse(is.na(date_comments), "yes", "no")) %>%
  dplyr::rename(notes = date_comments) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::arrange(ID_ENTITY, depth)

waldo::compare(date_info_tb_dodgy_dates_entities_EMBSECBIO_dates %>%
                 .[order(colnames(.))],
               date_info_tb_dodgy_dates_entities_EMBSECBIO_EPD_dates %>%
                 .[order(colnames(.))] %>%
                 dplyr::select(-ID_DATE_INFO, -age_calib, -reason_age_not_used) %>%
                 magrittr::set_class(class(.)[-1]),
               tolerance = 1E-3,
               max_diffs = Inf)

#### NEOTOMA ----
date_info_tb_dodgy_dates_entities_NEOTOMA_EPD_dates <-
  dabr::select_all(conn, "date_info") %>%
  dplyr::filter(ID_ENTITY %in% (
    date_info_tb_dodgy_dates_entities %>%
      dplyr::filter(external_source == "NEOTOMA") %>%
      .$ID_ENTITY
  )) %>%
  dplyr::arrange(ID_ENTITY, depth, age_c14, age_calib)

date_info_tb_dodgy_dates_entities_NEOTOMA_EPD_dates_coretops <-
  date_info_tb_dodgy_dates_entities_NEOTOMA_EPD_dates %>%
  dplyr::filter(date_type %>%
                  stringr::str_starts("Top|top"))

# Some core tops seem to be missing from the DB
date_info_tb_dodgy_dates_entities_NEOTOMA_dates_coretops <-
  EPD_DATES_coretops %>%
  dplyr::filter(entity_name %in% (
    date_info_tb_dodgy_dates_entities %>%
      dplyr::filter(external_source == "NEOTOMA") %>%
      .$entity_name
  )) %>%
  dplyr::left_join(dabr::select(conn,
                                "SELECT ID_ENTITY, entity_name",
                                "FROM entity"),
                   by = "entity_name") %>%
  dplyr::select(-site_name, -entity_name) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::arrange(ID_ENTITY, depth, age_c14, age_calib)

# new_coretops <-
#   date_info_tb_dodgy_dates_entities_NEOTOMA_dates_coretops %>%
#   dplyr::select(ID_ENTITY, date_type, depth, thickness, lab_num, age_calib,
#                 error, material_dated, age_used) %>%
#   dplyr::left_join(
#     date_info_tb_dodgy_dates_entities_NEOTOMA_EPD_dates_coretops %>%
#       dplyr::select(ID_ENTITY, ID_DATE_INFO, date_type, depth, thickness,
#                     lab_num, age_calib, error, material_dated, age_used) %>%
#       magrittr::set_names(paste0(colnames(.), "_DB")),
#     by = c("ID_ENTITY" = "ID_ENTITY_DB")
#   ) %>%
#   dplyr::filter(is.na(ID_DATE_INFO_DB)) %>%
#   dplyr::select(1:9) %>%
#   dplyr::mutate(ID_DATE_INFO = 11170:11188, .after = ID_ENTITY)

new_coretops <-
  tibble::tribble(
    ~ID_ENTITY, ~ID_DATE_INFO,              ~date_type, ~depth, ~thickness,         ~lab_num, ~age_calib, ~error,  ~material_dated, ~age_used,
    48L,        11170L, "Top of core estimated",      0,         NA, "not applicable",          0,      5, "not applicable",     "yes",
    53L,        11171L,     "Top of core known",      1,         NA, "not applicable",        -60,      5, "not applicable",     "yes",
    186L,        11172L, "Top of core estimated",      0,         NA, "not applicable",          0,     50, "not applicable",     "yes",
    525L,        11173L, "Top of core estimated",    837,         NA, "not applicable",          0,     50, "not applicable",     "yes",
    568L,        11174L, "Top of core estimated",      5,         NA, "not applicable",        -52,      5, "not applicable",     "yes",
    617L,        11175L, "Top of core estimated",    0.5,         NA, "not applicable",        -47,     10, "not applicable",     "yes",
    624L,        11176L, "Top of core estimated",      3,         NA, "not applicable",        -31,     12, "not applicable",     "yes",
    666L,        11177L, "Top of core estimated",      0,         NA, "not applicable",          0,      5, "not applicable",     "yes",
    782L,        11178L, "Top of core estimated",   1297,         NA, "not applicable",          0,     50, "not applicable",     "yes",
    876L,        11179L, "Top of core estimated",      0,         NA, "not applicable",        -52,     10, "not applicable",     "yes",
    1126L,        11180L, "Top of core estimated",      0,         NA, "not applicable",          0,      5, "not applicable",     "yes",
    1182L,        11181L, "Top of core estimated",      0,         NA, "not applicable",        -57,     10, "not applicable",     "yes",
    1189L,        11182L,     "Top of core known",   1100,         NA, "not applicable",        -66,      2, "not applicable",     "yes",
    1222L,        11183L, "Top of core estimated",      0,         NA, "not applicable",        -53,      5, "not applicable",     "yes",
    1234L,        11184L, "Top of core estimated",      0,         NA, "not applicable",          0,      5, "not applicable",     "yes",
    1258L,        11185L,     "Top of core known",      0,         NA, "not applicable",        -35,      1, "not applicable",     "yes",
    1365L,        11186L, "Top of core estimated",      2,         NA, "not applicable",        -52,     10, "not applicable",     "yes",
    1433L,        11187L, "Top of core estimated",      0,         NA, "not applicable",        -48,     10, "not applicable",     "yes",
    1522L,        11188L, "Top of core estimated",      0,         NA, "not applicable",          0,      5, "not applicable",     "yes"
  )
new_coretops %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 19 records were inserted.

date_info_tb_dodgy_dates_entities_NEOTOMA_dates <-
  EPD_DATES %>%
  dplyr::filter(entity_name %in% (
    date_info_tb_dodgy_dates_entities %>%
      dplyr::filter(external_source == "NEOTOMA") %>%
      .$entity_name
  )) %>%
  dplyr::left_join(dabr::select(conn,
                                "SELECT ID_ENTITY, entity_name",
                                "FROM entity"),
                   by = "entity_name") %>%
  dplyr::select(-ages_already, -site_id, -site_name, -site_name_clean, -entity_name) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::rename(age_calib = age_cal) %>%
  dplyr::arrange(ID_ENTITY, depth, age_c14, age_calib) %>%
  dplyr::filter(!is.na(date_type))

waldo::compare(date_info_tb_dodgy_dates_entities_NEOTOMA_dates %>%
                 # .[1:200,] %>%
                 .[order(colnames(.))],
               date_info_tb_dodgy_dates_entities_NEOTOMA_EPD_dates %>%
                 # .[1:200,] %>%
                 .[order(colnames(.))] %>%
                 dplyr::select(-ID_DATE_INFO) %>%
                 magrittr::set_class(class(.)),
               tolerance = 1E-3,
               max_diffs = Inf)

date_info_tb_dodgy_dates_entities_NEOTOMA_dates[169,]
date_info_tb_dodgy_dates_entities_NEOTOMA_EPD_dates[169,]

c(date_info_tb_dodgy_dates_entities_NEOTOMA_dates$ID_ENTITY,
  date_info_tb_dodgy_dates_entities_NEOTOMA_EPD_dates$ID_ENTITY) %>%
  unique() %>%
  sort() %>%
  purrr::map_dbl(function(id) {
    a <- date_info_tb_dodgy_dates_entities_NEOTOMA_dates %>%
      dplyr::filter(ID_ENTITY == id)
    b <- date_info_tb_dodgy_dates_entities_NEOTOMA_EPD_dates %>%
      dplyr::filter(ID_ENTITY == id)
    if(nrow(a) != nrow(b))
      return(id)
    NA
  })

# These differences are due to an additional date added
date_info_tb_dodgy_dates_entities_NEOTOMA_dates %>%
  dplyr::filter(ID_ENTITY == 1667) %>%
  dplyr::bind_rows(date_info_tb_dodgy_dates_entities_NEOTOMA_EPD_dates %>%
                     dplyr::filter(ID_ENTITY == 1667)) %>%
  View()

# Add missing core-top
tibble::tibble(
  ID_ENTITY = 33,
  ID_DATE_INFO = 11169,
  date_type = "Top of core known",
  depth = 0,
  lab_num = "not applicable",
  age_calib = -46,
  error = 1,
  material_dated = "not applicable",
  age_used = "yes"
) %>%
  rpd:::add_records(conn = conn, table = "date_info", dry_run = TRUE)
# Results: 1 record was inserted.

# aux <- date_info_tb_dodgy_dates_entities_RPD_NEOTOMA_dates$ID_ENTITY %>%
#   unique() %>%
#   sort() %>%
#   purrr::map_df(function(id) {
#     a <- date_info_tb_dodgy_dates_entities_RPD_NEOTOMA_dates %>%
#       dplyr::filter(ID_ENTITY == id) %>%
#       dplyr::select(ID_ENTITY,
#                     depth_NEO = depth,
#                     thickness_NEO = thickness,
#                     lab_num_NEO = lab_num) %>%
#       dplyr::mutate(source = "NEOTOMA")
#     b <- date_info_tb_dodgy_dates_entities_RPD_EPD_dates %>%
#       dplyr::filter(ID_ENTITY == id) %>%
#       dplyr::select(ID_ENTITY, ID_DATE_INFO,
#                     depth, thickness, lab_num) %>%
#       dplyr::mutate(source = "SPECIAL-EPD")
#     if (nrow(a) == nrow(b))
#       return(dplyr::bind_cols(a %>%
#                                 magrittr::set_names(
#                                   paste0(colnames(.), "_NEO")
#                                 ),
#                               b %>%
#                                 magrittr::set_names(
#                                   paste0(colnames(.), "_DB")
#                                 )))
#     dplyr::bind_rows(a, b)
#   })
# View(aux)

external_links_rpd <- dabr::select_all(conn, "external_link") %>%
  dplyr::filter(external_source == "RPD")
entities_rpd <- dabr::select_all(conn, "entity") %>%
  dplyr::filter(ID_ENTITY %in% external_links_rpd$ID_ENTITY)
rpd_info <- external_links_rpd$external_ID_ENTITY %>%
  extract_rpd()


rpd_dates <- rpd_info$date_info %>%
  dplyr::select(-ID_DATE_INFO) %>%
  dplyr::rename(external_ID_ENTITY = ID_ENTITY) %>%
  # dplyr::mutate(age_used = ifelse(is.na(reason_age_not_used), "yes", "no")) %>%
  dplyr::left_join(external_links_rpd %>%
                     dplyr::select(ID_ENTITY, external_ID_ENTITY)) %>%
  dplyr::select(-external_ID_ENTITY) %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::rename(lab_num = lab_number)

epd_dates <- dabr::select_all(conn, "date_info") %>%
  dplyr::filter(ID_ENTITY %in% entities_rpd$ID_ENTITY)

corrected_dates_rpd <- unique(entities_rpd$ID_ENTITY) %>%
  sort() %>%
  purrr::map_df(function(id) {
    RPD <- rpd_dates %>%
      dplyr::filter(ID_ENTITY == id)
    DB <- epd_dates %>%
      dplyr::filter(ID_ENTITY == id)
    waldo::compare(RPD %>%
                     .[order(colnames(.))],
                   DB %>%
                     .[order(colnames(.))] %>%
                     magrittr::set_class(c("date_info", class(.))),
                   max_diffs = Inf,
                   tolerance = 1E-9) %>%
      print()
    if (dim(RPD)[1] != dim(DB)[1])
      message("ID_ENTITY: ", id)
    DB %>%
      dplyr::select(ID_DATE_INFO,
                    lab = lab_num,
                    old_depth = depth,
                    old_thick = thickness) %>%
      dplyr::bind_cols(RPD %>%
                         dplyr::select(ID_ENTITY, lab_num, depth, thickness))
  })

corrected_dates_rpd %>%
  dplyr::filter(is.na(ID_DATE_INFO) |
                  is.na(ID_ENTITY) |
                  lab != lab_num)
corrected_dates_rpd %>%
  dplyr::select(ID_ENTITY, ID_DATE_INFO, depth, thickness) %>%
  rpd:::update_records(conn = conn, table = "date_info",
                       dry_run = TRUE, PK = 1:2)

waldo::compare(dabr::select_all(conn, "date_info") %>%
                 dplyr::filter(ID_ENTITY %in% entities_rpd$ID_ENTITY) %>%
                 dplyr::arrange(ID_ENTITY, depth),
               rpd_dates %>%
                 dplyr::arrange(ID_ENTITY, depth),
               tolerance = 1E-9)

### Samples ----
sample_tb <-
  dabr::select_all(conn, "sample")

samples_tb_2 <- sample_tb %>%
  dplyr::filter(thickness >= -1000, thickness < 1)
id_entities <- samples_tb_2 %>%
  .$ID_ENTITY %>% unique() %>% sort()
aux <- special.epd::snapshot(conn, ID_ENTITY = id_entities)
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name %in% aux$entity$entity_name) %>%
  dplyr::left_join(dabr::select(conn,
                                "SELECT ID_ENTITY, entity_name",
                                "FROM entity"),
                   by = "entity_name") %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::select(1:16) %>%
  dplyr::select(-site_id, -site_name, -site_name_clean, -dataset_id,
                -dataset_name, -entity_name, -chronology_id, -sample_id) %>%
  dplyr::arrange(ID_ENTITY, depth)
  # smpds::rm_na_taxa(1:16) %>%
  # dplyr::select(-c(1:6)) %>%
  # dplyr::select(-chronology_id, -sample_id, -unit_name)

EPD_samples <- aux$sample %>%
  dplyr::arrange(ID_ENTITY, depth)
NEOTOMA_samples <- aux_samples_counts %>%
  dplyr::arrange(ID_ENTITY, depth)

c(EPD_samples$ID_ENTITY,
  NEOTOMA_samples$ID_ENTITY) %>%
  unique() %>%
  sort() %>%
  purrr::map(function(id) {
    a <- EPD_samples %>%
      dplyr::filter(ID_ENTITY == id)
    b <- NEOTOMA_samples %>%
      dplyr::filter(ID_ENTITY == id)
    waldo::compare(a, b)
  })

c(EPD_samples$ID_ENTITY,
  NEOTOMA_samples$ID_ENTITY) %>%
  unique() %>%
  sort() %>% .[c(16, 21)]

# Samples with sample thickness = 0
EPD_samples %>%
  dplyr::filter(ID_ENTITY == 929) %>%
  dplyr::select(ID_ENTITY, ID_SAMPLE, thickness) %>%
  dplyr::mutate(thickness = ifelse(round(thickness, 3) == 0, NA, thickness)) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = !TRUE, PK = 1:2)
# Results: 38 records were updated.

NEOTOMA_samples %>%
  dplyr::filter(ID_ENTITY == 621)


# waldo::compare(aux_samples_counts %>%
#                  dplyr::select(ID_ENTITY, depth, thickness) %>%
#                  # dplyr::select(1:7) %>%
#                  .[order(colnames(.))],
#                EPD_samples %>%
#                  dplyr::select(ID_ENTITY, depth, thickness) %>%
#                  .[order(colnames(.))],
#                tolerance = 1E-3,
#                max_diffs = Inf)


samples_with_am_tb <-
  sample_tb %>%
  dplyr::mutate(has_AM = ID_SAMPLE %>%
                  purrr::map_lgl(function(id) {
                    (dabr::select(conn,
                                 "SELECT * FROM pollen_count WHERE",
                                 "ID_SAMPLE =",id,
                                 "LIMIT 1",
                                 quiet = TRUE) %>%
                      nrow()) > 0
                  }))

sample_tb_dodgy_samples <-
  # sample_tb %>%
  samples_with_am_tb %>%
  dplyr::filter(!has_AM) %>%
  dplyr::filter(thickness < 1)

sample_tb_dodgy_samples_external_links <-
  dabr::select_all(conn, "external_link") %>%
  dplyr::filter(ID_ENTITY %in% unique(sample_tb_dodgy_samples$ID_ENTITY)) %>%
  dplyr::filter(external_source != "NEOTOMA")

sample_tb_dodgy_samples_entities <-
  dabr::select_all(conn, "entity") %>%
  dplyr::filter(ID_ENTITY %in% unique(sample_tb_dodgy_samples$ID_ENTITY)) %>%
  dplyr::left_join(sample_tb_dodgy_samples_external_links) %>%
  dplyr::mutate(external_source =
                  ifelse(is.na(external_source), "NEOTOMA",
                         external_source))

##### RPD ----
sample_tb_dodgy_samples_entities_RPD <-
  sample_tb_dodgy_samples_entities %>%
  dplyr::filter(external_source == "RPD") %>%
  .$external_ID_ENTITY %>%
  extract_rpd()

sample_tb_dodgy_samples_entities_RPD_EPD_samples <-
  # sample_tb %>%
  samples_with_am_tb %>%
  dplyr::filter(ID_ENTITY %in% (
    sample_tb_dodgy_samples_entities %>%
      dplyr::filter(external_source == "RPD") %>%
      .$ID_ENTITY
  ))

sample_tb_dodgy_samples_entities_RPD_NEOTOMA_samples <-
  EPD_COUNTS %>%
  dplyr::filter(entity_name %in% (
    sample_tb_dodgy_samples_entities %>%
      dplyr::filter(external_source == "RPD") %>%
      .$entity_name
  )) %>%
  dplyr::left_join(dabr::select(conn,
                                "SELECT ID_ENTITY, entity_name",
                                "FROM entity"),
                   by = "entity_name") %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::select(1:16) %>%
  dplyr::select(-site_id, -site_name, -site_name_clean, -dataset_id,
                -dataset_name, -entity_name, -chronology_id, -sample_id)

sample_tb_dodgy_samples_entities_RPD_samples <-
  sample_tb_dodgy_samples_entities_RPD$sample %>%
  dplyr::select(-ID_SAMPLE) %>%
  dplyr::left_join(external_links_rpd %>%
                     dplyr::select(ID_ENTITY, external_ID_ENTITY)) %>%
  dplyr::select(-external_ID_ENTITY, -charcoal_measurement,
                -model_name_original) %>%
  dplyr::relocate(ID_ENTITY, .before = 1)

waldo::compare(sample_tb_dodgy_samples_entities_RPD_samples %>%
                 .[order(colnames(.))] %>%
                 magrittr::set_class(class(.)[-1]),
               sample_tb_dodgy_samples_entities_RPD_EPD_samples %>%
                 .[order(colnames(.))] %>%
                 dplyr::select(-ID_SAMPLE, -age_older, -age_younger,
                               -chronology_name, -count_type, -sample_type,
                               -age_type, -has_AM),
               tolerance = 1E-9)

sample_tb_dodgy_samples_entities_RPD_samples %>%
  magrittr::set_names(paste0(colnames(.), "_RPD")) %>%
  dplyr::bind_cols(sample_tb_dodgy_samples_entities_RPD_EPD_samples %>%
                     dplyr::select(ID_ENTITY, ID_SAMPLE, depth, thickness, age)
                   ) %>%
  dplyr::filter(round(depth_RPD, 3) == round(depth, 3)) %>%
  dplyr::filter(thickness_RPD != thickness) %>%
  dplyr::select(ID_ENTITY, ID_SAMPLE, thickness = thickness_RPD) %>%
  rpd:::update_records(conn = conn, table = "sample",
                       dry_run = TRUE, PK = 1:2)
# Results: 1560 records were updated.

##### IBERIA ----
sample_tb_dodgy_samples_entities_IBERIA <-
  sample_tb_dodgy_samples_entities %>%
  dplyr::filter(external_source == "IBERIA") %>%
  .$external_entity_name %>%
  extract_iberia()

sample_tb_dodgy_samples_entities_IBERIA_EPD_samples <-
  samples_with_am_tb %>%
  dplyr::filter(ID_ENTITY %in% (
    sample_tb_dodgy_samples_entities %>%
      dplyr::filter(external_source == "IBERIA") %>%
      .$ID_ENTITY
  ))

sample_tb_dodgy_samples_entities_IBERIA_NEOTOMA_samples <-
  EPD_COUNTS %>%
  dplyr::filter(entity_name %in% (
    sample_tb_dodgy_samples_entities %>%
      dplyr::filter(external_source == "IBERIA") %>%
      .$entity_name
  )) %>%
  dplyr::left_join(dabr::select(conn,
                                "SELECT ID_ENTITY, entity_name",
                                "FROM entity"),
                   by = "entity_name") %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::select(1:16) %>%
  dplyr::select(-site_id, -site_name, -site_name_clean, -dataset_id,
                -dataset_name, -entity_name, -chronology_id, -sample_id)

# sample_tb_dodgy_samples_entities_IBERIA_samples <-
#   sample_tb_dodgy_samples_entities_IBERIA$samples %>%
#   dplyr::select(-ID_SITE) %>%
#   dplyr::rename(external_ID_ENTITY = ID_ENTITY) %>%
#   dplyr::left_join(dabr::select_all(conn, "external_link") %>%
#                      dplyr::filter(external_source == "IBERIA") %>%
#                      dplyr::select(ID_ENTITY, external_ID_ENTITY)) %>%
#   dplyr::select(-external_ID_ENTITY, -entity_name) %>%
#   dplyr::mutate(age_used = ifelse(is.na(notes), "yes", "no")) %>%
#   dplyr::rename(reason_age_not_used = notes) %>%
#   dplyr::relocate(ID_ENTITY, .before = 1) %>%
#   dplyr::rename(age_calib = age_cal) %>%
#   purrr::map_dfc(~ifelse(.x == "NULL", NA, .x)) %>%
#   dplyr::arrange(ID_ENTITY)
#
# waldo::compare(sample_tb_dodgy_samples_entities_IBERIA_samples %>%
#                  .[order(colnames(.))],
#                sample_tb_dodgy_samples_entities_IBERIA_EPD_samples %>%
#                  .[order(colnames(.))] %>%
#                  dplyr::select(-ID_DATE_INFO, -notes) %>%
#                  magrittr::set_class(class(.)[-1]),
#                tolerance = 1E-3,
#                max_diffs = Inf)

##### EMBSECBIO ----
sample_tb_dodgy_samples_entities_EMBSECBIO <-
  sample_tb_dodgy_samples_entities %>%
  dplyr::filter(external_source == "EMBSECBIO") %>%
  .$external_ID_ENTITY %>%
  extract_embsecbio()

sample_tb_dodgy_samples_entities_EMBSECBIO_EPD_samples <-
  samples %>%
  dplyr::filter(ID_ENTITY %in% (
    sample_tb_dodgy_samples_entities %>%
      dplyr::filter(external_source == "EMBSECBIO") %>%
      .$ID_ENTITY
  )) %>%
  dplyr::arrange(ID_ENTITY, depth)

sample_tb_dodgy_samples_entities_EMBSECBIO_NEOTOMA_samples <-
  EPD_COUNTS %>%
  dplyr::filter(entity_name %in% (
    sample_tb_dodgy_samples_entities %>%
      dplyr::filter(external_source == "RPD") %>%
      .$entity_name
  )) %>%
  dplyr::left_join(dabr::select(conn,
                                "SELECT ID_ENTITY, entity_name",
                                "FROM entity"),
                   by = "entity_name") %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::select(1:16) %>%
  dplyr::select(-site_id, -site_name, -site_name_clean, -dataset_id,
                -dataset_name, -entity_name, -chronology_id, -sample_id)

# sample_tb_dodgy_samples_entities_EMBSECBIO_samples <-
#   sample_tb_dodgy_samples_entities_EMBSECBIO$samples %>%
#   dplyr::select(-ID_DATE_INFO) %>%
#   dplyr::rename(external_ID_ENTITY = ID_ENTITY) %>%
#   dplyr::left_join(dabr::select_all(conn, "external_link") %>%
#                      dplyr::filter(external_source == "EMBSECBIO") %>%
#                      dplyr::select(ID_ENTITY, external_ID_ENTITY)) %>%
#   dplyr::select(-external_ID_ENTITY) %>%
#   dplyr::mutate(age_used = ifelse(is.na(date_comments), "yes", "no")) %>%
#   dplyr::rename(notes = date_comments) %>%
#   dplyr::relocate(ID_ENTITY, .before = 1) %>%
#   dplyr::arrange(ID_ENTITY, depth)
#
# waldo::compare(sample_tb_dodgy_samples_entities_EMBSECBIO_samples %>%
#                  .[order(colnames(.))],
#                sample_tb_dodgy_samples_entities_EMBSECBIO_EPD_samples %>%
#                  .[order(colnames(.))] %>%
#                  dplyr::select(-ID_DATE_INFO, -age_calib, -reason_age_not_used) %>%
#                  magrittr::set_class(class(.)[-1]),
#                tolerance = 1E-3,
#                max_diffs = Inf)

#### NEOTOMA ----
sample_tb_dodgy_samples_entities_NEOTOMA_EPD_samples <-
  dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_ENTITY %in% (
    sample_tb_dodgy_samples_entities %>%
      dplyr::filter(external_source == "NEOTOMA") %>%
      .$ID_ENTITY
  )) %>%
  dplyr::arrange(ID_ENTITY, ID_SAMPLE, depth, thickness, age)

# Some core tops seem to be missing from the DB
sample_tb_dodgy_samples_entities_NEOTOMA_samples <-
  EPD_COUNTS %>%
  dplyr::filter(entity_name %in% (
    sample_tb_dodgy_samples_entities %>%
      dplyr::filter(external_source == "NEOTOMA") %>%
      .$entity_name
  )) %>%
  dplyr::left_join(dabr::select(conn,
                                "SELECT ID_ENTITY, entity_name",
                                "FROM entity"),
                   by = "entity_name") %>%
  dplyr::relocate(ID_ENTITY, .before = 1) %>%
  dplyr::select(1:16) %>%
  dplyr::select(-site_id, -site_name, -site_name_clean, -dataset_id,
                -dataset_name, -entity_name, -chronology_id, -sample_id)

waldo::compare(sample_tb_dodgy_samples_entities_NEOTOMA_EPD_samples %>%
                 .[order(colnames(.))],
               sample_tb_dodgy_samples_entities_NEOTOMA_samples %>%
                 .[order(colnames(.))] %>%
                 # dplyr::select(-ID_SAMPLE) %>%
                 magrittr::set_class(class(.)[-1]),
               tolerance = 1E-3,
               max_diffs = Inf)

## Repatriate missing counts from the EPD-NEOTOMA ----
sample_tb <- dabr::select_all(conn, "sample")
pollen_tb <- dabr::select_all(conn, "pollen_count")

entities_with_pollen <- sample_tb %>%
  dplyr::filter(!(ID_SAMPLE %in% pollen_tb$ID_SAMPLE))
entities_num_samples <- sample_tb %>%
  dplyr::filter(ID_ENTITY %in% entities_with_pollen$ID_ENTITY) %>%
  dplyr::group_by(ID_ENTITY) %>%
  dplyr::summarise(total = dplyr::n())

entities_wo_summary <- entities_with_pollen %>%
  dplyr::filter(ID_ENTITY %in% entities_with_pollen$ID_ENTITY) %>%
  dplyr::group_by(ID_ENTITY) %>%
  dplyr::summarise(wo_counts = dplyr::n()) %>%
  dplyr::left_join(entities_num_samples)

ID_ENTITIES <- entities_wo_summary %>%
  dplyr::filter(wo_counts == total) %>%
  .$ID_ENTITY

### New counts ----
#### Samples ----
c("BELIYA2C", "BRVE", "BUCGES3", "CRH", "DALLICAN", "KILOGAM",
  "GOURTE", "SETESGH", "OVSA", "HES1", "HES2", "HUZ1", "OTTKIN",
  "KOZLI", "SKOGSTJE", "SETESLK", "KILOLIT", "LIT1", "OTTMOR",
  "NABAHNE", "DOLSKYM", "PARIZ1", "PRYSKYRI", "MONDOVI", "RYNHOLEC",
  "TORVERA2", "LANDE", "NARMORNW", "PESCHIO1", "TRAVESS1"," VRBKA", "ZAHAJI"
)
entity_tb <- dabr::select_all(conn, "entity") %>%
  dplyr::filter(ID_ENTITY %in% ID_ENTITIES)
sample_tb <- dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_ENTITY %in% ID_ENTITIES)
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name %in% entity_tb$entity_name) %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::left_join(entity_tb %>%
                     dplyr::select(ID_ENTITY, entity_name),
                   by = "entity_name") %>%
  dplyr::relocate(ID_ENTITY, .before = 1)

aux_samples_counts_2 <- aux_samples_counts$ID_ENTITY %>%
  unique() %>%
  purrr::map_df(function(ID) {
    a <- sample_tb %>%
      dplyr::filter(ID_ENTITY == ID)
    b <- aux_samples_counts %>%
      dplyr::filter(ID_ENTITY == ID)

    if (nrow(a) == nrow(b)) {
      a %>%
        dplyr::select(ID_ENTITY, ID_SAMPLE,
                      depth_RPD = depth, thickness_RPD = thickness) %>%
        dplyr::bind_cols(b %>%
                           dplyr::select(-ID_ENTITY))
    } else {
      a %>%
        dplyr::select(ID_ENTITY, ID_SAMPLE, depth, thickness_RPD = thickness) %>%
        dplyr::mutate(depth_rnd = round(depth, 3)) %>%
        dplyr::right_join(b %>%
                            dplyr::mutate(depth_rnd = round(depth, 3)),
                          by = c("ID_ENTITY", "depth_rnd"))
    }
  })

aux_samples_counts_2 %>%
  dplyr::filter(is.na(ID_ENTITY) | is.na(ID_SAMPLE))
#
# aux_samples_counts %>%
#   dplyr::select(1:9) %>%
#   rpd:::update_records(conn = conn, table = "sample",
#                        dry_run = TRUE, PK = 1:2)
# # Results: 39 records were updated.

#### Counts ----
load("data/EPD/EPD_taxa_to_exclude.rda")
load("data/EPD/EPD_taxa_amalgamation.rda")
aux_samples_counts_3 <- aux_samples_counts_2 %>%
  dplyr::select(-dplyr::starts_with("age"),
                -dplyr::starts_with("depth"),
                -dplyr::starts_with("thickness"),
                -ID_ENTITY,
                -entity_name,
                -chronology_name)
colnames(aux_samples_counts_3) %in% EPD_taxa_to_exclude$epd_taxa
colnames(aux_samples_counts_3)[!colnames(aux_samples_counts_3) %in% EPD_taxa_amalgamation$epd_taxa]

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_3[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 528079 + 7735

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 469618 + 7085

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 287661 + 4810

# Delete objects with counts
rm(aux_samples_counts, aux_samples_counts_2, aux_samples_counts_3,
   aux_samples_counts_4, aux_samples_counts_5, aux_samples_counts_6)

### Additional counts ----
"Some entities have counts missing for few samples"
entities_with_pollen <- sample_tb %>%
  dplyr::filter(!(ID_SAMPLE %in% pollen_tb$ID_SAMPLE)) %>%
  dplyr::arrange(ID_ENTITY)
#### Samples ----
samples_with_missing_counts <-
  tibble::tribble(
    ~ID_ENTITY, ~ID_SAMPLE, ~depth,
    18L,    101349L,   3230,
    89L,      4802L,    305,
    89L,      4803L,    310,
    89L,      4804L,    315,
    89L,      4805L,    320,
    89L,      4806L,    325,
    89L,      4807L,    330,
    89L,      4808L,    335,
    89L,      4809L,    340,
    89L,      4810L,    350,
    89L,      4811L,    355,
    89L,      4812L,    360,
    89L,      4813L,    365,
    89L,      4814L,    370,
    89L,      4815L,    380,
    89L,      4816L,    385,
    89L,      4817L,    390,
    89L,      4818L,    395,
    89L,      4819L,    400,
    89L,      4820L,    405,
    89L,      4821L,    410,
    89L,      4822L,    415,
    89L,      4823L,    420,
    89L,      4824L,    425,
    89L,      4825L,    430,
    89L,      4826L,    435,
    89L,      4827L,    440,
    89L,      4828L,    445,
    89L,      4829L,    450,
    89L,      4830L,    455,
    89L,      4831L,    460,
    89L,      4832L,    465,
    89L,      4833L,    469,
    127L,    101496L, 1169.5,
    127L,    101497L, 1170.5,
    481L,     15065L,      0,
    481L,     15066L,      4,
    481L,     15067L,      8
  )
entity_tb <- dabr::select_all(conn, "entity") %>%
  dplyr::filter(ID_ENTITY %in% samples_with_missing_counts$ID_ENTITY)
sample_tb <- dabr::select_all(conn, "sample") %>%
  dplyr::filter(ID_SAMPLE %in% samples_with_missing_counts$ID_SAMPLE)
aux_samples_counts <- EPD_COUNTS %>%
  dplyr::filter(entity_name %in% entity_tb$entity_name) %>%
  smpds::rm_na_taxa(1:16) %>%
  dplyr::select(-c(1:5)) %>%
  dplyr::select(-chronology_id, -sample_id, -unit_name) %>%
  dplyr::left_join(entity_tb %>%
                     dplyr::select(ID_ENTITY, entity_name),
                   by = "entity_name") %>%
  dplyr::relocate(ID_ENTITY, .before = 1)

aux_samples_counts_2 <- aux_samples_counts$ID_ENTITY %>%
  unique() %>%
  purrr::map_df(function(ID) {
    a <- sample_tb %>%
      dplyr::filter(ID_ENTITY == ID)
    b <- aux_samples_counts %>%
      dplyr::filter(ID_ENTITY == ID)

    a %>%
      dplyr::select(ID_ENTITY, ID_SAMPLE, depth, thickness_RPD = thickness) %>%
      dplyr::mutate(depth_rnd = round(depth, 3)) %>%
      dplyr::left_join(b %>%
                          dplyr::mutate(depth_rnd = round(depth, 3)),
                        by = c("ID_ENTITY", "depth_rnd"))
  })

aux_samples_counts_2 %>%
  dplyr::filter(is.na(ID_ENTITY) | is.na(ID_SAMPLE))
#
# aux_samples_counts %>%
#   dplyr::select(1:9) %>%
#   rpd:::update_records(conn = conn, table = "sample",
#                        dry_run = TRUE, PK = 1:2)
# # Results: 39 records were updated.

#### Counts ----
load("data/EPD/EPD_taxa_to_exclude.rda")
load("data/EPD/EPD_taxa_amalgamation.rda")
aux_samples_counts_3 <- aux_samples_counts_2 %>%
  dplyr::select(-dplyr::starts_with("age"),
                -dplyr::starts_with("depth"),
                -dplyr::starts_with("thickness"),
                -ID_ENTITY,
                -entity_name,
                -chronology_name)
colnames(aux_samples_counts_3) %in% EPD_taxa_to_exclude$epd_taxa
colnames(aux_samples_counts_3)[!colnames(aux_samples_counts_3) %in% EPD_taxa_amalgamation$epd_taxa]

taxon_name_tb <- dabr::select_all(conn, "taxon_name")
##### Clean ----
aux_samples_counts_4 <-
  seq_len(nrow(aux_samples_counts_2)) %>%
  purrr::map_df(function(i) {
    aux_samples_counts_3[i, ] %>%
      tidyr::pivot_longer(-1, names_to = "epd_taxa", values_to = "count") %>%
      dplyr::filter(!is.na(count)) %>%
      dplyr::left_join(EPD_taxa_amalgamation,
                       by = "epd_taxa") %>%
      dplyr::left_join(taxon_name_tb,
                       by = c("clean_name" = "taxon_name")) %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE),
                    n = length(count)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, count) %>%
      dplyr::ungroup() %>%
      dplyr::mutate(amalgamation_level = 0, .before = count) # Clean names only
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_4 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage1 <- idx_pairs(nrow(aux_samples_counts_4), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage1))
meta_neo_res <-
  purrr::map2(idx_stage1$x,
              idx_stage1$y,
              ~ {
                pb$tick()
                aux_samples_counts_4[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_4$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 0")
waldo::compare(aux_samples_counts_4 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 1977

##### Intermediate ----
EPD_taxa_amalgamation_stage2 <- EPD_taxa_amalgamation %>%
  dplyr::select(-epd_taxa) %>%
  dplyr::distinct(clean_name, intermediate, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(intermediate))

aux_samples_counts_5 <-
  unique(aux_samples_counts_4$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_4 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage2,
                       by = c("taxon_name" = "clean_name")) %>%
      dplyr::mutate(amalgamation_level = 1) %>%
      dplyr::select(-ID_TAXON, -dplyr::starts_with("action")) %>%
      dplyr::rename(clean_taxon_name = taxon_name,
                    taxon_name = intermediate) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_5 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage2 <- idx_pairs(nrow(aux_samples_counts_5), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage2))
meta_neo_res <-
  purrr::map2(idx_stage2$x,
              idx_stage2$y,
              ~ {
                pb$tick()
                aux_samples_counts_5[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })

meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_5$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 1")
waldo::compare(aux_samples_counts_5 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 1966

##### Amalgamated ----
EPD_taxa_amalgamation_stage3 <- EPD_taxa_amalgamation_stage2 %>%
  dplyr::select(-clean_name, -dplyr::starts_with("action")) %>%
  dplyr::distinct(intermediate, amalgamated, .keep_all = TRUE) %>%
  dplyr::filter(!is.na(amalgamated))

aux_samples_counts_6 <-
  unique(aux_samples_counts_5$ID_SAMPLE) %>%
  purrr::map_df(function(ID_SAMPLE) {
    aux_samples_counts_5 %>%
      dplyr::filter(ID_SAMPLE == !!ID_SAMPLE) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "ID_TAXON") %>%
      dplyr::left_join(EPD_taxa_amalgamation_stage3,
                       by = c("taxon_name" = "intermediate")) %>%
      dplyr::mutate(amalgamation_level = 2) %>%
      dplyr::select(-ID_TAXON) %>%
      dplyr::rename(intermediate_taxon_name = taxon_name,
                    taxon_name = amalgamated) %>%
      dplyr::left_join(taxon_name_tb,
                       by = "taxon_name") %>%
      dplyr::group_by(ID_SAMPLE, ID_TAXON) %>%
      dplyr::mutate(count = sum(count, na.rm = TRUE)) %>%
      dplyr::distinct(ID_SAMPLE, ID_TAXON, .keep_all = TRUE) %>%
      dplyr::ungroup() %>%
      dplyr::select(ID_SAMPLE, ID_TAXON, amalgamation_level, count)
  }) %>%
  dplyr::arrange(ID_SAMPLE, ID_TAXON)

aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON) | is.na(ID_SAMPLE))
aux_samples_counts_6 %>%
  dplyr::filter(is.na(ID_TAXON)) %>% .$count %>% sum()
idx_stage3 <- idx_pairs(nrow(aux_samples_counts_6), 2000)
pb <- progress::progress_bar$new(total = nrow(idx_stage3))
meta_neo_res <-
  purrr::map2(idx_stage3$x,
              idx_stage3$y,
              ~ {
                pb$tick()
                aux_samples_counts_6[.x:.y, ] %>%
                  rpd:::add_records(conn = conn, table = "pollen_count",
                                    dry_run = TRUE, quiet = TRUE)
              })
meta_neo_res %>% purrr::flatten_lgl() %>% sum()

###### Validate -----
EPD_TAXA <- conn %>%
  dabr::select("SELECT * FROM pollen_count WHERE ID_SAMPLE IN (",
               paste0(unique(aux_samples_counts_6$ID_SAMPLE), collapse = ", "),
               ") and amalgamation_level = 2")
waldo::compare(aux_samples_counts_6 %>%
                 .[order(colnames(.))],
               EPD_TAXA %>%
                 .[order(colnames(.))],
               tolerance = 1E-9)
# 1906

# Delete objects with counts
rm(aux_samples_counts, aux_samples_counts_2, aux_samples_counts_3,
   aux_samples_counts_4, aux_samples_counts_5, aux_samples_counts_6)
special-uor/special.epd documentation built on April 10, 2022, 11:06 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
special-uor/special.epd
SPECIAL Research Group's Version of the European Pollen Database (EPD)

R.ignore/repatriate_data.R
In special-uor/special.epd: SPECIAL Research Group's Version of the European Pollen Database (EPD)

R Package Documentation

Browse R Packages

We want your feedback!

special-uor/special.epd SPECIAL Research Group's Version of the European Pollen Database (EPD)

R.ignore/repatriate_data.R In special-uor/special.epd: SPECIAL Research Group's Version of the European Pollen Database (EPD)

R Package Documentation

Browse R Packages

We want your feedback!

special-uor/special.epd
SPECIAL Research Group's Version of the European Pollen Database (EPD)

R.ignore/repatriate_data.R
In special-uor/special.epd: SPECIAL Research Group's Version of the European Pollen Database (EPD)