BadCoordinateID.R
In ridigbio: Interface to the iDigBio Data API

## ----message=FALSE------------------------------------------------------------
# Load core libraries; install these packages if you have not already
library(ridigbio)
library(tidyverse)

# Load library for making nice HTML output
library(kableExtra)

# Load libraries for visualizing geographic data
library(leaflet)

library(cowplot)

## ----echo = FALSE-------------------------------------------------------------

verify_df_flagCoord <- FALSE

#Test that examples will run
tryCatch({
    # Your code that might throw an error
    verify_df_flagCoord <- idig_search_records(
        rq = list(flags = "rev_geocode_corrected", institutioncode = "lacm"),
        limit = 10
    )
}, error = function(e) {
    # Code to run if an error occurs
    cat("An error occurred during the idig_search_records call: ", e$message, "\n")
    cat("Vignettes will not be fully generated. Please try again after resolving the issue.")
    # Optionally, you can return NULL or an empty dataframe
    verify_df_flagCoord <- FALSE
})

## ----eval=verify_df_flagCoord-------------------------------------------------
# Edit the fields (e.g. `flags`) and values (e.g. "rev_geocode_corrected") in
# `list()` to adjust your query and the fields (e.g. `uuid`) in `fields` to
# adjust the columns returned in your results
df_flagCoord <- idig_search_records(rq = list(flags = "rev_geocode_corrected",
                                              institutioncode = "lacm"),
                    fields = c("uuid",
                               "institutioncode",
                               "collectioncode",
                               "country",
                               "data.dwc:country",
                               "stateprovince",
                               "county",
                               "locality",
                               "geopoint",
                               "data.dwc:decimalLongitude",
                               "data.dwc:decimalLatitude",
                               "flags"),
                    limit = 100000) %>% 
  # Rename fields to more easily reflect their provenance (either from the
  # data provider directly or modified by the data aggregator)
  rename(provider_lon = `data.dwc:decimalLongitude`,
         provider_lat = `data.dwc:decimalLatitude`,
         provider_country = `data.dwc:country`,
         aggregator_lon = `geopoint.lon`,
         aggregator_lat = `geopoint.lat`,
         aggregator_country = country,
         aggregator_stateprovince = stateprovince,
         aggregator_county = county,
         aggregator_locality = locality) %>% 
  # Reorder columns for easier viewing
  select(uuid, institutioncode, collectioncode, provider_lat, aggregator_lat,
         provider_lon, aggregator_lon, provider_country, aggregator_country,
         aggregator_stateprovince, aggregator_county, aggregator_locality,
         flags)

## ----eval=verify_df_flagCoord, echo = FALSE-----------------------------------
# Subset `df_flagCoord` to show example
df_flagCoord[1:50,] %>% 
  select(-flags) %>% 
  kable() %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"),
                font_size = 12,
                fixed_thead = T) %>% 
  scroll_box(width = "100%", height = "400px")

## ----eval=verify_df_flagCoord-------------------------------------------------
# Create function to allow subsetting the `df_flagCoord` dataset by other flags
# found on these same records
df_flagSubset <- function(subsetFlag) {
  df_flagCoord %>% 
  filter(grepl(subsetFlag, flags)) %>% 
  select(uuid, matches("_lat|_lon")) %>% 
  unite(provider_coords, c("provider_lat", "provider_lon"), sep = ",") %>% 
  unite(aggregator_coords, c("aggregator_lat", "aggregator_lon"), sep = ",") %>% 
  gather(key = type, value = coordinates, -uuid) %>% 
  separate(coordinates, c("lat","lon"), sep = ",") %>% 
  mutate(lat = as.numeric(lat)) %>% 
  mutate(lon = as.numeric(lon)) %>% 
  arrange(uuid, type)}

# Subset `df_flagCoord` by records flagged for having had their latitude negated
# to place point in stated country by reverse geocoding process
df_rev_geocode_lat_sign <- df_flagSubset("rev_geocode_lat_sign")

# Create map displaying a few examples of records with the
# rev_geocode_flip_lat_sign flag
pal <- leaflet::colorFactor(palette = c("#d7191c", "#fdae61", "#ffffbf", "#abdda4", "#2b83ba"),
                   domain = df_rev_geocode_lat_sign$uuid[1:10])

map <- df_rev_geocode_lat_sign[1:10,] %>% 
  mutate(popup = str_c(type, " = ", lat, ", ", lon, sep = "")) %>% 
  leaflet() %>%
  addTiles() %>% 
  addCircleMarkers(
    lng = ~lon,
    lat = ~lat,
    radius = 10,
    weight = 1,
    color = ~pal(uuid),
    stroke = FALSE,
    fillOpacity = 100,
    popup = ~popup) %>% 
    addLegend("bottomright", pal = pal, values = ~uuid,
    title = "Specimen Records",
    opacity = 1)

## ----eval=verify_df_flagCoord, echo = FALSE, out.width = '100%'---------------
map

## ----eval=verify_df_flagCoord-------------------------------------------------
# Summarize flagged records by collection type
spmByColl <- df_flagCoord %>% 
  group_by(collectioncode) %>% 
  tally()

# Generate graph to display counts of flagged records by collection within the
# institution
graph_spmByColl <- ggplot(spmByColl, 
                          aes(x = reorder(collectioncode, -n), 
                              y = n,
                              fill = collectioncode)) +
  geom_col() +
  theme(panel.background = element_blank(),
        legend.title = element_blank(),
        axis.title.x = element_text(face = "bold"),
        axis.text.y = element_blank(),
        axis.ticks.y = element_blank(),
        axis.title.y = element_text(face = "bold"),
        plot.title = element_text(size = 12, face = "bold")) +
  labs(x = "collection", 
       y = "# of specimen records",
       title = "LACM records flagged with geo-coordinate data quality issues by iDigBio") +
  geom_text(aes(label = n, vjust = -0.5))

# Get count of total records published by the institution using function
# `idig_count_records`
totalInstSpm <- idig_count_records(rq = list(institutioncode = "lacm"))

# Calculate flagged records as percent of total records
percentFlagged <- sum(spmByColl$n)/totalInstSpm*100

## ----eval=verify_df_flagCoord, out.width="700px", echo = FALSE----------------
graph_spmByColl <- graph_spmByColl +
                   theme_minimal_grid() +
                   theme(
                       text = element_text(size = 22),
                       axis.text = element_text(size = 22),
                       plot.title = element_text(size = 22, face = "bold")
                   )

knitr::include_graphics(save_plot("plot.png", graph_spmByColl, base_height = 10, base_width = 24))

## ----eval=verify_df_flagCoord-------------------------------------------------
# Collate `df_flagAssoc` to describe other data quality flags that are associated
# with rev_geocode_corrected in `df_flagCoord`
df_flagAssoc <- df_flagCoord %>% 
  select(uuid, flags) %>% 
  unnest(flags) %>% 
  group_by(flags) %>% 
  tally() %>% 
  mutate("category" = case_when(str_detect(flags, "geo|country|state")
                              ~ "geography",
                      str_detect(flags, "dwc_datasetid_added|dwc_multimedia_added|datecollected_bounds")
                              ~ "other",
                      str_detect(flags, "gbif|dwc|tax")
                              ~ "taxonomy")) %>% 
  mutate("percent" = n/(nrow(df_flagCoord))*100) %>% 
  arrange(category, desc(n))

# Visualize associated data quality flags
graph_spmByColl <- ggplot(df_flagAssoc, aes(x = reorder(flags, -percent), y = percent, fill = category)) +
  geom_col() +
  theme(axis.title.x = element_text(face = "bold"),
        axis.text.x = element_text(angle = 75, hjust = 1),
        axis.ticks.y = element_blank(),
        axis.title.y = element_text(face = "bold"),
        plot.title = element_text(size = 12, face = "bold")
        ) +
  labs(x = "additional iDigBio data quality flag", 
       y = "% specimen records",
       title = "LACM records flagged for geo-coordinate issues are also flagged for...",
       fill = "flag category")

## ----eval=verify_df_flagCoord, out.width="700px", echo = FALSE----------------
graph_spmByColl <- graph_spmByColl +
                   theme_minimal_grid() +
                   theme(
                       text = element_text(size = 22),
                       axis.text = element_text(size = 22),
                       axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1),
                       plot.title = element_text(size = 22, face = "bold")
                   )

knitr::include_graphics(save_plot("plot2.png", graph_spmByColl, base_height = 10, base_width = 24))