basic_workflow.R
In BeeBDC: Occurrence Data Cleaning

## ----libraryChunk, load-packages, include=FALSE-------------------------------
# markdown packages
library(rgnparser)
library(magrittr)
library(knitr)
library(rmarkdown)
library(rmdformats)
library(prettydoc)
library(htmltools)
library(pkgdown)

# Load core packages
library(devtools)
library(BiocManager)
library(purrr)
library(here)
library(renv)
library(bdc)
library(CoordinateCleaner)
library(dplyr)
library(readr)
library(stringr)
library(lubridate)
library(tidyselect)
library(R.utils)
library(tidyr)
library(ggplot2)
library(forcats)
library(emld)
library(rlang)
library(xml2)
library(mgsub)
library(rvest)
library(rnaturalearth)
library(rnaturalearthdata)
library(countrycode)
library(janitor)
library(circlize)
library(paletteer)
library(cowplot)
library(igraph)
library(ggspatial)
library(sf)
library(parallel)
library(terra)

# Dont detect cores to avoid GitHbub error
old <- options()         # code line i 
on.exit(options(old))      # code line i+1 
options(mc.cores = parallel::detectCores())

## ----secretRootPath, include=FALSE--------------------------------------------
# Set the RootPath to tempdir
RootPath <- tempdir()
if (!dir.exists(paste0(RootPath, "/Data_acquisition_workflow"))) {
    dir.create(paste0(RootPath, "/Data_acquisition_workflow"), recursive = TRUE)
}

## ----global-options, include=FALSE--------------------------------------------
knitr::opts_chunk$set(error = TRUE, 
                      eval = TRUE, 
                      tidy = TRUE, 
                      warning = FALSE,
                      root.dir = normalizePath(tempdir()))

## ----falseRootPath, eval=FALSE------------------------------------------------
#  RootPath <- paste0("/your/path/here")

## ----CreateRootPath, warning=FALSE, collapse = TRUE---------------------------
  # Create the working directory in the RootPath if it doesn't exist already
if (!dir.exists(paste0(RootPath, "/Data_acquisition_workflow"))) {
    dir.create(paste0(RootPath, "/Data_acquisition_workflow"), recursive = TRUE)
}
  # Set the working directory
setwd(paste0(RootPath,"/Data_acquisition_workflow"))

## ----activate, collapse = TRUE------------------------------------------------
renv::activate(project = paste0(RootPath,"/Data_acquisition_workflow"))

## ----installPackages, message=FALSE, warning=FALSE, results=FALSE, collapse = TRUE, eval = FALSE----
#  if (!require("BiocManager", quietly = TRUE))
#      install.packages("BiocManager", repos = "http://cran.us.r-project.org")
#  
#  BiocManager::install("ComplexHeatmap")

## ----rnaturalearthhires, eval=FALSE-------------------------------------------
#    # Install remotes if needed
#  if (!require("remotes", quietly = TRUE))
#      install.packages("remotes", repos = "http://cran.us.r-project.org")
#    # Download and then load rnaturalearthhires
#  remotes::install_github("ropensci/rnaturalearthhires")
#  install.packages("rnaturalearthhires", repos = "https://ropensci.r-universe.dev", type = "source")
#  library(rnaturalearthhires)

## ----installBeeBDC, results=TRUE, message=TRUE, eval = FALSE, collapse = TRUE----
#  install.packages("BeeBDC")
#  library(BeeBDC)

## ----snapshot, collapse = TRUE------------------------------------------------
renv::snapshot(project = paste0(RootPath,"/Data_acquisition_workflow"),
                 prompt = FALSE)

## ----dirMaker, collapse = TRUE, eval = FALSE----------------------------------
#  BeeBDC::dirMaker(
#      RootPath = RootPath,
#      RDoc = "vignettes/BeeBDC_main.Rmd") %>%
#        # Add paths created by this function to the environment()
#      list2env(envir = parent.env(environment()))

## ----dirMakerSECRETELY, include = FALSE---------------------------------------
# For the sake of this tutorial, we will not use here::i_am in dirMaker, because we aren't allowed
  # to mess with package directories in this way. This will work-around to use the tempdir()
DataPath <- paste0(RootPath, "/Data_acquisition_workflow")
OutPath_Check <- paste0(RootPath, "/Data_acquisition_workflow/Output/Check")
OutPath_Figures <- paste0(RootPath, "/Data_acquisition_workflow/Output/Figures")
OutPath_Intermediate <- paste0(RootPath, "/Data_acquisition_workflow/Output/Intermediate")
OutPath_Report <- paste0(RootPath, "/Data_acquisition_workflow/Output/Report")
  # Create these files
if (!dir.exists(DataPath)) {
    dir.create(DataPath, recursive = TRUE)}
if (!dir.exists(OutPath_Check)) {
    dir.create(OutPath_Check, recursive = TRUE)}
if (!dir.exists(OutPath_Figures)) {
    dir.create(OutPath_Figures, recursive = TRUE)}
if (!dir.exists(OutPath_Intermediate)) {
    dir.create(OutPath_Intermediate, recursive = TRUE)}
if (!dir.exists(OutPath_Report)) {
    dir.create(OutPath_Report, recursive = TRUE)}

## ----lapply_library, results=FALSE, collapse = TRUE---------------------------
lapply(c("ComplexHeatmap", "magrittr"), 
       library, character.only = TRUE)

## ----2.0, eval = FALSE--------------------------------------------------------
#  # Load some package data — the taxonomy and a flagged example dataset
#    # Download the full beesTaxonomy file
#  taxonomyFile <- BeeBDC::beesTaxonomy()

## ----2.0secret, collapse = TRUE, eval = TRUE----------------------------------
  # load in the small test dataset in the background
system.file("extdata", "testTaxonomy.rda", package="BeeBDC") |>
  load()
  # Rename the file
taxonomyFile <- testTaxonomy
rm(testTaxonomy)

## ----2.0ii--------------------------------------------------------------------
  # Load the example beesFlagged dataset
beesFlagged <- BeeBDC::beesFlagged

selectedGenera <- taxonomyFile %>%
    # Select only tribe anthophorini (for example)
  dplyr::filter(tolower(tribe) == tolower("anthophorini")) %>%
  distinct(genus)
  
  # Filter the data
taxonData <- beesFlagged %>%
  dplyr::filter(genus %in% selectedGenera$genus)
  # View the data
taxonData

## ----3.0----------------------------------------------------------------------
  # Select your study area
studyArea <- c("Canada", "United states", "Mexico", "Guatemala")
# Filter the data to that area
countryData <- beesFlagged %>%
  dplyr::filter(country %in% studyArea)
  # View the data
countryData

## ----4.1----------------------------------------------------------------------
filteredData <- 
  BeeBDC::summaryFun(data = beesFlagged,
   # Choose the columns to NOT filter (or NULL to filter all columns)
   dontFilterThese = c(".gridSummary", ".lonFlag", ".latFlag", ".uncer_terms",
                      ".uncertaintyThreshold"),
    # In the output, do you want to REMOVE all filtering columns (TRUE), or keep them (FALSE)
   removeFilterColumns = TRUE,
   # In the output, do you want to only keep clean data according to your filtering (TRUE),
    # Or keep all data and simply update the .summary column (FALSE)
  filterClean = TRUE) 

## ----4.2----------------------------------------------------------------------
filteredData <- beesFlagged %>%
  # Remove any exiting .uncertaintyThreshold column
  dplyr::select(!tidyselect::any_of(".uncertaintyThreshold")) %>%
    # Chose the coordinate uncertainty to filter to...
  BeeBDC::coordUncerFlagR(data = .,
                  uncerColumn = "coordinateUncertaintyInMeters",
                    # 10 km here
                  threshold = 10000) %>%
    # Now re-do the .summary column and filter the data using this new value
  BeeBDC::summaryFun(
  data = .,
  dontFilterThese = c(".gridSummary", ".lonFlag", ".latFlag", ".uncer_terms"),
  removeFilterColumns = TRUE,
  filterClean = TRUE)

## ----4.2a---------------------------------------------------------------------
filteredData <- beesFlagged %>%
    # Remove any exisitng .year_outOfRange column
  dplyr::select(!".year_outOfRange") %>%
    # Chose the minimum year to filter to...
  bdc::bdc_year_outOfRange(data = .,
                           eventDate = "year",
                           year_threshold = 1970) %>%
    # Now re-do the .summary column and filter the data using this new value
  BeeBDC::summaryFun(
    data = .,
    dontFilterThese = c(".gridSummary", ".lonFlag", ".latFlag", ".uncer_terms",
                        ".uncertaintyThreshold"),
    removeFilterColumns = TRUE,
    filterClean = TRUE)

## ----4.2b---------------------------------------------------------------------
filteredData <- 
  # The input dataset
  beesFlagged %>%
  # Chose the year range...
  dplyr::filter(year > 1950 & year < 1970) %>%
  # Now re-do the .summary column and filter the data using this new value
  BeeBDC::summaryFun(
    # Select the input dataset to filter
    data = .,
    # Choose the columns to NOT filter (or NULL to filter all columns)
    dontFilterThese = c(".gridSummary", ".lonFlag", ".latFlag", ".uncer_terms",
                        ".uncertaintyThreshold"),
    # In the output, do you want to REMOVE all filtering columns (TRUE), or keep them (FALSE)
    removeFilterColumns = TRUE,
    # In the output, do you want to only keep clean data according to your filtering (TRUE),
    # Or keep all data and simply update the .summary column (FALSE)
    filterClean = TRUE)

## ----5.1, eval = FALSE--------------------------------------------------------
#  if(!require("BiocManager", quietly = TRUE)){
#    install.packages("BiocManager")}
#  BiocManager::install("ComplexHeatmap", force = TRUE)
#  renv::snapshot()

## ----5.1ii, eval = FALSE------------------------------------------------------
#  duplicates <- fileFinder(path = "PATH TO A FOLDER CONTAINING THE duplicateRun_ — could be supp. materials folder",
#                            fileName = "duplicateRun_") %>%
#    readr::read_csv() %>%
#    # Select only the stingless bee data
#    dplyr::filter(database_id %in% stinglessData$database_id |
#                    database_id_match %in% stinglessData$database_id)

## ----5.1on.exit, include = FALSE----------------------------------------------
oldpar <- par(no.readonly = TRUE) 
on.exit(oldpar)

## ----5.1iii, eval = FALSE-----------------------------------------------------
#  # Choose the global figure parameters
#    par(mar = c(2, 2, 2, 2)/2, mfrow = c(1,1))
#  
#  # Create the chorDiagram. You can leave many of the below values out but we show here
#  # the defaults
#  
#  BeeBDC::chordDiagramR(
#    # The duplicate data from the dupeSummary function output
#    dupeData = duplicates,
#    outPath = OutPath_Figures,
#    fileName = "ChordDiagram.pdf",
#    # These can be modified to help fit the final pdf that's exported.
#    width = 9,
#    height = 7.5,
#    bg = "white",
#    # How few distinct dataSources should a group have to be listed as "other"
#    smallGrpThreshold = 3,
#    title = "Duplicated record sources",
#    # The default list of colour palettes to choose from usign the paleteer package
#    palettes = c("cartography::blue.pal", "cartography::green.pal",
#                 "cartography::sand.pal", "cartography::orange.pal", "cartography::red.pal",
#                 "cartography::purple.pal", "cartography::brown.pal"),
#    canvas.ylim = c(-1.0,1.0),
#    canvas.xlim = c(-0.6, 0.25),
#    text.col = "black",
#    legendX = grid::unit(6, "mm"),
#    legendY = grid::unit(18, "mm"),
#    legendJustify = c("left", "bottom"),
#    niceFacing = TRUE)

## ----5.2----------------------------------------------------------------------
data("beesFlagged", package = "BeeBDC")

# Create a figure shoring the total number of duplicates, kept duplicates, and unique
# records for each datasource (simplified to the text before the first underscore) and
# the proportion of the above for each data source
BeeBDC::dupePlotR(
  data = beesFlagged,
  # The outPath to save the plot as
  outPath = tempdir(),
  fileName = "Fig3_duplicatePlot.pdf",
  # Colours in order: duplicate, kept duplicate, unique
  dupeColours = c("#F2D2A2","#B9D6BC", "#349B90"),
  # Plot size and height
  base_height = 7, base_width = 7,
  legend.position = c(0.85, 0.8),
  # Extra variables can be fed into forcats::fct_recode() to change names on plot
  GBIF = "GBIF", SCAN = "SCAN", iDigBio = "iDigBio", USGS = "USGS", ALA = "ALA", 
  ASP = "ASP",
  returnPlot = TRUE
)

## ----5.3b---------------------------------------------------------------------
# Visualise all flags for each dataSource (simplified to the text before the first underscore)
  # A clever user might also realise the potential to summarise and produce outputs in other columns
BeeBDC::plotFlagSummary(
  # WARNING: alternate path if wanting to produce figures for the selected taxonData (2.0 above)
  # Select only the taxonData data
  data = beesFlagged,
  # Colours in order of pass (TRUE), fail (FALSE), and NA
  flagColours = c("#127852", "#A7002D", "#BDBABB"),
  fileName = paste0("FlagsPlot_Amell", Sys.Date(),".pdf"),
  outPath = tempdir(),
  width = 15, height = 9,
  # OPTIONAL:
         #  # Filter to species
           speciesName = "Apis mellifera Linnaeus, 1758",
             # column to look in
           nameColumn = "scientificName",
           # Save the filtered data
           saveFiltered = FALSE,
     # Filter column to display on map
           filterColumn = ".summary",
           plotMap = TRUE,
       # amount to jitter points if desired, e.g. 0.25 or NULL
     jitterValue = NULL,
       # Map opacity value for points between 0 and 1
     mapAlpha = 1,
  returnPlot = TRUE,
  # Extra variables can be fed into forcats::fct_recode() to change names on plot
  GBIF = "GBIF", SCAN = "SCAN", iDigBio = "iDigBio", USGS = "USGS", ALA = "ALA", 
  ASP = "ASP", CAES = "CAES", 'B. Mont.' = "BMont", 'B. Minkley' = "BMin", Ecd = "Ecd",
  Gaiarsa = "Gai", EPEL = "EPEL"
)

## ----5.4----------------------------------------------------------------------
BeeBDC::summaryMaps(
  data = beesFlagged,
  width = 10, height = 10,
  class_n = 3,
  class_Style = "jenks",
  outPath = tempdir(),
  fileName = "CountryMaps_jenks.pdf",
  returnPlot = TRUE
)

## ----6.0, eval = FALSE--------------------------------------------------------
#  mapData %>%
#    readr::write_excel_csv(paste0(DataPath, "/Output/Intermediate/", "cleanTaxon_",
#                            Sys.Date(), ".csv"))

## ----cleanup, include=FALSE, collapse = TRUE----------------------------------
# Remove the webpage folder
unlink(paste0(dirname(getwd()), "/inst/extdata/WebDir"), recursive = TRUE)