# This function was written on the 25th of February to find manually identified outliers in
# bee data
# For questions, contact James B Dorey at jbdorey[at]me.com
#' Finds outliers, and their duplicates, as determined by experts
#'
#' Uses expert-identified outliers with source spreadsheets that may be edited by users. The function
#' will also use the duplicates file made using [BeeBDC::dupeSummary()] to identify duplicates of the
#' expert-identified outliers and flag those as well.
#' The function will add a flagging column called `.expertOutlier` where records that are FALSE are
#' the expert outliers.
#'
#' @param data A data frame or tibble. Occurrence records as input.
#' @param DataPath A character path to the directory that contains the outlier spreadsheets.
#' @param PaigeOutliersName A character path. Should lead to the outlier spreadsheet from Paige Chesshire (csv file).
#' @param newOutliersName A character path. Should lead to appropriate outlier spreadsheet (xlsx file).
#' @param ColombiaOutliers_all A character path. Should lead to spreadsheet of bee outliers from Colombia (csv file).
#' @param duplicates A data frame or tibble. The duplicate file produced by [BeeBDC::dupeSummary()].
#' @param NearTRUE Optional. A character file name of the NearTRUE csv file. Provide this file name
#' if you want to remove expert outliers that are too close to TRUE points.
#' Note: this implementation is currently basic and may be expanded if there is a greater need in the future.
#' @param NearTRUE_threshold Numeric. The threshold (in km) for the distance to TRUE points to
#' keep expert outliers.
#'
#' @return Returns the data with a new column, `.expertOutlier` where records that are FALSE are
#' the expert outliers.
#' @export
#'
#' @importFrom dplyr %>%
#'
#' @examples
#' \dontrun{
#' # Read example data
#' data(beesFlagged)
#' # Read in the most-recent duplicates file as well
#' if(!exists("duplicates")){
#'   duplicates <- fileFinder(path = DataPath,
#'                            fileName = "duplicateRun_") %>%
#'     readr::read_csv()}
#' # identify the outliers and get a list of their database_ids
#' beesFlagged_out <- manualOutlierFindeR(
#'   data = beesFlagged,
#'   DataPath = DataPath,
#'   PaigeOutliersName = "removedBecauseDeterminedOutlier.csv",
#'   newOutliersName = "^All_outliers_ANB_14March.xlsx",
#'   ColombiaOutliers_all = "All_Colombian_OutlierIDs.csv",
#'   duplicates = duplicates)
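#'
#' # A sketch of the optional NearTRUE arguments; "NearTRUE.csv" is an illustrative file name and
#' # should be replaced with the name of your own NearTRUE csv located in DataPath.
#' beesFlagged_out <- manualOutlierFindeR(
#'   data = beesFlagged,
#'   DataPath = DataPath,
#'   PaigeOutliersName = "removedBecauseDeterminedOutlier.csv",
#'   newOutliersName = "^All_outliers_ANB_14March.xlsx",
#'   ColombiaOutliers_all = "All_Colombian_OutlierIDs.csv",
#'   duplicates = duplicates,
#'   NearTRUE = "NearTRUE.csv",
#'   NearTRUE_threshold = 5)
#'
#' # The flagged records (.expertOutlier == FALSE) can then be removed downstream; for example:
#' beesFlagged_filtered <- beesFlagged_out %>%
#'   dplyr::filter(.expertOutlier == TRUE)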
#' }
#'
manualOutlierFindeR <- function(
data = NULL,
DataPath = NULL,
PaigeOutliersName = "removedBecauseDeterminedOutlier.csv",
newOutliersName = "All_outliers_ANB.xlsx",
ColombiaOutliers_all = "All_Colombian_OutlierIDs.csv",
duplicates = NULL,
NearTRUE = NULL,
NearTRUE_threshold = 5
){
# locally bind variables to the function
OutPath_Report <- eventDate <- near_truepoints_KM <- database_id <- occurrenceID <- NULL
institutionCode <- database_id_keep <- catalogNumber <- .expertOutlier <- . <- NULL
#### 0.0 Prep ####
##### 0.1 Errors ####
###### a. FATAL errors ####
if(is.null(data)){
stop(paste0(" - No data was given. Please specify the occurrence data."))
}
if(is.null(DataPath)){
stop(paste0(" - No DataPath was given. Please specify the directory that contains the outliers."))
}
if(is.null(PaigeOutliersName)){
stop(paste0(" - No PaigeOutliersName was given. Please specify the outliers' file name."))
}
if(is.null(newOutliersName)){
stop(paste0(" - No newOutliersName was given. Please specify the outliers' file name."))
}
if(is.null(duplicates)){
stop(paste0(" - No duplicates was given. Please provide the duplicates dataset as generated by jbd_dupeSummary."))
}
###
#### 1.0 Data prep ####
##### 1.1 Find data ####
writeLines(" - Looking for the datasets...")
###### a. Paige outliers ####
# Find the outliers from Chesshire et al. 2023
PaigeOutliers <- fileFinder(path = DataPath,
fileName = PaigeOutliersName) %>%
readr::read_csv( col_types = readr::cols(.default = "c")) %>%
suppressWarnings()
###### b. new outliers ####
# Find the new outliers from the sheets concatenated by Angela
outliersAll <- fileFinder(path = DataPath,
fileName = newOutliersName) %>%
openxlsx::read.xlsx("Outliers_FromCanadaToPanama_ANB") %>%
dplyr::mutate(dplyr::across(tidyselect::all_of(colnames(.)), as.character)) %>%
dplyr::bind_rows(fileFinder(path = DataPath,
fileName = newOutliersName) %>%
openxlsx::read.xlsx("Tracys_outliers") %>%
dplyr::mutate(dplyr::across(tidyselect::all_of(colnames(.)), as.character))
) %>%
dplyr::bind_rows(fileFinder(path = DataPath,
fileName = newOutliersName) %>%
openxlsx::read.xlsx("Colombian_outliers") %>%
dplyr::mutate(dplyr::across(tidyselect::all_of(colnames(.)), as.character))
) %>%
dplyr::bind_rows(fileFinder(path = DataPath,
fileName = newOutliersName) %>%
openxlsx::read.xlsx("Outliers_SppInStatus3") %>%
dplyr::mutate(dplyr::across(tidyselect::all_of(colnames(.)), as.character))
) %>%
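# Write the combined sheets to a temporary csv so that they can be re-read together with consistent column classes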
readr::write_excel_csv(paste(tempdir(), "newOutliers.csv", sep = "/"))
# Read back in with the correct column classes
outliersAll <- fileFinder(path = tempdir(),
fileName = "newOutliers.csv") %>%
readr::read_csv(col_types = readr::cols(.default = "c"), lazy = FALSE) %>%
dplyr::mutate(eventDate = eventDate %>%
lubridate::ymd_hms(truncated = 5)) %>%
suppressWarnings()
###### c. Colombia ####
ColombiaOutliers <- fileFinder(path = DataPath,
fileName = ColombiaOutliers_all) %>%
readr::read_csv( col_types = readr::cols(.default = "c")) %>%
suppressWarnings()
###### d. remove NearTRUE ####
# If the user provides a NearTRUE input
if(!is.null(NearTRUE)){
# Find and read the csv
NearTRUE_data <- fileFinder(path = DataPath,
fileName = NearTRUE) %>%
readr::read_csv() %>%
dplyr::filter(near_truepoints_KM >= NearTRUE_threshold)
# Remove the NearTRUE-matched records from outliersAll
outliersAll <- outliersAll %>%
dplyr::filter(!database_id %in% NearTRUE_data$database_id)
# Remove the NearTRUE-matched records from ColombiaOutliers
ColombiaOutliers <- ColombiaOutliers %>%
dplyr::filter(!database_id %in% NearTRUE_data$database_id)
}
###### e. eventDate ####
# format data eventDate
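# ymd_hms(truncated = 5) lets dates with missing time, day, or month components still parse rather than fail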
data <- data %>%
dplyr::mutate(eventDate = eventDate %>%
lubridate::ymd_hms(truncated = 5))
##### 1.2 Process Paige ####
writeLines(" - Processing the Paige outliers...")
# Find PaigeOutliers in the occurrence data by occurrenceID and institutionCode
Outl_occID <- data %>%
tidyr::drop_na(occurrenceID) %>%
dplyr::filter(occurrenceID %in% PaigeOutliers$occurrenceID &
institutionCode %in% PaigeOutliers$institutionCode)
# Find the remaining PaigeOutliers by catalogNumber and institutionCode
Outliers_matched <- data %>%
# Remove matched IDs
dplyr::filter(!occurrenceID %in% Outl_occID$occurrenceID) %>%
tidyr::drop_na(catalogNumber, institutionCode) %>%
dplyr::filter(catalogNumber %in% PaigeOutliers$catalogNumber &
institutionCode %in% PaigeOutliers$institutionCode) %>%
# Re-bind the outlier matches
dplyr::bind_rows(Outl_occID)
# Combine the Paige and new outliers
outliersAll <- outliersAll %>%
# Convert to the correct column types
readr::type_convert(col_types = ColTypeR()) %>%
dplyr::bind_rows(Outliers_matched)
#### 2.0 Find outlier duplicates ####
##### 2.1 Find duplicates ####
writeLines(" - Looking for duplicates of the outliers...")
# Get a list of the outliers and their duplicates
outlierDuplicates <- duplicates %>%
dplyr::filter(database_id %in% outliersAll$database_id |
database_id_keep %in% outliersAll$database_id)
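# Take both the retained (database_id_keep) and discarded (database_id) ids so that every duplicate copy of an outlier is captured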
duplicateList <- c(outlierDuplicates %>% dplyr::pull(database_id),
outlierDuplicates %>% dplyr::pull(database_id_keep)) %>% unique()
##### 2.2 Combine duplicates ####
# Get a list of all outliers and their duplicates - database_id
outList <- c(outliersAll %>% dplyr::pull(database_id), duplicateList,
ColombiaOutliers %>% dplyr::pull(database_id)) %>% unique()
#### 3.0 Flag records ####
# Flag the occurrences that matched the outlier list
data <- data %>%
# Add the .expertOutlier columns as TRUE (not flagged)
dplyr::mutate(.expertOutlier = dplyr::if_else(
database_id %in% outList,
FALSE, TRUE))
# Return user output
message(
paste(
"\\manualOutlierFindeR:\n",
"Flagged",
format(sum(data$.expertOutlier == FALSE, na.rm = TRUE), big.mark = ","),
"expert-identified outliers:\n",
"The column '.expertOutlier' was added to the database.\n"
)
)
# Return data
return(data)
}