R/manualOutlierFindeR.R
In BeeBDC: Occurrence Data Cleaning

Documented in manualOutlierFindeR

# This function was written on the 25th of February to find manually identified outliers in 
  # bee data
# For questions, contact James B Dorey at jbdorey[at]me.com


#' Finds outliers, and their duplicates, as determined by experts
#' 
#' Uses expert-identified outliers with source spreadsheets that may be edited by users. The function 
#' will also use the duplicates file made using [BeeBDC::dupeSummary()] to identify duplicates of the 
#' expert-identified outliers and flag those as well.
#' The function will add a flagging column called `.expertOutlier` where records that are FALSE are
#' the expert outliers.
#'
#' @param data A data frame or tibble. Occurrence records as input.
#' @param DataPath A character path to the directory that contains the outlier spreadsheets.
#' @param PaigeOutliersName A character patch. Should lead to outlier spreadsheet from Paige Chesshire (csv file).
#' @param newOutliersName A character path. Should lead to appropriate outlier spreadsheet (xlsx file).
#' @param ColombiaOutliers_all A character path. Should lead to spreadsheet of bee outliers from Colombia (csv file).
#' @param duplicates A data frame or tibble. The duplicate file produced by [BeeBDC::dupeSummary()].
#' @param NearTRUE Optional. A character file name to the csv file. If you want to remove expert
#' outliers that are too close to TRUE points, use the name of the NearTRUE.csv.
#' Note: This implementation is only basic for now unless there is a greater need in the future.
#' @param NearTRUE_threshold Numeric. The threshold (in km) for the distance to TRUE points to 
#' keep expert outliers.
#'
#' @return Returns the data with a new column, `.expertOutlier` where records that are FALSE are
#' the expert outliers.
#' @export
#' 
#' @importFrom dplyr %>%
#'
#' @examples
#' \dontrun{
#'   # Read example data
#'   data(beesFlagged)
#' # Read in the most-recent duplicates file as well
#' if(!exists("duplicates")){
#'   duplicates <- fileFinder(path = DataPath,
#'                             fileName = "duplicateRun_") %>%
#'     readr::read_csv()}
#' # identify the outliers and get a list of their database_ids
#' beesFlagged_out <- manualOutlierFindeR(
#'   data = beesFlagged,
#'   DataPath = DataPath,
#'   PaigeOutliersName = "removedBecauseDeterminedOutlier.csv",
#'   newOutliersName = "^All_outliers_ANB_14March.xlsx",
#'   ColombiaOutliers_all = "All_Colombian_OutlierIDs.csv",
#'   duplicates = duplicates)
#' }
#' 
manualOutlierFindeR <- function(
    data = NULL,
    DataPath = NULL,
    PaigeOutliersName = "removedBecauseDeterminedOutlier.csv",
    newOutliersName = "All_outliers_ANB.xlsx",
    ColombiaOutliers_all = "All_Colombian_OutlierIDs.csv",
    duplicates = NULL,
    NearTRUE = NULL,
    NearTRUE_threshold = 5
    ){
  # locally bind variables to the function
  OutPath_Report <- eventDate <- near_truepoints_KM <- database_id <- occurrenceID <- NULL
  institutionCode <- database_id_keep <- catalogNumber <- .expertOutlier <- . <- NULL
  
  #### 0.0 Prep ####
  ##### 0.1 Errors ####
  ###### a. FATAL errors ####
  if(is.null(data)){
    stop(paste0(" - No data was given. Please specify the occurrence data."))
  }
  if(is.null(DataPath)){
    stop(paste0(" - No DataPath was given. Please specify the directory that contains the outliers."))
  }
  if(is.null(PaigeOutliersName)){
    stop(paste0(" - No PaigeOutliersName was given. Please specify the outliers' file name."))
  }
  if(is.null(newOutliersName)){
    stop(paste0(" - No newOutliersName was given. Please specify the outliers' file name."))
  }
  if(is.null(duplicates)){
    stop(paste0(" - No duplicates was given. Please provide the duplicates dataset as generated by jbd_dupeSummary."))
  }
  ###
  
  #### 1.0 Data prep ####
    ##### 1.1 Find data ####
  writeLines(" - Looking for the datasets...")
      ###### a. Paige outliers ####
  # Find the outliers from chesshire et al. 2023
  PaigeOutliers <- fileFinder(path = DataPath,
                               fileName = PaigeOutliersName) %>%
    readr::read_csv( col_types = readr::cols(.default = "c")) %>%
    suppressWarnings()
  
      ###### b. new outliers ####
  # Find the new outliers from the three sheets concatenates by Angela
  outliersAll <- fileFinder(path = DataPath,
                             fileName = newOutliersName) %>%
    openxlsx::read.xlsx("Outliers_FromCanadaToPanama_ANB") %>%
    dplyr::mutate(dplyr::across(tidyselect::all_of(colnames(.)), as.character)) %>%
    dplyr::bind_rows(fileFinder(path = DataPath,
                                 fileName = newOutliersName) %>%
                       openxlsx::read.xlsx("Tracys_outliers") %>%
                       dplyr::mutate(dplyr::across(tidyselect::all_of(colnames(.)), as.character))
                     ) %>%
    dplyr::bind_rows(fileFinder(path = DataPath,
                                 fileName = newOutliersName) %>%
                       openxlsx::read.xlsx("Colombian_outliers") %>%
                       dplyr::mutate(dplyr::across(tidyselect::all_of(colnames(.)), as.character))
                     ) %>%
    dplyr::bind_rows(fileFinder(path = DataPath,
                                 fileName = newOutliersName) %>%
                       openxlsx::read.xlsx("Outliers_SppInStatus3") %>%
                       dplyr::mutate(dplyr::across(tidyselect::all_of(colnames(.)), as.character))
                     ) %>%
    readr::write_excel_csv(paste(tempdir(), "newOutliers.csv", sep = "/"))
  # Read back in with the correct column classes
  outliersAll <- fileFinder(path = tempdir(),
                             fileName = "newOutliers.csv") %>%
    readr::read_csv(col_types = readr::cols(.default = "c"), lazy = FALSE) %>%
    dplyr::mutate(eventDate = eventDate %>%
                    lubridate::ymd_hms(truncated = 5)) %>%
    suppressWarnings()
  
  ###### c. Colombia ####
  ColombiaOutliers <- fileFinder(path = DataPath,
                                  fileName = ColombiaOutliers_all) %>%
    readr::read_csv( col_types = readr::cols(.default = "c")) %>%
    suppressWarnings()
  
  ######  d. remove NearTRUE ####
  # If user provies a NearTRUE input
  if(!is.null(NearTRUE)){
    # Find and read the csv
    NearTRUE_data <- fileFinder(path = DataPath,
                                 fileName = NearTRUE) %>% 
      readr::read_csv() %>% 
      dplyr::filter(near_truepoints_KM >= NearTRUE_threshold)
    
    # Remove those below a threshold in from NearTRUE in outliersAll
    outliersAll <- outliersAll %>%
      dplyr::filter(!database_id %in% NearTRUE_data$database_id)
    # Remove those below a threshold in from NearTRUE in ColombiaOutliers
    
    ColombiaOutliers <- ColombiaOutliers %>%
      dplyr::filter(!database_id %in% NearTRUE_data$database_id)
  }
  
  ###### e. eventDate ####
    # format data eventDate
  data <- data %>%
    dplyr::mutate(eventDate = eventDate %>%
                    lubridate::ymd_hms(truncated = 5))
  
  
    ##### 1.2 Process Paige ####
  writeLines(" - Processing the Paige outliers...")
  
  # Find PaigeOutliers in the occurrence data by occurrenceID and institutionCode
  Outl_occID <- data %>%
    tidyr::drop_na(occurrenceID) %>%
    dplyr::filter(occurrenceID %in% PaigeOutliers$occurrenceID &
                    institutionCode %in% PaigeOutliers$institutionCode) 
  # Find PaigeOutliers by occurrenceID and institutionCode
  Outliers_matched <- data %>%
    # Remove matched IDs
    dplyr::filter(!occurrenceID %in% Outl_occID$occurrenceID) %>%
    tidyr::drop_na(catalogNumber, institutionCode) %>%
    dplyr::filter(catalogNumber %in% PaigeOutliers$catalogNumber &
                    institutionCode %in% PaigeOutliers$institutionCode) %>%
    # Re-bind the outlier matches 
    dplyr::bind_rows(Outl_occID)

  # Combine the Paige and new outliers 
  outliersAll <- outliersAll %>%
      # Convert to the correct column types
    readr::type_convert(col_types = ColTypeR()) %>%
    dplyr::bind_rows(Outliers_matched) 

  
  #### 2.0 Find outlier duplicates ####
    ##### 2.1 Find duplicates ####
  writeLines(" - Looking for duplicates of the outliers...")
    # Get a list of the outliers and their duplicates
  outlierDuplicates <- duplicates %>%
    dplyr::filter(database_id %in% outliersAll$database_id | 
                    database_id_keep %in% outliersAll$database_id)
  duplicateList <- c(outlierDuplicates %>% dplyr::pull(database_id), 
                     outlierDuplicates %>% dplyr::pull(database_id_keep)) %>% unique()
  
    ##### 2.2 Combine duplicates ####
  # Get a list of all outliers and their duplicates - database_id
  outList <- c(outliersAll %>% dplyr::pull(database_id), duplicateList,
               ColombiaOutliers %>% pull(database_id)) %>% unique()
  
  #### 3.0 Flag records ####
  # Find the occurrences that did not match
  data <- data %>%
    # Add the .expertOutlier columns as TRUE (not flagged)
    dplyr::mutate(.expertOutlier = dplyr::if_else(
      database_id %in% outList,
      FALSE, TRUE)) 
  # Return user output
  message(
    paste(
      "\\manualOutlierFindeR:\n",
      "Flagged",
      format(sum(data$.expertOutlier == FALSE, na.rm = TRUE), big.mark = ","),
      "expert-identified outliers:\n",
      "The column '.expertOutlier' was added to the database.\n"
    )
  )
  
  # Return data
return(data)
  
}