R/flagRecorder.R
In BeeBDC: Occurrence Data Cleaning

Documented in flagRecorder

# This function was written by James Dorey to load and output updated summary information from flagging
  # This function was written from the 9th of June 2022. For questions, please email James
# at jbdorey[at]me.com


#' Loads, appends, and saves occurrence flag data 
#' 
#' This function is used to save the flag data for your occurrence data as you run the BeeBDC script. 
#' It will read and append existing files, if asked to. Your flags should also be saved in the occurrence 
#' file itself automatically.
#'
#' @param data A data frame or tibble. Occurrence records as input.
#' @param outPath A character path. Where the file should be saved.
#' @param fileName Character. The name of the file to be saved
#' @param idColumns A character vector. The names of the columns that are to be kept along with the
#' flag columns. These columns should be useful for identifying unique records with flags.
#' Default = c("database_id", "id", "catalogNumber", "occurrenceID", "dataSource").
#' @param append Logical. If TRUE, this will find and append an existing file generated by this function.
#' @param printSummary Logical. If TRUE, print a [summary()] of all filter columns - i.e. those which 
#' tidyselect::starts_with(".")
#'
#' @return Saves a file with id and flag columns and returns this as an object.
#' @export
#' 
#' @importFrom dplyr %>%
#'
#' @examples
#' # Load the example data
#' data("beesFlagged")
#' 
#'   # Run the function
#'   OutPath_Report <- tempdir()
#' flagFile <- flagRecorder(
#'   data = beesFlagged,
#'   outPath = paste(OutPath_Report, sep =""),
#'   fileName = paste0("flagsRecorded_", Sys.Date(), ".csv"),
#'   # These are the columns that will be kept along with the flags
#'   idColumns = c("database_id", "id", "catalogNumber", "occurrenceID", "dataSource"),
#'   # TRUE if you want to find a file from a previous part of the script to append to
#'   append = FALSE)

 
flagRecorder <- function(
    data = NULL,
    outPath = NULL,
    fileName = NULL,
    idColumns = c("database_id", "id", "catalogNumber", "occurrenceID", "dataSource"),
    append = NULL,
    printSummary = FALSE){
  # locally bind variables to the function
  database_id <- . <- rowSum <- .summary <- NULL
  
    #### 0.0 Prep ####
      ##### 0.1 Packages ####
  requireNamespace("dplyr")
  requireNamespace("tidyselect")
  requireNamespace("lubridate")
  requireNamespace("readr")

      ##### 0.2 Warnings ####
  if(is.null(data)){
    warning(" - Please provide a dataset.")
  }
  if(is.null(outPath)){
    warning(" - Please provide an outPath to where the bdc folders are dataset.")
  }
  if(is.null(idColumns)){
    warning(paste(" - No ID columns were selected! We will keep the following id columns intact:\n",
                  "database_id, id, catalogNumber, and occurrenceID.",
            sep = ""))
    idColumns = c("database_id", "id", "catalogNumber", "occurrenceID")
  }
  if(is.null(append)){
    warning(" - Please provide a append argument - TRUE (find existing file) or FALSE (start from scratch).")
  }
  
  
    ##### 0.3 Data in ####
      ###### a. Occurrence dataset ####
  # Select the columns from the input occurrence dataset
  data <- data %>%
    dplyr::select( dplyr::all_of(idColumns), tidyselect::starts_with(".")) 
  
      ###### b. Existing data ####
  if(append == TRUE){
    # Find an existing file
  flagPath <- fileFinder(path = outPath, fileName = "flagsRecorded_")
    # Read it in
  flagColumns <- readr::read_csv(flagPath)
    # Find the new columns that need adding in
  newColumns <- setdiff(colnames(data), colnames(flagColumns))
    # Merge the new columns to the data tibble
  data <- flagColumns %>%
    dplyr::left_join(
        # Select only the new columns to add, from the new tibble.
      dplyr::select(data, c(database_id, dplyr::all_of(newColumns))),
        # Merge by database_id
      by = "database_id", keep = FALSE)
    # Remove the spent dataframe
  rm(flagColumns)
  }
  
  
    #### 1.0 Yes .summary ####
  if(".summary" %in% colnames(data)){
    # Update .summary column
  summaryCol <- data  %>% 
    # Select all columns starting with "."
    dplyr::select(tidyselect::starts_with(".")) %>% 
    # Delete the summary column if it's there
    dplyr::select(!tidyselect::starts_with(".summary")) %>%
    # Make FALSE == 1 and TRUE == 0
    dplyr::mutate_if(is.logical, ~abs(as.numeric(.) - 1)) %>%
    # IF rowSum > 0 then there is at least one flag
    dplyr::mutate(rowSum = rowSums(., na.rm = TRUE)) %>%
    # Add the .summary column
    dplyr::mutate(.summary = dplyr::if_else(rowSum > 0,
                                            FALSE, TRUE)) %>%
    dplyr::select(.summary) 
    # Add this column in
  data <- data %>%
    dplyr::mutate(.summary = summaryCol$.summary)
  # User output
  message(" - .summary column detected. This will be over-written.")
  }
  
  #### 2.0 No .summary ####
  if(!".summary" %in% colnames(data)){
    # Update .summary column
    summaryCol <- data  %>% 
      # Select al lcolumns starting with "."
      dplyr::select(tidyselect::starts_with(".")) %>% 
      # Delete the summary column if it's there
      dplyr::select(!tidyselect::starts_with(".summary")) %>%
      # Make FALSE == 1 and TRUE == 0
      dplyr::mutate_if(is.logical, ~abs(as.numeric(.) - 1)) %>%
      # IF rowSum > 0 then there is at least one flag
      dplyr::mutate(rowSum = rowSums(., na.rm = TRUE)) %>%
      # Add the .summary column
      dplyr::mutate(.summary = dplyr::if_else(rowSum > 0,
                                              FALSE, TRUE)) %>%
      dplyr::select(.summary) 
    # Add this column in
    data <- data %>%
      dplyr::mutate(.summary = summaryCol$.summary)
      # User output
    message(" - NO .summary column detected. This will added to the data.")
  }
  
  #### 3.0 Save ####
    # Save this information as the csv flagsRecorded_DATE.csv
  readr::write_excel_csv(data,
                   paste0(outPath, "/", fileName, sep = ""))
    # User output
  message(paste(
    " - Data saved to ", paste0(outPath, "/", fileName, sep = ""), 
    sep = ""))
  
    # User output
  writeLines(paste(
    " - Selected ", ncol(data), " columns. These include:\n",
    paste(colnames(data)[1:ncol(data)-1], collapse = ", "),
    ", and ", paste(colnames(data)[ncol(data)]),
    sep = ""
  ))
  
  # Print summary if requested
  if(printSummary == TRUE){
      summary(dplyr::select(data, tidyselect::starts_with(".")))
  }
  
  # Return this file
  return(data)
} # END flagRecorder