R/procFulcrum.R

Defines functions procFulcrum

Documented in procFulcrum

#' procFulcrum
#'
#' \code{procFulcrum} processes raw Fulcrum data read into R with the \code{readFulcrum} function.
#' Processing consists of dropping unused variables, renaming variables for joining, and adding flags to collection data.
#'
#' @param data A list of dataframes generated by the \code{readFulcrum} function.
#' @return A list of up to five processed dataframes.
#' \tabular{ll}{
#' field_sampling_proc \tab a processed dataframe from the nematode_field_sampling.csv\cr
#' field_field_sampling_sample_photo_proc \tab a processed dataframe from the nematode_field_sampling_sample_photo.csv\cr
#' isolation_proc \tab a processed dataframe from the nematode_isolation.csv\cr
#' isolation_s_labeled_plates_proc \tab a processed dataframe from the nematode_isolation_s_labeled_plates.csv\cr
#' nematode_isolation_photos_proc \tab a processed dataframe from the nematode_isolation_photos.csv\cr
#' }
#' @import dplyr
#' @import lubridate
#' @import stringr
#' @importFrom runner streak_run
#' @export

procFulcrum <- function(data) {
  # find names of data
  data_names <- names(data)

  # make processed data list
  proc_data <- list()

  # process field_sampling
  if(TRUE %in% (stringr::str_detect(data_names, pattern = "_field_sampling$"))) {
    message(glue::glue("Processing {stringr::str_subset(data_names, pattern = '_field_sampling$')}"))

    field_sampling_proc <- data[[stringr::str_which(data_names, pattern = "_field_sampling$")]] %>%
      dplyr::mutate(c_label = stringr::str_to_upper(c_label)) %>%
      # name created_by to specify who picked up the sample
      dplyr::rename(collection_by = created_by,
                    collection_fulcrum_latitude = latitude,
                    collection_fulcrum_longitude = longitude,
                    fulcrum_altitude = gps_altitude,
                    collection_local_time = time) %>%
      dplyr::select(-updated_at,
                    -system_created_at,
                    -system_updated_at,
                    -date,
                    -sample_photo_caption, # not needed here, can bring in from photo export
                    -sample_photo_url,
                    -updated_by, -version, -status, -assigned_to, -geometry) %>% # make custom urls later with function
      # this is UTC time (very important if you want to convert to local time)
      dplyr::mutate(collection_datetime_UTC = lubridate::ymd_hms(created_at, tz = "UTC")) %>%
      # again this is UTC date (very important if you want to convert to local date)
      dplyr::mutate(collection_date_UTC = lubridate::date(created_at)) %>%
      dplyr::select(-created_at) %>%
      # Flag Fahrenheit observations and fix in proc
      dplyr::mutate(flag_substrate_temperature = ifelse(substrate_temperature > 40, TRUE, FALSE),
                    proc_substrate_temperature = ifelse(substrate_temperature > 40,
                                                        FtoC(substrate_temperature),
                                                        substrate_temperature)) %>%
      # Rename sub_temp with raw prefix
      dplyr::rename(raw_substrate_temperature = substrate_temperature) %>%
      # Fix ambient temp F to C
      dplyr::mutate(flag_ambient_temperature = ifelse(ambient_temperature_c > 40, TRUE, FALSE),
                    proc_ambient_temperature = ifelse(ambient_temperature_c > 40,
                                                      FtoC(ambient_temperature_c),
                                                      ambient_temperature_c)) %>%
      # Rename ambient_temp with raw prefix
      dplyr::rename(raw_ambient_temperature = ambient_temperature_c) %>%
      # force ambient temp to numeric
      dplyr::mutate(raw_ambient_temperature = as.numeric(raw_ambient_temperature)) %>%
      # add flags for runs of temperature data
      dplyr::arrange(collection_datetime_UTC) %>%
      dplyr::mutate(flag_ambient_temperature_run = ifelse(ambient_humidity == dplyr::lag(ambient_humidity) &
                                                            raw_ambient_temperature == dplyr::lag(raw_ambient_temperature) & gridsect == "no", TRUE, FALSE),
                    run_length = runner::streak_run(flag_ambient_temperature_run),
                    flag_ambient_temperature_run = dplyr::case_when((run_length >= 4 & flag_ambient_temperature_run == TRUE)  ~ T,
                                                              (dplyr::lead(run_length, n = 3L) >= 4 & flag_ambient_temperature_run == TRUE) ~ T,
                                                              (dplyr::lead(run_length, n = 2L) >= 4 & flag_ambient_temperature_run == TRUE) ~ T,
                                                              (dplyr::lead(run_length, n = 1L) >= 4 & flag_ambient_temperature_run == TRUE) ~ T,
                                                              TRUE ~ F)) %>%
      dplyr::select(-run_length) %>%
      # flag duplicated C-labels
      dplyr::group_by(c_label) %>%
      dplyr::mutate(flag_duplicated_c_label_field_sampling = ifelse(dplyr::n() > 1, TRUE, FALSE)) %>%
      dplyr::ungroup() %>%
      # looks for commas in the sample_photo name
      dplyr::mutate(flag_unusual_sample_photo_num = ifelse(is.na(stringr::str_count(sample_photo, pattern = ",")) |
                                                             stringr::str_count(sample_photo, pattern = ",") != 0, TRUE, FALSE)) %>%
      # break apart multiple sample photos. This takes the first sample photo and warns if additional photos are discarded
      tidyr::separate(col = sample_photo, into = c("sample_photo1", "sample_photo2", "sample_photo3"), sep = ",", extra = "drop", fill = "right") %>%
      # force gridsect variables to correct class b/c they will default to logical if no gridsects are present
      dplyr::mutate(gridsect_index = as.character(gridsect_index),
                    grid_sect_direction = as.character(grid_sect_direction),
                    gridsect_radius = as.character(gridsect_radius))

    # add to processed list
    proc_data["field_sampling_proc"] <- list(field_sampling_proc)
  }


  # Process field_sampling_sample_photo
    if(TRUE %in% (stringr::str_detect(data_names, pattern = "_field_sampling_sample_photo"))) {
    message(glue::glue("Processing {stringr::str_subset(data_names, pattern = '_field_sampling_sample_photo')}"))

    field_sampling_sample_photo_proc <- data[[stringr::str_which(data_names, pattern = "_field_sampling_sample_photo")]] %>%
      dplyr::group_by(fulcrum_parent_id) %>% # group to find best precision among photos
      dplyr::arrange(exif_gps_dop) %>%
      dplyr::ungroup() %>%
      dplyr::distinct(fulcrum_parent_id, .keep_all = T) %>%
      dplyr::mutate(best_photo_gps_dop_logical = TRUE) %>%
      dplyr::select(fulcrum_id, exif_gps_latitude, exif_gps_longitude, exif_gps_altitude, best_photo_gps_dop_logical, best_photo_gps_dop = exif_gps_dop, best_photo_caption = caption)

    # add to list
    proc_data["field_sampling_sample_photo_proc"] <- list(field_sampling_sample_photo_proc)
  }

  # Process isolation
  if(TRUE %in% (stringr::str_detect(data_names, pattern = "isolation$"))) {
    message(glue::glue("Processing {stringr::str_subset(data_names, pattern = 'isolation$')}"))

    isolation_proc <- data[[stringr::str_which(data_names, pattern = "isolation$")]] %>%
      dplyr::group_by(c_label) %>%
      dplyr::mutate(flag_duplicated_isolation_for_c_label = ifelse(dplyr::n() > 1, TRUE, FALSE)) %>% # could use count here without grouping?
      dplyr::ungroup() %>%
      # this is UTC time (very important if you want to convert to local time)
      dplyr::mutate(isolation_datetime_UTC = lubridate::ymd_hms(created_at, tz = "UTC")) %>%
      # again this is UTC date (very important if you want to convert to local date)
      dplyr::mutate(isolation_date_UTC = lubridate::date(created_at)) %>%
      dplyr::rename(c_label_id = c_label,
                    isolation_id = fulcrum_id,
                    isolation_by = created_by,
                    isolation_local_time = time,
                    isolation_latitude = latitude,
                    isolation_longitude = longitude,
                    isolation_photo = photos) %>% # TAC
      dplyr::mutate(flag_unusual_isolation_photo_num = ifelse(is.na(stringr::str_count(isolation_photo, pattern = ",")) |
                                                             stringr::str_count(isolation_photo, pattern = ",") != 0, TRUE, FALSE)) %>%
      # break apart multiple isolation photos. This takes the first isolation photo and warns if additional photos are discarded
      tidyr::separate(col = isolation_photo, into = c("isolation_photo"), sep = ",", extra = "drop", fill = "right") %>%
      dplyr::select(-created_at, -project, -geometry, -photos_caption, -photos_url, -gps_altitude, -gps_horizontal_accuracy,
                    -gps_vertical_accuracy, -gps_speed, -gps_course) # TAC , -photos

    # add to list
    proc_data["isolation_proc"] <- list(isolation_proc)
  }

  if(TRUE %in% (stringr::str_detect(data_names, pattern = "isolation_s_labeled_plates"))) {
    message(glue::glue("Processing {stringr::str_subset(data_names, pattern = 'isolation_s_labeled_plates')}"))
    isolation_s_labeled_plates_proc <- data[[stringr::str_which(data_names, pattern = "isolation_s_labeled_plates")]] %>%
      dplyr::select(fulcrum_parent_id, s_label) %>%
      # flag duplicated S-labels
      dplyr::group_by(s_label) %>%
      dplyr::mutate(flag_duplicated_s_label_isolation_s_labeled_plates = ifelse(dplyr::n() > 1, TRUE, FALSE),
                    flag_duplicated_s_label_isolation_s_labeled_plates = ifelse(flag_duplicated_s_label_isolation_s_labeled_plates == TRUE &
                                                                                  is.na(s_label), FALSE, flag_duplicated_s_label_isolation_s_labeled_plates)) %>%
      dplyr::ungroup() %>%
      # flag missing S-labels
      dplyr::mutate(flag_missing_s_label_isolation_s_labeled_plates = ifelse(is.na(s_label), TRUE, FALSE))

    # add to list
    proc_data["isolation_s_labeled_plates_proc"] <- list(isolation_s_labeled_plates_proc)
  }

  if(TRUE %in% (stringr::str_detect(data_names, pattern = "isolation_photos$"))) {
    message(glue::glue("Processing {stringr::str_subset(data_names, pattern = 'isolation_photos$')}"))
    # read in isolation photos dataframe
    isolation_photos_proc <- data[[stringr::str_which(data_names, pattern = "isolation_photos$")]]

    # add to list
    proc_data["isolation_photos_proc"] <- list(isolation_photos_proc)
  }

  # return list
  return(proc_data)
}
AndersenLab/easyfulcrum documentation built on Aug. 23, 2023, 2:35 a.m.