poplerr: Popler R Package

Documented in pplr_site_rep pplr_site_rep_plot

#' @name pplr_site_rep
#' @rdname pplr_site_rep
#' 
#' @title Spatial-temporal replication of data sets 
#' 
#' @description Functions to examine the number of temporal replicates 
#' contained within each spatial replication level of a dataset. 
#' \code{pplr_site_rep_plot} plots the temporal replicates available for 
#' each site.
#' \code{pplr_site_rep} produces logical vectors that identify the spatial 
#' replicates with enough temporal replication, or summary tables.
#' 
#' @param input An object of produced by \code{pplr_get_data}. Note that this
#' is not an output from \code{pplr_browse}, as the raw data is required to 
#' calculate the amount of replication.
#' @param freq A number corresponding to the desired annual frequency of 
#' replicates. Studies that are replicated more frequently will be 
#' included in the counts and those that replicated less frequently will be 
#' excluded. 
#' If \code{return_logical = TRUE}, rows that contain information from sites 
#' that are replicated at the desired frequency will have a \code{TRUE} value, 
#' and rows that are not will have a \code{FALSE} value. 
#' Values greater than 1 will select sampling done multiple times per year. 
#' For example, \code{freq = 2} indicates a desired sampling frequency of 6
#' months. Conversely, \code{freq = 0.5} indicates a desired sampling done 
#' once every 2 years.
#' @param duration An integer corresponding to the desired number of yearly
#' replicates. Rows containing site information from sites with more 
#' replication will be included, while those with less will be excluded.
#' @param rep_level An integer corresponding to the level of spatial 
#' replication over which verify yearly temporal replication. Values between 1 and 5 
#' are possible (though higher levels may not be present for some datasets). 
#' Higher values correspond to higher levels of spatial nestedness. 
#' The default value of \code{rep_level = 1} corresponds to sites.
#' @param return_logical logical; if \code{TRUE}, the function returns a logical
#' vector. This vector can be used to subset the dataset. If \code{FALSE}, the 
#' function returns a summary table of class \code{tbl}. This table shows, in 
#' variable \code{number_of_samples}, how many temporal replicates per year 
#' are contained by each spatial replicate. Default is \code{TRUE}.
#' 
#'
#' @return \code{pplr_site_rep_plot}: \code{input} object (invisibly) or a
#' \code{ggplot2} object. Use \code{return_plot} to control.
#' 
#' \code{pplr_site_rep}: A \code{tbl} or a logical vector of length 
#' \code{dim(input)[1]}. Use \code{return_logical} to control.
#' 
#' @details \code{pplr_site_rep_plot} produces a scatterplot showing the sites 
#' (\code{spatial_replication_level_1}) and years for which data is available.
#' 
#' \code{pplr_site_rep} works with any level of spatial replication and produces
#' either a summary table of temporal replication or a logical vector that can be used 
#' to subset a data set based on the desired frequency and length of time.
#' 
#' 
#' @examples 
#' \dontrun{
#' 
#' library(ggplot2)
#' library(dplyr)
#' 
#' # produce logical vector and subset using it. This can also be piped into a 
#' # the plotting function for visiualization
#' 
#' good_studies <- pplr_get_data(lterid == 'SEV') %>%
#'                    .[pplr_site_rep(input = .,
#'                                    duration = 12, 
#'                                    rep_level = 3), ] %>%
#'                    pplr_site_rep_plot()
#'                                        
#' 
#' # Or, make a neat summary table and decide where to go from there
#' SEV <- pplr_get_data(lterid == 'SEV')
#' 
#' rep_table <- pplr_site_rep(input = SEV,
#'                            freq = 0.5,
#'                            duration = 12,
#'                            return_logical = FALSE)
#'  
#' # pplr_site_rep_plot ---------------
#'                                                      
#' # create an unmodified figure
#' BNZ <- pplr_get_data(lterid == 'BNZ')
#' 
#' pplr_site_rep_plot(BNZ)
#' 
#' # Return the figure instead of the data for subsequent modification
#' Antarctica <- pplr_get_data(lterid == 'PAL')
#' 
#' pplr_site_rep_plot(Antarctica,
#'               return_plot = TRUE) + 
#'    ggtitle("Penguins Rock!")
#'    
#' # Use within pipes. Cannot return and modify the figure this way.
#' pplr_get_data(lterid == 'SEV') %>% 
#'   pplr_site_rep_plot(return_plot = FALSE) %>%
#'   pplr_report_metadata()
#' }
#' 
#' @importFrom dplyr bind_cols group_by summarise n ungroup
#' @importFrom magrittr %>%
#' @importFrom rlang quo syms .data !! !!! 
#' @importFrom stringr str_split
#' 
#' @export

pplr_site_rep <- function(input, 
                          freq = 1, 
                          duration = 10,
                          rep_level = 1,
                          return_logical = TRUE) {
  
  # test that input is indeed a 'get_data' object
  if(!inherits(input, 'get_data')) {
    stop('"input" must be an object of class "get_data".')
  }
  
  # test that rep_level is an integer between 1 and 5
  if(rep_level < 1 & rep_level > 5) {
    stop('"rep_level" must be an integer between 1 and 5.')
  }

  # create symbols for rep_levels
  levels <- seq_len(rep_level)
  levels <- paste0('spatial_replication_level_', levels, sep = "")
  
  # test that the levels specified by user are actually available in dataset
  if(!all(levels %in% names(input))) {
    stop('requested "rep_level" is not available for this data set.\n',
         'Please try again with a lower value for "rep_level".',
         call. = FALSE)
  }
  
  group_col <- rlang::quo(group_col)
  
  # New data frame for filtering with single spatial rep group_column
  new_data <- dplyr::bind_cols(input, paste2(input, levels, group_col))
  

  group <- group_col
  year <- rlang::quo(year)
  
  # Remove years based on frequency
  
  # first, transform data to correct format and then figure out
  # what the user actually wants back
  
  
  # if there is daily data, capture it
  
  dates <- c('year', 'day', 'month')
  
  if(!'day' %in% names(new_data)){
    # otherwise, create dummy day variable
    new_data$day <- 1
  }
  
  # Same for months. If available, use it
  if(!'month' %in% names(new_data)) {
    new_data$month <- 6
  }
  
  # Some of the data sets will have some days entered, but will be missing others.
  # Consequently, they generate NAs when lubridate tries to parse them and are dropped
  # from calculations
  
  NA_months <- NA_days <- integer(0)
  
  if(any(is.na(new_data$month))) {
    warning('Some, but not all, "month" entries are missing and ',
            'will not be included in the calculations of replication ',
            '"duration" or "frequency".',
            call. = FALSE)
    NA_months <- which(is.na(new_data$month))
  }
  
  if(any(is.na(new_data$day))) {
    warning('Some, but not all, "day" entries are missing and ',
            'will not be included in the calculations of replication ',
            '"duration" or "frequency".',
            call. = FALSE)
    NA_days <- which(is.na(new_data$day))
    
  }
  
  sample_date <- rlang::quo(sample_date)
  
  # create formatted sample date
  new_data <- dplyr::bind_cols(new_data, paste2(new_data,
                                                dates,
                                                sample_date))
  
  new_data$sample_date <- suppressWarnings(as.Date(new_data$sample_date,"%Y-%d-%m"))
  
  sample_date <- rlang::quo(sample_date)
  
  
  summary <- new_data %>%
    dplyr::group_by(!! group, !! year) %>%
    dplyr::summarise(n_dates = length(unique(!! sample_date)))
  
  
  # perform frequency based filtering 
  if(freq <= 1) {
    
    # if the frequency is greater than 1 year, then it gets a little tricky.
    # first, compute the differences between each year at each site, and then 
    # create an index of sites meeting that criteria for frequency based 
    # filtering
    
    year_diffs <- rle(diff(summary$year))
    
    freq_idx <- which(year_diffs$values <= 1/freq & 
                      year_diffs$values >= 0)
    
    sites <- character(length(freq_idx))
    
    for(i in seq_len(length(freq_idx))) {
      idx <- sum(year_diffs$lengths[seq_len(freq_idx[i])])
      sites[i] <- summary$group_col[idx]
    }
    sites <- unique(sites)
    
    summary <- dplyr::filter(summary, .data$group_col %in% sites)
    
  } else {
    summary <- dplyr::filter(summary, .data$n_dates >= freq)
  }
  
  if(dim(summary)[1] < 1) {
    stop('No sites meet the frequency requirements for this combination of\n',
         '"freq" and "rep_level". Please use different settings.')
  }
  
  
  # next, compute which years are sampled consecutively. rle computes the number
  # times in a row that a value appears. Thus, if a difference of 1 appears more
  # than the requested duration, then that is what we want
  consecutive_years <- rle(diff(summary$year))
  
  if(freq >= 1){
    duration_idx <- which(consecutive_years$lengths >= duration &
                          consecutive_years$values == 1)
  } else {
    duration_idx <- which(consecutive_years$lengths >= duration &
                          consecutive_years$values <= 1/freq & 
                          consecutive_years$values > 0)
    
  }
  if(length(duration_idx) < 1) {
    stop('Chosen "duration" is too long for this data set.\n',
         'Please choose a shorter "duration".')
  }
  
  # rle doesn't really return what we want though. The actual group_col index
  # we need are the row numbers for each site replicate. Examine the output of 
  # rle to see why (length(consecutive_years[[1]] != length(summary$year)))
  # so, quick for loop to find the appropriate indices. 
  
  true_idx <- integer(length(duration_idx))
  site_ids <- character(length(duration_idx))
  
  for(i in seq_len(length(duration_idx))) {
    idx <- duration_idx[i]
    
    true_idx[i] <- sum(consecutive_years$lengths[seq_len(idx)])
    
    site_ids[i] <- summary$group_col[true_idx[i]]
    
  }
  
  
  
  
  # Now, depending on user selected output, create logical vector for subsetting,
  # or just return the correctly formatted table
  
  if(return_logical) {
    out <- new_data$group_col %in% site_ids
    
    out[c(NA_days, NA_months)] <- FALSE
    
  } else {
    
    summary <- summary %>%
      dplyr::filter(!! group %in% site_ids) %>%
      dplyr::group_by(!! group, !! year) %>%
      dplyr::summarise(number_of_samples = sum(.data$n_dates,
                                               na.rm = TRUE)) %>%
      dplyr::ungroup()
    
    
    # re-build spatial_replication_level_x columns
    for(i in seq_len(length(levels))) {
      summary[ ,levels[i]] <- stringr::str_split(summary$group_col,
                                                 '-',
                                                 simplify = TRUE)[ ,i]
      
      
    }
    
    out <- dplyr::select(summary,
                         !!! rlang::syms(levels), 
                         .data$year,
                         .data$number_of_samples)
    
    
  }
  
  return(out)
}


#' @noRd
#' @importFrom rlang quo_name := !!
#' @importFrom purrr pmap_chr
#' @importFrom dplyr tibble
# helper to work in bind_cols
paste2 <- function(data, cols, name) {
  name <- rlang::quo_name(name)
  data <- data[ ,cols, drop = FALSE]
  listdf <- list()
  
  for(i in seq_len(dim(data)[2])) {
    listdf[[i]] <- data[ ,i]
  }
  
  out <- purrr::pmap_chr(listdf, paste, sep = '-')
  
  out <- dplyr::tibble(!! name := out)

  return(out)
}


#' @name pplr_site_rep_plot
#' @rdname pplr_site_rep
#' 
#' @inheritParams pplr_site_rep
#' @param return_plot A logical indicating whether to return a copy of the 
#' \code{input} data or the \code{ggplot} object created by the function. Use
#' \code{TRUE} to return the \code{ggplot} object for subsequent modification.
#' Use \code{FALSE} to return an invisible copy of the \code{input} object 
#' (useful for piping). Default is \code{FALSE}.
#' 
#' @importFrom ggplot2 ggplot aes geom_point theme_bw xlab ylab
#' @importFrom rlang quo !! 
#' 
#' @export

pplr_site_rep_plot <- function(input, 
                               return_plot = FALSE){
  
  
  if(inherits(input, 'browse')) {
    stop("'pplr_site_rep()' only works with outputs from 'pplr_get_data()'.",
         call. = FALSE)
  }
  
  
  x <- rlang::quo(input$year)
  y <- rlang::quo(input$spatial_replication_level_1)
  
  rep_plot <- ggplot2::ggplot(data = input,
                              ggplot2::aes(x = !! x,
                                           y = !! y)) +
    ggplot2::geom_point(size = 5) +
    ggplot2::theme_bw() + 
    ggplot2::xlab("Year with available data") + 
    ggplot2::ylab("Site")
  
  print(rep_plot)
  
  if(return_plot) {
    return(rep_plot)
  } else {
    invisible(input)
  }
}