R/summ-residence_index.r

Defines functions aggregate_total_no_overlap aggregate_total_with_overlap total_diff_days interval_count residence_index

Documented in aggregate_total_no_overlap aggregate_total_with_overlap interval_count residence_index total_diff_days

#' Generate the residence index from a set of detections
#'
#' This residence index tool will take condensed detection event data (from
#' \code{\link{detection_events}} and caculate the residence index for each
#' location. The information passed to the function is
#' what is used to calculate the residence index, make sure you are only passing
#' the data you want taken into consideration for the residence index (i.e.
#' species, stations, tags, etc.).
#'
#' @references 
#' Kessel, S.T., Hussey, N.E., Crawford, R.E., Yurkowski, D.J., O'Neill, C.V.
#' and Fisk, A.T., 2016. Distinct patterns of Arctic cod (\emph{Boreogadus
#' saida}) presence and absence in a shallow high Arctic embayment, revealed
#' across open-water and ice-covered periods through acoustic telemetry. Polar
#' Biology, 39(6), pp.1057-1068.
#' \url{https://www.researchgate.net/publication/279269147}
#'
#' @param detections A data.frame from the \code{\link{detection_events}}
#'   function.
#'   
#' @param calculation_method A character string with the calculation method
#'   using one of the following: \code{kessel}, \code{time_interval}, 
#'   \code{timedelta}, \code{aggregate_with_overlap}, or 
#'   \code{aggregate_no_overlap}.
#'   
#' @param locations An optional data frame that identifies all unique 
#' locations where RI will be calculated. Three columns required:
#' \describe{
#'   \item{location}{Character string with unique location identifier.}
#'   \item{mean_longitude}{Location longitude (for mapping).}
#'   \item{mean_latitude}{Location latitude (for mapping).}
#' }
#' 
#' If \code{locations = NULL} (default value) then RI will only be 
#' calculated at locations present in \code{detections$location}.
#'   
#' @param group_col Optional character string (can be multiple) that identifies 
#' additional grouping variables for RI calculations. The default value 
#' (\code{group_col = "animal_id"}) will calculate and return RI for each 
#'  animal at each location (i.e., for each unique combination of 
#'  \code{location} and \code{animal_id}. If \code{group_col = NULL} then 
#'  RI will be calculated by location only (will not account for animal or 
#'  any other variable).
#'   
#' @param time_interval_size Character string with size of the time interval 
#' used when \code{calculation_method = "time_interval"}. This is passed to 
#' \link[base]{seq.Date}'s \code{by} argument, so must meet the requirements of 
#' that argument for that function (e.g., "1 day", "4 hours", etc.). Default is 
#' \code{"1 day"}.
#' 
#' @param groupwise_total Logical that determines how the denominator is 
#' calculated in RI. If FALSE (default) then the denominator represents the 
#' total number of time intervals or time (depending on calculation method) 
#' among all records. Otherwise (if FALSE), the denominator 
#' represents the total number of time intervals or time within each 
#' group level (e.g., for each animal if \code{group_col = "animal_id"}.
#'
#' @details The \strong{kessel} method converts both the \code{first_detection}
#' and \code{last_detection} columns into a date with no hours, minutes, or
#' seconds. Next it creates a list of the unique days where a detection was
#' seen. The size of the list is returned as the total number of days as an
#' integer. This calculation is used to determine the total number of distinct
#' days (T) and the total number of distinct days per location (S). Possible
#' rounding error may occur as a detection on 2016-01-01 23:59:59 and a
#' detection on 2016-01-02 00:00:01 would be counted as two days when it is
#' really 2-3 seconds.
#'
#' \deqn{ RI = S/T}
#' \deqn{ RI = Residence Index}
#' \deqn{S = Distinct number of days detected at the location}
#' \deqn{T = Distinct number of days detected at any location}
#'
#' @details The \strong{time_interval} calculation method determines the
#'   number of time intervals (size determined by \code{time_interval_size}
#'   argument) in which detections occurred at each \code{location} and as a 
#'   fraction of the number of time intervals in which detections occurred 
#'   among all sites. For each location, residency index (RI) is calculated:
#'
#' \deqn{ RI = L/T}
#' \deqn{ RI = Residence Index}
#' \deqn{L = Distinct number of time intervals in which detection observed at 
#'           this location}
#' \deqn{T = Distinct number of time intervals in which detection observed at 
#'           any location}
#'           
#'   For consistency with other \code{calculation_method}s, 
#'   the L and T are not reported, but are converted cumulative time covered in 
#'   days and reported in columns \code{days_detected} and \code{total_days}.
#' 
#'
#' @details The \strong{timedelta} calculation method determines the first
#' detection and the last detection of all detections. The time difference is
#' then taken as the values to be used in calculating the residence index. The
#' timedelta for each station is divided by the timedelta of the array to
#' determine the residence index.
#'
#' \deqn{
#' RI = Delta S/Delta T}
#' 
#' \deqn{RI = Residence Index}
#' 
#' \deqn{Delta S = Last detection time at the location - First detection time at the location}
#' 
#' \deqn{Delta T = Last detection time at any location - First detection time at any location}
#' 
#'
#' @details
#' The \strong{aggregate_with_overlap} calculation method takes the length of time of each
#' detection and sums them together. A total is returned. The sum for each location
#' is then divided by the sum among all locations to determine the residence index.
#'
#' \deqn{
#' RI = AwOS/AwOT}
#' 
#' \deqn{RI = Residence Index}
#' 
#' \deqn{AwOS = Sum of length of time of each detection at the location}
#' 
#' \deqn{AwOT = Sum of length of time of each detection among all locations}
#' 
#'
#' @details 
#' The \strong{aggregate_no_overlap} calculation method takes the length of time of each
#' detection and sums them together. However, any overlap in time between one or
#' more detections is excluded from the sum. For example, if the first detection
#' is from \code{2016-01-01 01:02:43} to \code{2016-01-01 01:10:12} and the second
#' detection is from \code{2016-01-01 01:09:01 }to \code{2016-01-01 01:12:43}, then the
#' sum of those two detections would be 10 minutes. A total is returned once all
#' detections of been added without overlap. The sum for each location is then
#' divided by the sum among all locations to determine the residence index.
#'
#' \deqn{
#' RI = AnOS/AnOT}
#' 
#' \deqn{RI = Residence Index}
#' 
#' \deqn{AnOS = Sum of length of time of each detection at the location, excluding any overlap}
#' 
#' \deqn{AnOT = Sum of length of time of each detection among all locations, excluding any overlap}
#' 
#' 
#' @examples
#' #get path to example detection file
#' det_file <- system.file("extdata", "walleye_detections.csv",
#'                          package = "glatos")
#' det <- read_glatos_detections(det_file)
#' detection_events <- glatos::detection_events(det)
#' rik_data <- glatos::residence_index(detection_events, 
#'                                     calculation_method = 'kessel')
#' rit_data <- glatos::residence_index(detection_events, 
#'                                     calculation_method = 'time_interval')
#' rit_data <- glatos::residence_index(detection_events, 
#'                                     calculation_method = 'timedelta')
#' riawo_data <- glatos::residence_index(detection_events, 
#'                              calculation_method = 'aggregate_with_overlap')
#' riano_data <- glatos::residence_index(detection_events, 
#'                              calculation_method = 'aggregate_no_overlap')
#'
#' @return A data.frame of days_detected, residency_index, location,
#' mean_latitude, mean_longitude
#' 
#' 
#' @author A. Nunes, \email{anunes@dal.ca}
#'
#' @importFrom dplyr count distinct select
#' @export
residence_index <- function(detections, calculation_method='kessel', 
  locations = NULL, group_col = "animal_id", time_interval_size = "1 day", 
  groupwise_total = TRUE) {
  
  #set to NULL if NA
  if(!is.null(group_col)) if(is.na(group_col)) group_col <- NULL 
  if(!is.null(locations)) if(all(is.na(locations))) locations <- NULL 
  
  if(!is.null(group_col)){
    if((group_col == 'animal_id') & (calculation_method == 'aggregate_with_overlap')) {
      message("NOTE: Becuase an individual animal cannot overlap with itself, this will produce the same output as aggregate_no_overlap when animal_id is passed to group_col")
    }
  }
  
  #get locations from detections if not given
  if(is.null(locations)){
    locs <- dplyr::select(detections, location, mean_latitude, mean_longitude)
  }
  if(!is.null(locations) & inherits(locations, "data.frame")){
    
    # Check that the required columns appear in the detections data frame
    req_cols <- c("location", "mean_latitude", "mean_longitude")
    missingCols <- setdiff(req_cols, names(locations))
    if (length(missingCols) > 0){
      stop(paste0("det is missing the following ",
        "column(s):\n", paste0("       '", missingCols, "'", collapse="\n")), 
        call. = FALSE)
    }
    
    locs <- dplyr::select(locations, location, mean_latitude, mean_longitude)
    
  }
  
  locs <- dplyr::distinct(locs)    

  #summarize lat and lon for each unique location
  locs <- dplyr::group_by(locs, location)
  locs <- dplyr::summarise(locs, 
            mean_latitude = mean(mean_latitude, na.rm = TRUE),
            mean_longitude = mean(mean_longitude, na.rm = TRUE))
    
  #insert 0 for missing group levels (e.g., non-detection at a location)
  
  #all possible combinations of locations and grp_by columns
  if(!is.null(group_col)){
  group_levels <- merge(data.frame(location = unique(locs$location), 
                        stringsAsFactors = FALSE), 
                      dplyr::distinct(dplyr::select(detections, group_col)))
  } else {
    group_levels <- data.frame(location = unique(locs$location), 
                      stringsAsFactors = FALSE)
  }
  
  #summarize RI for each group
  
  #numerator
  group_cols <- c("location", group_col)
  
  detections <- dplyr::group_by(detections, .dots = group_cols)
  
  ri <- dplyr::do(detections, 
                    data.frame(days_detected = get_days(., calculation_method, 
                                                        time_interval_size)))   
  
  #add missing combinations (non-detection)
  ri <- dplyr::left_join(group_levels, ri, by = group_cols)
  
  ri <- dplyr::mutate(ri, 
          days_detected = ifelse(is.na(days_detected), 0, days_detected))
  
  #divisor
  #set grouping for divisor, calculate total days
  if(groupwise_total == FALSE | is.null(group_col)) {
    detections <- dplyr::ungroup(detections)
    ri$total_days <- get_days(detections, calculation_method, 
                              time_interval_size)
  } else{
    detections <- dplyr::group_by(detections, .dots = group_col)
    ri <- dplyr::left_join(ri,
                           dplyr::do(detections, 
                             data.frame(total_days = 
                                          get_days(., calculation_method, 
                                                   time_interval_size))),
                           by = group_col)
    detections <- dplyr::ungroup(detections)
  }
    
  #calculate RI
  ri <- dplyr::mutate(ri, 
      residency_index = as.double(days_detected) / total_days)

  #add lat and lon
  ri <- dplyr::left_join(ri, locs, by = "location")
  
  out_cols <- c(group_col, c("days_detected", "total_days", "residency_index", 
    "location", "mean_latitude", "mean_longitude"))
  ri <- data.frame(ri)[, out_cols]

  return(ri)
}


#' The function below takes a detection events data frame and determines the 
#' number of time bins in which detections were observed and returns the 
#' cumulative time covered by all bins, in days. Interval (bin) size is 
#' determined by the 'time_interval_size' argument. 
#'
#' For each event (row in detection events data frame), the function 
#' sequences from first_detection to last_detection by time_interval_size, then 
#' counts the number of unique intervals.
#'
#'
#' @param detections - data frame from detection_events (condensed = TRUE)
#' 
#' @param time_interval_size time increment string as in seq.Date 'by' argument
#'
interval_count <- function(detections, time_interval_size) {
  
  #get unique bins in each detection event
  detections <- dplyr::rowwise(detections)
  ints <- dplyr::do(detections, data.frame(
    int = seq(lubridate::floor_date(.$first_detection, time_interval_size), 
              lubridate::floor_date(.$last_detection, time_interval_size),
          by = time_interval_size)))
  
  intcount <- dplyr::n_distinct(ints)
  
  #fraction of day covered by one interval 
  day_fraction <- diff(as.numeric(seq(as.POSIXct("2000-01-01 00:00"), 
                                      by = time_interval_size, 
                                      length.out = 2))) / 86400.0
  
  #cumulative days (not necessarily contiguous)
  day_count <- intcount * day_fraction
  
  return(day_count)
}


#' The function below determines the total days difference.
#'
#' The difference is determined by the minimal first_detection of every
#' detection and the maximum last_detection of every detection. Both are
#' converted into a datetime then subtracted to get a timedelta. The timedelta
#' is converted to seconds and divided by the number of seconds in a day
#' (86400). The function returns a floating point number of days (i.e.
#' 503.76834).
#'
#' @param detections - data frame pulled from the compressed detections CSV
#'
total_diff_days <- function(detections) {
  first <- detections$first_detection[which.min(detections$first_detection)]
  last <- detections$last_detection[which.max(detections$last_detection)]
  total <- as.double(difftime(last, first, units="secs"))/86400.0
  return(total)
}


#' The function below aggregates timedelta of first_detection and last_detection
#' of each detection into a final timedelta then returns a float of the number
#' of days. If the first_detection and last_detection are the same, a timedelta
#' of one second is assumed.
#' 
#' @param detections -data frame pulled from the compressed detections CSV
#'
#' @importFrom dplyr mutate
aggregate_total_with_overlap <- function(detections) {
  detections <- mutate(detections, timedelta = as.double(difftime(last_detection,first_detection, units="secs")))
  detections <- mutate(detections, timedelta = dplyr::recode(detections$timedelta, `0` = 1))
  total <- as.double(sum(detections$timedelta))/86400.0
  return(total)
}


#' The function below aggregates timedelta of first_detection and
#' last_detection, excluding overlap between detections. Any overlap between two
#' detections is converted to a new detection using the earlier first_detection
#' and the latest last_detection. If the first_detection and last_detection are
#' the same, a timedelta of one second is assumed.
#'
#' @param detections - data frame pulled from the compressed detections CSV
#'
#' @importFrom data.table foverlaps
aggregate_total_no_overlap <- function(detections) {
  
  #extract intervals, rename
  ints <- data.table::as.data.table(detections)[ , .(t1 = first_detection, 
    t2 = last_detection)]
  
  #function to combine overlapping intervals
  aggregate_intervals <- function(x){
    data.table::setkey(x, t1, t2)
    
    #get indices of overlapping intervals, including self
    ov_ints <- data.table::foverlaps(x, x, which = TRUE) 
    
    #for each row in x, get min first det  & max lat det against overlaps
    ov_ints <- ov_ints[ , .(t1 = min(x$t1[c(xid, yid)]), 
      t2 = max(x$t2[c(xid, yid)])), 
      by = "xid"]
    
    ov_ints <- unique(ov_ints[, c("t1", "t2")])
    
    return(ov_ints)
  }
  
  
  #recursively apply until no overlaps
  repeat{
    ri <- nrow(ints)
    ints <- aggregate_intervals(ints)
    if(nrow(ints) == ri) break
  }
  
  
  ints[ , tdiff := (as.numeric(t2) - as.numeric(t1))]
  
  total_time <- sum(ints$tdiff) / 86400.0
  
  return(total_time)
  
}


#' Determines which calculation method to use for the residency index.
#'
#' Wrapper method for the calulation methods above.
#'
#' @param dets - data frame pulled from the detection events
#' @param calculation_method - determines which method above will be used to
#'   count total time and location time
#' @param time_interval_size - size of time interval
get_days <- function(dets, calculation_method='kessel', 
                     time_interval_size = "1 day") {
  days <- 0
  if (calculation_method == 'aggregate_with_overlap') {
    days <- glatos:::aggregate_total_with_overlap(dets)
  } else if(calculation_method == 'aggregate_no_overlap') {
    days <- glatos:::aggregate_total_no_overlap(dets)
  } else if(calculation_method == 'timedelta') {
    days <- glatos:::total_diff_days(dets)
  } else if(calculation_method == 'kessel'){
    days <- glatos:::interval_count(dets, time_interval_size = "1 day")
  } else if(calculation_method == 'time_interval'){
    days <- glatos:::interval_count(dets, time_interval_size)
  } else {
      stop("Unsupported 'calculated_method'.")
  }
  return(days)
}
jsta/glatos documentation built on July 11, 2022, 7:01 a.m.