R/format_data.R

Defines functions format_data_flat

Documented in format_data_flat

#' Format data
#'
#' @param data Original point count dataframe with individual species detections and distances in each row; ok to contain multiple species at this stage
#' @param strata Column name in \code{data} specifying the groups for which you want density estimates (e.g., site, vegetation age class, or a combination of the two); defaults to "Transect"; it's ok to have only 1 stratum
#' @param year Optional, defaults to \code{NULL}; if density estimates should be generated by year and \code{strata}, change to column name in \code{data} indicating year
#' @param strata.area Default of 1 produces estimates of the number of individuals per hectare; change to name of column representing actual area of each stratum (in hectares) to instead extrapolate densities and estimate total number of individuals
#' @param sample Column name in \code{data} specifying independent sampling locations within each stratum; defaults to "Point"
#' @param visitnum Column name in \code{data} defining the unique visit (survey) number or ID at each sampling location within each stratum; used to calculate sampling effort at each location; defaults to "Visit"
#' @param weights Optional, defaults to \code{NULL}; Column name in \code{data} to multiply sampling effort by (e.g., percent cover of target vegetation type)
#' @param species Column name in \code{data} containing name or code of species detected; defaults to "Spp"
#' @param count Column name in \code{data} containing the number of individuals for each detection; defaults to "Count"
#' @param dist Column name in \code{data} containing estimated distance to each species detection (in meters); defaults to \code{NULL}
#' @param dist_bin_id Defaults to 'Distance.Bin.ID'; currently only partially supported for protocol VCP25
#' @param flyover Optional, defaults to \code{NULL}; Column name in \code{data} flagging flyover detections for removal
#' @param protocol Column name in \code{data} containing point count protocol ID; defaults to "Protocol"; drops fixed radius protocol FR50 and produces warning if data contains more than one other protocol
#'
#' @details This function is the first step in conducting a distance analysis, formatting point count data into the data structure with column names required by the package \code{Distance}.
#'
#' Default values are intended for work with California Avian Data Center (CADC) outputs, using column names produced by downloading data in "Project Lead" mode, but should be adaptable to other formats.
#'
#' Effort is calculated as the number of unique values in \code{visitnum} for each \code{strata} and optionally \code{year}. Effort may be further weighted by values in \code{weights}. For error checking, the function will print the total number of unique strata and sampling locations identified, as well as the range of the number of visits to each sampling location.
#'
#' The function requires providing either \code{dist}, a numeric column with detection distances (which may represent the midpoint of distance bins), or \code{dist_bin_id}. If only \code{dist_bin_id} is provided, the function will attempt to translate the standard bin IDs provided by CADC to the corresponding numeric midpoint values for the protocol VCP25 only. For all other protocols, the function currently produces an error. All distances are assumed to be in meters and strata area in hectares.
#'
#' Any values in \code{flyover} other than '', ' ', or \code{NA} are assumed to flag birds flying over a sampling location (but not using the sampling location), and will be removed.
#'
#' @return Returns a data frame including columns required by package \code{Distance}, including: \code{Region.Label}, \code{Area}, \code{Sample.Label}, \code{Effort}, and \code{distance}.
#' Also returns \code{species}, \code{count}, \code{protocol}, and any other unused columns remaining in original data.
#'
#' @author Kristen Dybala, \email{kdybala@@pointblue.org}
#'
#' @export
#'
#' @examples
#'
#' \dontrun{fdat = format.data(data=dat, strata='group')}

format_data_flat = function(data, strata='Transect', year=NULL, strata.area=1, sample='Point', visitnum='Visit', weights=NULL,
                       species='Spp', count='Count', dist=NULL, dist_bin_id='Distance.Bin.ID',
                       flyover=NULL, protocol='Protocol')
{

  Region.Label = Sample.Label = NULL
  vars <- names(data)
  fdat <- data

  ## Region.Label
  if (!is.null(year)) { # create separate label for each stratum & year
    fdat$Region.Label <- as.factor(paste0(fdat[, which(colnames(fdat) == strata)], '_', fdat[, which(colnames(fdat) == year)]))
  } else {fdat$Region.Label <- as.factor(fdat[, which(colnames(fdat) == strata)])}
  cat('\nData include',length(unique(fdat$Region.Label)), 'strata:\n', unique(as.character(fdat$Region.Label)))

  ## Area
  if (length(strata.area)>1) {
    stop('"strata.area" should either be a column name or a single numeric value')
  } else if (is.numeric(strata.area)) {
    fdat$Area <- strata.area
  } else {fdat$Area <- fdat[, which(colnames(fdat) == strata.area)]}

  ## Sample.Label
  fdat$Sample.Label <- fdat[, which(colnames(fdat) == sample)]
  cat('\n\nData include', length(unique(fdat$Sample.Label)), 'unique sampling locations:\n', unique(as.character(fdat$Sample.Label)))

  ## Effort
  fdat$visitnum <- fdat[, which(colnames(fdat) == visitnum)]
  effort <- plyr::ddply(fdat, plyr::.(Sample.Label, Region.Label), plyr::summarize, Effort = length(unique(visitnum)))
  fdat <- merge(fdat, effort, by = c('Sample.Label','Region.Label'))
  ## Report range of efforts:
  cat('\n\nEffort ranges from', min(fdat$Effort), 'to', max(fdat$Effort), 'visits per sampling location')

  ## Optional: update effort with additional weights
  if (!is.null(weights)) {
    fdat$weights <- fdat[, which(colnames(fdat) == weights)]
    fdat$Effort <- fdat$Effort * weights
    cat('\nNote: effort further weighted by', weights)
  }

  ## Detections
  fdat$species <- fdat[, which(colnames(fdat) == species)]
  fdat$count <- fdat[, which(colnames(fdat) == count)]
  if (!is.null(dist)) {
    fdat$distance <- fdat[, which(colnames(fdat) == dist)]
  } else if (!is.null(dist_bin_id)) {
    fdat$dist_bin_id <- fdat[,which(colnames(fdat) == dist_bin_id)]
  } else {
    stop('Must specify either dist or dist_bin_id')
  }

  ## Protocols
  fdat$protocol <- fdat[, which(colnames(fdat) == protocol)]
  cat('\n\nProtocol(s) found:', unique(as.character(fdat$protocol)))

  ## check for fixed radius protocol FR50
  if ('FR50' %in% fdat$protocol) {
    if (length(unique(fdat$protocol)) == 1) {
      stop("Can't run distance sampling on protocol FR50.", call.=F)
    } else {
      warning("Can't run distance sampling on protocol FR50. Data will be dropped.", call.=F)
      fdat <- fdat[-which(fdat$protocol == 'FR50'),]
    }
  }
  ## check for more than one protocol
  if (length(unique(fdat$protocol))>1) {
    warning('More than one protocol code remains in data. During analysis, choose bins carefully to make sure they align.', call.=FALSE)
    if (is.null(dist)) { #more than one protocol code and only distance bins provided
      stop('dist = NULL: Specify a numeric column containing distance estimates (e.g. midpoints of distance bins)', call.=FALSE)
    }
  } else if (length(unique(fdat$protocol))==1) {
    if (is.null(dist) & !is.null(dist_bin_id)) { #one protocol code and only distance bins provided
      if ('VCP25' %in% fdat$protocol) { #one protocol only: VCP25 --> can use bin translator
        key <- translate_distance_bins(protocol = unique(fdat$protocol))
        fdat <- merge(fdat, key, by = 'dist_bin_id', all.x = T)
      } else {
        stop('dist = NULL: Specify a numeric column containing distance estimates (e.g. midpoints of distance bins)')
      }
    }
  }

  ## Flyovers
  if (any(is.na(fdat$distance))) { #remove detections with no distance recorded
    fdat <- fdat[-which(is.na(fdat$distance)),]
  }
  if (!is.null(flyover)) {
    fdat$flyover <- fdat[, which(colnames(fdat) == flyover)]
    fdat$flyover[fdat$flyover=='' | fdat$flyover==' '] <- NA
    if (any(!is.na(fdat$flyover))) { #flyovers detected
      cat('\nLabels in', flyover, 'assumed to mark flyovers (and deleted):', unique(as.character(fdat$flyover[!is.na(fdat$flyover)])))
      fdat <- fdat[-which(!is.na(fdat$flyover)),]
    }
  }

  if (nrow(fdat) == 0) {
    stop('No detections remain in data set.')
  } else {
    vars = vars[-which(vars %in% c(strata.area, sample, species, count, dist, dist_bin_id, flyover, protocol))]
    fdat = fdat[, c('Region.Label','Area','Sample.Label','Effort','species','count','distance','protocol',vars)]
  }
  return(fdat)
}

translate_distance_bins = function(protocol='VCP25') {
  if(protocol=='VCP25') {
    key = data.frame(dist_bin_id = c('L10','L20','L30','L40','L50','G75','G00','B00','FLO'),
                     distance = c(seq(5,45,10),62.5,82.5,NA,NA))
  } else if (protocol=='VCP10') {
    key = data.frame(dist_bin_id = c('L10','L20','L30','L40','L50','G60','G70','G80','G90','G00','B00','FLO'),
                     distance = c(seq(5,95,10),NA,NA))
  } else {
    stop('Function under development. Distance bins currently only work with protocols VCP25 and VCP10. Instead, create numeric distance column representing midpoints of distance bins.')
  }
  return (key)
}
kdybala/pbdistance documentation built on May 20, 2019, 8:28 a.m.