R/estimateN.r
In Rdistance: Distance-Sampling Analyses for Density and Abundance Estimation

Documented in estimateN

#' @title Abundance point estimates
#' 
#' @description Estimate abundance given a distance function, 
#' a "merged" data frame containing detections and transect lengths, area, 
#' and the number of sides surveyed (if line-transects).   
#' This is called internally by \code{abundEstim}.  Most users will call 
#' \code{abundEstim} to estimate abundance. 
#' 
#' @param dfunc An estimate distance function (see \code{dfuncEstim}).
#' 
#' @param data A data frame containing distance observations, transects, 
#' and lengths.  This data frame must have a column named 'siteID' that identifies
#' unique sites (transects or points). If observations were made on  line-transects, this 
#' data frame must also have a column named 
#' by the \code{lengthColumn} parameter that contains transect lengths. NA
#' length transects are accepted and are dropped when computing total 
#' transect length. Only observations on non-NA-length transects are toward density.
#' 
#' @param surveyedSides The number of sides of the transect that were surveyed. Either 
#' 1 or 2.  Only applies to line transects. 
#' 
#' @inheritParams abundEstim
#' 
#' @inheritParams dfuncEstim  
#'  
#' @inherit abundEstim details
#'   
#' @return A list containing the following components:
#' 
#'    \item{density}{Estimated density in the surveyed area.}
#'    
#'    \item{abundance}{Estimated abundance on the study area.}
#'    
#'    \item{n.groups}{The number of detections (not individuals, unless all group sizes = 1) 
#'    used to estimate density and abundance.}
#'    
#'    \item{n.seen}{The number of individuals (sum of group sizes) used to 
#'    estimate density and abundance.}
#'    
#'    \item{area}{Total area of inference. Study area size}
#'    
#'    \item{surveyedUnits}{Number of surveyed sites.  This is total transect length
#'    for line-transects and number of points for point-transects. This total transect
#'    length does not include NA transects.}
#'    
#'    \item{surveyedSides}{Number of sides (1 or 2) of transects surveyed. Only relevant for line-transects.}
#'    
#'    \item{avg.group.size}{Average group size on non-NA transects}
#'    
#'    \item{w}{Strip width. }
#'    
#'    \item{pDetection}{Probability of detection.}
#'    
#' For line-transects that do not involve covariates, x$density  
#' is x$n.seen / (x$surveyedSides * x$w * x$pDetection * x$surveyedUnits)
#'    
#'         
#'    
#' @seealso \code{\link{dfuncEstim}}, \code{\link{abundEstim}}
#' 
#' @keywords model
#'
#' @export 

estimateN <- function(dfunc
                      , data
                      , area = NULL
                      , surveyedSides
                      , lengthColumn
                      , control = RdistanceControls()
                      ){

  if( !inherits(area, "units") & control$requireUnits ){
    if( !is.null(area) ){
      # If we are here, area did not come with units, we require units, and it's not null: ERROR
      stop(paste("Study area measurement units are required."
                 , "Assign units to area by attaching 'units' package then:\n"
                 , "units(area) <- '<units of measurment>'."
                 , "'m^2' (sq meters), 'ha' (hectares), 'km^2', and 'acre' are acceptable.\n"
                 , "See units::valid_udunits()"))
    }
    # if we are here, area is NULL: Report abundance in 1 square unit of measure
    area <- units::set_units(1, dfunc$outputUnits, mode = "standard")^2
  } else if( control$requireUnits ){
    # if we are here, area has units and it is not null (because you cannot 
    # assign units to NULL) but it could be NA)
    # Convert area units to square of outputUnits.
    # This converts units like "hectare" to "m^2". If cannot convert, and error is thrown here
    squareOutputUnits <- units::set_units(1, dfunc$outputUnits, mode = "standard")^2
    area <- units::set_units(area, squareOutputUnits, mode = "standard")
  }

  # ---- Check whether point survey "lengths" are present ----
  if( dfunc$pointSurvey ){
    if( !( lengthColumn %in% names(data)) ){
      # no point "length". Use siteID as length. Below, we count non-missing site ids
      lengthColumn <- "siteID"
    }  
  }  
  
  # ---- Find observations on NA length transects inside the strip ----
  mt <- terms(dfunc$model.frame)
  distColumn <- all.vars(mt)[attr(mt, "response")]
  inStrip <- (dfunc$w.lo <= data[,distColumn]) &
             (data[,distColumn] <= dfunc$w.hi)
  onPosTransect <- !is.na(data[, lengthColumn ]) # use distance in dfunc, but do not count groupsize
  distPresent <- !is.na(data[, distColumn]) # count groupsize, but no distances
  obsInd <- inStrip & onPosTransect & distPresent

  # ---- Measurement units check. ----
  if( !dfunc$pointSurvey ){
    if( !inherits(data[,lengthColumn], "units") & control$requireUnits ){
      stop(paste("Transect length units are required.\n" 
                 , "Assign units to transect lengths in site data frame with statements like:\n"
                 , "siteData$length <- units::set_units(siteData$length, '<units>')\n"
                 , "'m' (meters), 'ft' (feet), 'km' (kilometers), etc. are all acceptable.\n"
                 , "See units::valid_udunits()"))
    } else if( control$requireUnits ){
      # if we are here, length has units. convert them to units used during estimation.
      # Input dfunc must have a $outputUnits component.
      data[,lengthColumn] <-  units::set_units(data[,lengthColumn], dfunc$outputUnits, mode = "standard")
    }
  } 
  
  # ---- Estimate numerator of abundance ----
  # REMEMBER: component $detections of dfunc has been truncated to (w.lo, w.hi)
  #           component $model.frame has NOT been truncated to the strip

  # If dfunc has covariates, esw is a vector of length n. No covars, esw is scalar
  # If dfunc is pointSurveys, esw is EDR.  If line surveys, esw is ESW.
  esw <- effectiveDistance(dfunc
                         , newdata = data[ obsInd, , drop = FALSE])
    
  w <- dfunc$w.hi - dfunc$w.lo
  
  if (dfunc$pointSurvey) {
    phat <- (esw / w)^2  # for points
  } else {
    phat <- esw / w  # for lines
  }

  # phat should be unit-less; check just to be sure, if so drop "1" units
  # Odd: sometimes phat has units "1", sometimes "m/m". Either way, test and remove.
  
  if( isUnitless(phat)  ){
    phat <- units::drop_units(phat)
  }

  if( !is.null(attr(mt, "offset")) ){
    gsCol <- all.vars(mt)[attr(mt, "offset")]
    gs <- data[, gsCol]
    gs <- gs[ obsInd ]
  } else {
    gs <- rep(1, sum(obsInd))
  }
  
  nhat <- gs / phat # inflated counts one per detection
  
  # ---- Compute denominator of abundance, and abundance itself ----
  
  if (FALSE) {
    
    # MOVE ALL THIS TO A PREDICT METHOD
    
    # if(!is.null(dfunc$covars)){
    #   covarsInModel <- attr(terms(dfunc$model.frame), "term.labels")
    #   allSiteLevelCovs <- all( covarsInModel %in% names(siteData) )
    #   if( !allSiteLevelCovs ) {
    #     nonSiteLevCovs <- paste("'",covarsInModel[!(covarsInModel %in% names(siteData))], 
    #                             "'",collapse = ", ", sep="")
    #     mess <-"Cannot estimate site-level abundance. bySite is TRUE but detection-level "
    #     if(regexpr(",", nonSiteLevCovs) < 0) {
    #       mess <- paste0(mess, "covariate ",
    #                    nonSiteLevCovs,
    #                    " is in the model. Options: remove this covariate from dfunc; ",
    #                    "set bySite=FALSE; or, summarize it to the site-level,", 
    #                    " place it in siteData under same name, and re-run ", 
    #                    "abundEstim WITHOUT re-running dfuncEstim.")
    #     } else {
    #       mess <- paste0(mess, "covariates ",
    #                      nonSiteLevCovs,
    #                      " are in the model. Options: remove these covariates from dfunc; ",
    #                      "set bySite=FALSE; or, summarize them to the site-level,", 
    #                      " place them in siteData under same names, and re-run ", 
    #                      "abundEstim WITHOUT re-running dfuncEstim.")
    #       
    #     }
    #     stop(mess)
    #   }
    # }
    # 
    # # ---- sum statistics by siteID
    # 
    # nhat.df <- data.frame(siteID = detectionData$siteID, abundance=nhat)
    # nhat.df <- tapply(nhat.df$abundance, nhat.df$siteID, sum)
    # nhat.df <- data.frame(siteID = names(nhat.df), abundance=nhat.df)
    # 
    # observedCount <- tapply(detectionData$groupsize, detectionData$siteID, sum)
    # observedCount <- data.frame(siteID = names(observedCount), observedCount=observedCount)
    # 
    # nhat.df <- merge(observedCount, nhat.df)  # should match perfectly
    # 
    # # If detectionData$siteID is a factor and has all levels, even zero
    # # sites, don't need to do this.  But, to be safe merge back to 
    # # get zero transects and replace NA with 0 
    # 
    # # Must do this to get pDetection on 0-sites.  Site must have 
    # # non-missing covars if dfunc has covars
    # esw <- effectiveDistance(dfunc, siteData)
    # siteData <- data.frame(siteData, esw=esw)
    # 
    # if (dfunc$pointSurvey) {
    #   siteData$pDetection <- (siteData$esw / w)^2  # for points
    # } else {
    #   siteData$pDetection <- siteData$esw / w  # for lines
    # }
    # 
    # 
    # nhat.df <- merge(siteData, nhat.df, by="siteID", all.x=TRUE)
    # nhat.df$observedCount[is.na(nhat.df$observedCount)] <- 0
    # nhat.df$abundance[is.na(nhat.df$abundance)] <- 0
    # 
    # # Calculate density
    # if (dfunc$pointSurvey) {
    #   sampledArea <- pi * w^2  # area samled for single point  
    # } else {
    #   sampledArea <- 2 * w * nhat.df$length  # area sampled for single line
    # }
    # nhat.df$density <- (nhat.df$abundance * area) / sampledArea
    # 
    # 
    # # Calculate effective area ("effective" as in ESW)
    # # This is suggested as the offset term in a GLM model of density
    # if (dfunc$pointSurvey) {
    #   nhat.df$effArea <- (pi * nhat.df$esw^2) / area  # for points
    # } else {
    #   nhat.df$effArea <- (nhat.df$length * nhat.df$esw * 2) / area  # for lines
    # }
    # 
    # 
    # # Replace the column name for "esw", which is edr for points
    # if (dfunc$pointSurvey) {
    #   names(nhat.df)[names(nhat.df)=="esw"] <- "EDR"  # for points
    # } else {
    #   names(nhat.df)[names(nhat.df)=="esw"] <- "ESW"  # for lines
    # }
    
    
  } else {

    # ----  not bySite ----
    
    # ---- Compute number of points or transect length ----
    # Note: drop NA length transects or points here, which may have detections and distance measurements.
    # There are dups because data is the merged data, and has one row per detection and there 
    # are multiple detections per transect or point.
    if( dfunc$pointSurvey ){
      dups <- duplicated(data$siteID)
      totSurveyedUnits <- sum(!is.na(data[!dups,,drop=FALSE][,lengthColumn]))
    } else {
      dups <- duplicated(data$siteID)
      totSurveyedUnits <- sum(data[!dups,,drop=FALSE][,lengthColumn], na.rm = TRUE)
    }
    
    if(dfunc$pointSurvey){
      dens <- sum(nhat) / (pi * w^2 * totSurveyedUnits)
    } else {
      dens <- sum(nhat) / (surveyedSides * w * totSurveyedUnits)
    }
    
    nhat.df <- dens * area
    
    # nhat.df should be unitless
    if( isUnitless(nhat.df) ){
      nhat.df <- units::drop_units(nhat.df)
    } else {
      warning(paste("Strange measurement units have been detected because abundance is not unitless.\n", 
      "Check measurement unit compatability among distances, transect lengths, and area."))
    }
    
    nhat.df <- list(density = dens
                    , abundance = nhat.df
                    , n.groups = length(gs)
                    , n.seen = sum(gs)
                    , area = area
                    , surveyedUnits = totSurveyedUnits
                    , surveyedSides = surveyedSides
                    , avg.group.size = mean(gs)
                    , range.group.size = range(gs)
                    , w = w
                    , pDetection = phat
                    )
  }
    
  # some interesting tidbits:
  #  sampled area = tot.trans.len * 2 * (dfunc$w.hi - dfunc$w.lo)
  #  sum(1/phat) = what Distance calls "N in covered region".  This is
  #    estimated number of groups in the sampled area.
  #  sum(1/phat)*mean(dfunc$detections$groupsize) = an estimate of individuals
  #    in the sampled area. This is probably how Distance would do it.
  #  sum(dfunc$detections$groupsize/phat) = the HT estimate of individuals 
  #    in the sampled area.  How RDistance does it. This and the Distance 
  #    estimate are very close.  Only difference is ratio of sums or sum of 
  #    ratios.
  #  sum(detectionData$groupsize) / (sum(1/phat)*mean(detectionData$groupsize)) = 
  #    n.indivs / (nhat.groups*mean.grp.size) = n.groups / nhat.groups = 
  #    what Distance calls "Average p".  This is different than mean(phat) 
  #    the way Rdistance calculates it.
  
  

  return(nhat.df)
}  # end estimate.nhat function