R/weighting.R

Defines functions weight_aimlmf weight.adjust weight.gen

Documented in weight.adjust weight_aimlmf weight.gen

#' Calculate sample weights for points using design polygons
#' @description Calculate the weight for points in a sample design based on the sample frame or strata used to draw them and the point fate. The outputs are a data frame of the points and their weights, a data frame summary of the fates of the points (optionally by year), and a data frame summary of the strata (optionally by year) if strata were used. No spatial checks are done, so make sure that \code{pts} and \code{frame.spdf} are from the same design, limited to the extent of the original sample frame, and have complete overlap.
#' @param pts Data frame or spatial points data frame. This should be the points information. At minimum, it must contain a field matching the string \code{pts.fatefield} containing string values matching those in \code{target.values}, \code{unknown.values}, etc. If providing \code{pts.groupfield} it must also have a field matching that containing values matching values found in \code{frame.spdf$frame.groupfield}. If \code{date.field} or \code{year.source.field} is provided, then fields matching those must exist.
#' @param pts.fatefield Character string. This must exactly match the name of the field in \code{pts} that contains string values matching those in \code{target.values}, \code{unknown.values}, etc.
#' @param pts.groupfield Optional character string. This must exactly match the name of the field in \code{pts} that contains values matching those found in \code{frame.spdf$frame.groupfield}. This will most often be the field containing strata identities.
#' @param frame.spdf Spatial polygons data frame. This must be the design's sample frame or strata restricted to the sample frame extent. If providing \code{frame.groupfield} it must also have a field matching that containing values matching values found in \code{pts$pts.groupfield}.
#' @param frame.groupfield  Optional character string. This must exactly match the name of the field in \code{frame.spdf} that contains values matching those found in \code{pts$pts.groupfield}. This will most often be the field containing strata identities.
#' @param target.values Character string or character vector. This defines what values in the point fate field count as target points. When using AIM design databases, this should be at minimum \code{c("Target Sampled", "TS")}. This is case insensitive.
#' @param unknown.values Character string or character vector. This defines what values in the point fate field count as unknown points. When using AIM design databases, this should be at minimum \code{c("Unknown", "Unk")}. This is case insensitive.
#' @param nontarget.values Character string or character vector. This defines what values in the point fate field count as non-target points. When using AIM design databases, this should be at minimum \code{c("Non-Target", "NT", NA)}. This is case insensitive.
#' @param inaccessible.values Character string or character vector. This defines what values in the point fate field count as non-target points. When using AIM design databases, this should be at minimum \code{c("Inaccessible")}. This is case insensitive.
#' @param unneeded.values Character string or character vector. This defines what values in the point fate field count as not needed or unneeded points. When using AIM design databases, this should be at minimum \code{c("Not needed")}. This is case insensitive.
# @param ... Optional character strings. These must exactly match the names of the field in \code{pts} and will be used to group the points beyond the identity/identities they share with the frame. When calculating \code{frame.stats} these will be passed to \code{dplyr::group_by_()}. They will have no impact on \code{frame.summary} or \code{point.weights}. \code{"YEAR"} would be a common string to pass here.
#' @return A list containing the named data frames \code{frame.stats}, \code{frame.summary} (if groupfield strings were provided), and \code{point.weights}.
#' @export
weight.gen <- function(pts,
                       pts.fatefield = NULL, #pts.fatefield
                       pts.groupfield = NULL, #"WEIGHT.ID"
                       frame.spdf,
                       frame.groupfield = NULL, #designstratumfield
                       target.values = NULL,
                       unknown.values = NULL,
                       nontarget.values = NULL,
                       inaccessible.values = NULL,
                       unneeded.values = NULL){
  ## Sanitize
  if (class(pts) == "SpatialPointsDataFrame") {
    working.pts <- pts@data
  } else {
    working.pts <- pts
  }
  if (class(working.pts) != "data.frame") {
    stop("pts must be either a data frame or a spatial points data frame.")
  }
  working.pts[[pts.fatefield]] <- toupper(working.pts[[pts.fatefield]])

  if (!all(working.pts[[pts.fatefield]] %in% c(target.values, unknown.values, nontarget.values, inaccessible.values, unneeded.values))) {
    message("The following fate[s] need to be added to the appropriate fate argument[s] in your function call:")
    ## Take the vector of all the unique values in pts.spdf$final_desig (or another fate field) that aren't found in the fate vectors and collapse it into a single string, separated by ", "
    stop(paste(unique(working.pts[[pts.fatefield]][!(working.pts[[pts.fatefield]] %in% c(target.values, unknown.values, nontarget.values, inaccessible.values, unneeded.values))]), collapse = ", "))
  }

  # additional.point.groups <- list(...)
  # if (!any(additional.point.groups %in% names(working.pts))) {
  #   message("The following additional grouping fields were not found in pts:")
  #   stop(paste(additional.point.groups[!(additional.point.groups %in% names(working.pts))], collapse = ", "))
  # }
  # additional.point.groups <- rlang::quos(...)

  ## Add areas in hectares to the frame if they're not there already
  if (!("AREA.HA" %in% names(frame.spdf@data))) {
    frame.spdf <- add.area(frame.spdf)
  }

  ## Creating a table of the point counts by point type
  ## Start with adding the point types
  working.pts$key[working.pts[[pts.fatefield]] %in% target.values] <- "Observed.pts"
  working.pts$key[working.pts[[pts.fatefield]] %in% nontarget.values] <- "Unsampled.pts.nontarget"
  working.pts$key[working.pts[[pts.fatefield]] %in% inaccessible.values] <- "Unsampled.pts.inaccessible"
  working.pts$key[working.pts[[pts.fatefield]] %in% unneeded.values] <- "Unsampled.pts.unneeded"
  working.pts$key[working.pts[[pts.fatefield]] %in% unknown.values] <- "Unsampled.pts.unknown"


  ## Here's the summary by key and pts.groupfield and year.field as appropriate
  pts.summary.fields <- c(pts.groupfield[!is.null(pts.groupfield)])

  pts.summary <- eval(parse(text = paste0("dplyr::summarize(.data = group_by(.data = ungroup(working.pts), key, ", paste(pts.summary.fields, collapse = ", "),"), count = n())")))

  ## Spreading that
  pts.summary.wide <- tidyr::spread(data = pts.summary,
                                    key = key,
                                    value = count,
                                    fill = 0)

  ## What kinds of points might be tackled
  point.types <- c("Observed.pts", "Unsampled.pts.nontarget", "Unsampled.pts.inaccessible", "Unsampled.pts.unneeded", "Unsampled.pts.unknown")

  ## We need to know which of the types of points (target, non-target, etc.) are actually represented
  extant.counts <- point.types[point.types %in% names(pts.summary.wide)]

  ## Only asking for summarize() to operate on those columns that exist because if, for example, there's no Unsampled.pts.unneeded column and we call it here, the function will crash and burn
  # This should probably be converted to use !!!rlang::quos() but it works right now so don't sweat it
  frame.stats <- eval(parse(text = paste0("pts.summary.wide %>% group_by(", paste(pts.summary.fields, collapse = ","),") %>%",
                                          "dplyr::summarize(sum(", paste0(extant.counts, collapse = "), sum("), "))")))

  ## Fix the naming becaue it's easier to do it after the fact than write paste() so that it builds names in in the line above
  names(frame.stats) <- stringr::str_replace_all(string = names(frame.stats), pattern = "^sum\\(", replacement = "")
  names(frame.stats) <- stringr::str_replace_all(string = names(frame.stats), pattern = "\\)$", replacement = "")

  ## Add in the missing columns if some point categories weren't represented
  for (name in point.types[!(point.types %in% names(frame.stats))]) {
    frame.stats[, name] <- 0
  }

  frame.stats <- frame.stats[, names(frame.stats) != "NA"]

  ## TODO: Needs to handle a polygon OR a raster for the frame
  ## HERE WE FORK FOR IF THERE ARE STRATA OR NOT
  if (!is.null(frame.groupfield)) {
    ## Because we have strata, use the design stratum attribute
    ## Create a data frame to store the area values in hectares for strata. The as.data.frame() is because it was a tibble for some reason
    area.df <- group_by_(frame.spdf@data, frame.groupfield) %>% dplyr::summarize(AREA.HA.SUM = sum(AREA.HA)) %>% as.data.frame()


    ## Get the sums of point types
    frame.summary <- frame.stats %>% group_by_(pts.groupfield) %>%
      dplyr::summarize(Observed.pts = sum(Observed.pts),
                       Unsampled.pts.nontarget = sum(Unsampled.pts.nontarget),
                       Unsampled.pts.inaccessible = sum(Unsampled.pts.inaccessible),
                       Unsampled.pts.unneeded = sum(Unsampled.pts.unneeded),
                       Unsampled.pts.unknown = sum(Unsampled.pts.unknown))

    ## Add in the areas of the strata
    frame.summary <- merge(x = frame.summary,
                           y = area.df,
                           by.x = pts.groupfield,
                           by.y = frame.groupfield)

    ## Renaming is causing dplyr and tidyr to freak out, so we'll just copy the values into the fieldnames we want
    frame.summary$Stratum <- frame.summary[[pts.groupfield]]
    frame.summary$Area.HA <- frame.summary$AREA.HA.SUM

    ## Calculate the rest of the values
    frame.summary <- frame.summary %>% group_by(Stratum) %>%
      ## The total points, minus the unneeded so we don't penalize projects for them!
      mutate(Total.pts = sum(Observed.pts, Unsampled.pts.nontarget, Unsampled.pts.inaccessible, Unsampled.pts.unknown)) %>%
      ## The proportion of the total points in the stratum that were "target"
      mutate(Prop.dsgn.pts.obsrvd = Observed.pts/Total.pts) %>%
      ## The effective "sampled area" based on the proportion of points that were surveyed
      mutate(Sampled.area.HA = unlist(Area.HA * Prop.dsgn.pts.obsrvd)) %>%
      ## The weight for each point in the stratum is the effective sampled area divided by the number of points surveyed, unknown, and inaccessible in the stratum
      mutate(Weight = Sampled.area.HA/sum(Observed.pts, Unsampled.pts.inaccessible, Unsampled.pts.unknown)) %>% as.data.frame()

    ## When there are NaNs in the calculated fields, replace them with 0
    frame.summary <- tidyr::replace_na(frame.summary, replace = list(Prop.dsgn.pts.obsrvd = 0, Sampled.area.HA = 0, Weight = 0))

    ## Add the weights to the points, but only the observed ones
    for (stratum in frame.summary$Stratum) {
      working.pts$WGT[(working.pts[[pts.fatefield]] %in% target.values) & working.pts[[pts.groupfield]] == stratum] <- frame.summary$Weight[frame.summary$Stratum == stratum]
    }

    ## All the unassigned weights get converted to 0
    working.pts <- tidyr::replace_na(working.pts, replace = list(WGT = 0))

    point.weights <- working.pts

  } else if (is.null(frame.groupfield)) {

    ## Treat it as a single unit for lack of stratification
    area <- sum(frame.spdf@data$AREA.HA)

    ## derive weights
    proportion.observed <- 1 ## initialize - proportion of 1.0 means there were no nonresponses
    wgt <- 0 ## initialize wgt
    sample.area <- 0 ## initialize actual sampled area
    if (sum(frame.stats[, point.types]) > 0) {
      proportion.observed <- sum(frame.stats$Observed.pts)/sum(frame.stats[, c("Observed.pts", "Unsampled.pts.nontarget", "Unsampled.pts.inaccessible", "Unsampled.pts.unknown")]) ## realized proportion of the stratum that was sampled (observed/total no. of points that weren't "unneeded")
    }
    if (sum(frame.stats$Observed.pts) > 0) {
      ## Record the actual area(ha) sampled - (proportional reduction * stratum area)
      sample.area <- proportion.observed*area
      ## (The proportion of the total area that was sampled * total area [ha]) divided by the number of observed, inaccessible, and unknown points
      wgt <- (sample.area)/sum(frame.stats$Observed.pts, frame.stats$Inaccessible.pts, frame.stats$Unknown.pts)

    }

    ##Tabulate key information for this DD
    frame.summary <- data.frame(Stratum = "Frame",
                                Total.pts = sum(frame.stats[, point.types]),
                                Observed.pts = sum(frame.stats$Observed.pts),
                                Unsampled.pts.nontarget = sum(frame.stats$Unsampled.pts.nontarget),
                                Unsampled.pts.inaccessible = sum(frame.stats$Unsampled.pts.inaccessible),
                                Unsampled.pts.unneeded = sum(frame.stats$Unsampled.pts.unneeded),
                                Unsampled.pts.unknown = sum(frame.stats$Unsampled.pts.unknown),
                                Area.HA = area,
                                Prop.dsgn.pts.obsrvd = proportion.observed,
                                Sampled.area.HA = sample.area,
                                Weight = wgt,
                                stringsAsFactors = FALSE)

    point.weights <- working.pts

    ## If there are points to work with, do this
    if (nrow(point.weights) > 0) {
      ## If a point had a target fate, assign the calculates weight
      point.weights$WGT[point.weights[[pts.fatefield]] %in% target.values] <- wgt
      ## If a point had a non-target or unknown designation, assign 0 as the weight
      point.weights$WGT[point.weights[[pts.fatefield]] %in% c(nontarget.values, unknown.values, inaccessible.values, unneeded.values)] <- 0
    }
  }

  ## Make sure that this is in the order we want
  frame.summary <- frame.summary[, c("Stratum",
                                     "Total.pts",
                                     "Observed.pts",
                                     "Unsampled.pts.nontarget",
                                     "Unsampled.pts.inaccessible",
                                     "Unsampled.pts.unneeded",
                                     "Unsampled.pts.unknown",
                                     "Area.HA",
                                     "Prop.dsgn.pts.obsrvd",
                                     "Sampled.area.HA",
                                     "Weight")]

  names(point.weights)[names(point.weights) == "key"] <- "POINT.FATE"

  output <- list("frame.stats" = frame.stats, "frame.summary" = frame.summary, "point.weights" = point.weights)
  return(output[!is.null(output)])
}


#' Adjusting Weights Calculated from AIM Sample Designs
#'
#' This function takes the point weights data frame output from the function \code{weight()} and a SpatialPolygonsDataFrame defining the weight categories. Returns the data frame supplied as points with the new column \code{ADJWGT} containing the adjusted weights.
#' @param points Data frame output from \code{weight()}, equivalent to \code{weight()[["point.weights"]]} or \code{weight()[[2]]}.
#' @param wgtcat.spdf SpatialPolygonsDataFrame describing the weight categories for adjusting the weights. Use the output from \code{intersect()}.
#' @param spdf.area.field Character string defining the field name in \code{wgtcat@data} that contains the areas for the weight categories. Defaults to \code{"AREA.HA.UNIT.SUM"}.
#' @param spdf.wgtcat.field Character string defining the field name in \code{wgtcat@data} that contains the unique identification for the weight categories. Defaults to \code{"UNIQUE.IDENTIFIER"}.
#' @param projection \code{sp::CRS()} argument. Defaults to NAD83 with \code{sp::CRS("+proj=longlat +ellps=GRS80 +datum=NAD83 +no_defs")}. Is used to reproject all SPDFs in order to perform spatial manipulations.
#' @keywords weights
#' @examples
#' weight.adjuster()
#' @export

weight.adjust <- function(points,
                          wgtcat.spdf,
                          spdf.area.field = "AREA.HA.UNIT.SUM",
                          spdf.wgtcat.field = "UNIQUE.IDENTIFIER",
                          projection = sp::CRS("+proj=longlat +ellps=GRS80 +datum=NAD83 +no_defs")
){
  ## Sanitization
  names(points) <- toupper(names(points))
  names(wgtcat.spdf@data) <- toupper(names(wgtcat.spdf@data))
  spdf.area.field <- toupper(spdf.area.field)
  spdf.wgtcat.field <- toupper(spdf.wgtcat.field)

  ## Convert points to an SPDF
  points.spdf <- SpatialPointsDataFrame(coords = points[, c("LONGITUDE", "LATITUDE")],
                                        data = points,
                                        proj4string = projection)

  ## Attribute the points.spdf with the wgtcat identities from wgtcat.spdf
  points.spdf <- attribute.shapefile(spdf1 = points.spdf,
                                     spdf2 = wgtcat.spdf,
                                     attributefield = spdf.wgtcat.field,
                                     newfield = spdf.wgtcat.field)

  if (is.null(points.spdf)) {
    message("There was no overlap between the points and the wgtcat polygons. Returning NULL.")
    return(NULL)
  } else {
    ## Add the areas in using the unique identifier
    data.current <- merge(x = points.spdf@data,
                          y = distinct(wgtcat.spdf@data[, c(spdf.wgtcat.field, spdf.area.field)]))

    ## The weighted points attributed by the combination of reporting units and strata
    ## We first restrict to the points that inherited identities (this should've already happened in the previous step, but just to be safe)
    # data.current <- data.attributed[!is.na(data.attributed[, points.wgtcat.field]),]

    ## We want to include all the points. So we make a logical vector of just T with a length equal to the number of plots
    sites.current <- (rep(TRUE, nrow(data.current)))

    ## Grab the current weights from those points as its own vector
    wgt.current <- data.current$WGT

    ## NB: The identity inherited from the shapefile needs to match the field used for name in framesize
    wtcat.current <- data.current[, spdf.wgtcat.field]

    ## The framesize information about each of the unique wgtcat identities
    ## I currently have this as an area, but I think it needs to be the inverse of the proportion of the area of the reporting unit that each identity represents
    ## so the framesize value for a particular wgtcat = [area of the whole spdf]/[area of particular wgtcat]
    framesize.current <- wgtcat.spdf@data[, spdf.area.field]
    names(framesize.current) <- wgtcat.spdf@data[, spdf.wgtcat.field]

    ## Run the weight adjustment
    data.current$ADJWGT <- spsurvey::adjwgt(sites.current, wgt.current, wtcat.current, framesize.current)

    return(data.current)
  }
}

#' Calculate weights for analyses combining AIM and LMF data
#' @description When combining AIM and LMF designs for a single weighted analysis, the process of calculating weights is complicated by the two-stage nature of the LMF design. This calculates a relative weight for each point first based on whether or not it falls inside an LMF segment that was selected for sampling and how many points total fall in any given LMF segment. Those relative weights are then used to calculate combined design weights.
#' @param aim_points Data frame or spatial points data frame. The AIM point information, including unique identities in the variable \code{aim_idvar}. If this is just a data frame OR if \code{wgtcats} is not a spatial polygons data frame then it must already have the weight category and LMF segment memberships assigned in the variables \code{wgtcat_var} and \code{segment_var} (there should be an \code{NA} for any observation that does not fall in an LMF segment). If this is spatial and so are either \code{wgtcats} or \code{segments}, then the points will be attributed with the memberships useing \code{sp::over()}. This must also include \code{aim_fate_var} if any of the points were not observed/sampled, e.g. rejected, inaccessible, or had an unknown fate, otherwise they will all be assumed to've been observed/sampled.
#' @param lmf_points Data frame or spatial points data frame. The LMF point information, including unique identities in the variable \code{lmf_idvar}. If this is just a data frame OR if \code{wgtcats} is not a spatial polygons data frame then it must already have the weight category and LMF segment memberships assigned in the variables \code{wgtcat_var} and \code{segment_var} (there should be an \code{NA} for any observation that does not fall in an LMF segment). If this is spatial and so are either \code{wgtcats} or \code{segments}, then the points will be attributed with the memberships useing \code{sp::over()}. All LMF points are assumed to be observed/sampled.
#' @param aim_idvar Character string. The name of the variable in \code{aim_points} that contains the unique identities for the points.
#' @param lmf_idvar Character string. The name of the variable in \code{lmf_points} that contains the unique identities for the points.
#' @param aim_fatevar Optional character string. The name of the variable in \code{aim_points} that contains the fates of the points. If \code{NULL} then the fate of all points in \code{aim_points} will be assumed to be sampled/observed. Defaults to \code{NULL}
#' @param observed_fates Optional vector of character strings. The strings present in \code{aim_points$aim_fatevar} that correspond to a sampled/observed fate. Defaults to \code{NULL}
#' @param invalid_fates Optional vector of character strings. The strings present in \code{aim_points$aim_fatevar} that correspond to fates that SHOULD NOT BE INCLUDED IN WEIGHTING, e.g. unneeded or not yet evaluated as in future points or unused oversample. Defaults to \code{NULL}
#' @param wgtcats Data frame or spatial polygons data frame. The information about the weight categories. This must contain the weight category identities in a variable named \code{wgtcat_var}. If this is a data frame, it must also contain the AREAS IN HECTARES in a variable named \code{wgtcat_area_var}. Defaults to \code{NULL}
#' @param wgtcat_var Character string. The name of the variable in \code{wgtcats} (and, if \code{wgtcats} is not spatial, \code{aim_points} and \code{lmf_points}) that contains the weight category identities/memberships.
#' @param wgtcat_area_var Optional character string. The name of the variable in \code{wgtcats} that contains the AREAS IN HECTARES. Only optional if \code{wgtcats} is spatial. Defaults to \code{NULL}
#' @param segments Optional spatial polygons data frame. The information about the LMF segments, this is only optional if \code{segment_var} is not already assigned to \code{aim_points} and \code{lmf_points}. This must contain the weight category identities in a variable named \code{segment_var}. Defaults to \code{NULL}
#' @param segment_var Character string. The name of the variable in \code{segments} (or, if \code{segments} is not provided, \code{aim_points} and \code{lmf_points}) that contains the LMF segment identities.
#' @param projection Optional CRS object. Used to reproject all spatial data. If \code{NULL} then the projection is taken from \code{wgtcats} unless it's not spatial in which case it's taken from \code{segments} unless it's not provided in which case no reprojection occurs. Defaults to \code{NULL}
#' @param verbose Logical. If \code{TRUE} then the function will produce informative messages as it executes its steps. Useful for debugging. Defaults to \code{FALSE}.
#' @return A list of two data frames: point_weights which contains information for each point that did not have a fate in \code{invalid_fates} and wgtcat_summary which contains information about each weight category.
#' @export
weight_aimlmf <- function(aim_points,
                            lmf_points,
                            aim_idvar,
                            lmf_idvar,
                            aim_fatevar = NULL,
                            observed_fates = c("TS"),
                            invalid_fates = NULL,
                            wgtcats = NULL,
                            wgtcat_var,
                            wgtcat_area_var = NULL,
                            segments = NULL,
                            segment_var,
                            projection = NULL,
                            verbose = FALSE){
  if (length(wgtcat_var) != 1 | class(wgtcat_var) != "character") {
    stop("wgtcat_var must be a single character string")
  }
  if (!is.null(wgtcat_area_var)) {
    if (length(wgtcat_area_var) != 1 | class(wgtcat_area_var) != "character") {
      stop("wgtcat_area_var must be a single character string")
    }
  }
  if (length(segment_var) != 1 | class(segment_var) != "character") {
    stop("segment_var must be a single character string")
  }

  if (!is.null(wgtcats)) {
    if (!(class(wgtcats) %in% c("SpatialPolygonsDataFrame", "data.frame"))) {
      stop("wgtcats must be a spatial polygons data frame or data frame")
    }
    if (nrow(wgtcats) < 1) {
      stop("wgtcats contains no observations/data")
    }
    if (!(wgtcat_var %in% names(wgtcats))) {
      stop(paste("The variable", wgtcat_var, "does not appear in wgtcats@data"))
    }
  }

  if (!is.null(segments)) {
    if (!(class(segments) %in% "SpatialPolygonsDataFrame")) {
      stop("segments must be a spatial polygons data frame")
    }
    if (nrow(segments) < 1) {
      stop("segments contains no observations/data")
    }
    if (!(segment_var %in% names(segments))) {
      stop(paste("The variable", segment_var, "does not appear in segments@data"))
    }
  }

  if (is.null(aim_fatevar)) {
    warning("No fate variable specified for AIM points. Assuming all were observed/sampled.")
    aim_fatevar <- "fate"
    aim_points[["fate"]] <- "observed"
    lmf_points[["fate"]] <- "observed"
    observed_fates <- "observed"
  } else {
    if (length(aim_fatevar) > 1 | class(aim_fatevar) != "character") {
      stop("The aim fate variable must be a single character string")
    }
    if (!aim_fatevar %in% names(aim_points)) {
      stop(paste("The variable", aim_fatevar, "does not appear in aim_points@data"))
    } else {
      aim_points[["fate"]] <- aim_points[[aim_fatevar]]
    }
    if (is.null(observed_fates)) {
      warning("No observed fates provided. Assuming all AIM points were observed/sampled unless specified otherwise with invalid_fates")
      observed_fates <- unique(aim_points[["fate"]])
      observed_fates <- observed_fates[!(observed_fates %in% invalid_fates)]
    }
  }

  lmf_points[["fate"]] <- observed_fates[1]

  if (!is.null(observed_fates)) {
    if (!any(aim_points[["fate"]] %in% observed_fates)) {
      warning("No AIM points have a fate specified as observed.")
    }
  }

  # Harmonize projections
  if (is.null(projection)) {
    if (!is.null(wgtcats)) {
      projection <- wgtcats@proj4string
    } else if (!is.null(segments)) {
      projection <- segments@proj4string
    }
  }

  if (!is.null(projection)) {
    if (class(aim_points) %in% c("SpatialPointsDataFrame")) {
      if (!identical(projection, aim_points@proj4string)) {
        aim_points <- sp::spTransform(aim_points,
                                      CRSobj = projection)
      }
    }
    if (class(lmf_points) %in% c("SpatialPointsDataFrame")) {
      if (!identical(projection, lmf_points@proj4string)) {
        lmf_points <- sp::spTransform(lmf_points,
                                      CRSobj = projection)
      }
    }
    if (!is.null(wgtcats)) {
      if (class(lmf_points) %in% c("SpatialPolygonsDataFrame")) {
        if (!identical(projection, wgtcats)) {
          wgtcats <- sp::spTransform(wgtcats,
                                     CRSobj = projection)
        }
      }
    }
    if (!is.null(segments)) {
      if (!identical(projection, segments@proj4string)) {
        segments <- sp::spTransform(segments,
                                    CRSobj = projection)
      }
    }
  }





  # Assign the weight categories
  if (class(wgtcats) %in% "SpatialPolygonsDataFrame") {
    if (!(wgtcat_var %in% names(wgtcats))) {
      stop("The variable ", wgtcat_var, " does not appear in wgtcats")
    }
    aim_points[["wgtcat"]] <- sp::over(aim_points, wgtcats)[[wgtcat_var]]
    lmf_points[["wgtcat"]] <- sp::over(lmf_points, wgtcats)[[wgtcat_var]]
  } else {
    if (!(wgtcat_var %in% names(aim_points))) {
      stop("The variable ", wgtcat_var, " does not appear in aim_points")
    }
    if (!(wgtcat_var %in% names(lmf_points))) {
      stop("The variable ", wgtcat_var, " does not appear in lmf_points")
    }
    aim_points[["wgtcat"]] <- aim_points[[wgtcat_var]]
    lmf_points[["wgtcat"]] <- lmf_points[[wgtcat_var]]
  }


  # Assign the LMF segment codes
  if (is.null(segments)) {
    if (!(segment_var %in% names(aim_points))) {
      stop("The variable ", segment_var, " does not appear in aim_points")
    }
    if (!(segment_var %in% names(lmf_points))) {
      stop("The variable ", segment_var, " does not appear in lmf_points")
    }
    aim_points[["segment"]] <- aim_points[[segment_var]]
    lmf_points[["segment"]] <- lmf_points[[segment_var]]
  } else {
    if (!(segment_var %in% names(segments))) {
      stop("The variable ", segment_var, " does not appear in segments")
    }
    aim_points[["segment"]] <- sp::over(aim_points, segments)[[segment_var]]
    lmf_points[["segment"]] <- sp::over(lmf_points, segments)[[segment_var]]
  }


  # Just harmonize the idvar names for now
  aim_points[["unique_id"]] <- aim_points[[aim_idvar]]
  lmf_points[["unique_id"]] <- lmf_points[[lmf_idvar]]

  # If somehow LMF points aren't in a segment, that's a major problem
  if (any(is.na(lmf_points[["segment"]]))) {
    stop(paste("The following LMF points did not spatially intersect any segment polygons:",
               paste(lmf_points[is.na(lmf_points[["segment"]]), "unique_id"], collapse = ", ")))
  }

  # TODO: Stick a check in here that the segment ID from the polygons
  # matches the one derived from the LMF plot ID
  # Probably just warn if not?

  # Get data frames
  if (class(aim_points) %in% "SpatialPointsDataFrame") {
    aim_df <- aim_points@data
  } else {
    aim_df <- aim_points
  }
  if (class(lmf_points) %in% "SpatialPointsDataFrame") {
    lmf_df <- lmf_points@data
  } else {
    lmf_df <- lmf_points
  }

  # NOTE THAT THIS FILTERS OUT ANYTHING FLAGGED AS NOT NEEDED IN THE FATE
  # So that'd be unused oversamples or points from the FUTURE that no one would've sampled anyway
  aim_df <- aim_df[!(aim_df[["fate"]] %in% invalid_fates), c("unique_id", "fate", "wgtcat", "segment")]
  aim_df[["aim"]] <- TRUE
  aim_df[["lmf"]] <- FALSE

  # We only have target sampled LMF points available to us, so we don't need to filter them
  lmf_df <- lmf_df[, c("unique_id", "fate", "wgtcat", "segment")]
  lmf_df[["aim"]] <- FALSE
  lmf_df[["lmf"]] <- TRUE

  # Combine them
  combined_df <- unique(rbind(aim_df, lmf_df))

  # There shouldn't be any that don't belong to a wgtcat anymore
  combined_df <- combined_df[!is.na(combined_df[["wgtcat"]]), ]

  # Add an observed variable for easy reference later
  combined_df[["observed"]] <- combined_df[["fate"]] %in% observed_fates

  # To make the lookup table, drop any points that fell outside LMF segments
  combined_segmentsonly_df <- combined_df[!is.na(combined_df[["segment"]]), ]

  # Create a segment relative weight lookup table
  segment_relwgt_lut <- do.call(rbind,
                                lapply(X = split(combined_segmentsonly_df, combined_segmentsonly_df[["segment"]]),
                                       FUN = function(X){
                                         # These are the count of AIM points with any valid fate
                                         aim_count <- sum(X[["aim"]])

                                         # We also need the count of LMF points with any valid fate, but that's complicated
                                         # We only have the sampled LMF points so we can count those
                                         lmf_sampled_count <- sum(X[["lmf"]])
                                         # To get the number of evaluated but not sampled points:
                                         # The LMF plot keys end in a digit that represents the intended sampling order within a segment
                                         # 1 and 2 are considered base points and were intended to be sampled
                                         # If a sampled LMF plot's plot key ends in 3, that means that one or both of the base points
                                         # were evaluated and rejected rather than sampled, which brings the evaluated LMF plot count
                                         # to three for the segment.
                                         # This just asks if the third point was used
                                         lmf_oversample_used <- any(grepl(X[["unique_id"]][X[["lmf"]]],
                                                                          pattern = "\\D3$"))

                                         # Likewise, if only one LMF plot was sampled in a segment, that means the other two were
                                         # evalurated and rejected rather than sampled, also bringing the total to three.
                                         # So if there was only one sampled or if the oversample was used, there were three evaluated
                                         if (sum(X[["lmf"]]) == 1 | lmf_oversample_used) {
                                           lmf_count <- 3
                                         } else {
                                           # This will fire only if there sampled count was 2, but better to be safe here
                                           lmf_count <- lmf_sampled_count
                                         }


                                         # The relative weight for points falling within a segment is calculated as
                                         # 1 / (number of points)
                                         relative_weight <- 1 / sum(aim_count, lmf_count)

                                         output <- data.frame("segment" = X[["segment"]][1],
                                                              "relwgt" = relative_weight,
                                                              stringsAsFactors = FALSE)
                                         return(output)
                                       }))

  # Add the relative weights to the combined AIM and LMF points
  combined_df <- merge(x = combined_df,
                       y = segment_relwgt_lut,
                       by = "segment",
                       all.x = TRUE)

  # Anywhere there's an NA associated with an AIM point, that's just one that fell outside the segments
  combined_df[is.na(combined_df[["relwgt"]]) & combined_df[["aim"]], "relwgt"] <- 1



  ####### NOTE!!!!!!!!! ##############################################
  # This is hacked up using wgtcat_df from above because the wgtcats geometry is screwed up
  # wgtcats <- aim.analysis::add.area(wgtcats)
  if (is.null(wgtcat_area_var)) {
    if (class(wgtcats) %in% "SpatialPolygonsDataFrame") {
      wgtcat_df <- aim.analysis::add.area(wgtcats)@data
    } else {
      stop("No name for a variable in wgtcats containing the area in hectares was provided and wgtcats is not a spatial polygons data frame so area can't be calculated")
    }
  } else {
    if (!(wgtcat_area_var %in% names(wgtcats))) {
      if (class(wgtcats) %in% "SpatialPolygonsDataFrame") {
        wgtcat_df <- aim.analysis::add.area(wgtcats)@data
      } else {
        stop("The variable ", wgtcat_area_var, " does not appear in wgtcats")
      }
    } else {
      warning("Trusting that the variable ", wgtcat_area_var, " in wgtcats contains the areas in hectares")
      if (class(wgtcats) %in% "SpatialPolygonsDataFrame") {
        wgtcat_df <- wgtcats@data
        wgtcat_df[["AREA.HA"]] <- wgtcat_df[[wgtcat_area_var]]
      } else {
        wgtcat_df <- wgtcats
        wgtcat_df[["AREA.HA"]] <- wgtcat_df[[wgtcat_area_var]]
      }
    }
  }


  wgtcat_df[["wgtcat"]] <- wgtcat_df[[wgtcat_var]]

  # Making sure that these aren't spread out over multiple observations
  if (verbose) {
    message("Summarizing wgtcat_df by wgtcat to calculate areas in case the wgtcats are split into multiple observations")
  }
  wgtcat_df <- dplyr::summarize(dplyr::group_by(wgtcat_df,
                                                wgtcat),
                                "hectares" = sum(AREA.HA))

  wgtcat_areas <- setNames(wgtcat_df[["hectares"]], wgtcat_df[["wgtcat"]])

  weight_info <- lapply(X = wgtcat_df[["wgtcat"]],
                        wgtcat_areas = wgtcat_areas,
                        points = combined_df,
                        wgtcat_var = wgtcat_var,
                        FUN = function(X, wgtcat_areas, points, wgtcat_var){
                          # Area of this polygon
                          area <- wgtcat_areas[X]

                          # All of the points (AIM and LMF) falling in this polygon
                          points <- points[points[["wgtcat"]] == X, ]

                          # If there are in fact points, do some weight calculations!
                          if (nrow(points) > 0 & any(points[["observed"]])) {
                            # The number of observed AIM points with a relative weight of 1
                            # So, not sharing an LMF segment with any LMF points
                            aim_standalone <- sum(combined_df[["aim"]] & combined_df[["relwgt"]] == 1)
                            # The number of unique segments selected for LMF points
                            lmf_segment_count <- length(unique(points[["segment"]]))

                            # The approximate number of 160 acre segments in this polygon
                            # Obviously, not all of the segments were selected in the first stage
                            # of the LMF design, but this is how many were available in this polygon
                            approximate_segment_count <- area / (160 / 2.47)

                            # This is the sum of the relative weights of the OBSERVED points
                            # This does not include the inaccessible, rejected, or unknown points
                            # It does include both AIM and LMF, however
                            sum_observed_relwgts <- sum(points[points[["observed"]], "relwgt"])
                            # This is the sum of the relative weights of all the AIM points, regardless of fate
                            sum_relwgts <- sum(points[["relwgt"]])

                            # The units are segments per point
                            # The segments are 120 acre quads (quarter sections?) that the first stage of LMF picked from
                            # Steve Garman called this "ConWgt" which I've expanded to conditional_weight
                            # but that's just a guess at what "con" was short for
                            conditional_weight <- approximate_segment_count / (aim_standalone + lmf_segment_count)
                            conditional_area <- conditional_weight * sum_observed_relwgts

                            # What's the observed proportion of the area?
                            observed_proportion <- sum_observed_relwgts / sum_relwgts
                            # And how many acres is that then?
                            # We can derive the "unknown" or "unsampled" area as the difference
                            # between the polygon area and the observed area
                            observed_area <- area * observed_proportion

                            # Then this adjustment value is calculated
                            weight_adjustment <- observed_area / conditional_area



                            # Put everything about the wgtcat in general in one output data frame
                            output_wgtcat <- data.frame(wgtcat = X,
                                                        area = area,
                                                        area_units = "hectares",
                                                        approximate_segment_count = approximate_segment_count,
                                                        sum_observed_relwgts = sum_observed_relwgts,
                                                        sum_relwgts = sum_relwgts,
                                                        observed_proportion = observed_proportion,
                                                        observed_area = observed_area,
                                                        unobserved_area = area - observed_area,
                                                        conditional_weight = conditional_weight,
                                                        conditional_area = conditional_area,
                                                        weight_adjustment = weight_adjustment,
                                                        point_count = sum(points[["observed"]]),
                                                        observed_point_count = nrow(points),
                                                        stringsAsFactors = FALSE)

                            # But much more importantly add the calculated weights to the points
                            output_points <- points

                            # This handles situations where there were points, but none of them were observed
                            # In that case, weight_adjustment will be 0 / 0 = NaN
                            # That's fine because we can identify that this polygon is still in the inference area
                            # but that its entire area is "unknown" because no data were in it
                            if (is.nan(weight_adjustment)) {
                              output_points[["wgt"]] <- NA
                            } else {
                              # The point weight is calculated here.
                              # This is Garman's formula and I don't have the documentation justifying it on hand
                              output_points[["wgt"]] <- weight_adjustment * conditional_weight * points[["relwgt"]]
                              message("Checking weight sum for ", X)
                              # I'm rounding here because at unrealistically high levels of precision it gets weird and can give false positives
                              if (round(sum(output_points[["wgt"]]), digits = 3) != round(area, digits = 3)) {
                                warning("The sum of the point weights (", sum(output_points[["wgt"]]), ") does not equal the polygon area (", area, ") for ", X)
                              }
                            }

                          } else {
                            # Basically just empty data frames
                            output_wgtcat <- data.frame(wgtcat = X,
                                                        area = area,
                                                        area_units = "hectares",
                                                        approximate_segment_count = area / (160 / 2.47),
                                                        sum_observed_relwgts = NA,
                                                        sum_relwgts = NA,
                                                        observed_proportion = 0,
                                                        observed_area = 0,
                                                        unobserved_area = area,
                                                        conditional_weight = NA,
                                                        conditional_area = NA,
                                                        weight_adjustment = NA,
                                                        point_count = 0,
                                                        observed_point_count = 0,
                                                        stringsAsFactors = FALSE)
                            output_points <- NULL
                          }

                          return(list(points = output_points,
                                      wgtcat = output_wgtcat))
                        })

  point_weights <- do.call(rbind,
                           lapply(X = weight_info,
                                  FUN = function(X){
                                    X[["points"]]
                                  }))
  wgtcat_summary <- do.call(rbind,
                            lapply(X = weight_info,
                                   FUN = function(X){
                                     X[["wgtcat"]]
                                   }))
  return(list(point_weights = point_weights,
              wgtcat_summary = wgtcat_summary))
}
nstauffer/aim.analysis documentation built on Nov. 2, 2023, 12:52 a.m.