R/lincoln.R

Defines functions get_lincoln_estimates compare_harvest_rates get_comparison_dataframe get_confidence_levels get_harvest_rate get_reporting_probabilities has_recovery_rate has_reporting_probas get_direct_recoveries summarize_recoveries summarize_bandings

Documented in compare_harvest_rates get_direct_recoveries get_harvest_rate get_lincoln_estimates summarize_bandings summarize_recoveries

#' @title Summarize bandings
#' @description Counts the number of bird banded by year. Grouping is done on the
#' *b.year* column.
#' @param df A banding dataframe
#' @return A summary dataframe with a new column named *n_banded* which contains
#' the number of banded birds by year
#' @rdname summarize_bandings
#' @export
summarize_bandings <- function(df) {
  res = df %>%
    group_by(b.year) %>%
    summarise(n_banded = sum(count_of_birds))
  return(res)
}

#' @title Summarize recoveries
#' @description Counts the number of recoveries of birds banded in the same year.
#' If the data is not grouped by bands, it will be grouped by age class
#' (*age_short* column), banding year(*b.year*) and recovery year (*r.corrected_year*).
#' @param df A recovery dataframe.
#' @param by_band Should the results be sorted by band type? Default: FALSE
#' @return The summarized dataframe
#' @rdname summarize_recoveries
#' @export
summarize_recoveries <- function(df, by_band = FALSE) {
  by_groups <- (if (by_band) {
    c("b.year", "add_info")
  } else {
    c("age_short", "b.year", "r.corrected_year")
  })
  res <- df %>% filter(b.year == r.corrected_year) %>%
    group_by_at(by_groups) %>%
    summarise(n_recoveries = n())
  return(res)
}

#' @title Get direct recoveries
#' @description Get all direct recoveries by merging summaries of banding and
#' recovery data. This function takes raw datasets, filters them if needed,
#' summarizes them, merge them and calculates recovery rate with variance and
#' standard error.
#' @param banding_df A dataframe contaning banding data extracted from Gamebirds
#' and already cleaned.
#' @param recoveries_df A dataframe contaning recovery data extracted from Gamebirds
#' and already cleaned.
#' @param filters Filters to apply to the dataframes. Can contain database
#' specific filters. This argument will be used if the data is not filtered
#' and *banding_filters* or *recoveries_filters* are not provided, Default: NULL
#' @param banding_filters Filters to apply to the banding dataset, Default: NULL
#' @param recoveries_filters Filters to apply to the recoveries dataset, Default: NULL
#' @param filtered Are the datasets already filtered?, Default: FALSE
#' @return A dataframe containing summarized banding and recoveries data with
#' a recovery rate, variance and standard error.
#' @details The species for which the data is compiled is stored in the SPEC
#' attribute of the returned dataframe
#' @rdname get_direct_recoveries
#' @seealso \code{\link{clean_dataset}}, \code{\link{summarize_bandings}},
#' \code{\link{summarize_recoveries}}
#' @export
get_direct_recoveries <-
  function(banding_df,
           recoveries_df,
           filters = NULL,
           banding_filters = NULL,
           recoveries_filters = NULL,
           filtered=FALSE) {
    if (!filtered) {
      b_filters <-
        if (is.null(banding_filters))
          filters
      else
        banding_filters
      banding_db <-
        filter_database(db = banding_df,
                        filters = b_filters,
                        db_type = "b")
      r_filters <-
        if (is.null(recoveries_filters))
          filters
      else
        recoveries_filters
      rec_db <-
        filter_database(db = recoveries_df,
                        filters = filters,
                        db_type = "r")
      SPEC = b_filters$SPEC
    } else {
      banding_db <- banding_df
      rec_db <- recoveries_df
      SPEC = unique(banding_db$SPEC)
    }

    bandings <- summarize_bandings(banding_db)
    recoveries <- summarize_recoveries(rec_db)

    dr_df <- merge(bandings, recoveries)

    dr_df <-
      dr_df %>% mutate(recov_rate = n_recoveries / n_banded) %>%
      mutate(var_recov_rate = (recov_rate * (1 - recov_rate)) / (n_banded - 1)) %>%
      mutate(se_recov_rate = sqrt(var_recov_rate))
    attr(dr_df, "SPEC") = SPEC
    return(dr_df)
  }

has_reporting_probas <- function(df) {
  return (all(c("rho", "var_rho") %in% colnames(df)))
}

has_recovery_rate <- function(df) {
  return (all(c("recov_rate", "var_recov_rate") %in% colnames(df)))
}

get_reporting_probabilities <- function(df, rho_df) {
  if (!has_reporting_probas(df)) {
    if (is.null(rho_df)) {
      rho_df = gb_reporting_probas
    }
    df <- inner_join(df, rho_df, by = "b.year")
  }
  return (df)
}


#' @title Calculate the harvest rate based on reporting probabilities
#' @description Takes a dataframe generated by \code{\link{get_direct_recoveries}},
#' a dataframe containing reproting probabilities and calculates the harvest rates
#' based on recovery data of banded birds. Variance, standard error, confidence
#' levels and coefficient of variation are also calculated for the harvest rate.
#' The harvest rate is estimated based on data described in Alisauskas *et al.*, (2014)
#'
#' @param df A dataframe generated by  \code{\link{get_direct_recoveries}}, Default: NULL
#' @param rho_df A dataframe with reporting probabilities, Default: NULL
#' @param ... If `df` is not provided, additionnal arguments passed to
#' \code{\link{get_direct_recoveries}} to generate the direct directories dataframe
#' @return A dataframe with reporting probabilities and havest rate with
#' variance, standard error, confidence levels and coefficient of variation.
#' @rdname get_harvest_rate
#' @seealso \code{\link{get_direct_recoveries}}
#' @export
get_harvest_rate <-
  function(df = NULL,
           rho_df = NULL,
           ...) {
    hr_df <- if (is.null(df)) {
      get_direct_recoveries(...)
    } else {
      if (!has_recovery_rate(df)) {
        stop(
          paste(
            "Missing columns for df argument. Please make sure the dataframe",
            "has been generated by the get_direct_recoveries() function"
          )
        )
      }
      df
    }
    hr_df <- get_reporting_probabilities(hr_df, rho_df)

    hr_df <- hr_df %>%
      mutate(harv_rate = recov_rate / rho) %>%
      mutate(var_harv_rate = (var_recov_rate / (rho ^ 2)) + ((recov_rate ^ 2 * var_rho) / rho ^ 4)) %>%
      mutate(se_harv_rate = sqrt(var_harv_rate)) %>%
      mutate(cl_harv_rate = se_harv_rate * 1.96) %>%
      mutate(cv_harv_rate = se_harv_rate / harv_rate)
    return(hr_df)
  }



get_confidence_levels <- function(df) {
  return(as.data.frame(cbind(
    b.year = df$b.year,
    start = (df$h - df$cl_h),
    end = (df$h + df$cl_h)
  )))
}

get_comparison_dataframe <-
  function(df = NULL,
           filters = NULL,
           pos = 1,
           ...) {
    if (is.null(df)) {
      if (is.null(filters)) {
        stop(sprintf(
          paste(
            "Dataframe to compare is missing. You must either",
            "provide a filtered data frame with the `df%s` argument or a",
            "list of filters with the `filters%s` argument."
          ),
          pos,
          pos
        ))
      }
      df <- get_harvest_rate(filters = filters, ...)
      if ("filter_name" %in% names(filters)) {
        df["type"] <- filters$filter_name
      }
    }
    if (!"type" %in% colnames(df)) {
      df["type"] = paste0("filters", pos)
    }
    return(df)
  }

#' @title Compares two harvest rates dataframes created from different filters
#' @description This function takes either two dataframes returned by
#' \code{\link{get_harvest_rate}} or creates two from different filters.
#' It then compares the harvest rates for the two dataframes over the years
#' to see if differences exist.
#' @param df1 First dataframe to compare, Default: NULL
#' @param df2 Second dataframe to compare, Default: NULL
#' @param filters1 Filters used to create *df1* if it is not provided, Default: NULL
#' @param filters2 Filters used to create *df2* if it is not provided, Default: NULL
#' @param plot Plot the comparisons, Default: TRUE
#' @param check_overlap Check if all confidence levels overlap, Default: TRUE
#' @param ... Additional arguments needed by \code{\link{get_harvest_rate}} to
#' create the dataframes if they are not provided
#' @return A logical specifiying if all confidence levels overlap if *check_overlap*
#' is set to TRUE.
#' @rdname compare_harvest_rates
#' @seealso  \code{\link{get_harvest_rate}}
#' @export
compare_harvest_rates <-
  function(df1 = NULL,
           df2 = NULL,
           filters1 = NULL,
           filters2 = NULL,
           plot = TRUE,
           check_overlap = TRUE,
           ...) {
    # All Bands
    df1 <- get_comparison_dataframe(df1, filters1, 1, ...)
    df2 <- get_comparison_dataframe(df2, filters2, 2, ...)

    if (plot) {
      library(ggplot2)
      df <- rbind(df1, df2)
      both_plot <-
        ggplot(data = df, aes(x = b.year, y = harv_rate)) +
        geom_point(aes(color = type)) +
        geom_errorbar(aes(
          ymin = harv_rate - cl_harv_rate,
          ymax = harv_rate + cl_harv_rate
        ))
      print(both_plot)
    }

    if (check_overlap) {
      ## Compute overlap
      common_years <- intersect(df1$b.year, df2$b.year)

      # Get confidence levels
      cl1 <-
        get_confidence_levels(df1[df1$b.year %in% common_years, ])
      cl2 <-
        get_confidence_levels(df2[df2$b.year %in% common_years, ])
      # Check if there is an overlap
      overlap <-
        (cl2$start <= cl1$end) &
        (cl2$end >= cl1$start)
      # Return TRUE if all confidence levels overlap
      return(all(overlap))
    }
  }


#' @title Get Lincoln estimates
#' @description This is the function that calculates the Lincoln estimates based
#' on the paper by Alisauskas *et al.*, (2014) using harvest estimates and
#' a harvest rate dataframe created by \code{\link{get_harvest_rate}}
#' @param df A harvest rate dataframe generated by \code{\link{get_harvest_rate}}
#' , Default: NULL
#' @param harvest_df A dataframe containing harvest data estimates for the target
#' species and target years. The dataframe must have a column named b.year,
#'  Default: NULL
#' @param harvest_correction_factor A factor used to correct the harvest
#' estimations which tend to be overstimated. By default, uses the factor for
#' goose species collected after 1999 as described by Padding and Royle (2012),
#' Default: 0.61
#' @param plot_estimates Should the estimates be plotted, Default: TRUE
#' @param save_estimates Should the estimates be saved in csv format, Default: TRUE
#' @param save_path The path where estimates should be saved, Default: '.'
#' @param ... Additional arguments needed by \code{\link{get_harvest_rate}} to
#' create the *df* if it is not provided
#' @return A dataframe containing the Lincoln estimates
#' @rdname get_lincoln_estimates
#' @export
get_lincoln_estimates <-
  function(df = NULL,
           harvest_df = NULL,
           harvest_correction_factor = 0.61,
           plot_estimates = TRUE,
           save_estimates = TRUE,
           save_path = ".",
           ...) {
    if (is.null(df)) {
      df <- get_harvest_rate(...)
    }
    if (is.null(harvest_df)) {
      # Should be an error since it is a big portion of estimates
      stop("A harvest dataframe for the selected species should be provided")
    }
    lincoln_df <- inner_join(df, harvest_df, by = "b.year")
    lincoln <- lincoln_df %>%
      mutate(harvest_adj = harvest * harvest_correction_factor) %>%
      mutate(se_harvest_adj = se_harvest * harvest_correction_factor) %>%
      mutate(var_harvest_adj = se_harvest_adj ^ 2) %>%
      mutate(N = ((((n_banded + 1) * (harvest_adj + 1) * rho
      ) / (n_recoveries + 1)) - 1)) %>%
      mutate(var_N_b.r = ((n_banded + 1) * (n_banded - n_recoveries)) / (((n_recoveries +
                                                                             1) ^ 2) * (n_recoveries + 2))) %>%
      mutate(var_N_bH.r =  ((n_banded / n_recoveries) ^ 2) * var_harvest_adj + harvest_adj ^
               2 * var_N_b.r) %>%
      mutate(var_N = (((
        n_banded * harvest_adj
      ) / n_recoveries) ^ 2) * var_rho + rho ^
        2 * var_N_bH.r) %>%
      mutate(se_N = sqrt(var_N)) %>%
      mutate(cl_N = se_N * 1.96)


    if (plot_estimates) {
      library(ggplot2)
      plt <- ggplot(data = lincoln, aes(x = b.year, y = N)) +
        geom_point() + geom_smooth() +
        scale_y_continuous(name = "Abundance (Lincoln)", limits = c(0, 500000)) +
        geom_errorbar(aes(ymin = N - cl_N, ymax = N + cl_N)) +
        labs(x = "Year")
      print(plt)
    }

    lincoln_est <-
      as.data.frame(cbind(lincoln$b.year, lincoln$N, lincoln$cl_N))
    colnames(lincoln_est) <- c("year", "N", "NCL")
    if (save_estimates) {
      print(attributes(df)$SPEC)
      SPEC <- if ("SPEC" %in% names(attributes(df))) {
        attributes(df)$SPEC
      } else {
        warning("Species was not found in the estimation dataframe. Please
                make sure df was created by the get_harvest_rate() function.
                Using 'UNKN' as species.")
        "UNKN"
      }
      file_path = file.path(save_path,
                            sprintf(
                              '%s_Lincoln_2000_2019_%s.csv',
                              SPEC,
                              format(Sys.time(), "%b%d_%Y")
                            ))
      write.csv(lincoln_est,
                file_path)
    }
    return(lincoln_est)

  }
Vin985/gblincoln documentation built on April 21, 2022, 1:49 a.m.