#' Pull catch data for satisfactory tows from the NWFSC data warehouse
#'
#' Pull catch data from the
#' [NWFSC data warehouse](https://www.webapps.nwfsc.noaa.gov/data)
#' for a single species or all observed species, where the latter is specified
#' by leaving both `common_name = NULL` and `sci_name = NULL`.
#'
#' @details
#' The data available in the warehouse are cleaned prior to being downloaded
#' with the intent that they provide the best available information for use
#' in an index-standardization procedure. The removed samples may be of use
#' to others with a less-restrictive goal than producing an index of abundance.
#' For example, life-stage samples are excluded because they are not collected
#' using the same protocols as standard samples.
#' To download all data, we currently recommend going to the
#' [NWFSC data warehouse](https://www.webapps.nwfsc.noaa.gov/data)
#' and using the csv link to extract data for a single species at a time.
#' In the future, we hope to add functionality to this package such that
#' downloading all data can be done easily within this function.
#' See [Issue #43](https://github.com/pfmc-assessments/nwfscSurvey/issues/43)
#' for more information.
#'
#' @template common_name
#' @template sci_name
#' @template years
#' @template survey
#' @template dir
#' @template convert
#' @template verbose
#'
#' @author Chantel Wetzel
#' @export
#'
#' @import chron
#' @importFrom stringr str_replace_all
#' @importFrom dplyr left_join rename
#' @import glue
#'
#' @examples
#' \dontrun{
#' # survey is only arg that has to be specified
#' catch_data <- pull_catch(survey = "NWFSC.Combo")
#'
#' # Example with specified common name
#' catch_data <- pull_catch(common_name = "vermilion rockfish",
#' survey = "NWFSC.Combo")
#'
#' # Example with specified scientific name
#' catch_data <- pull_catch(sci_name = "Eopsetta jordani",
#' survey = "NWFSC.Combo")
#'
#' # Example with multiple names
#' catch_data <- pull_catch(common_name = c("vermilion rockfish",
#' "vermilion and sunset rockfish"), survey = "NWFSC.Combo")
#'
#' catch_data <- pull_catch(sci_name = c("Sebastes miniatus",
#' "Sebastes sp. (crocotulus)",
#' "Sebastes sp. (miniatus / crocotulus)"),
#' survey = "NWFSC.Combo")
#' }
#'
pull_catch <- function(common_name = NULL,
sci_name = NULL,
years = c(1970, 2050),
survey,
dir = NULL,
convert = TRUE,
verbose = TRUE) {
if (survey %in% c("NWFSC.Shelf.Rockfish", "NWFSC.Hook.Line")) {
stop("The catch pull currently does not work for NWFSC Hook & Line Survey data.",
"\nA subset of the data is available on the data warehouse https://www.webapp.nwfsc.noaa.gov/data",
"\nContact John Harms (john.harms@noaa.gov) for the full data set.")
}
if(length(c(common_name, sci_name)) != max(c(length(common_name), length(sci_name)))){
stop("Can not pull data using both the common_name or sci_name together.
\n Please retry using only one." )
}
check_dir(dir = dir, verbose = verbose)
if (is.null(common_name)) {
var_name <- "scientific_name"
species <- sci_name
} else {
var_name <- "common_name"
species <- common_name
}
if (is.null(sci_name) & is.null(common_name)) {
var_name <- "common_name"
species <- "pull all"
}
# Survey options available in the data warehouse
project_long <- check_survey(survey = survey)
if (length(years) == 1) {
years <- c(years, years)
}
# Pull data for the specific species for the following variables
# Can only pull the nested fields (legacy performance and statistical partition) if
# the main table fields are specified. Could pull separate and then join which
# would allow us to eliminate vars_long form the main pull
perf_codes <- c(
"operation_dim$legacy_performance_code",
"statistical_partition_dim$statistical_partition_type"
)
vars_long <- c(
"common_name", "scientific_name", "project", "year", "vessel", "tow",
"total_catch_numbers", "total_catch_wt_kg",
"subsample_count", "subsample_wt_kg", "cpue_kg_per_ha_der",
perf_codes
)
# These are the retained and returned fields
vars_short <- vars_long[!vars_long %in% perf_codes]
# symbols here are generally: %22 = ", %2C = ",", %20 = " "
species_str <- convert_to_hex_string(species)
add_species <- paste0("field_identified_taxonomy_dim$", var_name, "|=[", species_str,"]")
if (any(species == "pull all")) {
add_species <- ""
}
url_text <- get_url(data_table = "trawl.catch_fact",
project_long = project_long,
add_species = add_species,
years = years,
vars_long = vars_long)
if (verbose) {
message("Pulling catch data. This can take up to ~ 30 seconds (or more).")
}
# Pull data from positive tows for selected species
positive_tows <- try(get_json(url = url_text))
if(!is.data.frame(positive_tows)){
stop()
}
# Remove water hauls
water_hauls <- is.na(positive_tows[, "operation_dim$legacy_performance_code"])
if (sum(water_hauls) > 0) {
positive_tows[water_hauls, "operation_dim$legacy_performance_code"] <- -999
}
# Retain on standard survey samples
# whether values are NA or "NA" varies based on the presence of "Life Stage" samples
standard_samples <- sum(is.na(positive_tows[, "statistical_partition_dim$statistical_partition_type"])) != nrow(positive_tows)
if (standard_samples) {
keep <- positive_tows[, "statistical_partition_dim$statistical_partition_type"] == "NA"
positive_tows <- positive_tows[keep, ]
}
good_tows <- positive_tows[, "operation_dim$legacy_performance_code"] != 8
positive_tows <- positive_tows[good_tows, ]
positive_tows <- positive_tows[, vars_short]
if(sum(is.na(positive_tows[, "common_name"])) > 0) {
replace <- which(is.na(positive_tows[, "common_name"]))
positive_tows[replace, "common_name"] <- positive_tows[replace, "scientific_name"]
}
# Pull all tow data including tows where the species was not observed
vars_long <- c("project", "year", "vessel", "pass", "tow", "datetime_utc_iso",
"depth_m", "longitude_dd", "latitude_dd", "area_swept_ha_der",
"trawl_id", "operation_dim$legacy_performance_code")
vars_short <- vars_long[vars_long != "operation_dim$legacy_performance_code"]
url_text <- get_url(data_table = "trawl.operation_haul_fact",
project_long = project_long,
years = years,
vars_long = vars_long)
all_tows <- try(get_json(url = url_text))
# Remove water hauls
water_hauls <- is.na(all_tows[, "operation_dim$legacy_performance_code"])
if (sum(water_hauls) > 0) {
all_tows[water_hauls, "operation_dim$legacy_performance_code"] <- -999
}
keep <- all_tows[, "operation_dim$legacy_performance_code"] != 8
all_tows <- all_tows[keep, ]
all_tows <- all_tows[, vars_short]
all_tows <- all_tows[
!duplicated(paste(all_tows$year, all_tows$pass, all_tows$vessel, all_tows$tow)),
#c("project", "trawl_id", "year", "pass", "vessel", "tow", "datetime_utc_iso", "depth_m",
# "longitude_dd", "latitude_dd", "area_swept_ha_der"
#)
]
positive_tows_grouped <- dplyr::group_by(
.data = positive_tows,
common_name, scientific_name
)
# Split positive_tows into 1 data frame for each combination of common_name
# and scientific_name and store in a named list for purrr::map()
positive_tows_split <- dplyr::group_split(positive_tows_grouped)
group_names <- dplyr::group_keys(positive_tows_grouped)
names(positive_tows_split) <- tidyr::unite(group_names, col = "groups") |>
dplyr::pull(groups)
# For each data frame in the large list, find the tows that are not present
# in positive_tows and join them into a single data frame
# Give them the appropriate common and scientific names using .id then split
# the concatenated column out into the two original columns
names_intersect <- intersect(colnames(all_tows), colnames(positive_tows))
zero_tows <- purrr::map_df(
.x = positive_tows_split,
.f = \(y) dplyr::anti_join(x = all_tows, y = y, by = names_intersect),
.id = "groups"
) |>
tidyr::separate_wider_delim(
cols = "groups",
delim = "_",
names = colnames(group_names)
)
# Join the positive tows with the tow information
positive_tows_with_tow_info <- dplyr::left_join(
x = positive_tows,
y = all_tows,
by = intersect(colnames(all_tows), colnames(positive_tows))
)
# Join the augmented positive tow information with the zero tows
# arrange by common_name and tow_id
catch <- dplyr::full_join(
x = positive_tows_with_tow_info,
y = zero_tows,
by = c(colnames(group_names), colnames(all_tows))
) |>
dplyr::arrange(common_name, trawl_id)
# Need to check what this is doing
no_area <- which(is.na(catch$area_swept_ha_der))
if (length(no_area) > 0) {
if (verbose) {
print(
glue::glue("There are {length(no_area)} records with no area swept calculation. These record will be filled with the mean swept area across all tows."))
print(
catch[no_area, c("trawl_id", "year", "area_swept_ha_der", "cpue_kg_per_ha_der", "total_catch_numbers")])
}
catch[no_area, "area_swept_ha_der"] <- mean(catch$area_swept_ha_der, trim = 0.05, na.rm = TRUE)
}
# Fill in zeros where needed
catch[is.na(catch)] <- 0
catch$date <- chron::chron(
format(as.POSIXlt(catch$datetime_utc_iso, format = "%Y-%m-%dT%H:%M:%S"), "%Y-%m-%d"),
format = "y-m-d", out.format = "YYYY-m-d")
catch$trawl_id <- as.character(catch$trawl_id)
# kg / km2 <- (100 hectare / 1 *km2) * (kg / hectare)
catch$cpue_kg_km2 <- catch$cpue_kg_per_ha_der * 100
colnames(catch)[which(colnames(catch) == "area_swept_ha_der")] <- "area_swept_ha"
if(convert) {
firstup <- function(x) {
substr(x, 1, 1) <- toupper(substr(x, 1, 1))
x
}
colnames(catch) <- firstup(colnames(catch))
colnames(catch)[colnames(catch)=="Cpue_kg_km2"] <- "cpue_kg_km2"
colnames(catch)[colnames(catch)=="Cpue_kg_per_ha_der"] <- "cpue_kg_per_ha_der"
colnames(catch)[colnames(catch)=="Total_catch_numbers"] <- "total_catch_numbers"
colnames(catch)[colnames(catch)=="Total_catch_wt_kg"] <- "total_catch_wt_kg"
}
save_rdata(
x = catch,
dir = dir,
name_base = paste0("catch_", species, "_", survey),
verbose = verbose
)
return(catch)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.