#' Validate data
#'
#' After cleaning the data with \code{\link{clean}}, check to make sure the data don't have any erroneously repeated values.
#'
#' @importFrom dplyr select
#' @importFrom dplyr matches
#' @importFrom dplyr all_of
#' @importFrom dplyr group_by
#' @importFrom dplyr mutate
#' @importFrom dplyr n
#' @importFrom dplyr relocate
#' @importFrom dplyr ungroup
#' @importFrom dplyr summarize
#' @importFrom dplyr across
#' @importFrom dplyr filter
#' @importFrom dplyr sym
#' @importFrom dplyr contains
#' @importFrom dplyr row_number
#' @importFrom dplyr rename
#' @importFrom dplyr arrange
#' @importFrom dplyr desc
#' @importFrom tidyr unite
#' @importFrom tidyr pivot_longer
#' @importFrom stringr str_sub
#' @importFrom stringr str_split
#' @importFrom dplyr left_join
#' @importFrom dplyr bind_rows
#'
#' @param x The object created after cleaning data with \code{\link{clean}}
#' @param type Type of validation to perform. Acceptable values include:
#' \itemize{
#' \item vertical - Checks for repetition vertically in species and/or bag fields, grouped by dl_state and dl_date
#' \item horizontal - Checks for repetition horizontally, across each record
#' }
#' @param all Should all species groups be checked (TRUE)? If set to FALSE (default), then only ducks will be vertically checked and only ducks, geese, and coots_snipe will be horizontally checked.
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
#'
#' @export
validate <-
function(x, type, all = FALSE){
if (type == "vertical") {
if (all == FALSE) {
# Quick check (ducks only)
validated_v <-
x |>
# Subset the data
select(dl_state, dl_date, ducks_bag) |>
group_by(dl_state, dl_date) |>
# Add number of rows per group
mutate(dl_rows = n()) |>
# Move dl_rows to 3rd col position
relocate(dl_rows, .after = dl_date) |>
ungroup() |>
group_by(dl_state, dl_date, dl_rows) |>
# Count the number of unique values in each species column
summarize(ducks_bag = length(unique(ducks_bag)), .groups = "keep") |>
pivot_longer(
cols = !contains("dl"),
names_to = "spp",
values_to = "v_rep") |>
ungroup() |>
# Keep only the uniform spp groups
filter(v_rep == 1) |>
# Report count of times uniform values are repeated
mutate(v_repeated = dl_rows) |>
select(-c("dl_rows", "v_rep"))
} else if (all == TRUE) {
# All species columns
vr <-
x |>
# Subset the data
select(
dl_state,
source_file,
all_of(ref_bagfields)) |>
group_by(source_file) |>
# Add number of rows per source_file
mutate(dl_rows = n()) |>
ungroup() |>
# Move dl_rows to 3rd col position
relocate(dl_rows, .after = source_file) |>
group_by(dl_state, source_file, dl_rows) |>
# Count the number of unique values in each species column
summarize(
across(
all_of(ref_bagfields),
~length(unique(.x))),
.groups = "keep") |>
pivot_longer(
cols = !matches("dl|source"),
names_to = "spp",
values_to = "n_unique_values") |>
ungroup() |>
# Keep only the spp groups with uniform values
filter(n_unique_values == 1) |>
mutate(v_repeated = dl_rows) |>
select(dl_state, source_file, spp, v_repeated)
validated_v <-
vr |>
# Join in what value was repeated
mutate(
repeated_value =
map(
1:nrow(vr),
~investigate(
x,
loc = vr$dl_state[.x],
period_type = "dl_date",
period =
str_extract(
vr$source_file[.x], "[0-9].+(?=\\.txt)"),
species = vr$spp[.x]) |>
pull(1)) |>
unlist()
) |>
# Keep only the spp groups with more than 1 possible bag value AND
# filter out expected vertical repetition for bag values that are wrong,
# e.g. received 1s for no-season instead of 0s, etc; this is already
# reported by strataCheck function and clutters the output here
# i.e. filter out "no season" species/states
left_join(
hip_bags_ref |>
select(-FWSstratum) |>
group_by(state, spp) |>
filter(n() == 1) |>
ungroup() |>
rename(
dl_state = state,
repeated_value = stateBagValue) |>
mutate(repeated_value = as.character(repeated_value)) |>
# Join in state/species combinations that are supposed to be
# all 0s because we receive those bag values in another format
bind_rows(pmt_files |> rename(repeated_value = value)) |>
distinct() |>
select(-repeated_value) |>
mutate(flag = "no season"),
by = c("dl_state", "spp")
) |>
filter(is.na(flag)) |>
select(-flag)
}
if (nrow(validated_v) != 0) {
return(validated_v)
} else {
message("Data are good to go! No vertical repetition detected.")
}
} else if (type == "horizontal") {
if(all == FALSE){
# Quick check (duck, goose, & coots_snipe only)
h_test <-
x |>
# Subset the data
select(source_file, ducks_bag, geese_bag, coots_snipe) |>
group_by(source_file) |>
# Paste all of the species group values together
unite(h_string, !contains("source"), sep = "-") |>
ungroup() |>
mutate(
# Subset the first value from each horizontal bag string, useful
# later if the values are repeated
h_first = str_sub(h_string, 0, 1),
# Convert string to vector
h_string = str_split(h_string, "-"),
# Create unique row key
rowkey = row_number())
} else if (all == TRUE) {
# All species columns
h_test <-
x |>
# Subset the data
select(
source_file,
all_of(ref_bagfields)) |>
group_by(source_file) |>
# Paste all of the species group values together
unite(h_string, !contains("source"), sep = "-") |>
ungroup() |>
mutate(
# Subset the first value from each horizontal bag string, useful
# later if the values are repeated
h_first = str_sub(h_string, 0, 1),
# Convert string to vector
h_string = str_split(h_string, "-"),
# Create unique row key
rowkey = row_number())
}
validated_h <-
h_test |>
group_by(rowkey) |>
# Calculate the number of unique values used by each row
mutate(h_validate = length(unique(h_test$h_string[[rowkey]]))) |>
ungroup() |>
select(-c("h_string", "rowkey")) |>
# If only one value was used across the row, mark it as containing
# repeats
mutate(h_validate = ifelse(h_validate == "1", "rep", "not-rep")) |>
# Calculate the total number of rows per file
group_by(source_file) |>
mutate(h_total = n()) |>
ungroup() |>
# Keep only the rows that have repeats indicated
filter(h_validate == "rep") |>
# Count the number of repeated rows per file
group_by(source_file, h_first, h_total) |>
summarize(h_rep = n(), .groups = "keep") |>
ungroup() |>
# Calculate the proportion of repeats per file
group_by(source_file) |>
mutate(prop_repeat = h_rep/h_total) |>
ungroup() |>
relocate(h_rep, .before = h_total) |>
# Rename h_first so it is more clear that the field indicates the
# repeated horizontal value
rename(h_value = h_first) |>
# Sort with descending proportion of repeats
arrange(desc(prop_repeat))
if (nrow(validated_h) > 0) {
return(validated_h)
} else {
message("Data are good to go! No horizontal repetition detected.")
}
} else {
# Return for incorrect type given
message(
paste0(
"Error: Incorrect value supplied for `type` parameter. Please",
" choose: 'vertical' or 'horizontal'.")
)
}
}
#' Investigate data
#'
#' The internal \code{investigate} function looks into vertical repetition in \code{\link{validate}}.
#'
#' @importFrom dplyr select
#' @importFrom dplyr all_of
#' @importFrom dplyr filter
#' @importFrom dplyr distinct
#' @importFrom dplyr quo_name
#'
#' @param x The object created after cleaning data with \code{\link{clean}}
#' @param loc The download state in question
#' @param period_type The type of time period that will be given. You may choose one of two types:
#' \itemize{
#' \item dl_date
#' \item dl_cycle}
#' @param period Value of the time period in question
#' @param species The bird group in question. One of the bird groups from the following list may be supplied:
#' \itemize{
#' \item ducks_bag, geese_bag, dove_bag, woodcock_bag, coots_snipe, rails_gallinules, cranes, band_tailed_pigeon, brant, seaducks}
#'
#' @author Abby Walter, \email{abby_walter@@fws.gov}
#' @references \url{https://github.com/USFWS/migbirdHIP}
investigate <-
function(x, loc, period_type, period, species){
# Pull requested value
investigated_x <-
x |>
select(dl_state, {{period_type}}, all_of(ref_bagfields), source_file) |>
filter(dl_state == loc & !!sym(period_type) == period) |>
select(quo_name(species), source_file) |>
distinct()
if(nrow(investigated_x) == 0) {
message("Are you sure you entered all of the parameters correctly?")
}
else{
return(investigated_x)
}
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.