R/validation-functions.R

Defines functions validate_n_fishers validate_sites validate_gear_type validate_vessel_type validate_price_weight validate_catch_params alert_outlier validate_catch_price validate_landing_regularity validate_surveys_time validate_this_imei get_deployed_imeis

Documented in alert_outlier validate_catch_params validate_catch_price validate_price_weight validate_surveys_time

# Get a vector with imeis deployed in the field
get_deployed_imeis <- function(metadata) {
  metadata$devices %>%
    dplyr::right_join(metadata$device_installs,
      by = c("id" = "device_imei")
    ) %>%
    dplyr::filter(!is.na(.data$device_imei)) %>%
    magrittr::extract2("device_imei")
}

# Perform tests for a single imei and return the corrected value and the flag
validate_this_imei <- function(this_imei, this_id = NULL, valid_imeis) {
  this_id <- as.integer(this_id)

  # If imei is NA there is nothing to validate
  if (is.na(this_imei)) {
    out <- list(imei = NA_character_, alert_number = NA_integer_, submission_id = this_id)
    return(out)
  }

  # Zero seems to be used for no IMEI as well
  if (this_imei == "0") {
    out <- list(imei = NA_character_, alert_number = NA_integer_, submission_id = this_id)
    return(out)
  }

  # If the IMEI is negative it was probably a typo
  this_imei <- as.numeric(this_imei)
  if (this_imei < 0) this_imei <- this_imei * -1

  # Optimistically we need at least 5 digits to work with and that might be
  if (this_imei < 9999) {
    out <- list(imei = NA_character_, alert_number = 1, submission_id = this_id)
    return(out)
  }

  # If a valid IMEI is found replace it
  imei_regex <- paste0(as.character(this_imei), "$")
  imei_matches <- stringr::str_detect(valid_imeis, imei_regex)
  n_matches <- sum(imei_matches)
  if (n_matches == 1) {
    list(imei = valid_imeis[imei_matches], alert_number = NA_integer_, submission_id = this_id)
  } else if (n_matches > 1) {
    list(imei = NA_character_, alert_number = 2, submission_id = this_id)
  } else if (n_matches == 0) {
    list(imei = NA_character_, alert_number = 3, submission_id = this_id)
  }
}


#' Validate surveys' temporal parameters
#'
#' This function takes a preprocessed landings' matrix and validate temporal
#' info associated to each survey.
#'
#' @param data A preprocessed data frame
#' @param hrs Limit of trip duration in hours to be considered a valid catch
#'   session.
#' @param submission_delay Limit for maximum difference (in days) between the
#'   survey submission date and the recorded landing datw
#'
#' @return A list containing data frames with validated catch dates and catch
#'   duration.
#'
#' @importFrom rlang .data
#' @export
#'
#' @examples
#' \dontrun{
#' pars <- read_config()
#' landings <- get_merged_landings(pars)
#' validate_surveys_time(landings, hrs = 18)
#' }
validate_surveys_time <- function(data, hrs = NULL, submission_delay) {
  validated_time <- list(
    validated_dates = data %>%
      dplyr::select(.data$`_id`, .data$date, .data$`_submission_time`) %>%
      dplyr::mutate(
        `_submission_time` = lubridate::ymd_hms(.data$`_submission_time`),
        submission_date = lubridate::with_tz(.data$`_submission_time`, "Asia/Dili"),
        date = as.POSIXct(.data$date, tz = "Asia/Dili"),
        d = date - .data$submission_date
      ) %>%
      dplyr::transmute(
        # Alert needs to be checked before editing the date column
        alert_number = dplyr::case_when(
          # test if submission date is prior catch date
          .data$date > .data$submission_date ~ 4,
          .data$date < .data$submission_date - lubridate::duration(submission_delay, units = "days") ~ 10,
          TRUE ~ NA_real_
        ),
        date = as.Date(.data$date, tz = "Asia/Dili"),
        submission_id = as.integer(.data$`_id`)
      ),
    validated_duration = data %>%
      dplyr::select(.data$`_id`, .data$`trip_group/duration`) %>%
      dplyr::mutate(`trip_group/duration` = abs(as.numeric(.data$`trip_group/duration`))) %>%
      dplyr::transmute(
        trip_length = dplyr::case_when(
          .data$`trip_group/duration` > hrs |
            .data$`trip_group/duration` < 1 ~ NA_real_,
          TRUE ~ .data$`trip_group/duration`
        ), # test if catch duration is longer than n hours or minor than 1 hour
        alert_number = dplyr::case_when(
          .data$`trip_group/duration` > hrs |
            .data$`trip_group/duration` < 1 ~ 5,
          TRUE ~ NA_real_
        ),
        submission_id = as.integer(.data$`_id`)
      )
  )
  validated_time
}

validate_landing_regularity <- function(landings) {
  regularity_alerts <-
    landings %>%
    dplyr::select(.data$`_id`, .data$total_catch_value, .data$species_group) %>%
    tidyr::unnest(.data$species_group, keep_empty = TRUE) %>%
    tidyr::unnest(.data$length_individuals, keep_empty = TRUE) %>%
    dplyr::select(.data$`_id`, .data$species, .data$total_catch_value, .data$n_individuals) %>%
    dplyr::mutate(
      total_catch_value = as.double(.data$total_catch_value),
      total_catch_value = abs(.data$total_catch_value),
      n_individuals = abs(.data$n_individuals)
    ) %>%
    dplyr::group_by(.data$`_id`) %>%
    dplyr::summarise(
      species = dplyr::first(.data$species),
      total_catch_value = dplyr::first(.data$total_catch_value),
      n_individuals = sum(.data$n_individuals, na.rm = T)
    ) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(
      alert_regularity = dplyr::case_when(
        .data$species == "0" & .data$n_individuals > 0 |
          .data$species == "0" & .data$total_catch_value > 0 |
          !.data$species == "0" & .data$n_individuals <= 0 |
          !.data$species == "0" & .data$total_catch_value <= 0 |
          .data$total_catch_value <= 0 & .data$n_individuals > 0 |
          .data$total_catch_value > 0 & .data$n_individuals <= 0
        # is.na(.data$total_catch_value) & .data$n_individuals >= 0 |
        # is.na(.data$n_individuals) & .data$total_catch_value >= 0
        ~ 22, TRUE ~ NA_real_
      )
    ) %>%
    dplyr::rename(submission_id = .data$`_id`)

  no_regular_ids <-
    regularity_alerts %>%
    dplyr::filter(!is.na(.data$alert_regularity)) %>%
    magrittr::extract2("submission_id")

  regular_landings <-
    landings %>%
    tidyr::unnest(.data$species_group, keep_empty = TRUE) %>%
    tidyr::unnest(.data$length_individuals, keep_empty = TRUE) %>%
    dplyr::mutate(
      total_catch_value = as.double(.data$total_catch_value),
      total_catch_value = abs(.data$total_catch_value),
      n_individuals = abs(.data$n_individuals),
      alert_number = dplyr::case_when(
        .data$`_id` %in% no_regular_ids ~ 22,
        TRUE ~ NA_real_
      ),
      total_catch_value = dplyr::case_when(is.na(.data$alert_number) ~ .data$total_catch_value, TRUE ~ NA_real_),
      n_individuals = dplyr::case_when(is.na(.data$alert_number) ~ .data$n_individuals, TRUE ~ NA_real_),
      dplyr::across(
        c(.data$weight:.data$Vitamin_A_mu), ~ dplyr::case_when(
          is.na(.data$alert_number) ~ .data$.x, TRUE ~ NA_real_
        )
      )
    ) %>%
    tidyr::nest(length_individuals = c(.data$mean_length:.data$Vitamin_A_mu)) %>%
    tidyr::nest(species_group = c(
      .data$n, .data$species, .data$food_or_sale, .data$other_species_name,
      .data$photo, .data$length_individuals, .data$length_type
    ))

  list(
    regularity_alerts = regularity_alerts,
    regular_landings = regular_landings
  )
}

#' Validate surveys' total catch values
#'
#' This function takes a preprocessed landings' matrix and uses univariate
#' techniques (see [univOutl::LocScaleB]) for the identification of outliers in
#' the distribution of the total catch values associated to surveys.
#'
#' @param data A preprocessed data frame
#' @inheritParams univOutl::LocScaleB
#'
#' @return A data frame containing validated catch values.
#' @export
#'
#' @importFrom rlang .data
#'
#' @examples
#' \dontrun{
#' pars <- read_config()
#' landings <- get_merged_landings(pars)
#' validate_catch_value(landings, method = "MAD", k = 13)
#' }
#'
validate_catch_price <- function(data, method = NULL, k = NULL) {
  validated_price <-
    data %>%
    dplyr::filter(is.na(.data$alert_number)) %>%
    dplyr::select(.data$`_id`, .data$total_catch_value) %>%
    dplyr::transmute(
      alert_number = ifelse(.data$total_catch_value > 1500, 6, NA_integer_),
      # alert_number = alert_outlier(
      #  x = .data$total_catch_value, alert_if_smaller = 9, alert_if_larger = 6,
      #  logt = TRUE, k = k, method = method
      # ),
      total_catch_value = dplyr::case_when(
        is.na(.data$alert_number) ~ .data$total_catch_value,
        TRUE ~ NA_real_
      ),
      submission_id = as.integer(.data$`_id`)
    )

  data %>%
    dplyr::filter(!is.na(.data$alert_number)) %>%
    dplyr::select(.data$`_id`, .data$total_catch_value, .data$alert_number) %>%
    dplyr::rename(submission_id = .data$`_id`) %>%
    dplyr::mutate(submission_id = as.integer(.data$submission_id)) %>%
    dplyr::bind_rows(validated_price)
}

#' Generate an alert vector based on the `univOutl::LocScaleB()` function
#'
#' @param x numeric vector where outliers will be checked
#' @param no_alert_value value to put in the output when there is no alert (x is within bounds)
#' @param alert_if_larger alert for when x is above the bounds found by `univOutl::LocScaleB()`
#' @param alert_if_smaller alert for when x is below the bounds found by `univOutl::LocScaleB()`
#' @param ... arguments for `univOutl::LocScaleB()`
#'
#' @return a vector of the same lenght as x
#' @importFrom stats mad
alert_outlier <- function(x,
                          no_alert_value = NA_real_,
                          alert_if_larger = no_alert_value,
                          alert_if_smaller = no_alert_value,
                          ...) {
  algo_args <- list(...)

  # Helper function to check if everything is NA or zero
  all_na_or_zero <- function(x) {
    isTRUE(all(is.na(x) | x == 0))
  }

  # If everything is NA or zero there is nothing to compute
  if (all_na_or_zero(x)) {
    return(NA_real_)
  }
  # If the median absolute deviation is zero we shouldn't be using this algo
  if (mad(x, na.rm = T) <= 0) {
    return(NA_real_)
  }
  # If weights are specified and they are all NA or zero
  if (!is.null(algo_args$weights)) {
    if (all_na_or_zero(algo_args$weights)) {
      return(NA_real_)
    }
  }

  bounds <- univOutl::LocScaleB(x, ...) %>%
    magrittr::extract2("bounds")

  if (isTRUE(algo_args$logt)) bounds <- exp(bounds) - 1

  dplyr::case_when(
    x < bounds[1] ~ alert_if_smaller,
    x > bounds[2] ~ alert_if_larger,
    TRUE ~ no_alert_value
  )
}

#' Validate surveys' catch parameters
#'
#' This function takes a preprocessed landings' matrix and uses univariate
#' techniques (see [univOutl::LocScaleB]) for the identification of outliers in
#' the distribution of the number of individuals per catch and their size. The
#' function returns a data frame with the survey id, the alert number and
#' a nested column `species_group` containing validated catches parameters.
#'
#' @param data A preprocessed data frame
#' @param k_ind Extension of bounds for the number of individuals
#'
#' @return A data frame containing the validated catches parameters.
#' @export
#'
#' @examples
#' \dontrun{
#' pars <- read_config()
#' landings <- get_merged_landings(pars)
#' validate_catch_params(landings, k = 3)
#' }
#'
validate_catch_params <- function(data = NULL, k_ind = NULL) {
  catches_dat_unnested <-
    data %>%
    dplyr::filter(is.na(alert_number)) %>%
    dplyr::select(.data$`_id`, .data$`trip_group/gear_type`, .data$species_group) %>%
    tidyr::unnest(.data$species_group, keep_empty = TRUE) %>%
    tidyr::unnest(.data$length_individuals, keep_empty = TRUE)

  validated_length <-
    catches_dat_unnested %>%
    dplyr::group_by(.data$`trip_group/gear_type`, .data$species) %>%
    dplyr::mutate(
      alert_n_individuals = alert_outlier(
        x = .data$n_individuals,
        alert_if_larger = 11, logt = TRUE, k = k_ind
      ),
      n_individuals = dplyr::case_when(
        is.na(.data$alert_n_individuals) ~ .data$n_individuals,
        TRUE ~ NA_real_
      )
    ) %>%
    dplyr::mutate(
      alert_number = .data$alert_n_individuals,
      submission_id = .data$`_id`
    ) %>%
    dplyr::ungroup() %>%
    # Adjusting weight and nutrients accordingly
    dplyr::mutate(
      dplyr::across(
        c(.data$weight:.data$Vitamin_A_mu),
        ~ dplyr::case_when(
          !is.na(.data$alert_number) ~ NA_real_,
          .data$n_individuals == 0 ~ 0,
          TRUE ~ .data$.x
        )
      )
    ) %>%
    dplyr::select(-.data$alert_n_individuals, -.data$`_id`)

  # extract alert number
  alert_number <-
    validated_length %>%
    dplyr::select(.data$submission_id, .data$n, .data$alert_number) %>%
    dplyr::group_by(.data$submission_id) %>%
    dplyr::arrange(dplyr::desc(alert_number), .by_group = TRUE) %>%
    dplyr::filter(dplyr::row_number() == 1) %>%
    dplyr::ungroup() %>%
    dplyr::select(-.data$n) %>%
    dplyr::mutate(submission_id = as.integer(.data$submission_id))

  # nest validated data
  validated_length_nested <-
    validated_length %>%
    dplyr::select(-.data$alert_number) %>%
    dplyr::group_by(.data$submission_id, .data$n, .data$species) %>%
    tidyr::nest(length_individuals = c(.data$mean_length:.data$Vitamin_A_mu))

  # replace validated catches params in original data
  validated_catch_params <-
    data %>%
    dplyr::filter(is.na(.data$alert_number)) %>%
    dplyr::rename(submission_id = .data$`_id`) %>%
    dplyr::select(.data$submission_id, .data$species_group) %>%
    tidyr::unnest(.data$species_group, keep_empty = TRUE) %>%
    dplyr::mutate(length_individuals = validated_length_nested$length_individuals) %>%
    dplyr::group_by(.data$submission_id) %>%
    tidyr::nest() %>%
    dplyr::rename("species_group" = "data") %>%
    dplyr::ungroup() %>%
    dplyr::mutate(submission_id = as.integer(.data$submission_id)) %>%
    dplyr::left_join(alert_number, by = "submission_id")


  data %>%
    dplyr::filter(!is.na(alert_number)) %>%
    dplyr::select(.data$`_id`, .data$species_group, .data$alert_number) %>%
    dplyr::rename(submission_id = .data$`_id`) %>%
    dplyr::mutate(submission_id = as.integer(.data$submission_id)) %>%
    dplyr::bind_rows(validated_catch_params)
}

#' Outlier identification based on Cook's distance
#'
#' This function adds an additional alert to both price and catch alert dataframes
#' when the relation between the price and weight assume abnormal values relatively
#' to each species. The relationship between weight and price is mostly linear,
#' this function identifies the survey IDs where the Cook's distance is higher than
#' cook_dist * mean_cook, where cook_dist is a multiplicative coefficient and
#' cook_dist is the average Cook's distance relatively to each species.
#'
#' Currently, cook_dist is set to 21 as default value.
#'
#' @param catch_alerts The dataframe of catch alerts.
#' @param price_alerts The dataframe of price alerts.
#' @param non_regular_ids The dataframe of landings regularity alerts.
#' @param cook_dist A number that go in the formula cook_dist * (mean(cooksd)).
#' @param price_weight_min Min price per weight value threshold.
#' @param price_weight_max Max price per weight value threshold.
#' @return The price and catch alert' dataframes including outlier identification
#' based on Cook's distance.
#' @export
#'
validate_price_weight <- function(catch_alerts = NULL,
                                  price_alerts = NULL,
                                  non_regular_ids = NULL,
                                  cook_dist = NULL,
                                  price_weight_min = NULL,
                                  price_weight_max = NULL) {
  # Extract single catches IDs
  single_catches <-
    catch_alerts %>%
    dplyr::mutate(n = purrr::map_dbl(.data$species_group, nrow)) %>%
    dplyr::filter(.data$n == 1) %>%
    magrittr::extract2("submission_id")

  # Extract IDs with abnormal price weight relation based on Cook's distance
  price_per_weight_alerts <-
    dplyr::left_join(price_alerts, catch_alerts, by = "submission_id") %>%
    # dplyr::filter(.data$submission_id %in% single_catches) %>%
    tidyr::unnest(.data$species_group) %>%
    tidyr::unnest(.data$length_individuals) %>%
    dplyr::select(.data$submission_id, .data$species, .data$total_catch_value, .data$weight) %>%
    dplyr::filter(!is.na(.data$weight) & !is.na(.data$total_catch_value) & .data$weight != 0) %>%
    # dplyr::group_by(.data$submission_id, .data$species) %>%
    dplyr::group_by(.data$submission_id) %>%
    dplyr::summarise(
      total_catch_value = dplyr::first(.data$total_catch_value),
      weight = sum(.data$weight, na.rm = T),
    ) %>%
    # dplyr::group_by(.data$species) %>%
    dplyr::mutate(model = broom::augment(stats::lm(formula = log(.data$total_catch_value + 1)
    ~ log(.data$weight / 1000 + 1)))) %>%
    dplyr::ungroup() %>%
    dplyr::mutate(cooksd = .data$model$`.cooksd`) %>%
    dplyr::select(-.data$model) %>%
    dplyr::mutate(
      weight_kg = .data$weight / 1000,
      pk = .data$total_catch_value / .data$weight_kg,
      alert_number = dplyr::case_when( # .data$cooksd > (cook_dist * mean(.data$cooksd)) |
        .data$pk < price_weight_min |
          .data$pk > price_weight_max
        ~ 17, TRUE ~ NA_real_
      )
    ) %>%
    dplyr::ungroup() %>%
    dplyr::filter(!is.na(.data$alert_number)) %>%
    magrittr::extract2("submission_id")

  regularity_alerts <-
    non_regular_ids %>%
    dplyr::filter(!is.na(.data$alert_regularity)) %>%
    magrittr::extract2("submission_id")

  n_individuals_alert <-
    catch_alerts %>%
    dplyr::filter(!is.na(.data$alert_number)) %>%
    magrittr::extract2("submission_id")

  revenue_alert <-
    price_alerts %>%
    dplyr::filter(!is.na(.data$alert_number)) %>%
    magrittr::extract2("submission_id")

  # Integrate new alert to prices and weights
  price_alerts %<>%
    dplyr::mutate(
      alert_number = dplyr::case_when(
        .data$submission_id %in% price_per_weight_alerts ~ 17,
        .data$submission_id %in% regularity_alerts ~ 22,
        .data$submission_id %in% n_individuals_alert ~ 11,
        TRUE ~ .data$alert_number
      ),
      total_catch_value = dplyr::case_when(
        is.na(.data$alert_number) ~ .data$total_catch_value, TRUE ~ NA_real_
      )
    )

  catch_alerts %<>%
    tidyr::unnest(.data$species_group, keep_empty = TRUE) %>%
    tidyr::unnest(.data$length_individuals, keep_empty = TRUE) %>%
    dplyr::mutate(
      alert_number = dplyr::case_when(
        .data$submission_id %in% price_per_weight_alerts ~ 17,
        .data$submission_id %in% regularity_alerts ~ 22,
        .data$submission_id %in% revenue_alert ~ 6,
        TRUE ~ .data$alert_number
      ),
      n_individuals = dplyr::case_when(is.na(.data$alert_number) ~ .data$n_individuals, TRUE ~ NA_real_),
      dplyr::across(
        c(.data$weight:.data$Vitamin_A_mu), ~ dplyr::case_when(
          is.na(.data$alert_number) ~ .data$.x, TRUE ~ NA_real_
        )
      )
    ) %>%
    tidyr::nest(length_individuals = c(.data$mean_length:.data$Vitamin_A_mu)) %>%
    tidyr::nest(species_group = c(
      .data$n, .data$species, .data$food_or_sale, .data$other_species_name,
      .data$photo, .data$length_individuals, .data$length_type
    ))

  dplyr::full_join(catch_alerts, price_alerts, by = c("submission_id")) %>%
    dplyr::mutate(alert_number = dplyr::coalesce(.data$alert_number.x, .data$alert_number.y)) %>%
    dplyr::select(-.data$alert_number.x, -.data$alert_number.y)
}

# Ideally this function would in the future, check for the integrity of the boat type
validate_vessel_type <- function(landings, metadata_vessel_table) {
  landings %>%
    dplyr::rename(
      submission_id = .data$`_id`,
      boat_code = .data$`trip_group/boat_type`
    ) %>%
    dplyr::mutate(boat_code = as.integer(.data$boat_code)) %>%
    dplyr::left_join(metadata_vessel_table, by = "boat_code") %>%
    dplyr::rename(vessel_type = .data$boat_type) %>%
    # If no vessel type is not what we expected
    dplyr::mutate(
      not_valid_code = !is.na(.data$boat_code) & is.na(.data$vessel_type),
      alert_number = dplyr::if_else(isTRUE(.data$not_valid_code), 12, NA_real_)
    ) %>%
    # If no vessel type was recorded when it should have
    dplyr::mutate(
      no_vessel_type = .data$`trip_group/has_boat` == "TRUE" & is.na(.data$vessel_type),
      alert_number = dplyr::if_else(isTRUE(.data$no_vessel_type), 13, NA_real_)
    ) %>%
    # Fixing types
    dplyr::mutate(submission_id = as.integer(.data$submission_id)) %>%
    dplyr::select(.data$vessel_type, .data$alert_number, .data$submission_id)
}

validate_gear_type <- function(landings, metadata_gear_table) {
  landings %>%
    dplyr::rename(
      submission_id = .data$`_id`,
      gear_code = .data$`trip_group/gear_type`
    ) %>%
    dplyr::left_join(metadata_gear_table, by = "gear_code") %>%
    # If no vessel type is not what we expected
    dplyr::mutate(
      not_valid_code = !is.na(.data$gear_code) & is.na(.data$gear_id),
      alert_number = dplyr::if_else(isTRUE(.data$not_valid_code), 14, NA_real_)
    ) %>%
    # If no gear type was recorded when it should have
    dplyr::mutate(
      no_vessel_type = .data$`trip_group/has_boat` == "TRUE" & is.na(.data$gear_code),
      alert_number = dplyr::if_else(isTRUE(.data$no_vessel_type), 15, NA_real_)
    ) %>%
    # Fixing types
    dplyr::mutate(submission_id = as.integer(.data$submission_id)) %>%
    dplyr::select(.data$gear_id, .data$alert_number, .data$submission_id) %>%
    dplyr::rename(gear_type = .data$gear_id)
}

validate_sites <- function(landings, metadata_stations, metadata_reporting_units) {
  sites_df <-
    metadata_stations %>%
    dplyr::filter(!is.na(.data$station_code)) %>%
    dplyr::inner_join(metadata_reporting_units, by = "reporting_unit") %>%
    dplyr::select(.data$station_code, .data$station_name, .data$reporting_unit) %>%
    dplyr::mutate(station_code = as.character(.data$station_code)) %>%
    dplyr::mutate(station_name = trimws(.data$station_name)) %>%
    dplyr::rename(reporting_region = .data$reporting_unit) %>%
    dplyr::distinct()

  landings %>%
    dplyr::rename(submission_id = .data$`_id`) %>%
    dplyr::mutate(station_code = as.character(.data$landing_site_name)) %>%
    dplyr::select(.data$submission_id, .data$station_code) %>%
    dplyr::left_join(sites_df, by = "station_code") %>%
    # If the station is not known to us
    dplyr::mutate(alert_number = dplyr::if_else(is.na(.data$station_name) | is.na(.data$reporting_region), 16, NA_real_)) %>%
    # Fixing types
    dplyr::mutate(submission_id = as.integer(.data$submission_id))
}


validate_n_fishers <- function(landings, method, k) {
  landings %>%
    dplyr::select(
      submission_id = .data$`_id`,
      fisher_number_child = .data$`trip_group/no_fishers/no_child_fishers`,
      fisher_number_man = .data$`trip_group/no_fishers/no_men_fishers`,
      fisher_number_woman = .data$`trip_group/no_fishers/no_women_fishers`
    ) %>%
    dplyr::mutate(dplyr::across(tidyselect::starts_with("fisher"), as.numeric)) %>%
    dplyr::mutate(dplyr::across(tidyselect::starts_with("fisher"), list(alert = alert_outlier), alert_if_larger = 18, alert_if_smaller = 18, k = k, logt = T, method = method)) %>%
    dplyr::mutate(alert_number = dplyr::coalesce(.data$fisher_number_child_alert, .data$fisher_number_man_alert, .data$fisher_number_woman_alert)) %>%
    dplyr::mutate(dplyr::across(tidyselect::starts_with("fisher"), ~ dplyr::if_else(!is.na(alert_number), NA_real_, .))) %>%
    dplyr::select(-tidyselect::ends_with("alert")) %>%
    # Fixing types
    dplyr::mutate(submission_id = as.integer(.data$submission_id))
}


validate_habitat <- function(landings, metadata_habitat) {
  landings %>%
    dplyr::rename(
      submission_id = .data$`_id`,
      habitat_code = .data$`trip_group/habitat`
    ) %>%
    dplyr::mutate(
      habitat_code = as.numeric(.data$habitat_code),
      habitat_type = dplyr::case_when(
        habitat_code == 1 ~ "Reef",
        habitat_code == 2 ~ "FAD",
        habitat_code == 3 ~ "Deep",
        habitat_code == 4 ~ "Beach",
        habitat_code == 5 ~ "Traditional FAD",
        habitat_code == 6 ~ "Mangrove",
        habitat_code == 7 ~ "Seagrass",
        TRUE ~ NA_character_
      )
    ) %>%
    dplyr::select(.data$submission_id, .data$habitat_code, .data$habitat_type) %>%
    dplyr::mutate(
      alert_number = dplyr::case_when(
        !.data$habitat_code %in% c(metadata_habitat$habitat_code, NA_integer_) ~ 19,
        TRUE ~ NA_real_
      ),
      habitat_type = dplyr::case_when(
        is.na(alert_number) ~ .data$habitat_type,
        TRUE ~ NA_character_
      ),
      submission_id = as.integer(.data$submission_id)
    )
}

validate_mesh <- function(landings, mesh_limit) {
  landings %>%
    dplyr::select(
      submission_id = .data$`_id`,
      .data$`trip_group/mesh_size`,
      .data$`trip_group/mesh_size_other`
    ) %>%
    dplyr::mutate(
      `trip_group/mesh_size` = dplyr::case_when(
        `trip_group/mesh_size` == "seluk" ~ NA_character_, TRUE ~ .data$`trip_group/mesh_size`
      ),
      mesh_size = dplyr::coalesce(.data$`trip_group/mesh_size`, .data$`trip_group/mesh_size_other`),
      mesh_size = as.double(.data$mesh_size),
      mesh_size = .data$mesh_size * 25.4, # convert in mm
      alert_number = dplyr::case_when(
        .data$mesh_size < 0 | .data$mesh_size > mesh_limit ~ 20,
        TRUE ~ NA_real_
      ),
      mesh_size = dplyr::case_when(
        is.na(alert_number) ~ .data$mesh_size,
        TRUE ~ NA_real_
      ),
      submission_id = as.integer(.data$submission_id)
    ) %>%
    dplyr::select(-c(.data$`trip_group/mesh_size`, .data$`trip_group/mesh_size_other`))
}


validate_gleaners <- function(landings, method, k_gleaners) {
  landings %>%
    dplyr::select(
      submission_id = .data$`_id`,
      n_gleaners = .data$how_many_gleaners_today
    ) %>%
    dplyr::mutate(
      n_gleaners = as.double(.data$n_gleaners),
      n_gleaners = abs(.data$n_gleaners),
      alert_number = alert_outlier(
        x = .data$n_gleaners,
        alert_if_larger = 21, logt = TRUE, k = k_gleaners
      ),
      n_gleaners = dplyr::case_when(
        is.na(alert_number) ~ .data$n_gleaners,
        TRUE ~ NA_real_
      ),
      submission_id = as.integer(.data$submission_id)
    )
}

validate_fuel <- function(landings, method, k_fuel) {
  landings %>%
    dplyr::select(
      submission_id = .data$`_id`,
      has_boat = .data$`trip_group/has_boat`,
      fuel = .data$fuel_L
    ) %>%
    dplyr::mutate(
      submission_id = as.integer(.data$submission_id),
      fuel = as.double(.data$fuel),
      alert_number.1 = ifelse(.data$fuel < 0, 23, NA_real_),
      alert_number.2 = alert_outlier(
        x = .data$fuel,
        alert_if_larger = 23, logt = TRUE, k = k_fuel
      ),
      alert_number.3 = dplyr::case_when(.data$fuel > 0 & isFALSE(.data$has_boat) ~ 23, TRUE ~ NA_real_),
      alert_number = dplyr::coalesce(.data$alert_number.1, .data$alert_number.2, .data$alert_number.3),
      fuel = dplyr::case_when(
        is.na(alert_number) ~ .data$fuel,
        TRUE ~ NA_real_
      )
    ) %>%
    dplyr::select(.data$submission_id, .data$fuel, .data$alert_number)
}

validate_conservation <- function(landings, metadata_conservation) {
  landings %>%
    dplyr::select(
      submission_id = .data$`_id`,
      conservation_code = .data$`group_conservation_trading/conservation`
    ) %>%
    dplyr::full_join(metadata_conservation, by = "conservation_code") %>%
    dplyr::select(-.data$conservation_code) %>%
    dplyr::mutate(
      submission_id = as.integer(.data$submission_id),
      alert_number = NA_real_
    )
}

validate_happiness <- function(landings) {
  landings %>%
    dplyr::select(
      submission_id = .data$`_id`,
      happiness = .data$happiness_rating
    ) %>%
    dplyr::mutate(
      submission_id = as.integer(.data$submission_id),
      happiness = as.integer(.data$happiness),
      alert_number = NA_real_
    )
}


get_bounds_table <- function(data = NULL, metadata_table = NULL, k_ind = NULL) {
  get_bounds <- function(x) {
    bounds <-
      univOutl::LocScaleB(x$n_individuals, logt = TRUE, k = k_ind) %>%
      magrittr::extract2("bounds")
    bounds
  }

  data %>%
    dplyr::filter(is.na(.data$alert_number)) %>%
    dplyr::select(.data$`_id`, .data$`trip_group/gear_type`, .data$species_group) %>%
    tidyr::unnest(.data$species_group, keep_empty = TRUE) %>%
    tidyr::unnest(.data$length_individuals, keep_empty = TRUE) %>%
    dplyr::filter(!.data$species == "0" & .data$n_individuals > 0) %>%
    split(interaction(.$species, .$`trip_group/gear_type`)) %>%
    purrr::discard(~ nrow(.) == 0) %>%
    purrr::map(get_bounds) %>%
    dplyr::bind_rows(.id = "taxa") %>%
    dplyr::mutate(upper.up = exp(.data$upper.up)) %>%
    tidyr::separate(col = "taxa", into = c("taxa", "gear_type")) %>%
    dplyr::select(.data$taxa, .data$gear_type, max_individuals = .data$upper.up) %>%
    dplyr::mutate(max_individuals = round(.data$max_individuals, 1)) %>%
    dplyr::arrange(.data$taxa) %>%
    dplyr::left_join(
      metadata_table$gear_types %>%
        dplyr::select(
          gear_type = .data$gear_code,
          gear = .data$gear_id
        ),
      by = "gear_type"
    ) %>%
    dplyr::select(-.data$gear_type) %>%
    tidyr::pivot_wider(names_from = .data$gear, values_from = .data$max_individuals) %>%
    dplyr::left_join(
      metadata_table$catch_types %>%
        dplyr::select(
          taxa = .data$interagency_code,
          taxa_group = .data$catch_name_en
        ),
      by = "taxa"
    ) %>%
    dplyr::select("taxa group" = .data$taxa_group, dplyr::everything(), -.data$taxa) %>%
    dplyr::filter(!.data$`taxa group` %in% c("Herring", "Unknown", "Surgeonfish", "Bannerfish")) %>%
    dplyr::arrange(.data$`taxa group`)
}
WorldFishCenter/peskas.timor.data.pipeline documentation built on April 14, 2025, 1:47 p.m.