agriutilities: Utilities for Data Analysis in Agriculture

Documented in check_design_met

#' Check Experimental Design
#'
#' @description This function helps to identify the experimental design of each
#' trial, filters the data and then provide a summary for the traits and the
#' experimental design. This works as a quality check before we fit any model.
#' Returns an object of class \code{checkAgri}.
#'
#' @param data A data.frame in a wide format.
#' @param genotype A character string indicating the column in data that
#' contains genotypes.
#' @param trial A character string indicating the column in data that contains
#' trials.
#' @param traits A character vector specifying the traits for which the models
#' should be fitted.
#' @param rep A character string indicating the column in data that contains
#' replicates.
#' @param block A character string indicating the column in data that contains
#' sub blocks.
#' @param row A character string indicating the column in data that contains the
#' row coordinates.
#' @param col A character string indicating the column in data that contains the
#' column coordinates.
#'
#' @return An object of class \code{checkAgri}, with a list of:
#' \item{summ_traits}{A data.frame containing a summary of the traits.}
#' \item{exp_design_resum}{A data.frame containing a summary of the experimental
#'  design.}
#' \item{filter}{A list by trait containing the filtered trials.}
#' \item{exp_design_list}{A data.frame containing the experimental design of
#' each trial.}
#' \item{check_connectivity}{A data.frame with the genotype connectivity.}
#' \item{connectivity_matrix}{A matrix with the amount of genotypes shared
#' between each pair of trial.}
#' \item{data_design}{A data frame containing the data used with two additional
#' columns, one realted to the experimental design and a sequential number (id)}
#' \item{inputs}{A list containing the character string that indicates the
#' column in data that contains the genotype, trial, traits, rep, block, row
#' and col.}
#' @export
#'
#' @examples
#' library(agridat)
#' library(agriutilities)
#' data(besag.met)
#' dat <- besag.met
#' results <- check_design_met(
#'   data = dat,
#'   genotype = "gen",
#'   trial = "county",
#'   traits = c("yield"),
#'   rep = "rep",
#'   block = "block",
#'   col = "col",
#'   row = "row"
#' )
#' print(results)
#' plot(results, type = "connectivity")
#' plot(results, type = "missing")
#' @import dplyr
#' @importFrom stats sd median
#' @importFrom utils type.convert
check_design_met <- function(data = NULL,
                             genotype = NULL,
                             trial = NULL,
                             traits = NULL,
                             rep = NULL,
                             block = NULL,
                             row = NULL,
                             col = NULL) {
  if (is.null(data)) {
    stop("Error: data not found")
  }
  if (is.null(genotype)) {
    stop("No 'genotype' name column provided")
  }
  if (!genotype %in% names(data)) {
    stop(paste("No '", genotype, "' found in the data"))
  }
  if (is.null(trial)) {
    message("No 'trial' name column provided")
    trial <- "trial"
    data <- data %>% mutate(trial = NA)
  }
  if (!trial %in% names(data)) {
    stop(paste("No '", trial, "' found in the data"))
  }
  if (is.null(traits)) {
    stop("No 'traits' argument provided")
  }
  if (is.null(rep)) {
    message("No 'rep' name column provided")
    rep <- "rep"
    data <- data %>% mutate(rep = NA)
  }
  if (!rep %in% names(data)) {
    stop(paste("No '", rep, "' found in the data"))
  }
  if (is.null(block)) {
    message("No 'block' name column provided")
    block <- "block"
    data <- data %>% mutate(block = NA)
  }
  if (!block %in% names(data)) {
    stop(paste("No '", block, "' found in the data"))
  }
  if (is.null(row)) {
    message("No 'row' name column provided")
    row <- "row"
    data <- data %>% mutate(row = NA)
  }
  if (!row %in% names(data)) {
    stop(paste("No '", row, "' found in the data"))
  }
  if (is.null(col)) {
    message("No 'col' name column provided")
    col <- "col"
    data <- data %>% mutate(col = NA)
  }
  if (!col %in% names(data)) {
    stop(paste("No '", col, "' found in the data"))
  }
  for (i in traits) {
    if (!i %in% names(data)) {
      stop(paste("No '", i, "' column found"))
    }
    class_trait <- data[[i]] %>% class()
    if (!class_trait %in% c("numeric", "integer")) {
      stop(
        paste0("The class of the trait '", i, "' should be numeric or integer.")
      )
    }
  }
  trait_issue <- traits[make.names(traits) != traits]
  traits <- traits[make.names(traits) == traits]
  if (length(trait_issue) != 0) {
    message(
      "Issues in variables names. The trait '",
      paste(trait_issue, collapse = " - "),
      "' has been removed."
    )
    if (length(traits) == 0) {
      stop("Check the variable names.")
    }
  }

  summ_traits <- data %>%
    dplyr::select(.data[[trial]], all_of(traits)) %>%
    gather(data = ., key = "traits", value = "value", -.data[[trial]]) %>%
    group_by(.data[[trial]], traits) %>%
    summarise(
      Min = suppressWarnings(min(value, na.rm = TRUE)),
      Mean = mean(value, na.rm = TRUE),
      Median = median(value, na.rm = TRUE),
      Max = suppressWarnings(max(value, na.rm = TRUE)),
      SD = sd(value, na.rm = TRUE),
      CV = SD / abs(Mean),
      n = n(),
      n_miss = sum(is.na(value)),
      miss_perc = n_miss / n,
      .groups = "drop"
    )

  l <- which(is.infinite(summ_traits$Min))
  summ_traits[l, c("Min", "Mean", "Max")] <- NA

  exp_design_resum <- data %>%
    group_by(.data[[trial]], .data[[genotype]]) %>%
    mutate(gen_reps = n()) %>%
    group_by(.data[[trial]], .data[[rep]]) %>%
    mutate(size_reps = n()) %>%
    group_by(.data[[trial]]) %>%
    summarise(
      n = n(),
      n_gen = n_distinct(.data[[genotype]], na.rm = TRUE),
      n_rep = n_distinct(.data[[rep]], na.rm = TRUE),
      n_block = n_distinct(.data[[block]], na.rm = TRUE),
      n_col = n_distinct(.data[[col]], na.rm = TRUE),
      n_row = n_distinct(.data[[row]], na.rm = TRUE),
      n_col = ifelse(n_col <= 3, NA, n_col),
      n_row = ifelse(n_row <= 3, NA, n_row),
      num_of_reps = paste(sort(unique(gen_reps)), collapse = "_"),
      num_of_gen = paste(
        table(gen_reps) / sort(unique(gen_reps)),
        collapse = "_"
      ),
      reps_size = paste(unique(size_reps), collapse = "_"),
      reps_equal = length(unique(size_reps)) == 1,
      unrep = ifelse(n_gen == n, TRUE, FALSE),
      rcbd = ifelse(reps_equal && n_rep > 1, "rcbd", NA),
      alpha_lattice = ifelse(rcbd == "rcbd" & n_block > 1, "alpha", NA),
      prep = ifelse(nchar(num_of_reps) > 1, "prep", NA),
      spatial = ifelse(
        test = n_col > 1 & n_row > 1 & !rcbd %in% "rcbd",
        yes = "row_col",
        no = ifelse(
          test = n_col > 1 & n_row > 1 & rcbd %in% "rcbd",
          yes = "res_row_col",
          no = NA
        )
      )
    ) %>%
    type.convert(as.is = FALSE)

  trials_spatial <- exp_design_resum %>%
    filter(!is.na(spatial)) %>%
    pull(trial) %>%
    as.character()

  if (!is.null(trials_spatial)) {
    duplicates <- data %>%
      filter(.data[[trial]] %in% trials_spatial) %>%
      select(.data[[trial]], .data[[col]], .data[[row]]) %>%
      na.omit() %>%
      split(x = ., .[, trial]) %>%
      lapply(FUN = function(x) ifelse(sum(duplicated(x)) >= 1, TRUE, FALSE)) %>%
      unlist()
    duplicates <- names(duplicates[which(duplicates %in% TRUE)])
  } else {
    duplicates <- ""
  }

  filters_summary <- list()

  for (i in traits) {
    # Filter missing trials
    trials_to_remove_miss <- summ_traits %>%
      filter(traits %in% i) %>%
      filter(miss_perc >= 0.50) %>%
      pull(.data[[trial]]) %>%
      as.character()

    # Filter by non_var_gen
    non_var_gen <- data %>%
      filter(!.data[[trial]] %in% trials_to_remove_miss) %>%
      group_by(.data[[trial]], .data[[genotype]]) %>%
      summarise(
        mean = mean(.data[[i]], na.rm = TRUE),
        sd = sd(.data[[i]], na.rm = TRUE),
        cv = sd / mean,
        .groups = "drop"
      ) %>%
      group_by(.data[[trial]]) %>%
      summarise(var_total = sum(sd, na.rm = TRUE), .groups = "drop") %>%
      arrange(var_total)

    # Filter variation
    trials_to_remove_novar <- non_var_gen %>%
      filter(var_total == 0) %>%
      pull(trial) %>%
      as.character()

    filters_summary[[i]] <- list(
      "missing_50%" = trials_to_remove_miss,
      "no_variation" = trials_to_remove_novar,
      "row_col_dup" = duplicates,
      "trials_to_remove" = union(
        x = union(trials_to_remove_miss, trials_to_remove_novar),
        y = duplicates
      )
    )
  }

  design <- exp_design_resum %>%
    mutate(
      exp_design = ifelse(
        test = spatial %in% "row_col",
        yes = "row_col",
        no = ifelse(
          test = spatial %in% "res_row_col",
          yes = "res_row_col",
          no = ifelse(
            test = alpha_lattice %in% "alpha",
            yes = "alpha_lattice",
            no = ifelse(
              test = rcbd %in% "rcbd",
              yes = "rcbd",
              no = ifelse(
                test = prep %in% "prep",
                yes = "crd",
                no = "crd"
              )
            )
          )
        )
      )
    ) %>%
    select(.data[[trial]], exp_design) %>%
    as.data.frame()

  data_design <- merge(x = data, y = design, all.x = TRUE) %>%
    mutate(id = seq_len(nrow(.))) %>%
    relocate(id)

  # Conectivity
  conn1 <- check_connectivity(
    data = data,
    genotype = genotype,
    trial = trial,
    response = NULL
  )

  conn2 <- check_connectivity(
    data = data,
    genotype = genotype,
    trial = trial,
    response = NULL,
    return_matrix = TRUE
  )

  objt <- list(
    summ_traits = summ_traits,
    exp_design_resum = exp_design_resum,
    filter = filters_summary,
    exp_design_list = design,
    check_connectivity = conn1,
    connectivity_matrix = conn2,
    data_design = data_design,
    inputs = list(
      genotype = genotype,
      trial = trial,
      traits = traits,
      rep = rep,
      block = block,
      row = row,
      col = col
    )
  )
  class(objt) <- "checkAgri"
  return(objt)
}