RankspeQ: RankspeQ: An R package platform for genotype characterization and performance-ranking based on MultispeQ measurements

Documented in target_trait_comparison

#' Comparisons between raknings and target trait of interest.
#'
#' @param ranks The list obtained by MSPQ_ranks.
#' @param target.trait.file data.frame in narrow format with the yield component or any other trait to contrast against MultispeQ ranking. An observation per experimental unit is required as well as the row & column coordinates if spats = TRUE in MSPQ_ranks.
#' @param target.trait.name Character. name of the yield variable on target.trait.file to contrast against MultispeQ ranking.
#' @param metadata data.frame in narrow format with extra information about the genotypes i.e. gene pool, known biotic/abiotic traits, etc. Optional process. NULL by default.
#'
#' @return A list with the confusion matrices and summary of predictions.
#' @export
#'
#' @examples
#' \dontrun{
#' # yield_data: any table containing the same evaluated genotypes and the yield or any target to contrast against MSPQ rankings
#'
#' target_trait_comparison(
#'   ranks = ranks,
#'   target.trait.file = yield_data,
#'   target.trait.name = "kg_ha",
#'   metadata = NULL
#' )
#' }
target_trait_comparison <- function(ranks, target.trait.file, target.trait.name, metadata = NULL) {
  if (!is.null(target.trait.file)) {
    if(tibble::is_tibble(target.trait.file)){
      target.trait.file <- as.data.frame(target.trait.file)
    }
    if (!is.data.frame(target.trait.file)) {
      stop(paste0("The argument target.trait.file must be a data frame that contains the '", ranks$Genotype, "' column and the target trait to evaluate."))
    } else if (!ranks$Genotype %in% names(target.trait.file)) {
      stop(paste0("The variable '", ranks$Genotype, "' was not found into the target.trait.file dataset."))
    } else if (!all(unique(target.trait.file[, ranks$Genotype]) %in% unique(unlist(c(ranks$means[, ranks$Genotype]))))) {
      gen_index <- unique(target.trait.file[, ranks$Genotype])[which(!unique(target.trait.file[, ranks$Genotype]) %in% unique(unlist(c(ranks$means[, ranks$Genotype]))))]
      gen_lets <- substr(gen_index, 1, 3)

      gen_real <- list()
      for (i in 1:length(gen_lets)) {
        gen_real[[i]] <- unique(unlist(c(ranks$means[, ranks$Genotype])))[grep(gen_lets[i], unique(unlist(c(ranks$means[, ranks$Genotype]))))]
      }
      gen_real <- unlist(gen_real)

      if (!length(gen_real) == 0) {
        stop(paste0("The genotype(s) '", paste0((gen_index), collapse = "', '"), "' from the target trait file not found into the ranked genotypes.
                  Should it be named as '", paste0((gen_real), collapse = "', '"), "'?"))
      } else {
        stop(paste0("The genotype(s) '", paste0((gen_index), collapse = "', '"), "' from the target trait file not found into the ranked genotypes."))
      }
    }
  }

  if ("DAS" %in% ranks$Sources_of_variation) {
    DAS_ind <- 4
  } else {
    DAS_ind <- 3
  }


  if (!is.null(ranks[["SPATS_variables"]])) {
    if (!all(ranks[["SPATS_variables"]] %in% names(target.trait.file))) {
      stop(paste0(
        "The MulstispeQ ranks obtained by the function MSPQ_ranks() were adjusted by SpATS. The spatial variables ", paste0("'", ranks[["SPATS_variables"]], "'", collapse = ", "),
        " were not found into the target trait file"
      ))
    } else if (!all(c(ranks$Sources_of_variation[DAS_ind:length(ranks$Sources_of_variation)], target.trait.name) %in% names(target.trait.file))) {
      pos <- which(!c(ranks$Sources_of_variation[DAS_ind:length(ranks$Sources_of_variation)], target.trait.name) %in% names(target.trait.file))
      stop(paste0(
        "The variable(s) ", paste0("'", c(ranks$Sources_of_variation[DAS_ind:length(ranks$Sources_of_variation)], target.trait.name)[pos], "'", collapse = ", "),
        " do not exist into the target trait file. Make sure that it/they exist and try again"
      ))
      rm(pos, DAS_ind)
    }

    cat("Adjusting target trait with spatial components... \n")


    if ("DAS" %in% ranks$Sources_of_variation) {
      sov_rm <- which(!ranks$Sources_of_variation %in% c(ranks$Genotype, "DAS", "date", "time"))
    } else {
      sov_rm <- which(!ranks$Sources_of_variation %in% c(ranks$Genotype, "date", "time"))
    }

    arr_SoV <- ranks$Sources_of_variation[sov_rm]
    rm(sov_rm)

    yield.list <- as.list(dplyr::group_split(target.trait.file, .dots = arr_SoV))
    yield.list <- lapply(yield.list, function(x) as.data.frame(x))

    yield.model <- list()
    yield.blup <- list()

    for (i in 1:length(yield.list)) {
      yield.model[[i]] <- suppressWarnings(SpATS_mrbean(
        data = yield.list[[i]], response = target.trait.name, genotype = ranks$Genotype, col = ranks$SPATS_variables[2], row = ranks$SPATS_variables[1], segm = T, ncols = NULL, nrows = NULL, rep = "",
        fix_fact = NULL, ran_fact = NULL, gen_ran = T, covariate = NULL,
        clean_out = FALSE, iterations = 1, checks = NULL
      ))
      yield.blup[[i]] <- data.frame(predict(yield.model[[i]], which = ranks$Genotype, predFixed = "marginal")[, c(ranks$Genotype, "predicted.values")])
      names(yield.blup[[i]])[2] <- target.trait.name
      yield.blup[[i]]$x <- yield.list[[i]][, arr_SoV][1]
      for (s in 1:length(arr_SoV)) {
        names(yield.blup[[i]])[2 + s] <- arr_SoV[s]
      }
    }

    yield.transf <- do.call(rbind, yield.blup)
    yield.list <- yield.blup
    rm(i, s, DAS_ind)
  } else {
    if ("DAS" %in% ranks$Sources_of_variation) {
      sov_rm <- which(!ranks$Sources_of_variation %in% c(ranks$Genotype, "DAS", "date", "time"))
    } else {
      sov_rm <- which(!ranks$Sources_of_variation %in% c(ranks$Genotype, "date", "time"))
    }

    arr_SoV <- ranks$Sources_of_variation[sov_rm]
    rm(sov_rm)

    yield.transf <- target.trait.file %>%
      dplyr::group_by_(.dots = c(ranks$Genotype, arr_SoV)) %>%
      dplyr::summarise_if(.predicate = "is.numeric", .funs = "mean", na.rm = T) %>%
      dplyr::ungroup() %>%
      as.data.frame()
  }


  # Confusion matrices ------------------------------------------------------


  cat("Plotting confusion matrices... \n")

  conam <- ranks$rank_table

  if ("DAS" %in% ranks$Sources_of_variation) {
    conam %<>%
      dplyr::select_(.dots = c("date", "time", ranks$Genotype, arr_SoV, "cumulative_trait_score", "DAS")) %>%
      dplyr::left_join(., yield.transf %>%
        dplyr::select(dplyr::all_of(c(ranks$Genotype, arr_SoV, target.trait.name))),
      by = c(ranks$Genotype, arr_SoV)
      )

    conam <- as.list(dplyr::group_split(conam, date, DAS, time, .dots = arr_SoV))
  } else {
    conam %<>%
      dplyr::select_(.dots = c("date", "time", ranks$Genotype, arr_SoV, "cumulative_trait_score")) %>%
      dplyr::left_join(., yield.transf %>%
        dplyr::select(dplyr::all_of(c(ranks$Genotype, arr_SoV, target.trait.name))),
      by = c(ranks$Genotype, arr_SoV)
      )

    conam <- as.list(dplyr::group_split(conam, date, time, .dots = arr_SoV))
  }


  fin <- list()


  grupos <- 10
  falses <- round(grupos * 0.3)

  for (i in 1:length(conam)) {
    conam[[i]]$y.rank <- as.numeric(rank(conam[[i]][, target.trait.name]))
    conam[[i]]$m.rank <- as.numeric(rank(conam[[i]][, "cumulative_trait_score"]))
    conam[[i]] %<>% dplyr::arrange(y.rank)
    clus <- round(nrow(conam[[i]]) * 0.1)
    conam[[i]]$clus.y <- rep(1:grupos, times = clus, length.out = nrow(conam[[i]])) %>% sort()

    conam[[i]] %<>% dplyr::arrange(m.rank)
    conam[[i]]$clus.m <- rep(1:grupos, times = clus, length.out = nrow(conam[[i]])) %>% sort()


    conam[[i]] %<>%
      dplyr::arrange(clus.y) %>%
      dplyr::mutate(
        eval = dplyr::case_when(
          clus.m == clus.y ~ TRUE,
          clus.y + 1 == clus.m ~ TRUE,
          clus.y - 1 == clus.m ~ TRUE,
          # clus.y+2 == clus.m ~ TRUE,
          # clus.y-2 == clus.m~TRUE,
          TRUE ~ FALSE
        ),
        col = dplyr::case_when(
          eval == TRUE ~ "Predicted",
          clus.m >= (max(clus.m) - falses) + 1 & clus.y <= falses ~ "False Positive",
          clus.y >= (max(clus.m) - falses) + 1 & clus.m <= falses ~ "False Negative",
          TRUE ~ "Low Prediction"
        )
      )
  }
  rm(i)


  eval <- list()


  if ("DAS" %in% ranks$Sources_of_variation) {
    for (i in 1:length(conam)) {
      eval[[i]] <- conam[[i]] %>%
        dplyr::group_by(col) %>%
        dplyr::summarise(Count = dplyr::n()) %>%
        dplyr::mutate(conf_matrix = paste0(unique(conam[[i]][, arr_SoV]), " ", conam[[i]]$DAS[1], " ", as.character(conam[[i]]$time[1]))) %>%
        dplyr::select(conf_matrix, col, Count) %>%
        dplyr::rename(Variable = col)
    }
    eval <- DT::datatable(do.call(rbind, eval), rownames = F)
  } else {
    for (i in 1:length(conam)) {
      eval[[i]] <- conam[[i]] %>%
        dplyr::group_by(col) %>%
        dplyr::summarise(Count = dplyr::n()) %>%
        dplyr::mutate(conf_matrix = paste0(unique(conam[[i]][, arr_SoV]), " ", conam[[i]]$date[1], " ", as.character(conam[[i]]$time[1]))) %>%
        dplyr::select(conf_matrix, col, Count) %>%
        dplyr::rename(Variable = col)
    }
    eval <- DT::datatable(do.call(rbind, eval), rownames = F)
  }

  conf_matrices <- list()

  for (i in 1:length(conam)) {
    data_loop <- conam[[i]]
    aaa <- as.list(dplyr::group_split(data_loop, clus.y, clus.m))
    aaa <- lapply(aaa, function(x) {
      x$name <- paste0(unlist(c(x[, ranks$Genotype])), collapse = "\n")
      return(x)
    })
    aaa <- do.call(rbind, aaa) %>%
      dplyr::select(clus.y, clus.m, name, col) %>%
      .[!duplicated(.), ]



    if ("DAS" %in% ranks$Sources_of_variation) {
      conf_matrices[[i]] <- plotly::ggplotly(ggplot2::ggplot(
        aaa,
        ggplot2::aes_string(
          x = "factor(clus.y)", y = "factor(clus.m)", fill = "factor(col)",
          text = "name"
        )
      ) +
        ggplot2::geom_raster(alpha = 0.95) +
        ggplot2::theme_classic() +
        ggplot2::labs(
          title = paste0(data_loop$DAS[1], " ", as.character(data_loop$time[1]), " ", unique(data_loop[, arr_SoV])),
          x = "Yield cluster", y = "MultispeQ cluster", fill = ""
        ), tooltip = "text")
      names(conf_matrices)[i] <- paste0(data_loop$DAS[1], " ", as.character(data_loop$time[1]), " ", unique(data_loop[, arr_SoV]))
    } else {
      conf_matrices[[i]] <- plotly::ggplotly(ggplot2::ggplot(
        aaa,
        ggplot2::aes_string(
          x = "factor(clus.y)", y = "factor(clus.m)", fill = "factor(col)",
          text = "name"
        )
      ) +
        ggplot2::geom_raster(alpha = 0.95) +
        ggplot2::theme_classic() +
        ggplot2::labs(
          title = paste0(data_loop$date[1], " ", as.character(data_loop$time[1]), " ", unique(data_loop[, arr_SoV])),
          x = "Yield cluster", y = "MultispeQ cluster", fill = ""
        ), tooltip = "text")
      names(conf_matrices)[i] <- paste0(data_loop$date[1], " ", as.character(data_loop$time[1]), " ", unique(data_loop[, arr_SoV]))
    }
  }
  rm(data_loop, i, aaa)


  # metadata ----------------------------------------------------------------

  if (!is.null(metadata)) {
    if (!is.data.frame(metadata)) {
      warning(paste0("The argument metadata must be a data frame that contains the '", ranks$Genotype, "' column and the traits to evaluate. This procedure is not done"), immediate. = T)
    } else if (!ranks$Genotype %in% names(metadata)) {
      warning(paste0("The variable '", ranks$Genotype, "' was not found into the metadata dataset."), immediate. = T)
    } else if (!all(unique(metadata[, ranks$Genotype]) %in% unique(unlist(c(ranks$means[, ranks$Genotype]))))) {
      gen_index <- unique(metadata[, ranks$Genotype])[which(!unique(metadata[, ranks$Genotype]) %in% unique(unlist(c(ranks$means[, ranks$Genotype]))))]
      gen_lets <- substr(gen_index, 1, 3)

      gen_real <- list()
      for (i in 1:length(gen_lets)) {
        gen_real[[i]] <- unique(unlist(c(ranks$means[, ranks$Genotype])))[grep(gen_lets[i], unique(unlist(c(ranks$means[, ranks$Genotype]))))]
      }
      gen_real <- unlist(gen_real)

      if (!length(gen_real) == 0) {
        warning(paste0("The genotype(s) '", paste0((gen_index), collapse = "', '"), "' from the metadata file not found into the ranked genotypes.
                  Should it be named as '", paste0((gen_real), collapse = "', '"), "'?. This procedure is not done"), immediate. = T)
      } else {
        warning(paste0("The genotype(s) '", paste0((gen_index), collapse = "', '"), "' from the metadata file not found into the ranked genotypes. This procedure is not done"), immediate. = T)
      }
    } else {
      metadatos <- suppressMessages(lapply(1:length(conam), function(x) dplyr::left_join(conam[[x]], metadata)))

      cols <- names(metadatos[[1]])[which(!names(metadatos[[1]]) %in% names(conam[[1]]))]

      cat(paste0("The genotypes metadata to analyse is: ", paste0(cols, collapse = " "), "\n"))

      metadatos <- do.call(rbind, metadatos)


      descriptors <- list()

      for (i in 1:length(cols)) {
        descriptors[[i]] <- metadatos %>%
          dplyr::group_by_(.dots = c(cols[i], arr_SoV)) %>%
          dplyr::group_split() %>%
          as.list()
      }
      rm(i)

      if ("DAS" %in% ranks$Sources_of_variation) {
        for (i in 1:length(descriptors)) {
          for (j in 1:length(descriptors[[i]])) {
            suppressMessages(descriptors[[i]][[j]] %<>%
              dplyr::group_by_(.dots = c("DAS", "date", arr_SoV)) %>%
              dplyr::summarise(
                Predicted = sum(col == "Predicted", na.rm = T),
                No_Predicted = sum(col == "Low Prediction", na.rm = T),
                False_Positive = sum(col == "False Positive", na.rm = T),
                False_Negative = sum(col == "False Negative", na.rm = T)
              ) %>%
              dplyr::ungroup() %>%
              tidyr::gather(., key = "x", value = "Number_of_Genotypes", Predicted, No_Predicted, False_Positive, False_Negative) %>%
              dplyr::mutate(.,
                s = as.data.frame(descriptors[[i]][[j]])[1, cols[i]],
                date = lubridate::ymd(date)
              ))
            names(descriptors[[i]])[[j]] <- paste0(as.data.frame(descriptors[[i]][[j]])[1, "s"], " ", paste0(descriptors[[i]][[j]][1, arr_SoV], collapse = " "))
          }
          names(descriptors)[[i]] <- cols[i]
        }
      } else {
        for (i in 1:length(descriptors)) {
          for (j in 1:length(descriptors[[i]])) {
            suppressMessages(descriptors[[i]][[j]] %<>%
              dplyr::group_by_(.dots = c("date", arr_SoV)) %>%
              dplyr::summarise(
                Predicted = sum(col == "Predicted", na.rm = T),
                No_Predicted = sum(col == "Low Prediction", na.rm = T),
                False_Positive = sum(col == "False Positive", na.rm = T),
                False_Negative = sum(col == "False Negative", na.rm = T)
              ) %>%
              dplyr::ungroup() %>%
              tidyr::gather(., key = "x", value = "Number_of_Genotypes", Predicted, No_Predicted, False_Positive, False_Negative) %>%
              dplyr::mutate(.,
                s = as.data.frame(descriptors[[i]][[j]])[1, cols[i]],
                date = lubridate::ymd(date)
              ))
            names(descriptors[[i]])[[j]] <- paste0(as.data.frame(descriptors[[i]][[j]])[1, "s"], " ", paste0(descriptors[[i]][[j]][1, arr_SoV], collapse = " "))
          }
          names(descriptors)[[i]] <- cols[i]
        }
      }



      meta_tables <- descriptors


      if ("DAS" %in% ranks$Sources_of_variation) {
        for (i in 1:length(descriptors)) {
          for (j in 1:length(descriptors[[i]])) {
            title <- paste0(as.data.frame(descriptors[[i]][[j]])[1, "s"], " ", paste0(descriptors[[i]][[j]][1, arr_SoV], collapse = " "))

            descriptors[[i]][[j]] <- descriptors[[i]][[j]] %>%
              ggplot2::ggplot(ggplot2::aes(
                x = DAS, y = Number_of_Genotypes,
                fill = factor(x, levels = c("Predicted", "No_Predicted", "False_Positive", "False_Negative"))
              )) +
              ggplot2::geom_bar(stat = "identity", position = ggplot2::position_dodge(1)) +
              ggplot2::scale_fill_discrete(labels = c("Predicted", "Low Prediction", "False Positive", "False Negative")) +
              ggplot2::theme_bw() +
              ggplot2::theme(legend.title = ggplot2::element_blank()) +
              ggplot2::labs(title = title, x = "Days After Sowing", y = "Occurrences (#)")
          }
        }
      } else {
        for (i in 1:length(descriptors)) {
          for (j in 1:length(descriptors[[i]])) {
            title <- paste0(as.data.frame(descriptors[[i]][[j]])[1, "s"], " ", paste0(descriptors[[i]][[j]][1, arr_SoV], collapse = " "))

            descriptors[[i]][[j]] <- plotly::ggplotly(descriptors[[i]][[j]] %>%
              ggplot2::ggplot(ggplot2::aes(
                x = date,
                y = Number_of_Genotypes,
                fill = factor(x, levels = c("Predicted", "No_Predicted", "False_Positive", "False_Negative"))
              )) +
              ggplot2::geom_bar(stat = "identity", position = ggplot2::position_dodge(4.9)) +
              ggplot2::theme_bw() +
              ggplot2::theme(legend.title = ggplot2::element_blank()) +
              ggplot2::labs(title = title, x = "Date", y = "Occurrences (#)"), tooltip = c("x", "y"))
          }
        }
      }



      ssss <- suppressMessages(do.call(rbind, conam) %>% dplyr::left_join(., metadata))


      suppressMessages(ssss %<>%
        dplyr::group_by_(.dots = c(ranks$Genotype, arr_SoV, cols)) %>%
        dplyr::summarise(
          Predicted = sum(col == "Predicted", na.rm = T),
          Low_Prediction = sum(col == "Low Prediction", na.rm = T),
          False_Positive = sum(col == "False Positive", na.rm = T),
          False_Negative = sum(col == "False Negative", na.rm = T)
        ) %>%
        dplyr::ungroup())

      suppressMessages(dates <- do.call(rbind, conam) %>%
        dplyr::group_by_(.dots = ranks$Genotype, arr_SoV) %>%
        dplyr::summarise(
          Times = dplyr::n(),
          Dates = Times / 2
        ) %>%
        dplyr::ungroup())



      summary_table <- suppressMessages(DT::datatable(dplyr::left_join(ssss, dates)))
    }
  } else {
    ssss <- do.call(rbind, conam)

    suppressMessages(ssss %<>%
      dplyr::group_by_(.dots = ranks$Genotype, arr_SoV) %>%
      dplyr::summarise(
        Predicted = sum(col == "Predicted", na.rm = T),
        Low_Prediction = sum(col == "Low Prediction", na.rm = T),
        False_Positive = sum(col == "False Positive", na.rm = T),
        False_Negative = sum(col == "False Negative", na.rm = T)
      ) %>%
      dplyr::ungroup())

    suppressMessages(dates <- do.call(rbind, conam) %>%
      dplyr::group_by_(.dots = ranks$Genotype, arr_SoV) %>%
      dplyr::summarise(
        Times = dplyr::n(),
        Dates = Times / 2
      ) %>%
      dplyr::ungroup())



    summary_table <- suppressMessages(DT::datatable(dplyr::left_join(ssss, dates)))
  }

  cat("Making return\n")

  if (!is.null(metadata)) {
    output <- list(
      #conam = conam,
      Conf_matrices = conf_matrices,
      Summary_of_predictions = eval,
      Summary_per_Genotype = summary_table,
      Metadata_tables = meta_tables,
      Metadata_plots = descriptors
    )
    if (!is.null(ranks[["SPATS_variables"]])) {
      output$target.trait.BLUP <- yield.transf
    }
    cat("Done!!!\n")
    return(output)
  } else {
    output <- list(
      #conam = conam,
      Conf_matrices = conf_matrices,
      Summary_of_predictions = eval,
      Summary_per_Genotype = summary_table
    )
    if (!is.null(ranks[["SPATS_variables"]])) {
      output$target.trait.BLUP <- yield.transf
    }
    cat("Done!!!\n")
    return(output)
  }
}
jssotob/RankspeQ documentation built on Jan. 31, 2024, 10:11 a.m.
rdrr.io home R language documentation Run R code online
CRAN packages Bioconductor packages R-Forge packages GitHub packages
Note that we can't provide technical support on individual packages. You should contact the package authors for that.
jssotob/RankspeQ
RankspeQ: An R package platform for genotype characterization and performance-ranking based on MultispeQ measurements

R/function_target_trait_comparison.R
In jssotob/RankspeQ: RankspeQ: An R package platform for genotype characterization and performance-ranking based on MultispeQ measurements

Defines functions target_trait_comparison

Documented in target_trait_comparison

R Package Documentation

Browse R Packages

We want your feedback!

jssotob/RankspeQ RankspeQ: An R package platform for genotype characterization and performance-ranking based on MultispeQ measurements

R/function_target_trait_comparison.R In jssotob/RankspeQ: RankspeQ: An R package platform for genotype characterization and performance-ranking based on MultispeQ measurements

Defines functions target_trait_comparison

Documented in target_trait_comparison

R Package Documentation

Browse R Packages

We want your feedback!

jssotob/RankspeQ
RankspeQ: An R package platform for genotype characterization and performance-ranking based on MultispeQ measurements

R/function_target_trait_comparison.R
In jssotob/RankspeQ: RankspeQ: An R package platform for genotype characterization and performance-ranking based on MultispeQ measurements