R/pi.R

## Nucleotide diversity

#' @title Nucleotide diversity
#' @description Calculates the nucleotide diversity (Nei & Li, 1979).
#'
#' To get an estimate with the consensus reads, use the
#' function \emph{summary_haplotypes} found in the package
#' [stackr](https://github.com/thierrygosselin/stackr). The estimate in
#' \emph{summary_haplotypes} integrates the consensus markers found in
#' [STACKS](http://catchenlab.life.illinois.edu/stacks/)
#' populations.haplotypes.tsv file.
#' Both radiator and stackr functions requires \code{stringdist} package.
#'
#' The \code{read.length} argument below is used directly in the calculations.
#' To be correctly estimated, the reads obviously need to be of identical size...


#' @inheritParams radiator_common_arguments
#' @param data (4 options) A file or object generated by radiator:
#' \itemize{
#' \item tidy data
#' \item Genomic Data Structure (GDS)
#' }
#'
#' \emph{How to get GDS and tidy data ?}
#' Look into \code{\link{tidy_genomic_data}},
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.


#' @param read.length (integer, optional) The length in nucleotide of your reads.
#' By default it is estimated from the data using the column \code{COL}.
#' Default: \code{read.length = NULL}.

#' @param path.folder (path, optional) By default will print results in the working directory.
#' Default: \code{path.folder = NULL}.

#' @return The function returns a list with the function call and:
#' \enumerate{
#' \item $pi.individuals: the pi estimated for each individual
#' \item $pi.populations: the pi statistics estimated per populations and overall.
#' \item $boxplot.pi: showing the boxplot of Pi for each populations and overall.
#' }
#' use $ to access each #' objects in the list.

#' @examples
#' \dontrun{
#' require(stringdist)
#' # The simplest way to run the function:
#' pi.sum <- radiator::pi(data = "brook.charr.gds")
#' }


#' @rdname pi
#' @export

#' @references Nei M, Li WH (1979)
#' Mathematical model for studying genetic variation in terms of
#' restriction endonucleases.
#' Proceedings of the National Academy of Sciences of
#' the United States of America, 76, 5269–5273.

#' @note Thanks to Anne-Laure Ferchaud for very useful comments on previous version
#' of this function.

#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}

pi <- function(
  data,
  read.length = NULL,
  parallel.core = parallel::detectCores() - 1,
  path.folder = NULL,
  verbose = TRUE
) {
  radiator_packages_dep(package = "stringdist")
  # Cleanup-------------------------------------------------------------------
  radiator_function_header(f.name = "pi", verbose = verbose)
  file.date <- format(Sys.time(), "%Y%m%d@%H%M")
  if (verbose) message("Execution date@time: ", file.date)
  old.dir <- getwd()
  opt.change <- getOption("width")
  options(width = 70)
  timing <- radiator_tic()
  #back to the original directory and options
  on.exit(setwd(old.dir), add = TRUE)
  on.exit(options(width = opt.change), add = TRUE)
  on.exit(radiator_toc(timing), add = TRUE)
  on.exit(radiator_function_header(f.name = "pi", start = FALSE, verbose = verbose), add = TRUE)
  res <- list()

  # manage missing arguments -----------------------------------------------------
  if (missing(data)) rlang::abort("Input file missing")

  # Folders---------------------------------------------------------------------
  path.folder <- generate_folder(
    f = path.folder,
    rad.folder = "pi",
    prefix_int = FALSE,
    internal = FALSE,
    file.date = file.date,
    verbose = verbose)

  # Detect format --------------------------------------------------------------
  data.type <- radiator::detect_genomic_format(data)
  if (!data.type %in% c("tbl_df", "fst.file", "SeqVarGDSClass", "gds.file")) {
    rlang::abort("Input not supported for this function: read function documentation")
  }

  if (data.type %in% c("SeqVarGDSClass", "gds.file")) {
    radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)


    message("Importing gds file...")
    if (data.type == "gds.file") {
      data <- radiator::read_rad(data, verbose = verbose)
      data.type <- "SeqVarGDSClass"
    }

    tidy.data <- extract_genotypes_metadata(
      gds = data,
      genotypes.meta.select = c("MARKERS", "COL", "INDIVIDUALS", "POP_ID", "GT_BIN"))

    if (length(tidy.data) == 0) {
      tidy.data <- gds2tidy(gds = data)
    }
    SeqArray::seqClose(data)
    data <- tidy.data
    tidy.data <- NULL
  } else {#tidy.data
    if (is.vector(data)) data %<>% radiator::tidy_wide(data = ., import.metadata = TRUE)
  }

  # Nei & Li 1979 Nucleotide Diversity -----------------------------------------
  message("Nucleotide diversity (Pi):")
  if (is.null(read.length) && !rlang::has_name(data, "COL")) {
    rlang::abort("Unable to estimate size of reads... read.length argument is required")
  }

  if (is.null(read.length)) read.length <- max(data$COL)

  message("    Read length used: ", read.length)
  # Pi: by individuals----------------------------------------------------------
  message("    Pi calculations: individuals...")
  data %<>%
    dplyr::filter(!is.na(GT_BIN)) %>%
    dplyr::mutate(
      ALLELE1 = dplyr::case_when(
        GT_BIN == 2L ~ "2",
        GT_BIN != 2L ~ "1"
      ),
      ALLELE2 = dplyr::case_when(
        GT_BIN == 0L ~ "1",
        GT_BIN != 0L ~ "2"
      )
    )

  res$pi.individuals <- data %>%
    dplyr::mutate(
      PI = (
        stringdist::stringdist(
          a = ALLELE1,
          b = ALLELE2,
          method = "hamming"
        )
      ) / read.length
    ) %>%
    dplyr::group_by(POP_ID, INDIVIDUALS) %>%
    dplyr::summarise(PI = mean(PI))


  write_rad(
    data = res$pi.individuals,
    path = path.folder,
    filename = "pi.individuals.tsv",
    tsv = TRUE, verbose = verbose)

  # Pi: by pop------------------------------------------------------------------
  message("    Pi calculations: populations...")
  data %<>%
    dplyr::select(POP_ID, INDIVIDUALS, MARKERS, ALLELE1, ALLELE2) %>%
    tidyr::pivot_longer(
      data = .,
      cols = -c("POP_ID", "INDIVIDUALS", "MARKERS"),
      names_to = "ALLELE_GROUP",
      values_to = "ALLELES"
    )
  res$pi.populations <- data %>%
    split(x = ., f = .$POP_ID) %>%
    purrr::map_df(
      .x = ., .f = pi_pop,
      read.length = read.length, parallel.core = parallel.core
    )

  # Pi: overall  ---------------------------------------------------------------
  message("    Pi calculations: overall")
  res$pi.populations <- tibble::add_row(
    .data = res$pi.populations,
    POP_ID = "OVERALL",
    PI_NEI = radiator_future(
      .x = data,
      .f = pi_rad,
      flat.future = "dfr",
      split.vec = FALSE,
      split.with = "MARKERS",
      parallel.core = parallel.core,
      read.length = read.length
    ) %>%
      dplyr::summarise(PI_NEI = mean(PI), .groups = "drop") %>%
      purrr::flatten_dbl(.)
  )
  write_rad(
    data = res$pi.populations,
    path = path.folder,
    filename = "pi.populations.tsv",
    tsv = TRUE, verbose = verbose)

  # figure ---------------------------------------------------------------------
  if (verbose) message("Generating violin plot of pi")
  n.pop <- length(unique(res$pi.individuals$POP_ID))

  res$boxplot.pi <- dplyr::filter(res$pi.individuals, POP_ID != "OVERALL") %>%
    ggplot2::ggplot(data = ., ggplot2::aes(x = factor(POP_ID), y = PI, na.rm = TRUE)) +
    ggplot2::geom_violin(trim = F) +
    ggplot2::geom_boxplot(width = 0.1, fill = "black", outlier.colour = NA) +
    ggplot2::stat_summary(fun = "mean", geom = "point", shape = 21, size = 2.5, fill = "white") +
    ggplot2::labs(
      x = "Sampling sites",
      y = "Individual nucleotide diversity (Pi)"
    ) +
    ggplot2::theme(
      legend.position = "none",
      axis.title.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
      axis.title.y = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
      legend.title = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
      legend.text = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold"),
      strip.text.x = ggplot2::element_text(size = 10, family = "Helvetica", face = "bold")
    )
  ggplot2::ggsave(
    limitsize = FALSE,
    plot = res$boxplot.pi,
    filename = file.path(path.folder, "pi.violin.plot.pdf"),
    width = max(10, n.pop * 2), height = 10,
    dpi = 300, units = "cm", useDingbats = FALSE)

  # results --------------------------------------------------------------------
  return(res)
}#End pi

# Internal nested functions ----------------------------------------------------
# pi_rad ---------------------------------------------------------------------------
#' @title pi_rad
#' @description main pi function
#' @rdname pi_rad
#' @keywords internal
#' @export
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}

pi_rad <- carrier::crate(function(data, read.length) {
  `%>%` <- magrittr::`%>%`

  y <- dplyr::select(data, ALLELES) %>%
    purrr::flatten_chr(.)

  if (length(unique(y)) <= 1) {
    pi <- tibble::tibble(PI = as.numeric(0))
  } else {

    #1 Get all pairwise comparison
    allele.pairwise <- utils::combn(unique(y), 2)

    #2 Calculate pairwise nucleotide mismatches
    pairwise.mismatches <- apply(allele.pairwise, 2, function(z) {
      stringdist::stringdist(a = z[1], b = z[2], method = "hamming")
    })

    #3 Calculate allele frequency
    allele.freq <- table(y)/length(y)

    #4 Calculate nucleotide diversity from pairwise mismatches and allele frequency
    pi <- apply(allele.pairwise, 2, function(y) allele.freq[y[1]] * allele.freq[y[2]])
    pi <- tibble::tibble(PI = sum(pi * pairwise.mismatches) / read.length)
  }
  return(pi)
})#End pi


# Pi pop ---------------------------------------------------------------------------
#' @title pi_pop
#' @description pi_pop function
#' @rdname pi_pop
#' @keywords internal
#' @export
#' @author Thierry Gosselin \email{thierrygosselin@@icloud.com}

pi_pop <- function(data, read.length, parallel.core = parallel::detectCores() - 1) {
  pop <- unique(data$POP_ID)
  message("    Pi calculations for pop: ", pop)
  # data <- df.split.pop[["DD"]]
  # data <- df.split.pop[["SKY"]]

  pi.pop <- radiator_future(
    .x = data,
    .f = pi_rad,
    flat.future = "dfr",
    split.vec = FALSE,
    split.with = "MARKERS",
    parallel.core = parallel.core,
    read.length = read.length
  ) %>%
    dplyr::summarise(PI_NEI = mean(PI)) %>%
    tibble::add_column(.data = ., POP_ID = pop, .before = "PI_NEI")

  return(pi.pop)
}#End pi_pop
thierrygosselin/radiator documentation built on May 5, 2024, 5:12 a.m.