R/summary_rad.R

Defines functions summary_rad

Documented in summary_rad

#' @name summary_rad
#' @title Summary statistics for RADseq data
#' @description Summarise the tidy or gds data.
#' The summary statistics is by default calculated by strata and markers.
#' Frequency of the REF/ALT alleles, the observed and the expected heterozygosity
#' and the inbreeding coefficient (FIS).
#' @param data (4 options) A file or object generated by radiator:
#' \itemize{
#' \item tidy data
#' \item Genomic Data Structure (GDS)
#' }
#'
#' \emph{How to get GDS and tidy data ?}
#' Look into \code{\link{tidy_genomic_data}},
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.
# @param filename (optional) Name of the file written to the working directory.
#' @param path.folder (path, optional) By default will print results in the working directory.
#' Default: \code{path.folder = NULL}.
#' @param digits (integer, optional). Default: \code{digits = 4}.
#' @inheritParams radiator_common_arguments
#' @rdname summary_rad
#' @export
# @keywords internal

summary_rad <- function(
  data,
  path.folder = NULL,
  digits = 4,
  verbose = TRUE
) {
  # Cleanup-------------------------------------------------------------------
  file.date <- format(Sys.time(), "%Y%m%d@%H%M")
  if (verbose) message("Execution date@time: ", file.date)
  old.dir <- getwd()
  opt.change <- getOption("width")
  options(width = 70)
  timing <- proc.time()# for timing
  #back to the original directory and options
  on.exit(setwd(old.dir), add = TRUE)
  on.exit(options(width = opt.change), add = TRUE)
  on.exit(timing <- proc.time() - timing, add = TRUE)
  on.exit(if (verbose) message("\nComputation time, overall: ", round(timing[[3]]), " sec"), add = TRUE)
  on.exit(if (verbose) cat("############################ summary_rad completed #############################\n"), add = TRUE)

  # Checking for missing and/or default arguments-------------------------------
  if (missing(data)) rlang::abort("Input file missing")


  # Folders---------------------------------------------------------------------
  path.folder <- generate_folder(
    f = path.folder,
    rad.folder = "summary_stats",
    prefix_int = FALSE,
    internal = FALSE,
    file.date = file.date,
    verbose = verbose)

  # Detect format --------------------------------------------------------------
  data.type <- radiator::detect_genomic_format(data)
  if (!data.type %in% c("tbl_df", "fst.file", "SeqVarGDSClass", "gds.file")) {
    rlang::abort("Input not supported for this function: read function documentation")
  }

  if (data.type %in% c("SeqVarGDSClass", "gds.file")) {
    radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)

    message("Importing gds file...")
    if (data.type == "gds.file") {
      data <- radiator::read_rad(data, verbose = verbose)
      data.type <- "SeqVarGDSClass"
    }

    tidy.data <- extract_genotypes_metadata(
      gds = data,
      genotypes.meta.select = c("MARKERS", "COL", "INDIVIDUALS", "POP_ID", "GT_BIN"))

    if (length(tidy.data) == 0) {
      tidy.data <- gds2tidy(gds = data)
    }
    SeqArray::seqClose(data)
    data <- tidy.data
    tidy.data <- NULL

  } else {#tidy.data
    if (is.vector(data)) data <- radiator::tidy_wide(data = data, import.metadata = TRUE)
  }
  if (verbose) message("Summarizing...")
  # the function
  sum_rad <- function(x, maf = c("global", "local"), digits = 6) {
    maf <- match.arg(arg = maf, choices = c("global", "local"))
    x %<>%
      dplyr::summarise(
        N = as.numeric(length(unique(INDIVIDUALS))),
        PP = as.numeric(length(GT_BIN[GT_BIN == 0])),
        PQ = as.numeric(length(GT_BIN[GT_BIN == 1])),
        QQ = as.numeric(length(GT_BIN[GT_BIN == 2]))
      ) %>%
      dplyr::ungroup(.) %>%
      dplyr::mutate(
        FREQ_REF = ((PP*2) + PQ)/(2*N),
        FREQ_ALT = ((QQ*2) + PQ)/(2*N),
        HET_O = PQ/N,
        HET_E = 2 * FREQ_REF * FREQ_ALT,
        FIS = dplyr::if_else(HET_O == 0, 1, round(((HET_E - HET_O) / HET_E), digits)),
        PP = NULL, PQ = NULL, QQ = NULL
      )

    if (maf == "global") {
      x %<>% dplyr::rename(MAF_GLOBAL = FREQ_ALT)
    } else {
      x %<>% dplyr::rename(MAF_LOCAL = FREQ_ALT)
    }

    return(x)
  }#End sum_rad

  data  %<>% dplyr::filter(!is.na(GT_BIN))
  # ms: markers stats

  ms <- data %>%
    dplyr::group_by(MARKERS) %>%
    sum_rad(x = ., maf = "global", digits = digits)

  # mps: markers pop stats
  mps <- data %>%
    dplyr::group_by(MARKERS, POP_ID) %>%
    sum_rad(x = ., maf = "local", digits = digits)

  # ps: pop stats
  ps <- dplyr::bind_cols(
    dplyr::group_by(data, POP_ID) %>% dplyr::summarise(N = length(unique(INDIVIDUALS))),
    dplyr::group_by(mps, POP_ID) %>%
      dplyr::summarise(
        .data = .,
        dplyr::across(
          .cols = c(N, FREQ_REF, MAF_LOCAL, HET_O, HET_E),
          .fns = mean, na.rm = TRUE
        ),
        .groups = "keep"
      ) %>%
      dplyr::rename(N_MEAN = N)
  ) %>%
    dplyr::mutate(
      POP_ID1 = NULL,
      FIS = dplyr::if_else(HET_O == 0, 1, round(((HET_E - HET_O) / HET_E), digits))
    ) %>%
    dplyr::mutate(
      dplyr::across(
        .cols = c(FREQ_REF, MAF_LOCAL, HET_O, HET_E, FIS),
        .fns = round,
        digits = digits
      )
    )

  # writting the results
  write_rad(
    data = ms,
    path = path.folder,
    filename = "summary.stats.markers.tsv",
    tsv = TRUE, verbose = verbose)

  write_rad(
    data = mps,
    path = path.folder,
    filename = "summary.stats.markers.pop.tsv",
    tsv = TRUE, verbose = verbose)

  write_rad(
    data = ps,
    path = path.folder,
    filename = "summary.stats.pop.tsv",
    tsv = TRUE, verbose = verbose)

  return(res = list(summary.stats.pop = ps,
                    summary.stats.markers.pop = mps,
                    summary.stats.markers = ms))
}#End summary_stats_vcf_tidy
thierrygosselin/radiator documentation built on May 5, 2024, 5:12 a.m.