#' @name summary_rad
#' @title Summary statistics for RADseq data
#' @description Summarise the tidy or gds data.
#' The summary statistics is by default calculated by strata and markers.
#' Frequency of the REF/ALT alleles, the observed and the expected heterozygosity
#' and the inbreeding coefficient (FIS).
#' @param data (4 options) A file or object generated by radiator:
#' \itemize{
#' \item tidy data
#' \item Genomic Data Structure (GDS)
#' }
#'
#' \emph{How to get GDS and tidy data ?}
#' Look into \code{\link{tidy_genomic_data}},
#' \code{\link{read_vcf}} or
#' \code{\link{tidy_vcf}}.
# @param filename (optional) Name of the file written to the working directory.
#' @param path.folder (path, optional) By default will print results in the working directory.
#' Default: \code{path.folder = NULL}.
#' @param digits (integer, optional). Default: \code{digits = 4}.
#' @inheritParams radiator_common_arguments
#' @rdname summary_rad
#' @export
# @keywords internal
summary_rad <- function(
data,
path.folder = NULL,
digits = 4,
verbose = TRUE
) {
# Cleanup-------------------------------------------------------------------
file.date <- format(Sys.time(), "%Y%m%d@%H%M")
if (verbose) message("Execution date@time: ", file.date)
old.dir <- getwd()
opt.change <- getOption("width")
options(width = 70)
timing <- proc.time()# for timing
#back to the original directory and options
on.exit(setwd(old.dir), add = TRUE)
on.exit(options(width = opt.change), add = TRUE)
on.exit(timing <- proc.time() - timing, add = TRUE)
on.exit(if (verbose) message("\nComputation time, overall: ", round(timing[[3]]), " sec"), add = TRUE)
on.exit(if (verbose) cat("############################ summary_rad completed #############################\n"), add = TRUE)
# Checking for missing and/or default arguments-------------------------------
if (missing(data)) rlang::abort("Input file missing")
# Folders---------------------------------------------------------------------
path.folder <- generate_folder(
f = path.folder,
rad.folder = "summary_stats",
prefix_int = FALSE,
internal = FALSE,
file.date = file.date,
verbose = verbose)
# Detect format --------------------------------------------------------------
data.type <- radiator::detect_genomic_format(data)
if (!data.type %in% c("tbl_df", "fst.file", "SeqVarGDSClass", "gds.file")) {
rlang::abort("Input not supported for this function: read function documentation")
}
if (data.type %in% c("SeqVarGDSClass", "gds.file")) {
radiator_packages_dep(package = "SeqArray", cran = FALSE, bioc = TRUE)
message("Importing gds file...")
if (data.type == "gds.file") {
data <- radiator::read_rad(data, verbose = verbose)
data.type <- "SeqVarGDSClass"
}
tidy.data <- extract_genotypes_metadata(
gds = data,
genotypes.meta.select = c("MARKERS", "COL", "INDIVIDUALS", "POP_ID", "GT_BIN"))
if (length(tidy.data) == 0) {
tidy.data <- gds2tidy(gds = data)
}
SeqArray::seqClose(data)
data <- tidy.data
tidy.data <- NULL
} else {#tidy.data
if (is.vector(data)) data <- radiator::tidy_wide(data = data, import.metadata = TRUE)
}
if (verbose) message("Summarizing...")
# the function
sum_rad <- function(x, maf = c("global", "local"), digits = 6) {
maf <- match.arg(arg = maf, choices = c("global", "local"))
x %<>%
dplyr::summarise(
N = as.numeric(length(unique(INDIVIDUALS))),
PP = as.numeric(length(GT_BIN[GT_BIN == 0])),
PQ = as.numeric(length(GT_BIN[GT_BIN == 1])),
QQ = as.numeric(length(GT_BIN[GT_BIN == 2]))
) %>%
dplyr::ungroup(.) %>%
dplyr::mutate(
FREQ_REF = ((PP*2) + PQ)/(2*N),
FREQ_ALT = ((QQ*2) + PQ)/(2*N),
HET_O = PQ/N,
HET_E = 2 * FREQ_REF * FREQ_ALT,
FIS = dplyr::if_else(HET_O == 0, 1, round(((HET_E - HET_O) / HET_E), digits)),
PP = NULL, PQ = NULL, QQ = NULL
)
if (maf == "global") {
x %<>% dplyr::rename(MAF_GLOBAL = FREQ_ALT)
} else {
x %<>% dplyr::rename(MAF_LOCAL = FREQ_ALT)
}
return(x)
}#End sum_rad
data %<>% dplyr::filter(!is.na(GT_BIN))
# ms: markers stats
ms <- data %>%
dplyr::group_by(MARKERS) %>%
sum_rad(x = ., maf = "global", digits = digits)
# mps: markers pop stats
mps <- data %>%
dplyr::group_by(MARKERS, POP_ID) %>%
sum_rad(x = ., maf = "local", digits = digits)
# ps: pop stats
ps <- dplyr::bind_cols(
dplyr::group_by(data, POP_ID) %>% dplyr::summarise(N = length(unique(INDIVIDUALS))),
dplyr::group_by(mps, POP_ID) %>%
dplyr::summarise(
.data = .,
dplyr::across(
.cols = c(N, FREQ_REF, MAF_LOCAL, HET_O, HET_E),
.fns = mean, na.rm = TRUE
),
.groups = "keep"
) %>%
dplyr::rename(N_MEAN = N)
) %>%
dplyr::mutate(
POP_ID1 = NULL,
FIS = dplyr::if_else(HET_O == 0, 1, round(((HET_E - HET_O) / HET_E), digits))
) %>%
dplyr::mutate(
dplyr::across(
.cols = c(FREQ_REF, MAF_LOCAL, HET_O, HET_E, FIS),
.fns = round,
digits = digits
)
)
# writting the results
write_rad(
data = ms,
path = path.folder,
filename = "summary.stats.markers.tsv",
tsv = TRUE, verbose = verbose)
write_rad(
data = mps,
path = path.folder,
filename = "summary.stats.markers.pop.tsv",
tsv = TRUE, verbose = verbose)
write_rad(
data = ps,
path = path.folder,
filename = "summary.stats.pop.tsv",
tsv = TRUE, verbose = verbose)
return(res = list(summary.stats.pop = ps,
summary.stats.markers.pop = mps,
summary.stats.markers = ms))
}#End summary_stats_vcf_tidy
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.