#' @name gl.report.callrate
#' @title Reports summary of Call Rate for loci or individuals
#' @description
#' SNP datasets generated by DArT have missing values primarily arising from
#' failure to call a SNP because of a mutation at one or both of the restriction
#' enzyme recognition sites. P/A datasets (SilicoDArT) have missing values
#' because it was not possible to call whether a sequence tag was amplified or
#' not. This function tabulates the number of missing values as quantiles.
#' @param x Name of the genlight object containing the SNP or presence/absence
#' (SilicoDArT) data [required].
#' @param method Specify the type of report by locus (method='loc') or
#' individual (method='ind') [default 'loc'].
#' @param by_pop Whether report by population [default FALSE].
#' @param plot.out Specify if plot is to be produced [default TRUE].
#' @param plot_theme User specified theme [default theme_dartR()].
#' @param plot_colors Vector with two color names for the borders and fill
#' [default two_colors].
#' @param bins Number of bins to display in histograms [default 25].
#' @param save2tmp If TRUE, saves any ggplots and listings to the session
#' temporary directory (tempdir) [default FALSE].
#' @param verbose Verbosity: 0, silent or fatal errors; 1, begin and end; 2,
#' progress log; 3, progress and results summary; 5, full report
#' [default 2, unless specified using gl.set.verbosity].
#' @details
#' This function expects a genlight object, containing either SNP data or
#' SilicoDArT (=presence/absence data).
#'
#' Callrate is summarized by locus or by individual to allow sensible decisions
#' on thresholds for filtering taking into consideration consequential loss of
#' data. The summary is in the form of a tabulation and plots.
#'
#' Plot themes can be obtained from:
#' \itemize{
#' \item \url{https://ggplot2.tidyverse.org/reference/ggtheme.html} and \item
#' \url{https://yutannihilation.github.io/allYourFigureAreBelongToUs/ggthemes/}
#' }
#'
#' Resultant ggplots and the tabulation are saved to the session's temporary
#' directory.
#' @return Returns unaltered genlight object
#' @author Custodian: Arthur Georges -- Post to
#' \url{https://groups.google.com/d/forum/dartr}
#' @examples
#' \donttest{
#' # SNP data
#' test.gl <- testset.gl[1:20,]
#' gl.report.callrate(test.gl)
#' gl.report.callrate(test.gl,method='ind')
#' # Tag P/A data
#' test.gs <- testset.gs[1:20,]
#' gl.report.callrate(test.gs)
#' gl.report.callrate(test.gs,method='ind')
#' }
#' test.gl <- testset.gl[1:20,]
#' gl.report.callrate(test.gl)
#'
#' @seealso \code{\link{gl.filter.callrate}}
#' @family report functions
#' @import patchwork
#' @export
gl.report.callrate <- function(x,
method = "loc",
by_pop = FALSE,
plot.out = TRUE,
plot_theme = theme_dartR(),
plot_colors = two_colors,
bins = 50,
save2tmp = FALSE,
verbose = NULL) {
# SET VERBOSITY
verbose <- gl.check.verbosity(verbose)
# FLAG SCRIPT START
funname <- match.call()[[1]]
utils.flag.start(func = funname,
build = "Jody",
verbosity = verbose)
# CHECK DATATYPE
datatype <- utils.check.datatype(x, verbose = verbose)
# FUNCTION SPECIFIC ERROR CHECKING
# Check that call rate is up to date and recalculate if necessary
# if (!x@other$loc.metrics.flags$CallRate) {
x <- utils.recalc.callrate(x, verbose = 0)
# }
# DO THE JOB
########### FOR METHOD BASED ON LOCUS
if (method == "loc") {
callrate <- x@other$loc.metrics$CallRate
if (plot.out) {
# get title for plots
if (datatype == "SNP") {
title1 <- "SNP data - Call Rate by Locus"
} else {
title1 <- "Fragment P/A data - Call Rate by Locus"
}
# Calculate minimum and maximum graph cutoffs for callrate
min <- min(callrate, na.rm = TRUE)
min <- trunc(min * 100) / 100
# Boxplot
p1 <-
ggplot(data.frame(callrate), aes(y = callrate)) +
geom_boxplot(color=plot_colors[1], fill = plot_colors[2]) +
coord_flip() +
plot_theme +
xlim(range = c(-1, 1)) +
ylim(min, 1) +
ylab(" ") +
theme(axis.text.y=element_blank(),axis.ticks.y=element_blank()) +
ggtitle(title1)
# Histogram
p2 <-
ggplot(data.frame(callrate), aes(x = callrate)) +
geom_histogram(bins = bins,color = plot_colors[1],fill = plot_colors[2]) +
coord_cartesian(xlim = c(min, 1)) +
xlab("Call rate") +
ylab("Count") +
plot_theme
# plots by population
if(nPop(x)>1 & by_pop==TRUE){
pops <- seppop(x)
cat(" Reporting Call Rate by population\n")
c_rate_plots <- lapply(pops, function(z) {
pop_tmp <- utils.recalc.callrate(z, verbose = 0)
c_rate_tmp <- pop_tmp$other$loc.metrics$CallRate
p_temp <-
ggplot(as.data.frame(c_rate_tmp), aes(x = c_rate_tmp)) +
geom_histogram(bins = bins, color = plot_colors[1],fill = plot_colors[2]) +
xlab("Call rate") +
ylab("Count") +
coord_cartesian(xlim = c(min, 1)) +
plot_theme +
ggtitle(paste(popNames(z), "n =", nInd(z)))
cat(" Population:",popNames(pop_tmp),"\n")
cat(" No. of loci =", nLoc(pop_tmp), "\n")
cat(" No. of individuals =", nInd(pop_tmp), "\n")
cat(" Mean Call Rate",mean(pop_tmp$other$loc.metrics$CallRate,na.rm = TRUE) ,
"\n\n")
return(p_temp)
})
}
}
# Print out some statistics
stats <- summary(callrate)
cat(" Reporting Call Rate by Locus\n")
cat(" No. of loci =", nLoc(x), "\n")
cat(" No. of individuals =", nInd(x), "\n")
cat(" Minimum : ", stats[1], "\n")
cat(" 1st quartile : ", stats[2], "\n")
cat(" Median : ", stats[3], "\n")
cat(" Mean : ", stats[4], "\n")
cat(" 3r quartile : ", stats[5], "\n")
cat(" Maximum : ", stats[6], "\n")
cat(" Missing Rate Overall: ", round(sum(is.na(
as.matrix(x)
)) / (nLoc(x) * nInd(x)), 4), "\n\n")
# Determine the loss of loci for a given threshold
quantile_res <-
quantile(callrate, probs = seq(0, 1, 1 / 20),type=1)
retained <-
unlist(lapply(quantile_res, function(y) {
res <- length(callrate[callrate >= y])
}))
pc.retained <- round(retained * 100 / nLoc(x), 1)
filtered <- nLoc(x) - retained
pc.filtered <- 100 - pc.retained
df <-
data.frame(
as.numeric(sub("%", "", names(
quantile_res
))),
quantile_res,
retained,
pc.retained,
filtered,
pc.filtered
)
colnames(df) <-
c("Quantile",
"Threshold",
"Retained",
"Percent",
"Filtered",
"Percent")
df <- df[order(-df$Quantile), ]
df$Quantile <- paste0(df$Quantile, "%")
rownames(df) <- NULL
}
########### FOR METHOD BASED ON INDIVIDUAL Calculate the call rate by individual
if (method == "ind") {
ind.call.rate <- 1 - rowSums(is.na(as.matrix(x))) / nLoc(x)
if (plot.out) {
# get title for plots
if (datatype == "SNP") {
title1 <- "SNP data - Call Rate by Individual"
} else {
title1 <- "Fragment P/A data - Call Rate by Individual"
}
# Calculate minimum and maximum graph cutoffs for callrate
min <- min(ind.call.rate)
min <- trunc(min * 100) / 100
# Boxplot
p1 <-
ggplot(data.frame(ind.call.rate), aes(y = ind.call.rate)) +
geom_boxplot(color = plot_colors[1], fill = plot_colors[2]) +
coord_flip() +
plot_theme +
xlim(range = c(-1, 1)) +
ylim(min, 1) +
ylab(" ") +
theme(axis.text.y = element_blank(), axis.ticks.y = element_blank()) +
ggtitle(title1)
# Histogram
p2 <-
ggplot(data.frame(ind.call.rate), aes(x = ind.call.rate)) +
geom_histogram(bins = bins, color = plot_colors[1],fill = plot_colors[2]) +
coord_cartesian(xlim = c(min, 1)) +
xlab("Call rate") +
ylab("Count") +
plot_theme
}
# Print out some statistics
stats <- summary(ind.call.rate)
cat(" Reporting Call Rate by Individual\n")
cat(" No. of loci =", nLoc(x), "\n")
cat(" No. of individuals =", nInd(x), "\n")
cat(" Minimum : ", stats[1], "\n")
cat(" 1st quartile : ", stats[2], "\n")
cat(" Median : ", stats[3], "\n")
cat(" Mean : ", stats[4], "\n")
cat(" 3r quartile : ", stats[5], "\n")
cat(" Maximum : ", stats[6], "\n")
cat(" Missing Rate Overall: ", round(sum(is.na(
as.matrix(x)
)) / (nLoc(x) * nInd(x)), 4), "\n\n")
# Determine the loss of individuals for a given threshold using quantiles
quantile_res <-
quantile(ind.call.rate, probs = seq(0, 1, 1 / 20))
retained <- unlist(lapply(quantile_res, function(y) {
res <- length(ind.call.rate[ind.call.rate >= y])
}))
pc.retained <- round(retained * 100 / nInd(x), 1)
filtered <- nInd(x) - retained
pc.filtered <- 100 - pc.retained
df <-
data.frame(
as.numeric(sub("%", "", names(
quantile_res
))),
quantile_res,
retained,
pc.retained,
filtered,
pc.filtered
)
colnames(df) <-
c("Quantile",
"Threshold",
"Retained",
"Percent",
"Filtered",
"Percent")
df <- df[order(-df$Quantile), ]
df$Quantile <- paste0(df$Quantile, "%")
rownames(df) <- NULL
ind.call.rate_pop <- as.data.frame(cbind(names(ind.call.rate),
as.character(pop(x)),
ind.call.rate))
colnames(ind.call.rate_pop) <- c("ind_name","pop","missing_data")
ind.call.rate_pop <- ind.call.rate_pop[order(ind.call.rate_pop$pop,
ind.call.rate_pop$missing_data,
decreasing = TRUE),]
}
# PRINTING OUTPUTS
if (plot.out) {
# using package patchwork
p3 <- (p1 / p2) + plot_layout(heights = c(1, 4))
print(p3)
if(nPop(x)>1 & method == "loc" & by_pop == TRUE){
row_plots <- ceiling(nPop(x) / 3)
p4 <- wrap_plots(c_rate_plots)
p4 <- p4 + plot_layout(ncol = 3, nrow = row_plots)
print(p4)
}
}
print(df)
cat("\n\n")
if (method == "ind") {
ind_print <- ind.call.rate_pop
ind_print <- ind_print[order(ind_print$missing_data),]
# ind_print$missing_data <- round(ind_print$missing_data, digits = 4)
print(ind_print, row.names = FALSE)
}
# SAVE INTERMEDIATES TO TEMPDIR
# creating temp file names
if (save2tmp) {
if (plot.out) {
temp_plot <- tempfile(pattern = "Plot_")
match_call <-
paste0(names(match.call()),
"_",
as.character(match.call()),
collapse = "_")
# saving to tempdir
saveRDS(list(match_call, p3), file = temp_plot)
# saving plots per pop
if(nPop(x)>1){
temp_plot_2 <- tempfile(pattern = "Plot_per_pop")
# saving to tempdir
saveRDS(list(match_call, p4), file = temp_plot_2)
}
if (verbose >= 2) {
cat(report(" Saving the ggplot to session tempfile\n"))
}
}
temp_table <- tempfile(pattern = "Table_")
saveRDS(list(match_call, df), file = temp_table)
if (method == "ind") {
temp_table_2 <- tempfile(pattern = "Table2_")
saveRDS(list(ind.call.rate_pop, df), file = temp_table_2)
}
if (verbose >= 2) {
cat(report(" Saving tabulation to session tempfile\n"))
cat(
report(
" NOTE: Retrieve output files from tempdir using
gl.list.reports() and gl.print.reports()\n"
)
)
}
}
# FLAG SCRIPT END
if (verbose >= 1) {
cat(report("Completed:", funname, "\n"))
}
# RETURN
invisible(x)
}
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.