#' @name summary_catalog_log_lik
#' @title Summary of catalog log likelihood
#' @description This function reads inside the output of
#' STACKS ustasks-cstacks-sstacks folder
#' and and summarize the sstacks output files \code{.matches} to get
#' the mean and median log likelihood of catalog loci.
#' @param matches.folder (logical). Folder where the \code{.matches.tsv.gz} files are stored.
#' Thes files are generated by \href{http://catchenlab.life.illinois.edu/stacks/comp/sstacks.php}{sstacks}.
#' @param parallel.core (optional) The number of core used for parallel
#' execution.
#' Default: \code{parallel::detectCores() - 1}.
#' @param verbose (optional) Make the function a little more chatty during
#' execution.
#' Default: \code{verbose = FALSE}.
#' @rdname summary_catalog_log_lik
#' @export
#' @return The function returns a summary (data frame) containing:
#' \enumerate{
#' \item INDIVIDUALS: the sample id
#' \item LOCUS_NUMBER: the number of locus
#' \item BLACKLIST_USTACKS: the number of locus blacklisted by ustacks
#' \item FOR_CATALOG: the number of locus available to generate the catalog
#' \item BLACKLIST_ARTIFACT: the number of artifact genotypes (> 2 alleles, see details)
#' \item FILTERED: the number of locus after artifacts are removed
#' \item HOMOZYGOSITY: the number of homozygous genotypes
#' \item HETEROZYGOSITY: the number of heterozygous genotypes
#' \item MEAN_NUMBER_SNP_LOCUS: the mean number of SNP/locus (excluding the artifact locus)
#' \item MAX_NUMBER_SNP_LOCUS: the max number of SNP/locus observed for this individual (excluding the artifact locus)
#' \item NUMBER_LOCUS_4SNP: the number of locus with 4 or more SNP/locus (excluding the artifact locus)
#' }
#' @examples
#' \dontrun{
#' catalog.log.lik <- stackr::summary_catalog_log_lik(
#' matches.folder = "06_ustacks_cstacks_sstacks", verbose = TRUE)
#' # To get the summary after correction module STACKS rxstacks, use the built-in
#' feature of \code{run_rxstacks} or run this function with rxstacks outout folder selected.
#' }
#' @seealso
#' \href{http://catchenlab.life.illinois.edu/stacks/comp/ustacks.php}{ustacks}
#' @references Catchen JM, Amores A, Hohenlohe PA et al. (2011)
#' Stacks: Building and Genotyping Loci De Novo From Short-Read Sequences.
#' G3, 1, 171-182.
#' @references Catchen JM, Hohenlohe PA, Bassham S, Amores A, Cresko WA (2013)
#' Stacks: an analysis tool set for population genomics.
#' Molecular Ecology, 22, 3124-3140.
summary_catalog_log_lik <- function(
matches.folder,
parallel.core = parallel::detectCores() - 1,
verbose = FALSE
) {
if (verbose) cat("#######################################################################\n")
if (verbose) cat("################# stackr::summary_catalog_log_lik #####################\n")
if (verbose) cat("#######################################################################\n")
timing <- proc.time()
opt.change <- getOption("width")
options(width = 70)
res <- list() # return results in this list
if (missing(matches.folder)) stop("matches.folder argument required")
message("Printing the distribution of mean log likelihoods for catalog loci: selected")
# message("After visualizing the distribution:")
# message(" 1. select the best threshold")
# message(" 2. use function run_rxstacks with the appropriate filter thresholds.")
# Import matches--------------------------------------------------------------
matches.files <- list.files(
path = matches.folder, pattern = "matches", full.names = TRUE)
# Check stacks version--------------------------------------------------------
stacks.version <- stringi::stri_detect_fixed(str = readr::read_lines(matches.files[1], n_max = 1), pattern = "version 2")
if (stacks.version) {
stop("This function is not usefull with stacks version >= 2.0")
}
n.matches <- length(matches.files)
if (n.matches == 0) stop("Missing matches files: generate the files for each samples with run_sstacks or sstacks in command line mode")
message("Reading ", n.matches, " matches files\n for the distribution of mean log likelihoods of catalog loci...")
# functions
n.col <- ncol(suppressWarnings(suppressMessages(readr::read_tsv(file = matches.files[[1]], n_max = 1, comment = "#"))))
read_matches <- function(matches.files, n.col) {
if (n.col == 6) {
matches <- readr::read_tsv(
file = matches.files,
col_names = c("CATALOG_ID", "SAMPLE_ID", "LOCUS_ID", "HAPLOTYPE", "STACK_DEPTH", "CIGAR_STRING"),
col_types = "iiicic",
comment = "#") %>%
dplyr::select(CATALOG_ID, SAMPLE_ID, LOG_LIKELIHOOD)
}
if (n.col == 8) {
matches <- readr::read_tsv(
file = matches.files,
col_names = c("SQL_ID", "BATCH_ID", "CATALOG_ID", "SAMPLE_ID", "LOCUS_ID", "HAPLOTYPE", "STACK_DEPTH", "LOG_LIKELIHOOD"),
col_types = "iiiiicid", na = "-",
comment = "#")
}
if (n.col == 9) {
matches <- readr::read_tsv(
file = matches.files,
col_names = c("SQL_ID", "BATCH_ID", "CATALOG_ID", "SAMPLE_ID", "LOCUS_ID", "HAPLOTYPE", "STACK_DEPTH", "LOG_LIKELIHOOD", "NEW_COL"),
col_types = "iiiiicidc",
comment = "#") %>%
dplyr::select(CATALOG_ID, SAMPLE_ID, LOG_LIKELIHOOD)
}
return(matches)
}# End read_matches
summarize_likelihood <- function(x) {
lik <- dplyr::group_by(x, CATALOG_ID) %>%
dplyr::summarise(
MEAN = mean(LOG_LIKELIHOOD),
MEDIAN = stats::median(LOG_LIKELIHOOD))
return(lik)
}#End summarize_likelihood
log.likelihood.summary <- list()
log.likelihood.summary <- .stackr_parallel(
X = matches.files,
FUN = read_matches,
mc.cores = parallel.core,
n.col = n.col
) %>%
dplyr::bind_rows(.) %>%
dplyr::distinct(CATALOG_ID, SAMPLE_ID, LOG_LIKELIHOOD) %>%
dplyr::arrange(CATALOG_ID)
message("Summarizing the log-likelihood....")
res$log.likelihood.summary <- log.likelihood.summary %>%
dplyr::left_join(
dplyr::distinct(log.likelihood.summary, CATALOG_ID) %>%
dplyr::mutate(
SPLIT_VEC = dplyr::ntile(x = 1:nrow(.), n = parallel.core * 3))
, by = "CATALOG_ID") %>%
split(x = ., f = .$SPLIT_VEC) %>%
.stackr_parallel(
X = .,
FUN = summarize_likelihood,
mc.cores = parallel.core
) %>%
dplyr::bind_rows(.)
log.likelihood.summary <- NULL
options(width = opt.change) #restore the option width
overall.mean <- round(mean(res$log.likelihood.summary$MEAN), 2)
overall.median <- round(stats::median(res$log.likelihood.summary$MEDIAN), 2)
res$log.likelihood.fig <- tidyr::gather(
data = res$log.likelihood.summary, key = STATS, value = VALUE, -CATALOG_ID) %>%
ggplot2::ggplot(data = ., ggplot2::aes(x = VALUE)) +
ggplot2::geom_histogram() +
ggplot2::labs(x = "Log likelihoods of catalog loci") +
ggplot2::labs(y = "Number of catalog loci") +
ggplot2::facet_wrap(~STATS, nrow = 1, ncol = 2) +
ggplot2::theme(
axis.title.x = ggplot2::element_text(size = 12, family = "Helvetica", face = "bold"),
axis.title.y = ggplot2::element_text(size = 12, family = "Helvetica", face = "bold"),
legend.title = ggplot2::element_text(size = 12, family = "Helvetica", face = "bold"),
legend.text = ggplot2::element_text(size = 12, family = "Helvetica", face = "bold"),
strip.text.x = ggplot2::element_text(size = 12, family = "Helvetica", face = "bold"))
message("Distribution plot available inside the object in the Global environment")
message("Overall mean log likelihood: ", overall.mean)
message("Overall median log likelihood: ", overall.median)
timing <- proc.time() - timing
if (verbose) message("\nComputation time: ", round(timing[[3]]), " sec")
if (verbose) cat("############################## completed ##############################\n")
return(res)
}#End summary_catalog_log_lik
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.