#' @name summary_cstacks
#' @title Summarize STACKS cstacks log file generated by stackr
#' @description This function reads the log file of stackr \code{\link{run_cstacks}}
#' and summarise the information.
#'
#' The information shown is particularly helpful when choosing the best thresholds.
#' This function is run automatically inside \code{\link{run_cstacks}}, but
#' it can be run on it's own.
#' @param cstacks.log (path, character).
#' The cstacks log file generated by \code{\link{run_cstacks}}.
#' @param verbose (logical, optional) Make the function a little more chatty during
#' execution.
#' Default: \code{verbose = FALSE}.
#' @rdname summary_cstacks
#' @export
#' @return When the function is run separately it returns an object in the
#' global environment and a file in \code{08_stacks_results} folder, if it exists and
#' if not, the file is written in the working directory.
#' The summary is a tibble:
#' \enumerate{
#' \item INDIVIDUALS: the sample id
#' \item LOCUS_CATALOG_START: the number of locus in the catalog at the start when
#' processing the sample.
#' \item LOCI_MATCH_TO_CATALOG: the number of loci for the sample matching loci
#' in the catalog.
#' \item LOCI_MATCH_TO_CATALOG_GAPPED: the number of loci for the sample matching loci
#' in the catalog after allowing for gapped alignment.
#' \item LOCI_ADDED: the number of loci added to the catalog after processing the
#' sample.
#' \item LOCI_LINKED: the number of loci that matched more than one catalog
#' locus (linked loci).
#' \item LOCUS_CATALOG_END: the number of locus in the catalog at the end of
#' processing the sample.
#' \item MISMATCHES: the number of mismatches allowed between loci of different samples.
#' }
#' @examples
#' \dontrun{
#' cstacks.summary <- stackr::summary_cstacks(
#' cstacks.log = "09_log_files/cstacks_20191101@1108.log",
#' verbose = TRUE
#' )
#' }
#' @seealso
#' \href{http://catchenlab.life.illinois.edu/stacks/comp/cstacks.php}{cstacks}
#' \code{\link{run_cstacks}}
#' @references Catchen JM, Amores A, Hohenlohe PA et al. (2011)
#' Stacks: Building and Genotyping Loci De Novo From Short-Read Sequences.
#' G3, 1, 171-182.
#' @references Catchen JM, Hohenlohe PA, Bassham S, Amores A, Cresko WA (2013)
#' Stacks: an analysis tool set for population genomics.
#' Molecular Ecology, 22, 3124-3140.
summary_cstacks <- function(cstacks.log, verbose = FALSE) {
if (verbose) cat("#######################################################################\n")
if (verbose) cat("##################### stackr::summary_cstacks #########################\n")
if (verbose) cat("#######################################################################\n")
timing <- proc.time()
opt.change <- getOption("width")
options(width = 70)
if (missing(cstacks.log)) stop("log file argument required")
if (!file.exists(cstacks.log)) stop("log file does not exists, check path")
message("\nSummarizing stackr::run_cstacks log file:\n", cstacks.log, "\n")
# Filename
file.date <- format(Sys.time(), "%Y%m%d@%H%M")
filename <- stringi::stri_join("summary_cstacks_", file.date, ".tsv")
if (file.exists("08_stacks_results")) filename <- file.path("08_stacks_results", filename)
if (file.exists(filename)) {
file.date.w.seconds <- format(Sys.time(), "%Y%m%d@%H%M%S")
filename <- stringi::stri_replace_all_fixed(
str = filename,
pattern = file.date,
replacement = file.date.w.seconds,
vectorize_all = FALSE
)
}
# Import the log file
cstacks.logfile <- readr::read_tsv(
file = cstacks.log,
col_names = "CSTACKS",
col_types = "c"
)
# detect if existing catalog...
existing.catalog <- TRUE %in% stringi::stri_detect_fixed(str = cstacks.logfile$CSTACKS, pattern = "Initializing existing catalog")
# sample list
sample.names <- cstacks.logfile %>%
dplyr::filter(stringi::stri_detect_fixed(str = CSTACKS, pattern = ".tags.")) %>%
dplyr::filter(!stringi::stri_detect_fixed(str = CSTACKS, pattern = "catalog.")) %>%
dplyr::mutate(
SAMPLES = stringi::stri_extract(CSTACKS, regex = '(?<=\\/).*(?=\\.tags)'),
CSTACKS = NULL
)
if (verbose) message("Number of samples in catalog: ", nrow(sample.names))
# Mismatches
mis <- dplyr::filter(
.data = cstacks.logfile,
stringi::stri_detect_fixed(str = CSTACKS, pattern = "Number of mismatches")
) %>%
dplyr::mutate(
MISMATCHES = stringi::stri_extract_all_charclass(
str = CSTACKS,
pattern = "[0-9]") %>%
purrr::map(1),
MISMATCHES = as.integer(MISMATCHES),
CSTACKS = NULL
) %>%
purrr::flatten_int(.)
if (verbose) message("Mismatches used for catalog construction: ", mis)
if (verbose) message("Extracting information from the log file...")
extract1 <- dplyr::filter(
.data = cstacks.logfile,
stringi::stri_detect_fixed(str = CSTACKS, pattern = "loci in the catalog")
) %>%
dplyr::mutate(
LOCUS_CATALOG_START = stringi::stri_extract_all_charclass(
str = CSTACKS,
pattern = "[0-9]") %>%
purrr::map(1),
LOCUS_CATALOG_START = as.integer(LOCUS_CATALOG_START),
CSTACKS = NULL
)
if (!existing.catalog) {
extract1 %<>% tibble::add_row(.data = ., .before = 1, LOCUS_CATALOG_START = 0L)
}
extract.last <- extract1 %>%
dplyr::filter(dplyr::row_number() != 1L) %>%
dplyr::rename( LOCUS_CATALOG_END = LOCUS_CATALOG_START) %>%
dplyr::bind_rows(
dplyr::filter(
.data = cstacks.logfile,
stringi::stri_detect_fixed(str = CSTACKS, pattern = "Final catalog contains")
) %>%
dplyr::mutate(
LOCUS_CATALOG_END = stringi::stri_extract_all_charclass(
str = CSTACKS,
pattern = "[0-9]"),
LOCUS_CATALOG_END = as.integer(LOCUS_CATALOG_END),
CSTACKS = NULL
)
)
extract2 <- dplyr::filter(
.data = cstacks.logfile,
stringi::stri_detect_fixed(
str = CSTACKS,
pattern = "loci were matched to a catalog locus.")
) %>%
dplyr::mutate(
LOCI_MATCH_TO_CATALOG = stringi::stri_extract_all_charclass(
str = CSTACKS,
pattern = "[0-9]"),
LOCI_MATCH_TO_CATALOG = as.integer(LOCI_MATCH_TO_CATALOG),
CSTACKS = NULL
)
if (!existing.catalog) {
extract2 %<>% tibble::add_row(.data = ., .before = 1, LOCI_MATCH_TO_CATALOG = NA_integer_)
}
extract3 <- dplyr::filter(
.data = cstacks.logfile,
stringi::stri_detect_fixed(
str = CSTACKS,
pattern = "loci were matched to a catalog locus using gapped alignments.")
) %>%
dplyr::mutate(
LOCI_MATCH_TO_CATALOG_GAPPED= stringi::stri_extract_all_charclass(
str = CSTACKS,
pattern = "[0-9]"),
LOCI_MATCH_TO_CATALOG_GAPPED = as.integer(LOCI_MATCH_TO_CATALOG_GAPPED),
CSTACKS = NULL
)
if (!existing.catalog) {
extract3 %<>% tibble::add_row(.data = ., .before = 1, LOCI_MATCH_TO_CATALOG_GAPPED = NA_integer_)
}
extract4 <- dplyr::filter(
.data = cstacks.logfile,
stringi::stri_detect_fixed(
str = CSTACKS,
pattern = "loci were newly added to the catalog.")
) %>%
dplyr::mutate(
LOCI_ADDED= stringi::stri_extract_all_charclass(
str = CSTACKS,
pattern = "[0-9]"),
LOCI_ADDED = as.integer(LOCI_ADDED),
CSTACKS = NULL
)
extract5 <- dplyr::filter(
.data = cstacks.logfile,
stringi::stri_detect_fixed(
str = CSTACKS,
pattern = "loci matched more than one catalog locus, linking them."
)
) %>%
dplyr::mutate(
LOCI_LINKED= stringi::stri_extract_all_charclass(
str = CSTACKS,
pattern = "[0-9]"),
LOCI_LINKED = as.integer(LOCI_LINKED),
CSTACKS = NULL
)
if (!existing.catalog) {
extract5 %<>% tibble::add_row(.data = ., .before = 1, LOCI_LINKED = NA_integer_)
}
sum <- dplyr::bind_cols(
sample.names, extract1, extract2, extract3, extract4, extract5, extract.last
) %>%
dplyr::mutate(MISMATCHES = mis)
# Write to working directory
readr::write_tsv(x = sum, path = filename)
message("File written: ", filename)
timing <- proc.time() - timing
options(width = opt.change)
if (verbose) message("\nComputation time: ", round(timing[[3]]), " sec")
if (verbose) cat("############################## completed ##############################\n")
return(sum)
}#End summary_ustacks
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.