#' @name summary_sstacks
#' @title Summarize STACKS sstacks log file generated by stackr
#' @description This function reads the log file of stackr \code{\link{run_sstacks}}
#' and summarise the information.
#'
#' The information shown is particularly helpful when choosing the best thresholds.
#' This function is run automatically inside \code{\link{run_sstacks}}, but
#' it can be run on it's own.
#' @param sstacks.log (path, character).
#' The cstacks log file generated by \code{\link{run_sstacks}}.
#' @param verbose (logical, optional) Make the function a little more chatty during
#' execution.
#' Default: \code{verbose = FALSE}.
#' @rdname summary_sstacks
#' @export
#' @return When the function is run separately it returns an object in the
#' global environment and a file in \code{08_stacks_results} folder, if it exists and
#' if not, the file is written in the working directory.
#' The summary is a tibble:
#' \enumerate{
#'
#' \item INDIVIDUALS: The sample id.
#' \item LOCUS_IN_CATALOG: The number of locus in the catalog.
#' \item LOCUS_IN_SAMPLES: The number of locus in the sample.
#' \item MATCHING_LOCI: The number of matching loci
#' \item BLACKLISTED_LOCI_NO_VERIFIED_HAPLOTYPES: The number of loci with no verfied haplotypes.
#' \item BLACKLISTED_LOCI_MATCHING_MORE_1_CATALOG_LOCUS: The number of
#' loci that matched more than one catalog locus and were excluded.
#' \item BLACKLISTED_LOCI_UNACCOUNTED_SNP: The number of loci that contained SNPs
#' unaccounted for in the catalog and were excluded.
#' \item HAPLOTYPES_VERIFIED: The number of haplotypes verified by the total
#' number of haplotypes examined.
#' \item GAPPED_ALIGNMENTS_ATTEMPTED: The number of gapped alignments attempted
#' \item LOCI_MATCHED_1_CATALOG_LOCUS: The number of loci that matched more than
#' one catalog locus and were excluded.
#' \item HAPLOTYPES_VERIFIED_EXAMINED: The number of haplotypes verified by the total
#' number of haplotypes examined.
#' \item BLACKLISTED_LOCI_MATCHED_NO_CATALOG: The number of loci blacklisted because
#' of no match with the catalog.
#' \item GAPPED_BLACKLISTED_LOCI_MATCHING_MORE_1_CATALOG_LOCUS: The number of
#' loci that matched more than one catalog locus and were excluded.
#' \item GAPPED_BLACKLISTED_LOCI_UNACCOUNTED_SNP: The number of loci that
#' contained SNPs unaccounted for in the catalog and were excluded.
#' \item LOCI_NO_VERIFIED_HAPLOTYPES: The number of loci with no verified haplotypes.
#' \item BLACKLISTED_LOCI_INCONSISTENT_ALIGNMENTS: The number of loci blacklisted
#' because of inconsistent alignments.
#' \item TOTAL_SAMPLE_LOCI: This is the number of loci that will be used in
#' stacks tsv2bam.
#' }
#' @examples
#' \dontrun{
#' sum <- stackr::summary_sstacks(
#' sstacks.log = "09_log_files/sstacks_20191101@1108.log",
#' verbose = TRUE
#' )
#' }
#' @seealso
#' \href{http://catchenlab.life.illinois.edu/stacks/comp/sstacks.php}{sstacks}
#' \code{\link{run_sstacks}}
#' @references Catchen JM, Amores A, Hohenlohe PA et al. (2011)
#' Stacks: Building and Genotyping Loci De Novo From Short-Read Sequences.
#' G3, 1, 171-182.
#' @references Catchen JM, Hohenlohe PA, Bassham S, Amores A, Cresko WA (2013)
#' Stacks: an analysis tool set for population genomics.
#' Molecular Ecology, 22, 3124-3140.
summary_sstacks <- function(sstacks.log, verbose = FALSE) {
# sstacks.log <- "09_log_files/sstacks_20191101@1504.log"
# verbose <- TRUE
if (verbose) cat("#######################################################################\n")
if (verbose) cat("##################### stackr::summary_sstacks #########################\n")
if (verbose) cat("#######################################################################\n")
timing <- proc.time()
opt.change <- getOption("width")
options(width = 70)
if (missing(sstacks.log)) stop("log file argument required")
if (!file.exists(sstacks.log)) stop("log file does not exists, check path")
message("Summarizing stackr::run_sstacks log file:\n", sstacks.log, "\n")
# Filename
file.date <- format(Sys.time(), "%Y%m%d@%H%M")
filename <- stringi::stri_join("summary_sstacks_", file.date, ".tsv")
if (file.exists("08_stacks_results")) filename <- file.path("08_stacks_results", filename)
if (file.exists(filename)) {
file.date.w.seconds <- format(Sys.time(), "%Y%m%d@%H%M%S")
filename <- stringi::stri_replace_all_fixed(
str = filename,
pattern = file.date,
replacement = file.date.w.seconds,
vectorize_all = FALSE
)
}
# Import the log file
sstacks.logfile <- readr::read_tsv(
file = sstacks.log,
col_names = "SSTACKS",
col_types = "c"
)
# sample list
sample.names <- dplyr::filter(
.data = sstacks.logfile,
stringi::stri_detect_fixed(str = SSTACKS, pattern = ".tags.")
) %>%
dplyr::mutate(
INDIVIDUALS = stringi::stri_extract(SSTACKS, regex = '(?<=\\/).*(?=\\.tags)'),
SSTACKS = NULL
) %>%
dplyr::filter(dplyr::row_number() != 1) # remove the first row with catalog
if (verbose) message("Number of samples: ", nrow(sample.names))
if (verbose) message("Extracting information from the log file...")
# Number of loci in the catalog and in the samples
extract1 <- dplyr::filter(
.data = sstacks.logfile,
stringi::stri_detect_fixed(str = SSTACKS, pattern = "sample loci compared against the catalog containing")
) %>%
dplyr::mutate(
LOCUS_IN_CATALOG = stringi::stri_extract_all_charclass(
str = SSTACKS,
pattern = "[0-9]") %>%
purrr::map(2),
LOCUS_IN_CATALOG = as.integer(LOCUS_IN_CATALOG),
LOCUS_IN_SAMPLES = stringi::stri_extract_all_charclass(
str = SSTACKS,
pattern = "[0-9]") %>%
purrr::map(1),
LOCUS_IN_SAMPLES = as.integer(LOCUS_IN_SAMPLES),
SSTACKS = NULL
)
# matching loci and verified haplotypes
extract2 <- dplyr::filter(
.data = sstacks.logfile,
stringi::stri_detect_fixed(
str = SSTACKS,
pattern = "matching loci")
) %>%
dplyr::filter(
stringi::stri_detect_fixed(
str = SSTACKS,
pattern = "contained no verified haplotypes")
) %>%
dplyr::mutate(
MATCHING_LOCI = stringi::stri_extract_all_charclass(
str = SSTACKS,
pattern = "[0-9]") %>%
purrr::map(1),
MATCHING_LOCI = as.integer(MATCHING_LOCI),
BLACKLISTED_LOCI_NO_VERIFIED_HAPLOTYPES = stringi::stri_extract_all_charclass(
str = SSTACKS,
pattern = "[0-9]") %>%
purrr::map(2),
BLACKLISTED_LOCI_NO_VERIFIED_HAPLOTYPES = as.integer(BLACKLISTED_LOCI_NO_VERIFIED_HAPLOTYPES),
SSTACKS = NULL
)
# blacklisted loci that matched more than 1 catalog locus
extract3 <- dplyr::filter(
.data = sstacks.logfile,
stringi::stri_detect_fixed(
str = SSTACKS,
pattern = "loci matched more than one catalog locus and were excluded")
) %>%
dplyr::mutate(
SSTACKS = stringi::stri_extract_all_charclass(
str = SSTACKS,
pattern = "[0-9]")
)
gapped <- FALSE
if (nrow(extract3) > nrow(sample.names)) gapped <- TRUE
if (gapped) {
extract3 %<>%
dplyr::mutate(
INDIVIDUALS = rep(sample.names$INDIVIDUALS, each = 2),
GROUP = rep(
c("BLACKLISTED_LOCI_MATCHING_MORE_1_CATALOG_LOCUS",
"GAPPED_BLACKLISTED_LOCI_MATCHING_MORE_1_CATALOG_LOCUS"),
nrow(extract3) / 2
)
) %>%
tidyr::pivot_wider(data = ., names_from = GROUP, values_from = SSTACKS) %>%
dplyr::select(-INDIVIDUALS)
} else {
extract3 %<>% dplyr::rename(BLACKLISTED_LOCI_MATCHING_MORE_1_CATALOG_LOCUS = SSTACKS)
}
extract3 %<>% dplyr::mutate_all(.tbl = ., .funs = as.integer)
# blacklisted loci with SNPs unaccounted for in the catalog
extract4 <- dplyr::filter(
.data = sstacks.logfile,
stringi::stri_detect_fixed(
str = SSTACKS,
pattern = "SNPs unaccounted for in the catalog and were excluded")
) %>%
dplyr::mutate(
SSTACKS = stringi::stri_extract_all_charclass(
str = SSTACKS,
pattern = "[0-9]")
)
gapped <- FALSE
if (nrow(extract4) > nrow(sample.names)) gapped <- TRUE
if (gapped) {
extract4 %<>%
dplyr::mutate(
INDIVIDUALS = rep(sample.names$INDIVIDUALS, each = 2),
GROUP = rep(
c("BLACKLISTED_LOCI_UNACCOUNTED_SNP",
"GAPPED_BLACKLISTED_LOCI_UNACCOUNTED_SNP"),
nrow(extract4) / 2
)
) %>%
tidyr::pivot_wider(data = ., names_from = GROUP, values_from = SSTACKS) %>%
dplyr::select(-INDIVIDUALS)
} else {
extract4 %<>% dplyr::rename(BLACKLISTED_LOCI_UNACCOUNTED_SNP = SSTACKS)
}
extract4 %<>% dplyr::mutate_all(.tbl = ., .funs = as.integer)
# total haplotypes examined from matching loci
extract5 <- dplyr::filter(
.data = sstacks.logfile,
stringi::stri_detect_fixed(
str = SSTACKS,
pattern = "total haplotypes examined from matching loci")
) %>%
dplyr::mutate(
HAPLOTYPES_VERIFIED = stringi::stri_join(
stringi::stri_extract_all_charclass(
str = SSTACKS,
pattern = "[0-9]") %>%
purrr::map(2), " / ",
stringi::stri_extract_all_charclass(
str = SSTACKS,
pattern = "[0-9]") %>%
purrr::map(1)
),
SSTACKS = NULL
)
sum <- dplyr::bind_cols(
sample.names, extract1, extract2, extract3, extract4, extract5
) %>%
dplyr::mutate(
TOTAL_SAMPLE_LOCI = MATCHING_LOCI -
BLACKLISTED_LOCI_NO_VERIFIED_HAPLOTYPES
)
if (gapped) {
# gapped alignments attempted
extract6 <- dplyr::filter(
.data = sstacks.logfile,
stringi::stri_detect_fixed(
str = SSTACKS,
pattern = "gapped alignments attempted")
) %>%
dplyr::mutate(
GAPPED_ALIGNMENTS_ATTEMPTED =
stringi::stri_extract_all_charclass(
str = SSTACKS,
pattern = "[0-9]") %>%
purrr::map(2),
SSTACKS = NULL
) %>%
dplyr::mutate_all(.tbl = ., .funs = as.integer)
# gapped alignments attempted
extract7 <- dplyr::filter(
.data = sstacks.logfile,
stringi::stri_detect_fixed(
str = SSTACKS,
pattern = "loci matched one catalog locus")
) %>%
dplyr::mutate(
LOCI_MATCHED_1_CATALOG_LOCUS =
stringi::stri_extract_all_charclass(
str = SSTACKS,
pattern = "[0-9]") %>%
purrr::map(1),
LOCI_MATCHED_1_CATALOG_LOCUS = as.integer(LOCI_MATCHED_1_CATALOG_LOCUS),
HAPLOTYPES_VERIFIED_EXAMINED =
stringi::stri_join(
stringi::stri_extract_all_charclass(
str = SSTACKS,
pattern = "[0-9]") %>%
purrr::map(3), " / ",
stringi::stri_extract_all_charclass(
str = SSTACKS,
pattern = "[0-9]") %>%
purrr::map(2)
),
SSTACKS = NULL
)
#loci matched no catalog locus
extract8 <- dplyr::filter(
.data = sstacks.logfile,
stringi::stri_detect_fixed(
str = SSTACKS,
pattern = "loci matched no catalog locus")
) %>%
dplyr::mutate(
BLACKLISTED_LOCI_MATCHED_NO_CATALOG =
stringi::stri_extract_all_charclass(
str = SSTACKS,
pattern = "[0-9]"),
BLACKLISTED_LOCI_MATCHED_NO_CATALOG = as.integer(BLACKLISTED_LOCI_MATCHED_NO_CATALOG),
SSTACKS = NULL
)
#loci had no verified haplotypes
extract9 <- dplyr::filter(
.data = sstacks.logfile,
stringi::stri_detect_fixed(
str = SSTACKS,
pattern = "loci had no verified haplotypes")
) %>%
dplyr::mutate(
LOCI_NO_VERIFIED_HAPLOTYPES =
stringi::stri_extract_all_charclass(
str = SSTACKS,
pattern = "[0-9]"),
LOCI_NO_VERIFIED_HAPLOTYPES = as.integer(LOCI_NO_VERIFIED_HAPLOTYPES),
SSTACKS = NULL
)
#loci had inconsistent alignments to a catalog locus and were excluded
extract10 <- dplyr::filter(
.data = sstacks.logfile,
stringi::stri_detect_fixed(
str = SSTACKS,
pattern = "loci had inconsistent alignments to a catalog locus and were excluded")
) %>%
dplyr::mutate(
BLACKLISTED_LOCI_INCONSISTENT_ALIGNMENTS =
stringi::stri_extract_all_charclass(
str = SSTACKS,
pattern = "[0-9]"),
BLACKLISTED_LOCI_INCONSISTENT_ALIGNMENTS = as.integer(BLACKLISTED_LOCI_INCONSISTENT_ALIGNMENTS),
SSTACKS = NULL
)
sum %<>% dplyr::bind_cols(extract6, extract7, extract8, extract9, extract10) %>%
dplyr::mutate(
TOTAL_SAMPLE_LOCI = TOTAL_SAMPLE_LOCI + LOCI_MATCHED_1_CATALOG_LOCUS
)
}
# ordering the columns
want <- c(
"INDIVIDUALS",
"LOCUS_IN_CATALOG",
"LOCUS_IN_SAMPLES",
"MATCHING_LOCI",
"BLACKLISTED_LOCI_NO_VERIFIED_HAPLOTYPES",
"BLACKLISTED_LOCI_MATCHING_MORE_1_CATALOG_LOCUS",
"BLACKLISTED_LOCI_UNACCOUNTED_SNP",
"HAPLOTYPES_VERIFIED",
"GAPPED_ALIGNMENTS_ATTEMPTED",
"LOCI_MATCHED_1_CATALOG_LOCUS",
"HAPLOTYPES_VERIFIED_EXAMINED",
"BLACKLISTED_LOCI_MATCHED_NO_CATALOG",
"GAPPED_BLACKLISTED_LOCI_MATCHING_MORE_1_CATALOG_LOCUS",
"GAPPED_BLACKLISTED_LOCI_UNACCOUNTED_SNP",
"LOCI_NO_VERIFIED_HAPLOTYPES",
"BLACKLISTED_LOCI_INCONSISTENT_ALIGNMENTS",
"TOTAL_SAMPLE_LOCI"
)
suppressWarnings(sum %<>% dplyr::select(dplyr::one_of(want)))
# Write to working directory
readr::write_tsv(x = sum, path = filename)
message("File written: ", filename)
timing <- proc.time() - timing
options(width = opt.change)
if (verbose) message("\nComputation time: ", round(timing[[3]]), " sec")
if (verbose) cat("############################## completed ##############################\n")
return(sum)
}#End summary_ustacks
Add the following code to your website.
For more information on customizing the embed code, read Embedding Snippets.