R/summary_cstacks.R

Defines functions summary_cstacks

Documented in summary_cstacks

#' @name summary_cstacks
#' @title Summarize STACKS cstacks log file generated by stackr
#' @description This function reads the log file of stackr \code{\link{run_cstacks}}
#' and summarise the information.
#'
#' The information shown is particularly helpful when choosing the best thresholds.
#' This function is run automatically inside \code{\link{run_cstacks}}, but
#' it can be run on it's own.

#' @param cstacks.log (path, character).
#' The cstacks log file generated by \code{\link{run_cstacks}}.

#' @param verbose (logical, optional) Make the function a little more chatty during
#' execution.
#' Default: \code{verbose = FALSE}.

#' @rdname summary_cstacks
#' @export

#' @return When the function is run separately it returns an object in the
#' global environment and a file in \code{08_stacks_results} folder, if it exists and
#' if not, the file is written in the working directory.
#' The summary is a tibble:
#' \enumerate{
#' \item INDIVIDUALS: the sample id
#' \item LOCUS_CATALOG_START: the number of locus in the catalog at the start when
#' processing the sample.
#' \item LOCI_MATCH_TO_CATALOG: the number of loci for the sample matching loci
#' in the catalog.
#' \item LOCI_MATCH_TO_CATALOG_GAPPED: the number of loci for the sample matching loci
#' in the catalog after allowing for gapped alignment.
#' \item LOCI_ADDED: the number of loci added to the catalog after processing the
#' sample.
#' \item LOCI_LINKED: the number of loci that matched more than one catalog
#' locus (linked loci).
#' \item LOCUS_CATALOG_END: the number of locus in the catalog at the end of
#' processing the sample.
#' \item MISMATCHES: the number of mismatches allowed between loci of different samples.
#' }


#' @examples
#' \dontrun{
#' cstacks.summary <- stackr::summary_cstacks(
#' cstacks.log = "09_log_files/cstacks_20191101@1108.log",
#' verbose = TRUE
#' )
#' }


#' @seealso
#' \href{http://catchenlab.life.illinois.edu/stacks/comp/cstacks.php}{cstacks}

#' \code{\link{run_cstacks}}

#' @references Catchen JM, Amores A, Hohenlohe PA et al. (2011)
#' Stacks: Building and Genotyping Loci De Novo From Short-Read Sequences.
#' G3, 1, 171-182.
#' @references Catchen JM, Hohenlohe PA, Bassham S, Amores A, Cresko WA (2013)
#' Stacks: an analysis tool set for population genomics.
#' Molecular Ecology, 22, 3124-3140.

summary_cstacks <- function(cstacks.log, verbose = FALSE) {

  if (verbose) cat("#######################################################################\n")
  if (verbose) cat("##################### stackr::summary_cstacks #########################\n")
  if (verbose) cat("#######################################################################\n")
  timing <- proc.time()
  opt.change <- getOption("width")
  options(width = 70)

  if (missing(cstacks.log)) stop("log file argument required")
  if (!file.exists(cstacks.log)) stop("log file does not exists, check path")

  message("\nSummarizing stackr::run_cstacks log file:\n", cstacks.log, "\n")

  # Filename
  file.date <- format(Sys.time(), "%Y%m%d@%H%M")
  filename <- stringi::stri_join("summary_cstacks_", file.date, ".tsv")

  if (file.exists("08_stacks_results")) filename <- file.path("08_stacks_results", filename)
  if (file.exists(filename)) {
    file.date.w.seconds <- format(Sys.time(), "%Y%m%d@%H%M%S")
    filename <- stringi::stri_replace_all_fixed(
      str = filename,
      pattern = file.date,
      replacement = file.date.w.seconds,
      vectorize_all = FALSE
    )
  }

  # Import the log file
  cstacks.logfile <- readr::read_tsv(
    file = cstacks.log,
    col_names = "CSTACKS",
    col_types = "c"
  )

  # detect if existing catalog...
  existing.catalog <- TRUE %in% stringi::stri_detect_fixed(str = cstacks.logfile$CSTACKS, pattern = "Initializing existing catalog")

  # sample list
  sample.names <- cstacks.logfile %>%
    dplyr::filter(stringi::stri_detect_fixed(str = CSTACKS, pattern = ".tags.")) %>%
    dplyr::filter(!stringi::stri_detect_fixed(str = CSTACKS, pattern = "catalog.")) %>%
    dplyr::mutate(
      SAMPLES = stringi::stri_extract(CSTACKS, regex = '(?<=\\/).*(?=\\.tags)'),
      CSTACKS = NULL
    )

  if (verbose) message("Number of samples in catalog: ", nrow(sample.names))

  # Mismatches
  mis <- dplyr::filter(
    .data = cstacks.logfile,
    stringi::stri_detect_fixed(str = CSTACKS, pattern = "Number of mismatches")
  ) %>%
    dplyr::mutate(
      MISMATCHES = stringi::stri_extract_all_charclass(
        str = CSTACKS,
        pattern = "[0-9]") %>%
        purrr::map(1),
      MISMATCHES = as.integer(MISMATCHES),
      CSTACKS = NULL
    ) %>%
    purrr::flatten_int(.)

  if (verbose) message("Mismatches used for catalog construction: ", mis)

  if (verbose) message("Extracting information from the log file...")

  extract1 <- dplyr::filter(
    .data = cstacks.logfile,
    stringi::stri_detect_fixed(str = CSTACKS, pattern = "loci in the catalog")
  ) %>%
    dplyr::mutate(
      LOCUS_CATALOG_START = stringi::stri_extract_all_charclass(
        str = CSTACKS,
        pattern = "[0-9]") %>%
        purrr::map(1),
      LOCUS_CATALOG_START = as.integer(LOCUS_CATALOG_START),
      CSTACKS = NULL
    )

  if (!existing.catalog) {
    extract1 %<>% tibble::add_row(.data = ., .before = 1, LOCUS_CATALOG_START = 0L)
  }

  extract.last <- extract1 %>%
    dplyr::filter(dplyr::row_number() != 1L) %>%
    dplyr::rename( LOCUS_CATALOG_END = LOCUS_CATALOG_START) %>%
    dplyr::bind_rows(
      dplyr::filter(
        .data = cstacks.logfile,
        stringi::stri_detect_fixed(str = CSTACKS, pattern = "Final catalog contains")
      ) %>%
        dplyr::mutate(
          LOCUS_CATALOG_END = stringi::stri_extract_all_charclass(
            str = CSTACKS,
            pattern = "[0-9]"),
          LOCUS_CATALOG_END = as.integer(LOCUS_CATALOG_END),
          CSTACKS = NULL
        )
    )



  extract2 <- dplyr::filter(
    .data = cstacks.logfile,
    stringi::stri_detect_fixed(
      str = CSTACKS,
      pattern = "loci were matched to a catalog locus.")
  ) %>%
    dplyr::mutate(
      LOCI_MATCH_TO_CATALOG = stringi::stri_extract_all_charclass(
        str = CSTACKS,
        pattern = "[0-9]"),
      LOCI_MATCH_TO_CATALOG = as.integer(LOCI_MATCH_TO_CATALOG),
      CSTACKS = NULL
    )

  if (!existing.catalog) {
    extract2 %<>% tibble::add_row(.data = ., .before = 1, LOCI_MATCH_TO_CATALOG = NA_integer_)
  }


  extract3 <- dplyr::filter(
    .data = cstacks.logfile,
    stringi::stri_detect_fixed(
      str = CSTACKS,
      pattern = "loci were matched to a catalog locus using gapped alignments.")
  ) %>%
    dplyr::mutate(
      LOCI_MATCH_TO_CATALOG_GAPPED= stringi::stri_extract_all_charclass(
        str = CSTACKS,
        pattern = "[0-9]"),
      LOCI_MATCH_TO_CATALOG_GAPPED = as.integer(LOCI_MATCH_TO_CATALOG_GAPPED),
      CSTACKS = NULL
    )


  if (!existing.catalog) {
    extract3 %<>% tibble::add_row(.data = ., .before = 1, LOCI_MATCH_TO_CATALOG_GAPPED = NA_integer_)
  }


  extract4 <- dplyr::filter(
    .data = cstacks.logfile,
    stringi::stri_detect_fixed(
      str = CSTACKS,
      pattern = "loci were newly added to the catalog.")
  ) %>%
    dplyr::mutate(
      LOCI_ADDED= stringi::stri_extract_all_charclass(
        str = CSTACKS,
        pattern = "[0-9]"),
      LOCI_ADDED = as.integer(LOCI_ADDED),
      CSTACKS = NULL
    )

  extract5 <- dplyr::filter(
    .data = cstacks.logfile,
    stringi::stri_detect_fixed(
      str = CSTACKS,
      pattern = "loci matched more than one catalog locus, linking them."
    )
  ) %>%
    dplyr::mutate(
      LOCI_LINKED= stringi::stri_extract_all_charclass(
        str = CSTACKS,
        pattern = "[0-9]"),
      LOCI_LINKED = as.integer(LOCI_LINKED),
      CSTACKS = NULL
    )

  if (!existing.catalog) {
    extract5 %<>% tibble::add_row(.data = ., .before = 1, LOCI_LINKED = NA_integer_)
  }


  sum <- dplyr::bind_cols(
    sample.names, extract1, extract2, extract3, extract4, extract5, extract.last
  ) %>%
    dplyr::mutate(MISMATCHES = mis)

  # Write to working directory
  readr::write_tsv(x = sum, path = filename)
  message("File written: ", filename)

  timing <- proc.time() - timing
  options(width = opt.change)
  if (verbose) message("\nComputation time: ", round(timing[[3]]), " sec")
  if (verbose) cat("############################## completed ##############################\n")
  return(sum)
}#End summary_ustacks
thierrygosselin/stackr documentation built on Nov. 11, 2020, 11 a.m.