R/summary_sstacks.R

Defines functions summary_sstacks

Documented in summary_sstacks

#' @name summary_sstacks
#' @title Summarize STACKS sstacks log file generated by stackr
#' @description This function reads the log file of stackr \code{\link{run_sstacks}}
#' and summarise the information.
#'
#' The information shown is particularly helpful when choosing the best thresholds.
#' This function is run automatically inside \code{\link{run_sstacks}}, but
#' it can be run on it's own.

#' @param sstacks.log (path, character).
#' The cstacks log file generated by \code{\link{run_sstacks}}.

#' @param verbose (logical, optional) Make the function a little more chatty during
#' execution.
#' Default: \code{verbose = FALSE}.

#' @rdname summary_sstacks
#' @export

#' @return When the function is run separately it returns an object in the
#' global environment and a file in \code{08_stacks_results} folder, if it exists and
#' if not, the file is written in the working directory.
#' The summary is a tibble:
#' \enumerate{
#'
#' \item INDIVIDUALS: The sample id.
#' \item LOCUS_IN_CATALOG: The number of locus in the catalog.
#' \item LOCUS_IN_SAMPLES: The number of locus in the sample.
#' \item MATCHING_LOCI: The number of matching loci
#' \item BLACKLISTED_LOCI_NO_VERIFIED_HAPLOTYPES: The number of loci with no verfied haplotypes.
#' \item BLACKLISTED_LOCI_MATCHING_MORE_1_CATALOG_LOCUS: The number of
#' loci that matched more than one catalog locus and were excluded.
#' \item BLACKLISTED_LOCI_UNACCOUNTED_SNP: The number of loci that contained SNPs
#' unaccounted for in the catalog and were excluded.
#' \item HAPLOTYPES_VERIFIED: The number of haplotypes verified by the total
#' number of haplotypes examined.
#' \item GAPPED_ALIGNMENTS_ATTEMPTED: The number of gapped alignments attempted
#' \item LOCI_MATCHED_1_CATALOG_LOCUS: The number of loci that matched more than
#' one catalog locus and were excluded.
#' \item HAPLOTYPES_VERIFIED_EXAMINED: The number of haplotypes verified by the total
#' number of haplotypes examined.
#' \item BLACKLISTED_LOCI_MATCHED_NO_CATALOG: The number of loci blacklisted because
#' of no match with the catalog.
#' \item GAPPED_BLACKLISTED_LOCI_MATCHING_MORE_1_CATALOG_LOCUS: The number of
#' loci that matched more than one catalog locus and were excluded.
#' \item GAPPED_BLACKLISTED_LOCI_UNACCOUNTED_SNP: The number of loci that
#' contained SNPs unaccounted for in the catalog and were excluded.
#' \item LOCI_NO_VERIFIED_HAPLOTYPES: The number of loci with no verified haplotypes.
#' \item BLACKLISTED_LOCI_INCONSISTENT_ALIGNMENTS: The number of loci blacklisted
#' because of inconsistent alignments.
#' \item TOTAL_SAMPLE_LOCI: This is the number of loci that will be used in
#' stacks tsv2bam.
#' }


#' @examples
#' \dontrun{
#' sum <- stackr::summary_sstacks(
#' sstacks.log = "09_log_files/sstacks_20191101@1108.log",
#' verbose = TRUE
#' )
#' }


#' @seealso
#' \href{http://catchenlab.life.illinois.edu/stacks/comp/sstacks.php}{sstacks}

#' \code{\link{run_sstacks}}

#' @references Catchen JM, Amores A, Hohenlohe PA et al. (2011)
#' Stacks: Building and Genotyping Loci De Novo From Short-Read Sequences.
#' G3, 1, 171-182.
#' @references Catchen JM, Hohenlohe PA, Bassham S, Amores A, Cresko WA (2013)
#' Stacks: an analysis tool set for population genomics.
#' Molecular Ecology, 22, 3124-3140.

summary_sstacks <- function(sstacks.log, verbose = FALSE) {

  # sstacks.log <- "09_log_files/sstacks_20191101@1504.log"
  # verbose <- TRUE
  if (verbose) cat("#######################################################################\n")
  if (verbose) cat("##################### stackr::summary_sstacks #########################\n")
  if (verbose) cat("#######################################################################\n")
  timing <- proc.time()
  opt.change <- getOption("width")
  options(width = 70)

  if (missing(sstacks.log)) stop("log file argument required")
  if (!file.exists(sstacks.log)) stop("log file does not exists, check path")

  message("Summarizing stackr::run_sstacks log file:\n", sstacks.log, "\n")

  # Filename
  file.date <- format(Sys.time(), "%Y%m%d@%H%M")
  filename <- stringi::stri_join("summary_sstacks_", file.date, ".tsv")

  if (file.exists("08_stacks_results")) filename <- file.path("08_stacks_results", filename)
  if (file.exists(filename)) {
    file.date.w.seconds <- format(Sys.time(), "%Y%m%d@%H%M%S")
    filename <- stringi::stri_replace_all_fixed(
      str = filename,
      pattern = file.date,
      replacement = file.date.w.seconds,
      vectorize_all = FALSE
    )
  }

  # Import the log file
  sstacks.logfile <- readr::read_tsv(
    file = sstacks.log,
    col_names = "SSTACKS",
    col_types = "c"
  )

  # sample list
  sample.names <- dplyr::filter(
    .data = sstacks.logfile,
    stringi::stri_detect_fixed(str = SSTACKS, pattern = ".tags.")
  ) %>%
    dplyr::mutate(
      INDIVIDUALS = stringi::stri_extract(SSTACKS, regex = '(?<=\\/).*(?=\\.tags)'),
      SSTACKS = NULL
    ) %>%
    dplyr::filter(dplyr::row_number() != 1) # remove the first row with catalog

  if (verbose) message("Number of samples: ", nrow(sample.names))
  if (verbose) message("Extracting information from the log file...")

  # Number of loci in the catalog and in the samples
  extract1 <- dplyr::filter(
    .data = sstacks.logfile,
    stringi::stri_detect_fixed(str = SSTACKS, pattern = "sample loci compared against the catalog containing")
  ) %>%
    dplyr::mutate(
      LOCUS_IN_CATALOG = stringi::stri_extract_all_charclass(
        str = SSTACKS,
        pattern = "[0-9]") %>%
        purrr::map(2),
      LOCUS_IN_CATALOG = as.integer(LOCUS_IN_CATALOG),
      LOCUS_IN_SAMPLES = stringi::stri_extract_all_charclass(
        str = SSTACKS,
        pattern = "[0-9]") %>%
        purrr::map(1),
      LOCUS_IN_SAMPLES = as.integer(LOCUS_IN_SAMPLES),
      SSTACKS = NULL
    )

# matching loci and verified haplotypes
  extract2 <- dplyr::filter(
    .data = sstacks.logfile,
    stringi::stri_detect_fixed(
      str = SSTACKS,
      pattern = "matching loci")
  ) %>%
    dplyr::filter(
      stringi::stri_detect_fixed(
        str = SSTACKS,
        pattern = "contained no verified haplotypes")
    ) %>%
    dplyr::mutate(
      MATCHING_LOCI = stringi::stri_extract_all_charclass(
        str = SSTACKS,
        pattern = "[0-9]") %>%
        purrr::map(1),
      MATCHING_LOCI = as.integer(MATCHING_LOCI),
      BLACKLISTED_LOCI_NO_VERIFIED_HAPLOTYPES = stringi::stri_extract_all_charclass(
        str = SSTACKS,
        pattern = "[0-9]") %>%
        purrr::map(2),
      BLACKLISTED_LOCI_NO_VERIFIED_HAPLOTYPES = as.integer(BLACKLISTED_LOCI_NO_VERIFIED_HAPLOTYPES),
      SSTACKS = NULL
    )

  # blacklisted loci that matched more than 1 catalog locus
  extract3 <- dplyr::filter(
    .data = sstacks.logfile,
    stringi::stri_detect_fixed(
      str = SSTACKS,
      pattern = "loci matched more than one catalog locus and were excluded")
  ) %>%
    dplyr::mutate(
      SSTACKS = stringi::stri_extract_all_charclass(
        str = SSTACKS,
        pattern = "[0-9]")
      )
  gapped <- FALSE
  if (nrow(extract3) > nrow(sample.names)) gapped <- TRUE

  if (gapped) {
    extract3 %<>%
      dplyr::mutate(
        INDIVIDUALS = rep(sample.names$INDIVIDUALS, each = 2),
        GROUP = rep(
          c("BLACKLISTED_LOCI_MATCHING_MORE_1_CATALOG_LOCUS",
            "GAPPED_BLACKLISTED_LOCI_MATCHING_MORE_1_CATALOG_LOCUS"),
          nrow(extract3) / 2
          )
        ) %>%
      tidyr::pivot_wider(data = ., names_from = GROUP, values_from = SSTACKS) %>%
      dplyr::select(-INDIVIDUALS)
  } else {
    extract3 %<>% dplyr::rename(BLACKLISTED_LOCI_MATCHING_MORE_1_CATALOG_LOCUS = SSTACKS)
  }
  extract3 %<>% dplyr::mutate_all(.tbl = ., .funs = as.integer)

  # blacklisted loci with SNPs unaccounted for in the catalog
  extract4 <- dplyr::filter(
    .data = sstacks.logfile,
    stringi::stri_detect_fixed(
      str = SSTACKS,
      pattern = "SNPs unaccounted for in the catalog and were excluded")
  ) %>%
    dplyr::mutate(
      SSTACKS = stringi::stri_extract_all_charclass(
        str = SSTACKS,
        pattern = "[0-9]")
    )
  gapped <- FALSE
  if (nrow(extract4) > nrow(sample.names)) gapped <- TRUE

  if (gapped) {
    extract4 %<>%
      dplyr::mutate(
        INDIVIDUALS = rep(sample.names$INDIVIDUALS, each = 2),
        GROUP = rep(
          c("BLACKLISTED_LOCI_UNACCOUNTED_SNP",
            "GAPPED_BLACKLISTED_LOCI_UNACCOUNTED_SNP"),
          nrow(extract4) / 2
        )
      ) %>%
      tidyr::pivot_wider(data = ., names_from = GROUP, values_from = SSTACKS) %>%
      dplyr::select(-INDIVIDUALS)
  } else {
    extract4 %<>% dplyr::rename(BLACKLISTED_LOCI_UNACCOUNTED_SNP = SSTACKS)
  }
  extract4 %<>% dplyr::mutate_all(.tbl = ., .funs = as.integer)

  # total haplotypes examined from matching loci
  extract5 <- dplyr::filter(
    .data = sstacks.logfile,
    stringi::stri_detect_fixed(
      str = SSTACKS,
      pattern = "total haplotypes examined from matching loci")
  ) %>%
    dplyr::mutate(
      HAPLOTYPES_VERIFIED = stringi::stri_join(
        stringi::stri_extract_all_charclass(
          str = SSTACKS,
          pattern = "[0-9]") %>%
          purrr::map(2), " / ",
        stringi::stri_extract_all_charclass(
          str = SSTACKS,
          pattern = "[0-9]") %>%
          purrr::map(1)
      ),
      SSTACKS = NULL
)

  sum <- dplyr::bind_cols(
    sample.names, extract1, extract2, extract3, extract4, extract5
  ) %>%
    dplyr::mutate(
      TOTAL_SAMPLE_LOCI = MATCHING_LOCI -
        BLACKLISTED_LOCI_NO_VERIFIED_HAPLOTYPES
    )


  if (gapped) {
    # gapped alignments attempted
    extract6 <- dplyr::filter(
      .data = sstacks.logfile,
      stringi::stri_detect_fixed(
        str = SSTACKS,
        pattern = "gapped alignments attempted")
    ) %>%
      dplyr::mutate(
        GAPPED_ALIGNMENTS_ATTEMPTED =
          stringi::stri_extract_all_charclass(
            str = SSTACKS,
            pattern = "[0-9]") %>%
          purrr::map(2),
        SSTACKS = NULL
      ) %>%
      dplyr::mutate_all(.tbl = ., .funs = as.integer)

    # gapped alignments attempted
    extract7 <- dplyr::filter(
      .data = sstacks.logfile,
      stringi::stri_detect_fixed(
        str = SSTACKS,
        pattern = "loci matched one catalog locus")
    ) %>%
      dplyr::mutate(
        LOCI_MATCHED_1_CATALOG_LOCUS =
          stringi::stri_extract_all_charclass(
            str = SSTACKS,
            pattern = "[0-9]") %>%
          purrr::map(1),
        LOCI_MATCHED_1_CATALOG_LOCUS = as.integer(LOCI_MATCHED_1_CATALOG_LOCUS),
        HAPLOTYPES_VERIFIED_EXAMINED =
          stringi::stri_join(
            stringi::stri_extract_all_charclass(
              str = SSTACKS,
              pattern = "[0-9]") %>%
              purrr::map(3), " / ",
            stringi::stri_extract_all_charclass(
              str = SSTACKS,
              pattern = "[0-9]") %>%
              purrr::map(2)
          ),
        SSTACKS = NULL
      )

    #loci matched no catalog locus
    extract8 <- dplyr::filter(
      .data = sstacks.logfile,
      stringi::stri_detect_fixed(
        str = SSTACKS,
        pattern = "loci matched no catalog locus")
    ) %>%
      dplyr::mutate(
        BLACKLISTED_LOCI_MATCHED_NO_CATALOG =
          stringi::stri_extract_all_charclass(
            str = SSTACKS,
            pattern = "[0-9]"),
        BLACKLISTED_LOCI_MATCHED_NO_CATALOG = as.integer(BLACKLISTED_LOCI_MATCHED_NO_CATALOG),
        SSTACKS = NULL
      )


    #loci had no verified haplotypes
    extract9 <- dplyr::filter(
      .data = sstacks.logfile,
      stringi::stri_detect_fixed(
        str = SSTACKS,
        pattern = "loci had no verified haplotypes")
    ) %>%
      dplyr::mutate(
        LOCI_NO_VERIFIED_HAPLOTYPES =
          stringi::stri_extract_all_charclass(
            str = SSTACKS,
            pattern = "[0-9]"),
        LOCI_NO_VERIFIED_HAPLOTYPES = as.integer(LOCI_NO_VERIFIED_HAPLOTYPES),
        SSTACKS = NULL
      )

    #loci had inconsistent alignments to a catalog locus and were excluded
    extract10 <- dplyr::filter(
      .data = sstacks.logfile,
      stringi::stri_detect_fixed(
        str = SSTACKS,
        pattern = "loci had inconsistent alignments to a catalog locus and were excluded")
    ) %>%
      dplyr::mutate(
        BLACKLISTED_LOCI_INCONSISTENT_ALIGNMENTS =
          stringi::stri_extract_all_charclass(
            str = SSTACKS,
            pattern = "[0-9]"),
        BLACKLISTED_LOCI_INCONSISTENT_ALIGNMENTS = as.integer(BLACKLISTED_LOCI_INCONSISTENT_ALIGNMENTS),
        SSTACKS = NULL
      )

    sum %<>% dplyr::bind_cols(extract6, extract7, extract8, extract9, extract10) %>%
      dplyr::mutate(
        TOTAL_SAMPLE_LOCI = TOTAL_SAMPLE_LOCI + LOCI_MATCHED_1_CATALOG_LOCUS
      )
  }


# ordering the columns
  want <- c(
    "INDIVIDUALS",
    "LOCUS_IN_CATALOG",
    "LOCUS_IN_SAMPLES",
    "MATCHING_LOCI",
    "BLACKLISTED_LOCI_NO_VERIFIED_HAPLOTYPES",
    "BLACKLISTED_LOCI_MATCHING_MORE_1_CATALOG_LOCUS",
    "BLACKLISTED_LOCI_UNACCOUNTED_SNP",
    "HAPLOTYPES_VERIFIED",
    "GAPPED_ALIGNMENTS_ATTEMPTED",
    "LOCI_MATCHED_1_CATALOG_LOCUS",
    "HAPLOTYPES_VERIFIED_EXAMINED",
    "BLACKLISTED_LOCI_MATCHED_NO_CATALOG",
    "GAPPED_BLACKLISTED_LOCI_MATCHING_MORE_1_CATALOG_LOCUS",
    "GAPPED_BLACKLISTED_LOCI_UNACCOUNTED_SNP",
    "LOCI_NO_VERIFIED_HAPLOTYPES",
    "BLACKLISTED_LOCI_INCONSISTENT_ALIGNMENTS",
    "TOTAL_SAMPLE_LOCI"
    )
  suppressWarnings(sum %<>% dplyr::select(dplyr::one_of(want)))


  # Write to working directory
  readr::write_tsv(x = sum, path = filename)
  message("File written: ", filename)

  timing <- proc.time() - timing
  options(width = opt.change)
  if (verbose) message("\nComputation time: ", round(timing[[3]]), " sec")
  if (verbose) cat("############################## completed ##############################\n")
  return(sum)
}#End summary_ustacks
thierrygosselin/stackr documentation built on Nov. 11, 2020, 11 a.m.