R/run_cstacks.R

Defines functions split_pop_map run_cstacks

Documented in run_cstacks split_pop_map

#' @name run_cstacks
#' @title Run STACKS cstacks module
#' @description Run \href{http://catchenlab.life.illinois.edu/stacks/}{STACKS}
#' \href{http://catchenlab.life.illinois.edu/stacks/comp/cstacks.php}{cstacks}
#' module inside R! The function runs a summary of the log file automatically
#' at the end (\code{\link{summary_cstacks}}).

#' @param P path to the directory containing STACKS files.
#' Default: \code{P = "06_ustacks_2_gstacks"}.
#' Inside the folder \code{06_ustacks_2_gstacks}, you should have:
#' \itemize{
#'   \item \strong{4 files for each samples:} The sample name is the prefix of
#'   the files ending with:
#' \code{.alleles.tsv.gz, .models.tsv.gz, .snps.tsv.gz, .tags.tsv.gz}.
#' Those files are created in the
#' \href{http://catchenlab.life.illinois.edu/stacks/comp/ustacks.php}{ustacks}
#' module.
#' }

#' @param o Output path to write catalog.
#' Default: \code{o = "06_ustacks_2_gstacks"}

#' @param M path to a population map file (Required when P is used).
#' Default: \code{M = "02_project_info/population.map.catalog.tsv"}.

#' @param n number of mismatches allowed between sample loci when build the catalog.
#' Default: \code{n = 1}

#' @param parallel.core Enable parallel execution with num_threads threads.
#' Default: \code{parallel.core = parallel::detectCores() - 1}

#' @param catalog.path This is for the "Catalog editing" part in cstacks where
#' you can provide the path to an existing catalog.
#' cstacks will add data to this existing catalog.
#' With default: \code{catalog.path = NULL} or with a supplied path, the function
#' The function scan automatically for the presence of a catalog inside the input folder.
#' If none is found, a new catalog is created.
#' If your catalog is not in the input folder, supply a path here.
#' e.g. \code{catalog.path = ~/catalog_folder}, the catalog files are inside the
#' P folder along the samples files and detected automatically.
#' If a catalog is detected in the input folder,
#' the samples in the \code{sample.list} argument
#' will be added in this catalog. The catalog is made of 3 files:
#' \code{catalog.alleles.tsv.gz, catalog.snps.tsv.gz, catalog.tags.tsv.gz}

#' @param max.gaps The number of gaps allowed between stacks before merging.
#' Default: \code{max.gaps = 2}

#' @param min.aln.len The minimum length of aligned sequence in a gapped
#' alignment.
#' Default: \code{min.aln.len = 0.8}

#' @param disable.gapped Disable gapped alignments between stacks.
#' Default: \code{disable.gapped = FALSE} (use gapped alignments).

#' @param k.len Specify k-mer size for matching between between catalog loci
#' (automatically calculated by default).
#' Advice: don't modify.
#' Default: \code{k.len = NULL}

#' @param report.mmatches Report query loci that match more than one catalog locus.
#' Advice: don't modify.
#' Default: \code{report.mmatches = FALSE}

#' @param split.catalog (integer) In how many samples you want to split the
#' catalog population map. This allows to have backup catalog every
#' \code{split.catalog} samples. Their is obviously a trade-off between the
#' integer use here, the time to initialize an existing catalog to often
#' and re-starting from zero if everything crash.
#' Default: \code{split.catalog = 20}.


#' @rdname run_cstacks
#' @export
#' @return \href{http://catchenlab.life.illinois.edu/stacks/comp/sstacks.php}{sstacks}
#' returns a \code{.matches.tsv.gz file for each sample}

#' @details \strong{Computer or server problem during the cstacks ?} Look
#' in the log file to see which individuals remains to be included. Create a
#' new list of individuals to include and use the catalog.path argument to point
#' to the catalog created before the problem.

#' @examples
#' \dontrun{
#' # The simplest form of the function:
#' run_cstacks()
#' # that's it ! Now if you have your own workflow folders, etc. See below.
#' Next example, let say you only want to include 10 individuals/pop and
#' include in the catalog samples with more than 2000000 reads. With the project
#' info file in the global environment:
#' library(tidyverse)
#' individuals.catalog <- project.info.file) %>%
#' filter(RETAINED > 2000000) %>%
#' group_by(POP_ID) %>%
#' sample_n(size = 10, replace = FALSE) %>%
#' ungroup %>%
#' arrange(desc(RETAINED)) %>%
#' distinct(INDIVIDUALS_REP, POP_ID)
#' # Write file to disk
#' readr::write_tsv(x = individuals.catalog,
#' path = "02_project_info/population.map.catalog.tsv")
#' # The next line will give you the list of individuals to include
#' individuals.catalog <- individuals.catalog$INDIVIDUALS_REP
#'
#' # To keep your info file updated with this information:
#' project.info.file <- project.info.file %>%
#' mutate(CATALOG = if_else(INDIVIDUALS_REP %in% individuals.catalog,
#' true = "catalog", false = "not_catalog")
#' )
#' write_tsv(project.info.file, "project.info.catalog.tsv")
#'
#' # Then run the command this way:
#' run_cstacks (
#' P = "06_ustacks_2_gstacks",
#' catalog.path = NULL,
#' n = 1,
#' parallel.core = 32,
#' h = FALSE,
#' max.gaps = 2, min.aln.len = 0.8,
#' k.len = NULL, report.mmatches = FALSE
#' )
#' }

#' @seealso
#' \href{http://catchenlab.life.illinois.edu/stacks/comp/sstacks.php}{sstacks}

#' @references Catchen JM, Amores A, Hohenlohe PA et al. (2011)
#' Stacks: Building and Genotyping Loci De Novo From Short-Read Sequences.
#' G3, 1, 171-182.
#' @references Catchen JM, Hohenlohe PA, Bassham S, Amores A, Cresko WA (2013)
#' Stacks: an analysis tool set for population genomics.
#' Molecular Ecology, 22, 3124-3140.

run_cstacks <- function(
  P = "06_ustacks_2_gstacks",
  o = "06_ustacks_2_gstacks",
  M = "02_project_info/population.map.catalog.tsv",
  catalog.path = NULL,
  n = 1,
  parallel.core = parallel::detectCores() - 1,
  max.gaps = 2, min.aln.len = 0.8, disable.gapped = FALSE,
  k.len = NULL, report.mmatches = FALSE,
  split.catalog = 20
) {

  # TEST
  # P = "06_ustacks_2_gstacks"
  # o = "06_ustacks_2_gstacks"
  # M = "02_project_info/population.map.catalog.tsv"
  # n = 3
  # parallel.core = parallel::detectCores() - 1
  # catalog.path = NULL
  # catalog.path = "/Volumes/THIERRY_MAC/sturgeons_saskatchewan/sturgeon_saskatchewan/catalog_test/catalog_2"
  # max.gaps = 2
  # min.aln.len = 0.8
  # disable.gapped = FALSE
  # k.len = NULL
  # report.mmatches = FALSE
  # h = FALSE
  # split.catalog = 20


  cat("#######################################################################\n")
  cat("######################## stackr::run_cstacks ##########################\n")
  cat("#######################################################################\n")
  timing <- proc.time()

  # Check directory ------------------------------------------------------------
  if (!dir.exists(P)) stop("Missing P directory")
  if (!dir.exists("09_log_files")) dir.create("09_log_files")
  if (!dir.exists("08_stacks_results")) dir.create("08_stacks_results")

  # spliting catalog -----------------------------------------------------------
  pop.map <- readr::read_tsv(file = M, col_names = c("INDIVIDUALS", "STRATA"), col_types = "cc")
  if (nrow(pop.map) < split.catalog) {
    split.catalog <- nrow(pop.map)
  }

  if (is.null(catalog.path)) catalog.path <- "06_ustacks_2_gstacks"

  split.catalog <- split_pop_map(
    pop.map = pop.map,
    split.catalog = split.catalog,
    catalog.path = catalog.path
  )
  message("Number of catalog split: ", length(split.catalog$list.catalog.numbers), "\n")

  # run cstacks on split pop map -----------------------------------------------
  purrr::pwalk(
    .l = list(
      catalog.id = split.catalog$list.catalog.numbers,
      o = split.catalog$catalog.output,
      M = split.catalog$catalog.pop.map,
      catalog.path = split.catalog$catalog.path
    ),
    .f = run_split_cstacks,
    P = "06_ustacks_2_gstacks",
    n = n,
    parallel.core = parallel.core,
    max.gaps = max.gaps,
    min.aln.len = min.aln.len,
    disable.gapped = disable.gapped,
    k.len = k.len,
    report.mmatches = report.mmatches
  )

  # Copy the last catalog to the output folder ---------------------------------
    file.copy(
      from = list.files(
        path = split.catalog$catalog.output[length(split.catalog$list.catalog.numbers)],
        full.names = TRUE
        ),
      to = P,
      overwrite = TRUE,
      recursive = TRUE,
      copy.date = TRUE
  )

  message("\nCatalog files written in: ", P)

  timing <- proc.time() - timing
  message("\nOverall computation time: ", round(timing[[3]]), " sec")
  cat("############################## completed ##############################\n")
} # End run_cstacks

# split_pop_map ----------------------------------------------------------------
#' @title split_pop_map
#' @description Split the catalog map file
#' @rdname split_pop_map
#' @export
#' @keywords internal
split_pop_map <- function(pop.map, split.catalog, catalog.path) {
  n.ind <- nrow(pop.map)

  pop.map.temp <- tibble::add_column(
    .data = pop.map,
    SPLIT_VEC = sort(rep.int(x = 1:ceiling(n.ind/split.catalog), times = split.catalog))[1:n.ind]
  ) %>%
    dplyr::group_by(SPLIT_VEC) %>%
    dplyr::group_split(.tbl = .)

  write_catalog_pop_map <- function(pop.map.temp) {
    c.id <- unique(pop.map.temp$SPLIT_VEC)
    c.path <- paste0("08_stacks_results/catalog_temp_", c.id)
    if (!dir.exists(c.path)) dir.create(c.path)
    c.filename <- file.path("02_project_info", paste0("pop_map_catalog_", c.id, ".tsv"))

    readr::write_tsv(
      x = dplyr::select(pop.map.temp, -SPLIT_VEC),
      path = c.filename,
      col_names = FALSE
    )
    return(res = list(catalog.path = c.path, catalog.pop.map = c.filename))
  } #End write_catalog_pop_map

  catalog.info <- purrr::map(.x = pop.map.temp, .f = write_catalog_pop_map)

  return(
    list(
      list.catalog.numbers = 1:length(pop.map.temp),
      catalog.output = purrr::map_chr(catalog.info, 1),
      catalog.path = c(catalog.path, purrr::map_chr(catalog.info, 1)[c(1:(length(pop.map.temp) - 1))]),
      catalog.pop.map = purrr::map_chr(catalog.info, 2)
    )
  )
}#End split_pop_map

# run_split_cstacks ----------------------------------------------------------------
#' @title run_split_cstacks
#' @description The function that runs cstacks
#' @rdname run_split_cstacks
#' @export
#' @keywords internal
run_split_cstacks <- function(
  catalog.id = NULL,
  o = "06_ustacks_2_gstacks",
  M = "02_project_info/population.map.catalog.tsv",
  catalog.path = NULL,
  P = "06_ustacks_2_gstacks",
  n = 1,
  parallel.core = parallel::detectCores() - 1,
  max.gaps = 2,
  min.aln.len = 0.8,
  disable.gapped = FALSE,
  k.len = NULL,
  report.mmatches = FALSE
) {
  timing.catalog <- proc.time()

  if (!is.null(catalog.id)) message("Generating catalog: ", catalog.id)
  # o = split.catalog$catalog.path
  # M = split.catalog$catalog.pop.map

  # Existing catalog -----------------------------------------------------------
  if (is.null(catalog.path)) { # no catalog path, searching in the input path...
    old.catalog <- list.files(path = P, pattern = "catalog")
    detect.rxstacks.lnls <- list.files(path = P, pattern = "rxstacks_lnls")
    if (length(detect.rxstacks.lnls) == 1) {
      old.catalog <- purrr::discard(.x = old.catalog, .p = old.catalog %in% detect.rxstacks.lnls)
    }
    if (length(old.catalog) > 0 & length(old.catalog) == 3) {
      message("Found a catalog in the input folder, using files: ")
      message(stringi::stri_join(old.catalog, "\n"))

      catalog.path <- stringi::stri_replace_all_fixed(
        str = old.catalog[1],
        pattern = ".catalog.alleles.tsv.gz",
        replacement = "",
        vectorize_all = FALSE
      )
      catalog.path <- stringi::stri_join(P, "/", catalog.path)
      catalog.path <- stringi::stri_join("--catalog ", shQuote(catalog.path))
    }
    if (length(old.catalog) > 0 & length(old.catalog) < 3) {
      stop("Incomplete catalog, 3 files are required, see argument documentation")
    }

    if (length(old.catalog) == 0) {
      message("Builing catalog for the first time")
      catalog.path <- ""
    }
  } else {
    # Building catalog for the first time --------------------------------------
    old.catalog <- list.files(path = catalog.path, pattern = "catalog")
    old.catalog <- purrr::keep(.x = old.catalog, .p = !old.catalog %in% "catalog.sql.ids.tsv")

    if (length(old.catalog) > 0) {
      if (length(old.catalog) < 3) {
        stop("Incomplete catalog, 3 files are required, see argument documentation")
      }
      message("Existing catalog: yes")
      message(stringi::stri_join(old.catalog, "\n"))

      catalog.path <- file.path(
        catalog.path,
        stringi::stri_replace_all_fixed(
          str = old.catalog[1],
          pattern = ".alleles.tsv.gz",
          replacement = "",
          vectorize_all = FALSE
        ))
      catalog.path <- stringi::stri_join("--catalog ", shQuote(catalog.path))
    }

    if (length(old.catalog) == 0) {
      message("Builing catalog for the first time")
      catalog.path <- ""
    }
  }


  # cstacks options ------------------------------------------------------------
  # P <- stringi::stri_join("-P ", P)
  # M <- stringi::stri_join("-M ", M)

  # sample for catalog
  sc <- readr::read_tsv(file = M, col_names = c("INDIVIDUALS", "STRATA"), col_types = "cc") %>%
    dplyr::select(INDIVIDUALS) %>%
    purrr::flatten_chr(.)
  sc <- stringi::stri_join("-s ", file.path(P, sc), collapse = " ")

  n <- stringi::stri_join("-n ", n)
  p <- stringi::stri_join("-p ", parallel.core)
  o <- stringi::stri_join("-o ", o)

  # gapped assembly options ---------------------------------------------------
  max.gaps <- stringi::stri_join("--max-gaps ", max.gaps)
  min.aln.len <- stringi::stri_join("--min-aln-len ", min.aln.len)

  if (disable.gapped) {
    disable.gapped <- stringi::stri_join("--disable-gapped ")
  } else {
    disable.gapped <- ""
  }

  # Advanced options -----------------------------------------------------------
  if (is.null(k.len)) {
    k.len <- ""
  } else {
    k.len <- stringi::stri_join("--k-len ", k.len)
  }

  if (report.mmatches) {
    report.mmatches <- stringi::stri_join("--report-mmatches ")
  } else {
    report.mmatches <- ""
  }

  # logs files -----------------------------------------------------------------
  file.date.time <- format(Sys.time(), "%Y%m%d@%H%M")

  cstacks.log.file <- stringi::stri_join("09_log_files/cstacks_", file.date.time,".log")
  message(stringi::stri_join("For progress, look in the log file:\n", cstacks.log.file))


  # command args ---------------------------------------------------------------
  command.arguments <- paste(
    # P, M,
    sc,
    n, p, catalog.path, o,
    max.gaps, min.aln.len, disable.gapped,
    k.len, report.mmatches
  )

  # command
  system2(command = "cstacks", args = command.arguments, stderr = cstacks.log.file)


  # Summary cstacks ------------------------------------------------------------
  sum <- stackr::summary_cstacks(
    cstacks.log = cstacks.log.file,
    verbose = FALSE
  )
  sum <- NULL

  if (!is.null(catalog.id)) {
    timing.catalog <- proc.time() - timing.catalog
    message("\nComputation time to build catalog ", catalog.id, ": ", round(timing.catalog[[3]]), " sec\n")
  }


}# end run_split_cstacks
thierrygosselin/stackr documentation built on Nov. 11, 2020, 11 a.m.