R/stepC.R

Defines functions stepC

Documented in stepC

## This code is part of the megaptera package
## © C. Heibl 2014 (last update 2018-12-20)

#' @export
#' @import DBI
#' @importFrom parallel clusterEvalQ clusterExport makeCluster parLapply stopCluster

stepC <- function(x){
  
  start <- Sys.time()
  pipeline <- TRUE
  
  ## CHECKS
  ## ------
  if (!inherits(x, "megapteraProj"))
    stop("'x' is not of class 'megapteraProj'")
  if (x@locus@kind == "undefined") stop("undefined locus not allowed")
  
  ## check if previous step has been run
  ## -----------------------------------
  status <- dbProgress(x)
  if (status$step_b == "pending") {
    stop("the previous step has not been called yet")
  }
  if (status$step_b == "error") {
    stop("the previous step has terminated with an error")
  }
  if (status$step_b == "failure") {
    slog("\nNo data from upstream available - quitting", file = "")
    dbProgress(x, "step_c", "failure")
    return()
  }
  if (status$step_b == "success") {
    dbProgress(x, "step_c", "error")
  }
  
  ## DEFINITIONS
  ## -----------
  gene <- x@locus@sql
  acc.tab <- paste("acc", gsub("^_", "", gene), sep = "_")
  tax <- dbReadTaxonomy(x)
  align.exe <- x@align.exe
  max.bp <- x@params@max.bp
  
  ## iniate logfile
  ## --------------
  logfile <- paste0("log/", gene, "-stepC.log")
  if ( file.exists(logfile) ) unlink(logfile)
  slog(paste("\nmegaptera", packageDescription("megaptera")$Version),
       paste("\n", Sys.time(), sep = ""), 
       "\n\nSTEP C: alignment of conspecific sequences\n",
       file = logfile)
  
  ## open database connection
  ## ------------------------
  conn <- dbconnect(x)
  
  ## clear results from previous runs of steps D
  ## -------------------------------------------
  SQL <- paste("UPDATE", acc.tab, 
               "SET status = 'raw'",
               "WHERE status ~ 'reference'")
  dbSendQuery(conn, SQL) 
  
  ## get table of species available in database
  ## ------------------------------------------
  SQL <- paste("SELECT taxon AS spec,", 
               "count(taxon) AS n,",
               "min(char_length(dna)) = max(char_length(dna)) AS aligned",
               "FROM", acc.tab, 
               "WHERE status !~ 'excluded'",
               "GROUP BY taxon",
               "ORDER BY taxon") 
  tax <- dbGetQuery(conn, SQL)
  
  ## no sequences in table: inform and quit
  ## --------------------------------------
  if (!nrow(tax)) {
    dbProgress(x, "step_c", "failure")
    dbDisconnect(conn)
    slog("no sequences - try to rerun stepB", file = logfile)
    slog("\n\nSTEP C finished", file = logfile)
    td <- Sys.time() - start
    slog(" after", round(td, 2), attr(td, "units"), file = logfile)
    return()
  }
  
  ## Mark single-sequence species in <status>
  ## ----------------------------------------
  singles <- tax$spec[tax$n == 1]
  slog("\n", nrow(tax), "species in table", acc.tab,
       "\n", length(singles), "species have 1 accession",
       "\n", nrow(tax) - length(singles), "species have > 1 accession",
       file = logfile)
  SQL <- paste("UPDATE", acc.tab,
               "SET status='single'",
               "WHERE", wrapSQL(singles, "taxon", "=", "OR"),
               "AND status !~ 'excluded'")
  lapply(SQL, dbSendQuery, conn = conn)
  
  ## mark species that have already been aligned
  ## -------------------------------------------
  aligned <- tax$spec[tax$n > 1 & tax$aligned]
  slog("\n", length(aligned), "species are already aligned",
       file = logfile)
  SQL <- paste("UPDATE", acc.tab,
               "SET status = 'aligned'",
               "WHERE", wrapSQL(aligned, "taxon", "=", "OR"),
               "AND status !~ 'excluded'")
  lapply(SQL, dbSendQuery, conn = conn)
  
  ## select species that need to be aligned
  ## --------------------------------------
  spec <- tax$spec[tax$n > 1 & !tax$aligned]
  slog("\n", length(spec), "species need to be aligned\n\n", 
       file = logfile)
  
  ## Aligning -- either sequential or parallel
  ## -----------------------------------------
  if (length(spec)) {
    if (length(spec) < x@params@cpus | !x@params@parallel){
      lapply(spec, alignSpecies, megProj = x)
    } else {
      cl <- makeCluster(x@params@cpus)
      clusterEvalQ(cl, library(megaptera))
      clusterExport(cl, "x")
      parLapply(cl, X = spec, fun = alignSpecies, megProj = x)
      stopCluster(cl)
    }
  }
  
  dbDisconnect(conn)
  
  slog("\n\nSTEP C finished", file = logfile)
  td <- Sys.time() - start
  slog(" after", round(td, 2), attr(td, "units"), 
       "\n", file = logfile)
  dbProgress(x, step = "step_c", status = "success")
}
heibl/megaptera documentation built on Jan. 17, 2021, 3:34 a.m.