R/stepBX.R

Defines functions stepBX

Documented in stepBX

## This code is part of the megaptera package
## © C. Heibl 2014 (last update 2019-04-16)

#' @export
#' @import DBI

stepBX <- function(x, dna, tag = "user-supplied", overwrite = FALSE){
  
  start <- Sys.time()
  
  ## CHECKS
  ## ------
  if (!inherits(x, "megapteraProj"))
    stop("'x' is not of class 'megapteraProj'")
  
  ## DEFINITIONS
  ## -----------
  gene <- x@locus@sql
  acc.tab <- "sequence"
  
  ## iniate logfile
  ## --------------
  logfile <- paste0("log/", gene, "-stepBX.log")
  if ( file.exists(logfile) ) unlink(logfile)
  slog(paste("\nmegaptera", packageDescription("megaptera")$Version),
       paste("\n", Sys.time(), sep = ""), 
       "\nSTEP BX: adding sequences to database\n",
       file = logfile)
  
  ## Sequences must not be aligned
  ## ----------------------------
  dna <- del.gaps(dna)
  
  ## format sequences
  ## ----------------
  gitax <- splitGiTaxon(names(dna))
  indet <- indet.strings(x@taxon@exclude.hybrids, TRUE)
  status <- rep("raw", nrow(gitax))
  indet <- grep(indet, gitax$taxon)
  status[indet] <- "excluded (indet)"
  dna <- as.character(dna)
  dna <- sapply(dna, paste, collapse = "")
  dna <- data.frame(
    acc = gitax$gi,
    taxon = strip.infraspec(gitax$taxon),
    taxon_source = gitax$taxon,
    locus = gene,
    status = status, 
    sequence = dna,
    stringsAsFactors = FALSE)
  
  ## Check for duplicates
  ## --------------------
  conn <- dbconnect(x)
  in.db <- paste("SELECT acc || '_' || taxon AS id FROM", acc.tab)
  in.db <- dbGetQuery(conn, in.db)$id
  in.dna <- paste(dna$gi, dna$taxon, sep = "_")
  dup <- in.dna %in% in.db
  if (any(dup)){
    if (overwrite){
      e <- dna[dup, c("gi", "taxon")]
      e <- paste("DELETE FROM", acc.tab,
                 "WHERE", wrapSQL(e$gi, term = "gi", boolean = NULL),
                 "AND", wrapSQL(e$taxon, term = "taxon", boolean = NULL))
      lapply(e, dbSendQuery, conn = conn)
      slog("\n --", length(e), "seqs. will be overwritten",  
           file = logfile)
    } else {
      dna <- dna[!dup, ]
    }
  }
  
  ## Write data to pgSQL database
  ## ----------------------------
  if (nrow(dna)) {
    dbWriteTable(conn, acc.tab, dna, 
                 row.names = FALSE, append = TRUE)
    slog("\n --", nrow(dna), "seqs. written to", acc.tab, 
         file = logfile)  
  } else {
    slog("\n -- all sequences already written to", acc.tab, 
         file = logfile)  
  }
  
  ## declare excluded taxa: has been removed upstream to XML2acc (2016-11-03)
  
  ## select sequences if there are > max.gi.per.spec
  dbMaxGIPerSpec(x)
  
  ## create and update relation <taxonomy>
  dbUpdateTaxonomy(x, logfile = logfile) # handle species found in stepB X
  # that are not included in taxonomy table
  
  # summary
  # -------
  total.acc <- dbGetQuery(conn, paste("SELECT count(taxon) FROM", acc.tab))
  det.acc <- dbGetQuery(conn, paste("SELECT count(taxon) FROM", acc.tab, "WHERE status !~'excluded|too'"))
  spec <- dbGetQuery(conn, paste("SELECT taxon, count(taxon) FROM", acc.tab, "WHERE status !~'excluded|too' GROUP BY (taxon)"))
  dbDisconnect(conn)
  one <- length(which(spec$count == 1))
  multiple <- length(which(spec$count > 1))
  det.spec <- one + multiple
  slog(paste("\nNumber of all accessions                              :", total.acc),
       paste("\nNumber of determined accessions                       :", det.acc),
       paste("\nNumber of determined species                          :", det.spec),
       paste("\nNumber of determined species with one accession       : ", one, 
             " (", round(one * 100 / det.spec, 1), "%)", sep = ""),
       paste("\nNumber of determined species with multiple accessions : ", multiple, 
             " (", round(multiple * 100 / det.spec, 1), "%)", sep = ""),
       file = logfile)
  
  
  slog("\n\nSTEP BX finished", file = logfile)
  td <- Sys.time() - start
  slog(" after", round(td, 2), attr(td, "units"), file = logfile)
}
heibl/megaptera documentation built on Jan. 17, 2021, 3:34 a.m.