R/blastReportDB-class.r

#' @include blastReport-class.r blast-parser-db.r sqlite-db.r
#' @importFrom IRanges IRanges IRangesList reduce width unlist
NULL


# BlastReportDB-class ----------------------------------------------------


#' Class \code{"blastReportDB"}
#' 
#' A reference calss that provides a connection to an SQLite database
#' containing BLAST records organised in three tables:
#' 
#' \bold{query} with fields:
#' 
#' \itemize{
#'    \item query_id     INTEGER    Primary key
#'    \item query_def    TEXT
#'    \item query_len    INTEGER
#' }
#' 
#' \bold{hit} with fields:
#' 
#' \itemize{
#'    \item query_id     INTEGER
#'    \item hit_id       INTEGER    Primary key
#'    \item hit_num      INTEGER
#'    \item gene_id      TEXT
#'    \item accession    TEXT
#'    \item definition   TEXT
#'    \item length       INTEGER
#' }
#' 
#' \bold{hsp} with fields:
#'
#' \itemize{
#'    \item query_id     INTEGER
#'    \item hit_id       INTEGER    
#'    \item hsp_id       INTEGER    Primary key
#'    \item hsp_num      INTEGER
#'    \item bit_score    FLOAT
#'    \item score        INTEGER
#'    \item evalue       FLOAT
#'    \item query_from   INTEGER
#'    \item query_to     INTEGER
#'    \item hit_from     INTEGER
#'    \item hit_to       INTEGER
#'    \item query_frame  INTEGER
#'    \item query_frame  INTEGER
#'    \item identity     INTEGER
#'    \item positive     INTEGER
#'    \item gaps         INTEGER
#'    \item align_len    INTEGER
#'    \item qseq         TEXT
#'    \item hseq         TEXT
#'    \item midline      TEXT
#' }
#'
#' @seealso
#'  The constructors \code{\link{blastReportDB}} and \code{\link{blastReportDBConnect}},
#'  and the BLAST classes \code{\linkS4class{BlastReport}} and
#'  \code{\linkS4class{blastTable}}.
#' @export
new_BlastReportDB <- setRefClass(
  Class    = 'BlastReportDB',
  contains = 'SQLiteDB',
  methods  = list(
    initialize = function(con = db_create(dbSchema = blast_db.sql(), verbose = FALSE), ...) {
      callSuper(con, ...)
    })
)

setValidity('BlastReportDB', function(object) {
  errors <- character()
  if (length(db_list_tables(object)) == 0L) {
    return("No tables in 'blastReportDB'")
  }
  if (!all(c("hit", "hsp", "query") %in% db_list_tables(object))) {
    errors <- c(errors, "Table missing from 'blastReportDB'\n")
  }
  if (!all(c("query_id", "query_def", "query_len") %in% db_list_fields(object, "query"))) {
    errors <- c(errors, "Field missing from table 'query'\n")
  }
  if (!all(c("query_id", "hit_id", "hit_num", "gene_id", "accession",
             "definition", "length") %in% db_list_fields(object, "hit"))) {
    errors <- c(errors, "Field missing from table 'hit'\n")
  }
  if (!all(c("query_id", "hit_id", "hsp_id", "hsp_num", "bit_score",
             "score", "evalue", "query_from", "query_to", "hit_from",
             "hit_to", "query_frame", "hit_frame", "identity", "positive",
             "gaps", "align_len", "qseq", "hseq", "midline") 
           %in% db_list_fields(object, "hsp"))) {
    errors <- c(errors, "Field missing from table 'hsp'")
  }
  
  if (length(errors) == 0L) TRUE else errors
})

setMethod('show', 'BlastReportDB',
          function(object) {
            showme <- sprintf('%s connection object with:\n| %s queries | %s hits | %s hsps |',
                              sQuote(class(object)),
                              db_count(object, "query"),
                              db_count(object, "hit"),
                              db_count(object, "hsp"))
            cat(showme, sep="\n")
          })

setMethod('lapply', 'BlastReportDB', function(X, FUN, ...) {
  lapply(db_query(X, "select query_id from query", 1L), function(j)
    FUN(X[j], ...)
  )
})

#' Parse NCBI BLAST XML files into \linkS4class{BlastReportDB} objects.
#' 
#' Create (or connect to) a  blastReport SQLite database.
#' 
#' @param blastfile an XML BLAST Report.
#' @param db_path Path to an blastReport SQLite database.
#' @param max_hit How many hits should be parsed (default: all available)
#' @param max_hsp How many hsps should be parsed (default: all available)
#' @param reset_at After how many iterations should the parser dump the
#' data into the db before continuing.
#' @param verbose Message if a new database is created.
#' @return A \code{\linkS4class{BlastReportDB}} object.
#' @export
#' @examples 
#' ##
blastReportDB <- function(blastfile, db_path = ":memory:", max_hit = NULL,
                          max_hsp = NULL, reset_at = 1000, verbose = TRUE) {
  db <- new_BlastReportDB(db_create(db_path, blast_db.sql(), overwrite = TRUE,
                                    verbose = verbose))
  if (missing(blastfile)) {
    return(db)
  }
  assert_that(is.readable(blastfile), has_extension(blastfile, 'xml'))
  handler <- BlastOutput.Iterations(conn(db), max_hit, max_hsp, reset_at)
  out <- xmlEventParse(blastfile, list(), branches = handler)
  ## load final part into db
  assert_that(.db_bulk_insert(con=conn(db), tbl = "query", df = handler$getQuery()))
  assert_that(.db_bulk_insert(conn(db), "hit", handler$getHit()))
  assert_that(.db_bulk_insert(conn(db), "hsp", handler$getHsp()))
  validObject(db)
  db
}


#' @usage blastReportDBConnect(db_path)
#' @rdname blastReportDB
#' @export
blastReportDBConnect <- function(db_path) {
  if (db_path == ":memory:") {
    stop("Cannot connect to an in-memory database", call.=FALSE)
  }
  if (db_path == "") {
    stop("Cannot connect to a temporary database", call.=FALSE)
  }
  db <- new_BlastReportDB(db_connect(db_path))
  validObject(db)
  db
}
gschofl/blastr documentation built on May 17, 2019, 8:52 a.m.